In [205]:
# !pip install kaggle
# from google.colab import files
# files.upload()
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle competitions download -c titanic
# !ls

In [206]:
# The usuals
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Regular expressions
import re

from sklearn import preprocessing

# LightGBM\XGBoost\GBDT
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingClassifier

# Sklearn tools for model training and assessment
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.model_selection import cross_val_score, ShuffleSplit

from sklearn.ensemble import VotingClassifier

In [207]:
trainRaw = pd.read_csv('train.csv')
testRaw = pd.read_csv('test.csv')

# Concatenate together
TrainRow = trainRaw.shape[0]
allsets = pd.concat([trainRaw, testRaw], axis=0,sort=True)

In [208]:
# Build new features from Cabin
# Seperate Canbin to Letter and Number for who has Canbin
def CabSplit(s):
    """
    Function to try and extract cabin letter and number from the cabin column.
    Runs a regular expression that finds letters and numbers in the
    string. These are held in match.group, if they exist.
    """
    match = re.match(r"([a-z]+)([0-9]+)",s,re.I) #re.I 使匹配对大小写不敏感

    try:
        letter = match.group(1) #满足([a-z]+)的部分
    except:
        letter = ''

    try:
        number = match.group(2) #满足([0-9]+)的部分
    except:
        number = 9999
    return letter, number

# Count the number of Canbin
def DR(s):
    """
    From the cabin string, try and extract letter, number, and number of cabins
    """
    # Check contents
    if isinstance(s, (int,float)): 
      # 若s是int或者float(只有数字形式) 
      # 说明没有Cabin信息 为空 
       letter = ''
       number = ''
       nCabins = 9999
    else:
       # If field isn't empty, split string to letter and unmber. 
       # Some strings contain multiple cabins.
       s = s.split(' ')
       # Count the cabins based on number of splits
       nCabins = len(s)
       # Just take first cabin for letter/number extraction
       s = s[0]

       letter, number = CabSplit(s)

    return [letter, number, nCabins]

# Apply DR function to each cell in Cabin column using pandas apply method.
out = allsets['Cabin'].apply(DR)

# Output tuple with 3 values for each row, convert this to pandas df
out = out.apply(pd.Series)

# name the columns
out.columns = ['Cabletter','Cabnumber','Cabcount']

# Concatenate these columns to the dataset
allsets = pd.concat([allsets,out],axis = 1)      


In [209]:
# Add some family features directly to new columns in the dataset

#size
allsets['fsize'] = allsets['SibSp'] + allsets['Parch'] + 1

#Ratio
allsets['fRatio'] = (allsets['Parch'] + 1)/(allsets['SibSp'] + 1)

#Adult
allsets['Adult'] = allsets['Age'] > 18

In [210]:
# Extract titles from Name column, standardise
titleDict = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Sir",
    "Don": "Sir",
    "Sir": "Sir",
    "Dr": "Dr",
    "Rev": "Rev",
    "theCountess": "Lady",
    "Dona": "Lady",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Lady"
}

def NameSplit(s, titleDict):
    """
    Extract title from name, replace with value in title dictionary. Also
    return surname.
    """

    # Remove '.' from name string
    s = s.replace('.','')
    # Split on spaces
    s = s.split(' ')
    # get surname
    surname = s[0]

    #get title - loop over titleDict
    #if s matches a key, take the corresponding value as the tile
    title = []
    for k, t in titleDict.items():
      if str(k) in s:
        title.append(t)
        
    if title == []:
        title = 'other'
    else:
        # Title is a list, so extract contents
        title = title[0]
    return title

out = allsets['Name'].apply(NameSplit,args=[titleDict])

out = out.apply(pd.Series)
out.columns = ['Title']
allsets = pd.concat([allsets, out],axis=1)


In [211]:
TicketDict = {
    "A./5.": "A",
    "A.5.": "A",
    "A/4": "A",
    "A/4.": "A",
    "A/5": "A",
    "A/5.": "A",
    "A/S": "A",
    "A4.": "A",
    "C": "C",
    "C.A.":"C",
    "CA.": "C",
    "C.A./": "C",
    "CA": "C",
    "F.C.": "FC",
    "F.C.C.": "FC",
    "LINE":"LINE",
    "PC": "PC",
    "PP": "PC",
    "SOTON/O.Q.": "SO",
    "SOTON/O2": "SO",
    "SOTON/OQ": "SO",
    "STON/O" : "ST",
    "STON/O2.": "ST"
}

def splitTic(s):

    match = re.match(r"([0-9]+)", s, re.I)

    try:
        number = int(match.group(1))
    except:
        number = s

    return number

def SP(s, TicketDict):
    s = splitTic(s)
    if isinstance(s, (int)):
       if s <= 9999:
          tic = 'a'
       if ((s >= 10000) and (s < 99999)):
          tic = 'b'
       if ((s >= 100000) and (s < 999999)):
          tic = 'c'
       else:
          tic = 'd'
    else:
       s = s.split(' ')
      
       tic = [t for k, t in TicketDict.items() if str(k) in s]
      
       if tic == []:
        tic = 'Other'
       else:
        # Title is a list, so extract contents
        tic = tic[0]
      
    return tic

# out = allsets['Ticket'].apply(splitTic, args=[TicketDict])
out = allsets['Ticket'].apply(SP, args = [TicketDict])
out = out.apply(pd.Series)
out.columns = ['Tickettype']

allsets = pd.concat([allsets,out],axis=1)



In [212]:
# def Farelevel(s):
#     if s <= 10:
#        fee = 'aa'
#     elif s <= 80:
#        fee = 'ee'
#     elif s <= 200:
#        fee = 'gg'
#     else:
#        fee = 'hh'
#     return fee

# out = allsets['Fare'].apply(Farelevel)
# out = out.apply(pd.Series)
# out.columns = ['Fee']

# allsets = pd.concat([allsets, out],axis=1)

In [213]:
catCols = ['Sex', 'Embarked', 'Cabletter', 'Cabnumber', 'Title', 'Tickettype']
# catCols = ['Sex', 'Embarked', 'Cabletter', 'Cabnumber', 'Surname', 'Title', 'Tickettype','Fee']

for c in catCols:
    allsets[c] = pd.factorize(allsets[c])[0]

# Replace missing age value with median
allsets['Age'] = allsets['Age'].fillna(allsets['Age'].median())
allsets['Fare'] = allsets['Fare'].fillna(allsets['Fare'].median())
allsets = allsets.drop(['Ticket', 'Cabin', 'Name','PassengerId'],axis=1)

In [216]:
# Split datasets
train = allsets.iloc[0:TrainRow,:]
test = allsets.iloc[TrainRow:,:]
test = test.drop(['Survived'],axis=1)

In [219]:
# Validation set 
X_trainData, X_validData, y_train, y_valid = train_test_split(train.drop(['Survived'],axis=1),
                                             train.Survived, 
                                             test_size = 0.3, 
                                             stratify=train.Survived,
                                             random_state=28)

LightGBM

In [220]:
lgbmodel = lgb.LGBMClassifier(
          boosting_type = 'gbdt', 
          max_depth = 10, 
          objective = 'binary',
          nthread = 3, 
          num_leaves = 31, 
          learning_rate =  0.1, 
          max_bin = 512, 
          subsample_for_bin = 200,
          subsample = 0.75,
          subsample_freq = 1,
          colsample_bytree = 0.65, 
          reg_alpha = 3, 
          reg_lambda = 3, 
          min_split_gain = 0.5,
          min_child_weight = 1,
          min_child_samples = 2,
          scale_pos_weight = 1,
          num_class = 1,
          metric = 'binary_error',
          n_estimators = 200)

lgbmodel.fit(X_trainData, y_train, eval_set=[(X_trainData, y_train), (X_validData, y_valid)], eval_metric='error',early_stopping_rounds=50)
 

[1]	training's binary_error: 0.383628	valid_1's binary_error: 0.384328
Training until validation scores don't improve for 50 rounds.
[2]	training's binary_error: 0.383628	valid_1's binary_error: 0.384328
[3]	training's binary_error: 0.383628	valid_1's binary_error: 0.384328
[4]	training's binary_error: 0.268058	valid_1's binary_error: 0.309701
[5]	training's binary_error: 0.216693	valid_1's binary_error: 0.272388
[6]	training's binary_error: 0.205457	valid_1's binary_error: 0.257463
[7]	training's binary_error: 0.194222	valid_1's binary_error: 0.25
[8]	training's binary_error: 0.17817	valid_1's binary_error: 0.231343
[9]	training's binary_error: 0.179775	valid_1's binary_error: 0.227612
[10]	training's binary_error: 0.165329	valid_1's binary_error: 0.208955
[11]	training's binary_error: 0.163724	valid_1's binary_error: 0.208955
[12]	training's binary_error: 0.17817	valid_1's binary_error: 0.220149
[13]	training's binary_error: 0.17817	valid_1's binary_error: 0.220149
[14]	training's bi

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.65,
               importance_type='split', learning_rate=0.1, max_bin=512,
               max_depth=10, metric='binary_error', min_child_samples=2,
               min_child_weight=1, min_split_gain=0.5, n_estimators=200,
               n_jobs=-1, nthread=3, num_class=1, num_leaves=31,
               objective='binary', random_state=None, reg_alpha=3, reg_lambda=3,
               scale_pos_weight=1, silent=True, subsample=0.75,
               subsample_for_bin=200, subsample_freq=1)

In [221]:
lgbpredict = lgbmodel.predict(test)
lgbpredict

array([0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
       0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

XGBoost

In [222]:
xgbmodel = xgb.XGBClassifier( 
            max_depth = 5,
            learning_rate = 0.5,
            n_estimators = 100,
            silent = True,
            objective = 'binary:logistic',
            nthread = -1,
            gamma = 0,
            min_child_weight = 5,
            max_delta_step = 0,
            subsampl = 0.7,
            colsample_bytre = 0.5,
            colsample_bylevel = 1,
            reg_alpha = 5,
            reg_lambda = 3,
            scale_pos_weight = 1,
            seed = 28,
            missing = None
)
xgbmodel.fit(X_trainData, y_train, eval_set=[(X_trainData, y_train), (X_validData, y_valid)], eval_metric='error',early_stopping_rounds=50)


[0]	validation_0-error:0.17496	validation_1-error:0.220149
Multiple eval metrics have been passed: 'validation_1-error' will be used for early stopping.

Will train until validation_1-error hasn't improved in 50 rounds.
[1]	validation_0-error:0.17496	validation_1-error:0.220149
[2]	validation_0-error:0.17496	validation_1-error:0.220149
[3]	validation_0-error:0.152488	validation_1-error:0.186567
[4]	validation_0-error:0.152488	validation_1-error:0.186567
[5]	validation_0-error:0.141252	validation_1-error:0.182836
[6]	validation_0-error:0.128411	validation_1-error:0.186567
[7]	validation_0-error:0.134831	validation_1-error:0.190299
[8]	validation_0-error:0.120385	validation_1-error:0.190299
[9]	validation_0-error:0.123596	validation_1-error:0.197761
[10]	validation_0-error:0.123596	validation_1-error:0.190299
[11]	validation_0-error:0.117175	validation_1-error:0.190299
[12]	validation_0-error:0.110754	validation_1-error:0.190299
[13]	validation_0-error:0.110754	validation_1-error:0.19029

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytre=0.5, colsample_bytree=1,
              gamma=0, learning_rate=0.5, max_delta_step=0, max_depth=5,
              min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
              nthread=-1, objective='binary:logistic', random_state=0,
              reg_alpha=5, reg_lambda=3, scale_pos_weight=1, seed=28,
              silent=True, subsampl=0.7, subsample=1, verbosity=1)

In [223]:
xgbpredict = xgbmodel.predict(test)
xgbpredict

array([0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 0., 1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       1., 1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.,
       0., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

GBDT

In [232]:
gbdt = GradientBoostingClassifier(subsample=0.7,learning_rate=0.03,
                                  random_state=28, n_estimators=100,
                                  min_samples_leaf=3)
gbdt.fit(X_trainData, y_train)
train_pred = gbdt.predict(X_trainData)
valid_pred = gbdt.predict(X_validData)

acc_train = gbdt.score(X_trainData, y_train)
acc_valid = gbdt.score(X_validData, y_valid)

print(acc_train)
print(acc_valid)

0.8908507223113965
0.8134328358208955


In [225]:
gbdtpredict = gbdt.predict(test)
gbdtpredict

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 0., 0., 0., 1., 0., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0.,
       1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.,
       0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1.,
       1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 0.

RF

In [226]:
rf = RandomForestClassifier(oob_score=True, max_depth=10, random_state=28, n_estimators=100, max_features=None)
rf.fit(train.drop(['Survived'],axis=1), train['Survived'])
print(rf.oob_score_)

0.8361391694725028


In [234]:
# lr = LogisticRegression()
# sclf = StackingClassifier(classifiers=[lgbmodel, xgbmodel, gbdt, rf], 
#                           meta_classifier=lr)
# sclf.fit(X_trainData, y_train)


# print('5-fold cross validation:\n')
 
# for basemodel, label in zip([lgbmodel, xgbmodel, gbdt, rf, sclf], 
#                       ['lgb', 
#                        'xgb', 
#                        'gbdt',
#                        'Random Forest',
#                        'StackingClassifier']):
 
#     scores = cross_val_score(basemodel, X_validData, y_valid, cv=5, scoring='accuracy')
#     print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))


5-fold cross validation:

Accuracy: 0.81 (+/- 0.05) [lgb]
Accuracy: 0.79 (+/- 0.05) [xgb]
Accuracy: 0.80 (+/- 0.05) [gbdt]
Accuracy: 0.77 (+/- 0.06) [Random Forest]
Accuracy: 0.77 (+/- 0.06) [StackingClassifier]


In [228]:
_N_FOLDS = 5  # 采用5折交叉验证
kf = KFold(n_splits=_N_FOLDS, random_state=42)  # sklearn的交叉验证模块，用于划分数据


def get_oof(clf, X_train, y_train, X_test):
    # X_train: 1000 * 10
    # y_train: 1 * 1000
    # X_test : 500 * 10
    oof_train = np.zeros((X_train.shape[0], 1))  # 1000 * 1  Stacking后训练数据的输出
    oof_test_skf = np.empty((_N_FOLDS, X_test.shape[0], 1))  # 5 * 500 * 1，oof_test_skf[i]代表第i折交叉验证产生的模型对测试集预测结果

    for i, (train_index, test_index) in enumerate(kf.split(X_train)): # 交叉验证划分此时的训练集和验证集
        kf_X_train = X_train.loc[train_index]  # 800 * 10 训练集
        kf_y_train = y_train.loc[train_index]  # 1 * 800 训练集对应的输出
        kf_X_val = X_train.loc[test_index]  # 200 * 10  验证集

        clf.fit(kf_X_train, kf_y_train)  # 当前模型进行训练

        oof_train[test_index] = clf.predict(kf_X_val).reshape(-1, 1)  # 对当前验证集进行预测， 200 * 1
        oof_test_skf[i, :] = clf.predict(X_test).reshape(-1, 1)  # 对测试集预测 oof_test_skf[i, :] : 500 * 1

    oof_test = oof_test_skf.mean(axis=0)  # 对每一则交叉验证的结果取平均
    return oof_train, oof_test  # 返回当前分类器对训练集和测试集的预测结果



In [229]:
# 将你的每个分类器都调用get_oof函数，并把它们的结果合并，就得到了新的训练和测试数据new_train,new_test
finaltrain = train.drop(['Survived'],axis=1)
finallabel = train.Survived
# finaltest = np.array(test)
new_train, new_test = [], []
for clf in [lgbmodel, xgbmodel,gbdt,rf]:
    oof_train, oof_test = get_oof(clf, finaltrain, finallabel, test)
    new_train.append(oof_train)
    new_test.append(oof_test)

new_train = np.concatenate(new_train, axis=1)
new_test = np.int32(np.concatenate(new_test, axis=1) > 0.5)

# 用新的训练数据new_train作为新的模型的输入，stacking第二层
clf = RandomForestClassifier()
clf.fit(new_train, finallabel)
predict = clf.predict(new_test)


In [None]:
predict

In [None]:
submission = pd.DataFrame()
submission['PassengerId'] = testRaw['PassengerId']
submission['Survived'] = np.int32(predict >= 0.5)

In [None]:
submission

In [None]:
submission.to_csv('Titanic.csv',index=False)
from google.colab import files
files.download('Titanic.csv')