In [None]:
# !pip install kaggle
# from google.colab import files
# files.upload()
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle competitions download -c titanic
# !ls

In [None]:
# The usuals
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Regular expressions
import re

# LightGBM\XGBoost\GBDT
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.model_selection import 

# Sklearn tools for model training and assessment
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn import metrics


In [None]:
# Read the Kaggle Titanic training and test splits from the working directory.
trainRaw, testRaw = (pd.read_csv(csvName) for csvName in ('train.csv', 'test.csv'))

# Stack both splits so the feature engineering below is applied to them
# uniformly. TrainRow records how many leading rows belong to the training
# split so the two can be separated again afterwards.
TrainRow = len(trainRaw)
allsets = pd.concat([trainRaw, testRaw], sort=True, axis=0)

In [None]:
# Build new features from Cabin
# Separate Cabin into letter and number for passengers who have a cabin
def CabSplit(s):
    """
    Split a single cabin code such as "C85" into its letter part and number part.

    Parameters
    ----------
    s : str
        One cabin token without spaces, e.g. "C85", "F" or "".

    Returns
    -------
    tuple
        (letter, number). letter is the leading run of letters, or '' when the
        token does not start with a letter. number is the digit string that
        directly follows the letters, or the integer sentinel 9999 when there
        are no such digits.
    """
    # The digit group is optional so that letter-only cabins (e.g. "F") keep
    # their letter instead of being reported as "no letter".
    match = re.match(r"([a-z]+)([0-9]*)", s, re.I)  # re.I: case-insensitive
    if match is None:
        # Token does not start with a letter: nothing can be extracted.
        return '', 9999

    letter, number = match.groups()
    # An empty digit group means the cabin has a letter but no number.
    return letter, (number if number else 9999)

# Count the number of cabins
def DR(s):
    """
    Derive cabin features from one raw Cabin cell.

    Returns a list [letter, number, nCabins]. A numeric cell (int or float)
    yields ['', '', 9999]. Any other cell is split on spaces: nCabins is the
    number of pieces, and the first piece is handed to CabSplit to obtain the
    letter and number.
    """
    # A numeric value (rather than a string) is treated as "no cabin recorded";
    # missing cells are presumably read as float NaN.
    if isinstance(s, (int, float)):
        return ['', '', 9999]

    # Some cells list several cabins separated by spaces.
    cabins = s.split(' ')

    # Only the first listed cabin is used for the letter/number features.
    letter, number = CabSplit(cabins[0])
    return [letter, number, len(cabins)]

# Run DR on every Cabin cell, then expand each [letter, number, count] result
# into its own column.
out = allsets['Cabin'].apply(DR).apply(pd.Series)
out.columns = ['Cabletter', 'Cabnumber', 'Cabcount']

# Append the three cabin features to the combined dataset.
allsets = pd.concat([allsets, out], axis=1)


In [None]:
# Add some family features directly to new columns in the dataset

# Family size: SibSp + Parch + 1
allsets['fsize'] = allsets['SibSp'] + allsets['Parch'] + 1

# Ratio of (Parch + 1) to (SibSp + 1)
allsets['fRatio'] = (allsets['Parch'] + 1)/(allsets['SibSp'] + 1)

# Adult flag. Age still contains missing values at this point, and NaN > 18
# evaluates to False, which would silently mark every passenger of unknown age
# as a non-adult. Compare against median-filled ages instead, matching the
# median fill the notebook applies to the Age column itself further down.
allsets['Adult'] = allsets['Age'].fillna(allsets['Age'].median()) > 18

In [None]:
# Maps a title token found in a passenger name to a coarser title group.
# Passed to NameSplit, which reports 'other' when none of these keys matches.
titleDict = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Sir",
    "Don": "Sir",
    "Sir": "Sir",
    "Dr": "Dr",
    "Rev": "Rev",
    "theCountess": "Lady",
    "Dona": "Lady",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Lady"
}

def NameSplit(s, titleDict):
    """
    Extract the surname and a normalised title from a passenger name.

    Names are expected in the form "Surname, Title. Given names". The surname
    is everything before the first comma (so multi-word surnames such as
    "Oliva y Ocana" are kept whole). The title is the text between that comma
    and the next full stop, with internal whitespace removed before the
    lookup, so "the Countess" is looked up as "theCountess".

    Parameters
    ----------
    s : str
        Raw passenger name.
    titleDict : dict
        Maps raw title strings to the title group to report.

    Returns
    -------
    tuple
        (surname, title). title is 'other' when the raw title is not a key of
        titleDict.
    """
    surname, comma, remainder = s.partition(',')

    if not comma:
        # No "Surname, ..." structure: fall back to the first word as the
        # surname and to the first word that is a known title.
        words = s.replace('.', '').split()
        surname = words[0] if words else ''
        title = next((titleDict[w] for w in words if w in titleDict), 'other')
        return surname, title

    # Taking the title by position (right after the comma) avoids picking up a
    # given name that merely happens to equal one of the title keys.
    rawTitle = ''.join(remainder.split('.', 1)[0].split())
    return surname.strip(), titleDict.get(rawTitle, 'other')

# Derive (Surname, Title) for every passenger and append them as new columns.
out = allsets['Name'].apply(NameSplit, args=(titleDict,)).apply(pd.Series)
out.columns = ['Surname', 'Title']
allsets = pd.concat([allsets, out], axis=1)


In [None]:
# Maps a ticket prefix token to a coarse ticket-type code.
# Passed to SP, which looks these keys up among the space-separated pieces of
# tickets that do not start with a number and reports 'Other' when none matches.
TicketDict = {
    "A./5.": "A",
    "A.5.": "A",
    "A/4": "A",
    "A/4.": "A",
    "A/5": "A",
    "A/5.": "A",
    "A/S": "A",
    "A4.": "A",
    "C": "C",
    "C.A.":"C",
    "CA.": "C",
    "C.A./": "C",
    "CA": "C",
    "F.C.": "FC",
    "F.C.C.": "FC",
    "LINE":"LINE",
    "PC": "PC",
    "PP": "PC",
    "SOTON/O.Q.": "SO",
    "SOTON/O2": "SO",
    "SOTON/OQ": "SO",
    "STON/O" : "ST",
    "STON/O2.": "ST"
}

def splitTic(s):
    """
    Return the leading number of a ticket as an int, or the ticket unchanged.

    Parameters
    ----------
    s : str
        Raw ticket string.

    Returns
    -------
    int or str
        The run of digits at the very start of s converted to int, or s itself
        when the ticket does not start with a digit.
    """
    match = re.match(r"[0-9]+", s)
    if match is None:
        return s
    return int(match.group(0))


def SP(s, TicketDict):
    """
    Classify a ticket into a coarse ticket type.

    Tickets that start with a number are bucketed by the size of that number:
    'a' up to 4 digits, 'b' for 5 digits, 'c' for 6 digits, 'd' for 7 or more.
    Other tickets are split on spaces and matched against the keys of
    TicketDict; the value of the first matching key (in dict order) is
    returned, or 'Other' when no key matches.

    Parameters
    ----------
    s : str
        Raw ticket string.
    TicketDict : dict
        Maps ticket prefix tokens to ticket-type codes.

    Returns
    -------
    str
        The ticket-type code.
    """
    s = splitTic(s)

    if isinstance(s, int):
        # Guard clauses keep each bucket exclusive. A chain of independent
        # `if` statements ending in a single `else` would let that `else`
        # overwrite the smaller buckets with 'd'.
        if s <= 9999:
            return 'a'
        if s <= 99999:
            return 'b'
        if s <= 999999:
            return 'c'
        return 'd'

    pieces = s.split(' ')
    for key, ticketType in TicketDict.items():
        if str(key) in pieces:
            return ticketType
    return 'Other'

# Classify every ticket with SP and append the result as the Tickettype column.
out = allsets['Ticket'].apply(SP, args=(TicketDict,)).apply(pd.Series)
out.columns = ['Tickettype']
allsets = pd.concat([allsets, out], axis=1)


In [None]:
# def Farelevel(s):
#     if s <= 10:
#        fee = 'aa'
#     elif s <= 20:
#        fee = 'bb'
#     elif s <= 30:
#        fee = 'cc'
#     elif s <= 40:
#        fee = 'dd'
#     elif s <= 80:
#        fee = 'ee'
#     elif s <= 100:
#        fee = 'ff'
#     elif s <= 200:
#        fee = 'gg'
#     else:
#        fee = 'hh'
#     return fee

# out = allsets['Fare'].apply(Farelevel)
# out = out.apply(pd.Series)
# out.columns = ['Fee']

# allsets = pd.concat([allsets, out],axis=1)

In [None]:
# Categorical columns to integer-encode.
# (Append 'Fee' here if the currently disabled fare-level feature is re-enabled.)
catCols = [
    'Sex', 'Embarked', 'Cabletter', 'Cabnumber',
    'Surname', 'Title', 'Tickettype',
]

# Replace each categorical column by the integer codes produced by factorize.
for c in catCols:
    allsets[c] = allsets[c].factorize()[0]

# Fill missing ages with the median age of the combined dataset.
allsets['Age'] = allsets['Age'].fillna(allsets['Age'].median())


In [None]:
# Display the engineered feature table (training and test rows combined).
allsets

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Cabletter,Cabnumber,Cabcount,fsize,fRatio,Adult,Surname,Title,Tickettype
0,22.0,,0,7.2500,"Braund, Mr. Owen Harris",0,1,3,0,1,0.0,A/5 21171,0,0,9999,2,0.5,True,0,0,0
1,38.0,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1,1,1.0,PC 17599,1,1,1,2,0.5,True,1,1,1
2,26.0,,0,7.9250,"Heikkinen, Miss. Laina",0,3,3,1,0,1.0,STON/O2. 3101282,0,0,9999,1,1.0,True,2,2,2
3,35.0,C123,0,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1,1,1.0,113803,1,2,1,2,0.5,True,3,1,3
4,35.0,,0,8.0500,"Allen, Mr. William Henry",0,5,3,0,0,0.0,373450,0,0,9999,1,1.0,True,4,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,28.0,,0,8.0500,"Spector, Mr. Woolf",0,1305,3,0,0,,A.5. 3236,0,0,9999,1,1.0,False,863,0,0
414,39.0,C105,1,108.9000,"Oliva y Ocana, Dona. Fermina",0,1306,1,1,0,,PC 17758,1,105,1,1,1.0,True,864,8,1
415,38.5,,0,7.2500,"Saether, Mr. Simon Sivertsen",0,1307,3,0,0,,SOTON/O.Q. 3101262,0,0,9999,1,1.0,True,865,0,7
416,28.0,,0,8.0500,"Ware, Mr. Frederick",0,1308,3,0,0,,359309,0,0,9999,1,1.0,False,810,0,3


In [None]:
# Undo the earlier concatenation: the first TrainRow rows are the training
# split, everything after them is the test split.
train = allsets.iloc[:TrainRow]
test = allsets.iloc[TrainRow:]

# Prepare data
def prepGBX(data, classCol='', fDrop=[]):
    """
    Build the XGBoost inputs for one dataset.

    Parameters
    ----------
    data : DataFrame
        Feature table, optionally still containing the label column.
    classCol : str
        Name of the label column; '' when the dataset has no labels.
    fDrop : list
        Names of columns to remove before building the DMatrix.

    Returns
    -------
    tuple
        (xgbData, labels, data): the xgb.DMatrix built from the remaining
        columns with the labels attached; the labels themselves (the classCol
        column, or [] when classCol is ''); and the table after the drops.
    """
    hasLabel = classCol != ''

    # With a label column, keep the labels aside and drop that column from the
    # features along with fDrop; without one, the labels are an empty list.
    labels = data[classCol] if hasLabel else []
    dropCols = (fDrop + [classCol]) if hasLabel else fDrop

    if dropCols != []:
        data = data.drop(dropCols, axis=1)

    # DMatrix holding the remaining feature columns plus the labels.
    xgbData = xgb.DMatrix(data, label=labels)
    return xgbData, labels, data

# Columns removed from every dataset before it is turned into model input
# (passed to prepGBX as its fDrop argument).
fDrop = ['Ticket', 'Cabin', 'Name']

In [None]:

# Hold out 30% of the labelled rows, stratified on Survived, as a validation
# set for early stopping.
trainData, validData = train_test_split(
    train,
    test_size=0.3,
    stratify=train.Survived,
    random_state=28,
)

# Every split is prepared the same way: Survived is the label column and the
# fDrop columns are removed from the features.
trainDataxgb, trainLabels, trainData = prepGBX(trainData, classCol='Survived', fDrop=fDrop)
validDataxgb, validLabels, validData = prepGBX(validData, classCol='Survived', fDrop=fDrop)

# Test rows: Survived is passed as classCol here as well, which also removes
# it from the feature columns; the returned labels are discarded.
testDataxgb, _, testData = prepGBX(test, classCol='Survived', fDrop=fDrop)

# The complete labelled training set (no validation rows held out).
allTrainDataxgb, allTrainLabels, allTrainData = prepGBX(train, classCol='Survived', fDrop=fDrop)


In [None]:
# Display the prepared training features (label and dropped columns removed).
trainData

Unnamed: 0,Age,Embarked,Fare,Parch,PassengerId,Pclass,Sex,SibSp,Cabletter,Cabnumber,Cabcount,fsize,fRatio,Adult,Surname,Title,Tickettype
520,30.0,0,93.5000,0,521,1,1,0,6,69,1,1,1.00,True,424,2,4
241,28.0,2,15.5000,0,242,3,1,1,0,0,9999,2,0.50,False,211,2,3
88,23.0,0,263.0000,2,89,1,1,3,1,7,3,6,0.75,True,26,2,4
20,35.0,0,26.0000,0,21,2,0,0,0,0,9999,1,1.00,True,20,0,3
874,28.0,1,24.0000,0,875,2,1,1,0,0,9999,2,0.50,True,266,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,42.0,0,8.4042,1,198,3,0,0,0,0,9999,2,2.00,True,140,0,4
224,38.0,0,90.0000,0,225,1,0,1,1,33,1,2,0.50,True,198,0,4
522,28.0,1,7.2250,0,523,3,0,0,0,0,9999,1,1.00,False,426,0,4
512,36.0,0,26.2875,0,513,1,0,0,2,68,1,1,1.00,True,416,0,1


In [None]:
#trainDataxgb 的数据类型： <xgboost.core.DMatrix at 0x7faea350d550>

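确定学习速率和tree_base参数调优的估计器数目 (Determine the learning rate and the number of estimators to use when tuning the tree-based parameters)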

In [None]:
# Baseline XGBoost classifier with default starting values; this is the
# estimator handed to the grid search.
# The deprecated sklearn-wrapper arguments silent / nthread / seed are replaced
# by verbosity / n_jobs / random_state. `missing` is left at its default, which
# treats NaN as the missing-value marker.
xgbmodel = xgb.XGBClassifier(
            max_depth=10,
            learning_rate=0.01,
            n_estimators=100,
            verbosity=0,
            objective='binary:logistic',
            n_jobs=-1,
            gamma=0,
            min_child_weight=1,
            max_delta_step=0,
            subsample=0.85,
            colsample_bytree=0.7,
            colsample_bylevel=1,
            reg_alpha=0,
            reg_lambda=1,
            scale_pos_weight=1,
            random_state=28)


In [None]:
 # Booster parameters for the native xgb.train API.
 # 'n_estimators' and 'missing' are intentionally not listed: they are not
 # booster parameters (the number of rounds is passed to xgb.train itself, and
 # the missing-value marker belongs to DMatrix), so xgb.train would ignore them.
 # 'verbosity': 0 replaces the deprecated 'silent': True.
 params = {
            'max_depth':10,
            'eta':0.01,
            'verbosity':0,
            'objective':'binary:logistic',
            'nthread':-1,
            'gamma':0,
            'min_child_weight':1,
            'max_delta_step':0,
            'subsample':0.85,
            'colsample_bytree':0.7,
            'colsample_bylevel':1,
            'alpha':0,
            'lambda':1,
            'scale_pos_weight':1,
            'seed':28,
 }

In [None]:
# Hyper-parameter grid for GridSearchCV, keyed by XGBClassifier argument names.
# The full grid has 3 * 3 * 4 * 2 * 3 * 3 * 3 = 1944 combinations, so an
# exhaustive search is expensive (each combination is fitted once per CV fold).
gridparams = {
              'max_depth': [3, 5, 8],
              'learning_rate': [0.01, 0.05, 0.1],
              'min_child_weight': [0, 1, 2, 5],
              'subsample': [0.7, 0.8],
              'colsample_bytree': [0.5, 0.7, 1],
              'reg_alpha': [0, 0.5, 1],
              'reg_lambda': [0, 0.5, 1],
}

In [None]:
# Exhaustive search over gridparams, scored by accuracy with 4-fold
# cross-validation.
grid = GridSearchCV(
    estimator=xgbmodel,
    param_grid=gridparams,
    scoring='accuracy',
    cv=4,
    verbose=0,
)

# Fit on the complete labelled training set.
grid.fit(allTrainData, allTrainLabels)

# Report the winning parameter combination and its cross-validated score.
print(grid.best_params_)
print(grid.best_score_)


{'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 8, 'min_child_weight': 5, 'reg_alpha': 0, 'reg_lambda': 0.5, 'subsample': 0.8}
0.823814285137155


In [None]:
# Copy the tuned values into the native-API parameter dict. Three of them are
# named differently there: learning_rate -> eta, reg_alpha -> alpha,
# reg_lambda -> lambda.
params.update({
    nativeName: grid.best_params_[sklearnName]
    for nativeName, sklearnName in [
        ('max_depth', 'max_depth'),
        ('eta', 'learning_rate'),
        ('min_child_weight', 'min_child_weight'),
        ('subsample', 'subsample'),
        ('colsample_bytree', 'colsample_bytree'),
        ('alpha', 'reg_alpha'),
        ('lambda', 'reg_lambda'),
    ]
})

print('Fitting with params: ')
print(params)

Fitting with params: 
{'max_depth': 8, 'eta': 0.01, 'n_estimators': 200, 'silent': True, 'objective': 'binary:logistic', 'nthread': -1, 'gamma': 0, 'min_child_weight': 5, 'max_delta_step': 0, 'subsample': 0.8, 'colsample_bytree': 0.5, 'colsample_bylevel': 1, 'alpha': 0, 'lambda': 0.5, 'scale_pos_weight': 1, 'seed': 28, 'missing': None}


In [None]:
# Train with the native API for up to num_round rounds, reporting the metric
# on both sets each round and stopping early once there has been no
# improvement for 50 rounds.
evallist = [(trainDataxgb, 'train'), (validDataxgb, 'eval')]
num_round = 500
xgbm = xgb.train(
    params,
    trainDataxgb,
    num_boost_round=num_round,
    evals=evallist,
    early_stopping_rounds=50,
    verbose_eval=True,
)


[0]	train-error:0.191011	eval-error:0.19403
Multiple eval metrics have been passed: 'eval-error' will be used for early stopping.

Will train until eval-error hasn't improved in 50 rounds.
[1]	train-error:0.18138	eval-error:0.19403
[2]	train-error:0.17175	eval-error:0.208955
[3]	train-error:0.166934	eval-error:0.216418
[4]	train-error:0.160514	eval-error:0.208955
[5]	train-error:0.160514	eval-error:0.190299
[6]	train-error:0.17175	eval-error:0.19403
[7]	train-error:0.163724	eval-error:0.19403
[8]	train-error:0.168539	eval-error:0.190299
[9]	train-error:0.166934	eval-error:0.190299
[10]	train-error:0.168539	eval-error:0.190299
[11]	train-error:0.162119	eval-error:0.190299
[12]	train-error:0.163724	eval-error:0.190299
[13]	train-error:0.160514	eval-error:0.190299
[14]	train-error:0.162119	eval-error:0.190299
[15]	train-error:0.160514	eval-error:0.186567
[16]	train-error:0.157303	eval-error:0.186567
[17]	train-error:0.155698	eval-error:0.182836
[18]	train-error:0.152488	eval-error:0.19029

In [None]:
# Score the test rows with the trained booster.
# NOTE(review): training used early stopping; whether predict() uses only the
# best iteration or every boosted round depends on the xgboost version — confirm.
pred = xgbm.predict(testDataxgb)

In [None]:
# Build the submission table: one row per test passenger, with the predicted
# score thresholded at 0.5 and stored as a 32-bit integer 0/1 flag.
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': (pred >= 0.5).astype(np.int32),
})

In [None]:
# Display the submission table as a final check before exporting it.
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
# Write the submission file to the working directory.
submission.to_csv('Titanic.csv',index=False)

# Offer the file as a browser download when running in Google Colab. Outside
# Colab the google.colab package is unavailable; the CSV then simply stays on
# disk instead of the cell failing after the file has been written.
try:
    from google.colab import files
except ImportError:
    print('google.colab not available; Titanic.csv was saved locally.')
else:
    files.download('Titanic.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>