In [None]:
# !pip install kaggle
# from google.colab import files
# files.upload()
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle competitions download -c titanic
# !ls

Imports

In [None]:
# The usuals
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Regular expressions
import re

# LightGBM\XGBoost\GBDT
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.model_selection import 

# Sklearn tools for model training and assessment
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc, accuracy_score


Input Data

In [None]:
trainRaw = pd.read_csv('train.csv')
testRaw = pd.read_csv('test.csv')

# Concatenate together
TrainRow = trainRaw.shape[0]
allsets = pd.concat([trainRaw, testRaw], axis=0,sort=True)

In [None]:
allsets

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.2500,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.9250,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.0500,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450
...,...,...,...,...,...,...,...,...,...,...,...,...
413,,,S,8.0500,"Spector, Mr. Woolf",0,1305,3,male,0,,A.5. 3236
414,39.0,C105,C,108.9000,"Oliva y Ocana, Dona. Fermina",0,1306,1,female,0,,PC 17758
415,38.5,,S,7.2500,"Saether, Mr. Simon Sivertsen",0,1307,3,male,0,,SOTON/O.Q. 3101262
416,,,S,8.0500,"Ware, Mr. Frederick",0,1308,3,male,0,,359309


Create new features

1.Cabin

In [None]:
# Build new features from Cabin
# Seperate Canbin to Letter and Number for who has Canbin
def CabSplit(s):
    """
    Function to try and extract cabin letter and number from the cabin column.
    Runs a regular expression that finds letters and numbers in the
    string. These are held in match.group, if they exist.
    """
    match = re.match(r"([a-z]+)([0-9]+)",s,re.I) #re.I 使匹配对大小写不敏感

    try:
        letter = match.group(1) #满足([a-z]+)的部分
    except:
        letter = ''

    try:
        number = match.group(2) #满足([0-9]+)的部分
    except:
        number = 9999
    return letter, number

# Count the number of Canbin
def DR(s):
    """
    From the cabin string, try and extract letter, number, and number of cabins
    """
    # Check contents
    if isinstance(s, (int,float)): 
      # 若s是int或者float(只有数字形式) 
      # 说明没有Cabin信息 为空 
       letter = ''
       number = ''
       nCabins = 9999
    else:
       # If field isn't empty, split string to letter and unmber. 
       # Some strings contain multiple cabins.
       s = s.split(' ')
       # Count the cabins based on number of splits
       nCabins = len(s)
       # Just take first cabin for letter/number extraction
       s = s[0]

       letter, number = CabSplit(s)

    return [letter, number, nCabins]

# Apply DR function to each cell in Cabin column using pandas apply method.
out = allsets['Cabin'].apply(DR)

# Output tuple with 3 values for each row, convert this to pandas df
out = out.apply(pd.Series)

# name the columns
out.columns = ['Cabletter','Cabnumber','Cabcount']

# Concatenate these columns to the dataset
allsets = pd.concat([allsets,out],axis = 1)      


In [None]:
# allsets

2.Family & Age

In [None]:
# Add some family features directly to new columns in the dataset

#size
allsets['fsize'] = allsets['SibSp'] + allsets['Parch'] + 1

#Ratio
allsets['fRatio'] = (allsets['Parch'] + 1)/(allsets['SibSp'] + 1)

#Adult
allsets['Adult'] = allsets['Age'] > 18

In [None]:
# allsets

3.Name

In [None]:
# Extract titles from Name column, standardise

In [None]:
titleDict = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Sir",
    "Don": "Sir",
    "Sir": "Sir",
    "Dr": "Dr",
    "Rev": "Rev",
    "theCountess": "Lady",
    "Dona": "Lady",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Lady"
}

def NameSplit(s, titleDict):
    """
    Extract title from name, replace with value in title dictionary. Also
    return surname.
    """

    # Remove '.' from name string
    s = s.replace('.','')
    # Split on spaces
    s = s.split(' ')
    # get surname
    surname = s[0]

    #get title - loop over titleDict
    #if s matches a key, take the corresponding value as the tile
    title = []
    for k, t in titleDict.items():
      if str(k) in s:
        title.append(t)
        
    if title == []:
        title = 'other'
    else:
        # Title is a list, so extract contents
        title = title[0]
    return surname.strip(','), title

out = allsets['Name'].apply(NameSplit,args=[titleDict])

out = out.apply(pd.Series)
out.columns = ['Surname','Title']
allsets = pd.concat([allsets, out],axis=1)


In [None]:
allsets

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Cabletter,Cabnumber,Cabcount,fsize,fRatio,Adult,Surname,Title
0,22.0,,S,7.2500,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,,,9999,2,0.5,True,Braund,Mr
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,C,85,1,2,0.5,True,Cumings,Mrs
2,26.0,,S,7.9250,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,,,9999,1,1.0,True,Heikkinen,Miss
3,35.0,C123,S,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,C,123,1,2,0.5,True,Futrelle,Mrs
4,35.0,,S,8.0500,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,,,9999,1,1.0,True,Allen,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,,,S,8.0500,"Spector, Mr. Woolf",0,1305,3,male,0,,A.5. 3236,,,9999,1,1.0,False,Spector,Mr
414,39.0,C105,C,108.9000,"Oliva y Ocana, Dona. Fermina",0,1306,1,female,0,,PC 17758,C,105,1,1,1.0,True,Oliva,Lady
415,38.5,,S,7.2500,"Saether, Mr. Simon Sivertsen",0,1307,3,male,0,,SOTON/O.Q. 3101262,,,9999,1,1.0,True,Saether,Mr
416,,,S,8.0500,"Ware, Mr. Frederick",0,1308,3,male,0,,359309,,,9999,1,1.0,False,Ware,Mr


4.Tickets

In [None]:
TicketDict = {
    "A./5.": "A",
    "A.5.": "A",
    "A/4": "A",
    "A/4.": "A",
    "A/5": "A",
    "A/5.": "A",
    "A/S": "A",
    "A4.": "A",
    "C": "C",
    "C.A.":"C",
    "CA.": "C",
    "C.A./": "C",
    "CA": "C",
    "F.C.": "FC",
    "F.C.C.": "FC",
    "LINE":"LINE",
    "PC": "PC",
    "PP": "PC",
    "SOTON/O.Q.": "SO",
    "SOTON/O2": "SO",
    "SOTON/OQ": "SO",
    "STON/O" : "ST",
    "STON/O2.": "ST"
}

def splitTic(s):

    match = re.match(r"([0-9]+)", s, re.I)

    try:
        number = int(match.group(1))
    except:
        number = s

    return number


def SP(s, TicketDict):
    s = splitTic(s)
    if isinstance(s, (int)):
       if s <= 9999:
          tic = 'a'
       if ((s >= 10000) and (s < 99999)):
          tic = 'b'
       if ((s >= 100000) and (s < 999999)):
          tic = 'c'
       else:
          tic = 'd'
    else:
       s = s.split(' ')
      
       tic = [t for k, t in TicketDict.items() if str(k) in s]
      
       if tic == []:
        tic = 'Other'
       else:
        # Title is a list, so extract contents
        tic = tic[0]
      
    return tic

# out = allsets['Ticket'].apply(splitTic, args=[TicketDict])
out = allsets['Ticket'].apply(SP, args = [TicketDict])
out = out.apply(pd.Series)
out.columns = ['Tickettype']

allsets = pd.concat([allsets,out],axis=1)


In [None]:
allsets['Tickettype']

0       A
1      PC
2      ST
3       c
4       c
       ..
413     A
414    PC
415    SO
416     c
417     d
Name: Tickettype, Length: 1309, dtype: object

In [None]:
# List of categorical columns to record
catCols = ['Sex', 'Embarked', 'Cabletter', 'Cabnumber', 'Surname', 'Title', 'Tickettype']

for c in catCols:
    # Convert column to pd.Catgorical
    # 找出有几个类别
    allsets[c] = pd.Categorical(allsets[c])
    # Extract the cat.codes and replace the column with these
    # 把类别转换成数字
    allsets[c] = allsets[c].cat.codes
    # Convert the cat codes to categorical 
    # 把dtype 从int8改为category
    allsets[c] = pd.Categorical(allsets[c])


# Generate a logical index of categorical columns to maybe use with LightGBM later
catCols = [i for i,v in enumerate(allsets.dtypes) if str(v)=='category']

# Replace missing age value with median
allsets['Age'] = allsets['Age'].fillna(allsets['Age'].median())


In [None]:
# cor = allsets.corr()
# train_cor = cor['Survived'].to_dict()
# print(train_cor)

{'Age': -0.06491041993052575, 'Embarked': -0.1765092251688822, 'Fare': 0.2573065223849618, 'Parch': 0.08162940708348222, 'PassengerId': -0.005006660767066476, 'Pclass': -0.33848103596101586, 'Sex': -0.5433513806577526, 'SibSp': -0.03532249888573588, 'Survived': 1.0, 'Cabletter': 0.3052837254015841, 'Cabnumber': 0.265330768116824, 'Cabcount': -0.316912438231453, 'fsize': 0.01663898928274531, 'fRatio': 0.09207828202224275, 'Adult': -0.0034108597372871463, 'Surname': -0.05794171163989489, 'Title': -0.07380310227652323, 'Tickettype': 0.07273666414567713}


In [None]:
# Split datasets
train = allsets.iloc[0:TrainRow,:]
test = allsets.iloc[TrainRow:,:]

# Prepare data
def prepLGB(data, classCol='', IDCol='',fDrop=[]):
    
    # Drop class column
    if classCol != '':
        labels = data[classCol]
        fDrop = fDrop + [classCol]
    else:
        labels = []
    
    if IDCol != '':
        IDs = data[IDCol]
    else:
        IDs = []

    if fDrop != []:
       data = data.drop(fDrop, axis = 1)

    # Creat LGB mats
    # 创建成lgb特征的数据集格式
    lgbData = lgb.Dataset(data, label=labels, 
                          free_raw_data=False,
                          feature_name=list(data.columns),
                          categorical_feature = 'auto')
    
    # lgbData是lgb特征的数据集格式,包含label
    # data就是drop了label之后的正常train set
    # labels 就是train里的 'y' 
    return lgbData, labels, IDs, data


# Specify columns to drop
fDrop = ['Ticket', 'Cabin', 'Name']

In [None]:
# Split training data into training and validation sets
# Validation set is used for early stopping
trainData, validData = train_test_split(train, test_size = 0.3, 
                                        stratify=train.Survived)

# Prepare the data sets
trainDatalgb, trainLabels, trainIDs, trainData = prepLGB(trainData,
                                                         classCol = 'Survived',
                                                         IDCol = 'PassengerId', 
                                                         fDrop = fDrop)

validDatalgb, validLabels, validIDs, validData = prepLGB(validData,
                                                         classCol = 'Survived',
                                                         IDCol = 'PassengerId',
                                                         fDrop = fDrop)

testDatalgb, _, _, testData = prepLGB(test, 
                                      classCol='Survived',
                                      IDCol='PassengerId',
                                      fDrop=fDrop)

# Prepare data set using all the training data
allTrainDatalgb, allTrainLabels, _, allTrainData = prepLGB(train, 
                                                           classCol='Survived',
                                                           IDCol='PassengerId',
                                                           fDrop=fDrop)






In [None]:
# trainDatalgb 的数据类型：<lightgbm.basic.Dataset at 0x7f0556a2a650>

In [None]:
# Without tuning and early stopping
params = {'boosting_type': 'gbdt', # traditional GBDT
          'max_depth' : -1, # <= 0 means no limit
          'objective': 'binary', # 二分类
          'nthread': 3, # Updated from nthread
          'num_leaves': 64, # Maximum tree leaves for base learners
          'learning_rate': 0.05, 
          'max_bin': 512, 
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8, # Subsample ratio of columns when constructing each tree
          'reg_alpha': 5, # L1 regularization term on weights
          'reg_lambda': 10, # L2 regularization term on weights.
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'binary_error'}


In [None]:
# Create parameters to search 进行调参
gridParams = {
    'learning_rate':[0.005,0.05,0.1],
    'n_estimators':[40],
    'num_leaves':[6,16,24,31],
    'boosting_type':['gbdt'],
    'objective':['binary'],
    'random_state':[28],
    'colsample_bytree': [0.65,0.66],
    'subsample':[0.7,0.75],
    'reg_alpha':[0,1,1.2],
    'reg_lambda':[0,1,1.2,1.4]
    }

In [None]:
# Create classifier to use
lgbmodel = lgb.LGBMClassifier(boosting_type = 'gbdt',
                              objective = 'binary',
                              n_job = 3,
                              silent = True,
                              max_depth = params['max_depth'],
                              max_bin = params['max_bin'],
                              subsample_for_bin = params['subsample_for_bin'],
                              subsample = params['subsample'],
                              subsample_freq = params['subsample_freq'],
                              min_split_gain = params['min_split_gain'],
                              min_child_weight = params['min_child_weight'],
                              min_child_samples = params['min_child_samples'],
                              scale_pos_weight = params['scale_pos_weight'])


In [None]:
# View the default model params:
lgbmodel.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_bin': 512,
 'max_depth': -1,
 'min_child_samples': 5,
 'min_child_weight': 1,
 'min_split_gain': 0.5,
 'n_estimators': 100,
 'n_job': 3,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'scale_pos_weight': 1,
 'silent': True,
 'subsample': 1,
 'subsample_for_bin': 200,
 'subsample_freq': 1}

In [None]:
# Create the grid
grid = GridSearchCV(lgbmodel, gridParams,
                    verbose = 0,
                    cv = 4,
                    n_jobs = 2)

# Run the grid
grid.fit(allTrainData, allTrainLabels)

# Print the best parameters founf
print(grid.best_params_)
print(grid.best_score_)

# grid.fit()：运行网格搜索
# grid_scores_：给出不同参数情况下的评价结果
# best_params_：描述了已取得最佳结果的参数的组合
# best_score_：提供优化过程期间观察到的最好的评分


{'boosting_type': 'gbdt', 'colsample_bytree': 0.65, 'learning_rate': 0.05, 'n_estimators': 40, 'num_leaves': 6, 'objective': 'binary', 'random_state': 28, 'reg_alpha': 1, 'reg_lambda': 0, 'subsample': 0.7}
0.8159364521472144


In [None]:
# Using parameters already set above, replace in the best from the grid search
params['colsample_bytree'] = grid.best_params_['colsample_bytree']
params['learning_rate'] = grid.best_params_['learning_rate']
# params['max_bin'] = grid.best_params_['max_bin']
params['num_leaves'] = grid.best_params_['num_leaves']
params['reg_alpha'] = grid.best_params_['reg_alpha']
params['reg_lambda'] = grid.best_params_['reg_lambda']
params['subsample'] = grid.best_params_['subsample']
# params['subsample_for_bin'] = grid.best_params_['subsample_for_bin']

print('Fitting with params: ')
print(params)

Fitting with params: 
{'boosting_type': 'gbdt', 'max_depth': -1, 'objective': 'binary', 'nthread': 3, 'num_leaves': 6, 'learning_rate': 0.05, 'max_bin': 512, 'subsample_for_bin': 200, 'subsample': 0.7, 'subsample_freq': 1, 'colsample_bytree': 0.65, 'reg_alpha': 1, 'reg_lambda': 0, 'min_split_gain': 0.5, 'min_child_weight': 1, 'min_child_samples': 5, 'scale_pos_weight': 1, 'num_class': 1, 'metric': 'binary_error'}


In [None]:
gbm = lgb.train(params,
                trainDatalgb,
                100000,
                valid_sets=[trainDatalgb, validDatalgb],
                early_stopping_rounds=50,
                verbose_eval=4)

Training until validation scores don't improve for 50 rounds.
[4]	training's binary_error: 0.383628	valid_1's binary_error: 0.384328
[8]	training's binary_error: 0.219904	valid_1's binary_error: 0.197761
[12]	training's binary_error: 0.211878	valid_1's binary_error: 0.186567
[16]	training's binary_error: 0.17817	valid_1's binary_error: 0.160448
[20]	training's binary_error: 0.179775	valid_1's binary_error: 0.152985
[24]	training's binary_error: 0.176565	valid_1's binary_error: 0.149254
[28]	training's binary_error: 0.170144	valid_1's binary_error: 0.164179
[32]	training's binary_error: 0.168539	valid_1's binary_error: 0.156716
[36]	training's binary_error: 0.170144	valid_1's binary_error: 0.152985
[40]	training's binary_error: 0.170144	valid_1's binary_error: 0.152985
[44]	training's binary_error: 0.165329	valid_1's binary_error: 0.156716
[48]	training's binary_error: 0.158909	valid_1's binary_error: 0.164179
[52]	training's binary_error: 0.152488	valid_1's binary_error: 0.164179
[56]	



In [None]:
predtest = gbm.predict(testData)

In [None]:
submission = pd.DataFrame()
submission['PassengerId'] = test['PassengerId']
submission['Survived'] = np.int32(predtest >= 0.5)

In [None]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
# submission.to_csv('Titanic.csv',index=False)
# from google.colab import files
# files.download('Titanic.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>