In [None]:
# !pip install kaggle
# from google.colab import files
# files.upload()
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle competitions download -c titanic
# !ls

In [None]:
# The usuals
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Regular expressions
import re

from sklearn import preprocessing

# LightGBM\XGBoost\GBDT
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

# Sklearn tools for model training and assessment
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.model_selection import cross_val_score, ShuffleSplit

In [None]:
# Read the Kaggle Titanic train/test CSVs from the working directory
trainRaw, testRaw = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Number of training rows; used later to split the combined frame back apart
TrainRow = len(trainRaw)

# Stack train on top of test so feature engineering is applied to both at once
allsets = pd.concat([trainRaw, testRaw], axis=0, sort=True).reset_index(drop=True)

# Human-readable labels stored as plain attributes on each frame
for frame, label in ((trainRaw, 'Training Set'),
                     (testRaw, 'Test Set'),
                     (allsets, 'All Set')):
    frame.name = label

# NOTE(review): this rebinds the builtin all() for the rest of the session
all = [trainRaw, testRaw]

In [None]:
# Median Age for every (Sex, Pclass) combination.
# Select the Age column *before* aggregating: calling .median() on the whole
# grouped frame also tries to aggregate the string columns, which raises
# in recent pandas versions.
age_by_pclass_sex = allsets.groupby(['Sex', 'Pclass'])['Age'].median()
age_by_pclass_sex

Sex     Pclass
female  1         36.0
        2         28.0
        3         22.0
male    1         42.0
        2         29.5
        3         25.0
Name: Age, dtype: float64

In [None]:
# Fill each missing Age with the median Age of the passenger's (Sex, Pclass) group.
# transform() always returns a result indexed like allsets, so the assignment
# aligns row by row; groupby.apply() can prepend the group keys to the index.
allsets['Age'] = (
    allsets.groupby(['Sex', 'Pclass'])['Age']
    .transform(lambda ages: ages.fillna(ages.median()))
)

In [None]:
# Passengers with no recorded port get 'S' (value taken from a Google lookup, per the original note)
missing_port = allsets['Embarked'].isnull()
allsets.loc[missing_port, 'Embarked'] = 'S'

In [None]:
# Fill the missing Fare with the median Fare of 3rd-class passengers
# travelling alone (Pclass == 3, Parch == 0, SibSp == 0)
fare_medians = allsets.groupby(['Pclass', 'Parch', 'SibSp'])['Fare'].median()
med_fare = fare_medians.loc[(3, 0, 0)]

allsets['Fare'] = allsets['Fare'].fillna(med_fare)

In [None]:
# Deck = first character of Cabin; 'M' marks passengers whose Cabin is missing.
# The vectorised .str accessor yields NaN (-> 'M') for missing or empty cabins
# instead of raising on an empty string like a per-row s[0] would.
allsets['Deck'] = allsets['Cabin'].str[0].fillna('M')


In [None]:
# Any passenger recorded on deck 'T' is reassigned to deck 'A'
on_t_deck = allsets['Deck'] == 'T'
idx = allsets.index[on_t_deck]
allsets.loc[idx, 'Deck'] = 'A'

In [None]:
# Discretise Fare into 13 quantile-based bins.
# The numeric column is overwritten, so this cell cannot be run twice in a row.
allsets['Fare'] = pd.qcut(allsets['Fare'], q=13)

In [None]:
# Discretise Age into 10 quantile-based bins (overwrites the numeric column)
allsets['Age'] = pd.qcut(allsets['Age'], q=10)

In [None]:
# Merge neighbouring decks into coarser groups; 'M' (missing) is left untouched
deck_groups = {
    'ABC': ['A', 'B', 'C'],
    'DE': ['D', 'E'],
    'FG': ['F', 'G'],
}
for merged_deck, deck_members in deck_groups.items():
    allsets['Deck'] = allsets['Deck'].replace(deck_members, merged_deck)

allsets['Deck'].value_counts()

M      1014
ABC     182
DE       87
FG       26
Name: Deck, dtype: int64

In [None]:
# Build new features from Cabin
# Separate a cabin code into its letter part and its number part
def CabSplit(s):
    """
    Split a single cabin code such as "C85" into its letters and digits.

    The code must start with one or more letters (case-insensitive)
    immediately followed by one or more digits; anything after the digits
    is ignored.

    Parameters
    ----------
    s : str
        One cabin code (no spaces).

    Returns
    -------
    tuple
        (letter, number) as strings when the pattern matches, otherwise
        ('', 9999). A code made of letters only (e.g. "F") does not match,
        so its letter is not returned either.
    """
    # re.I makes the match case-insensitive
    match = re.match(r"([a-z]+)([0-9]+)", s, re.I)

    # Explicit no-match check instead of catching the AttributeError that
    # calling .group() on None would raise
    if match is None:
        return '', 9999

    # group(1) is the ([a-z]+) part, group(2) the ([0-9]+) part
    return match.group(1), match.group(2)

# Letter, number and cabin count for one Cabin cell
def DR(s):
    """
    Derive [letter, number, nCabins] from one value of the Cabin column.

    A numeric value (int or float) is treated as "no cabin information" -
    presumably the NaN pandas puts in an empty cell - and yields
    ['', '', 9999]. Otherwise the string is split on spaces, the number of
    pieces is the cabin count, and only the first piece is passed to
    CabSplit for the letter/number.

    NOTE(review): the missing case uses '' for the number while CabSplit
    uses 9999 for an unparsable one - confirm the two sentinels are intended.
    """
    # Guard clause: numeric cell means the Cabin field is empty
    if isinstance(s, (int, float)):
        return ['', '', 9999]

    # Some cells list several cabins separated by spaces
    cabins = s.split(' ')
    # Only the first cabin is used for the letter/number extraction
    letter, number = CabSplit(cabins[0])
    return [letter, number, len(cabins)]

# Run DR on every Cabin cell; each result is a [letter, number, count] list
out = allsets['Cabin'].apply(DR)

# Expand the lists into a three-column frame with a single constructor call
# (avoids building one pd.Series per row via out.apply(pd.Series))
out = pd.DataFrame(out.tolist(), index=allsets.index,
                   columns=['Cabletter', 'Cabnumber', 'Cabcount'])

# Attach the new columns. Any stale copies are dropped first so that
# re-running this cell does not leave duplicate column names behind.
allsets = pd.concat([allsets.drop(columns=out.columns, errors='ignore'), out],
                    axis=1)


In [None]:
# Family features, added directly as new columns

# Family size = siblings/spouses + parents/children + the passenger
allsets['fsize'] = allsets['SibSp'].add(allsets['Parch']).add(1)

# Ratio (disabled)
# allsets['fRatio'] = (allsets['Parch'] + 1)/(allsets['SibSp'] + 1)

# Bucket the family sizes into named groups
family_map = {}
for size_label, sizes in (('Alone', (1,)),
                          ('Small', (2, 3, 4)),
                          ('Medium', (5, 6)),
                          ('Large', (7, 8, 11))):
    family_map.update(dict.fromkeys(sizes, size_label))
allsets['fsize_Grouped'] = allsets['fsize'].map(family_map)


In [None]:
# For every passenger, how many rows in allsets share the same Ticket value
ticket_counts = allsets['Ticket'].value_counts()
allsets['Ticket_Frequency'] = allsets['Ticket'].map(ticket_counts)


In [None]:
# Extract titles from Name column, standardise
# titleDict maps a raw title (key) to the standardised title (value) that
# NameSplit returns for it. Several raw titles share one standardised value.
titleDict = {
    # military ranks
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    # male honorifics
    "Jonkheer": "Sir",
    "Don": "Sir",
    "Sir": "Sir",
    "Dr": "Dr",
    "Rev": "Rev",
    # female honorifics
    "theCountess": "Lady",
    "Dona": "Lady",
    # French forms and 'Ms' are folded into the English Mrs/Miss
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    # common titles map to themselves
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Lady"
}

def NameSplit(s, titleDict):
    """
    Return the standardised title for one passenger name.

    The raw title is the text between the first comma and the period that
    follows it ("Braund, Mr. Owen Harris" -> "Mr"). Spaces inside it are
    removed ("the Countess" -> "theCountess") before it is looked up in
    titleDict, so multi-word titles can be matched and a title-like word
    elsewhere in the name cannot win by accident.

    If that lookup finds nothing, the name is split on spaces (periods
    removed) and the first titleDict key equal to one of the tokens is used.

    Parameters
    ----------
    s : str
        Full name.
    titleDict : dict
        Maps raw title -> standardised title.

    Returns
    -------
    str
        The standardised title, or 'other' when no title is recognised.
    """
    # Preferred path: "<surname>, <title>. <given names>"
    match = re.search(r",\s*([^.,]+?)\s*\.", s)
    if match is not None:
        raw_title = match.group(1).replace(' ', '')
        if raw_title in titleDict:
            return titleDict[raw_title]

    # Fallback: whole-token scan over the name, in titleDict order
    tokens = s.replace('.', '').split(' ')
    for key, title in titleDict.items():
        if str(key) in tokens:
            return title

    return 'other'

# Standardised title for every passenger, as a one-column frame named 'Title'
out = allsets['Name'].apply(NameSplit, args=(titleDict,)).to_frame('Title')

# Plain column assignment: adds 'Title' on the first run and overwrites it on
# a re-run, whereas concatenating would append a second 'Title' column.
allsets['Title'] = out['Title']


In [None]:
# 1 for passengers whose standardised title is 'Mrs', 0 for everyone else.
# Built in one vectorised step: writing through allsets['Is_Married'].loc[...]
# is chained assignment, which triggers SettingWithCopyWarning and does not
# update the frame at all when pandas copy-on-write is enabled.
allsets['Is_Married'] = (allsets['Title'] == 'Mrs').astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [None]:
# Collapse the standardised titles into broader groups; titles not listed keep their value
title_groups = {
    'Miss/Mrs/Ms': ['Miss', 'Mrs', 'Ms', 'Lady'],
    'Dr/Military/Noble/Clergy': ['Officer', 'Sir', 'Dr', 'Rev'],
}
for merged_title, title_members in title_groups.items():
    allsets['Title'] = allsets['Title'].replace(title_members, merged_title)

Feature engineering for LightGBM - lgbsets

In [None]:
# Columns every model treats as categorical
catCols = [
    'Age', 'Sex', 'Embarked', 'Fare',
    'Title', 'Deck',
    'fsize_Grouped', 'Cabletter', 'Cabnumber',
]
lgbsets = allsets.copy()

for c in catCols:
    # Work out the categories of the column and take each row's integer code
    category_codes = pd.Categorical(lgbsets[c]).codes
    # Store the codes as a categorical column (dtype category rather than a
    # plain integer dtype)
    lgbsets[c] = pd.Categorical(category_codes)



In [None]:
lgbsets

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Deck,Cabletter,Cabnumber,Cabcount,fsize,fsize_Grouped,Ticket_Frequency,Title,Is_Married
0,2,,2,0,"Braund, Mr. Owen Harris",0,1,3,1,1,0.0,A/5 21171,3,0,1,9999,2,3,1,3,0
1,7,C85,0,11,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1.0,PC 17599,0,3,92,1,2,3,2,2,1
2,4,,2,3,"Heikkinen, Miss. Laina",0,3,3,0,0,1.0,STON/O2. 3101282,3,0,1,9999,1,0,1,2,0
3,7,C123,2,10,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1.0,113803,0,3,16,1,2,3,2,2,1
4,7,,2,3,"Allen, Mr. William Henry",0,5,3,1,0,0.0,373450,3,0,1,9999,1,0,1,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,,2,3,"Spector, Mr. Woolf",0,1305,3,1,0,,A.5. 3236,3,0,1,9999,1,0,1,3,0
1305,7,C105,0,12,"Oliva y Ocana, Dona. Fermina",0,1306,1,0,0,,PC 17758,0,3,7,1,1,0,3,2,0
1306,7,,2,0,"Saether, Mr. Simon Sivertsen",0,1307,3,1,0,,SOTON/O.Q. 3101262,3,0,1,9999,1,0,1,3,0
1307,3,,2,3,"Ware, Mr. Frederick",0,1308,3,1,0,,359309,3,0,1,9999,1,0,1,3,0


Feature engineering for XGB - xgbsets

In [None]:
# Copy of allsets in which every categorical column is replaced by the
# integer codes returned by pd.factorize
xgbsets = allsets.assign(
    **{col: pd.factorize(allsets[col])[0] for col in catCols}
)

In [None]:
xgbsets

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Deck,Cabletter,Cabnumber,Cabcount,fsize,fsize_Grouped,Ticket_Frequency,Title,Is_Married
0,0,,0,0,"Braund, Mr. Owen Harris",0,1,3,0,1,0.0,A/5 21171,0,0,0,9999,2,0,1,0,0
1,1,C85,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1,1,1.0,PC 17599,1,1,1,1,2,0,2,1,1
2,2,,0,2,"Heikkinen, Miss. Laina",0,3,3,1,0,1.0,STON/O2. 3101282,0,0,0,9999,1,1,1,1,0
3,1,C123,0,3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1,1,1.0,113803,1,1,2,1,2,0,2,1,1
4,1,,0,2,"Allen, Mr. William Henry",0,5,3,0,0,0.0,373450,0,0,0,9999,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,,0,2,"Spector, Mr. Woolf",0,1305,3,0,0,,A.5. 3236,0,0,0,9999,1,1,1,0,0
1305,1,C105,1,10,"Oliva y Ocana, Dona. Fermina",0,1306,1,1,0,,PC 17758,1,1,105,1,1,1,3,1,0
1306,1,,0,0,"Saether, Mr. Simon Sivertsen",0,1307,3,0,0,,SOTON/O.Q. 3101262,0,0,0,9999,1,1,1,0,0
1307,3,,0,2,"Ware, Mr. Frederick",0,1308,3,0,0,,359309,0,0,0,9999,1,1,1,0,0


Feature engineering for RF/GBDT - allsets

In [None]:
# Integer-encode the categorical columns with pd.factorize, then remove the
# raw text / identifier columns that are not used as features
factorized_cols = {col: pd.factorize(allsets[col])[0] for col in catCols}

allsets = (
    allsets
    .assign(**factorized_cols)
    .drop(columns=['Ticket', 'Cabin', 'Name', 'PassengerId'])
)

In [None]:
allsets

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,Deck,Cabletter,Cabnumber,Cabcount,fsize,fsize_Grouped,Ticket_Frequency,Title,Is_Married
0,0,0,0,0,3,0,1,0.0,0,0,0,9999,2,0,1,0,0
1,1,1,1,0,1,1,1,1.0,1,1,1,1,2,0,2,1,1
2,2,0,2,0,3,1,0,1.0,0,0,0,9999,1,1,1,1,0
3,1,0,3,0,1,1,1,1.0,1,1,2,1,2,0,2,1,1
4,1,0,2,0,3,0,0,0.0,0,0,0,9999,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,2,0,3,0,0,,0,0,0,9999,1,1,1,0,0
1305,1,1,10,0,1,1,0,,1,1,105,1,1,1,3,1,0
1306,1,0,0,0,3,0,0,,0,0,0,9999,1,1,1,0,0
1307,3,0,2,0,3,0,0,,0,0,0,9999,1,1,1,0,0


LightGBM

In [None]:
# The first TrainRow rows came from train.csv, the remainder from test.csv
lgbtrain, lgbtest = lgbsets.iloc[:TrainRow], lgbsets.iloc[TrainRow:]

# Prepare data
def prepLGB(data, classCol='', fDrop=[]):
    """
    Build a LightGBM Dataset from a DataFrame.

    Parameters
    ----------
    data : DataFrame
        Rows to convert.
    classCol : str
        Name of the label column; '' when the frame has no label column.
    fDrop : list
        Columns to remove before building the Dataset. The label column is
        removed as well; the caller's list is never modified.

    Returns
    -------
    tuple
        (lgbData, labels, data): the lgb.Dataset, the label column (an empty
        list when classCol is ''), and the feature frame left after dropping.
    """
    drop_cols = list(fDrop)

    if classCol != '':
        labels = data[classCol]
        drop_cols.append(classCol)
        dataset_label = labels
    else:
        labels = []
        # No labels: give the Dataset None rather than a zero-length label
        # list that cannot match the number of rows
        dataset_label = None

    if drop_cols:
        data = data.drop(columns=drop_cols)

    # categorical_feature='auto' lets LightGBM use the category-dtype columns
    lgbData = lgb.Dataset(data, label=dataset_label,
                          free_raw_data=False,
                          feature_name=list(data.columns),
                          categorical_feature='auto')

    return lgbData, labels, data


# Raw text / identifier columns excluded from the features (passed to prepLGB as fDrop)
fDrop = ['Ticket', 'Cabin', 'Name', 'PassengerId']

In [None]:
lgbtrain

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Deck,Cabletter,Cabnumber,Cabcount,fsize,fsize_Grouped,Ticket_Frequency,Title,Is_Married
0,2,,2,0,"Braund, Mr. Owen Harris",0,1,3,1,1,0.0,A/5 21171,3,0,1,9999,2,3,1,3,0
1,7,C85,0,11,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1.0,PC 17599,0,3,92,1,2,3,2,2,1
2,4,,2,3,"Heikkinen, Miss. Laina",0,3,3,0,0,1.0,STON/O2. 3101282,3,0,1,9999,1,0,1,2,0
3,7,C123,2,10,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1.0,113803,0,3,16,1,2,3,2,2,1
4,7,,2,3,"Allen, Mr. William Henry",0,5,3,1,0,0.0,373450,3,0,1,9999,1,0,1,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,5,,2,5,"Montvila, Rev. Juozas",0,887,2,1,0,0.0,211536,3,0,1,9999,1,0,1,0,0
887,1,B42,2,9,"Graham, Miss. Margaret Edith",0,888,1,0,0,1.0,112053,0,2,54,1,1,0,1,2,0
888,2,,2,8,"Johnston, Miss. Catherine Helen ""Carrie""",2,889,3,0,1,0.0,W./C. 6607,3,0,1,9999,4,3,4,2,0
889,4,C148,0,9,"Behr, Mr. Karl Howell",0,890,1,1,0,1.0,111369,0,3,24,1,1,0,1,3,0


In [None]:
# Hold out 30% of the labelled rows, stratified on Survived, as the
# validation set used for early stopping
lgb_trainData, lgb_validData = train_test_split(
    lgbtrain,
    test_size=0.3,
    stratify=lgbtrain['Survived'],
    random_state=28,
)

# Every split goes through prepLGB with the same label column and drop list
lgb_prep_args = {'classCol': 'Survived', 'fDrop': fDrop}

trainData_lgb, trainLabels_lgb, lgbtrainData = prepLGB(lgb_trainData, **lgb_prep_args)
validData_lgb, validLabels_lgb, lgbvalidData = prepLGB(lgb_validData, **lgb_prep_args)

# Test rows: the returned labels are discarded
testData_lgb, _, lgbtestData = prepLGB(lgbtest, **lgb_prep_args)

# The complete labelled set (no hold-out)
allTrainData_lgb, allTrainLabels_lgb, lgballTrainData = prepLGB(lgbtrain, **lgb_prep_args)

In [None]:
trainData_lgb.data

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Deck,Cabletter,Cabnumber,Cabcount,fsize,fsize_Grouped,Ticket_Frequency,Title,Is_Married
520,6,2,12,0,1,0,0,0,2,84,1,1,0,4,2,0
241,2,1,6,0,3,0,1,3,0,1,9999,2,3,2,2,0
88,3,2,12,2,1,0,3,0,3,34,3,6,2,6,2,0
20,7,2,8,0,2,1,0,3,0,1,9999,1,0,2,3,0
874,5,0,8,0,2,0,1,3,0,1,9999,2,3,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,8,2,4,1,3,1,0,3,0,1,9999,2,3,1,3,0
224,7,2,12,0,1,1,1,0,3,100,1,2,3,2,3,0
522,3,0,0,0,3,1,0,3,0,1,9999,1,0,1,3,0
512,7,2,8,0,1,1,0,1,5,36,1,1,0,1,3,0


In [None]:
# Parameters handed to lgb.train
lgbparams = dict(
    boosting_type='gbdt',     # traditional GBDT
    max_depth=-1,             # <= 0 means no limit
    objective='binary',       # binary classification
    nthread=3,
    num_leaves=16,
    learning_rate=0.1,
    max_bin=512,
    subsample_for_bin=200,
    subsample=0.75,
    subsample_freq=1,
    colsample_bytree=0.65,
    reg_alpha=0,              # L1 regularization term on weights
    reg_lambda=1,             # L2 regularization term on weights
    min_split_gain=0.5,
    min_child_weight=1,
    min_child_samples=2,
    scale_pos_weight=1,
    num_class=1,
    metric='binary_error',
    # NOTE(review): LightGBM documents n_estimators as an alias of
    # num_iterations - confirm whether it overrides the round count that is
    # passed to lgb.train separately
    n_estimators=200,
)


In [None]:
# Train with early stopping (patience 50 rounds); metrics are printed every 4 rounds
gbm = lgb.train(
    params=lgbparams,
    train_set=trainData_lgb,
    num_boost_round=100000,
    valid_sets=[trainData_lgb, validData_lgb],
    early_stopping_rounds=50,
    verbose_eval=4,
)




Training until validation scores don't improve for 50 rounds.
[4]	training's binary_error: 0.205457	valid_1's binary_error: 0.253731
[8]	training's binary_error: 0.168539	valid_1's binary_error: 0.208955
[12]	training's binary_error: 0.155698	valid_1's binary_error: 0.208955
[16]	training's binary_error: 0.144462	valid_1's binary_error: 0.212687
[20]	training's binary_error: 0.142857	valid_1's binary_error: 0.205224
[24]	training's binary_error: 0.136437	valid_1's binary_error: 0.201493
[28]	training's binary_error: 0.134831	valid_1's binary_error: 0.197761
[32]	training's binary_error: 0.130016	valid_1's binary_error: 0.19403
[36]	training's binary_error: 0.126806	valid_1's binary_error: 0.190299
[40]	training's binary_error: 0.125201	valid_1's binary_error: 0.186567
[44]	training's binary_error: 0.12199	valid_1's binary_error: 0.179104
[48]	training's binary_error: 0.117175	valid_1's binary_error: 0.190299
[52]	training's binary_error: 0.113965	valid_1's binary_error: 0.190299
[56]	t

In [None]:
# Turn the predicted scores for the test rows into 0/1 labels at a 0.5 cut-off
lgb_scores = gbm.predict(lgbtestData)
lgbpredict = (lgb_scores >= 0.5).astype(np.int32)
lgbpredict

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

XGBoost

In [None]:
# The first TrainRow rows came from train.csv, the remainder from test.csv
xgbtrain, xgbtest = xgbsets.iloc[:TrainRow], xgbsets.iloc[TrainRow:]

# Prepare data
def prepGBX(data, classCol='', fDrop=[]):
    """
    Build an XGBoost DMatrix from a DataFrame.

    Parameters
    ----------
    data : DataFrame
        Rows to convert.
    classCol : str
        Name of the label column; '' when the frame has no label column.
    fDrop : list
        Columns to remove before building the matrix. The label column is
        removed as well; the caller's list is never modified.

    Returns
    -------
    tuple
        (xgbData, labels, data): the xgb.DMatrix, the label column (an empty
        list when classCol is ''), and the feature frame left after dropping.
    """
    drop_cols = list(fDrop)

    if classCol != '':
        # Labelled data: the label column must not stay among the features
        labels = data[classCol]
        drop_cols.append(classCol)
        matrix_label = labels
    else:
        labels = []
        # No labels: give the DMatrix None rather than a zero-length label
        # list that cannot match the number of rows
        matrix_label = None

    if drop_cols:
        data = data.drop(columns=drop_cols)

    # xgbData carries the features (and the labels when given) in XGBoost's
    # own format; labels and data are the ordinary pandas objects
    xgbData = xgb.DMatrix(data, label=matrix_label)
    return xgbData, labels, data

# Raw text / identifier columns excluded from the features (passed to prepGBX
# as fDrop). Same list as the one defined in the LightGBM section.
fDrop = ['Ticket', 'Cabin', 'Name', 'PassengerId']

In [None]:

# Hold out 30% of the labelled rows, stratified on Survived, as the
# validation set used for early stopping
xgb_trainData, xgb_validData = train_test_split(
    xgbtrain,
    test_size=0.3,
    stratify=xgbtrain['Survived'],
    random_state=28,
)

# Every split goes through prepGBX with the same label column and drop list
xgb_prep_args = {'classCol': 'Survived', 'fDrop': fDrop}

# Train datasets
trainData_xgb, trainLabels_xgb, xgbtrainData = prepGBX(xgb_trainData, **xgb_prep_args)

# Valid datasets
validData_xgb, validLabels_xgb, xgbvalidData = prepGBX(xgb_validData, **xgb_prep_args)

# Test datasets (returned labels are discarded)
testData_xgb, _, xgbtestData = prepGBX(xgbtest, **xgb_prep_args)



In [None]:
xgbtrainData

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Deck,Cabletter,Cabnumber,Cabcount,fsize,fsize_Grouped,Ticket_Frequency,Title,Is_Married
520,8,0,10,0,1,1,0,1,6,69,1,1,1,4,1,0
241,0,2,12,0,3,1,1,0,0,0,9999,2,0,2,1,0
88,3,0,10,2,1,1,3,1,1,7,3,6,2,6,1,0
20,1,0,8,0,2,0,0,0,0,0,9999,1,1,2,0,0
874,6,1,8,0,2,1,1,0,0,0,9999,2,0,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,9,0,4,1,3,0,0,0,0,0,9999,2,0,1,0,0
224,1,0,10,0,1,0,1,1,1,33,1,2,0,2,0,0
522,3,1,0,0,3,0,0,0,0,0,9999,1,1,1,0,0
512,1,0,8,0,1,0,0,2,2,68,1,1,1,1,0,0


In [None]:
 # Parameters handed to xgb.train.
 # NOTE(review): 'n_estimators', 'silent' and 'missing' look like options of
 # the scikit-learn wrapper / older releases - confirm xgb.train honours them.
 xgbparams = {
            'max_depth':8,
            'eta':0.01,  # learning rate
            'n_estimators':200,
            'silent':True,
            'objective':'binary:logistic',
            'nthread':-1,
            'gamma':0,
            'min_child_weight':5,
            'max_delta_step':0,
            'subsample':0.8,
            'colsample_bytree':0.5,
            'colsample_bylevel':1,
            'alpha':0,  # L1 regularization
            'lambda':0.5,  # L2 regularization
            'scale_pos_weight':1,
            'seed':28,
            'missing':None
 }

In [None]:
# Datasets evaluated after every boosting round, with the names shown in the log
evallist = [(trainData_xgb, 'train'), (validData_xgb, 'eval')]
num_round = 500

# Train with early stopping (patience 50 rounds), logging every round
xgbm = xgb.train(
    params=xgbparams,
    dtrain=trainData_xgb,
    num_boost_round=num_round,
    evals=evallist,
    early_stopping_rounds=50,
    verbose_eval=True,
)

[0]	train-error:0.184591	eval-error:0.179104
Multiple eval metrics have been passed: 'eval-error' will be used for early stopping.

Will train until eval-error hasn't improved in 50 rounds.
[1]	train-error:0.17817	eval-error:0.242537
[2]	train-error:0.157303	eval-error:0.208955
[3]	train-error:0.150883	eval-error:0.205224
[4]	train-error:0.152488	eval-error:0.205224
[5]	train-error:0.149278	eval-error:0.201493
[6]	train-error:0.146067	eval-error:0.216418
[7]	train-error:0.152488	eval-error:0.212687
[8]	train-error:0.154093	eval-error:0.231343
[9]	train-error:0.154093	eval-error:0.235075
[10]	train-error:0.155698	eval-error:0.220149
[11]	train-error:0.152488	eval-error:0.231343
[12]	train-error:0.154093	eval-error:0.238806
[13]	train-error:0.147673	eval-error:0.238806
[14]	train-error:0.149278	eval-error:0.227612
[15]	train-error:0.147673	eval-error:0.227612
[16]	train-error:0.147673	eval-error:0.223881
[17]	train-error:0.144462	eval-error:0.216418
[18]	train-error:0.141252	eval-error:0

In [None]:
# Turn the predicted probabilities for the test rows into 0/1 labels at a 0.5 cut-off
# NOTE(review): confirm that predict() uses the best early-stopping iteration
# in the installed XGBoost version
xgb_scores = xgbm.predict(testData_xgb)
xgbmpredict = (xgb_scores >= 0.5).astype(np.int32)
xgbmpredict

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,

GBDT

In [None]:
# Labelled rows keep Survived; the test rows have it removed
train = allsets.iloc[:TrainRow]
test = allsets.iloc[TrainRow:].drop(columns=['Survived'])

In [None]:
test

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Deck,Cabletter,Cabnumber,Cabcount,fsize,fsize_Grouped,Ticket_Frequency,Title,Is_Married
891,1,2,9,0,3,0,0,0,0,0,9999,1,1,1,0,0
892,9,0,0,0,3,1,1,0,0,0,9999,2,0,1,1,1
893,4,2,4,0,2,0,0,0,0,0,9999,1,1,1,0,0
894,6,0,4,0,3,0,0,0,0,0,9999,1,1,1,0,0
895,0,0,6,1,3,1,1,0,0,0,9999,3,0,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,2,0,3,0,0,0,0,0,9999,1,1,1,0,0
1305,1,1,10,0,1,1,0,1,1,105,1,1,1,3,1,0
1306,1,0,0,0,3,0,0,0,0,0,9999,1,1,1,0,0
1307,3,0,2,0,3,0,0,0,0,0,9999,1,1,1,0,0


In [None]:
# 70/30 hold-out split of the labelled rows, stratified on the target
gbdt_features = train.drop(columns=['Survived'])
gbdt_target = train['Survived']

X_trainData, X_validData, y_train, y_valid = train_test_split(
    gbdt_features,
    gbdt_target,
    test_size=0.3,
    stratify=gbdt_target,
    random_state=28,
)

In [None]:
# Fit the gradient-boosting classifier on the 70% split
gbdt = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.03,
    subsample=0.7,
    min_samples_leaf=3,
    random_state=28,
)
gbdt.fit(X_trainData, y_train)

# Predictions on both splits
train_pred = gbdt.predict(X_trainData)
valid_pred = gbdt.predict(X_validData)

# Accuracy on the training split vs. the held-out split
acc_train = accuracy_score(y_train, train_pred)
acc_valid = accuracy_score(y_valid, valid_pred)

print(acc_train)
print(acc_valid)

0.9325842696629213
0.8022388059701493


In [None]:
# Class predictions of the fitted GBDT for the test rows (Survived already dropped from `test`)
gbdtpredict = gbdt.predict(test)
gbdtpredict

array([0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0.,
       1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0.,
       0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0.,
       1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 0., 1., 0., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 0.

RF

In [None]:
# Random forest fitted on all labelled rows; the out-of-bag score is the
# accuracy estimate printed below.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# no longer accepted by recent scikit-learn releases.
rf = RandomForestClassifier(criterion='gini',
                            n_estimators=1100,
                            max_depth=5,
                            min_samples_split=4,
                            min_samples_leaf=5,
                            max_features='sqrt',
                            oob_score=True,
                            random_state=28,
                            n_jobs=-1,
                            verbose=1)
rf.fit(train.drop(['Survived'], axis=1), train['Survived'])
print(rf.oob_score_)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 1100 out of 1100 | elapsed:    2.2s finished


0.8372615039281706


In [None]:
# Class predictions of the fitted random forest for the test rows
rfpredict = rf.predict(test)
rfpredict

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 1100 out of 1100 | elapsed:    0.4s finished


array([0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0.,
       0., 1., 1., 0., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.,
       0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

In [None]:
# Weighted vote over the four models' 0/1 predictions (weights add up to 1)
weighted_votes = (
    (4/10, lgbpredict),
    (3/10, xgbmpredict),
    (2/10, gbdtpredict),
    (1/10, rfpredict),
)
finalpredict = sum(weight * votes for weight, votes in weighted_votes)
finalpredict

array([0. , 0.4, 0. , 0. , 0.8, 0. , 0.9, 0. , 0.8, 0. , 0. , 0. , 1. ,
       0. , 1. , 1. , 0. , 0. , 0.3, 1. , 0. , 0.1, 1. , 0. , 1. , 0. ,
       1. , 0. , 0.3, 0. , 0. , 0. , 1. , 0.9, 0. , 0. , 0.4, 0. , 0. ,
       0.3, 0. , 0.9, 0. , 1. , 1. , 0. , 0.5, 0. , 1. , 1. , 0. , 0. ,
       1. , 1. , 0. , 0. , 0. , 0. , 0. , 1. , 0. , 0. , 0. , 0.9, 0.6,
       1. , 0.9, 0. , 0.3, 1. , 1. , 0. , 0.5, 1. , 1. , 0.2, 0. , 1. ,
       0. , 1. , 0.5, 0.2, 0. , 0. , 0. , 0. , 1. , 0.5, 1. , 0.7, 0.3,
       0. , 1. , 0. , 0. , 0. , 1. , 0. , 0. , 0. , 1. , 0. , 0. , 0. ,
       1. , 0. , 0. , 0. , 0. , 0. , 0. , 1. , 1. , 0.7, 1. , 0. , 0. ,
       1. , 0. , 1. , 1. , 0. , 1. , 0. , 0. , 0.5, 0. , 1. , 0. , 0. ,
       0. , 0.6, 0. , 0. , 0. , 0. , 0. , 0. , 0.4, 0. , 0. , 1. , 0. ,
       0. , 0. , 0. , 0.4, 0. , 0. , 0. , 1. , 0. , 0. , 0.8, 0.1, 0. ,
       1. , 0. , 0.9, 0.7, 1. , 1. , 1. , 0. , 0. , 1. , 0. , 0. , 1. ,
       0. , 0. , 0. , 0. , 0. , 0. , 1. , 1. , 0.3, 1. , 1. , 0.

In [None]:
# Kaggle submission: one row per test passenger.
# Survived comes from the weighted ensemble (finalpredict) thresholded at 0.5;
# thresholding rfpredict here would submit the random forest alone and leave
# the ensemble computed above unused. To submit a single model on purpose,
# swap finalpredict for that model's prediction array.
submission = pd.DataFrame({
    'PassengerId': testRaw['PassengerId'],
    'Survived': np.int32(finalpredict >= 0.5),
})

In [None]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
# Write the submission file to the working directory
submission.to_csv('Titanic.csv', index=False)

# Offer a browser download only when running inside Google Colab; elsewhere
# the module does not exist and the CSV on disk is the result.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; 'Titanic.csv' was saved to the working directory.")
else:
    files.download('Titanic.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>