In [154]:
# ! pip install optuna

In [155]:
# !pip install kaggle
# from google.colab import files
# files.upload()
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle competitions download -c titanic
# !ls

In [156]:
# The usuals
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Regular expressions
import re

# LightGBM\XGBoost\GBDT
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.model_selection import 

# Sklearn tools for model training and assessment
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc, accuracy_score


In [157]:
trainRaw = pd.read_csv('train.csv')
testRaw = pd.read_csv('test.csv')

# Concatenate together
TrainRow = trainRaw.shape[0]
allsets = pd.concat([trainRaw, testRaw], axis=0,sort=True)


In [158]:
# Build new features from Cabin
# Seperate Canbin to Letter and Number for who has Canbin
def CabSplit(s):
    """
    Function to try and extract cabin letter and number from the cabin column.
    Runs a regular expression that finds letters and numbers in the
    string. These are held in match.group, if they exist.
    """
    match = re.match(r"([a-z]+)([0-9]+)",s,re.I) #re.I 使匹配对大小写不敏感

    try:
        letter = match.group(1) #满足([a-z]+)的部分
    except:
        letter = ''

    try:
        number = match.group(2) #满足([0-9]+)的部分
    except:
        number = 9999
    return letter, number

# Count the number of Canbin
def DR(s):
    """
    From the cabin string, try and extract letter, number, and number of cabins
    """
    # Check contents
    if isinstance(s, (int,float)): 
      # 若s是int或者float(只有数字形式) 
      # 说明没有Cabin信息 为空 
       letter = ''
       number = ''
       nCabins = 9999
    else:
       # If field isn't empty, split string to letter and unmber. 
       # Some strings contain multiple cabins.
       s = s.split(' ')
       # Count the cabins based on number of splits
       nCabins = len(s)
       # Just take first cabin for letter/number extraction
       s = s[0]

       letter, number = CabSplit(s)

    return [letter, number, nCabins]

# Apply DR function to each cell in Cabin column using pandas apply method.
out = allsets['Cabin'].apply(DR)

# Output tuple with 3 values for each row, convert this to pandas df
out = out.apply(pd.Series)

# name the columns
out.columns = ['Cabletter','Cabnumber','Cabcount']

# Concatenate these columns to the dataset
allsets = pd.concat([allsets,out],axis = 1)      


In [159]:
# Add some family features directly to new columns in the dataset

#size
allsets['fsize'] = allsets['SibSp'] + allsets['Parch'] + 1

#Ratio
allsets['fRatio'] = (allsets['Parch'] + 1)/(allsets['SibSp'] + 1)

#Adult
allsets['Adult'] = allsets['Age'] > 18

In [160]:
# Extract titles from Name column, standardise
titleDict = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Sir",
    "Don": "Sir",
    "Sir": "Sir",
    "Dr": "Dr",
    "Rev": "Rev",
    "theCountess": "Lady",
    "Dona": "Lady",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Lady"
}

def NameSplit(s, titleDict):
    """
    Extract title from name, replace with value in title dictionary. Also
    return surname.
    """

    # Remove '.' from name string
    s = s.replace('.','')
    # Split on spaces
    s = s.split(' ')
    # get surname
    surname = s[0]

    #get title - loop over titleDict
    #if s matches a key, take the corresponding value as the tile
    title = []
    for k, t in titleDict.items():
      if str(k) in s:
        title.append(t)
        
    if title == []:
        title = 'other'
    else:
        # Title is a list, so extract contents
        title = title[0]
    return surname.strip(','), title

out = allsets['Name'].apply(NameSplit,args=[titleDict])

out = out.apply(pd.Series)
out.columns = ['Surname','Title']
allsets = pd.concat([allsets, out],axis=1)


In [161]:
TicketDict = {
    "A./5.": "A",
    "A.5.": "A",
    "A/4": "A",
    "A/4.": "A",
    "A/5": "A",
    "A/5.": "A",
    "A/S": "A",
    "A4.": "A",
    "C": "C",
    "C.A.":"C",
    "CA.": "C",
    "C.A./": "C",
    "CA": "C",
    "F.C.": "FC",
    "F.C.C.": "FC",
    "LINE":"LINE",
    "PC": "PC",
    "PP": "PC",
    "SOTON/O.Q.": "SO",
    "SOTON/O2": "SO",
    "SOTON/OQ": "SO",
    "STON/O" : "ST",
    "STON/O2.": "ST"
}

def splitTic(s):

    match = re.match(r"([0-9]+)", s, re.I)

    try:
        number = int(match.group(1))
    except:
        number = s

    return number


def SP(s, TicketDict):
    s = splitTic(s)
    if isinstance(s, (int)):
       if s <= 9999:
          tic = 'a'
       if ((s >= 10000) and (s < 99999)):
          tic = 'b'
       if ((s >= 100000) and (s < 999999)):
          tic = 'c'
       else:
          tic = 'd'
    else:
       s = s.split(' ')
      
       tic = [t for k, t in TicketDict.items() if str(k) in s]
      
       if tic == []:
        tic = 'Other'
       else:
        # Title is a list, so extract contents
        tic = tic[0]
      
    return tic

# out = allsets['Ticket'].apply(splitTic, args=[TicketDict])
out = allsets['Ticket'].apply(SP, args = [TicketDict])
out = out.apply(pd.Series)
out.columns = ['Tickettype']

allsets = pd.concat([allsets,out],axis=1)


In [162]:
# List of categorical columns to record
catCols = ['Sex', 'Embarked', 'Cabletter', 'Cabnumber', 'Surname', 'Title', 'Tickettype']

for c in catCols:
    # Convert column to pd.Catgorical
    # 找出有几个类别
    allsets[c] = pd.Categorical(allsets[c])
    # Extract the cat.codes and replace the column with these
    # 把类别转换成数字
    allsets[c] = allsets[c].cat.codes
    # Convert the cat codes to categorical 
    # 把dtype 从int8改为category
    allsets[c] = pd.Categorical(allsets[c])


# Generate a logical index of categorical columns to maybe use with LightGBM later
catCols = [i for i,v in enumerate(allsets.dtypes) if str(v)=='category']

# Replace missing age value with median
allsets['Age'] = allsets['Age'].fillna(allsets['Age'].median())
allsets = allsets.drop(['Ticket', 'Cabin', 'Name'],axis=1)


In [163]:
# Split datasets
train = allsets.iloc[0:TrainRow,:]
test = allsets.iloc[TrainRow:,:]

In [164]:
train_y = train.Survived
train_x = train.drop(['Survived'],axis=1)

In [165]:
import optuna

In [166]:
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, random_state=0)

In [167]:
params = {'boosting_type': 'gbdt', # traditional GBDT
          'max_depth' : -1, # <= 0 means no limit
          'objective': 'binary', # 二分类
          'nthread': 3, # Updated from nthread
          'num_leaves': 64, # Maximum tree leaves for base learners
          'learning_rate': 0.05, 
          'max_bin': 512, 
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8, # Subsample ratio of columns when constructing each tree
          'reg_alpha': 5, # L1 regularization term on weights
          'reg_lambda': 10, # L2 regularization term on weights.
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 2,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'binary_error'
          }


In [168]:
# Define an objective function to be minimized.
def objective(trial):

    num_leaves = trial.suggest_int('num_leaves', 2, 64)
    learning_rate = trial.suggest_float('learning_rate', 0.05, 0.5)
    n_estimators = trial.suggest_int('n_estimators', 40, 300)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.6, 0.7)
    subsample = trial.suggest_float('subsample', 0.65, 0.8)
    reg_alpha = trial.suggest_float('reg_alpha', 0, 3)
    reg_lambda = trial.suggest_float('reg_lambda', 0, 3)
    lgbmodel = lgb.LGBMClassifier(num_leaves=num_leaves, 
                                  learning_rate=learning_rate,
                                  n_estimators=n_estimators, 
                                  colsample_bytree=colsample_bytree,
                                  subsample=subsample, 
                                  reg_alpha=reg_alpha,
                                  reg_lambda=reg_lambda) 
 
    lgbmodel.fit(X_train, y_train)
    y_pred = lgbmodel.predict(X_val)

    acc = accuracy_score(y_val, y_pred)

    return acc

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)



[32m[I 2021-05-07 20:06:20,680][0m A new study created in memory with name: no-name-0f581df6-3719-4331-a06f-b3b33ac1265c[0m
[32m[I 2021-05-07 20:06:20,773][0m Trial 0 finished with value: 0.8430493273542601 and parameters: {'num_leaves': 60, 'learning_rate': 0.29436136435312527, 'n_estimators': 221, 'colsample_bytree': 0.6083691844236951, 'subsample': 0.6698071713898861, 'reg_alpha': 1.5549661452427084, 'reg_lambda': 1.0658595935698778}. Best is trial 0 with value: 0.8430493273542601.[0m
[32m[I 2021-05-07 20:06:20,842][0m Trial 1 finished with value: 0.820627802690583 and parameters: {'num_leaves': 25, 'learning_rate': 0.29037901610773437, 'n_estimators': 58, 'colsample_bytree': 0.6033336148260582, 'subsample': 0.7572251541827102, 'reg_alpha': 1.9147946931759607, 'reg_lambda': 2.1139462829257916}. Best is trial 0 with value: 0.8430493273542601.[0m
[32m[I 2021-05-07 20:06:20,923][0m Trial 2 finished with value: 0.852017937219731 and parameters: {'num_leaves': 54, 'learning_ra

In [169]:
study.best_params

{'colsample_bytree': 0.623130817542208,
 'learning_rate': 0.18361804024296968,
 'n_estimators': 90,
 'num_leaves': 23,
 'reg_alpha': 1.149742575892818,
 'reg_lambda': 0.6143015265831211,
 'subsample': 0.7929788821934193}

In [171]:
lgbmodel = lgb.LGBMClassifier(num_leaves=study.best_params['num_leaves'], 
                              learning_rate=study.best_params['learning_rate'],
                              n_estimators=study.best_params['n_estimators'], 
                              colsample_bytree=study.best_params['colsample_bytree'],
                              subsample=study.best_params['subsample'], 
                              reg_alpha=study.best_params['reg_alpha'],
                              reg_lambda=study.best_params['reg_lambda'],
                              min_split_gain = params['min_split_gain'],
                              min_child_weight = params['min_child_weight'],
                              min_child_samples = params['min_child_samples'],
                              subsample_for_bin = params['subsample_for_bin'])

In [172]:
lgbmodel.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None,
               colsample_bytree=0.623130817542208, importance_type='split',
               learning_rate=0.18361804024296968, max_depth=-1,
               min_child_samples=2, min_child_weight=1, min_split_gain=0.5,
               n_estimators=90, n_jobs=-1, num_leaves=23, objective=None,
               random_state=None, reg_alpha=1.149742575892818,
               reg_lambda=0.6143015265831211, silent=True,
               subsample=0.7929788821934193, subsample_for_bin=200,
               subsample_freq=0)

In [173]:
test

Unnamed: 0,Age,Embarked,Fare,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Cabletter,Cabnumber,Cabcount,fsize,fRatio,Adult,Surname,Title,Tickettype
0,34.5,1,7.8292,0,892,3,1,0,,0,1,9999,1,1.0,True,399,4,8
1,47.0,2,7.0000,0,893,3,0,1,,0,1,9999,2,0.5,True,841,5,8
2,62.0,1,9.6875,0,894,2,1,0,,0,1,9999,1,1.0,True,550,4,8
3,27.0,2,8.6625,0,895,3,1,0,,0,1,9999,1,1.0,True,849,4,8
4,22.0,2,12.2875,1,896,3,0,1,,0,1,9999,3,1.0,True,341,5,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,28.0,2,8.0500,0,1305,3,1,0,,0,1,9999,1,1.0,False,751,4,0
414,39.0,0,108.9000,0,1306,1,0,0,,3,7,1,1,1.0,True,591,1,5
415,38.5,2,7.2500,0,1307,3,1,0,,0,1,9999,1,1.0,True,697,4,6
416,28.0,2,8.0500,0,1308,3,1,0,,0,1,9999,1,1.0,False,822,4,8


In [174]:
prediction = lgbmodel.predict(test.drop(['Survived'],axis=1))
prediction

array([0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.,
       0., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0.,
       1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

In [175]:
submission = pd.DataFrame()
submission['PassengerId'] = test['PassengerId']
submission['Survived'] = np.int32(prediction)
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [176]:
submission.to_csv('Titanic.csv',index=False)
from google.colab import files
files.download('Titanic.csv') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>