In [None]:
# !pip install kaggle
# from google.colab import files
# files.upload()
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle competitions download -c titanic
# !ls

In [None]:
# The usuals
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Regular expressions
import re

# LightGBM\XGBoost\GBDT
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import 

# Sklearn tools for model training and assessment
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc, accuracy_score


In [None]:
# Read the Kaggle Titanic train/test CSV files from the working directory
trainRaw = pd.read_csv('train.csv')
testRaw = pd.read_csv('test.csv')

# Remember how many rows belong to the training file so the stacked frame
# can be split apart again after feature engineering
TrainRow = len(trainRaw)

# Stack train on top of test (row-wise); sort=True orders the columns alphabetically
allsets = pd.concat((trainRaw, testRaw), sort=True)

In [None]:
# Build new features from Cabin
# Separate Cabin into a letter and a number for passengers who have a Cabin value
def CabSplit(s):
    """
    Split a single cabin token such as "C85" into its letter and number.

    Parameters
    ----------
    s : str
        One cabin token without spaces, e.g. "C85", or a bare letter
        such as "F".

    Returns
    -------
    tuple
        (letter, number). `letter` is the leading run of letters, or '' when
        the token does not start with a letter. `number` is the run of digits
        that directly follows the letters, kept as a string, or the integer
        sentinel 9999 when no digits follow.
    """
    # The digit group is optional so that letter-only tokens ("F", "D")
    # keep their letter instead of being reported as "no cabin letter".
    # re.I makes the letter match case-insensitive.
    match = re.match(r"([a-z]+)([0-9]*)", s, re.I)

    if match is None:
        # Token does not start with a letter at all
        return '', 9999

    letter = match.group(1)
    # An empty digit group means there was no cabin number: use the sentinel
    number = match.group(2) or 9999
    return letter, number

# Count the number of cabins listed for a passenger
def DR(s):
    """
    Turn one raw Cabin cell into [letter, number, number of cabins].

    A cell that is an int or float (rather than a string) is treated as
    "no cabin information" and yields ['', '', 9999]. Otherwise the string
    is split on single spaces, the number of pieces is the cabin count, and
    only the first piece is passed to CabSplit for the letter and number.
    """
    # Numeric cell => no cabin recorded for this passenger.
    # NOTE(review): here the missing *number* is '' and the missing *count*
    # is 9999, whereas CabSplit uses 9999 for a missing number - confirm
    # that this asymmetry is intended.
    if isinstance(s, (int, float)):
        return ['', '', 9999]

    # Some cells list several cabins separated by spaces
    cabins = s.split(' ')

    # Letter/number come from the first listed cabin only
    letter, number = CabSplit(cabins[0])

    return [letter, number, len(cabins)]

# Parse every Cabin cell into a [letter, number, cabin count] list with DR
out = allsets['Cabin'].apply(DR)

# Expand the per-row lists into three named columns with one constructor
# call (apply(pd.Series) would build a separate Series for every row).
# The original index is reused so the new frame lines up row-for-row
# with allsets in the concatenation below.
out = pd.DataFrame(out.tolist(), index=out.index,
                   columns=['Cabletter', 'Cabnumber', 'Cabcount'])

# Append the new cabin columns to the dataset
allsets = pd.concat([allsets, out], axis=1)


In [None]:
# Add some family features directly to new columns in the dataset

# Family size: SibSp + Parch + 1 (the +1 presumably counts the passenger)
allsets['fsize'] = allsets['SibSp'] + allsets['Parch'] + 1

# Ratio of (Parch + 1) to (SibSp + 1); the +1 keeps the denominator non-zero
allsets['fRatio'] = (allsets['Parch'] + 1) / (allsets['SibSp'] + 1)

# Adult flag: age above 18.
# Age still contains NaN at this point and `NaN > 18` evaluates to False,
# which would mark every passenger of unknown age as a non-adult even though
# Age is filled with the median further down. Comparing the median-filled
# age keeps this flag consistent with the Age column the model sees.
allsets['Adult'] = allsets['Age'].fillna(allsets['Age'].median()) > 18

In [None]:
# Extract titles from Name column, standardise
# Lookup table: raw title as it appears in a passenger name (key) ->
# standardised title category (value). It is passed to NameSplit below.
# Key order is significant wherever the table is scanned for a first match,
# so entries should not be reordered casually.
# NOTE(review): the key "theCountess" contains no space - confirm that the
# name parser actually produces this exact token.
titleDict = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Sir",
    "Don": "Sir",
    "Sir": "Sir",
    "Dr": "Dr",
    "Rev": "Rev",
    "theCountess": "Lady",
    "Dona": "Lady",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Lady"
}

def NameSplit(s, titleDict):
    """
    Extract the surname and a standardised title from a passenger name.

    Names of the form "Surname, Title. Given names" are parsed positionally:
    the surname is everything before the first comma and the raw title is
    the text between that comma and the next period. Spaces inside the raw
    title are removed before lookup, so "the Countess" is looked up as
    "theCountess". Parsing by position (instead of searching every word of
    the name for a title key) keeps multi-word surnames intact and prevents
    a given name that happens to equal a title key from being picked up.

    Names that do not follow that layout fall back to a word-by-word scan:
    the first space-separated word (without a trailing comma) is the surname
    and the first titleDict key found among the words gives the title.

    Parameters
    ----------
    s : str
        Raw passenger name.
    titleDict : dict
        Maps raw title -> standardised title.

    Returns
    -------
    tuple
        (surname, title); title is 'other' when no key of titleDict matches.
    """
    match = re.match(r"\s*([^,]*),\s*([^.]*)\.", s)

    if match is not None:
        surname = match.group(1).strip()
        # Collapse inner spaces so multi-word titles match their dict key
        raw_title = match.group(2).replace(' ', '')
        title = next(
            (t for k, t in titleDict.items() if str(k) == raw_title),
            'other',
        )
        return surname, title

    # Fallback for names without the "Surname, Title." layout
    tokens = s.replace('.', '').split(' ')
    surname = tokens[0].strip(',')
    title = next(
        (t for k, t in titleDict.items() if str(k) in tokens),
        'other',
    )
    return surname, title

# Parse every Name into a (surname, title) pair
out = allsets['Name'].apply(NameSplit, args=[titleDict])

# Expand the pairs into two named columns in one constructor call
# (apply(pd.Series) would build a separate Series for every row); the
# original index is reused so rows stay aligned with allsets.
out = pd.DataFrame(out.tolist(), index=out.index,
                   columns=['Surname', 'Title'])

# Append the new name-derived columns to the dataset
allsets = pd.concat([allsets, out], axis=1)


In [None]:
# Lookup table: ticket prefix token (key) -> coarse ticket group (value).
# It is passed to SP below, which compares these keys against the
# space-separated pieces of a ticket string that does not start with digits.
# Key order is significant because the first matching entry wins.
# NOTE(review): "PP" maps to "PC" - confirm this grouping is intentional.
TicketDict = {
    "A./5.": "A",
    "A.5.": "A",
    "A/4": "A",
    "A/4.": "A",
    "A/5": "A",
    "A/5.": "A",
    "A/S": "A",
    "A4.": "A",
    "C": "C",
    "C.A.":"C",
    "CA.": "C",
    "C.A./": "C",
    "CA": "C",
    "F.C.": "FC",
    "F.C.C.": "FC",
    "LINE":"LINE",
    "PC": "PC",
    "PP": "PC",
    "SOTON/O.Q.": "SO",
    "SOTON/O2": "SO",
    "SOTON/OQ": "SO",
    "STON/O" : "ST",
    "STON/O2.": "ST"
}

def splitTic(s):
    """
    Return the leading digits of a ticket string as an int.

    If the string does not begin with at least one digit 0-9, the string is
    returned unchanged.
    """
    leading_digits = re.match(r"([0-9]+)", s, re.I)

    # No digits at the very start: hand the original text back to the caller
    if leading_digits is None:
        return s

    return int(leading_digits.group(1))


def SP(s, TicketDict):
    """
    Classify a raw ticket string into a coarse ticket type.

    The ticket is first passed through splitTic. If that yields an int
    (the ticket starts with digits), the type is a size bucket:

        'a'  value <= 9999
        'b'  10000 .. 99999
        'c'  100000 .. 999999
        'd'  anything larger

    Otherwise the ticket text is split on single spaces and the value of the
    first TicketDict key (in dict order) that equals one of the pieces is
    returned; 'Other' is returned when no key matches.

    Parameters
    ----------
    s : str
        Raw ticket string.
    TicketDict : dict
        Maps ticket prefix token -> ticket type.

    Returns
    -------
    str
        The ticket type label.
    """
    s = splitTic(s)

    if isinstance(s, int):
        # One elif chain so exactly one bucket applies. With independent
        # `if` statements the trailing `else` belonged only to the last
        # test and overwrote 'a' and 'b' with 'd'. Inclusive upper bounds
        # also keep 99999 and 999999 in the bucket of their digit count.
        if s <= 9999:
            tic = 'a'
        elif s <= 99999:
            tic = 'b'
        elif s <= 999999:
            tic = 'c'
        else:
            tic = 'd'
        return tic

    tokens = s.split(' ')

    # First dictionary entry whose key appears among the ticket's pieces
    return next(
        (t for k, t in TicketDict.items() if str(k) in tokens),
        'Other',
    )

# Classify every Ticket string into a coarse ticket type
out = allsets['Ticket'].apply(SP, args=[TicketDict])

# SP returns a single label per row, so the Series only needs to become a
# one-column frame (apply(pd.Series) would build a Series for every row)
out = out.to_frame(name='Tickettype')

# Append the ticket type column to the dataset
allsets = pd.concat([allsets, out], axis=1)


In [None]:
# Columns whose values are replaced by integer codes
catCols = ['Sex', 'Embarked', 'Cabletter', 'Cabnumber', 'Surname', 'Title', 'Tickettype']

# Integer-encode each categorical column; factorize returns (codes, uniques)
# and only the codes are kept
for col in catCols:
    allsets[col] = allsets[col].factorize()[0]

# Fill missing Age and Fare values with the median of the respective column
for col in ('Age', 'Fare'):
    allsets[col] = allsets[col].fillna(allsets[col].median())

# The raw text columns are not used as model inputs
allsets = allsets.drop(columns=['Ticket', 'Cabin', 'Name'])

In [None]:
# Display the combined frame after feature engineering, encoding and imputation
allsets

Unnamed: 0,Age,Embarked,Fare,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Cabletter,Cabnumber,Cabcount,fsize,fRatio,Adult,Surname,Title,Tickettype
0,22.0,0,7.2500,0,1,3,0,1,0.0,0,0,9999,2,0.5,True,0,0,0
1,38.0,1,71.2833,0,2,1,1,1,1.0,1,1,1,2,0.5,True,1,1,1
2,26.0,0,7.9250,0,3,3,1,0,1.0,0,0,9999,1,1.0,True,2,2,2
3,35.0,0,53.1000,0,4,1,1,1,1.0,1,2,1,2,0.5,True,3,1,3
4,35.0,0,8.0500,0,5,3,0,0,0.0,0,0,9999,1,1.0,True,4,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,28.0,0,8.0500,0,1305,3,0,0,,0,0,9999,1,1.0,False,863,0,0
414,39.0,1,108.9000,0,1306,1,1,0,,1,105,1,1,1.0,True,864,8,1
415,38.5,0,7.2500,0,1307,3,0,0,,0,0,9999,1,1.0,True,865,0,7
416,28.0,0,8.0500,0,1308,3,0,0,,0,0,9999,1,1.0,False,810,0,3


In [None]:
# Count missing values per column and keep only the columns that have any
misstrain = allsets.isna().sum().loc[lambda counts: counts > 0]
misstrain

Survived    418
dtype: int64

In [None]:
# Undo the earlier stacking: the first TrainRow rows came from train.csv,
# the remaining rows from test.csv. The test part has no Survived values,
# so that column is removed from it.
train = allsets.iloc[:TrainRow]
test = allsets.iloc[TrainRow:].drop(columns=['Survived'])

In [None]:
# Display the test rows (Survived column already removed)
test

Unnamed: 0,Age,Embarked,Fare,Parch,PassengerId,Pclass,Sex,SibSp,Cabletter,Cabnumber,Cabcount,fsize,fRatio,Adult,Surname,Title,Tickettype
0,34.5,2,7.8292,0,892,3,0,0,0,0,9999,1,1.0,True,261,0,3
1,47.0,0,7.0000,0,893,3,1,1,0,0,9999,2,0.5,True,661,1,3
2,62.0,2,9.6875,0,894,2,0,0,0,0,9999,1,1.0,True,662,0,3
3,27.0,0,8.6625,0,895,3,0,0,0,0,9999,1,1.0,True,663,0,3
4,22.0,0,12.2875,1,896,3,1,1,0,0,9999,3,1.0,True,394,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,28.0,0,8.0500,0,1305,3,0,0,0,0,9999,1,1.0,False,863,0,0
414,39.0,1,108.9000,0,1306,1,1,0,1,105,1,1,1.0,True,864,8,1
415,38.5,0,7.2500,0,1307,3,0,0,0,0,9999,1,1.0,True,865,0,7
416,28.0,0,8.0500,0,1308,3,0,0,0,0,9999,1,1.0,False,810,0,3


In [None]:
# Hold out 30% of the training rows for validation, stratified on Survived.
# Survived (the label) and PassengerId are excluded from the features.
X_trainData, X_validData, y_train, y_valid = train_test_split(
    train.drop(columns=['Survived', 'PassengerId']),
    train['Survived'],
    test_size=0.3,
    stratify=train['Survived'],
    random_state=28,
)

In [None]:
# Display the feature rows selected for training
X_trainData

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Cabletter,Cabnumber,Cabcount,fsize,fRatio,Adult,Surname,Title,Tickettype
520,30.0,0,93.5000,0,1,1,0,6,69,1,1,1.00,True,424,2,4
241,28.0,2,15.5000,0,3,1,1,0,0,9999,2,0.50,False,211,2,3
88,23.0,0,263.0000,2,1,1,3,1,7,3,6,0.75,True,26,2,4
20,35.0,0,26.0000,0,2,0,0,0,0,9999,1,1.00,True,20,0,3
874,28.0,1,24.0000,0,2,1,1,0,0,9999,2,0.50,True,266,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,42.0,0,8.4042,1,3,0,0,0,0,9999,2,2.00,True,140,0,4
224,38.0,0,90.0000,0,1,0,1,1,33,1,2,0.50,True,198,0,4
522,28.0,1,7.2250,0,3,0,0,0,0,9999,1,1.00,False,426,0,4
512,36.0,0,26.2875,0,1,0,0,2,68,1,1,1.00,True,416,0,1


In [None]:
# Gradient-boosted trees with row subsampling; seeded for repeatable results
gbdt = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.03,
    subsample=0.7,
    min_samples_leaf=3,
    random_state=28,
)
gbdt.fit(X_trainData, y_train)

# Class predictions on the training and validation parts
train_pred = gbdt.predict(X_trainData)
valid_pred = gbdt.predict(X_validData)

# Accuracy on each part (the same quantity the classifier's .score() reports)
acc_train = accuracy_score(y_train, train_pred)
acc_valid = accuracy_score(y_valid, valid_pred)

print(acc_train)
print(acc_valid)

0.9550561797752809
0.8134328358208955


In [None]:
# Predict survival for the test rows; PassengerId is dropped because it was
# not among the columns the model was fitted on
predtest = gbdt.predict(test.drop(columns=['PassengerId']))

# Submission frame: PassengerId next to its prediction cast to 32-bit integers
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predtest.astype(np.int32),
})

In [None]:
# Display the submission frame (PassengerId and predicted Survived)
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
# submission.to_csv('Titanic.csv',index=False)
# from google.colab import files
# files.download('Titanic.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>