In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb


from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV


from sklearn.model_selection import cross_val_score

# Data Preparation

In [2]:
# Read the raw train / test splits from the local workspace.
train_csv = "C:/Users/Daniel/Data Science/Workspace/Titanic/1. Original Data/train.csv"
test_csv = "C:/Users/Daniel/Data Science/Workspace/Titanic/1. Original Data/test.csv"
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# Stack the feature columns of both splits so they are preprocessed together.
# Slicing from 'Pclass' onwards leaves out every column located before it
# (PassengerId, and the Survived label in the training split).
feature_frames = [train.loc[:, 'Pclass':], test.loc[:, 'Pclass':]]
all_data = pd.concat(feature_frames, ignore_index=True)

In [3]:
def _extract_title(name):
    """Return the honorific of `name`: the segment after the first comma, cut at its first period."""
    after_comma = name.split(',')[1]
    before_period = after_comma.split('.')[0]
    return before_period.strip()


# Raw honorific for every passenger.
all_data['Title'] = all_data['Name'].map(_extract_title)

# Aggregated title -> raw honorifics that collapse into it.
_TITLE_GROUPS = {
    "Officer": ["Capt", "Col", "Major", "Dr", "Rev"],
    "Royalty": ["Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"],
    "Mrs":     ["Mme", "Ms", "Mrs"],
    "Miss":    ["Mlle", "Miss"],
    "Mr":      ["Mr"],
    "Master":  ["Master"],
}

# Flat lookup: raw honorific -> aggregated title.
Title_Dictionary = {
    raw_title: group
    for group, raw_titles in _TITLE_GROUPS.items()
    for raw_title in raw_titles
}

# Replace each raw honorific by its aggregated title
# (an honorific missing from the lookup becomes NaN).
all_data['Title'] = all_data['Title'].map(Title_Dictionary)

In [4]:
# 'Name' and 'Ticket' are not used as features from here on; remove both in one call.
all_data = all_data.drop(['Name', 'Ticket'], axis=1)

In [5]:
# Progress check for the imputation steps: call again after each fix.
def show_missing():
    """Return the names of the `all_data` columns that contain at least one null value."""
    has_null = all_data.isnull().any()
    return has_null[has_null].index.tolist()

# Frequency table for a categorical column
def cat_exploration(column):
    """Return the value counts of `all_data[column]`."""
    values = all_data[column]
    return values.value_counts()

# Constant-value imputation for a single column
def cat_imputation(column, value):
    """Overwrite the null entries of `all_data[column]` with `value` (modifies `all_data` in place)."""
    null_rows = all_data[column].isnull()
    all_data.loc[null_rows, column] = value

In [6]:
# Count the nulls per column, restricted to the columns that have any
incomplete_columns = all_data[show_missing()]
incomplete_columns.isnull().sum()

Age          263
Fare           1
Cabin       1014
Embarked       2
dtype: int64

In [7]:
# 'Cabin' is removed entirely rather than imputed.
all_data = all_data.drop(['Cabin'], axis=1)

In [8]:
# Column-specific fill values: median fare, most frequent embarkation port.
fill_values = {
    'Fare': all_data['Fare'].median(),
    'Embarked': all_data['Embarked'].mode()[0],
}
all_data = all_data.fillna(value=fill_values)

In [9]:
# Median of every numeric feature per (Sex, Pclass, Title) group.
# The numeric columns are selected explicitly: the frame still holds string columns,
# and a bare .median() on them raises a TypeError on recent pandas versions.
group_keys = ['Sex', 'Pclass', 'Title']
numeric_cols = [
    col for col in all_data.select_dtypes(include=[np.number]).columns
    if col not in group_keys
]
all_data.groupby(group_keys)[numeric_cols].median()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Age,SibSp,Parch,Fare
Sex,Pclass,Title,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,1,Miss,30.0,0,0,99.9625
female,1,Mrs,45.0,1,0,78.1125
female,1,Officer,49.0,0,0,25.9292
female,1,Royalty,39.0,0,0,86.5
female,2,Miss,20.0,0,0,20.25
female,2,Mrs,30.0,1,0,26.0
female,3,Miss,18.0,0,0,8.05
female,3,Mrs,31.0,1,1,15.5
male,1,Master,6.0,1,2,134.5
male,1,Mr,41.5,0,0,47.1


In [10]:
# Fill each missing age with the median age of the passenger's (Sex, Pclass, Title) group.
age_by_group = all_data.groupby(['Sex','Pclass','Title'])['Age']
all_data["Age"] = all_data['Age'].fillna(age_by_group.transform('median'))

In [11]:
# Re-run the null count after imputation
still_missing = show_missing()
all_data[still_missing].isnull().sum()

Series([], dtype: float64)

# Feature Creation

In [12]:
# Family size = parents/children + siblings/spouses + the passenger themselves
family_size = all_data['Parch'] + all_data['SibSp'] + 1
all_data['FamilySize'] = family_size

# 0/1 indicator columns for three family-size bands: alone, 2-4 members, 5 or more
all_data['Singleton'] = (family_size == 1).astype('int64')
all_data['SmallFamily'] = family_size.between(2, 4).astype('int64')
all_data['LargeFamily'] = (family_size >= 5).astype('int64')

# Final Preparation

In [13]:
# One-hot encode the remaining categorical columns; every level is kept (drop_first is not used)
all_data = pd.get_dummies(data=all_data)

In [14]:
# Scale every column by its maximum, which maps to [0, 1] as long as the column
# has no negative values (assumed for these features).
# Cast to float first: recent pandas versions return boolean dummy columns, and
# dividing a bool column by its bool maximum is not implemented there.
all_data = all_data.astype(float)
column_max = all_data.max().replace(0, 1)  # an all-zero column stays 0 instead of 0/0 = NaN
all_data = all_data / column_max

In [15]:
# Preview the first five rows of the prepared feature table
all_data.head(n=5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,FamilySize,Singleton,SmallFamily,LargeFamily,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
0,1.0,0.275,0.125,0.0,0.014151,0.181818,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.333333,0.475,0.125,0.0,0.139136,0.181818,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.325,0.0,0.0,0.015469,0.090909,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.333333,0.4375,0.125,0.0,0.103644,0.181818,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.4375,0.0,0.0,0.015713,0.090909,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [16]:
# Split the combined table back into model inputs.
# The training rows were stacked first, so the first len(train) rows belong to the training split.
n_train = len(train)
X_train = all_data.iloc[:n_train].copy()
X_test = all_data.iloc[n_train:].copy()

# Target labels come straight from the original training frame.
y = train['Survived'].copy()

# Models

## Logistic Regression

In [17]:
# Fit a LogisticRegressionCV with default settings on the training matrix.
model_log = LogisticRegressionCV()
model_log.fit(X_train, y)

# Class labels for the test passengers
# (model_log.predict_proba(X_test)[:, 1] would give probabilities instead).
log_preds = model_log.predict(X_test)

In [18]:
# cross_val_score(model_log, X_train, y, scoring='accuracy', cv=5).mean()

## RandomForestClassifier

In [19]:
# Hyper-parameter grid for the random forest: 3 depths x 4 forest sizes x 2 split criteria.
param_grid = [ { 'max_depth' : [3, 4, 5],
                 'n_estimators': [20, 75, 100, 200],
                 'criterion': ['gini','entropy']}]

# 5-fold cross-validated grid search scored on accuracy.
# random_state is fixed so the search (and the refitted best model) is reproducible
# across kernel restarts.
clf = GridSearchCV(RandomForestClassifier(n_jobs=-1, random_state=42),
                   param_grid, scoring='accuracy', cv=5)

clf.fit(X_train, y)

# Full cross-validation report, best-ranked parameter combinations first.
# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
summary = pd.DataFrame(clf.cv_results_)
summary.sort_values(by='rank_test_score', ascending=True)



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_criterion,param_max_depth,param_n_estimators,params,rank_test_score,split0_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
17,0.156659,0.111375,0.835017,0.836982,entropy,4,75,"{'criterion': 'entropy', 'max_depth': 4, 'n_es...",1,0.860335,...,0.825843,0.83871,0.803371,0.842917,0.858757,0.831933,0.000776,0.00246,0.021706,0.004072
19,0.234429,0.109428,0.833895,0.83614,entropy,4,200,"{'criterion': 'entropy', 'max_depth': 4, 'n_es...",2,0.854749,...,0.825843,0.83871,0.803371,0.842917,0.858757,0.829132,0.000165,4.5e-05,0.020477,0.00508
5,0.157365,0.111368,0.833895,0.83614,gini,4,75,"{'criterion': 'gini', 'max_depth': 4, 'n_estim...",2,0.849162,...,0.825843,0.83871,0.803371,0.842917,0.864407,0.829132,0.002193,0.002437,0.020998,0.00508
7,0.257379,0.109483,0.833895,0.836421,gini,4,200,"{'criterion': 'gini', 'max_depth': 4, 'n_estim...",2,0.854749,...,0.825843,0.83871,0.803371,0.842917,0.864407,0.830532,0.045994,0.000121,0.022442,0.004711
23,0.23477,0.110595,0.832772,0.839508,entropy,5,200,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",5,0.849162,...,0.825843,0.83871,0.803371,0.847125,0.864407,0.833333,0.002687,0.002263,0.02149,0.004427
2,0.174831,0.109388,0.832772,0.835581,gini,3,100,"{'criterion': 'gini', 'max_depth': 3, 'n_estim...",5,0.843575,...,0.825843,0.83871,0.803371,0.842917,0.864407,0.82493,0.002487,4.4e-05,0.020289,0.00652
6,0.174118,0.109369,0.832772,0.836701,gini,4,100,"{'criterion': 'gini', 'max_depth': 4, 'n_estim...",5,0.854749,...,0.825843,0.83871,0.803371,0.842917,0.858757,0.830532,0.002693,4.2e-05,0.020981,0.004865
18,0.171863,0.110262,0.832772,0.836421,entropy,4,100,"{'criterion': 'entropy', 'max_depth': 4, 'n_es...",5,0.849162,...,0.825843,0.83871,0.803371,0.842917,0.858757,0.830532,0.000101,0.001726,0.01943,0.004711
21,0.156178,0.110577,0.83165,0.838948,entropy,5,75,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",9,0.854749,...,0.825843,0.83871,0.803371,0.84432,0.853107,0.831933,0.000131,0.002164,0.019672,0.003997
20,0.127284,0.109448,0.83165,0.843997,entropy,5,20,"{'criterion': 'entropy', 'max_depth': 5, 'n_es...",9,0.843575,...,0.825843,0.83871,0.803371,0.85554,0.864407,0.838936,0.004462,0.000172,0.020738,0.006514


In [20]:
# Report the winning cross-validated score and parameter combination, one per line.
best_score_line = 'Best score: {}'.format(clf.best_score_)
best_params_line = 'Best parameters: {}'.format(clf.best_params_)
print(best_score_line, best_params_line, sep='\n')

Best score: 0.835016835016835
Best parameters: {'criterion': 'entropy', 'max_depth': 4, 'n_estimators': 75}


In [21]:
# Predict with the forest that the grid search refitted on the full training set
# using its best parameter combination.
rf_preds = clf.best_estimator_.predict(X_test)

In [22]:
# cross_val_score(clf, X_train, y, scoring='accuracy', cv=5).mean()

## GradientBoostingClassifier

In [23]:
# Hyper-parameter grid for gradient boosting.
param_grid = [ {'subsample' : [0.75, 1],
                'n_estimators':[200, 300],
                'max_depth':[3, 4],
                'learning_rate':[0.07, 0.09, 0.1] }]

# 5-fold cross-validated grid search scored on accuracy.
# subsample < 1 makes boosting stochastic, so random_state is fixed to keep the
# selected parameters and the predictions reproducible between runs.
clf2 = GridSearchCV(GradientBoostingClassifier(random_state=42),
                    param_grid, scoring='accuracy', cv=5)

clf2.fit(X_train, y)

print('Best score: {}'.format(clf2.best_score_))
print('Best parameters: {}'.format(clf2.best_params_))

# Predict with the best estimator, which the search refits on the full training set.
gbc_preds = clf2.predict(X_test)

Best score: 0.8372615039281706
Best parameters: {'learning_rate': 0.07, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.75}


In [24]:
# cross_val_score(clf2, X_train, y, scoring='accuracy', cv=5).mean()

# Ensemble

In [28]:
# Average the three models' label predictions and round to the nearest integer label.
member_preds = np.array([log_preds, gbc_preds, rf_preds])
mean_vote = member_preds.sum(axis=0) / 3
preds = np.round(mean_vote).astype(int)

# Solution to CSV

In [29]:
# Pair every test passenger id with its ensemble prediction and write the submission file.
solution = test[["PassengerId"]].assign(Survived=preds)
solution.to_csv("submission.csv", index=False)