# Using classification to predict survival with the Titanic dataset

In [210]:
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.metrics import confusion_matrix
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from pprint import pprint

In [154]:
ti = pd.read_csv("titanic.csv")

### Exploratory Data Analysis

In [155]:
ti.head(30)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False


In [156]:
ti.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [157]:
ti.describe(include=['object'])

Unnamed: 0,sex,embarked,class,who,deck,embark_town,alive
count,891,889,891,891,203,889,891
unique,2,3,3,3,7,3,2
top,male,S,Third,man,C,Southampton,no
freq,577,644,491,537,59,644,549


In [158]:
ti.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    object 
 3   age          714 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embarked     889 non-null    object 
 8   class        891 non-null    object 
 9   who          891 non-null    object 
 10  adult_male   891 non-null    bool   
 11  deck         203 non-null    object 
 12  embark_town  889 non-null    object 
 13  alive        891 non-null    object 
 14  alone        891 non-null    bool   
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 92.4+ KB


In [159]:
ti.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

### Data Cleansing and One Hot Encoding

In [160]:
ti['age'].fillna((ti['age'].mean()), inplace=True)

In [161]:
sex_dummies = pd.get_dummies(ti.sex, prefix='sex')
sex_dummies.drop(sex_dummies.columns[0], axis=1, inplace=True)
ti= pd.concat([ti, sex_dummies], axis=1)
ti.head()
ti.isna().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
sex_male         0
dtype: int64

In [162]:
ti['embarked'].fillna('S',inplace=True)
embark_dummies = pd.get_dummies(ti.embarked, prefix='embark')
embark_dummies.drop(embark_dummies.columns[0], axis=1, inplace=True)
ti= pd.concat([ti, embark_dummies], axis=1)

In [163]:
who_dummies = pd.get_dummies(ti.who, prefix='who')
who_dummies.drop(who_dummies.columns[0], axis=1, inplace=True)
ti= pd.concat([ti, who_dummies], axis=1)

In [164]:
ti['deck'].fillna('H', inplace=True)
ti['deck']=ti['deck'].replace({'A':7,'B':6,'C':5,'D':4,'E':3,'F':2,'G':1,'H':0})

In [165]:
ti['class']=ti['class'].replace({'First':1,'Second':2,'Third':3})

In [166]:
print(ti.head())
ti.isna().sum()

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S      3   
1         1       1  female  38.0      1      0  71.2833        C      1   
2         1       3  female  26.0      0      0   7.9250        S      3   
3         1       1  female  35.0      1      0  53.1000        S      1   
4         0       3    male  35.0      0      0   8.0500        S      3   

     who  adult_male  deck  embark_town alive  alone  sex_male  embark_Q  \
0    man        True     0  Southampton    no  False         1         0   
1  woman       False     5    Cherbourg   yes  False         0         0   
2  woman       False     0  Southampton   yes   True         0         0   
3  woman       False     5  Southampton   yes  False         0         0   
4    man        True     0  Southampton    no   True         1         0   

   embark_S  who_man  who_woman  
0         1        1          0  
1         0       

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    2
alive          0
alone          0
sex_male       0
embark_Q       0
embark_S       0
who_man        0
who_woman      0
dtype: int64

### Feature Engineering

In [167]:
cols=['pclass','age','sibsp','parch','fare','class','deck','alone','sex_male','embark_Q','embark_S','who_man','who_woman']
incsurv=['survived','pclass','age','sibsp','parch','fare','class','deck','alone','sex_male','embark_Q','embark_S','who_man','who_woman']
ti1=ti[cols]
ti2=ti[incsurv]

In [168]:
ti1.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,class,deck,alone,sex_male,embark_Q,embark_S,who_man,who_woman
0,3,22.0,1,0,7.25,3,0,False,1,0,1,1,0
1,1,38.0,1,0,71.2833,1,5,False,0,0,0,0,1
2,3,26.0,0,0,7.925,3,0,True,0,0,1,0,1
3,1,35.0,1,0,53.1,1,5,False,0,0,1,0,1
4,3,35.0,0,0,8.05,3,0,True,1,0,1,1,0


In [169]:
for i in cols:
    p_value = stats.pearsonr(ti[i], ti['survived'])[1]
    print(f'{i}: {p_value}')

pclass: 2.537047387978631e-25
age: 0.037217083726811706
sibsp: 0.2922439286980196
parch: 0.014799245374712841
fare: 6.120189341917992e-15
class: 2.537047387978631e-25
deck: 2.5189952726553e-19
alone: 9.009490179320951e-10
sex_male: 1.406066130878875e-69
embark_Q: 0.913353235242466
embark_S: 7.223240983680848e-06
who_man: 8.998259162083635e-74
who_woman: 3.009425929546828e-59


In [170]:
ti[incsurv].corr()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,class,deck,alone,sex_male,embark_Q,embark_S,who_man,who_woman
survived,1.0,-0.338481,-0.069809,-0.035322,0.081629,0.257307,-0.338481,0.294804,-0.203367,-0.543351,0.00365,-0.149683,-0.55708,0.506562
pclass,-0.338481,1.0,-0.331339,0.083081,0.018443,-0.5495,1.0,-0.743251,0.135207,0.1319,0.221009,0.074053,0.094035,-0.177049
age,-0.069809,-0.331339,1.0,-0.232625,-0.179191,0.091566,-0.331339,0.249842,0.179775,0.084153,-0.013855,-0.019336,0.253236,0.094168
sibsp,-0.035322,0.083081,-0.232625,1.0,0.414838,0.159651,0.083081,-0.041333,-0.584471,-0.114631,-0.026354,0.068734,-0.253586,0.047071
parch,0.081629,0.018443,-0.179191,0.414838,1.0,0.216225,0.018443,0.031308,-0.583398,-0.245489,-0.081228,0.060814,-0.349943,0.150167
fare,0.257307,-0.5495,0.091566,0.159651,0.216225,1.0,-0.5495,0.525994,-0.271832,-0.182333,-0.117216,-0.162184,-0.182024,0.191243
class,-0.338481,1.0,-0.331339,0.083081,0.018443,-0.5495,1.0,-0.743251,0.135207,0.1319,0.221009,0.074053,0.094035,-0.177049
deck,0.294804,-0.743251,0.249842,-0.041333,0.031308,0.525994,-0.743251,1.0,-0.137515,-0.118282,-0.128407,-0.136063,-0.098553,0.151676
alone,-0.203367,0.135207,0.179775,-0.584471,-0.583398,-0.271832,0.135207,-0.137515,1.0,0.303646,0.086464,0.029074,0.404744,-0.211036
sex_male,-0.543351,0.1319,0.084153,-0.114631,-0.245489,-0.182333,0.1319,-0.118282,0.303646,1.0,-0.074115,0.119224,0.908578,-0.896214


In [171]:
fecols=['pclass','age','parch','fare','class','deck','alone','sex_male','embark_Q','embark_S','who_man','who_woman']

### Logistic Regression

In [172]:
model_logistic = LogisticRegression()

In [174]:
x_train, x_test, y_train, y_test = train_test_split(ti2.iloc[:,1:],ti2.iloc[:,0],test_size=0.2,random_state=42)
ti2.shape, x_train.shape, y_train.shape, x_test.shape, y_test.shape

((891, 14), (712, 13), (712,), (179, 13), (179,))

In [175]:
model_logistic.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [176]:
predictions = model_logistic.predict(x_test)
score = model_logistic.score(x_test, y_test)
print (score)

0.8100558659217877


In [177]:
scoring = 'roc_auc'
results = model_selection.cross_val_score(model_logistic ,ti2.iloc[:,1:] , ti2.iloc[:,0], scoring=scoring)
print(("AUC: %.3f (%.3f)") % (results.mean(), results.std()))

AUC: 0.860 (0.018)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [178]:
matrix = confusion_matrix(y_test, predictions)
print(matrix)

[[91 14]
 [20 54]]


In [214]:
pprint(model_logistic.get_params())

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}


### Decision Trees

In [179]:
model = tree.DecisionTreeClassifier(criterion='entropy')

In [215]:
model.fit(x_train, y_train)
predictions = model.predict(x_test)
score = model.score(x_test, y_test)
print(score)

0.7988826815642458


In [197]:
scoring = 'roc_auc'
results = model_selection.cross_val_score(model ,ti2.iloc[:,1:] , ti2.iloc[:,0], scoring=scoring)
print(("AUC: %.3f (%.3f)") % (results.mean(), results.std()))

AUC: 0.864 (0.040)


In [195]:
matrix = confusion_matrix(y_test, predictions)
print(matrix)

[[87 18]
 [18 56]]


In [213]:
pprint(model.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


### Random Forest

In [185]:
model= RandomForestClassifier(n_estimators=1000)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
score = model.score(x_test, y_test)
print(score)

0.7988826815642458


In [191]:
scoring = 'roc_auc'
results = model_selection.cross_val_score(model ,ti2.iloc[:,1:] , ti2.iloc[:,0], scoring=scoring)
print(("AUC: %.3f (%.3f)") % (results.mean(), results.std()))

AUC: 0.864 (0.041)


In [194]:
matrix = confusion_matrix(y_test, predictions)
print(matrix)

[[87 18]
 [18 56]]


In [219]:
pprint(model.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


### Gradient Boosting

In [207]:
gb_clf = GradientBoostingClassifier(n_estimators=500, random_state=0)
gb_clf.fit(x_train, y_train)

Accuracy score (validation): 0.799


In [208]:
score = gb_clf.score(x_test, y_test)
print(score)

0.7988826815642458


In [209]:
scoring = 'roc_auc'
results = model_selection.cross_val_score(gb_clf ,ti2.iloc[:,1:] , ti2.iloc[:,0], scoring=scoring)
print(("AUC: %.3f (%.3f)") % (results.mean(), results.std()))

AUC: 0.858 (0.036)


In [193]:
predictions = gb_clf.predict(x_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

Confusion Matrix:
[[87 18]
 [18 56]]


In [212]:
pprint(gb_clf.get_params())

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_iter_no_change': None,
 'random_state': 0,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}


In [None]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90],
    'n_estimators': [10, 20]
}

### Hyperparameter tuning

In [199]:
rf = RandomForestClassifier(random_state = 42)

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [201]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90],
    'n_estimators': [10, 20]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 2, n_jobs = -1, verbose = 2)

In [202]:
grid_search.fit(x_train, y_train)
grid_search.best_params_

Fitting 2 folds for each of 4 candidates, totalling 8 fits


{'bootstrap': True, 'max_depth': 90, 'n_estimators': 10}

In [204]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels.count())
    accuracy = 100 - mape
    print('Model Performance')
    print('Error: {:0.4f}.'.format(np.sum(errors)))
    print('Average Error: {:0.4f}'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy

In [205]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, x_test, y_test)

Model Performance
Error: 30.0000.
Average Error: 0.1676
Accuracy = 99.91%.


In [243]:
lr_paramgrid={
    'max_iter':[100,200]
}

dt_paramgrid = {
    'criterion':['entropy'],
    'max_depth':[80,90],  
}

rf_paramgrid = {
    'bootstrap':[True],
    'max_depth':[80,90],
    'max_features':[10,20],
    'max_leaf_nodes':[10,20],
    'min_samples_leaf':[1,5],
    'n_estimators':[10,20]    
}

gb_paramgrid={
    'learning_rate':[0.01,0.5],
    'max_depth':[2,10],
    'max_features':[10,20],
    'max_leaf_nodes':[10,20],
    'min_samples_leaf':[1,5],
    'n_estimators':[10,20]   
}

lr=(LogisticRegression(),lr_paramgrid)
dt = (tree.DecisionTreeClassifier(),dt_paramgrid)
rf= (RandomForestClassifier(),rf_paramgrid)
gb = (GradientBoostingClassifier(),gb_paramgrid)
methods=[lr,dt,rf,gb]

for method in methods:
    print(method)
    grid_search = GridSearchCV(estimator = method[0], param_grid = method[1], 
                          cv = 2, n_jobs = -1, verbose = 2)
    grid_search.fit(x_train, y_train)
    grid_search.best_params_
    best_grid = grid_search.best_estimator_
    grid_accuracy = evaluate(best_grid, x_test, y_test)
    print('-------------------------')

(LogisticRegression(), {'max_iter': [100, 200]})
Fitting 2 folds for each of 2 candidates, totalling 4 fits
Model Performance
Error: 34.0000.
Average Error: 0.1899
Accuracy = 99.89%.
-------------------------
(DecisionTreeClassifier(), {'criterion': ['entropy'], 'max_depth': [80, 90]})


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fitting 2 folds for each of 2 candidates, totalling 4 fits
Model Performance
Error: 39.0000.
Average Error: 0.2179
Accuracy = 99.88%.
-------------------------
(RandomForestClassifier(), {'bootstrap': [True], 'max_depth': [80, 90], 'max_features': [10, 20], 'max_leaf_nodes': [10, 20], 'min_samples_leaf': [1, 5], 'n_estimators': [10, 20]})
Fitting 2 folds for each of 32 candidates, totalling 64 fits


 0.8244382  0.82865169        nan        nan        nan        nan
        nan        nan        nan        nan 0.8258427  0.81741573
 0.8244382  0.8258427  0.81601124 0.82162921 0.83146067 0.83146067
        nan        nan        nan        nan        nan        nan
        nan        nan]


Model Performance
Error: 33.0000.
Average Error: 0.1844
Accuracy = 99.90%.
-------------------------
(GradientBoostingClassifier(), {'learning_rate': [0.01, 0.5], 'max_depth': [2, 10], 'max_features': [10, 20], 'max_leaf_nodes': [10, 20], 'min_samples_leaf': [1, 5], 'n_estimators': [10, 20]})
Fitting 2 folds for each of 64 candidates, totalling 128 fits
Model Performance
Error: 33.0000.
Average Error: 0.1844
Accuracy = 99.90%.
-------------------------


 0.62359551 0.62359551        nan        nan        nan        nan
        nan        nan        nan        nan 0.62359551 0.62359551
 0.62359551 0.62359551 0.62359551 0.62359551 0.62359551 0.62359551
        nan        nan        nan        nan        nan        nan
        nan        nan 0.82022472 0.8258427  0.82724719 0.82865169
 0.82162921 0.81882022 0.81460674 0.82022472        nan        nan
        nan        nan        nan        nan        nan        nan
 0.80196629 0.79073034 0.80337079 0.79494382 0.77106742 0.76966292
 0.80337079 0.77808989        nan        nan        nan        nan
        nan        nan        nan        nan]
