In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn import metrics

## Final Functions

Functions developed in the Exploratory Titanic book, updated for flexibility with training and test data.

In [2]:
def remove_nan(df):
    df['Embarked'] = df['Embarked'].fillna('S')
    for pclass in df['Pclass'].unique():
        df.loc[(df['Pclass'] == pclass) & (df['Age'].isnull()), 'Age'] = df.groupby('Pclass')['Age'].mean().loc[pclass]
    df.loc[df['Fare'].isnull(), 'Fare'] = df['Fare'].median()
    return df    

def remove_features(df):
    df['Sex'].replace({'male':0, 'female':1}, inplace = True)
    
    useless_features = ['Cabin', 'PassengerId', 'Ticket', 'Name']
    for item in useless_features:
        if item in df:
            df = df.drop(item, axis=1)
    return df
    
def one_hot_encode_embarked(df):
    encoded_embarked = pd.get_dummies(df['Embarked'])
    df = pd.concat([df, encoded_embarked], axis=1)
    df = df.drop('Embarked', axis = 1)
    return df

def flag_age(df):
    df['Infant'] = df['Age'] <= 7 
    df['Geriatric'] = df['Age'] >= 65
    return df

def extract_titles(df):
    if 'Name' in df:
        split_names = df['Name'].str.split(',')
        titles = split_names.apply(lambda x: x[1]).str.strip().str.split('.').apply(lambda x: x[0])
        df['Title'] = titles
        mens_titles = ['Jonkheer', 'Sir', 'Capt', 'Col', 'Don', 'Dr', 'Rev', 'Major']
        mrs_titles = ['Mme', 'Dona']
        miss_titles = ['Ms', 'Lady', 'Mlle', 'the Countess']
        
        for title in mens_titles:
            df.loc[df['Title'] == title, 'Title'] = 'Mr'
        
        for title in miss_titles :
            df.loc[df['Title'] == title, 'Title'] = 'Miss'
            
        for title in mrs_titles:
            df.loc[df['Title'] == title, 'Title'] = 'Mrs'
            
    encoded_titles = pd.get_dummies(df['Title'])
    df = pd.concat([df, encoded_titles], axis=1)
    df = df.drop(['Name', 'Title'], axis = 1)
    return df

def build_model():
    model = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression())
    ])

    gs = GridSearchCV(
        model,
        {'logreg__penalty': ['l1', 'l2'],
        'logreg__C': np.arange(0.01, 15, 0.1)},
        cv=5,
        n_jobs=4
    )
    return model, gs
def return_prediction(test):
    prediction  = gs.predict(test)

In [3]:
X = pd.read_csv('./train.csv')
Y = X.pop('Survived')
test = pd.read_csv('./test.csv')
PID = test.pop('PassengerId')

In [4]:
#Basic model
X = remove_features(one_hot_encode_embarked(remove_nan(X)))
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,3,0,22.0,1,0,7.25,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,0
2,3,1,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,3,0,35.0,0,0,8.05,0,0,1


In [5]:
X_train, X_test, y_train, y_test = train_test_split(*shuffle(X, Y), test_size=0.1)

In [6]:
model, gs = build_model()
gs.fit(X_train, y_train)
gs.best_params_

{'logreg__C': 0.11, 'logreg__penalty': 'l2'}

In [7]:
prediction = gs.predict(X_test)

print 'Accuracy: %f' % metrics.accuracy_score(prediction, y_test)
print 'Recall: %f' % metrics.recall_score(prediction, y_test)
print 'Precision: %f' % metrics.precision_score(prediction, y_test)
print 'ROC AUC: %f'   %metrics.roc_auc_score(prediction, y_test)

Accuracy: 0.877778
Recall: 0.818182
Precision: 0.843750
ROC AUC: 0.865231


## Flag age model   

Flag age method isn't likely to work due to huge class imbalance 95-5% for infant and 98-2% for Geriatric but will put it through to see. 

See Titanic Exploratory for more information

In [8]:
X = pd.read_csv('./train.csv')
Y = X.pop('Survived')

In [9]:
X = flag_age(one_hot_encode_embarked(remove_features(remove_nan(X))))
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S,Infant,Geriatric
0,3,0,22.0,1,0,7.25,0,0,1,False,False
1,1,1,38.0,1,0,71.2833,1,0,0,False,False
2,3,1,26.0,0,0,7.925,0,0,1,False,False
3,1,1,35.0,1,0,53.1,0,0,1,False,False
4,3,0,35.0,0,0,8.05,0,0,1,False,False


In [10]:
X_train, X_test, y_train, y_test = train_test_split(*shuffle(X, Y), test_size=0.1)
age_model, age_gs = build_model()
age_gs.fit(X_train, y_train)
age_gs.best_params_

{'logreg__C': 0.31000000000000005, 'logreg__penalty': 'l1'}

In [11]:
prediction = age_gs.predict(X_test)

print 'Accuracy: %f' % metrics.accuracy_score(prediction, y_test)
print 'Recall: %f' % metrics.recall_score(prediction, y_test)
print 'Precision: %f' % metrics.precision_score(prediction, y_test)
print 'ROC AUC: %f'   %metrics.roc_auc_score(prediction, y_test)

Accuracy: 0.822222
Recall: 0.742857
Precision: 0.787879
ROC AUC: 0.807792


# Title Model

In [12]:
X = pd.read_csv('./train.csv')
Y = X.pop('Survived')
test = pd.read_csv('./test.csv')


In [13]:
X = remove_features(extract_titles(one_hot_encode_embarked(remove_nan(X))))
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S,Master,Miss,Mr,Mrs
0,3,0,22.0,1,0,7.25,0,0,1,0,0,1,0
1,1,1,38.0,1,0,71.2833,1,0,0,0,0,0,1
2,3,1,26.0,0,0,7.925,0,0,1,0,1,0,0
3,1,1,35.0,1,0,53.1,0,0,1,0,0,0,1
4,3,0,35.0,0,0,8.05,0,0,1,0,0,1,0


In [14]:
X_train, X_test, y_train, y_test = train_test_split(*shuffle(X, Y), test_size=0.1)

In [15]:
title_model, title_gs = build_model()
title_gs.fit(X_train, y_train)
title_gs.best_params_

{'logreg__C': 0.31000000000000005, 'logreg__penalty': 'l1'}

In [16]:
prediction = title_gs.predict(X_test)

print 'Accuracy: %f' % metrics.accuracy_score(prediction, y_test)
print 'Recall: %f' % metrics.recall_score(prediction, y_test)
print 'Precision: %f' % metrics.precision_score(prediction, y_test)
print 'ROC AUC: %f'   %metrics.roc_auc_score(prediction, y_test)

Accuracy: 0.811111
Recall: 0.821429
Precision: 0.657143
ROC AUC: 0.813940


# Title and Age Model 

In [17]:
X = pd.read_csv('./train.csv')
Y = X.pop('Survived')
test = pd.read_csv('./test.csv')


In [18]:
X = remove_features(flag_age(extract_titles(one_hot_encode_embarked(remove_nan(X)))))
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S,Master,Miss,Mr,Mrs,Infant,Geriatric
0,3,0,22.0,1,0,7.25,0,0,1,0,0,1,0,False,False
1,1,1,38.0,1,0,71.2833,1,0,0,0,0,0,1,False,False
2,3,1,26.0,0,0,7.925,0,0,1,0,1,0,0,False,False
3,1,1,35.0,1,0,53.1,0,0,1,0,0,0,1,False,False
4,3,0,35.0,0,0,8.05,0,0,1,0,0,1,0,False,False


In [19]:
X_train, X_test, y_train, y_test = train_test_split(*shuffle(X, Y), test_size=0.1)
combined_model, combined_gs = build_model()
combined_gs.fit(X_train, y_train)
combined_gs.best_params_

{'logreg__C': 0.11, 'logreg__penalty': 'l1'}

In [20]:
prediction = combined_gs.predict(X_test)

print 'Accuracy: %f' % metrics.accuracy_score(prediction, y_test)
print 'Recall: %f' % metrics.recall_score(prediction, y_test)
print 'Precision: %f' % metrics.precision_score(prediction, y_test)
print 'ROC AUC: %f'   %metrics.roc_auc_score(prediction, y_test)

Accuracy: 0.855556
Recall: 0.939394
Precision: 0.738095
ROC AUC: 0.873206


# Tests with final Data

Instead of splitting the train data into two, to benchmark each model this will return CSVs to submit to Kaggle

## Initial Model

In [21]:
X = pd.read_csv('./train.csv')
Y = X.pop('Survived')
test = pd.read_csv('./test.csv')
PID = test.pop('PassengerId')

In [22]:
X = one_hot_encode_embarked(remove_features(remove_nan(X)))
test = one_hot_encode_embarked(remove_features(remove_nan(test)))

In [23]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,3,0,34.5,0,0,7.8292,0,1,0
1,3,1,47.0,1,0,7.0,0,0,1
2,2,0,62.0,0,0,9.6875,0,1,0
3,3,0,27.0,0,0,8.6625,0,0,1
4,3,1,22.0,1,1,12.2875,0,0,1


In [24]:
model, gs = build_model()
gs.fit(X, Y)
prediction = gs.predict(test)
pd.DataFrame({'PassengerId':PID, 'Survived': prediction}).set_index('PassengerId').to_csv('Basic_Predictions.csv')

## Age Model

In [25]:
X = pd.read_csv('./train.csv')
Y = X.pop('Survived')
test = pd.read_csv('./test.csv')
PID = test.pop('PassengerId')

In [26]:
X = flag_age(one_hot_encode_embarked(remove_features(remove_nan(X))))
test = flag_age(one_hot_encode_embarked(remove_features(remove_nan(test))))

In [27]:
age_model, age_gs = build_model()
age_gs.fit(X, Y)
prediction = age_gs.predict(test)
pd.DataFrame({'PassengerId':PID, 'Survived': prediction}).set_index('PassengerId').to_csv('Age_Predictions.csv')

## Title Model

In [28]:
X = pd.read_csv('./train.csv')
Y = X.pop('Survived')
test = pd.read_csv('./test.csv')
PID = test.pop('PassengerId')

In [29]:
X = remove_features(extract_titles(one_hot_encode_embarked(remove_nan(X))))
test = remove_features(extract_titles(one_hot_encode_embarked(remove_nan(test))))
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S,Master,Miss,Mr,Mrs
0,3,0,22.0,1,0,7.25,0,0,1,0,0,1,0
1,1,1,38.0,1,0,71.2833,1,0,0,0,0,0,1
2,3,1,26.0,0,0,7.925,0,0,1,0,1,0,0
3,1,1,35.0,1,0,53.1,0,0,1,0,0,0,1
4,3,0,35.0,0,0,8.05,0,0,1,0,0,1,0


In [30]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S,Master,Miss,Mr,Mrs
0,3,0,34.5,0,0,7.8292,0,1,0,0,0,1,0
1,3,1,47.0,1,0,7.0,0,0,1,0,0,0,1
2,2,0,62.0,0,0,9.6875,0,1,0,0,0,1,0
3,3,0,27.0,0,0,8.6625,0,0,1,0,0,1,0
4,3,1,22.0,1,1,12.2875,0,0,1,0,0,0,1


In [31]:
title_model, title_gs = build_model()
title_gs.fit(X, Y)
prediction = title_gs.predict(test)
pd.DataFrame({'PassengerId':PID, 'Survived': prediction}).set_index('PassengerId').to_csv('Title_Predictions.csv')

## Age and Title Model 

In [32]:
X = pd.read_csv('./train.csv')
Y = X.pop('Survived')
test = pd.read_csv('./test.csv')
PID = test.pop('PassengerId')

In [33]:
X = remove_features(flag_age(extract_titles(one_hot_encode_embarked(remove_nan(X)))))
test = remove_features(flag_age(extract_titles(one_hot_encode_embarked(remove_nan(test)))))
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S,Master,Miss,Mr,Mrs,Infant,Geriatric
0,3,0,22.00000,1,0,7.2500,0,0,1,0,0,1,0,False,False
1,1,1,38.00000,1,0,71.2833,1,0,0,0,0,0,1,False,False
2,3,1,26.00000,0,0,7.9250,0,0,1,0,1,0,0,False,False
3,1,1,35.00000,1,0,53.1000,0,0,1,0,0,0,1,False,False
4,3,0,35.00000,0,0,8.0500,0,0,1,0,0,1,0,False,False
5,3,0,25.14062,0,0,8.4583,0,1,0,0,0,1,0,False,False
6,1,0,54.00000,0,0,51.8625,0,0,1,0,0,1,0,False,False
7,3,0,2.00000,3,1,21.0750,0,0,1,1,0,0,0,True,False
8,3,1,27.00000,0,2,11.1333,0,0,1,0,0,0,1,False,False
9,2,1,14.00000,1,0,30.0708,1,0,0,0,0,0,1,False,False


In [34]:
combined_model, combined_gs = build_model()
combined_gs.fit(X, Y)
prediction = combined_gs.predict(test)
pd.DataFrame({'PassengerId':PID, 'Survived': prediction}).set_index('PassengerId').to_csv('Combined_Predictions.csv')

In [35]:
combined_gs.best_params_

{'logreg__C': 0.21000000000000002, 'logreg__penalty': 'l1'}