# TITANIC SUBMISSION SET 02
Evaluation metric: Accuracy

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string

from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

## DATA MUNGING

In [2]:
# Coarse title group -> raw titles (as extracted from passenger names) that fold into it.
title_mapping = {
    'Mr': ['Mr', 'Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col'],
    'Mrs': ['Countess', 'Mme', 'Mrs'],
    'Miss': ['Mlle', 'Ms', 'Miss'],
    'Master': ['Master'],
    'Dr': ['Dr'],
}


def MatchTitles(title):
    """Return the `title_mapping` group that lists `title`, or None if no group does."""
    matching_groups = (
        group for group, members in title_mapping.items() if title in members
    )
    return next(matching_groups, None)

def MatchSubstrings(main_string, substrings):
    """Return the first entry of `substrings` found inside `main_string`.

    Candidates are tried in the order given, so earlier entries win when
    several match. Returns np.nan when none of them occurs.
    """
    hits = (
        candidate for candidate in substrings
        if main_string.find(candidate) >= 0
    )
    return next(hits, np.nan)

def Munge(data):
    """Return a feature-engineered copy of a raw passenger frame.

    The input is not modified. Column names are lower-cased, so the frame
    must provide (case-insensitively) the columns name, parch, sibsp, fare,
    age, sex, pclass and embarked.

    Columns added:
      title          raw title found in `name` (np.nan when none matches)
      grouped_title  coarse title group from MatchTitles (None when unmatched)
      family_size    parch + sibsp
      impute_age     age, with gaps filled by the mean known age of the
                     passenger's grouped title
      grouped_age    impute_age binned into child / adult / senior / aged
      sex_*, pclass_*, embarked_*, title_*, age_*  dummy columns
    Missing fares are replaced by 0.
    """
    df = data.copy()

    # lower case the column names so they can be used as attributes below
    df.columns = df.columns.str.lower()

    # extract titles; the first entry that occurs in the name wins, so 'Mrs'
    # must be listed before 'Mr'
    # NOTE(review): this is plain substring matching, so a short title such as
    # 'Don' or 'Col' also matches any longer word that contains it -- confirm
    # this is acceptable for the names in the data.
    titles = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
              'Dr', 'Ms', 'Mlle', 'Col', 'Capt', 'Mme', 'Countess',
              'Don', 'Jonkheer']
    df['title'] = df.name.map(lambda x: MatchSubstrings(x, titles))

    # group titles
    df['grouped_title'] = df.title.map(MatchTitles)

    # add family size
    df['family_size'] = df.parch + df.sibsp

    # missing values for fares (only 1 from testset)
    df.loc[df.fare.isnull(), 'fare'] = 0

    # impute missing ages with the mean known age of the grouped title.
    # Keying on grouped_title (not the raw title) also covers passengers whose
    # raw title is one of the rarer ones ('Ms', 'Rev', 'Col', ...).
    df['impute_age'] = df.age
    age_missing = df.age.isnull()
    for group in df.grouped_title.dropna().unique():
        in_group = df.grouped_title == group
        known_ages = df.loc[in_group, 'age'].dropna()
        if not known_ages.empty:
            df.loc[age_missing & in_group, 'impute_age'] = known_ages.mean()

    # binning age groups into categories (intervals are right-inclusive)
    bins = [0, 10, 30, 60, 200]
    names = ['child', 'adult', 'senior', 'aged']
    df['grouped_age'] = pd.cut(df.impute_age, bins, labels=names)

    # create dummies for every categorical variable: (source column, prefix)
    dummy_specs = [
        ('sex', 'sex'),
        ('pclass', 'pclass'),
        ('embarked', 'embarked'),
        ('grouped_title', 'title'),
        ('grouped_age', 'age'),
    ]
    for column, prefix in dummy_specs:
        df = df.join(pd.get_dummies(df[column], prefix=prefix))

    # lower case the column names again before returning
    # (dummy columns inherit the case of their category values)
    df.columns = df.columns.str.lower()

    return df

In [3]:
# load each raw CSV (first column becomes the index) and run it straight
# through Munge to get the model-ready frames
df_train = Munge(pd.read_csv('../data/train.csv', index_col=0))
df_test = Munge(pd.read_csv('../data/test.csv', index_col=0))

## EDA
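- Females were far more likely to survive than males (about 74% vs. 19%)
- First-class passengers were the most likely to survive (about 63%, vs. 47% in second class and 24% in third class)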

In [4]:
# share of each outcome within every sex (each row sums to 1)
pd.crosstab(
    index=df_train['sex'],
    columns=df_train['survived'],
    normalize='index',
    dropna=False,
)

survived,0,1
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.257962,0.742038
male,0.811092,0.188908


In [5]:
# share of each outcome within every passenger class (each row sums to 1)
pd.crosstab(
    index=df_train['pclass'],
    columns=df_train['survived'],
    normalize='index',
    dropna=False,
)

survived,0,1
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.37037,0.62963
2,0.527174,0.472826
3,0.757637,0.242363


In [6]:
# share of each outcome within every embarkation port (each row sums to 1)
pd.crosstab(
    index=df_train['embarked'],
    columns=df_train['survived'],
    normalize='index',
    dropna=False,
)

survived,0,1
embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,0.446429,0.553571
Q,0.61039,0.38961
S,0.663043,0.336957


## MODELS

In [7]:
# preview only the first rows instead of rendering the whole frame
df_train.head(10)

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,...,embarked_s,title_dr,title_master,title_miss,title_mr,title_mrs,age_child,age_adult,age_senior,age_aged
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [8]:
# hold out 30% of the labelled passengers for validation;
# the fixed seed keeps the split repeatable across runs
train, val = train_test_split(
    df_train,
    test_size=0.3,
    random_state=0,
)

In [9]:
# features to exclude: the target, raw text columns, and every column that
# is already represented by dummy variables or by family_size
excluded_features = ['survived', 'cabin', 'sex', 'ticket', 'name', 'embarked', 'pclass', 'sibsp', 'parch', 'title',
                     'grouped_title', 'age', 'impute_age', 'grouped_age']

# every remaining column is a model input; masking the column Index directly
# avoids the `.ix` indexer, which no longer exists in current pandas
features = df_train.columns[~df_train.columns.isin(excluded_features)]
features

Index(['fare', 'family_size', 'sex_female', 'sex_male', 'pclass_1', 'pclass_2',
       'pclass_3', 'embarked_c', 'embarked_q', 'embarked_s', 'title_dr',
       'title_master', 'title_miss', 'title_mr', 'title_mrs', 'age_child',
       'age_adult', 'age_senior', 'age_aged'],
      dtype='object')

### Training set

In [10]:
# label-based selection with .loc (`.ix` no longer exists in current pandas)
train_x = train.loc[:, features]
train_y = train.survived

In [11]:
# logistic regression with default settings; fit() returns the fitted estimator
lr = LogisticRegression().fit(train_x, train_y)
lr_pred_train = lr.predict(train_x)
lr_train_accuracy = accuracy_score(train_y, lr_pred_train)
print('lr train accuracy: {result}'.format(result=lr_train_accuracy))

# random forest: 500 entropy-split trees, seeded for repeatability
rf = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    n_jobs=3,
    random_state=0,
).fit(train_x, train_y)
rf_pred_train = rf.predict(train_x)
rf_train_accuracy = accuracy_score(train_y, rf_pred_train)
print('rf train accuracy: {result}'.format(result=rf_train_accuracy))

lr train accuracy: 0.8346709470304976
rf train accuracy: 0.9502407704654896


### Validation set

In [12]:
# label-based selection with .loc (`.ix` no longer exists in current pandas)
val_x = val.loc[:, features]
val_y = val.survived

In [13]:
# logistic regression on the held-out rows
lr_pred_val = lr.predict(val_x)
lr_val_accuracy = accuracy_score(val_y, lr_pred_val)
print('lr validation accuracy: {result}'.format(result=lr_val_accuracy))

# random forest on the held-out rows
rf_pred_val = rf.predict(val_x)
rf_val_accuracy = accuracy_score(val_y, rf_pred_val)
print('rf validation accuracy: {result}'.format(result=rf_val_accuracy))

lr validation accuracy: 0.8246268656716418
rf validation accuracy: 0.8208955223880597


### Initial submission

In [14]:
def SubmitCSV(data, filename, index=None):
    """Write predictions to `filename` as a CSV with a single 'Survived' column.

    Parameters
    ----------
    data : array-like
        Predicted labels, one per row of `index`.
    filename : str
        Destination path of the CSV file.
    index : pandas.Index, optional
        Row labels written as the first CSV column. Defaults to the index of
        the notebook-level `df_test` frame.
    """
    if index is None:
        index = df_test.index
    # Build the column from the `data` argument itself rather than from
    # whatever a global named `submission` happens to hold.
    predictions = pd.Series(data, index=index, name='Survived')
    pd.DataFrame(predictions).to_csv(filename)

In [15]:
# random forest predictions for the unlabelled test passengers
# (.loc replaces the `.ix` indexer, which no longer exists in current pandas)
submission = rf.predict(df_test.loc[:, features])
SubmitCSV(submission, '../submissions/submit03_randomforest.csv')

# logistic regression predictions for the same passengers
submission = lr.predict(df_test.loc[:, features])
SubmitCSV(submission, '../submissions/submit04_logisticreg.csv')