# TITANIC SUBMISSION SET 04
Evaluation metric: Accuracy

The random forest performed better. I'm scaling back the model for this submission to only use gender, class, fare, and fare per person.

The score was still lower than submission set 03.

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string

from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

## DATA MUNGING

In [93]:
# Canonical title group -> every raw title that is folded into that group.
title_mapping = {
    'Mr': ['Mr', 'Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col'],
    'Mrs': ['Countess', 'Mme', 'Mrs'],
    'Miss': ['Mlle', 'Ms', 'Miss'],
    'Master': ['Master'],
    'Dr': ['Dr'],
}

def MatchTitles(title):
    """Return the key of ``title_mapping`` whose list contains ``title``.

    Groups are scanned in the mapping's iteration order and the first hit
    wins. When no group lists the title the result is ``None``.
    """
    return next(
        (group for group, members in title_mapping.items() if title in members),
        None,
    )

def MatchSubstrings(main_string, substrings):
    """Return the first entry of ``substrings`` that occurs in ``main_string``.

    Candidates are tried in the order given, so an earlier entry wins when
    several of them match. ``np.nan`` is returned when none occurs.
    """
    hits = (candidate for candidate in substrings
            if main_string.find(candidate) >= 0)
    return next(hits, np.nan)

def Munge(data):
    """Clean a raw passenger frame and add the engineered feature columns.

    The input is copied first, so ``data`` itself is not modified. On the copy:

    * column names are lower-cased;
    * missing ``fare`` values are set to 0;
    * ``family_size`` (``parch + sibsp``) and ``fare_per_person``
      (``fare / (family_size + 1)``) are added;
    * ``title`` is the first known title found as a substring of ``name`` and
      ``grouped_title`` is the group ``MatchTitles`` assigns to it;
    * ``impute_age`` is ``age`` with missing values replaced by the mean age
      of the passenger's title group;
    * ``grouped_age`` bins ``impute_age`` into child / adult / senior / aged;
    * ``sex``, ``title`` and ``grouped_age`` are label-encoded and stored as
      floats.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame providing (in any letter case) the columns ``name``, ``sex``,
        ``age``, ``sibsp``, ``parch`` and ``fare``.

    Returns
    -------
    pandas.DataFrame
        The munged copy.
    """
    df = data.copy()

    # lower case the column names
    df.columns = df.columns.str.lower()

    # missing values for fares (only 1 from testset)
    df.loc[df.fare.isnull(), 'fare'] = 0

    # add family size
    df['family_size'] = df.parch + df.sibsp
    df['fare_per_person'] = df.fare / (df.family_size + 1)

    # extract titles -- matching is by substring and the first hit wins,
    # so 'Mrs' has to be listed before 'Mr'
    titles = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
              'Dr', 'Ms', 'Mlle', 'Col', 'Capt', 'Mme', 'Countess',
              'Don', 'Jonkheer']
    df['title'] = df.name.map(lambda x: MatchSubstrings(x, titles))

    # group titles
    df['grouped_title'] = df.title.map(MatchTitles)

    # impute missing ages with the mean age of the passenger's title group;
    # rows are selected AND averaged by grouped_title so that e.g. a 'Don'
    # is filled from the whole 'Mr' group it belongs to
    df['impute_age'] = df.age
    age_missing = df.age.isnull()
    for group in title_mapping:
        in_group = df.grouped_title == group
        df.loc[age_missing & in_group, 'impute_age'] = df.loc[in_group, 'age'].mean()

    # binning age groups into categories
    bins = [0, 10, 30, 60, 200]
    names = ['child', 'adult', 'senior', 'aged']
    df['grouped_age'] = pd.cut(df.impute_age, bins, labels=names)

    # encoding categorical variables
    # NOTE: the encoder is re-fit on this frame for every column, so the
    # integer codes depend on the values present in the frame passed in
    le = preprocessing.LabelEncoder()
    for column in ('sex', 'title', 'grouped_age'):
        le.fit(df[column])
        df[column] = le.transform(df[column]).astype(float)

    return df

In [94]:
# load each csv (first column becomes the index) and munge it right away
df_train, df_test = (
    Munge(pd.read_csv(csv_path, index_col=0))
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

## GENDER, CLASS, FARE

In [96]:
# hold out 30% of the munged training rows for validation (fixed seed)
train, val = train_test_split(
    df_train,
    random_state=0,
    test_size=0.3,
)

In [97]:
# features to exclude
excluded_features = ['survived','cabin','ticket','name','embarked','sibsp','parch','title',
                     'grouped_title','age','impute_age','grouped_age', 'family_size']

# every remaining column is a model input; mask the column Index directly
# instead of going through the removed `.ix` indexer
features = df_train.columns[~df_train.columns.isin(excluded_features)]
features

Index(['pclass', 'sex', 'fare', 'fare_per_person'], dtype='object')

### Training set

In [98]:
# model inputs / target for the training split (`.loc` replaces the removed `.ix`)
train_x = train.loc[:, features]
train_y = train.survived

In [102]:
# logistic regression
lr = LogisticRegression()
lr.fit(train_x, train_y)
lr_pred_train = lr.predict(train_x)
print('lr train accuracy: {result}'.format(result=accuracy_score(train_y, lr_pred_train)))

# random forest
# min_samples_split=2 is the smallest node size that can actually be split;
# the previous value of 1 is rejected by current scikit-learn releases
rf = RandomForestClassifier(criterion='entropy', n_estimators=500,
                            max_depth=5, min_samples_split=2, min_samples_leaf=1,
                            random_state=123, n_jobs=3)
rf.fit(train_x, train_y)
rf_pred_train = rf.predict(train_x)
print('rf train accuracy: {result}'.format(result=accuracy_score(train_y, rf_pred_train)))

lr train accuracy: 0.7849117174959872
rf train accuracy: 0.8475120385232745


### Validation set

In [103]:
# model inputs / target for the validation split (`.loc` replaces the removed `.ix`)
val_x = val.loc[:, features]
val_y = val.survived

In [104]:
# predict the held-out rows with both fitted models
lr_pred_val = lr.predict(val_x)
rf_pred_val = rf.predict(val_x)

# report validation accuracy: logistic regression first, then random forest
print('lr validation accuracy: {result}'.format(
    result=accuracy_score(val_y, lr_pred_val)))
print('rf validation accuracy: {result}'.format(
    result=accuracy_score(val_y, rf_pred_val)))

lr validation accuracy: 0.7873134328358209
rf validation accuracy: 0.8134328358208955


### Submission

In [105]:
def SubmitCSV(data, filename, index=None):
    """Write predictions to ``filename`` as a csv with one 'Survived' column.

    Parameters
    ----------
    data : array-like
        Predicted labels, one per row of ``index``.
    filename : str
        Path of the csv file to write.
    index : pandas.Index or array-like, optional
        Row labels for the predictions. Defaults to ``df_test.index``.
    """
    if index is None:
        # default keeps the original row labelling taken from the test frame
        index = df_test.index
    # use the `data` argument -- not the notebook-global `submission` --
    # so the function writes whatever the caller actually passed in
    predictions = pd.Series(data, index=index, name='Survived')
    pd.DataFrame(predictions).to_csv(filename)

In [106]:
# predict the munged test set with the random forest and write the submission
# file (`.loc` replaces the removed `.ix` indexer)
submission = rf.predict(df_test.loc[:, features])
SubmitCSV(submission, '../submissions/submit06_randomforest.csv')