# TITANIC SUBMISSION SET 03
Evaluation metric: Accuracy

The random forest outperformed logistic regression on both the training and validation sets (validation accuracy 0.810 vs. 0.784).

In [3]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string

from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

## DATA MUNGING

In [35]:
# Grouped title -> raw titles that are folded into it.
title_mapping = {
    'Mr': ['Mr', 'Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col'],
    'Mrs': ['Countess', 'Mme', 'Mrs'],
    'Miss': ['Mlle', 'Ms', 'Miss'],
    'Master': ['Master'],
    'Dr': ['Dr'],
}


def MatchTitles(title):
    """Return the grouped title that ``title`` belongs to.

    Scans ``title_mapping`` in insertion order and returns the key of the
    first group whose list contains ``title``. Returns ``None`` when no
    group lists the title.
    """
    return next(
        (group for group, members in title_mapping.items() if title in members),
        None,
    )

def MatchSubstrings(main_string, substrings):
    """Return the first entry of ``substrings`` that occurs in ``main_string``.

    Candidates are tried in the order given, so callers must list longer
    alternatives before their prefixes (e.g. 'Mrs' before 'Mr').
    Returns ``np.nan`` when none of the candidates is found.
    """
    hits = (candidate for candidate in substrings
            if main_string.find(candidate) >= 0)
    return next(hits, np.nan)

def Munge(data):
    """Return a feature-engineered copy of a raw passenger data frame.

    The input frame is not modified. On the copy:

    * column names are lower-cased;
    * ``family_size`` is added as ``parch + sibsp + 1``;
    * missing ``fare`` values are replaced with 0;
    * indicator (dummy) columns are appended for ``sex``, ``pclass`` and
      ``embarked``, prefixed ``sex_``, ``pclass_`` and ``embarked_``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns parch, sibsp, fare, sex, pclass and
        embarked (in any letter case).

    Returns
    -------
    pandas.DataFrame
        The munged copy, with the original columns still present.
    """
    df = data.copy()

    # lower case the column names
    df.columns = df.columns.str.lower()

    # NOTE: title extraction (MatchSubstrings / MatchTitles), title-based age
    # imputation and age binning were experimented with here and are
    # disabled for this submission set.

    # add family size
    df['family_size'] = df['parch'] + df['sibsp'] + 1

    # missing values for fares (only 1 from testset); fillna replaces the
    # DataFrame.ix assignment, which no longer exists in pandas >= 1.0
    df['fare'] = df['fare'].fillna(0)

    # create dummies for the categorical variables
    for column in ('sex', 'pclass', 'embarked'):
        df = df.join(pd.get_dummies(df[column], prefix=column))

    return df

In [36]:
# load each CSV (first column used as the index) and munge it straight away
df_train = Munge(pd.read_csv('../data/train.csv', index_col=0))
df_test = Munge(pd.read_csv('../data/test.csv', index_col=0))

## MODELS

In [44]:
# train test split: hold out 30% of the munged training rows as a validation
# set; random_state=0 fixes the shuffle so the split is reproducible.
# NOTE(review): train_test_split is imported from sklearn.cross_validation at
# the top of this notebook; that module was removed in scikit-learn 0.20 and
# newer versions need `from sklearn.model_selection import train_test_split`.
train, val = train_test_split(df_train, test_size=0.3, random_state=0)

In [45]:
# features to exclude: the target, the raw columns, and engineered columns
# that are left out of the model for this submission set
excluded_features = ['survived','cabin','sex','ticket','name','embarked','pclass','sibsp','parch','title',
                     'grouped_title','age','impute_age','grouped_age', 'family_size','fare']

# keep every column label not listed above; the column Index is masked
# directly because DataFrame.ix was removed in pandas 1.0
features = df_train.columns[~df_train.columns.isin(excluded_features)]
features

Index(['sex_female', 'sex_male', 'pclass_1', 'pclass_2', 'pclass_3',
       'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

### Training set

In [46]:
# design matrix and target for the training split; label-based .loc replaces
# the DataFrame.ix indexer, which was removed in pandas 1.0
train_x = train.loc[:, features]
train_y = train.survived

In [47]:
# fit both classifiers on the training split (fit returns the estimator itself)
lr = LogisticRegression().fit(train_x, train_y)
rf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=500,
    random_state=0,
    n_jobs=3,
).fit(train_x, train_y)

# in-sample predictions
lr_pred_train = lr.predict(train_x)
rf_pred_train = rf.predict(train_x)

# report training accuracy: logistic regression first, then random forest
print('lr train accuracy: {result}'.format(result=accuracy_score(train_y, lr_pred_train)))
print('rf train accuracy: {result}'.format(result=accuracy_score(train_y, rf_pred_train)))

lr train accuracy: 0.7752808988764045
rf train accuracy: 0.812199036918138


### Validation set

In [48]:
# design matrix and target for the validation split; label-based .loc
# replaces the DataFrame.ix indexer, which was removed in pandas 1.0
val_x = val.loc[:, features]
val_y = val.survived

In [49]:
# predictions of the already-fitted models on the held-out validation split
lr_pred_val = lr.predict(val_x)
rf_pred_val = rf.predict(val_x)

# report validation accuracy: logistic regression first, then random forest
print('lr validation accuracy: {result}'.format(result=accuracy_score(val_y, lr_pred_val)))
print('rf validation accuracy: {result}'.format(result=accuracy_score(val_y, rf_pred_val)))

lr validation accuracy: 0.7835820895522388
rf validation accuracy: 0.8097014925373134


### Initial submission

In [50]:
def SubmitCSV(data, filename, index=None):
    """Write predictions to ``filename`` as a CSV with a ``Survived`` column.

    Parameters
    ----------
    data : array-like
        Predicted labels, one per entry of ``index``.
    filename : str
        Path of the CSV file to write.
    index : pandas.Index, optional
        Row labels for the predictions. Defaults to the index of the
        module-level ``df_test`` frame, looked up at call time.
    """
    if index is None:
        index = df_test.index
    # Build the column from the ``data`` argument; the previous version read
    # the global ``submission`` and silently ignored what the caller passed.
    predictions = pd.Series(data, index=index, name='Survived')
    pd.DataFrame(predictions).to_csv(filename)

In [52]:
# random forest predictions on the munged test set; label-based .loc replaces
# the DataFrame.ix indexer, which was removed in pandas 1.0
submission = rf.predict(df_test.loc[:, features])
SubmitCSV(submission, '../submissions/submit04_randomforest.csv')

# logistic regression predictions on the same test features
submission = lr.predict(df_test.loc[:, features])
SubmitCSV(submission, '../submissions/submit05_logisticreg.csv')