# TITANIC SUBMISSION SET 05
Evaluation metric: Accuracy

Much of this set was taken from Elena Cuco's [blog](http://elenacuoco.altervista.org/blog/archives/1195). It has an excellent example of data munging and cross validation with grid search to fit a random forest. It helped me to move up to the top 25% with an accuracy of 0.79904.

In [145]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string

from patsy import dmatrices
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## DATA MUNGING

In [114]:
title_mapping = {
    'Mr':['Mr','Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col'],
    'Mrs':['Countess', 'Mme','Mrs'],
    'Miss':['Mlle', 'Ms','Miss'],
    'Master':['Master']
}

def MatchTitles(x):    
    for i in title_mapping:
        if x.title in title_mapping[i]:
            return i
        elif x.title == 'Dr':
            if x.sex == 'male':
                return 'Mr'
            else:
                return 'Mrs'

def MatchSubstrings(main_string, substrings):
    for substring in substrings:
        if main_string.find(substring) != -1:
            return substring
    return np.nan

def Munge(data):
    df = data.copy()
    
    # lower case the column names
    df.columns = df.columns.str.lower()
    
    # missing values for fares (only 1 from testset)
    df.ix[df.fare.isnull(), 'fare'] = 0
    
    # add family size
    df['family_size'] = df.parch + df.sibsp
    df['fare_per_person'] = df.fare / (df.family_size+1)

    # extract titles
    titles = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
              'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
              'Don', 'Jonkheer']
    df['title'] = df.name.map(lambda x: MatchSubstrings(x, titles))
    
    # group titles
    df['grouped_title'] = df.apply(MatchTitles, axis=1)
    
    # impute missing ages with the mean based on title
    df['impute_age'] = df.age
    df.ix[(df.age.isnull()) & (df.grouped_title=='Mr'), 'impute_age'] = np.average(df[df.grouped_title=='Mr'].age.dropna())
    df.ix[(df.age.isnull()) & (df.grouped_title=='Mrs'), 'impute_age'] = np.average(df[df.grouped_title=='Mrs'].age.dropna())
    df.ix[(df.age.isnull()) & (df.grouped_title=='Miss'), 'impute_age'] = np.average(df[df.grouped_title=='Miss'].age.dropna())
    df.ix[(df.age.isnull()) & (df.grouped_title=='Master'), 'impute_age'] = np.average(df[df.grouped_title=='Master'].age.dropna())
    
    # binning age groups into categories
    bins = [0,10,30,60,200]
    names = ['child','adult','senior','aged']
    df['grouped_age'] = pd.cut(df.impute_age, bins, labels=names)

    # encoding categorical variables
    le = preprocessing.LabelEncoder()
    
    le.fit(df.sex)
    x_sex = le.transform(df.sex)
    df.sex = x_sex.astype(np.float)
    
    le.fit(df.grouped_title)
    x_grouped_title = le.transform(df.grouped_title)
    df.grouped_title = x_grouped_title.astype(np.float)
    
    le.fit(df.grouped_age)
    x_age = le.transform(df.grouped_age)
    df.grouped_age = x_age.astype(np.float)
        
    return df

In [143]:
# load data
df_train = pd.read_csv('../data/train.csv', index_col=0)
df_test = pd.read_csv('../data/test.csv', index_col=0)

# munge data
df_train = Munge(df_train)
df_test = Munge(df_test)

## GENDER, CLASS, FARE, FAMILY SIZE, FARE PER PERSON, TITLE, AGE

In [149]:
formula_ml='survived~pclass+C(grouped_title)+sex+C(grouped_age)+fare_per_person+fare+family_size'
train_y, train_x = dmatrices(formula_ml, data=df_train, return_type='dataframe')
train_y = np.asarray(train_y).ravel()

In [150]:
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2,random_state=123)

### Training set

In [151]:
# random forest
rf = RandomForestClassifier(criterion='entropy', n_estimators=500, 
                            max_depth=5, min_samples_split=1, min_samples_leaf=1,
                            max_features='auto', random_state=123, n_jobs=1)

# params
param_grid = dict( )

##classify pipeline
pipeline = Pipeline([('rf',rf)])

# grid search
grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=3, scoring='accuracy', 
                           cv = StratifiedShuffleSplit(train_y, n_iter=10, test_size=0.2, 
                                                       train_size=None, random_state=123)
                          ).fit(train_x, train_y)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV]  ................................................................
[CV] ....................................... , score=0.860140 -   0.6s
[CV]  ................................................................
[CV] ....................................... , score=0.811189 -   0.6s
[CV]  ................................................................
[CV] ....................................... , score=0.769231 -   0.6s
[CV]  ................................................................
[CV] ....................................... , score=0.853147 -   0.6s
[CV]  ................................................................
[CV] ....................................... , score=0.846154 -   0.6s
[CV]  ................................................................
[CV] ....................................... , score=0.783217 -   0.6s
[CV]  ................................................................
[CV] ...........

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    6.1s finished


In [152]:
print("Best score: %0.3f" % grid_search.best_score_)
print ('on all train set')
scores = cross_val_score(grid_search.best_estimator_, train_x, train_y, cv=3, scoring='accuracy')
print(scores.mean(),scores)

Best score: 0.822
on all train set
0.830053540404 [ 0.83193277  0.80168776  0.85654008]


### Validation set

In [154]:
print ('on test set')
scores = cross_val_score(grid_search.best_estimator_, val_x, val_y, cv=3, scoring='accuracy')
print(scores.mean(),scores)

on test set
0.838323917137 [ 0.83333333  0.78333333  0.89830508]


In [156]:
rf_final = grid_search.best_estimator_

### Submission

In [170]:
def SubmitCSV(data, filename):
    data = pd.Series(submission, index=df_test.index, name='Survived')
    pd.DataFrame(data).to_csv(filename)

In [167]:
# prepare the data
df_test['survived'] = [0 for x in range(len(df_test))]
test_pred, test_x = dmatrices(formula_ml, data=df_test, return_type='dataframe')
test_pred = np.asarray(test_pred).ravel()

# run the prediction
submission = rf_final.predict(test_x).astype(int)

In [171]:
SubmitCSV(submission, '../submissions/submit07_randomforest.csv')