# TITANIC SUBMISSION SET 01
Evaluation metric: Accuracy

https://github.com/agconti/kaggle-titanic/blob/master/Titanic.ipynb

http://elenacuoco.altervista.org/blog/archives/1195

In [270]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

## DATA MUNGING

In [300]:
def Munge(data):
    """Return a cleaned copy of a Titanic passenger frame.

    The input frame is not modified. On the copy:

    * column names are lower-cased;
    * indicator (dummy) columns are appended for ``sex``, ``pclass`` and
      ``embarked`` -- the original columns are kept alongside them;
    * missing ``fare`` values are replaced with 0;
    * missing ``age`` values are replaced with the mean of the ages that
      are present in ``data`` itself.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns include (in any letter case) ``sex``,
        ``pclass``, ``embarked``, ``fare`` and ``age``.

    Returns
    -------
    pandas.DataFrame
        The munged copy.
    """
    df = data.copy()

    # lower case the column names
    df.columns = df.columns.str.lower()

    # append dummy columns for each categorical variable
    for column in ('sex', 'pclass', 'embarked'):
        df = df.join(pd.get_dummies(df[column], prefix=column))

    # missing values for fares
    # (fillna replaces DataFrame.ix, which was removed in pandas 1.0)
    df['fare'] = df['fare'].fillna(0)

    # impute missing ages as the average of the ages present in this frame
    df['age'] = df['age'].fillna(df['age'].mean())

    return df

In [301]:
# read each Kaggle CSV (its first column becomes the index) and munge it
df_train = pd.read_csv('data/train.csv', index_col=0).pipe(Munge)
df_test = pd.read_csv('data/test.csv', index_col=0).pipe(Munge)

## EDA
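- Women were far more likely to survive than men (74% vs. 19% in the training data)
- First-class passengers were the most likely to survive (63%, vs. 47% in second class and 24% in third)
- Passengers who embarked at C had a higher survival rate (55%) than those who embarked at Q (39%) or S (34%)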

In [113]:
# share of each survival outcome within each sex (every row sums to 1)
pd.crosstab(
    index=df_train['sex'],
    columns=df_train['survived'],
    normalize='index',
    dropna=False,
)

survived,0,1
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.257962,0.742038
male,0.811092,0.188908


In [211]:
# share of each survival outcome within each passenger class (every row sums to 1)
pd.crosstab(
    index=df_train['pclass'],
    columns=df_train['survived'],
    normalize='index',
    dropna=False,
)

survived,0,1
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.37037,0.62963
2,0.527174,0.472826
3,0.757637,0.242363


In [302]:
# share of each survival outcome within each embarkation code (every row sums to 1)
pd.crosstab(
    index=df_train['embarked'],
    columns=df_train['survived'],
    normalize='index',
    dropna=False,
)

survived,0,1
embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,0.446429,0.553571
Q,0.61039,0.38961
S,0.663043,0.336957


## BASE MODELS

In [303]:
# hold out 30% of the munged training rows for validation; the seed is fixed
# so the split is the same on every run
train, val = train_test_split(
    df_train,
    test_size=0.3,
    random_state=0,
)

In [330]:
# features to exclude: the target (survived), columns not used as model
# inputs, and the raw sex/pclass/embarked columns that Munge also encodes
# as dummy columns
excluded_features = ['survived','cabin','sex','ticket','name','embarked','pclass']
# mask the column Index directly; DataFrame.ix was removed in pandas 1.0
features = train.columns[~train.columns.isin(excluded_features)]
features

Index(['age', 'sibsp', 'parch', 'fare', 'sex_female', 'sex_male', 'pclass_1',
       'pclass_2', 'pclass_3', 'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

### Training set

In [331]:
# model inputs and target for the training split
# (.loc replaces DataFrame.ix, which was removed in pandas 1.0)
train_x = train.loc[:, features]
train_y = train['survived']

In [332]:
# Fit each baseline model on the training split and print its in-sample accuracy.

# logistic regression with scikit-learn's default settings
lr = LogisticRegression().fit(train_x, train_y)
lr_pred_train = lr.predict(train_x)
print(
    'lr train accuracy: {result}'.format(
        result=accuracy_score(train_y, lr_pred_train)
    )
)

# random forest: 500 entropy-criterion trees, fixed seed, 3 parallel jobs
rf = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    n_jobs=3,
    random_state=0,
).fit(train_x, train_y)
rf_pred_train = rf.predict(train_x)
print(
    'rf train accuracy: {result}'.format(
        result=accuracy_score(train_y, rf_pred_train)
    )
)

lr train accuracy: 0.8073836276083467
rf train accuracy: 0.9791332263242376


### Validation set

In [333]:
# model inputs and target for the validation split
# (.loc replaces DataFrame.ix, which was removed in pandas 1.0)
val_x = val.loc[:, features]
val_y = val['survived']

In [334]:
# Score the already-fitted models on the held-out validation split.

# logistic regression
lr_pred_val = lr.predict(val_x)
print(
    'lr validation accuracy: {result}'.format(
        result=accuracy_score(val_y, lr_pred_val)
    )
)

# random forest
rf_pred_val = rf.predict(val_x)
print(
    'rf validation accuracy: {result}'.format(
        result=accuracy_score(val_y, rf_pred_val)
    )
)

lr validation accuracy: 0.7985074626865671
rf validation accuracy: 0.8134328358208955


### Initial submission

In [335]:
def SubmitCSV(data, filename, index=None):
    """Write predictions to ``filename`` as a CSV with one ``Survived`` column.

    Parameters
    ----------
    data : array-like
        Predicted labels, one per entry of ``index``.
    filename : str
        Path of the CSV file to write.
    index : pandas.Index, optional
        Row labels written as the first CSV column. When omitted, the index
        of the notebook-level ``df_test`` frame is used.
    """
    if index is None:
        # default to the notebook-level test frame's index
        index = df_test.index
    # use the ``data`` argument rather than a notebook-level variable
    predictions = pd.Series(data, index=index, name='Survived')
    predictions.to_frame().to_csv(filename)

In [336]:
# predict on the munged Kaggle test set with the random forest and write the
# submission file (.loc replaces DataFrame.ix, which was removed in pandas 1.0)
submission = rf.predict(df_test.loc[:, features])
SubmitCSV(submission, 'submissions/randomforest02.csv')