## 4 - Final Model + Testing

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

np.random.seed(12)

In [2]:
df = pd.read_csv('data.csv')

y = df.Survived
X = df.drop(['Survived'], axis=1)

# Check that our columns are correctly updated
assert 'Survived' in y.name
assert 'Survived' not in X.columns

X = X.drop(['PassengerId', 'Name', 'Ticket'], axis=1)

In [3]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=10)

### Feature Engineering

In [4]:
# Cabin
x_train = x_train.drop(['Cabin'], axis=1)
x_test = x_test.drop(['Cabin'], axis=1)

# Embarked
x_train["Embarked"] = x_train["Embarked"].fillna("S")
x_test["Embarked"] = x_test["Embarked"].fillna("S")

pd.get_dummies(x_train['Embarked'], prefix='Embarked').head(10)

x_train = pd.concat([x_train, pd.get_dummies(x_train['Embarked'], prefix='Embarked')], axis=1)
x_test = pd.concat([x_test, pd.get_dummies(x_test['Embarked'], prefix='Embarked')], axis=1)

x_train = x_train.drop(['Embarked'], axis=1)
x_test = x_test.drop(['Embarked'], axis=1)

# Sex
x_train['Sex'] = x_train['Sex'].map({'female':0, 'male':1})
x_test['Sex'] = x_test['Sex'].map({'female':0, 'male':1})

# Age
a = [ max(0, np.random.normal(x_train.Age.mean(), x_train.Age.std())) for n in range(0, len(X)) ]
x_train.loc[:, 'age_filler'] = pd.Series(a)
x_train.loc[x_train.Age.isnull(), 'Age'] = x_train.loc[x_train.Age.isnull(), 'age_filler']

a = [ max(0, np.random.normal(x_test.Age.mean(), x_test.Age.std())) for n in range(0, len(X)) ]
x_test.loc[:, 'age_filler'] = pd.Series(a)
x_test.loc[x_test.Age.isnull(), 'Age'] = x_test.loc[x_test.Age.isnull(), 'age_filler']

x_train = x_train.drop(['age_filler'], axis=1)
x_test = x_test.drop(['age_filler'], axis=1)

# Family
x_train.loc[:, 'Family'] =  x_train["Parch"] + x_train["SibSp"]
x_train.loc[x_train['Family'] > 0, 'Family'] = 1
x_train.loc[x_train['Family'] == 0, 'Family'] = 0

x_test.loc[:, 'Family'] =  x_test["Parch"] + x_test["SibSp"]
x_test.loc[x_test['Family'] > 0, 'Family'] = 1
x_test.loc[x_test['Family'] == 0, 'Family'] = 0

x_train = x_train.drop(['SibSp'], axis=1)
x_test = x_test.drop(['SibSp'], axis=1)

x_train = x_train.drop(['Parch'], axis=1)
x_test = x_test.drop(['Parch'], axis=1)

### Final Model

Since we have specified our final model, now we can use the test data. Note the validation and training data should be used together at this point to train our model. More training data is always better.

In [5]:
from sklearn import tree, ensemble

model = ensemble.RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=10)

model = model.fit(x_train, y_train)

In [6]:
model.score(x_test, y_test)

0.87777777777777777

### Future Data

What model should we use if we have more data coming in the future?

Since we have selected our final model, we should train our model on the entire dataset available.

### Feature importances

In [7]:
pd.DataFrame(list(model.feature_importances_), index=x_train.columns.tolist(), columns=['importance']).sort_values('importance', ascending=False)

Unnamed: 0,importance
Sex,0.443578
Fare,0.191929
Pclass,0.151541
Age,0.115865
Family,0.03398
Embarked_S,0.028059
Embarked_C,0.023824
Embarked_Q,0.011225
