# Titanic competition quick entry creation

Notebook for creating quick competition entries for the Titanic kaggle competition - this should be a concise way of training a model and outputting the csv files without the analysis carried out in the first two notebooks. 

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
%matplotlib inline

from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics

In [5]:
PATH = "data/titanic"
!ls tmp

keep_cols.npy  titanic_procdfd_raw  titanic_raw_cats


In [86]:

PATH = "data/titanic/"
max_n_cat = 5
df_raw = pd.read_feather('tmp/titanic_raw_cats')
df, y, nas = proc_df(df_raw, 'Survived', max_n_cat=max_n_cat)

## Functions

In [87]:
def split_vals(a,n): 
    return a[:n].copy(), a[n:].copy()

# n_valid = 418 #same as Kaggle's test set size
n_valid = 100 #smaller validation set to provide more training data
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
x_train, x_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

raw_train.shape, x_train.shape, y_train.shape, x_valid.shape, y_valid.shape


((791, 13), (791, 18), (791,), (100, 18), (100,))

In [88]:
def rmse(pred,actual): return math.sqrt(((pred-actual)**2).mean())

In [89]:
def print_score(m):
    result = [f'rmse train: {rmse(m.predict(x_train), y_train)} Training set size: {len(y_train)}', 
              f'\nrmse validation: {rmse(m.predict(x_valid), y_valid)} Validation set size: {len(y_valid)}',
              f'\nRsquared train: {m.score(x_train, y_train)}',
              f'\nRSquared valid: {m.score(x_valid, y_valid)}']
    print(result[0], result[1], result[2], result[3])
    if hasattr(m, 'oob_score_'):
        result.append(f'OOB: {m.oob_score_}')
        print(result[4])

In [90]:
def custom_print_score(m, x_t, y_t):
    result = [f'rmse train: {rmse(m.predict(x_t), y_t)} Total set size: {len(y_t)}', 
              f'\nRsquared train: {m.score(x_t, y_t)}']
    print(result[0], result[1])
    if hasattr(m, 'oob_score_'):
        result.append(f'OOB: {m.oob_score_}')
        print(result[2])

In [91]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, 
                           "display.max_columns", 1000):
        display(df)

In [112]:
def dectree_max_depth(tree):
    children_left = tree.children_left
    children_right = tree.children_right
    

    def walk(node_id):
        if (children_left[node_id] != children_right[node_id]):
            left_max = 1 + walk(children_left[node_id])
            right_max = 1 + walk(children_right[node_id])
            return max(left_max, right_max)
        else: # leaf
            return 1

    root_node_id = 0
    return walk(root_node_id)

### Hyperparameters

In [110]:
set_rf_samples(700) # Number of passengers available to each tree 
# reset_rf_samples() # Set each tree to have access to all rows
n_estimators = 400 # Number of trees/estimators in model
min_samples_leaf = 3 # number remaining in each leaf node - average taken
max_features = 0.6 # For each decision , proportion of independent variables available.


### Train the model

In [111]:
#Training set and validation set scores
m = RandomForestRegressor(n_estimators=n_estimators, min_samples_leaf=min_samples_leaf, max_features=max_features, n_jobs=-1, oob_score=True)
m.fit(x_train, y_train)
print_score(m)

rmse train: 0.23404175080580836 Training set size: 791 
rmse validation: 0.32352214865069834 Validation set size: 100 
Rsquared train: 0.7690720615614989 
RSquared valid: 0.5457179658525844
OOB: 0.47033352012924856


In [97]:
feature_importance = rf_feat_importance(m,df);
feature_importance[:10]

Unnamed: 0,cols,imp
12,Sex_male,0.180654
11,Sex_female,0.172527
7,Ticket,0.109672
2,Name,0.093342
8,Fare,0.087645
0,PassengerId,0.076785
4,Age,0.076701
1,Pclass,0.076513
9,Cabin,0.044269
3,Married,0.027266


## Decide which variables to keep and drop

In [103]:
to_drop = ['PassengerId'] # from our tests in L2

In [104]:
to_keep = feature_importance[feature_importance.imp>0.05].cols
df_keep = df[to_keep].copy()
df_keep.drop(to_drop, axis=1, inplace=True)

In [109]:
np.save('tmp/keep_cols.npy', np.array(df_keep.columns))
df_keep.columns

Index(['Sex_male', 'Sex_female', 'Ticket', 'Name', 'Fare', 'Age', 'Pclass'], dtype='object')

In [117]:
tree=m.estimators_[0].tree_
print(f'This tree has {dectree_max_depth(tree)} splits')

This tree has 18 splits


# Train a new model on the combined set of training and validation data with reduced number of columns. 

In [176]:
df_keep.head()

Unnamed: 0,Sex_male,Sex_female,Ticket,Name,Fare,Age,Pclass
0,1,0,524,109,7.25,22.0,3
1,0,1,597,191,71.2833,38.0,1
2,0,1,670,354,7.925,26.0,3
3,0,1,50,273,53.1,35.0,1
4,1,0,473,16,8.05,35.0,3


In [178]:
m = RandomForestRegressor(n_estimators=n_estimators, min_samples_leaf=min_samples_leaf, max_features=max_features, n_jobs=-1, oob_score=True)
m.fit(df_keep, y)
custom_print_score(m, df_keep, y)

rmse train: 0.24797897976170488 Total set size: 891 
Rsquared train: 0.7399920091759133
OOB: 0.4736324233284127


## Load the testset

In [179]:
testset = pd.read_csv('data/titanic/test.csv')
# testset.head()

In [180]:
train_cats(testset)

In [181]:
df_test, _ , nas_test = proc_df(testset, max_n_cat=max_n_cat)

In [183]:
df_test_keep = df_test[df_keep.columns]
# df_test_keep.drop(to_drop, axis=1, inplace=True)
df_test_keep.head()

Unnamed: 0,Sex_male,Sex_female,Ticket,Name,Fare,Age,Pclass
0,1,0,153,207,7.8292,34.5,3
1,0,1,222,404,7.0,47.0,3
2,1,0,74,270,9.6875,62.0,2
3,1,0,148,409,8.6625,27.0,3
4,0,1,139,179,12.2875,22.0,3


### Make predictions on the testset using our model

In [185]:
predictions = m.predict(df_test_keep)

In [191]:
# average predicted survival rate in the test cases
m.predict(df_test_keep).sum()/len(m.predict(df_test_keep))

0.43927201898314344

In [195]:
binary_predictions = [int(x) for x in (predictions.round())]

In [203]:
df_test_keep.index


RangeIndex(start=0, stop=418, step=1)

## Export the prediction data to a .CSV file

In [206]:
output = pd.DataFrame({"PassengerId" :df_test_keep.index+1, "Survived": binary_predictions})


In [208]:
output.head(3)

Unnamed: 0,PassengerId,Survived
0,1,0
1,2,1
2,3,0


In [209]:
output.to_csv('second_titanic_submission.csv', index=False)
print("Saved")

Saved
