# Titanic competition quick entry creation

Notebook for creating quick competition entries for the Titanic Kaggle competition - this should be a concise way of training a model and outputting the CSV files without the analysis carried out in the first two notebooks.

This entry removes the independent variables which have little or no influence on the dependent variable (`Survived`).

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
%matplotlib inline

from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics

In [4]:
# Path string for the Titanic data directory.
# NOTE(review): this value is reassigned (with a trailing slash) in the next cell.
PATH = "data/titanic"
# List the cached intermediate files stored under tmp/.
!ls tmp

keep_cols.npy  titanic_procdfd_raw  titanic_raw_cats


In [5]:

# NOTE(review): PATH is not referenced by any other visible cell of this notebook.
PATH = "data/titanic/"
# Forwarded to proc_df below. Presumably categorical columns with at most this many
# levels are one-hot encoded instead of integer-coded — confirm against the fastai version in use.
max_n_cat = 5
# Load the cached raw training frame (feather format) from tmp/.
df_raw = pd.read_feather('tmp/titanic_raw_cats')
# proc_df returns the processed feature frame (df), the 'Survived' target (y) and
# nas, which is presumably the dict of fill values used for missing data — TODO confirm.
df, y, nas = proc_df(df_raw, 'Survived', max_n_cat=max_n_cat)

## Functions

In [None]:
def split_vals(a, n):
    """Split `a` positionally into its first `n` items and the remainder.

    Both parts are returned as independent copies (via `.copy()`), so modifying
    them does not touch `a`.

    Returns:
        A 2-tuple `(a[:n] copy, a[n:] copy)`.
    """
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail

# Hold out the last n_valid rows as the validation set; the rows before them form the training set.
# n_valid = 418 #same as Kaggle's test set size
n_valid = 100 #smaller validation set to provide more training data
n_trn = len(df)-n_valid
# Apply the same positional split to the raw frame, the processed features and the target.
raw_train, raw_valid = split_vals(df_raw, n_trn)
x_train, x_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

# Display the resulting shapes as a sanity check on the split.
raw_train.shape, x_train.shape, y_train.shape, x_valid.shape, y_valid.shape


In [None]:
def rmse(pred, actual):
    """Return the root-mean-squared error between `pred` and `actual`.

    Both arguments must support element-wise subtraction and the result must
    expose `.mean()` (e.g. numpy arrays or pandas Series).
    """
    squared_error = (pred - actual) ** 2
    return math.sqrt(squared_error.mean())

In [None]:
def print_score(m):
    """Print error and score figures for model `m` on the train/validation split.

    Reads the notebook-level x_train, y_train, x_valid and y_valid. Reports the
    RMSE and `m.score(...)` (labelled R-squared) for both sets together with
    their sizes, followed by the out-of-bag score when the model exposes
    `oob_score_`.
    """
    train_rmse = rmse(m.predict(x_train), y_train)
    valid_rmse = rmse(m.predict(x_valid), y_valid)
    train_r2 = m.score(x_train, y_train)
    valid_r2 = m.score(x_valid, y_valid)

    report = (
        f'rmse train: {train_rmse} Training set size: {len(y_train)}',
        f'\nrmse validation: {valid_rmse} Validation set size: {len(y_valid)}',
        f'\nRsquared train: {train_r2}',
        f'\nRSquared valid: {valid_r2}',
    )
    print(*report)

    # The OOB line is only available when the model exposes oob_score_.
    if hasattr(m, 'oob_score_'):
        print(f'OOB: {m.oob_score_}')

In [None]:
def custom_print_score(m, x_t, y_t):
    """Print error and score figures for model `m` on an arbitrary data set.

    Args:
        m: fitted model exposing `predict` and `score`.
        x_t: feature matrix to evaluate on.
        y_t: target values matching `x_t`.

    Prints the RMSE, the set size and `m.score(x_t, y_t)` (labelled R-squared),
    followed by the out-of-bag score when the model exposes `oob_score_`.
    """
    error = rmse(m.predict(x_t), y_t)
    r_squared = m.score(x_t, y_t)
    print(f'rmse train: {error} Total set size: {len(y_t)}',
          f'\nRsquared train: {r_squared}')

    # The OOB line is only available when the model exposes oob_score_.
    if hasattr(m, 'oob_score_'):
        print(f'OOB: {m.oob_score_}')

In [None]:
def display_all(df):
    """Display `df` with the pandas row and column display limits raised to 1000.

    The limits are changed only for the duration of the call.
    """
    widened_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*widened_limits):
        display(df)

In [None]:
def dectree_max_depth(tree):
    """Return the number of nodes on the longest root-to-leaf path of `tree`.

    `tree` must expose `children_left` and `children_right`, indexable by node
    id, with node 0 as the root. A node whose left and right child ids are
    equal is treated as a leaf, so a tree consisting of a single leaf yields 1.
    """
    left = tree.children_left
    right = tree.children_right

    deepest = 0
    # Explicit stack of (node id, level of that node), with the root at level 1.
    pending = [(0, 1)]
    while pending:
        node_id, level = pending.pop()
        deepest = max(deepest, level)
        if left[node_id] != right[node_id]:  # internal node: descend both ways
            pending.append((left[node_id], level + 1))
            pending.append((right[node_id], level + 1))
    return deepest

### Hyperparameters

In [None]:
# Sampling and forest hyperparameters for the first model (fit on the training split).
set_rf_samples(700) # Number of passengers (rows) available to each tree
# reset_rf_samples() # Set each tree to have access to all rows
n_estimators = 400 # Number of trees/estimators in the model
min_samples_leaf = 3 # Minimum number of samples remaining in each leaf node - their average is the prediction
max_features = 0.6 # For each decision, proportion of independent variables available


### Train the model

In [None]:
# Fit on the training split only and report training and validation set scores.
# random_state pins the forest's seed: without it every run builds different
# trees, so the scores and the feature importances derived from this model
# (which drive the column selection further down) change on each Restart & Run All.
m = RandomForestRegressor(
    n_estimators=n_estimators,
    min_samples_leaf=min_samples_leaf,
    max_features=max_features,
    n_jobs=-1,
    oob_score=True,
    random_state=42,
)
m.fit(x_train, y_train)
print_score(m)

In [None]:
# Compute the per-column feature importances of the fitted forest and show the first ten rows.
feature_importance = rf_feat_importance(m, df)
feature_importance.head(10)

## Decide which variables to keep and drop

In [None]:
# Columns removed from the kept feature set regardless of their measured importance.
to_drop = ['PassengerId'] # from our tests in L2

In [None]:
# Keep only the features whose importance exceeds 0.05.
to_keep = feature_importance[feature_importance.imp>0.05].cols
# Then remove the explicitly excluded columns. errors='ignore' matters because a
# column listed in to_drop may already have been filtered out by the importance
# threshold, in which case a plain drop raises KeyError. drop() returns a new
# frame, so df itself is left untouched.
df_keep = df[to_keep].drop(to_drop, axis=1, errors='ignore')

In [None]:
# Persist the names of the retained feature columns so they can be reloaded later.
np.save('tmp/keep_cols.npy', np.array(df_keep.columns))
# Display the retained column names.
df_keep.columns

In [None]:
# Inspect the first fitted tree of the forest.
tree=m.estimators_[0].tree_
# dectree_max_depth counts the nodes on the longest root-to-leaf path (a lone
# leaf gives 1), i.e. tree levels rather than splits, so the message says levels.
print(f'This tree has a maximum depth of {dectree_max_depth(tree)} levels')

# Train a new model on the combined training and validation data with a reduced number of columns

In [None]:
# Preview the reduced feature set that the final model is trained on.
df_keep.head()

In [None]:
# Sampling and forest hyperparameters for the final model (fit on all labelled rows).
set_rf_samples(700) # Number of passengers (rows) available to each tree
# reset_rf_samples() # Set each tree to have access to all rows
n_estimators = 1000 # Number of trees/estimators in the model
min_samples_leaf = 3 # Minimum number of samples remaining in each leaf node - their average is the prediction
max_features = 0.6 # For each decision, proportion of independent variables available

In [None]:
# Final model: fit on every labelled row (training + validation) using only the
# kept columns. random_state pins the forest's seed so that the exported
# submission can be regenerated identically on a fresh run.
m = RandomForestRegressor(
    n_estimators=n_estimators,
    min_samples_leaf=min_samples_leaf,
    max_features=max_features,
    n_jobs=-1,
    oob_score=True,
    random_state=42,
)
m.fit(df_keep, y)
custom_print_score(m, df_keep, y)

## Load the testset

In [None]:
# Load the competition test set from CSV and preview its first rows.
testset = pd.read_csv('data/titanic/test.csv')
testset.head()

In [None]:
# Encode the test set's string columns using the category levels of the training
# frame, so that identical values map to identical codes in train and test.
# train_cats(testset) would derive the levels from the test set alone, producing
# codes that do not line up with the ones the model was trained on.
# Assumes df_raw still carries the categorical dtypes it was saved with — confirm.
apply_cats(testset, df_raw)

In [None]:
# Process the test set with the fill values learned on the training set (nas),
# so that missing values are imputed the same way the model saw them during
# training instead of with statistics computed from the test set itself.
# A copy of nas is passed so the training dict cannot be modified by the call.
df_test, _ , nas_test = proc_df(testset, na_dict=dict(nas), max_n_cat=max_n_cat)

In [None]:
# Select the same feature columns, in the same order, that the final model was fit on.
df_test_keep = df_test[df_keep.columns]
# No separate drop is needed here: df_keep.columns already excludes the to_drop columns.
# df_test_keep.drop(to_drop, axis=1, inplace=True)
df_test_keep.head()

### Make predictions on the testset using our model

In [None]:
# Predict with the final regressor; the values are continuous and are rounded to 0/1 in a later cell.
predictions = m.predict(df_test_keep)

In [None]:
# average predicted survival rate in the test cases
# (reuses the predictions already computed instead of running the forest two more times)
predictions.mean()

In [None]:
# Round each continuous prediction to the nearest integer and store the results as plain Python ints.
binary_predictions = list(map(int, predictions.round()))

In [None]:
# Inspect the index of the test feature frame.
df_test_keep.index


## Export the prediction data to a .CSV file

In [None]:
# Build the submission frame. The passenger ids are taken straight from the
# loaded test file rather than reconstructed as index+892, which silently
# assumes the ids start at 892 and are contiguous and in file order.
# .values drops the Series index so the column is filled purely by position.
output = pd.DataFrame({"PassengerId": testset["PassengerId"].values, "Survived": binary_predictions})


In [None]:
# Preview the submission frame before writing it out.
output.head()

In [None]:
# Write the submission to the working directory; index=False keeps the row index out of the file.
output.to_csv('second_titanic_submission.csv', index=False)
print("Saved")