In [9]:
# Imports

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [10]:
# Load data
#
# `train` is the raw training file (only read here for its label column);
# `X_train` / `X_test` are feature matrices read from the intermediary
# outputs folder.

train = pd.read_csv('inputs/train.csv')
X_train, X_test = (
    pd.read_csv('intermediary_outputs/X_train.csv'),
    pd.read_csv('intermediary_outputs/X_test.csv'),
)

# Target: the 'Survived' column of the raw training file.
# NOTE(review): assumes the rows of X_train are in the same order as the rows
# of train.csv -- confirm against the notebook that writes X_train.csv.

y = train.loc[:, 'Survived']

In [11]:
# Random forests model
#
# Base estimator handed to the grid search: fixed seed for repeatable fits,
# all CPU cores for tree building, and out-of-bag scoring switched on.

rf = RandomForestClassifier(
    random_state=188,
    n_jobs=-1,
    oob_score=True,
)

In [12]:
# Hyperparameter tuning
#
# Exhaustive grid search over the forest's hyperparameters with 3-fold
# cross-validation; verbose=3 logs the score and timing of every fit.

tuned_parameters = [
    {
        'n_estimators': [300, 500, 700, 900],
        'criterion': ['gini', 'entropy'],
        'min_samples_split': [5, 6, 7, 8, 9],
        'min_samples_leaf': [1, 2, 3, 4],
        'max_features': [0.20, 0.225, 0.25, 0.275, 0.30],
    }
]

# fit() returns the fitted search object, so `model` is the fitted GridSearchCV
model = GridSearchCV(
    estimator=rf,
    param_grid=tuned_parameters,
    cv=3,
    verbose=3,
).fit(X_train, y)

# Per-candidate mean and standard deviation of the cross-validated test score
cv_results = model.cv_results_
scores = cv_results['mean_test_score']
scores_std = cv_results['std_test_score']

print('Average scores:', scores.round(4))
print('Score standard deviations:', scores_std.round(3))
print('Best parameters:', model.best_params_)
print('Best score:', round(model.best_score_, 4))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END criterion=entropy, max_features=0.225, min_samples_leaf=1, min_samples_split=7, n_estimators=700;, score=0.835 total time=   0.8s
[CV 2/3] END criterion=entropy, max_features=0.225, min_samples_leaf=1, min_samples_split=7, n_estimators=700;, score=0.865 total time=   0.8s
[CV 3/3] END criterion=entropy, max_features=0.225, min_samples_leaf=1, min_samples_split=7, n_estimators=700;, score=0.845 total time=   0.9s
Average scores: [0.8485]
Score standard deviations: [0.013]
Best parameters: {'criterion': 'entropy', 'max_features': 0.225, 'min_samples_leaf': 1, 'min_samples_split': 7, 'n_estimators': 700}
Best score: 0.8485


In [22]:
# Feature importances
#
# Refit a single forest on the full training set and report which features
# it relies on most. The hyperparameters below are the ones printed as
# "Best parameters" by the grid-search cell.

best_rf = RandomForestClassifier(criterion='entropy',
                                 max_features=0.225,
                                 min_samples_leaf=1,
                                 min_samples_split=7,
                                 n_estimators=700,
                                 oob_score=True,
                                 n_jobs=-1,
                                 # Same seed as `rf`, so the importances below
                                 # are reproducible on Restart & Run All.
                                 random_state=188)

best_rf.fit(X_train, y)

# Keep the raw (unrounded) importances: rounding before sorting creates many
# ties, which makes both the ordering and the top-15 cutoff arbitrary.
feature_importances = pd.Series(best_rf.feature_importances_, index=X_train.columns)

print('FEATURE IMPORTANCES (TOP 15)')
# Rank on the exact values, then round only for display.
feature_importances.sort_values(ascending=False).head(15).round(2)

FEATURE IMPORTANCES (TOP 15)


Fare                     0.13
Age                      0.11
Mr                       0.11
NameLength               0.11
Female                   0.11
TicketLength             0.05
Miss                     0.03
Mrs                      0.03
CabinMissing             0.03
FamilySizeSmall          0.03
FamilySizeBig            0.03
TicketFirstCharacter1    0.02
FirstClass               0.02
SecondClass              0.02
TicketFirstCharacter3    0.02
dtype: float64

In [25]:
# Make predictions
#
# `model` is the fitted grid search, so predict() uses the estimator it refit
# with the best parameters found.

predictions = model.predict(X_test)

# PassengerId is a running number starting at 892.
# NOTE(review): 892 is presumably the first PassengerId of the test set and
# this assumes X_test keeps the test file's row order -- confirm.
submission = pd.DataFrame({
    'PassengerId': range(892, 892 + len(predictions)),
    'Survived': predictions,
})
submission.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0


In [26]:
# Export to csv
#
# Write the two submission columns only; the DataFrame index is left out.

submission_path = 'outputs/submission_rf.csv'
submission.to_csv(submission_path, index=False)