<a href="https://colab.research.google.com/github/Jiablero/notebooks/blob/master/titanic_kaggle_season3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

In [0]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [0]:
# from season 1

# data preparation
def prepare_titanic_data(data):
  X = data.drop(['PassengerId', 'Name', 'Fare', 'Ticket', 'Embarked', 'Cabin'], axis = 1)
  if 'Survived' in data.columns : 
    X = X.drop('Survived', axis = 1) # в тестовых данных нет Survived
  X = X.fillna({'Age': X.Age.median()})
  X = pd.get_dummies(X).drop('Sex_male', axis=1)
  return(X)

# grid search universal
def grid_search(clf, params, X, y):
  gs = GridSearchCV(clf, params, n_jobs=-1)
  gs.fit(X, y)
  # best_params_ для отладки
  return {'best_params_': gs.best_params_, 'best_estimator_': gs.best_estimator_}

In [0]:
# Подготовка данных
X, test_X = train_test_split(train, test_size = 0.2)
y = X.Survived
X = prepare_titanic_data(X)
test_y = test_X.Survived
test_X = prepare_titanic_data(test_X)

# для кроссвалидации
cross_y = train.Survived
cross_X = prepare_titanic_data(train)

In [0]:
# Random Forest
rf_clf = RandomForestClassifier()
rf_parameters = {
    'n_estimators': range(10, 30, 5), 
    'max_depth': range(3, 10, 1), 
    'min_samples_leaf': range(1, 7), 
    'min_samples_split': range(4, 12, 2)}

In [0]:
clf_final_rf = grid_search(rf_clf, rf_parameters, X, y)

In [14]:
pred_proba = clf_final_rf.get('best_estimator_').predict_proba(test_X)
roc_score = roc_auc_score(test_y, pred_proba[:, 1])
print("ROC: {}".format(roc_score))

ROC: 0.8533596837944664


In [19]:
# Кроссвалидация
clf_cv = RandomForestClassifier(**clf_final_rf.get('best_estimator_').get_params())
cv_scores = cross_val_score(clf_cv, cross_X, cross_y, scoring='roc_auc', cv=10, n_jobs=-1)
mean_cv_scores = np.mean(cv_scores)

print("Crossvalidation mean(ROC): {}".format(mean_cv_scores))

Crossvalidation mean(ROC): 0.8604224315989022


In [0]:
# MLP neural network (best params from season 1)
nn_fixed_clf = MLPClassifier( activation = 'tanh',alpha = 0.0002, 
                              learning_rate = 'invscaling',
                              max_iter = 300,
                              solver = 'adam',
                              validation_fraction = 0.1)

In [21]:
nn_fixed_clf.fit(X, y)

MLPClassifier(activation='tanh', alpha=0.0002, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='invscaling',
              learning_rate_init=0.001, max_fun=15000, max_iter=300,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [26]:
pred_proba_nn = nn_fixed_clf.predict_proba(test_X)
roc_score_nn = roc_auc_score(test_y, pred_proba_nn[:, 1])
print("ROC: {}".format(roc_score_nn))

ROC: 0.8368082368082368


In [24]:
# Кроссвалидация
nn_clf_cv = MLPClassifier(**nn_fixed_clf.get_params())
cv_scores_nn = cross_val_score(nn_clf_cv, cross_X, cross_y, scoring='roc_auc', cv=10, n_jobs=-1)
mean_cv_scores_nn = np.mean(cv_scores_nn)

print("Crossvalidation mean(ROC): {}".format(mean_cv_scores_nn))

Crossvalidation mean(ROC): 0.8609560591913533
