<a href="https://colab.research.google.com/github/Jiablero/notebooks/blob/master/titanic_kaggle_season3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

In [0]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [0]:
# from season 1

# data preparation
def prepare_titanic_data(data):
  X = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Fare', 'Embarked'], axis = 1)  
  if 'Survived' in data.columns : 
    X = X.drop('Survived', axis = 1) # в тестовых данных нет Survived
  X = X.fillna({'Age': X.Age.median()})
  #X['Family'] = X.SibSp + X.Parch + 1
  X = pd.get_dummies(X).drop('Sex_male', axis=1)
  return(X)

# grid search universal
def grid_search(clf, params, X, y):
  gs = GridSearchCV(clf, params, n_jobs=-1)
  gs.fit(X, y)
  # best_params_ для отладки
  return {'best_params_': gs.best_params_, 'best_estimator_': gs.best_estimator_}

In [0]:
# Подготовка данных
X, test_X = train_test_split(train, test_size = 0.2)
y = X.Survived
X = prepare_titanic_data(X)
test_y = test_X.Survived
test_X = prepare_titanic_data(test_X)

# для кроссвалидации
cross_y = train.Survived
cross_X = prepare_titanic_data(train)

In [0]:
# Random Forest
rf_clf = RandomForestClassifier()
rf_parameters = {
    'n_estimators': range(10, 30, 5), 
    'max_depth': range(3, 10, 1), 
    'min_samples_leaf': range(1, 7), 
    'min_samples_split': range(4, 12, 2)}

In [0]:
clf_final_rf = grid_search(rf_clf, rf_parameters, X, y)

In [0]:
pred_proba = clf_final_rf.get('best_estimator_').predict_proba(test_X)
roc_score = roc_auc_score(test_y, pred_proba[:, 1])
print("ROC: {}".format(roc_score))

ROC: 0.8575289575289575


In [0]:
# Кроссвалидация
clf_cv = RandomForestClassifier(**clf_final_rf.get('best_estimator_').get_params())
cv_scores = cross_val_score(clf_cv, cross_X, cross_y, scoring='roc_auc', cv=10, n_jobs=-1)
mean_cv_scores = np.mean(cv_scores)

print("Crossvalidation mean(ROC): {}".format(mean_cv_scores))

Crossvalidation mean(ROC): 0.8556507087683558


In [0]:
# MLP neural network (best params from season 1)
nn_fixed_clf = MLPClassifier( activation = 'tanh',alpha = 0.0002, 
                              learning_rate = 'invscaling',
                              max_iter = 300,
                              solver = 'adam',
                              validation_fraction = 0.1)

In [0]:
nn_fixed_clf.fit(X, y)

MLPClassifier(activation='tanh', alpha=0.0002, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='invscaling',
              learning_rate_init=0.001, max_fun=15000, max_iter=300,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [0]:
pred_proba_nn = nn_fixed_clf.predict_proba(test_X)
roc_score_nn = roc_auc_score(test_y, pred_proba_nn[:, 1])
print("ROC: {}".format(roc_score_nn))

ROC: 0.8866151866151866


In [0]:
# Кроссвалидация
nn_clf_cv = MLPClassifier(**nn_fixed_clf.get_params())
cv_scores_nn = cross_val_score(nn_clf_cv, cross_X, cross_y, scoring='roc_auc', cv=10, n_jobs=-1)
mean_cv_scores_nn = np.mean(cv_scores_nn)

print("Crossvalidation mean(ROC): {}".format(mean_cv_scores_nn))

Crossvalidation mean(ROC): 0.8561979741391506


In [0]:
# готовим ответ для Kaggle
nn_fixed_clf.fit(cross_X, cross_y)
clf_final_rf.get('best_estimator_').fit(cross_X, cross_y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=25,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
result_X = prepare_titanic_data(test)

In [0]:
result = pd.DataFrame(data=test.PassengerId)

In [0]:
result['nn'] = pd.Series(nn_fixed_clf.predict(result_X))
result['rf'] = pd.Series(clf_final_rf.get('best_estimator_').predict(result_X))

In [0]:
result['nn_proba_0'] = pd.Series(nn_fixed_clf.predict_proba(result_X)[:,0])
result['rf_proba_0'] = pd.Series(clf_final_rf.get('best_estimator_').predict_proba(result_X)[:,0])
result['nn_proba_1'] = pd.Series(nn_fixed_clf.predict_proba(result_X)[:,1])
result['rf_proba_1'] = pd.Series(clf_final_rf.get('best_estimator_').predict_proba(result_X)[:,1])

In [0]:
result['summ'] = result.nn + result.rf

In [169]:
# Random Forrest и MLP neural network не сошлись во мнениях в 9 из 418 случаях
# приведем их к консенсусу
result[result.summ == 1]

Unnamed: 0,PassengerId,nn,rf,nn_proba_0,rf_proba_0,nn_proba_1,rf_proba_1,summ
21,913,1,0,0.363047,0.717114,0.636953,0.282886,1
64,956,0,1,0.612758,0.496693,0.387242,0.503307,1
80,972,1,0,0.290101,0.603435,0.709899,0.396565,1
159,1051,1,0,0.496492,0.50675,0.503508,0.49325,1
161,1053,1,0,0.372824,0.680721,0.627176,0.319279,1
194,1086,1,0,0.197495,0.547238,0.802505,0.452762,1
201,1093,1,0,0.060199,0.502935,0.939801,0.497065,1
225,1117,1,0,0.499813,0.501057,0.500187,0.498943,1
376,1268,0,1,0.521723,0.432273,0.478277,0.567727,1


In [0]:
result['Survived'] = np.where(result.summ == 2, 1, 0)

In [190]:
# пришлось отрезать кусочки датасета, т.к. np.where не хочет работать на фильтрованном датасете
# не придумал как сделать правильно
surv_1 = result[(result.summ == 1) & (result.nn == 1)]
surv_1.Survived = np.where(surv_1.nn_proba_1 > surv_1.rf_proba_0, 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [192]:
surv_2 = result[(result.summ == 1) & (result.nn == 0)]
surv_2.Survived = np.where(surv_2.nn_proba_0 > surv_2.rf_proba_1, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [0]:
# прилепляем отрезанное обратно
result = result[result.summ != 1].append(surv_1)

In [0]:
result = result.append(surv_2)

In [0]:
# 0.78947 на Kaggle, 5208 место из 23835
result[['PassengerId', 'Survived']].sort_values(by=['PassengerId']).to_csv('results.csv', index = False)