<a href="https://colab.research.google.com/github/Jiablero/notebooks/blob/master/titanic_kaggle_contest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

In [0]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# gender = pd.read_csv('gender_submission.csv') # just an example of result file

Data Preparation.




In [0]:
def prepare_titanic_data(data):
  X = data.drop(['PassengerId', 'Name', 'Fare', 'Ticket', 'Embarked', 'Cabin'], axis = 1)
  if 'Survived' in data.columns : 
    X = X.drop('Survived', axis = 1) # в тестовых данных нет Survived
  X = X.fillna({'Age': X.Age.median()})
  X = pd.get_dummies(X).drop('Sex_male', axis=1)
  return(X)

In [0]:
X = prepare_titanic_data(train)
y = train.Survived
test_X = prepare_titanic_data(test)

Classifiers creation, training.

In [0]:
def grid_search(clf, params, X, y):
  gs = GridSearchCV(clf, params, n_jobs=-1)
  gs.fit(X, y)
  # best_params_ для отладки
  return {'best_params_': gs.best_params_, 'best_estimator_': gs.best_estimator_}

In [0]:
# Random Forest
rf_clf = RandomForestClassifier()
rf_parameters = {
    'n_estimators': range(10, 30, 5), 
    'max_depth': range(3, 10, 1), 
    'min_samples_leaf': range(1, 7), 
    'min_samples_split': range(4, 12, 2)}

In [0]:
clf_final_rf = grid_search(rf_clf, rf_parameters, X, y)

In [79]:
clf_final_rf.get('best_estimator_').score(X, y)

0.8361391694725028

In [0]:
# Neural network (MLP)
nn_clf = MLPClassifier()
nn_parameters = {  'activation': ['identity', 'logistic', 'tanh', 'relu'],
                'solver': ['lbfgs', 'sgd', 'adam'],
                'alpha': np.arange(0.0001, 0.0005, 0.0001),
                'learning_rate': ['constant', 'invscaling', 'adaptive'],
                'max_iter': range(100, 500, 100),
                'validation_fraction': np.arange(0.1, 0.4, 0.1), 
              }

In [0]:
clf_final_nn = grid_search(nn_clf, nn_parameters, X, y)

In [0]:
clf_final_nn.get('best_estimator_').score(X, y)

In [0]:
# Bayes
by_cld = MultinomialNB()
by_parameters = {'alpha': np.arrange(0.0, 2.0, 0.1)}

In [0]:
clf_final_by = grid_search(by_clf, by_parameters, X, y)

In [0]:
test_result = test_X


In [0]:
test_result['Survived'] = clf.predict(test_X)

In [0]:
test_result

Unnamed: 0,Pclass,Age,SibSp,Parch,Sex_female,Survived
0,3,34.5,0,0,0,0
1,3,47.0,1,0,1,0
2,2,62.0,0,0,0,0
3,3,27.0,0,0,0,0
4,3,22.0,1,1,1,1
...,...,...,...,...,...,...
413,3,27.0,0,0,0,0
414,1,39.0,0,0,1,1
415,3,38.5,0,0,0,0
416,3,27.0,0,0,0,0


In [0]:
clf_bias = MultinomialNB()

In [0]:
clf_bias.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
clf_bias.score(X,y)

0.7845117845117845

In [0]:
clf_bias.predict(test_X.drop('Survived', axis=1))

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [0]:
test_result['Survived_Bias'] = clf_bias.predict(test_X.drop('Survived', axis=1))

In [0]:
test_result

Unnamed: 0,Pclass,Age,SibSp,Parch,Sex_female,Survived,Survived_Bias
0,3,34.5,0,0,0,0,0
1,3,47.0,1,0,1,0,1
2,2,62.0,0,0,0,0,0
3,3,27.0,0,0,0,0,0
4,3,22.0,1,1,1,1,1
...,...,...,...,...,...,...,...
413,3,27.0,0,0,0,0,0
414,1,39.0,0,0,1,1,1
415,3,38.5,0,0,0,0,0
416,3,27.0,0,0,0,0,0


In [0]:
clf_nn.fit(X, y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [0]:
clf_nn.score(X, y)

0.8305274971941639

In [0]:
test_result['Survived_NN'] = clf_nn.predict(test_X.drop(['Survived', 'Survived_Bias'], axis=1))


In [0]:
test_result['Surv_Median'] = test_result.Survived + test_result.Survived_Bias + test_result.Survived_NN

In [0]:
test_result[test_result.Surv_Median < 2] = 0
test_result[test_result.Surv_Median >= 2] = 1

In [0]:
test_result

Unnamed: 0,Pclass,Age,SibSp,Parch,Sex_female,Survived,Survived_Bias,Survived_NN,Surv_Median
0,0,0.0,0,0,0,0,0,0,0
1,1,1.0,1,1,1,1,1,1,1
2,0,0.0,0,0,0,0,0,0,0
3,0,0.0,0,0,0,0,0,0,0
4,1,1.0,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...
413,0,0.0,0,0,0,0,0,0,0
414,1,1.0,1,1,1,1,1,1,1
415,0,0.0,0,0,0,0,0,0,0
416,0,0.0,0,0,0,0,0,0,0


In [0]:
result = pd.DataFrame({'PassengerID': test.PassengerId, 'Survived': test_result.Surv_Median})

In [0]:
result

Unnamed: 0,PassengerID,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [0]:
# TODO: добавить еще пару методов, grid search cv, вычислять результат по probs а не по ответам, отрефакторить, оттестить.