**Import libraries**

In [1]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns
%matplotlib inline

from collections import Counter


from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

**Getting and splitting data**

Reading dataset with feature engineering done! Splitting dataset into train and test set.

In [2]:
# Read dataset
dataset = pd.read_csv("./data/dataset.csv")

In [3]:
# Getting the passengerId for each cluster and test/train dataset
id_trainW = dataset[(dataset["Survived"].isnull() == False) & (dataset["Sex"] == 1)]["PassengerId"] # Cluster 1 / train
id_trainM = dataset[(dataset["Survived"].isnull() == False) & (dataset["Sex"] == 0)]["PassengerId"] # Cluster 2 / train

id_testW = dataset[(dataset["Survived"].isnull()) & (dataset["Sex"] == 1)]["PassengerId"] # Cluster 1 / test
id_testM = dataset[(dataset["Survived"].isnull()) & (dataset["Sex"] == 0)]["PassengerId"] # Cluster 2 / test

In [4]:
# Drop PassengerId in dataset
dataset.drop(labels = ["PassengerId"], axis = 1, inplace = True)

In [5]:
# rain / test set
train = dataset[np.invert(dataset['Survived'].isnull())]
test = dataset[dataset['Survived'].isnull()]

Clustering dataset by sex

In [6]:
#train set
trainW = train[train['Sex']==1]
trainM = train[train['Sex']==0]

#test set
testW = test[test['Sex']==1]
testM = test[test['Sex']==0]

Features to use in modeling phase and target to predict

In [7]:
# Cluster 1: women
X_trainW = trainW.drop('Survived',axis=1)
y_trainW = trainW['Survived'].astype(int)

X_testW = testW.drop('Survived',axis=1)

# CLuster 2: man
X_trainM = trainM.drop('Survived',axis=1)
y_trainM = trainM['Survived'].astype(int)

X_testM = testM.drop('Survived',axis=1)

**Modeling**

In [8]:
def ensamble_modeling(X_train, y_train, X_test, folds = 10, n_jobs = 4):
    
    # Cross validation
    kfold = StratifiedKFold(n_splits=folds)
    
    
    # ---- MODEL 1 ---- ADABOOST ----
    
    DTC = DecisionTreeClassifier()
    adaDTC = AdaBoostClassifier(DTC, random_state=7)

    # Search grid for optimal parameters
    ada_param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
                      "base_estimator__splitter" :   ["best", "random"],
                      "algorithm" : ["SAMME","SAMME.R"],
                      "n_estimators" :[1,2],
                      "learning_rate":  [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3,1.5]}

    gsadaDTC = GridSearchCV(adaDTC, param_grid = ada_param_grid, cv=kfold, scoring="accuracy", n_jobs= n_jobs, verbose = 1)
    
    gsadaDTC.fit(X_train,y_train)
    
    ada_best = gsadaDTC.best_estimator_

    # Best score
    print("AdaBoost best score: {}".format(gsadaDTC.best_score_))
    
    
    # ---- MODEL 2 ---- EXTRATREES ----
    
    ExtC = ExtraTreesClassifier()

    # Search grid for optimal parameters
    ex_param_grid = {"max_depth": [None],
                  "max_features": [1, 3, 10],
                  "min_samples_split": [2, 3, 10],
                  "min_samples_leaf": [1, 3, 10],
                  "bootstrap": [False],
                  "n_estimators" :[100,300],
                  "criterion": ["gini"]}


    gsExtC = GridSearchCV(ExtC,param_grid = ex_param_grid, cv=kfold, scoring="accuracy", n_jobs= n_jobs, verbose = 1)

    gsExtC.fit(X_train,y_train)

    ExtC_best = gsExtC.best_estimator_

    # Best score
    print("ExtraTrees best score: {}".format(gsExtC.best_score_))
    
    
    # ---- MODEL 3 ---- RANDOM FOREST ----
    
    RFC = RandomForestClassifier()

    # Search grid for optimal parameters
    rf_param_grid = {"max_depth": [None],
                  "max_features": [1, 3, 10],
                  "min_samples_split": [2, 3, 10],
                  "min_samples_leaf": [1, 3, 10],
                  "bootstrap": [False],
                  "n_estimators" :[100,300],
                  "criterion": ["gini"]}


    gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=kfold, scoring="accuracy", n_jobs= n_jobs, verbose = 1)

    gsRFC.fit(X_train,y_train)

    RFC_best = gsRFC.best_estimator_

    # Best score
    print("RandomForest best score: {}".format(gsRFC.best_score_))
    
    
    # ---- MODEL 4 ---- Gradient Boosting ----

    GBC = GradientBoostingClassifier()

    # Search grid for optimal parameters
    gb_param_grid = {'loss' : ["deviance"],
                  'n_estimators' : [100,200,300],
                  'learning_rate': [0.1, 0.05, 0.01],
                  'max_depth': [4, 8],
                  'min_samples_leaf': [100,150],
                  'max_features': [0.3, 0.1] 
                  }

    gsGBC = GridSearchCV(GBC,param_grid = gb_param_grid, cv=kfold, scoring="accuracy", n_jobs= n_jobs, verbose = 1)

    gsGBC.fit(X_train,y_train)

    GBC_best = gsGBC.best_estimator_

    # Best score
    print("Gradient Boosting best score: {}".format(gsGBC.best_score_))
    
    
    # ---- MODEL 5 ---- SVM ----
    SVMC = SVC(probability=True)

    # Search grid for optimal parameters
    svc_param_grid = {'kernel': ['rbf'], 
                      'gamma': [ 0.001, 0.01, 0.1, 1],
                      'C': [1, 10, 50, 100,200,300, 1000]}

    gsSVMC = GridSearchCV(SVMC,param_grid = svc_param_grid, cv=kfold, scoring="accuracy", n_jobs= n_jobs, verbose = 1)

    gsSVMC.fit(X_train,y_train)

    SVMC_best = gsSVMC.best_estimator_

    # Best score
    print("Support Vector Machine best score: {}".format(gsSVMC.best_score_))
    
    
    # ---- PREDICTIONS ----
    
    results = pd.DataFrame({"RFC":RFC_best.predict(X_test),
                        "ExtT":ExtC_best.predict(X_test),
                        "SVM":SVMC_best.predict(X_test),
                        "Ada":ada_best.predict(X_test),
                        "GBC":GBC_best.predict(X_test)})
    
    
    
    model = VotingClassifier(estimators=[('rfc', RFC_best), ('extc', ExtC_best), 
                                         ('svc', SVMC_best), ('ada',ada_best), ('gbc',GBC_best)], 
                             voting='soft', n_jobs=n_jobs)
    
    return model

In [9]:
model_cluster_1 = ensamble_modeling(X_trainW, y_trainW, X_testW)

Fitting 10 folds for each of 112 candidates, totalling 1120 fits


[Parallel(n_jobs=4)]: Done 712 tasks      | elapsed:    2.6s


AdaBoost best score: 0.803921568627451
Fitting 10 folds for each of 54 candidates, totalling 540 fits


[Parallel(n_jobs=4)]: Done 1120 out of 1120 | elapsed:    3.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   24.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   60.0s
[Parallel(n_jobs=4)]: Done 540 out of 540 | elapsed:  1.2min finished


ExtraTrees best score: 0.821078431372549
Fitting 10 folds for each of 54 candidates, totalling 540 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    7.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   29.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 540 out of 540 | elapsed:  1.4min finished


RandomForest best score: 0.8063725490196079
Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=4)]: Done 144 tasks      | elapsed:    5.4s


Gradient Boosting best score: 0.7990196078431373
Fitting 10 folds for each of 28 candidates, totalling 280 fits


[Parallel(n_jobs=4)]: Done 720 out of 720 | elapsed:   25.2s finished
[Parallel(n_jobs=4)]: Done 144 tasks      | elapsed:    5.1s


Support Vector Machine best score: 0.7941176470588235


[Parallel(n_jobs=4)]: Done 280 out of 280 | elapsed:   12.0s finished


In [10]:
model_cluster_2 = ensamble_modeling(X_trainM, y_trainM, X_testM)

Fitting 10 folds for each of 112 candidates, totalling 1120 fits


[Parallel(n_jobs=4)]: Done 608 tasks      | elapsed:    3.4s


AdaBoost best score: 0.8047091412742382
Fitting 10 folds for each of 54 candidates, totalling 540 fits


[Parallel(n_jobs=4)]: Done 1120 out of 1120 | elapsed:    5.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    8.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   30.5s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 540 out of 540 | elapsed:  1.5min finished


ExtraTrees best score: 0.8310249307479224
Fitting 10 folds for each of 54 candidates, totalling 540 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    8.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   31.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 540 out of 540 | elapsed:  1.6min finished


RandomForest best score: 0.8337950138504155
Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    4.0s
[Parallel(n_jobs=4)]: Done 376 tasks      | elapsed:   18.6s
[Parallel(n_jobs=4)]: Done 720 out of 720 | elapsed:   38.0s finished


Gradient Boosting best score: 0.8060941828254847
Fitting 10 folds for each of 28 candidates, totalling 280 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   22.2s
[Parallel(n_jobs=4)]: Done 280 out of 280 | elapsed:   40.7s finished


Support Vector Machine best score: 0.8310249307479224


**Predictions**

Fit the model and predictions:

In [11]:
model_cluster_1.fit(X_trainW, y_trainW)
model_cluster_2.fit(X_trainM, y_trainM)

VotingClassifier(estimators=[('rfc', RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=10,
            min_wei...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))],
         flatten_transform=None, n_jobs=4, voting='soft', weights=None)

In [12]:
predictions_cluster_1 = model_cluster_1.predict(X_testW)
predictions_cluster_2 = model_cluster_2.predict(X_testM)

Sorting predictions according to delivery format

In [67]:
predictions = pd.DataFrame({"PassengerId":pd.concat([id_trainW,id_trainM,id_testW,id_testM], axis = 0),
                            "Survived":list(pd.concat([pd.Series(train[train["Sex"] == 1]["Survived"].astype(int)),
                                                       pd.Series(train[train["Sex"] == 0]["Survived"].astype(int)),
                                                       pd.Series(predictions_cluster_1),
                                                       pd.Series(predictions_cluster_2)],axis=0))})

In [93]:
predictions = predictions.sort_values("PassengerId")[891:].reset_index().iloc[:,1:3]

In [95]:
predictions.to_csv("./outputs/complete_2cluster.csv",index=False)