In [1]:
import json
import numpy as np
import pandas as pd
from utils import preprocess

In [2]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
# Load the preprocessed train / test frames through the project helper.
train, test = preprocess.get_data()

# Separate the target column from the remaining feature columns.
yTrain = train['Survived']
xTrain = train.drop(columns='Survived')

## Creating and fixing the folds for all the classifiers

In [4]:
# Replace the index with 0..n-1 and add an empty 'fold' column that the
# next cell fills with each row's validation fold number.
train = train.reset_index(drop=True).assign(fold=None)

In [5]:
from sklearn.model_selection import StratifiedKFold
# Five shuffled, stratified folds with a fixed seed so that every classifier
# below is evaluated on exactly the same split.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

# Record, for every row, the number of the fold in which it is held out.
# split() yields row positions; they are used as .loc labels here, which is
# valid because train's index was reset to 0..n-1 in the previous cell.
for i, (train_idx, valid_dx) in enumerate(kfold.split(xTrain, yTrain)):
    train.loc[valid_dx,'fold'] = i

## Loading pre-tuned classifiers

In [6]:
# Read the tuned hyper-parameters: a mapping of classifier name -> dict of
# constructor keyword arguments. The encoding is given explicitly so the file
# is decoded the same way on every platform instead of using the locale default.
with open('./results/05_.json', encoding='utf-8') as file:
    top_clf = json.load(file)

In [7]:
# List every tuned classifier (numbered from 1) together with its parameter
# dict, with a 100-character dashed rule after each entry.
print('Differet Classifiers with tuned Parameters\n')
for i, (key, val) in enumerate(top_clf.items()):
    print(i+1,'  ', key, '\n    ', val)
    print('-'*100, '\n')

Differet Classifiers with tuned Parameters

1    GaussianNB 
     {}
---------------------------------------------------------------------------------------------------- 

2    LogisticRegression 
     {'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
---------------------------------------------------------------------------------------------------- 

3    KNeighborsClassifier 
     {'n_neighbors': 10, 'p': 1, 'weights': 'uniform'}
---------------------------------------------------------------------------------------------------- 

4    SVC 
     {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
---------------------------------------------------------------------------------------------------- 

5    RandomForestClassifier 
     {'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'n_estimators': 500}
---------------------------------------------------------------------------------------------------- 

6    AdaBoostClassifier 
     {'n_estimators': 50}
-------------------

In [8]:
def get_preds(train, estimator, params, prob=True):
    """
    Build out-of-fold predictions using the folds stored in train['fold'].

    For every distinct fold label a fresh estimator(**params) is fitted on the
    rows of all other folds and then used to predict the rows of that fold, so
    each row is predicted by a model that was not trained on it.

    Input:
    train - dataframe with 'Survived' as target column and 'fold' holding the
            validation fold label of each row; every other column is a feature.
    estimator - classifier class (not an instance), called as estimator(**params)
    params - dict of keyword arguments for the classifier constructor
    prob - if True return the positive-class probability (second column of
           predict_proba); otherwise return the output of predict

    returns:
    float pandas series, indexed like train, holding the out-of-fold
    probabilities or predictions depending on 'prob'
    """

    # Explicit float dtype: a Series created empty without one is object-typed
    # on recent pandas, and DataFrame.round() silently skips object columns.
    Preds = pd.Series(index=train.index, dtype=float)

    # Features and target are the same for every fold, so build them once.
    xData = train.drop(['Survived', 'fold'], axis=1)
    yData = train.Survived

    for fold in train.fold.unique():
        # Rows held out in this iteration; everything else is training data.
        is_valid = train['fold'] == fold

        X_train = xData[~is_valid]
        y_train = yData[~is_valid]
        X_valid = xData[is_valid]

        clf = estimator(**params)
        clf.fit(X_train, y_train)

        if prob:
            # only taking probability for class 1
            Preds.loc[is_valid] = clf.predict_proba(X_valid)[:, 1]
        else:
            Preds.loc[is_valid] = clf.predict(X_valid)

    return Preds

In [9]:
# Out-of-fold positive-class probabilities of every first-level classifier,
# one column per model, all computed on the same predefined folds.
#
# Estimators whose fit is stochastic get a fixed seed so this table, and
# everything derived from it, is reproducible across runs. The seed is placed
# first in each merge so a 'random_state' stored in the tuned parameters
# still takes precedence. GaussianNB and KNeighborsClassifier take no seed.
seeded = {'random_state': 21}

prob_df = pd.DataFrame({
    'nbc': get_preds(train, GaussianNB, top_clf['GaussianNB']),
    'lrc': get_preds(train, LogisticRegression, {**seeded, **top_clf['LogisticRegression']}),
    'nnc': get_preds(train, KNeighborsClassifier, top_clf['KNeighborsClassifier']),
    # SVC only provides predict_proba when constructed with probability=True.
    'svc': get_preds(train, SVC, {**seeded, **top_clf['SVC'], 'probability': True}),
    'rfc': get_preds(train, RandomForestClassifier, {**seeded, **top_clf['RandomForestClassifier']}),
    'abc': get_preds(train, AdaBoostClassifier, {**seeded, **top_clf['AdaBoostClassifier']}),
    'gbc': get_preds(train, GradientBoostingClassifier, {**seeded, **top_clf['GradientBoostingClassifier']}),
})

# Hard 0/1 labels obtained by rounding each probability.
pred_df = prob_df.round()

In [10]:
# Accuracy of each first-level classifier: the share of rows whose rounded
# out-of-fold prediction equals the true label.
print('Individual performance of the classifiers\n')
scores = {each: np.mean(pred_df[each] == train.Survived) for each in pred_df}
print(pd.DataFrame(scores, index=[0]))

Individual performance of the classifiers

       nbc      lrc      nnc       svc      rfc       abc       gbc
0  0.76431  0.82716  0.82716  0.833895  0.83165  0.813692  0.828283


## Hard Voting

In [11]:
# Preview the rounded (0/1) per-classifier predictions that are voted on below.
pred_df.head()

Unnamed: 0,nbc,lrc,nnc,svc,rfc,abc,gbc
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.0,1.0,0.0,1.0,0.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Majority vote: the most frequent label in each row of pred_df.
# DataFrame.mode(axis=1) returns one column per tied mode (padded with NaN),
# so flattening the whole result yields an array of the wrong length as soon
# as any row is tied. Taking only the first column always gives exactly one
# vote per row (the smaller label on a tie).
mode_pred = pred_df.mode(axis=1).iloc[:, 0].values
print("Accuracy for simple voting classifier", np.mean(mode_pred == train.Survived))

Accuracy for simple voting classifier 0.8361391694725028


## Soft Voting

In [13]:
# Preview the per-classifier positive-class probabilities that are averaged below.
prob_df.head()

Unnamed: 0,nbc,lrc,nnc,svc,rfc,abc,gbc
0,8.516697e-08,0.065178,0.2,0.10724,0.106648,0.486433,0.089726
1,0.9998619,0.945885,1.0,0.896522,0.956119,0.52434,0.981633
2,0.07003943,0.62244,0.5,0.642025,0.498886,0.50323,0.667532
3,0.9978777,0.932387,1.0,0.904884,0.9569,0.518774,0.989538
4,6.444948e-08,0.069333,0.1,0.16111,0.103103,0.489391,0.139744


In [14]:
# Soft vote: average the classifiers' positive-class probabilities per row,
# round the average to a 0/1 label and compare it with the true label.
mean_prob = prob_df.mean(axis=1).to_numpy()
soft_accuracy = np.mean(mean_prob.round() == train.Survived)
print("Accuracy for soft voting classifier", soft_accuracy)

Accuracy for soft voting classifier 0.8316498316498316


## Test

In [15]:
# Preview the training frame (target, features and the assigned 'fold' column).
train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Name_Master.,Name_Miss.,Name_Mr.,Name_Mrs.,Name_grp1,Name_grp2,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,fold
0,0,3,-0.592481,1,0,-0.502445,0,0,1,0,0,0,0,1,0,0,1,0
1,1,1,0.638789,1,0,0.786845,0,0,0,1,0,0,1,0,1,0,0,1
2,1,3,-0.284663,0,0,-0.488854,0,1,0,0,0,0,1,0,0,0,1,4
3,1,1,0.407926,1,0,0.42073,0,0,0,1,0,0,1,0,0,0,1,2
4,0,3,0.407926,0,0,-0.486337,0,0,1,0,0,0,0,1,0,0,1,4


In [16]:
# Preview the test frame that the first-level classifiers will score.
test.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Name_Master.,Name_Miss.,Name_Mr.,Name_Mrs.,Name_grp1,Name_grp2,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
892,3,0.369449,0,0,-0.490783,0,0,1,0,0,0,0,1,0,1,0
893,3,1.331378,1,0,-0.507479,0,0,0,1,0,0,1,0,0,0,1
894,2,2.485693,0,0,-0.453367,0,0,1,0,0,0,0,1,0,1,0
895,3,-0.207709,0,0,-0.474005,0,0,1,0,0,0,0,1,0,0,1
896,3,-0.592481,1,1,-0.401017,0,0,0,1,0,0,1,0,0,0,1


In [17]:
def predict(train, test, estimator, params):
    """
    Fit estimator(**params) on the whole training frame and score the test frame.

    Input:
    train - dataframe with the 'Survived' target and the 'fold' bookkeeping
            column; both are dropped to obtain the feature matrix.
    test - feature dataframe passed to predict_proba unchanged
    estimator - classifier class, called as estimator(**params)
    params - dict of keyword arguments for the classifier constructor

    returns:
    positive-class probability (second column of predict_proba) for every
    row of test
    """
    features = train.drop(columns=['Survived', 'fold'])
    target = train['Survived']

    model = estimator(**params)
    model.fit(features, target)

    # Column 1 of predict_proba is the probability of the positive class.
    return model.predict_proba(test)[:, 1]

In [18]:
# Positive-class probabilities on the test set from every first-level
# classifier, each refitted on the full training frame.
#
# Estimators whose fit is stochastic get a fixed seed so the test predictions
# are reproducible across runs. The seed is placed first in each merge so a
# 'random_state' stored in the tuned parameters still takes precedence.
# GaussianNB and KNeighborsClassifier take no seed.
test_seeded = {'random_state': 21}

test_prob_df = pd.DataFrame({
    'nbc': predict(train, test, GaussianNB, top_clf['GaussianNB']),
    'lrc': predict(train, test, LogisticRegression, {**test_seeded, **top_clf['LogisticRegression']}),
    'nnc': predict(train, test, KNeighborsClassifier, top_clf['KNeighborsClassifier']),
    # SVC only provides predict_proba when constructed with probability=True.
    'svc': predict(train, test, SVC, {**test_seeded, **top_clf['SVC'], 'probability': True}),
    'rfc': predict(train, test, RandomForestClassifier, {**test_seeded, **top_clf['RandomForestClassifier']}),
    'abc': predict(train, test, AdaBoostClassifier, {**test_seeded, **top_clf['AdaBoostClassifier']}),
    'gbc': predict(train, test, GradientBoostingClassifier, {**test_seeded, **top_clf['GradientBoostingClassifier']}),
})

# Hard 0/1 labels obtained by rounding each probability.
test_pred_df = test_prob_df.round()

In [19]:
# Majority vote per test row. DataFrame.mode(axis=1) returns one column per
# tied mode (padded with NaN), so only the first column is kept; flattening
# the whole frame would give a wrong-length array whenever a row is tied.
hard_vote = test_pred_df.mode(axis=1).iloc[:, 0].values

In [20]:
# Soft vote per test row: average the classifiers' positive-class
# probabilities, then round the average to a 0/1 label.
mean_test_prob = test_prob_df.mean(axis=1).to_numpy()
soft_vote = mean_test_prob.round()

## Stacking

### Training a meta-model on the first-level classifiers' predictions

In [21]:
def parameterTune(estimator, param_grid, X, y, n_jobs=11, cv=5):
    """
    Grid-search the hyper-parameters of an estimator with cross-validation.

    Input:
    estimator - estimator instance handed to GridSearchCV
    param_grid - dict, or list of dicts, of parameter values to try
    X - feature matrix used for the search
    y - target values used for the search
    n_jobs - number of parallel jobs for the search (default 11, the value
             that used to be hard-coded; lower it on machines with fewer cores)
    cv - cross-validation setting forwarded to GridSearchCV (default 5)

    returns:
    tuple (best_score_, best_params_) of the fitted search
    """
    from sklearn.model_selection import GridSearchCV

    grid = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        n_jobs=n_jobs,
        cv=cv,
    )
    grid.fit(X, y)

    return grid.best_score_, grid.best_params_

In [22]:
# Meta-learner candidate: a support vector classifier tuned on the
# first-level out-of-fold probabilities.
from sklearn.svm import SVC

estimator = SVC()

# Both kernels are searched over the same regularisation strengths.
c_values = [0.1, 1, 10, 100]
param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': ['scale', 'auto']},
]

svc_best_score_, svc_best_params_ = parameterTune(estimator, param_grid, prob_df, yTrain)
print('best_score_:', svc_best_score_, '\nbest_params_:', svc_best_params_)

best_score_: 0.8383654510074697 
best_params_: {'C': 0.1, 'kernel': 'linear'}


In [23]:
# Meta-learner candidate: logistic regression tuned on the first-level
# out-of-fold probabilities. random_state is fixed on the estimator.
from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression(tol=1e-4, solver='liblinear', random_state=1)

# The solver is pinned to liblinear both on the estimator and in the grid;
# only the iteration cap and the penalty type are actually varied.
param_grid = {
    'max_iter' : [1000, 2000, 3000],
    'penalty'  : ['l1', 'l2'],
    'solver'   : ['liblinear']
}

# Search on the out-of-fold probability table against the training labels.
lrc_best_score_, lrc_best_params_ = parameterTune(estimator, param_grid, prob_df, yTrain)
print('best_score_:',lrc_best_score_,'\nbest_params_:',lrc_best_params_)

best_score_: 0.8383905592869247 
best_params_: {'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
