# MIT 6.867 Final Project: Ensembling Algorithms
Irina Degtiar

Sources for functions:

- https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python
- https://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/

In [17]:
##########################################################################################################
### Set up workspace
##########################################################################################################
# Ensure re-load of all code
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Import libraries - general
import numpy as np
import pylab as pl
import pandas as pd
import random

# Import libraries - classification
import sklearn.metrics
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
from Helper_files.Gini_coefficient import gini_sklearn

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
##########################################################################################################
### Load data
##########################################################################################################
# Load the three pre-scaled splits, each from its own pickle. Previously the
# validation and test frames re-read the training pickle, so all three
# variables held the same data.
# NOTE(review): the validation/test file names are assumed to follow the
# training file's naming pattern -- confirm they exist in ../Data/2_Cleaned/.
training_scaled = pd.read_pickle('../Data/2_Cleaned/training_scaled.pickle')
validation_scaled = pd.read_pickle('../Data/2_Cleaned/validation_scaled.pickle')
test_scaled = pd.read_pickle('../Data/2_Cleaned/test_scaled.pickle')

In [None]:
# ##########################################################################################################
# ### Define functions
# ##########################################################################################################
# # Parameter definition
# ntrain = train.shape[0]
# ntest = test.shape[0]
# SEED = 123 # for reproducibility
# NFOLDS = 5 # set folds for out-of-fold prediction
# kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

# # Class to extend the Sklearn classifier
# class SklearnHelper(object):
#     def __init__(self, classifier, seed=0, params=None):
#         params['random_state'] = seed
#         self.classifier = classifier(**params)

#     def train(self, x_train, y_train):
#         self.classifier.fit(x_train, y_train)

#     def predict(self, x):
#         return self.classifier.predict(x)
    
#     def fit(self,x,y):
#         return self.classifier.fit(x,y)
    
#     def feature_importances(self,x,y):
#         return self.classifier.fit(x,y).feature_importances_

# # Function to obtain out-of-fold training and test predictions
# def get_oof(classifier, x_train, y_train, x_test):
#     oof_train = np.zeros((ntrain,))
#     oof_test = np.zeros((ntest,))
#     oof_test_skf = np.empty((NFOLDS, ntest))

#     for i, (train_index, test_index) in enumerate(kf):
#         x_tr = x_train[train_index]
#         y_tr = y_train[train_index]
#         x_te = x_train[test_index]

#         classifier.train(x_tr, y_tr)

#         oof_train[test_index] = classifier.predict(x_te)
#         oof_test_skf[i, :] = classifier.predict(x_test)

#     oof_test[:] = oof_test_skf.mean(axis=0)
#     return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [15]:
##########################################################################################################
### Set parameters for each algorithm; create algorithms
##########################################################################################################
seed = 123  # random state handed to the random forest below

# Base (level-0) learners that feed the stacking ensemble.
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(
    bootstrap=True, class_weight=None, criterion='gini',
    max_depth=9,
    # 'sqrt' is what 'auto' meant for forest classifiers; newer scikit-learn
    # releases no longer accept 'auto', so spell the value out.
    max_features='sqrt', max_leaf_nodes=None,
    #min_impurity_decrease=0, min_impurity_split=None,
    min_samples_leaf=1, min_samples_split=2,
    min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
    oob_score=False, random_state=seed, verbose=0, warm_start=False)
clf3 = GaussianNB()

# Meta (level-1) learner fitted on the base learners' outputs.
lr = LogisticRegression() # Will replace with XGBoost

# use_probas=False: the meta-classifier is given predicted class labels, not
# class probabilities, from the three base learners.
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                          use_probas=False,
                          average_probas=False,
                          meta_classifier=lr)


In [None]:
##########################################################################################################
### Run stacked ensemble
##########################################################################################################
# Cross-validate each base learner and the stacked ensemble on the training
# split, printing the mean and standard deviation of the per-fold scores.
n_folds = 5  # single source for both the header text and the cv argument
print('%d-fold cross validation:\n' % n_folds)

# Features are every column except the row identifier and the label.
# axis is passed by keyword: newer pandas rejects a positional axis in drop().
X = training_scaled.drop(['id', 'target'], axis=1)
y = training_scaled['target']

models = [
    ('KNN', clf1),
    ('Random Forest', clf2),
    ('Naive Bayes', clf3),
    ('StackingClassifier', sclf),
]

for label, clf in models:
    scores = model_selection.cross_val_score(clf, X, y,
                                              cv=n_folds, scoring=gini_sklearn)
    # The scorer passed above is gini_sklearn, so the value is labelled
    # "Gini" rather than "Accuracy".
    print("Gini: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
