In [30]:
# Importing the required libraries
import pandas as pd
pd.set_option('display.max_columns', 50) # Display up to 50 columns at a time
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import cm
plt.style.use('seaborn')
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12,5
import glob # To read all csv files in the directory
import seaborn as sns
import calendar
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support
import itertools
import time
import xgboost as xgb
import glob
import pickle as pk
from keras.models import load_model
import os
from sklearn.metrics import precision_recall_fscore_support
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.utils import to_categorical
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
from sklearn.externals import joblib
from keras.models import load_model

In [19]:
def stacked_dataset(models, inputX, scaler = None, fit_scaler = False):
    """
    Input : list of learners, np.array, sklearn object, bool
    Output: np.array
    The function takes a list of pretrained models, the training observations and eventually a standard scaler to scale 
    the stacked data. The last boolean is an indicator to tell if the standard scaler ought to be trained or if it has already
    been. Then it returns the concatenated predictions
    of each and every model in a flattened array. The output will be the input of the level 1 model to train with
    trainStack
    """
    stackX = None
    for model in models:
        # make prediction
        yhat = model.predict(inputX)
        # stack predictions into [rows, members, probabilities]
        if stackX is None:
            stackX = yhat
        else:
            stackX = np.dstack((stackX, yhat))
    # flatten predictions to [rows, members x probabilities]
    if len(models) > 1:
        stackX = stackX.reshape((stackX.shape[1], stackX.shape[2]))
        
#     print("Il y a {0} modèles, le format des observations est : {1} et celui des observations empilées est : {2}".format(len(models), inputX.shape, stackX.shape))
#     print("Les cinq premières lignes ressemblent à ceci : {}".format(stackX[:5,:]))

    if scaler is not None:
        if fit_scaler:
            stackX = scaler.fit_transform(stackX)
        else:
            stackX = scaler.transform(stackX)
    
#     print('Les résultats agglomérés des modèles ressemblent à ça : {}'.format(stackX[:5,:]))
    return stackX

In [20]:
def trainStack(first_models, final_model, X_train, y_train, is_MLP = False, epochs = 300, scaler = None, fit_scaler = False):
    """
    Input : list of learners, learner, np.array, np.array, bool, int, sklearn object, bool
    Output : learner
    The function takes the level 0 trained learners, the level 1 learner to train, the training observations, the 
    training labels, the boolean telling whether or not the top-level classifier is a Multi-Layer Perceptron and the integer
    corresponding to the number of training epochs if we have an MLP. The two last arguments are a respectively a 
    standard scaler in case we need to scale our data and a boolean telling whether or not it has to be trained.
    It returns the level 1 trained model.
    """
    X_stacked = stacked_dataset(first_models, X_train, scaler = scaler, fit_scaler = fit_scaler)
    
    if is_MLP:
        y_train_categ = to_categorical(y_train)
#         print("Les labels pour l'entrainement ont cette forme : {}".format(y_train_categ[:5]))
        final_model.fit(X_stacked, y_train_categ, epochs = epochs, verbose = 0)
    else:
        final_model.fit(X_stacked, y_train)
    
    return final_model

In [21]:
def predictStack(first_models, final_model, X_test, scaler = None):
    """
    Input : list of learners, learner, array-like, sklearn object
    Output : array-like
    The function takes the first-level trained models, the top-level trained model, the test set and eventually a scaler 
    that scales the X_test data with a pretrained scaler (trained on the training data) and returns 
    the predictions of the stack on the test set.
    """
    X_stacked = stacked_dataset(first_models, X_test, scaler = scaler)
        
    y_predicted = final_model.predict(X_stacked)
    return y_predicted

In [22]:
def probabilitiesToClass(array):
    """
    Input: np.array of size (n,2)
    Output : np.array of size (n,)
    The function takes the array whose lines are the observations and column values correspond to the probability of belonging 
    to a given class. If the probability in a given column is superior to 0.5, then we will return the number of that class. 
    For instance : [0.33,0.66] for an observation will render [1] and [0.7,0.3] will render [0].
    """
    res = np.array(array.shape[0]*[0])
    for i in range(array.shape[0]):
        if array[i,1] > array[i,0]:
            res[i] = 1
    return res

In [23]:
def displayPerformances(y_test, y_test_pred, y_train, y_train_pred):
    """
    Input: np.array (n,1), np.array (n,1), np.array (N,1), np.array (N,1)
    Output : pd.DataFrame
    The function takes the labels from the training set, the predicted labels, and the same for the test set. It returns
    a DataFrame containing the values on each data set of the precision, the recall and the f1-score.
    """
    test_precision, test_recall, test_f1score, test_support = precision_recall_fscore_support(y_test, y_test_pred, average='weighted')
    train_precision, train_recall, train_f1score, train_support = precision_recall_fscore_support(y_train, y_train_pred, average='weighted')
    results = {'Precision':[test_precision, train_precision], 'Recall':[test_recall, train_recall], 'F1_score': [test_f1score, train_f1score]}
    results = pd.DataFrame(results, index=['Test','Train'])
    
    return results

#### Majority vote

We did still did not implemant a majority vote on the result given by our level 0 classifiers. Maybe it's time. However, one should keep in mind that ensemble learning and stacking may behave better with **a lot more level 0 classifiers** than currently. Indeed, to take a majority vote, it may seem better to have a lot of different outputs from level 0 learners. Anyway, we'll keep it short with our 3 classifiers.

In [24]:
def majorityVote(first_models, X_train):
    """
    Input : list of learners, np.array (N,d)
    Output : np.array (N,)
    The function takes the level 0 trained learners, and the training observations. It returns the array containing the majority
    vote coming from the level-0 classifiers.
    """
    
    X_stacked = stacked_dataset(first_models, X_train)
    res = np.array(X_stacked.shape[0]*[0])
    
    for i in range(X_stacked.shape[0]):
        line = X_stacked[i,:]
        if (line == 1).sum() > (line == 0).sum():
            res[i] = 1
    
    return res

## Stacking : logistic regression
The level 1 classifier is a logistic regression

In [25]:
# Let's import a bunch of the level-0 classifiers
level0_models = []
level0_nn = []

dir_name = 'zero_level_models'
file_list = os.listdir(dir_name)

for f in file_list: 
    if 'neural_net_model1_' in f:
        with open(dir_name+'/'+f,'rb') as file:
            try:
                temp = load_model(file)
                level0_nn.append(temp)
            except:
                continue
    elif 'model1_' in f:
        with open(dir_name+'/'+f,'rb') as file:
            pickler = pk.Unpickler(file)
            try:
                temp = pickler.load()
                level0_models.append(temp)
            except:
                continue

In [26]:
# Loading the datasets. Since the models were trained on the first part of the dataset, we must choose different datasets this
# time, for instance the second subsets
X_train = pd.read_csv('processed_datasets/observations2.csv', index_col=0)
y_train = pd.read_csv('processed_datasets/labels2.csv', index_col=0)

In [27]:
X_test = pd.read_csv('processed_datasets/observations3.csv', index_col=0)
y_test = pd.read_csv('processed_datasets/labels3.csv', index_col=0)

In [11]:
top_model1 = AdaBoostClassifier();
#sc = StandardScaler()

In [12]:
stack1 = trainStack(level0_models, top_model1, X_train.iloc[:5000,:], y_train.iloc[:5000,:])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.2s finished
  y = column_or_1d(y, warn=True)


In [13]:
y_predicted = predictStack(level0_models, stack1, X_test.iloc[:5000,:])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.2s finished


In [14]:
# Displaying the results
y_train_pred = predictStack(level0_models, stack1, X_train.iloc[:5000,:])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.3s finished


In [15]:
res = displayPerformances(y_test.iloc[:5000,:], y_predicted, y_train.iloc[:5000,:],y_train_pred)
res

Unnamed: 0,Precision,Recall,F1_score
Test,0.762531,0.76,0.752974
Train,0.758229,0.7596,0.754518


## Stacking : neural network

In [31]:
# let's retrieve the dimension of the input layer
n = len(X_train.columns)
# let's retrieve the dimension of the output layer
m = 1
# Neural network
top_model2 = Sequential()
top_model2.add(Dropout(0.2, input_shape = (n,)))
top_model2.add(Dense(1000, activation='relu'))
top_model2.add(Dense(1000, activation='relu'))
top_model2.add(Dense(1000, activation='relu'))
top_model2.add(Dense(500, activation='relu'))
top_model2.add(Dense(2, activation='softmax'))
top_model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  

In [33]:
stack2 = trainStack(level0_models, top_model2, X_train.iloc[:4000,:], y_train.iloc[:4000,:], is_MLP = True);

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.2s finished


ValueError: Error when checking input: expected dropout_1_input to have shape (106,) but got array with shape (16,)

In [None]:
y_predicted2 = predictStack(level0_models, stack2, X_test.iloc[:4000,:])
y_pred_flat2 = probabilitiesToClass(y_predicted2);

In [None]:
y_train_pred2 = predictStack(level0_models, stack2, X_train.iloc[:4000,:])
y_train_pred_flat2 = probabilitiesToClass(y_train_pred2);        

In [None]:
res = displayPerformances(y_test.iloc[:4000,:], y_pred_flat2, y_train.iloc[:4000,:],y_train_pred_flat2)
res