# MIT 6.867 Final Project: Ensembling Algorithms
Irina Degtiar

Sources for functions: https://www.kaggle.com/yekenot/simple-stacker-lb-0-284/code

Consider implementing: log odds aggregation: https://www.kaggle.com/aharless/xgboost-k-fold-with-log-odds-aggregation

In [1]:
##########################################################################################################
### Set up workspace
##########################################################################################################
# Ensure re-load of all code
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Import libraries - general
import numpy as np
import pylab as pl
import pandas as pd
import random

# Import libraries - classification
from sklearn import model_selection
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from lightgbm import LGBMClassifier
#from xgboost import XGBClassifier

from mlxtend.classifier import StackingClassifier

import Helper_files.Gini_coefficient
from Helper_files.Gini_coefficient import gini_sklearn

In [2]:
##########################################################################################################
### Load data
##########################################################################################################
training_scaled = pd.read_pickle('../Data/2_Cleaned/training_scaled.pickle')
validation_scaled = pd.read_pickle('../Data/2_Cleaned/training_scaled.pickle')
test_scaled = pd.read_pickle('../Data/2_Cleaned/training_scaled.pickle')



In [3]:
##########################################################################################################
### Prepare data
##########################################################################################################
X_train = training_scaled.drop(['id', 'target'], 1)
y_train = training_scaled['target']

X_val = validation_scaled.drop(['id', 'target'], 1)
y_val = validation_scaled['target']

X_test = test_scaled.drop(['id'], 1)

In [4]:
##########################################################################################################
### Define function - Ensemble running prediction models
##########################################################################################################
class Ensemble(object):
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        X = np.array(X) # Train features
        y = np.array(y) # Train target values
        T = np.array(T) # Test features

        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=123).split(X, y))

        # Stack predicted probabilities
        S_train = np.zeros((X.shape[0], len(self.base_models))) # n_train*#models
        S_test = np.zeros((T.shape[0], len(self.base_models)))  # n_test*#models
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits)) # n_test*#folds

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
#                y_holdout = y[test_idx]

                print ("Fit %s fold %d" % (str(clf).split('(')[0], j+1))
                clf.fit(X_train, y_train)
#                cross_score = cross_val_score(clf, X_train, y_train, cv=3, scoring=gini_sklearn)
#                print("    cross_score: %.5f" % (cross_score.mean()))
                y_pred = clf.predict_proba(X_holdout)[:,1]                

                S_train[test_idx, i] = y_pred # Add holdout predictions to appropriate index, dim: n_train*#models
                S_test_i[:, j] = clf.predict_proba(T)[:,1] # Predict for training dataset using that fold
            S_test[:, i] = S_test_i.mean(axis=1) # Average across folds to get test prediction, dim: n_test*#models

        results = cross_val_score(self.stacker, S_train, y, cv=3, scoring=gini_sklearn) # dim: #models
        print("Stacker score: %.5f" % (results.mean()))

        # Fit ensemble model on stacked training data
        self.stacker.fit(S_train, y) # Fit ensemble model: features correspond to holdout predictions from each algorithm in ensemble
        res = self.stacker.predict_proba(S_test)[:,1] # Use model to predict targets for test dataset: features correspond to average test prediction across folds from each algorithm in ensemble
        return res

In [5]:
##########################################################################################################
### Define function - Ensemble using prediction outputs
##########################################################################################################
# Inputs:
#   - stacker: algorithm used to create ensemble prediction
#   - S_train: list or array of holdout training predictions from each algorithm in ensemble, dim: n_train*#models
#   - S_train: list or array of average test prediction across folds from each algorithm in ensemble, dim: n_test*#models
#   - CV: number of cross validations for computing stacker cross_val_score. Default is 3. 

def Ensemble_from_predictions(stacker, S_train, y_train, S_test, CV=3):
    # Clean inputs
    S_train = np.array(S_train) # Train predictions
    S_test = np.array(S_test) # Test predictions

    # Calculate training gini coefficient
    results = cross_val_score(stacker, S_train, y_train, cv=CV, scoring=gini_sklearn) # dim: #models
    print("Stacker score: %.5f" % (results.mean()))

    # Fit ensemble model on stacked training data
    stacker.fit(S_train, y) # Fit ensemble model: features correspond to holdout predictions from each algorithm in ensemble
    res = stacker.predict_proba(S_test)[:,1] # Use model to predict targets for test dataset: features correspond to average test prediction across folds from each algorithm in ensemble
    return res

In [None]:
##########################################################################################################
### Set parameters for each algorithm; create algorithms
##########################################################################################################  
seed = 123

# LightGBM params
lgb_params = {}
lgb_params['learning_rate'] = 0.02
lgb_params['n_estimators'] = 650
lgb_params['max_bin'] = 10
lgb_params['subsample'] = 0.8
lgb_params['subsample_freq'] = 10
lgb_params['colsample_bytree'] = 0.8   
lgb_params['min_child_samples'] = 500
lgb_params['random_state'] = seed


lgb_params2 = {}
lgb_params2['learning_rate'] = 0.02
lgb_params2['n_estimators'] = 1090
lgb_params2['colsample_bytree'] = 0.3   
lgb_params2['subsample'] = 0.7
lgb_params2['subsample_freq'] = 2
lgb_params2['num_leaves'] = 16
lgb_params2['random_state'] = seed


lgb_params3 = {}
lgb_params3['learning_rate'] = 0.02
lgb_params3['n_estimators'] = 1100
lgb_params3['max_depth'] = 4
lgb_params3['random_state'] = seed


# RandomForest params
#rf_params = {}
#rf_params['n_estimators'] = 200
#rf_params['max_depth'] = 6
#rf_params['min_samples_split'] = 70
#rf_params['min_samples_leaf'] = 30


# ExtraTrees params
#et_params = {}
#et_params['n_estimators'] = 155
#et_params['max_features'] = 0.3
#et_params['max_depth'] = 6
#et_params['min_samples_split'] = 40
#et_params['min_samples_leaf'] = 18


# XGBoost params
#xgb_params = {}
#xgb_params['objective'] = 'binary:logistic'
#xgb_params['learning_rate'] = 0.04
#xgb_params['n_estimators'] = 490
#xgb_params['max_depth'] = 4
#xgb_params['subsample'] = 0.9
#xgb_params['colsample_bytree'] = 0.9  
#xgb_params['min_child_weight'] = 10


# Regularized Greedy Forest params
#rgf_params = {}
#rgf_params['max_leaf'] = 2000
#rgf_params['learning_rate'] = 0.5
#rgf_params['algorithm'] = "RGF_Sib"
#rgf_params['test_interval'] = 100
#rgf_params['min_samples_leaf'] = 3 
#rgf_params['reg_depth'] = 1.0
#rgf_params['l2'] = 0.5  
#rgf_params['sl2'] = 0.005



lgb_model = LGBMClassifier(**lgb_params)
lgb_model2 = LGBMClassifier(**lgb_params2)
lgb_model3 = LGBMClassifier(**lgb_params3)

#rf_model = RandomForestClassifier(**rf_params)
#et_model = ExtraTreesClassifier(**et_params)      
#xgb_model = XGBClassifier(**xgb_params)
#rgf_model = RGFClassifier(**rgf_params) 
#gb_model = GradientBoostingClassifier(max_depth=5)
#ada_model = AdaBoostClassifier()

rf_model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
    max_depth=9, max_features='auto', max_leaf_nodes=None,
    #min_impurity_decrease=0, min_impurity_split=None,
    min_samples_leaf=1, min_samples_split=2,
    min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
    oob_score=False, random_state=seed, verbose=0, warm_start=False)
nb_model = GaussianNB()

log_model = LogisticRegression()

In [None]:
##########################################################################################################
### ALT 1: Run stacked ensemble - inputs: models
##########################################################################################################
stack = Ensemble(n_splits=5,
        stacker = log_model,
        base_models = (lgb_model, lgb_model2, lgb_model3, rf_model, nb_model))        
        
y_pred1 = stack.fit_predict(X_train, y_train, X_val)  

Helper_files.Gini_coefficient.gini_normalizedc(y_val, y_pred1)
# Using lgb1-3, training 3-fold-cross-validation standardized gini coeff: 0.25624, validation: 0.65332241539935254

Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 4


In [None]:
##########################################################################################################
### ALT 2: Run models to obtain predictions (will eventually be done in other files)
##########################################################################################################
# Input parameters
base_models = (lgb_model, lgb_model2, lgb_model3, rf_model, nb_model)
n_splits=3

X = np.array(X_train) # Train features
y = np.array(y_train) # Train target values
T = np.array(X_val) # Test features

folds = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(X, y))

# Stack predicted probabilities
S_train = np.zeros((X.shape[0], len(base_models))) # n_train*#models
S_test = np.zeros((T.shape[0], len(base_models)))  # n_test*#models
for i, clf in enumerate(base_models):

    S_test_i = np.zeros((T.shape[0], n_splits)) # n_test*#folds

    for j, (train_idx, test_idx) in enumerate(folds):
        X_train_j = X[train_idx]
        y_train_j = y[train_idx]
        X_holdout_j = X[test_idx]

        print ("Fit %s fold %d" % (str(clf).split('(')[0], j+1))
        clf.fit(X_train_j, y_train_j)
        #  cross_score = cross_val_score(clf, X_train_j, y_train_j, cv=3, scoring=gini_sklearn)
        #  print("    cross_score: %.5f" % (cross_score.mean()))
        y_pred = clf.predict_proba(X_holdout_j)[:,1]                

        S_train[test_idx, i] = y_pred # Add holdout predictions to appropriate index, dim: n_train*#models
        S_test_i[:, j] = clf.predict_proba(T)[:,1] # Predict for training dataset using that fold
    S_test[:, i] = S_test_i.mean(axis=1) # Average across folds to get test prediction, dim: n_test*#models
print("DONE")

In [None]:
##########################################################################################################
### ALT 2: Run stacked ensemble - inputs: predictions
##########################################################################################################
stacker=log_model
#y_pred2 = Ensemble_from_predictions(stacker, S_train, y, S_test, CV=3) #Works but assumes data already aggregated  

# For separate inputs
S_train1 = S_train[:,0]
S_train2 = S_train[:,1]
S_trainmerge = np.array([S_train1,S_train2]).T
S_test1 = S_test[:,0]
S_test2 = S_test[:,1]
S_testmerge = np.array([S_test1,S_test2]).T

y_pred2 = Ensemble_from_predictions(stacker=log_model, S_train=S_trainmerge, y_train=y_train, \
                                    S_test=S_testmerge, CV=5)
Helper_files.Gini_coefficient.gini_normalizedc(y_val, y_pred2)

In [None]:
##########################################################################################################
### Save predictions for submission
##########################################################################################################
# submission = pd.DataFrame()
# submission['id'] = test_scaled['id']
# submission['target'] = y_pred
# submission.to_csv('submission.csv', index=False)