#### Stacking multiple models

In [1]:
# -*- coding: utf-8 -*-

import random
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn import metrics
import lightgbm as lgb
from sklearn import preprocessing
import pickle
import gc

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# Seed both generators: scikit-learn estimators created without an explicit
# `random_state` draw from NumPy's global RNG, which `random.seed` alone does
# not touch.
random.seed(22)
np.random.seed(22)

## set the directory
# The project root can be overridden with the SANTANDER_PROJECT_DIR environment
# variable; the original machine-specific location remains the default.
os.chdir(os.environ.get(
    'SANTANDER_PROJECT_DIR',
    r'C:\Users\User\Documents\Data_Science_Projects\santander-customer-prediction'))

# load the cleanData
# Paths are built with os.path.join so they resolve on any OS, not only Windows.
data = pd.read_csv(os.path.join('data', 'trainFinal.csv'))
testdata = pd.read_csv(os.path.join('data', 'testFinal.csv'))


In [2]:
# Show the column labels of the training frame loaded from trainFinal.csv.
data.columns

Index(['target', 'var_0', 'var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_6',
       'var_7', 'var_8',
       ...
       'var_198_quantile_20', 'var_199_quantile_20', 'sum_rows', 'median_rows',
       'mean_rows', 'min_rows', 'max_rows', 'std_rows', 'skew_rows',
       'kurt_rows'],
      dtype='object', length=817)

In [3]:
# Read the feature table; column '0' is compared against zero and column '1'
# supplies the column names to keep (presumably importance score and feature
# name respectively - confirm against features.csv).
features = pd.read_csv(r'.\data\features.csv')
feature_imp = features.loc[features['0'] > 0]
feature_selected = feature_imp['1'].to_numpy()

# The test frame keeps only the selected columns ...
testdata = testdata.loc[:, feature_selected]

# ... while the training frame keeps the same columns plus the 'target' column.
feature_selected = np.concatenate([feature_selected, ['target']])
data = data.loc[:, feature_selected]


In [4]:
def preprocess_data(data, label_col='target', test_size=0.4, random_state=0):
    """Split a labelled frame into train / hold-out features and labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns and the label column.
    label_col : str, default 'target'
        Name of the label column; it is renamed to 'labels' before splitting.
    test_size : float, default 0.4
        Fraction of rows assigned to the hold-out part.
    random_state : int, default 0
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, x_test, Y_train, y_test)`` as produced by
        ``train_test_split``.

    Notes
    -----
    No scaling is performed here; the features are passed through unchanged.
    """
    data = data.rename(columns={label_col: 'labels'})

    # separate the labels/target variable
    dataX = data.drop(['labels'], axis=1)
    dataY = data['labels']

    # Create train and test dataset
    # NOTE(review): the split is not stratified; consider `stratify=dataY` if
    # the labels are imbalanced - TODO confirm.
    X_train, x_test, Y_train, y_test = train_test_split(
        dataX, dataY, test_size=test_size, random_state=random_state)

    return X_train, x_test, Y_train, y_test

# Split the feature-selected training frame into train and hold-out parts.
X_train, x_test, Y_train, y_test = preprocess_data(data)

In [9]:
def train_ensemble(X_train, Y_train, x_test, y_test, n_estimators=500, n_jobs=-1, learning_rate=0.01):
    """Fit every base classifier, score it on the hold-out set and pickle it.

    Parameters
    ----------
    X_train, Y_train
        Training features and labels passed to each estimator's ``fit``.
    x_test, y_test
        Hold-out features and labels used for the predictions and metrics.
    n_estimators : int, default 500
        Number of estimators for the GradientBoosting, AdaBoost, ExtraTrees
        and RandomForest models.
    n_jobs : int, default -1
        Parallelism forwarded to every estimator that accepts ``n_jobs``.
    learning_rate : float, default 0.01
        Learning rate for the GradientBoosting and AdaBoost models; the
        LightGBM model keeps its own fixed value of 0.1.

    Returns
    -------
    predicted : pandas.DataFrame
        For each model a ``probapreds_<name>`` column (second column of
        ``predict_proba``) followed by a ``preds_<name>`` column (that
        probability thresholded at 0.5, stored as 0.0 / 1.0), and finally a
        ``y_test`` column with the true labels. Rows are positional, in the
        order of ``x_test``.
    measures : pandas.DataFrame
        One row per model with the columns ``auc``, ``accuracy``,
        ``precision`` and ``recall``.

    Side effects
    ------------
    Each fitted model is pickled into the relative ``models`` directory as
    ``model<name>.sav``.
    """
    # NOTE(review): several LightGBM keyword arguments below look like aliases
    # of one another (e.g. `reg_alpha` / `lambda_l1`); values are kept exactly
    # as they were - confirm which ones are intended to win.
    lgbm = lgb.LGBMClassifier(boosting_type='gbdt',
                              objective='binary',
                              metric='auc',
                              verbose=1,
                              sub_feature=0.55,
                              seed=123,
                              save_binary=True,
                              reg_alpha=0.1,
                              num_leaves=6,
                              num_boost_round=700,
                              min_data=25,
                              max_depth=12,
                              max_bin=63,
                              learning_rate=0.1,
                              lambda_l2=0.4,
                              lambda_l1=0.7,
                              is_unbalance=True,
                              feature_fraction_seed=1234,
                              drop_seed=1234,
                              data_random_seed=1234,
                              boost_from_average=False,
                              bagging_seed=1234,
                              bagging_freq=5,
                              bagging_fraction=0.55,
                              n_jobs=n_jobs)  # honour the caller's n_jobs (default is still -1)

    models = {
        'gbm': GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate),
        'ada': AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate),
        'etrees': ExtraTreesClassifier(n_estimators=n_estimators, n_jobs=n_jobs),
        'bernoulli': BernoulliNB(),
        'rf': RandomForestClassifier(n_estimators=n_estimators, n_jobs=n_jobs),
        'logit': LogisticRegression(n_jobs=n_jobs),
        'lda': LinearDiscriminantAnalysis(),
        'knn21': KNeighborsClassifier(n_neighbors=21, n_jobs=n_jobs),
        'knn31': KNeighborsClassifier(n_neighbors=31, n_jobs=n_jobs),
        'knn41': KNeighborsClassifier(n_neighbors=41, n_jobs=n_jobs),
        'knn51': KNeighborsClassifier(n_neighbors=51, n_jobs=n_jobs),
        'lgbm': lgbm,
    }

    columns = ['auc', 'accuracy', 'precision', 'recall']
    # Work on a plain array so the shuffled index of y_test never causes
    # label misalignment against the positional predictions.
    y_true = np.asarray(y_test)
    predicted = pd.DataFrame(index=pd.RangeIndex(len(y_true)))
    metric_rows = []

    for key, value in models.items():
        value.fit(X_train, Y_train)

        proba = np.asarray(value.predict_proba(x_test))[:, 1]
        labels = (proba >= .5).astype(float)
        predicted['probapreds_' + key] = proba
        predicted['preds_' + key] = labels

        metric_rows.append({
            # AUC is a ranking metric: score it on the probabilities, not on
            # the thresholded labels.
            'auc': metrics.roc_auc_score(y_true, proba),
            'accuracy': metrics.accuracy_score(y_true, labels),
            'precision': metrics.precision_score(y_true, labels),
            'recall': metrics.recall_score(y_true, labels),
        })

        # Context manager guarantees the file handle is closed after dumping.
        model_name = key + '.sav'
        with open(r'.\models\model' + model_name, 'wb') as model_file:
            pickle.dump(value, model_file)

    predicted['y_test'] = y_true

    # Build the table in one go, one row per model; assigning through
    # `measures[key, metric]` would have created tuple-named columns instead
    # of filling the cells.
    measures = pd.DataFrame(metric_rows, index=list(models.keys()), columns=columns)

    return predicted, measures


In [None]:
# Fit all base models and collect their hold-out predictions and metrics.
predicted, measures = train_ensemble(X_train, Y_train, x_test, y_test)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
# Second-level training frame: every base model's probability column plus the
# true labels. Columns are picked by their 'probapreds_' prefix because
# train_ensemble names the KNN outputs 'probapreds_knn21' ... 'probapreds_knn51';
# a hard-coded 'probapreds_knn' column does not exist and would raise a KeyError.
proba_columns = [col for col in predicted.columns if str(col).startswith('probapreds_')]
preds_data = predicted[proba_columns + ['y_test']]

def preprocess_data(data, label_col='y_test', test_size=0.3, random_state=0):
    """Split the stacked-prediction frame into train / hold-out parts.

    This redefines the ``preprocess_data`` from the earlier cell: from here on
    the name refers to this version, whose defaults are the 'y_test' label
    column and a 30% hold-out.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns and the label column.
    label_col : str, default 'y_test'
        Name of the label column; it is renamed to 'labels' before splitting.
    test_size : float, default 0.3
        Fraction of rows assigned to the hold-out part.
    random_state : int, default 0
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, x_test, Y_train, y_test)`` as produced by
        ``train_test_split``.

    Notes
    -----
    No scaling is performed here; the features are passed through unchanged.
    """
    data = data.rename(columns={label_col: 'labels'})

    # separate the labels/target variable
    dataX = data.drop(['labels'], axis=1)
    dataY = data['labels']

    # Create train and test dataset
    X_train, x_test, Y_train, y_test = train_test_split(
        dataX, dataY, test_size=test_size, random_state=random_state)

    return X_train, x_test, Y_train, y_test


In [None]:
# Split the base-model prediction frame into train and hold-out parts for the second-level model.
pX_train, px_test, pY_train, py_test = preprocess_data(preds_data)

In [None]:
## train lgb classifier

def train_model(X_train, Y_train):
    """Build the LightGBM classifier with the fixed settings below and fit it.

    Parameters
    ----------
    X_train, Y_train
        Features and labels handed to ``LGBMClassifier.fit``.

    Returns
    -------
    The value returned by ``fit`` (the fitted classifier).
    """
    # Keyword order is kept as-is so the estimator receives the exact same
    # arguments as before.
    lgbm_settings = dict(
        boosting_type='gbdt',
        objective='binary',
        metric='auc',
        verbose=1,
        sub_feature=0.55,
        seed=123,
        save_binary=True,
        reg_alpha=0.1,
        num_leaves=6,
        num_boost_round=700,
        min_data=25,
        max_depth=12,
        max_bin=63,
        learning_rate=0.1,
        lambda_l2=0.4,
        lambda_l1=0.7,
        is_unbalance=True,
        feature_fraction_seed=1234,
        drop_seed=1234,
        data_random_seed=1234,
        boost_from_average=False,
        bagging_seed=1234,
        bagging_freq=5,
        bagging_fraction=0.55,
    )
    classifier = lgb.LGBMClassifier(**lgbm_settings)
    return classifier.fit(X_train, Y_train)

def predict_data(X, Y, data_type, estimator=None):
    """Predict labels for ``X``, print evaluation metrics and return the labels.

    Parameters
    ----------
    X
        Feature matrix to score.
    Y
        True labels for ``X`` (same row order).
    data_type : str
        Tag used in the printed report (e.g. 'TRAIN' or 'TEST').
    estimator : optional
        Fitted classifier to use. When omitted, the notebook-level ``model``
        variable is used, so existing three-argument calls keep working.

    Returns
    -------
    numpy.ndarray
        Integer 0/1 predictions: the score thresholded at 0.5.
    """
    if estimator is None:
        # Fall back to the notebook-level fitted model.
        estimator = model

    # Score with probabilities when the estimator offers them; hard labels
    # would collapse the ROC curve to a single operating point.
    if hasattr(estimator, 'predict_proba'):
        scores = np.asarray(estimator.predict_proba(X))[:, 1]
    else:
        scores = np.asarray(estimator.predict(X), dtype=float)

    # setting threshold to .5 (vectorised instead of a per-row Python loop)
    y_pred = (scores >= .5).astype(int)
    # Plain array: avoids index alignment between Y's index and the
    # positional predictions.
    y_true = np.asarray(Y)

    # str() works for both NumPy scalars and plain Python floats, whereas
    # `.astype(str)` exists only on NumPy scalars.
    accuracy = metrics.accuracy_score(y_true, y_pred)
    print('\n\n\nThe following are metrices for ', data_type, ' data')
    print('\nACCURACY is ' + str(accuracy))
    preds = pd.DataFrame({'true': y_true, 'predicted': y_pred})
    confusion = pd.crosstab(preds['predicted'], preds['true'])
    print('\n CONFUSION MATRIX:\n', confusion)
    precision = metrics.precision_score(y_true, y_pred)
    print('\n', data_type, 'DATA PRECISION ' + str(precision))
    recall = metrics.recall_score(y_true, y_pred)
    print('\n', data_type, 'DATA RECALL ' + str(recall))
    auc = metrics.roc_auc_score(y_true, scores)
    print('\n', data_type, 'DATA AUC ' + str(auc))
    return y_pred

# Fit the second-level model on the base-model predictions, then print metrics
# for the split it was trained on and for the hold-out split.
# `model` is kept at notebook level because predict_data reads it as a global.
model = train_model(pX_train, pY_train)
preds_train = predict_data(pX_train, pY_train, data_type='TRAIN')
preds_test = predict_data(px_test, py_test, data_type='TEST')

In [None]:
# Persist the stacked model; the context manager closes the file handle even
# if pickling fails part-way.
with open(r'.\models\lightGBMmodelSTACKED.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Preview only the first rows rather than rendering the whole prediction frame.
predicted.head()