In [267]:
## importing the relevant packages:

# clear the workspace
%reset -f

# print list of files in directory
import os
print(os.listdir())

# print/display all plots inline
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# the base packages
import collections # for the Counter function
import csv # for reading/writing csv files
import pandas as pd, numpy as np, time, gc, bisect

# the various packages/modules used across processing (sklearn), modelling (lightgbm) and bayesian optimization (hyperopt, bayes_opt)
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics, preprocessing
from sklearn.cross_validation import cross_val_score, StratifiedKFold, StratifiedShuffleSplit
from sklearn.base import TransformerMixin
from bayes_opt import BayesianOptimization
from tqdm import tqdm
from hyperopt import hp, tpe, STATUS_OK, fmin, Trials
from hyperopt.fmin import fmin
#from hyperopt.pyll.stochastic import sample

# modelling algorithms
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier

# Evaluation of the model
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from timeit import default_timer as timer

# Exporting packages for SHAP/LIME
import shap
import lime
import lime.lime_tabular

# define the global variables used later
MAX_EVALS = 20 # number of iterations/parameter sets created towards tuning
N_FOLDS = 5 # number of cv folds
randomseed = 1 # the value for the random state used at various points in the pipeline

['.ipynb_checkpoints', 'ABI_LGBM_LIME.ipynb', 'ABI_LGB_binary_classification.ipynb', 'ABI_RF_LIME.ipynb', 'ABI_RF_SHAP.ipynb', 'ABI_XGB_LIME.ipynb', 'ABI_XGB_SHAP.ipynb', 'agaricus-lepiota.data', 'airlines.csv', 'airports.csv', 'example lightgbm.ipynb', 'flights.csv', 'flights_sample.csv', 'gbm_trials.csv', 'hyperparameter-optimization-master', 'MODEL INTERPRETER.ipynb', 'MODEL_SELECTION_TUNING.ipynb', 'MODEL_SELECTION_TUNING_TEST.ipynb', 'testing_ad.csv', 'testing_ad_labels.csv', 'training_ad.csv']


In [268]:
#### MAIN CLASSES ####
class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.
        """
    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].value_counts().index[0] if X[c].dtype == np.dtype('O') else X[c].mean() for c in X], 
                              index=X.columns)
#         self.fill = pd.Series(['No Data' if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
#             index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)
    
    def num_missing(self):
        return sum(self.isnull())
    
    def imputer_mean(self, column):
        x = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
        return x.fit_transform(self[[column]]).ravel()
    
    def imputer_median(self, column):
        x = Imputer(missing_values = 'NaN', strategy = 'median', axis = 0)
        return x.fit_transform(self[[column]]).ravel()
    
    def imputer_mode(self, column):
        x = Imputer(missing_values = 'NaN', strategy = 'most_frequent', axis = 0)
        return x.fit_transform(self[[column]]).ravel()

class prepare_data():
    
    def __init__(self):
        """ To prepare data,
                1. read in data
                2. pre-processing/cleaning
                3. creating helper objects for later steps
                4. processing for modelling
        """
    
    def labelEncoder(train_df, test_df, cat_columns):
        categorical_names = {}
        for feature in tqdm(cat_columns):
            le = preprocessing.LabelEncoder()
            le.fit(train_df[feature].astype(str))
            train_df[feature] = le.transform(train_df[feature].astype(str))
            test_df[feature] = test_df[feature].map(lambda i: 'No Data' if i not in le.classes_ else i)
            le_classes = le.classes_.tolist()
            bisect.insort_left(le_classes, 'No Data')
            le.classes_ = le_classes
            test_df[feature] = le.transform(test_df[feature].astype(str))
            categorical_names[feature] = le.classes_
        return train_df, test_df, categorical_names
    
    ## function to get frequency count of elements in a vector/list
    def freq_count(input_vector):
        return collections.Counter(input_vector)
    
    def categ_feats(train_df, test_df):
        x = list(train_df.dtypes)
        x_1 = [1 if x == 'O' else 0 for x in x]
        categorical_idx = [i for i, x in enumerate(x_1) if x == 1]

        # Get feature names and their values for categorical data (needed for LIME)
        cat_columns = train_df.select_dtypes(include=['object']).columns.values
        train, test, categorical_names = prepare_data.labelEncoder(train_df, test_df, cat_columns)

        return train_df, test_df, categorical_names, categorical_idx

    def create(input_file_path, input_file_path_2, response, cols_to_remove = ['id'], random_seed = 1234):
        train = pd.read_csv(input_file_path, na_values=['No Data', ' '])
        test = pd.read_csv(input_file_path_2, na_values=['No Data', ' '])
        
        train = pd.DataFrame(train)
        test = pd.DataFrame(test)
        
        for col in cols_to_remove:
            train.drop([col], axis = 1, inplace = True)
        test = pd.DataFrame(data = test[train.columns])
        
        print(train.shape, '\n')
        train.dropna(thresh=0.6*(train.shape[0]), axis=1, inplace = True)
        train.dropna(thresh=0.5*(train.shape[1]), axis=0, inplace = True)
        print(train.shape, '\n')
        test = test[train.columns]
        
        #print(train.apply(DataFrameImputer.num_missing, axis=0), '\n')
        imputer_object = DataFrameImputer()
        imputer_object.fit(train)
        train = imputer_object.transform(train)
        test = imputer_object.transform(test)
        
        print(prepare_data.freq_count(train[response]), '\n')

        y_train = train[response].values
        X_train = train.drop([response], axis = 1)
        y_test = test[response].values
        X_test = test.drop([response], axis = 1)
        
        X_train, X_test, categ_names, categ_idx = prepare_data.categ_feats(X_train, X_test)

        ##  segment for usage if doing the train/test split ##
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_seed)
        #feature_names_train = list(X_train.columns.values)
        #num_feature = X_train.shape[1]
        #X_train = pd.DataFrame(data=X_train, columns=feature_names_train)
        #X_test = pd.DataFrame(data=X_test, columns=feature_names_train)
        #return X_train, X_test, y_train, y_test, feature_names_train, categ_names, categ_idx, num_feature
        
        X_train = pd.DataFrame(data=X_train, columns=X_train.columns.values)
        X_test = pd.DataFrame(data=X_test, columns=X_test.columns.values)
        
        return X_train, X_test, y_train, y_test

In [269]:
X_train, X_test, y_train, y_test = prepare_data.create(input_file_path='training_ad.csv', input_file_path_2='testing_ad.csv',
                                            response = 'label',
                                cols_to_remove = ['global id', 'ethnicity', 'original hire date'])

(7080, 57) 

(6778, 42) 

Counter({0: 5890, 1: 888}) 



100%|██████████████████████████████████████████████████████████████████████████████████| 24/24 [00:01<00:00, 20.67it/s]


In [270]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(collections.Counter(y_train))
print(collections.Counter(y_test))

(6778, 41)
(5788, 41)
(6778,)
(5788,)
Counter({0: 5890, 1: 888})
Counter({0: 5624, 1: 164})


In [271]:
from imblearn.combine import SMOTETomek, SMOTEENN

# Apply SMOTE + Tomek links
# sm = SMOTETomek(random_state=0)
feat_names = X_train.columns.values
# X_train, y_train = sm.fit_sample(X_train, y_train)
# X_train = pd.DataFrame(data=X_train, columns=feat_names)

smote_enn = SMOTEENN(random_state=0)
X_train, y_train = smote_enn.fit_sample(X_train, y_train)
X_train = pd.DataFrame(data=X_train,columns=feat_names)

In [272]:
## xgboost class for tuning

class xgboost_model():
    
    def __init__():
        """ this class initializes some functions used in the xgboost pipeline """
    
    def xgb_score(params):
        global ITERATION
        ITERATION += 1
        
        randomseed = 1
        
        #params['max_depth'] = int(float(params['max_depth']))
        # Make sure parameters that need to be integers are integers
        for parameter_name in ['max_depth']:
            params[parameter_name] = int(params[parameter_name])
        
        clf = xgb.XGBClassifier(n_estimators = 1000,
                                base_score = params['base_score'],
                                colsample_bytree = params['colsample_bytree'],
                               learning_rate = params['learning_rate'],
                                max_depth = params['max_depth'],
                                min_child_weight = params['min_child_weight'],
                                subsample = params['subsample'],
                               gamma = params['gamma'],
                               reg_lambda = params['reg_lambda'], 
                                scale_pos_weight = 3, silent = False, seed = randomseed)
        X = pd.DataFrame(X_train).values
        Xcv = X_test.values
        eval_set  = [(X, y_train), (Xcv, y_test)]

        clf.fit(X, y_train,
                eval_set = eval_set, eval_metric = 'auc',
                early_stopping_rounds = 20, verbose = False)
        num_rounds = clf.best_iteration
        bst_score = clf.best_score

        pred = clf.predict_proba(Xcv)[:, 1]
        predict = np.where(pred > params['base_score'], 1, 0)
        auc_score = roc_auc_score(y_test, pred)
        recall_score = sklearn.metrics.recall_score(y_pred=predict, y_true=y_test)
        #print('recall score is: ', recall_score)
        #print('The AUC for iteration ', ITERATION, ' is {:.4f}.'.format(auc_score), '\n')
        
        return {'loss': (1 - recall_score), 'status': STATUS_OK, 'params': params, 'num_boost': num_rounds, 'bst_score': bst_score}
    
    def optimize(X_train, y_train, X_test, y_test):
        # Keep track of evals
        global ITERATION
        ITERATION = 0
        
        global trials
        trials = Trials()
        
        space = {
            'base_score' : hp.quniform('base_score', 0.4, 0.6, 0.01),
             'learning_rate' : hp.quniform('learning_rate', 0.001, 0.2, 0.05),
             #'max_depth' : hp.choice('max_depth', np.arange(3, 8, dtype=int)),
            'max_depth' : hp.quniform('max_depth', 2, 8, 1),
             'min_child_weight' : hp.quniform('min_child_weight', 1, 4, 1),
             'subsample' : hp.quniform('subsample', 0.4, 0.8, 0.05),
             'gamma' : hp.quniform('gamma', 0.25, 1, 0.05),
            'reg_lambda' : hp.uniform ('reg_lambda', 0, 1),
             'colsample_bytree' : hp.quniform('colsample_bytree', 0.4, 0.8, 0.05)
        }
        
        best = fmin(xgboost_model.xgb_score, space, algo=tpe.suggest, trials=trials, max_evals=MAX_EVALS,
                    rstate=np.random.RandomState(randomseed))
        best = trials.best_trial['result']['params']
        num_rounds = trials.best_trial['result']['num_boost']
        
        return trials, best, num_rounds
    
    def xgb_train(best_params, num_rounds):
        model = xgb.XGBClassifier(silent = False, seed = randomseed, n_estimators=num_rounds+1)
        model.set_params(**best_params)
        model.fit(X_train, y_train, eval_metric = "auc", verbose = True)
        return model
    
    def xgb_predict(X_test, y_test, model, mode = "validate"):
        pred = model.predict_proba(X_test)[:, 1]
        predict = np.where(pred > 0.45, 1, 0)
        
        if mode == "validate":
            recall_score = sklearn.metrics.recall_score(y_pred=predict, y_true=y_test)
            print('recall score is: ', recall_score)
            auc_score = roc_auc_score(y_test, pred)
            print('accuracy score: ', sklearn.metrics.accuracy_score(y_true=y_test, y_pred=predict))
            print('The final AUC after taking the best params and num_rounds when it stopped is {:.4f}.'.format(auc_score), '\n')
            return pred, predict
        else:
            return pred

In [273]:
trials, best, num_rounds = xgboost_model.optimize(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
print(1 - trials.average_best_error(), '\n')

model = xgboost_model.xgb_train(best, num_rounds)

pred, predict = xgboost_model.xgb_predict(X_test=X_test, model=model, y_test=y_test, mode='validate')

0.7804878048780488 

recall score is:  0.524390243902439
accuracy score:  0.5763648928818245
The final AUC after taking the best params and num_rounds when it stopped is 0.5600. 



In [238]:
xgb.plot_importance(booster=model, max_num_features=15, show_values=False)

NameError: name 'model' is not defined

In [87]:
# lightgbm class for tuning

class lightgbm_model():
    
    def __init__():
        """ this class initializes some functions used in the lightgbm pipeline """
        
    def lgbm_score(params):        
        global ITERATION
        ITERATION += 1
        
        # Retrieve the subsample if present otherwise set to 1.0
        subsample = params['boosting_type'].get('subsample', 1.0)
        # Extract the boosting type
        params['boosting_type'] = params['boosting_type']['boosting_type']
        params['subsample'] = subsample

        # Make sure parameters that need to be integers are integers
        for parameter_name in ['num_leaves', 'subsample_for_bin', 'min_child_samples']:
            params[parameter_name] = int(params[parameter_name])
        
        start = timer()
        # Perform n_folds cross validation
        cv_results = lgb.cv(params, train_set, num_boost_round = 1000, nfold = N_FOLDS, 
                            early_stopping_rounds = 25, metrics = 'auc', seed = randomseed)
        run_time = timer() - start

        # Extract the best score
        best_score = np.max(cv_results['auc-mean'])
        # Loss must be minimized
        loss = 1 - best_score

        # Boosting rounds that returned the highest cv score
        n_estimators = int(np.argmax(cv_results['auc-mean']) + 1)

        # Dictionary with information for evaluation
        return {'loss': loss, 'params': params, 'iteration': ITERATION,
                'estimators': n_estimators, 
                'train_time': run_time, 'status': STATUS_OK}
    
    def optimize():
        # Keep track of evals
        global ITERATION
        ITERATION = 0
        
        global trials
        trials = Trials()
        
        space = {
            'boosting_type': hp.choice('boosting_type', [{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.6, 0.9)}, 
                                                         {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.6, 0.9)},
                                                         {'boosting_type': 'goss', 'subsample': 1.0}]),
            'num_leaves': hp.quniform('num_leaves', 30, 500, 1),
            'learning_rate': hp.loguniform('learning_rate', np.log(0.001), np.log(0.2)),
            'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 300000, 20000),
            'min_child_samples': hp.quniform('min_child_samples', 5, 10, 1),
            'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
            'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
            'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 0.8)
        }
        
        # Run optimization
        best = fmin(fn = lightgbm_model.lgbm_score, space = space, algo = tpe.suggest, 
            max_evals = MAX_EVALS, trials = trials, rstate = np.random.RandomState(randomseed))
        best = trials.best_trial['result']['params']
        nestimators = trials.best_trial['result']['estimators']
        return best, trials, nestimators
    
    def lgbm_train(best_params, nestimators):
        model = lgb.LGBMClassifier(silent = False, random_state = randomseed, objective = 'binary', n_estimators=nestimators)
        model.set_params(**best_params)
        model.fit(X_train, y_train, eval_metric = "auc")
        return model
    
    def lgbm_predict(X_test, y_test, model, mode = "validate"):
        pred = model.predict_proba(X_test)[:, 1]
        predict = np.where(pred > 0.5, 1, 0)
        
        if mode == "validate":
            auc_score = roc_auc_score(y_test, pred)
            recall_score = sklearn.metrics.recall_score(y_pred=predict, y_true=y_test)
            print('recall score is: ', recall_score)
            print('accuracy score: ', sklearn.metrics.accuracy_score(y_true=y_test, y_pred=predict))
            print('The final AUC after taking the best params and num_rounds when it stopped is {:.4f}.'.format(auc_score), '\n')
            return pred, predict
        else:
            return pred

In [89]:
# Create a lgb dataset
train_set = lgb.Dataset(X_train, label = y_train)

In [90]:
best, trials, nestimators = lightgbm_model.optimize()

print(1 - trials.average_best_error(), '\n')

model = lightgbm_model.lgbm_train(best, nestimators)

pred, predict = lightgbm_model.lgbm_predict(X_test=X_test, model=model, y_test=y_test, mode='validate')

0.9667677780712894 

recall score is:  0.024390243902439025
accuracy score:  0.9581893572909468
The final AUC after taking the best params and num_rounds when it stopped is 0.5708. 



In [67]:
# random forest class for tuning

class rf_model():
    
    def __init__():
        """ this class initializes some functions used in the random forest pipeline """
        
    def rf_score(params):        
        global ITERATION
        ITERATION += 1

        # Make sure parameters that need to be integers are integers
        for parameter_name in ['max_depth', 'n_estimators']:
            params[parameter_name] = int(params[parameter_name])
                
        rf_results = RandomForestClassifier(**params, random_state=randomseed)
        rf_results.fit(X_train, y_train)

        pred = rf_results.predict_proba(X_test)[:, 1]
        auc_score = roc_auc_score(y_test, pred)
        #print ('The AUC for iteration ', ITERATION, ' is {:.4f}.'.format(auc_score), '\n')
        return {'loss': (1 - auc_score), 'status': STATUS_OK, 'params': params, 'iteration': ITERATION}
    
    def optimize():
        # Keep track of evals
        global ITERATION
        ITERATION = 0
        
        global trials
        trials = Trials()
        space = {
            'max_depth' : hp.quniform('max_depth', 2, 8, 1),
            'max_features': hp.choice('max_features', range(1, (X_train.shape[:][1] - 1))),
            'criterion': hp.choice('criterion', ["gini", "entropy"]),
            'n_estimators': hp.choice('n_estimators', np.arange(10, 100))
        }
        
        # Run optimization
        best = fmin(fn = rf_model.rf_score, space = space, algo = tpe.suggest, 
            max_evals = MAX_EVALS, trials = trials, rstate = np.random.RandomState(randomseed))
        best = trials.best_trial['result']['params']
        return best, trials
    
    def rf_train(best_params):
        model = RandomForestClassifier(random_state = randomseed)
        model.set_params(**best_params)
        model.fit(X_train, y_train)
        return model
    
    def rf_predict(X_test, y_test, model, mode = "validate"):
        pred = model.predict_proba(X_test)[:, 1]
        
        if mode == "validate":
            auc_score = roc_auc_score(y_test, pred)
            print('The AUC is {:.4f}.'.format(auc_score), '\n')
        else:
            return pred

In [72]:
best, trials = rf_model.optimize()

print(1 - trials.average_best_error(), '\n')

model = rf_model.rf_train(best)

rf_model.rf_predict(X_test=X_test, model=model, y_test=y_test, mode='validate')

0.6713795310409031 

The AUC is 0.6714. 

