In [1]:
## base functions for xgboost, catboost and lightgbm are to be added
## has tuning using bayesian framework
## also will follow up with LIME/tree interpreters for the same

## sample data used is FLight delays dataset ##

# lightgbm done
# xgboost needs to be setup
# start on catboost

In [40]:
## importing the various packages

# clear the workspace
%reset -f

# print list of files in directory
import os
print(os.listdir())

# print/display all plots inline
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# the base packages
import collections # for the Counter function
import csv # for reading/writing csv files
import pandas as pd, numpy as np, time, gc
import ast # for the AST segment, parsing json to table and so on

# the various packages/modules used across processing (sklearn), modelling (lightgbm) and bayesian optimization (hyperopt, bayes_opt)
from sklearn.model_selection import train_test_split
from sklearn import metrics, preprocessing
from sklearn.cross_validation import cross_val_score, StratifiedKFold, StratifiedShuffleSplit
from bayes_opt import BayesianOptimization
from tqdm import tqdm
from hyperopt import hp, tpe, STATUS_OK, fmin, Trials
from hyperopt.fmin import fmin
from hyperopt.pyll.stochastic import sample

# modelling algo
import xgboost as xgb

# Evaluation of the model
from sklearn import model_selection
from sklearn.model_selection import KFold

# Exporting packages for LIME
import lime
import lime.lime_tabular

# define the global variables used later
MAX_EVALS = 10 # number of iterations/parameter sets created towards tuning
N_FOLDS = 5 # number of cv folds
random_seed = 1 # the value for the random state used at various points in the pipeline

['.ipynb_checkpoints', 'ABI_LGBM_LIME.ipynb', 'ABI_LGB_binary_classification.ipynb', 'ABI_XGB_LIME.ipynb', 'airlines.csv', 'airports.csv', 'example lightgbm.ipynb', 'flights.csv', 'flights_sample.csv', 'gbm_trials.csv', 'hyperparameter-optimization-master']


In [41]:
%who

BayesianOptimization	 KFold	 MAX_EVALS	 N_FOLDS	 STATUS_OK	 StratifiedKFold	 StratifiedShuffleSplit	 Trials	 ast	 
collections	 cross_val_score	 csv	 fmin	 gc	 hp	 lime	 metrics	 model_selection	 
np	 os	 pd	 plt	 preprocessing	 random_seed	 sample	 sns	 time	 
tpe	 tqdm	 train_test_split	 xgb	 


In [42]:
## function to get frequency count of elements in a vector/list
def freq_count (input_vector):
    return collections.Counter(input_vector)

## function to create 
def prepare_data(input_file_path = 'flights_sample.csv', response = 'ARRIVAL_DELAY'):
    train = pd.read_csv(input_file_path)
    train = train.sample(frac = 0.1, random_state = random_seed)
    train = train[["MONTH","DAY","DAY_OF_WEEK","AIRLINE","FLIGHT_NUMBER","DESTINATION_AIRPORT",
                 "ORIGIN_AIRPORT","AIR_TIME", "DEPARTURE_TIME","DISTANCE","ARRIVAL_DELAY"]]
    train.dropna(inplace = True)
    print(train.shape)
    
    x = list(train.dtypes)
    x_1 = [1 if x == 'O' else 0 for x in x]
    categorical_column_indices = [i for i, x in enumerate(x_1) if x == 1]
    
    categorical_column_names = train.select_dtypes(include=['object']).columns.values
    for column in tqdm(categorical_column_names):
        le = preprocessing.LabelEncoder()
        train[column] = le.fit_transform(train[column].astype(str))
    
    train[response] = (train[response] > 20)*1
    print(freq_count(train[response]))

    y = train[response].values
    X = train.drop([response], axis = 1)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_seed)

    feature_names_train = list(X_train.columns.values)
    
    return X_train, X_test, y_train, y_test, feature_names_train, categorical_column_names, categorical_column_indices

In [43]:
## preparing the different train/test features/labels datasets
X_train, X_test, y_train, y_test, feature_names, categ_cols, categ_cols_indices = prepare_data()

num_feature = X_train.shape[1]

(5707, 11)


100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 200.10it/s]


Counter({0: 4857, 1: 850})


In [44]:
model = xgb.XGBClassifier()
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [45]:
# baseline model

from sklearn.metrics import roc_auc_score
from timeit import default_timer as timer

start = timer()
model.fit(X=X_train, y=y_train, verbose=True)

train_time = timer() - start

predictions = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, predictions)

print('The baseline score on the test set is {:.4f}.'.format(auc))
print('The baseline training time is {:.4f} seconds'.format(train_time))

The baseline score on the test set is 0.6899.
The baseline training time is 0.2168 seconds


In [46]:
explainer = lime.lime_tabular.LimeTabularExplainer(training_data = X_train, training_labels = y_train,
                                                   feature_names = feature_names,
                                                   class_names = ['not delayed', 'delayed'],
                                                   categorical_features=categ_cols_indices, 
                                                   categorical_names=list(categ_cols))

TypeError: unhashable type: 'slice'

In [27]:
xtest = np.array(X_test)
exp = explainer.explain_instance(xtest[0], model.predict_proba, num_features=num_feature)

ValueError: feature_names mismatch: ['MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER', 'DESTINATION_AIRPORT', 'ORIGIN_AIRPORT', 'AIR_TIME', 'DEPARTURE_TIME', 'DISTANCE'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']
expected DAY, MONTH, DESTINATION_AIRPORT, DEPARTURE_TIME, AIR_TIME, AIRLINE, DISTANCE, ORIGIN_AIRPORT, FLIGHT_NUMBER, DAY_OF_WEEK in input data
training data did not have the following fields: f4, f1, f2, f8, f6, f7, f3, f0, f9, f5

In [13]:
X_train.head(4)
#categ_cols_indices

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DESTINATION_AIRPORT,ORIGIN_AIRPORT,AIR_TIME,DEPARTURE_TIME,DISTANCE
9540,9,24,4,13,275,249,345,118.0,1751.0,882
33457,8,6,4,3,1182,269,345,73.0,650.0,448
510,2,10,2,9,5616,171,221,78.0,619.0,594
54424,3,1,7,7,3424,277,141,74.0,702.0,409
