Training a Light GBM model
=================
**Author:** `Grej Segura`

In [1]:

import random
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn import metrics
import lightgbm as lgb
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
import pickle
import gc
random.seed(23)

## set the directory
os.chdir(r'C:\Users\User\Documents\Data_Science_Projects\food-expo-attendee-prediction-project')

# load the cleanData
data = pd.read_pickle(r'.\data\output\cleanData.pkl')
data = data.drop(['10 Digit Card Number', 'show', 'latitude', 'longitude', 'count_per_comp_website', 'count_per_website'], axis=1)

In [2]:
data.columns

Index(['with_website', 'days_to_go', 'weeks_to_go', 'distance', 'arabic_page',
       'armed_forces_police', 'attended', 'bakery', 'bar_cafe_restaurant',
       'c_level_president_chairman',
       ...
       'region_1_Western Africa', 'region_1_Western Asia',
       'region_1_Western Europe', 'region_2_Africa', 'region_2_Americas',
       'region_2_Australia-Asia', 'region_2_Europe', 'region_2_ME GCC',
       'region_2_ME Non-GCC', 'count_per_company'],
      dtype='object', length=273)

In [3]:
def preprocess_data(data):
    data = data.rename(columns={'attended': 'labels'})
    summary = data.describe().transpose()
    cols = data[summary[summary['max']>1].reset_index()['index'].tolist()]
    cols = cols.columns[1:]

    # separate the labels/target variable
    dataX = data.drop(['labels'], axis = 1)
    dataY = data['labels']

    # Create train and test dataset
    X_train, x_test, Y_train, y_test = train_test_split(dataX, dataY, random_state = 0)

    gather_data = pd.DataFrame(columns = ['feature', 'mean', 'std'])
    for i in cols:
        ave = X_train[i].mean()
        std = X_train[i].std()
        appended = pd.DataFrame({'feature': i, 'mean': ave, 'std': std}, index = [0])
        gather_data = gather_data.append(appended, ignore_index=True)
        x_test[i] = (x_test[i]-ave)/std
        X_train[i] = (X_train[i]-ave)/std

    # First, scale the Data - only those numerical/non-categorical
    x_test = np.array(x_test)
    y_test = np.array(y_test)
    return X_train, x_test, Y_train, y_test, gather_data

X_train, x_test, Y_train, y_test, gather_data = preprocess_data(data)
gather_data.to_csv(r'.\data\output\mean_std_scaler.csv', index=False)
gc.collect()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


14

In [4]:
x_test

array([[ 0.        , 26.        ,  1.62041659, ...,  1.        ,
         0.        , -0.37269518],
       [ 0.        ,  1.        , -1.30729043, ...,  1.        ,
         0.        , -0.37269518],
       [ 0.        , 18.        ,  0.88848984, ...,  0.        ,
         0.        , -0.18058648],
       ...,
       [ 0.        , 41.        ,  3.0842701 , ...,  0.        ,
         0.        , -0.37269518],
       [ 0.        , 39.        ,  3.0842701 , ...,  0.        ,
         1.        , -0.27664083],
       [ 0.        , 13.        ,  0.15656308, ...,  1.        ,
         0.        , -0.37269518]])

In [5]:
# clear memory
gc.collect()

def upsample_data(train_data, label):
    # apply oversampling (SMOTE) since the data is very imbalanced
    smote = SMOTE(random_state=1, ratio=1.0)
    X_train, Y_train = smote.fit_resample(train_data, label)
    return X_train, Y_train

X_train, Y_train = upsample_data(X_train, Y_train)

In [6]:
print(len(Y_train[Y_train==1]))
print(len(Y_train[Y_train==0]))

37543
37543


In [7]:
def train_model(X_train, Y_train):
    d_train = lgb.Dataset(X_train, label=Y_train)
    params = {}
    params['learning_rate'] = 0.1
    params['boosting_type'] = 'gbdt'
    params['objective'] = 'binary'
    params['metric'] = 'binary_logloss'
    params['sub_feature'] = 0.7
    params['num_leaves'] = 24
    params['min_data'] = 22
    params['lambda_l1'] = 0
    params['lambda_l2'] = 0.5
    params['reg_alpha'] = 0.1
    model = lgb.train(params, d_train, 3000)
    return model, params

model, parameters = train_model(X_train, Y_train)

In [8]:
def predict_data(X, Y, data_type):
    ## predicting test data
    y_pred = model.predict(X)
    for i in range(len(X)):
        if y_pred[i]>=.5:       # setting threshold to .5
           y_pred[i]=1
        else:  
           y_pred[i]=0

    #print accuracy_score(y_test, predicted) for test data
    accuracy = metrics.accuracy_score(y_pred, Y)
    print('\n\n\nThe following are metrices for ', data_type, ' data')
    print('\nACCURACY is ' + accuracy.astype(str))
    preds = pd.DataFrame({'true': Y, 'predicted': y_pred})
    confusion = pd.crosstab(preds['predicted'], preds['true'])
    print('\n CONFUSION MATRIX:\n', confusion)
    precision = metrics.precision_score(Y, y_pred)
    print('\n', data_type ,'DATA PRECISION ' + precision.astype(str))
    recall = metrics.recall_score(Y, y_pred)
    print('\n', data_type ,'DATA RECALL ' + recall.astype(str))
    return y_pred, accuracy, precision, recall

y_pred_train,_,_,_ = predict_data(X_train, Y_train, data_type='TRAIN')
y_pred_test, accuracy, precision, recall = predict_data(x_test, y_test, data_type='TEST')
## save the model - this will be used in the deployment part to generate new predictions
pickle.dump(model, open(r'.\models\lightGBMmodel.sav', 'wb'))




The following are metrices for  TRAIN  data

ACCURACY is 0.9241802732866313

 CONFUSION MATRIX:
 true         0.0    1.0
predicted              
0.0        32286    436
1.0         5257  37107

 TRAIN DATA PRECISION 0.8759087904824852

 TRAIN DATA RECALL 0.9883866499746957



The following are metrices for  TEST  data

ACCURACY is 0.8302563805775797

 CONFUSION MATRIX:
 true        0.0    1.0
predicted             
0.0        2331    544
1.0        2389  12015

 TEST DATA PRECISION 0.8341432935295752

 TEST DATA RECALL 0.9566844493988375


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69115 entries, 0 to 69114
Columns: 273 entries, with_website to count_per_company
dtypes: float64(48), int64(1), uint8(224)
memory usage: 41.1 MB


In [10]:
data.columns

Index(['with_website', 'days_to_go', 'weeks_to_go', 'distance', 'arabic_page',
       'armed_forces_police', 'attended', 'bakery', 'bar_cafe_restaurant',
       'c_level_president_chairman',
       ...
       'region_1_Western Africa', 'region_1_Western Asia',
       'region_1_Western Europe', 'region_2_Africa', 'region_2_Americas',
       'region_2_Australia-Asia', 'region_2_Europe', 'region_2_ME GCC',
       'region_2_ME Non-GCC', 'count_per_company'],
      dtype='object', length=273)

In [11]:
len(data.columns)

273

In [12]:
def log_training(data, parameters, accuracy, precision, recall):
    from datetime import datetime
    log = pd.read_csv(r'.\data\training_log.csv')
    length =  len(log)+1
    log['date_time'][length] = datetime.now()
    log['parameters'][length] = parameters
    log['columns'][length] = data.columns
    log['columns_count'][length] = len(data.columns)
    log['accuracy'][length] = accuracy
    log['precision'][length] = precision
    log['recall'][length] = recall
    log.to_csv(r'.\data\training_log.csv', index=False)
    print(log)
    return log
log = log_training(data, parameters, accuracy, precision, recall)

        date_time columns columns_count parameters accuracy precision recall
0  2/11/2019 0:00       a             a          a        a         a      a


In [13]:
from datetime import datetime
datetime.today()

datetime.datetime(2019, 2, 24, 16, 55, 3, 336864)