Training a LightGBM model
=================
**Author:** `Grej Segura`

In [1]:
# -*- coding: utf-8 -*-

import random
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn import metrics
import lightgbm as lgb
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
import pickle
import gc
# Seed both Python's and NumPy's global RNGs so any step that draws from them
# is repeatable after a kernel restart (the original seeded only `random`).
random.seed(23)
np.random.seed(23)

## set the directory
# The project root can be overridden with the PROJECT_DIR environment variable;
# the default is the original author's local checkout, so existing runs behave
# exactly as before.
PROJECT_DIR = os.environ.get(
    'PROJECT_DIR',
    r'C:\Users\User\Documents\Data_Science_Projects\middle-east-event-show-prediction-project',
)
os.chdir(PROJECT_DIR)


# load the cleanData
# The path is built with os.path.join so it resolves on any OS; the original
# backslash literal only worked on Windows.
# NOTE: read_pickle can execute arbitrary code while unpickling -- only load
# files from a trusted source.
data = pd.read_pickle(os.path.join('.', 'data', 'output', 'cleanData.pkl'))
# 'card_number' and 'show' are dropped before any modelling happens.
data = data.drop(['card_number', 'show'], axis=1)

In [2]:
def preprocess_data(data):
    """Split ``data`` into train/test sets and z-score its wide-range columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table containing a ``target`` column (renamed to ``labels``
        internally). The caller's frame is not modified.

    Returns
    -------
    X_train : pandas.DataFrame
        Training features with the selected columns standardised.
    x_test : numpy.ndarray
        Test features, standardised with the *training* mean and std.
    Y_train : pandas.Series
        Training labels.
    y_test : numpy.ndarray
        Test labels.
    gather_data : pandas.DataFrame
        One row per standardised column with columns ``feature``, ``mean`` and
        ``std`` (the training statistics that were applied).
    """
    data = data.rename(columns={'target': 'labels'})

    # Columns whose maximum exceeds 1 are the candidates for standardisation.
    summary = data.describe().transpose()
    wide_range = summary.loc[summary['max'] > 1].index.tolist()
    # NOTE(review): the first candidate is skipped, exactly as the original
    # code did -- confirm which column this is meant to exclude.
    cols = wide_range[1:]

    # separate the labels/target variable
    dataX = data.drop(['labels'], axis=1)
    dataY = data['labels']

    # Create train and test dataset
    X_train, x_test, Y_train, y_test = train_test_split(dataX, dataY, random_state=0)
    # Work on explicit copies so the column assignments below never write
    # through to (or warn about) a slice of the source frame.
    X_train = X_train.copy()
    x_test = x_test.copy()

    # Scale the data - only those numerical/non-categorical columns. Statistics
    # come from the training split only, so nothing leaks from the test split.
    records = []
    for col in cols:
        ave = X_train[col].mean()
        std = X_train[col].std()
        records.append({'feature': col, 'mean': ave, 'std': std})
        x_test[col] = (x_test[col] - ave) / std
        X_train[col] = (X_train[col] - ave) / std
    # Build the table once from the collected records; DataFrame.append is gone
    # in pandas 2.x and was quadratic when called in a loop.
    gather_data = pd.DataFrame(records, columns=['feature', 'mean', 'std'])

    # The test split is handed back as plain arrays.
    x_test = np.array(x_test)
    y_test = np.array(y_test)
    return X_train, x_test, Y_train, y_test, gather_data

# Run the split and scaling once; gather_data holds the per-feature mean/std
# that preprocess_data applied.
X_train, x_test, Y_train, y_test, gather_data = preprocess_data(data)
# Optional (disabled): persist the scaling statistics to CSV.
#gather_data.to_csv(r'.\data\output\mean_std_scaler.csv', index=False)
# Force a garbage-collection pass; as the last expression in the cell, its
# return value is what the notebook displays as the cell output.
gc.collect()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


14

In [3]:
# NOTE(review): this whole cell is a bare string literal, so the SMOTE
# up-sampling below is NOT executed when the notebook runs top to bottom.
# The equal class counts recorded in the next cell's output suggest it was
# enabled when those outputs were produced -- confirm, then either restore it
# as real code or delete it so Restart & Run All reproduces the saved results.
'''# clear memory
gc.collect()

def upsample_data(train_data, label):
    # apply oversampling (SMOTE) since the data is very imbalanced
    smote = SMOTE(random_state=1, ratio=1.0)
    X_train, Y_train = smote.fit_resample(train_data, label)
    return X_train, Y_train

X_train, Y_train = upsample_data(X_train, Y_train)'''

In [4]:
# Report how many training rows fall in each class: positives, then negatives.
print((Y_train == 1).sum())
print((Y_train == 0).sum())

1147
1147


In [47]:
def train_model(X_train, Y_train, params=None, num_boost_round=400):
    """Fit a LightGBM binary classifier on the training data.

    Parameters
    ----------
    X_train : array-like or pandas.DataFrame
        Training features, passed straight to ``lgb.Dataset``.
    Y_train : array-like
        Training labels, passed as the dataset ``label``.
    params : dict, optional
        Overrides merged on top of the default hyper-parameters below. Use the
        canonical LightGBM parameter names so an override replaces a default
        instead of conflicting with it through an alias.
    num_boost_round : int, default 400
        Number of boosting iterations.

    Returns
    -------
    The trained booster returned by ``lgb.train``.
    """
    train_params = {
        'learning_rate': 0.1,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'feature_fraction': 0.6,   # canonical name of the 'sub_feature' alias
        'num_leaves': 14,
        'min_data_in_leaf': 20,    # canonical name of the 'min_data' alias
        # The original also set 'reg_alpha' = 0, which is an alias of
        # 'lambda_l1' and contradicted the 0.4 below. Only the canonical key
        # is kept so the L1 penalty is unambiguous.
        'lambda_l1': 0.4,
        'lambda_l2': 0,
    }
    if params:
        train_params.update(params)

    d_train = lgb.Dataset(X_train, label=Y_train)
    return lgb.train(train_params, d_train, num_boost_round=num_boost_round)

# Fit on the training split. `model` is a notebook-level name that
# predict_data reads when it calls model.predict.
model = train_model(X_train, Y_train)

In [48]:
def predict_data(X, Y, data_type, estimator=None, threshold=0.5):
    """Score ``X``, threshold the scores into 0/1 labels and print metrics.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Features handed to the estimator's ``predict``.
    Y : array-like
        True labels for ``X``.
    data_type : str
        Tag used in the printed report (e.g. ``'TRAIN'`` or ``'TEST'``).
    estimator : object with ``predict``, optional
        Model to score with. When omitted, the notebook-level ``model`` is
        used, so existing three-argument calls behave as before.
    threshold : float, default 0.5
        Scores greater than or equal to this value are labelled 1.

    Returns
    -------
    numpy.ndarray
        Float array of 0.0/1.0 predicted labels.
    """
    clf = model if estimator is None else estimator

    ## predicting test data
    scores = np.asarray(clf.predict(X))
    # Vectorised thresholding; keeps the float 0.0/1.0 labels that the original
    # element-by-element loop produced.
    y_pred = np.where(scores >= threshold, 1.0, 0.0)
    y_true = np.asarray(Y)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    accuracy = metrics.accuracy_score(y_true, y_pred)
    print('\n\n\nThe following are metrices for ', data_type, ' data')
    # str() works whether the metric comes back as a NumPy scalar or a plain
    # Python float; .astype(str) only exists on the former.
    print('\nACCURACY is ' + str(accuracy))
    preds = pd.DataFrame({'true': y_true, 'predicted': y_pred})
    confusion = pd.crosstab(preds['predicted'], preds['true'])
    print('\n CONFUSION MATRIX:\n', confusion)
    precision = metrics.precision_score(y_true, y_pred)
    print('\n', data_type, 'DATA PRECISION ' + str(precision))
    recall = metrics.recall_score(y_true, y_pred)
    print('\n', data_type, 'DATA RECALL ' + str(recall))
    return y_pred

# Report metrics on both splits; a large gap between them signals over-fitting.
y_pred_train = predict_data(X_train, Y_train, data_type='TRAIN')
y_pred_test = predict_data(x_test, y_test, data_type='TEST')
## save the model - this will be used in the deployment part to generate new predictions
# A context manager guarantees the file is flushed and closed even if pickling
# fails; the original passed a bare open() handle that was never closed.
# os.path.join keeps the relative path valid on any OS.
with open(os.path.join('.', 'models', 'lightGBMmodel.sav'), 'wb') as model_file:
    pickle.dump(model, model_file)




The following are metrices for  TRAIN  data

ACCURACY is 0.9760244115082825

 CONFUSION MATRIX:
 true        0.0   1.0
predicted            
0.0        1138    46
1.0           9  1101

 TRAIN DATA PRECISION 0.9918918918918919

 TRAIN DATA RECALL 0.959895379250218



The following are metrices for  TEST  data

ACCURACY is 0.7440585009140768

 CONFUSION MATRIX:
 true       0.0  1.0
predicted          
0.0        329   99
1.0         41   78

 TEST DATA PRECISION 0.6554621848739496

 TEST DATA RECALL 0.4406779661016949


In [12]:
# Display the column labels of `data` (the frame loaded in the first cell,
# after 'card_number' and 'show' were dropped).
data.columns

Index(['opened_email_broadcast', 'count_number_email_opened', 'days_to_go',
       'weeks_to_go', 'distance', 'association', 'attended', 'av_production',
       'brand_communication_agencies', 'concert_event_promoter',
       'conference_organiser', 'content_development', 'corporate_marketing',
       'creative_agency_public_relations_and_sales', 'design_agencies_',
       'destination_management', 'destination_management_company',
       'email_from_mese', 'entertainment', 'entertainment_provider',
       'event_management', 'event_management_agency',
       'event_management_technology', 'event_organiser', 'event_organizers',
       'event_production', 'exhibition', 'exhibitor', 'facebook',
       'hospitality_and_catering', 'hotels_and_catering',
       'invite_from_a_sponsor_exhibitor_of_the_event',
       'invite_from_an_association_or_partner', 'linkedin',
       'magazine_advertisement',
       'marketing_or_procurement_professional_corporate',
       'marketing_or_procurement_p