Training a Light GBM model
=================
**Author:** `Grej Segura`

In [63]:
# -*- coding: utf-8 -*-

import random
import numpy as np
import pandas as pd
import os
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn import svm
import matplotlib.pyplot as plt
import gc
import pickle

random.seed(23)

## set the directory
os.chdir(r'C:\Users\User\Documents\Data_Science_Projects\middle-east-event-show-prediction-project')

# load the cleanData
data = pd.read_pickle(r'.\data\output\cleanData.pkl')
data = data.drop(['10 Digit Card Number', 'show'], axis=1)
data.loc[data['attended']==0, 'attended'] = -1

In [64]:
def preprocess_data(data):
    data = data.rename(columns={'attended': 'labels'})
    summary = data.describe().transpose()
    cols = data[summary[summary['max']>1].reset_index()['index'].tolist()]
    cols = cols.columns[1:]

    # separate the labels/target variable
    dataX = data.drop(['labels'], axis = 1)
    dataY = data['labels']

    # First, scale the Data - only those numerical/non-categorical
    names = dataX.columns
    scaler = preprocessing.StandardScaler()
    # Fit your data on the scaler object
    scaled_data = scaler.fit_transform(dataX)
    scaled_data = pd.DataFrame(scaled_data, columns=names)
    scaled_data = scaled_data[cols] ###------------------->> cols are non-categorical columns
    dataX = dataX.drop(cols, axis=1)
    dataX = pd.concat([scaled_data, dataX], axis=1)
    
    # Create train and test dataset
    X_train, x_test, Y_train, y_test = train_test_split(dataX, dataY, random_state = 50)
    x_test = np.array(x_test)
    y_test = np.array(y_test)
    return X_train, x_test, Y_train, y_test

X_train, x_test, Y_train, y_test = preprocess_data(data)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [65]:
# clear memory
gc.collect()

def upsample_data(train_data, label):
    # apply oversampling (SMOTE) since the data is very imbalanced
    smote = SMOTE(random_state=1, ratio=1.0)
    X_train, Y_train = smote.fit_resample(train_data, label)
    return X_train, Y_train

X_train, Y_train = upsample_data(X_train, Y_train)

In [66]:
print(len(Y_train[Y_train==1]))
print(len(Y_train[Y_train==0]))

1212
0


In [67]:
def train_model(X_train, Y_train):
    model = svm.SVC(gamma=0.5, C=12, )
    model.fit(X_train, Y_train)
    return model

model = train_model(X_train, Y_train)

In [68]:
def predict_data(X, Y, data_type):
    ## predicting test data
    y_pred = model.predict(X)
    for i in range(len(X)):
        if y_pred[i]>=.5:       # setting threshold to .5
           y_pred[i]=1
        else:  
           y_pred[i]=-1

    #print accuracy_score(y_test, predicted) for test data
    accuracy = metrics.accuracy_score(y_pred, Y)
    print('\n\n\nThe following are metrices for ', data_type, ' data')
    print('\nACCURACY is ' + accuracy.astype(str))
    preds = pd.DataFrame({'true': Y, 'predicted': y_pred})
    confusion = pd.crosstab(preds['predicted'], preds['true'])
    print('\n CONFUSION MATRIX:\n', confusion)
    precision = metrics.precision_score(Y, y_pred)
    print('\n', data_type ,'DATA PRECISION ' + precision.astype(str))
    recall = metrics.recall_score(Y, y_pred)
    print('\n', data_type ,'DATA RECALL ' + recall.astype(str))
    return y_pred

y_pred_train = predict_data(X_train, Y_train, data_type='TRAIN')
y_pred_test = predict_data(x_test, y_test, data_type='TEST')
## save the model - this will be used in the deployment part to generate new predictions
pickle.dump(model, open(r'.\models\SVMModel.sav', 'wb'))




The following are metrices for  TRAIN  data

ACCURACY is 0.9843234323432343

 CONFUSION MATRIX:
 true       -1.0   1.0
predicted            
-1.0       1201    27
 1.0         11  1185

 TRAIN DATA PRECISION 0.9908026755852842

 TRAIN DATA RECALL 0.9777227722772277



The following are metrices for  TEST  data

ACCURACY is 0.6581059390048154

 CONFUSION MATRIX:
 true       -1.0   1.0
predicted            
-1.0        362   179
 1.0         34    48

 TEST DATA PRECISION 0.5853658536585366

 TEST DATA RECALL 0.21145374449339208


In [69]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2489 entries, 0 to 2488
Columns: 147 entries, with_website to count_per_website
dtypes: float64(65), int64(1), uint8(81)
memory usage: 1.5 MB


In [70]:
data.columns

Index(['with_website', 'with_email', 'days_to_go', 'weeks_to_go', 'latitude',
       'longitude', 'distance', 'association', 'attended', 'av_production',
       ...
       'region_1_Western Europe', 'region_2_Africa', 'region_2_Americas',
       'region_2_Australia-Asia', 'region_2_Europe', 'region_2_ME GCC',
       'region_2_ME Non-GCC', 'count_per_company', 'count_per_comp_website',
       'count_per_website'],
      dtype='object', length=147)