Training the model using boosting
=============

In [1]:

import random
import numpy as np
import pandas as pd
import os
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
import matplotlib.pyplot as plt
import gc
random.seed(23)

## set the directory
os.chdir(r'C:\Users\User\Documents\Data_Science_Projects\food-expo-attendee-prediction-project')

# load the cleanData
data = pd.read_pickle(r'.\data\output\cleanData.pkl')
data = data.drop(['10 Digit Card Number', 'show', 'latitude', 'longitude', 'count_per_comp_website'], axis=1)

#### Scale the numerical variables and Split data into train and test

In [2]:
def preprocess_data(data):
    data = data.rename(columns={'attended': 'labels'})
    summary = data.describe().transpose()
    cols = data[summary[summary['max']>1].reset_index()['index'].tolist()]
    cols = cols.columns[1:]

    # separate the labels/target variable
    dataX = data.drop(['labels'], axis = 1)
    dataY = data['labels']
    
    # Create train and test dataset
    X_train, x_test, Y_train, y_test = train_test_split(dataX, dataY, random_state = 0)

    gather_data = pd.DataFrame(columns = ['feature', 'mean', 'std'])
    for i in cols:
        ave = X_train[i].mean()
        std = X_train[i].std()
        appended = pd.DataFrame({'feature': i, 'mean': ave, 'std': std}, index = [0])
        gather_data = gather_data.append(appended, ignore_index=True)
        x_test[i] = (x_test[i]-ave)/std
        X_train[i] = (X_train[i]-ave)/std
    
    # First, scale the Data - only those numerical/non-categorical
    x_test = np.array(x_test)
    y_test = np.array(y_test)
    return X_train, x_test, Y_train, y_test, gather_data

X_train, x_test, Y_train, y_test, gather_data = preprocess_data(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [3]:
gather_data

Unnamed: 0,feature,mean,std
0,weeks_to_go,1.789958,1.680179
1,distance,1523.845603,2418.404437
2,count_per_company,4.102244,8.965713
3,count_per_website,1.303601,2.526408


#### Upsample to cure the imbalance bias

In [4]:
def upsample_data(train_data, label):
    # apply oversampling (SMOTE) since the data is very imbalanced
    smote = SMOTE(random_state=1, ratio=1.0)
    X_train, Y_train = smote.fit_resample(train_data, label)
    return X_train, Y_train

X_train, Y_train = upsample_data(X_train, Y_train)

In [5]:
print(len(Y_train[Y_train==1]))
print(len(Y_train[Y_train==0]))

53705
53705


#### Tune the model by finding the best combination of parameters
This training will be using a 10-fold cross validation evaluation

In [6]:
def grid_search(X_train, Y_train):
    model = lgb.LGBMClassifier(boosting_type='gbdt',  objective='binary', metric='binary_logloss')

    parametersGrid = {
                    'learning_rate': [0.1],
                    'num_boost_round': [2000,3000],
                    'num_leaves': [22,24],
                    'reg_alpha': [0.1],
                    'min_data_in_leaf': [18,22],
                    'lambda_l1': [0],
                    'lambda_l2': [0.1, 0.5],
                    'sub_feature':[0.7]
                     }
    grid = GridSearchCV(model, parametersGrid, cv=10, n_jobs=8) ## 10-fold cross-validation
    grid.fit(X_train, Y_train)
    print ('\nThe best parameters are ', grid.best_params_)
    print('\nThe best score is ', grid.best_score_)
    return grid

grid = grid_search(X_train, Y_train)

KeyboardInterrupt: 

In [None]:
data.info()

#### The best parameters :
        The best parameters are  {'num_leaves': 22, 'lambda_l2': 0.1, 'min_data_in_leaf': 18, 'learning_rate': 0.1, 'num_boost_round': 2000, 'lambda_l1': 0, 'sub_feature': 0.7, 'reg_alpha': 0.1}