Training the model using Light GBM
=============

In [1]:
# -*- coding: utf-8 -*-

import random
import numpy as np
import pandas as pd
import os
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
import matplotlib.pyplot as plt
import gc
random.seed(23)

## set the directory
os.chdir(r'C:\Users\User\Documents\Data_Science_Projects\middle-east-event-show-prediction-project')

# load the cleanData
data = pd.read_pickle(r'.\data\output\cleanData.pkl')
data = data.drop(['card_number', 'show'], axis=1)

#### Scale the numerical variables and Split data into train and test

In [3]:
def preprocess_data(data):
    data = data.rename(columns={'attended': 'labels'})
    summary = data.describe().transpose()
    cols = data[summary[summary['max']>1].reset_index()['index'].tolist()]
    cols = cols.columns[1:]

    # separate the labels/target variable
    dataX = data.drop(['labels'], axis = 1)
    dataY = data['labels']

    # Create train and test dataset
    X_train, x_test, Y_train, y_test = train_test_split(dataX, dataY, random_state = 0)

    gather_data = pd.DataFrame(columns = ['feature', 'mean', 'std'])
    for i in cols:
        ave = X_train[i].mean()
        std = X_train[i].std()
        appended = pd.DataFrame({'feature': i, 'mean': ave, 'std': std}, index = [0])
        gather_data = gather_data.append(appended, ignore_index=True)
        x_test[i] = (x_test[i]-ave)/std
        X_train[i] = (X_train[i]-ave)/std

    # First, scale the Data - only those numerical/non-categorical
    x_test = np.array(x_test)
    y_test = np.array(y_test)
    return X_train, x_test, Y_train, y_test, gather_data

X_train, x_test, Y_train, y_test, gather_data = preprocess_data(data)
gather_data.to_csv(r'.\data\output\mean_std_scaler.csv', index=False)
gc.collect()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


14

#### Upsample to cure the imbalance bias

In [4]:
def upsample_data(train_data, label):
    # apply oversampling (SMOTE) since the data is very imbalanced
    smote = SMOTE(random_state=1, ratio=1.0)
    X_train, Y_train = smote.fit_resample(train_data, label)
    return X_train, Y_train

X_train, Y_train = upsample_data(X_train, Y_train)

#### Tune the model by finding the best combination of parameters
This training will be using a 10-fold cross validation evaluation

In [6]:
'category+NOB+Interest+howd_hear+likelihood 0.7925021795989537'
'No Questions  0.6935483870967742'
'category+NOB+Interest+howd_hear 0.7898866608544028'
'no category  0.7890148212728858'
'no nob 0.7868352223190933'
'no interest 0.7890148212728858'
'no how\'d you hear 0.7903225806451613'

"no how'd you hear 0.7903225806451613"

In [7]:
'''def grid_search(X_train, Y_train):
    model = lgb.LGBMClassifier(boosting_type='gbdt',  objective='binary', metric='auc')

    parametersGrid = {
                    'learning_rate': [0.03, 0.1],
                    'num_boost_round': [400,500,600],
                    'num_leaves': [12,13,14],
                    'reg_alpha': [0.1],
                    'min_data_in_leaf': [18,19,20],
                    'lambda_l1': [0.1],
                    'lambda_l2': [0,0.1],
                    'sub_feature':[ 0.5, 0.6, 0.7]
                     }
    grid = GridSearchCV(model, parametersGrid, cv=10, n_jobs=12) ## 10-fold cross-validation
    grid.fit(X_train, Y_train)
    print ('\nThe best parameters are ', grid.best_params_)
    print('\nThe best score is ', grid.best_score_)
    return grid

grid = grid_search(X_train, Y_train)'''

"def grid_search(X_train, Y_train):\n    model = lgb.LGBMClassifier(boosting_type='gbdt',  objective='binary', metric='auc')\n\n    parametersGrid = {\n                    'learning_rate': [0.03, 0.1],\n                    'num_boost_round': [400,500,600],\n                    'num_leaves': [12,13,14],\n                    'reg_alpha': [0.1],\n                    'min_data_in_leaf': [18,19,20],\n                    'lambda_l1': [0.1],\n                    'lambda_l2': [0,0.1],\n                    'sub_feature':[ 0.5, 0.6, 0.7]\n                     }\n    grid = GridSearchCV(model, parametersGrid, cv=10, n_jobs=12) ## 10-fold cross-validation\n    grid.fit(X_train, Y_train)\n    print ('\nThe best parameters are ', grid.best_params_)\n    print('\nThe best score is ', grid.best_score_)\n    return grid\n\ngrid = grid_search(X_train, Y_train)"

In [8]:
def grid_search(X_train, Y_train):
    model = lgb.LGBMClassifier(boosting_type='gbdt',  objective='binary', metric='auc')

    param_dist = {
                    'learning_rate': [0.001, 0.1, 0.01],
                    'num_boost_round': [400,500,600],
                    'num_leaves': [12,13,14],
                    'reg_alpha': [0, 0.1, 0.4, 0.7],
                    'min_data_in_leaf': [18,19,20],
                    'lambda_l1': [0, 0.1, 0.4, 0.7],
                    'lambda_l2': [0, 0.1, 0.4, 0.7],
                    'sub_feature':[ 0.5, 0.6, 0.7]
    }

    n_iter_search = 1000
    grid = RandomizedSearchCV(model, cv=10, param_distributions=param_dist, n_jobs=12, n_iter=n_iter_search) ## 10-fold cross-validation
    grid.fit(X_train, Y_train)
    print ('\nThe best parameters are ', grid.best_params_)
    print('\nThe best score is ', grid.best_score_)
    return grid

grid = grid_search(X_train, Y_train)




The best parameters are  {'lambda_l1': 0.4, 'learning_rate': 0.1, 'lambda_l2': 0, 'sub_feature': 0.6, 'num_leaves': 14, 'min_data_in_leaf': 20, 'reg_alpha': 0.4, 'num_boost_round': 400}

The best score is  0.8108108108108109


#### The best parameters :
The best parameters are  {'lambda_l1': 0.4, 'learning_rate': 0.1, 'lambda_l2': 0, 'sub_feature': 0.6, 'num_leaves': 14, 'min_data_in_leaf': 20, 'reg_alpha': 0.4, 'num_boost_round': 400}
