Training a Light GBM model
=================
**Author:** `Grej Segura`

In [1]:
# -*- coding: utf-8 -*-

import random
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn import metrics
import lightgbm as lgb
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
import pickle
import gc
random.seed(23)

## set the directory
os.chdir(r'C:\Users\User\Documents\Data_Science_Projects\santander-customer-prediction')

# load the cleanData
data = pd.read_csv(r'.\data\trainFinal.csv')
testdata = pd.read_csv(r'.\data\testFinal.csv')


In [2]:
features = pd.read_csv(r'.\data\features.csv')
feature_imp = features[features['0']>0]
feature_selected = np.array(feature_imp['1'])
feature_selected = np.append(feature_selected, 'target')

#### Split data - train and test

In [3]:
def preprocess_data(data):
    data = data.rename(columns={'target': 'labels'})

    # separate the labels/target variable
    dataX = data.drop(['labels'], axis = 1)
    dataY = data['labels']
    
    # Create train and test dataset
    X_train, x_test, Y_train, y_test = train_test_split(dataX, dataY, random_state = 0)
    
    # First, scale the Data - only those numerical/non-categorical
    return X_train, x_test, Y_train, y_test, gather_data

X_train, x_test, Y_train, y_test, gather_data = preprocess_data(data)

In [5]:
print(len(Y_train[Y_train==1]))
print(len(Y_train[Y_train==0]))

15000
135000


In [6]:
def train_model(X_train, Y_train):
    d_train = lgb.Dataset(X_train, label=Y_train)
    params = {'boosting_type':'gbdt',  'objective':'binary', 'metric':'auc',
             'bagging_fraction': 0.55,
             'bagging_freq': 5,
             'bagging_seed': 1234,
             'boost_from_average': False,
             'data_random_seed': 1234,
             'drop_seed': 1234,
             'feature_fraction_seed': 1234,
             'is_unbalance': True,
             'lambda_l1': 0.1,
             'lambda_l2': 0.1,
             'learning_rate': 0.1,
             'max_bin': 63,
             'max_depth': 14,
             'min_data': 25,
             'num_boost_round': 600,
             'num_leaves': 6,
             'reg_alpha': 0.4,
             'save_binary': True,
             'seed': 123,
             'sub_feature': 0.5,
             'verbose': 1}
    model = lgb.train(params, d_train)
    return model, params

model, parameters = train_model(X_train, Y_train)



In [7]:
def predict_data(X, Y, data_type):
    ## predicting test data
    y_pred = model.predict(X)
    for i in range(len(X)):
        if y_pred[i]>=.5:       # setting threshold to .5
           y_pred[i]=1
        else:  
           y_pred[i]=0

    #print accuracy_score(y_test, predicted) for test data
    accuracy = metrics.accuracy_score(y_pred, Y)
    print('\n\n\nThe following are metrices for ', data_type, ' data')
    print('\nACCURACY is ' + accuracy.astype(str))
    preds = pd.DataFrame({'true': Y, 'predicted': y_pred})
    confusion = pd.crosstab(preds['predicted'], preds['true'])
    print('\n CONFUSION MATRIX:\n', confusion)
    precision = metrics.precision_score(Y, y_pred)
    print('\n', data_type ,'DATA PRECISION ' + precision.astype(str))
    recall = metrics.recall_score(Y, y_pred)
    print('\n', data_type ,'DATA RECALL ' + recall.astype(str))
    auc = metrics.roc_auc_score(np.asarray(Y), y_pred).astype(str)
    print('\n', data_type ,'DATA AUC ' + auc.astype(str))

    return y_pred, accuracy, precision, recall

y_pred_train,_,_,_ = predict_data(X_train, Y_train, data_type='TRAIN')
y_pred_test, accuracy, precision, recall = predict_data(x_test, y_test, data_type='TEST')
## save the model - this will be used in the deployment part to generate new predictions
pickle.dump(model, open(r'.\models\lightGBMmodel.sav', 'wb'))




The following are metrices for  TRAIN  data

ACCURACY is 0.8404733333333333

 CONFUSION MATRIX:
 true            0      1
predicted               
0.0        113411   2340
1.0         21589  12660

 TRAIN DATA PRECISION 0.36964582907530147

 TRAIN DATA RECALL 0.844

 TRAIN DATA AUC 0.8420407407407406



The following are metrices for  TEST  data

ACCURACY is 0.82814

 CONFUSION MATRIX:
 true           0     1
predicted             
0.0        37406  1097
1.0         7496  4001

 TEST DATA PRECISION 0.3480038270853266

 TEST DATA RECALL 0.7848175755198117

 TEST DATA AUC 0.8089381183019753


In [8]:
### best local score
'''
params = {'boost_from_average': False, 'min_data': 25, 'boosting_type': 'gbdt', 'max_depth': 13, 'min_sum_hessian_in_leaf': 0.000446, 'bagging_fraction': 0.55, 'sub_feature': 0.7, 'bagging_seed': 1234, 'objective': 'binary', 'is_unbalance': True, 'lambda_l2': 0.1, 'bagging_freq': 5, 'verbose': 1, 'learning_rate': 0.1, 'reg_alpha': 0.1, 'num_boost_round': 600, 'max_bin': 63, 'save_binary': True, 'metric': 'auc', 'lambda_l1': 0.1, 'num_leaves': 6, 'feature_fraction_seed': 1234, 'data_random_seed': 1234, 'seed': 123, 'drop_seed': 1234}


The following are metrices for  TRAIN  data

ACCURACY is 0.8398466666666666

 CONFUSION MATRIX:
 true            0      1
predicted               
0.0        113284   2380
1.0         21643  12693

 TRAIN DATA PRECISION 0.36967031686859275

 TRAIN DATA RECALL 0.8421017713792874

 TRAIN DATA AUC 0.840848257601863



The following are metrices for  TEST  data

ACCURACY is 0.82842

 CONFUSION MATRIX:
 true           0     1
predicted             
0.0        37479  1083
1.0         7496  3942

 TEST DATA PRECISION 0.344640671446057

 TEST DATA RECALL 0.7844776119402985

 TEST DATA AUC 0.8089036197555856'''

"\nparams = {'boost_from_average': False, 'min_data': 25, 'boosting_type': 'gbdt', 'max_depth': 13, 'min_sum_hessian_in_leaf': 0.000446, 'bagging_fraction': 0.55, 'sub_feature': 0.7, 'bagging_seed': 1234, 'objective': 'binary', 'is_unbalance': True, 'lambda_l2': 0.1, 'bagging_freq': 5, 'verbose': 1, 'learning_rate': 0.1, 'reg_alpha': 0.1, 'num_boost_round': 600, 'max_bin': 63, 'save_binary': True, 'metric': 'auc', 'lambda_l1': 0.1, 'num_leaves': 6, 'feature_fraction_seed': 1234, 'data_random_seed': 1234, 'seed': 123, 'drop_seed': 1234}\n\n\nThe following are metrices for  TRAIN  data\n\nACCURACY is 0.8398466666666666\n\n CONFUSION MATRIX:\n true            0      1\npredicted               \n0.0        113284   2380\n1.0         21643  12693\n\n TRAIN DATA PRECISION 0.36967031686859275\n\n TRAIN DATA RECALL 0.8421017713792874\n\n TRAIN DATA AUC 0.840848257601863\n\n\n\nThe following are metrices for  TEST  data\n\nACCURACY is 0.82842\n\n CONFUSION MATRIX:\n true           0     1\npred

In [9]:
data.columns

Index(['target', 'var_0', 'var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_6',
       'var_7', 'var_8',
       ...
       'var_198', 'var_199', 'sum_rows', 'median_rows', 'mean_rows',
       'min_rows', 'max_rows', 'std_rows', 'skew_rows', 'kurt_rows'],
      dtype='object', length=209)

In [10]:
def log_training(data, parameters, accuracy, precision, recall):
    from datetime import datetime
    log = pd.read_csv(r'.\data\training_log.csv')
    length =  len(log)+1
    log['date_time'][length] = datetime.now()
    log['parameters'][length] = parameters
    log['columns'][length] = data.columns
    log['columns_count'][length] = len(data.columns)
    log['accuracy'][length] = accuracy
    log['precision'][length] = precision
    log['recall'][length] = recall
    log.to_csv(r'.\data\training_log.csv', index=False)
    print(log)
    return log
log = log_training(data, parameters, accuracy, precision, recall)

FileNotFoundError: File b'.\\data\\training_log.csv' does not exist

In [None]:
from datetime import datetime
datetime.today()