# Light GBM
### LGBM implementation 
#### Import Requirements 

In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 
#from pandas.tools.plotting import table

#memory management
import gc

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

#LightGBM gradient boosting classifier
from lightgbm import LGBMClassifier
#to measure ROC AUC performance
from sklearn.metrics import roc_auc_score, roc_curve, auc

from sklearn.model_selection import KFold

from scipy import interp
import matplotlib.pyplot as plt


### Set folder paths for getting input and saving outputs

In [6]:
# Set input data folder: the 'CodeOutputs' directory next to the current
# working directory (<cwd>/../CodeOutputs).
dataFolder = os.path.join(os.getcwd(), os.pardir, 'CodeOutputs')
if not os.path.isdir(dataFolder):
    # isdir (rather than exists) also rejects a regular file sitting at this
    # path, which would otherwise only fail later when a CSV is read from it.
    print("Input Data folder not found. Please specify data folder path as dataFolder variable to proceed")
    # Carry the offending path in the exception so the traceback is actionable.
    raise NotADirectoryError(dataFolder)

# Create output folder if it does not exist.
# NOTE(review): this is the same path as dataFolder, so after the check above
# the directory already exists and the branch below is not expected to run.
# Confirm whether a different output location was intended.
outputFolder = os.path.join(os.getcwd(), os.pardir, 'CodeOutputs')

if not os.path.isdir(outputFolder):
    # exist_ok avoids a crash if the directory appears between check and create.
    os.makedirs(outputFolder, exist_ok=True)
    print('Output Folder created')

### Read input

In [7]:
def setup_input(filename):
    """Load one dataset CSV from ``dataFolder`` and split features from target.

    Parameters
    ----------
    filename : str
        Name of the CSV file, relative to the module-level ``dataFolder``.

    Returns
    -------
    tuple
        ``(features, target)`` where ``target`` is the ``TARGET`` column and
        ``features`` is the remaining frame without ``SK_ID_CURR`` and,
        when present, without ``Unnamed: 0``.

    Raises
    ------
    KeyError
        If the file has no ``TARGET`` or no ``SK_ID_CURR`` column.
    """
    csv_path = dataFolder + os.sep + filename
    features = pd.read_csv(csv_path)

    # Detach the label column; pop removes it from the frame in place.
    target = features.pop('TARGET')

    # 'SK_ID_CURR' is always dropped; 'Unnamed: 0' only when the file has it
    # (presumably an index column written by an earlier to_csv; verify).
    unwanted = ['SK_ID_CURR']
    if 'Unnamed: 0' in features.columns:
        unwanted.insert(0, 'Unnamed: 0')

    return features.drop(columns=unwanted), target

In [8]:
#Function that performs 10-fold cross-validation on the input data and returns the AUC score
def LGBM_Classifier(df, labels):
    """Return the out-of-fold ROC AUC of a LightGBM classifier under 10-fold CV.

    A fresh ``LGBMClassifier`` is fitted on each training split and used to
    predict positive-class probabilities for the matching held-out split.
    The predictions from all folds are gathered into one array and a single
    AUC is computed over it against ``labels``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    labels : pandas.Series
        Target values aligned row-for-row with ``df``.

    Returns
    -------
    float
        ``roc_auc_score(labels, out_of_fold_probabilities)``.
    """
    # No shuffle argument is passed, so KFold's default splitting is used.
    folds = KFold(n_splits=10)
    oof_scores = np.zeros(df.shape[0])

    for train_rows, test_rows in folds.split(df, labels):
        held_out_X = df.iloc[test_rows]
        held_out_y = labels.iloc[test_rows]

        booster = LGBMClassifier(n_jobs=-1, silent=True)
        # The training rows are sliced again for eval_set instead of reusing
        # one object, exactly as the original code did.
        # NOTE(review): LightGBM's sklearn wrapper appears to treat an
        # eval_set entry that is the very same object as the training data
        # specially; confirm before sharing objects here.
        # NOTE(review): the held-out fold is also the early-stopping set, so
        # the reported AUC may be optimistic; confirm this is intended.
        # NOTE(review): no n_estimators is given; if the library default is
        # below 200 rounds, a patience of 200 can never trigger; confirm.
        fitted = booster.fit(
            df.iloc[train_rows],
            labels.iloc[train_rows],
            eval_set=[
                (df.iloc[train_rows], labels.iloc[train_rows]),
                (held_out_X, held_out_y),
            ],
            eval_metric='auc',
            verbose=False,
            early_stopping_rounds=200,
        )

        # Column 1 of predict_proba is kept as the positive-class score.
        fold_proba = fitted.predict_proba(
            held_out_X, num_iteration=booster.best_iteration_
        )
        oof_scores[test_rows] = fold_proba[:, 1]

    return roc_auc_score(labels, oof_scores)

In [56]:
# Dataset versions 1-3 (variants a, b, c) to evaluate, in reporting order.
file_names = ['DataSetVersion1_a.csv', 'DataSetVersion1_b.csv', 'DataSetVersion1_c.csv', 'DataSetVersion2_a.csv',
              'DataSetVersion2_b.csv', 'DataSetVersion2_c.csv', 'DataSetVersion3_a.csv', 'DataSetVersion3_b.csv',
              'DataSetVersion3_c.csv']

# Score every dataset with 10-fold CV and print one AUC line per file.
for filename in file_names:
    input_df, labels = setup_input(filename)
    try:
        auc_score = LGBM_Classifier(input_df, labels)
    except Exception as exc:
        # Best-effort: report the failure and move on to the next file.
        # Catching Exception (not a bare except) lets Ctrl-C / SystemExit
        # still stop the run, and the reason line shows which file failed.
        print('Classifier could not run on the data set. Put X instead of score.')
        print('File: {},  reason: {!r}'.format(filename, exc))
    else:
        print('File: {},  AUC score : {:0.3f}'.format(filename, auc_score))
    finally:
        # Free the frame before the next file is loaded, on success or failure.
        del input_df, labels
        gc.collect()

File: DataSetVersion1_a.csv,  AUC score : 0.742
File: DataSetVersion1_b.csv,  AUC score : 0.752
File: DataSetVersion1_c.csv,  AUC score : 0.752
File: DataSetVersion2_a.csv,  AUC score : 0.746
File: DataSetVersion2_b.csv,  AUC score : 0.756
File: DataSetVersion2_c.csv,  AUC score : 0.756
File: DataSetVersion3_a.csv,  AUC score : 0.746
File: DataSetVersion3_b.csv,  AUC score : 0.751
File: DataSetVersion3_c.csv,  AUC score : 0.752


In [None]:
# Dataset version 4 (variants a, b, c) to evaluate.
file_names = ['DataSetVersion4_a.csv', 'DataSetVersion4_b.csv', 'DataSetVersion4_c.csv']

# Score every dataset with 10-fold CV and print one AUC line per file.
for filename in file_names:
    input_df, labels = setup_input(filename)
    try:
        auc_score = LGBM_Classifier(input_df, labels)
    except Exception as exc:
        # Best-effort: report the failure and move on to the next file.
        # Catching Exception (not a bare except) lets Ctrl-C / SystemExit
        # still stop the run, and the reason line shows which file failed.
        print('Classifier could not run on the data set. Put X instead of score.')
        print('File: {},  reason: {!r}'.format(filename, exc))
    else:
        print('File: {},  AUC score : {:0.3f}'.format(filename, auc_score))
    finally:
        # Free the frame before the next file is loaded, on success or failure.
        del input_df, labels
        gc.collect()