# LightGBM on Application data

## Load data

In [1]:
import numpy as np
import pandas as pd

import basic_application_data_cleaner as cleaner

In [2]:
# Directory on the local machine that holds all of the unzipped Kaggle data files.
path_to_kaggle_data = "~/kaggle_JPFGM/Data/"

In [3]:
# Raw application tables (the recorded output reports 121 / 120 columns).
# TODO: make LightGBM work with raw categorical data
(app_train, app_test) = cleaner.read_raw_application_data(path_to_kaggle_data)

# Cleaned application tables (246 / 245 columns in the recorded output);
# these are the frames used for modelling below.
(df_train, df_test) = cleaner.load_cleaned_application_data(path_to_kaggle_data)

Raw training data size: (307511, 121)
Raw test data size: (48744, 120)
Raw training data size: (307511, 121)
Raw test data size: (48744, 120)
Cleaned training data shape:  (307511, 246)
Cleaned testing data shape:  (48744, 245)


## Create train and validation sets

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Labels: the TARGET column of the labelled training frame.
y = df_train['TARGET']
# Features: every remaining column of the labelled training frame.
# (SK_ID was already set as the index during the earlier data cleaning.)
X = df_train.drop(columns=['TARGET'])

In [6]:
# Stratified hold-out split: 33% of the labelled rows go to validation, and
# stratifying on y keeps the positive rate the same in both parts.
# random_state is fixed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

In [7]:
# Sanity check that the stratified split kept the class balance in both parts.
# The mean of a boolean Series is the fraction of True values; this is
# vectorised, whereas the builtin sum() walks the Series element by element.
print('Fraction of positive samples in training set: %.2f%%' % (100 * (y_train == 1).mean()))
print('Fraction of positive samples in validation set: %.2f%%' % (100 * (y_val == 1).mean()))

Fraction of positive samples in training set: 8.07%
Fraction of positive samples in validation set: 8.07%


Note: X and X_train are both still pandas DataFrames, not numpy arrays.

## Default LightGBM on cleaned training data

In [9]:
import lightgbm as lgb

In [10]:
# Wrap the training split in LightGBM's Dataset container.
train_data = lgb.Dataset(X_train, label=y_train)
# Tie the validation set to the training set so that it is aligned with
# (binned consistently with) the training data.
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

In [12]:
# Booster parameters for the baseline model.
# The number of boosting rounds is deliberately not set here: 'num_trees' is an
# alias of 'num_iterations', and giving it in the params as well as through the
# num_boost_round argument of lgb.train() makes LightGBM warn and let the
# params value override the argument. The round count is passed to lgb.train().
param = {
    'num_leaves': 31,
    'objective': 'binary',
    'metric': 'auc',
}

In [23]:
# Number of boosting rounds for the baseline run.
num_round = 100
# Fit the booster on the training Dataset, evaluating on the validation
# Dataset with the metric configured in `param`.
bst = lgb.train(
    param,
    train_data,
    num_boost_round=num_round,
    valid_sets=[val_data],
)

[1]	valid_0's auc: 0.707732
[2]	valid_0's auc: 0.712091
[3]	valid_0's auc: 0.716871
[4]	valid_0's auc: 0.71983
[5]	valid_0's auc: 0.721601
[6]	valid_0's auc: 0.723919
[7]	valid_0's auc: 0.726027
[8]	valid_0's auc: 0.7272
[9]	valid_0's auc: 0.728626
[10]	valid_0's auc: 0.729757
[11]	valid_0's auc: 0.731393
[12]	valid_0's auc: 0.732719
[13]	valid_0's auc: 0.733883
[14]	valid_0's auc: 0.73548
[15]	valid_0's auc: 0.736782
[16]	valid_0's auc: 0.737646
[17]	valid_0's auc: 0.738194
[18]	valid_0's auc: 0.739336
[19]	valid_0's auc: 0.740335
[20]	valid_0's auc: 0.741024
[21]	valid_0's auc: 0.741787
[22]	valid_0's auc: 0.7424
[23]	valid_0's auc: 0.743238
[24]	valid_0's auc: 0.743712
[25]	valid_0's auc: 0.744473
[26]	valid_0's auc: 0.744921
[27]	valid_0's auc: 0.745509
[28]	valid_0's auc: 0.746039
[29]	valid_0's auc: 0.746558
[30]	valid_0's auc: 0.747359
[31]	valid_0's auc: 0.747914
[32]	valid_0's auc: 0.748438
[33]	valid_0's auc: 0.748822
[34]	valid_0's auc: 0.749129
[35]	valid_0's auc: 0.749261


In [24]:
# Write the trained booster to a text file in the working directory.
bst.save_model(filename='model.txt')

In [25]:
# Re-create the booster from the saved model file (replaces the in-memory `bst`).
bst = lgb.Booster(model_file="model.txt")

In [26]:
# Score the validation features with the reloaded booster.
ypred = bst.predict(data=X_val)

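Note: the native `lgb.Booster` has no `predict_proba` method. With the `binary` objective, `bst.predict` already returns the predicted probability of the positive class; `predict_proba` exists only on the scikit-learn wrapper (`lgb.LGBMClassifier`).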