In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use plotly offline for fancy plots
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
# use cufflinks to bind plotly to pandas
import cufflinks as cf 

ImportError: No module named cufflinks

In [5]:
def loadZippedCSV(path):
    return pd.read_csv(path, compression='zip', header=0, sep=',', quotechar='"')

application_train = loadZippedCSV('./application_train.csv.zip')
POS_CASH_balance = loadZippedCSV('./POS_CASH_balance.csv.zip')
bureau_balance = loadZippedCSV('./bureau_balance.csv.zip')
previous_application = loadZippedCSV('./previous_application.csv.zip')
installments_payments = loadZippedCSV('./installments_payments.csv.zip')
bureau = loadZippedCSV('./bureau.csv.zip')
credit_card_balance = loadZippedCSV('./credit_card_balance.csv.zip')
application_test = loadZippedCSV('./application_test.csv.zip')

In [11]:
# Bundle and save all data to HDFS storage to allow faster loading
rawDataStore = pd.HDFStore('./rawData.h5')
rawDataStore.put('/rawData/application_train', application_train, format='table')
rawDataStore.put('/rawData/POS_CASH_balance', POS_CASH_balance, format='table')
rawDataStore.put('/rawData/bureau_balance', bureau_balance, format='table')
rawDataStore.put('/rawData/previous_application', previous_application, format='table')
rawDataStore.put('/rawData/installments_payments', installments_payments, format='table')
rawDataStore.put('/rawData/bureau', bureau, format='table')
rawDataStore.put('/rawData/credit_card_balance', credit_card_balance, format='table')
rawDataStore.put('/rawData/application_test', application_test, format='table')
rawDataStore.close()

In [21]:
# Load data from HDFS storage into workspace
rawDataStore = pd.HDFStore('./rawData.h5')
for key in rawDataStore.keys():
    print('loading {0} ...'.format(key))
    locals()[key.split('/')[-1]] = rawDataStore[key]
print('all data loaded')

loading /rawData/POS_CASH_balance ...
loading /rawData/application_test ...
loading /rawData/application_train ...
loading /rawData/bureau ...
loading /rawData/bureau_balance ...
loading /rawData/credit_card_balance ...
loading /rawData/installments_payments ...
loading /rawData/previous_application ...


In [40]:
application_train.head(3)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
targetCnt = application_train["TARGET"].value_counts()
df = pd.DataFrame({
    'labels': targetCnt.index.astype('bool'),
    'values': targetCnt.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Has Payment Difficulties (Target)')

In [64]:
ownCarCntT = application_train.loc[application_train["TARGET"]==1, "FLAG_OWN_CAR"].value_counts()
ownCarCnt = application_train.loc[application_train["TARGET"]==0, "FLAG_OWN_CAR"].value_counts()
ownRealtyCntT = application_train.loc[application_train["TARGET"]==1, "FLAG_OWN_REALTY"].value_counts()
ownRealtyCnt = application_train.loc[application_train["TARGET"]==0, "FLAG_OWN_REALTY"].value_counts()

trace1 = go.Bar(
    x = ['Own Car', 'Own Realty'],
    y = [ownCarCntT.Y / (ownCarCntT.Y + ownCarCntT.N), ownRealtyCntT.Y / (ownRealtyCntT.Y + ownRealtyCntT.N)] * 100,
    name='Target'
)
trace2 = go.Bar(
    x = ['Own Car', 'Own Realty'],
    y = [ownCarCnt.Y / (ownCarCnt.Y + ownCarCnt.N), ownRealtyCnt.Y / (ownRealtyCnt.Y + ownRealtyCnt.N)] * 100,
    name='Non-target'
)

data = [trace1, trace2]
layout = go.Layout(
    title = "Target vs. non-Target in car and realty ownership",
    width = 800,
    xaxis = dict(
        title = 'Ownership Type',
        tickfont = dict(
            size=14
        )
    ),
    yaxis=dict(
        title = 'Percentage (%)',
        titlefont = dict(
            size=16
        ),
        tickfont = dict(
            size = 14
        )
    )
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [134]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Lock pseudo-number seed
randSeed = 1
np.random.seed(randSeed)

In [101]:
# Pre-process data
dataCols = application_train.columns

# categoricalize
categoricalFeatures = [col for col in dataCols if data[col].dtype == 'object']

for column in categorical_columns:
    application_train[column] = application_train[column].astype('category')
    application_test[column] = application_test[column].astype('category')

In [135]:
ignoredFields = ['TARGET']
# Assemble into I/O dataset format
#   X - All fields other than 'TARGET'
#   Y - 'TARGET' fields
trainDataMask = [col for col in dataCols if col not in ignoredFields]
x = application_train[trainDataMask]
y = application_train['TARGET']

# Split training data randomly using train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, 
    test_size = 0.25,
    random_state = randSeed
)

# Create lgb dataset
lgb_train = lgb.Dataset(data=x_train, label=y_train)
lgb_test = lgb.Dataset(data=x_test, label=y_test)

In [165]:
params = {
    'task': 'train',
    'device' : 'cpu',
    'nthread': 4,            # [CPU] number of OpenMP threads
    'gpu_use_dp' : 'false',  # [GPU] set to 1 to enable 64bit float point
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_iteration': 1000,
    'num_leaves': 32,
    'metric': 'auc',
    'reg_alpha': 5,
    'reg_lambda': 10,
    'learning_rate': 0.05,
    'max_bin': 256,
    'max_depth' : 10,
    'min_data_in_leaf': 32,
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 5,
    'subsample_for_bin': 200,
    'subsample': 1,
    'subsample_freq': 1,
    'colsample_bytree': 0.6,
    'verbose': 0
}

gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round = 50,
    valid_sets = lgb_test,
    early_stopping_rounds = 50,
    verbose_eval = 50
)


Found `num_iteration` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds.
[50]	valid_0's auc: 0.744373
[100]	valid_0's auc: 0.75339
[150]	valid_0's auc: 0.759146
[200]	valid_0's auc: 0.76165
[250]	valid_0's auc: 0.762997
[300]	valid_0's auc: 0.763662
[350]	valid_0's auc: 0.763995
[400]	valid_0's auc: 0.764199
[450]	valid_0's auc: 0.764599
Early stopping, best iteration is:
[468]	valid_0's auc: 0.764712
