In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use plotly offline for fancy plots
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
# use cufflinks to bind plotly to pandas
import cufflinks as cf 
from os import listdir
# for display control
from IPython.display import display
# Gradient boosting using LightBGM
import lightgbm as lgb
from sklearn.model_selection import train_test_split
# Garbage collection
import gc
gc.enable()
# Lock pseudo-number seed
randSeed = 1
np.random.seed(randSeed)

In [2]:
# Load all data
dataFiles = listdir("../data/home-credit-default-risk/")
for filename in dataFiles:
    print(f'loading {filename} ...')
    if '.csv.zip' in filename:
        # compressed data file
        locals()[filename.rstrip('.csv.zip')] = pd.read_csv(
            f'../data/home-credit-default-risk/{filename}',
            compression='zip', 
            header=0, 
            sep=',', 
            quotechar='"'
        )

dataTrain = application_train
dataTest = application_test
print('done')

loading application_train.csv.zip ...
loading previous_application.csv.zip ...
loading bureau.csv.zip ...
loading credit_card_balance.csv.zip ...
loading application_test.csv.zip ...
loading installments_payments.csv.zip ...
loading bureau_balance.csv.zip ...
loading POS_CASH_balance.csv.zip ...
loading HomeCredit_columns_description.csv ...
loading sample_submission.csv.zip ...
done


In [8]:
# Utility function to transform all catagorical fields using one hot ending
def oneHotEncoding(df):
    # Get list categorical features
    catFeatures = [col for col in df.columns if df[col].dtype == 'object']
    # Convert to one hot encoding
    ohe = pd.get_dummies(df, columns=catFeatures)
    return ohe

In [53]:
## Preprocess bureau datasets
print('Raw bureau_balance dataset')
display(bureau_balance.head(5))
# Count by status
bureauBalance = bureau_balance.groupby('SK_ID_BUREAU')['STATUS'].value_counts(normalize = False)
# Pivot into table of status values
bureauBalance = bureauBalance.unstack('STATUS')
# Add months balance data as new fileds
bureauBalance['MONTHS_COUNT'] = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].size()
bureauBalance['MONTHS_MAX'] = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].max()
bureauBalance['MONTHS_MIN'] = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].min()
print('Formatted')
display(bureauBalance.head(5))

# Finally, merge the two bureau table together 
bureauData = bureau.join(bureauBalance, how='left', on='SK_ID_BUREAU')

# Transform features
bureauData = oneHotEncoding(bureauData).groupby('SK_ID_CURR').mean()
print('Merged and transformed')
display(bureauData.head(5))

# Merge features into main training dataset
dataTrain = application_train.merge(
    right = bureauData,
    how = 'left',
    on = 'SK_ID_CURR'
)
dataTest = application_test.merge(
    right = bureauData,
    how = 'left',
    on = 'SK_ID_CURR'
)
# collect
gc.collect();

Raw bureau_balance dataset


Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


Formatted


STATUS,0,1,2,3,4,5,C,X,MONTHS_COUNT,MONTHS_MAX,MONTHS_MIN
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5001709,,,,,,,86.0,11.0,97,0,-96
5001710,5.0,,,,,,48.0,30.0,83,0,-82
5001711,3.0,,,,,,,1.0,4,0,-3
5001712,10.0,,,,,,9.0,,19,0,-18
5001713,,,,,,,,22.0,22,0,-21


Merged and transformed


Unnamed: 0_level_0,SK_ID_BUREAU,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,...,CREDIT_TYPE_Interbank credit,CREDIT_TYPE_Loan for business development,CREDIT_TYPE_Loan for purchase of shares (margin lending),CREDIT_TYPE_Loan for the purchase of equipment,CREDIT_TYPE_Loan for working capital replenishment,CREDIT_TYPE_Microloan,CREDIT_TYPE_Mobile operator loan,CREDIT_TYPE_Mortgage,CREDIT_TYPE_Real estate loan,CREDIT_TYPE_Unknown type of loan
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,5896633.0,-735.0,0.0,82.428571,-825.5,,0.0,207623.571429,85240.928571,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100002,6153272.125,-874.0,0.0,-349.0,-697.5,1681.029,0.0,108131.945625,49156.2,7997.14125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100003,5885878.5,-1400.75,0.0,-544.5,-1097.333333,0.0,0.0,254350.125,0.0,202500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100004,6829133.5,-867.0,0.0,-488.5,-532.5,0.0,0.0,94518.9,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100005,6735201.0,-190.666667,0.0,439.333333,-123.0,0.0,0.0,219042.0,189469.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
## Preproces previous_application
# Transform with one hot encoding
prevApplication = oneHotEncoding(previous_application)

# Compute number of previous applications by counting SK_ID_PREV
prevApplicationCnt = prevApplication[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
# Remove the SK_ID_PREV feature because the average of it is meaningless
del prevApplication['SK_ID_PREV']
# Group by mean
prevApplication = prevApplication.groupby('SK_ID_CURR').mean()
# Add back the number of previous applications
prevApplication['PREV_APPLICATION_CNT'] = prevApplicationCnt['SK_ID_PREV']

# Remove temporary variables

Unnamed: 0_level_0,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,HOUR_APPR_PROCESS_START,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,...,PRODUCT_COMBINATION_Cash X-Sell: middle,PRODUCT_COMBINATION_POS household with interest,PRODUCT_COMBINATION_POS household without interest,PRODUCT_COMBINATION_POS industry with interest,PRODUCT_COMBINATION_POS industry without interest,PRODUCT_COMBINATION_POS mobile with interest,PRODUCT_COMBINATION_POS mobile without interest,PRODUCT_COMBINATION_POS other with interest,PRODUCT_COMBINATION_POS others without interest,PREV_APPLICATION_CNT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,3951.0,24835.5,23787.0,2520.0,24835.5,13.0,1.0,0.104326,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
100002,9251.775,179055.0,179055.0,0.0,179055.0,9.0,1.0,0.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
100003,56553.99,435436.5,484191.0,3442.5,435436.5,14.666667,1.0,0.05003,,,...,0.0,0.333333,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,3
100004,5357.25,24282.0,20106.0,4860.0,24282.0,5.0,1.0,0.212008,,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
100005,4813.2,22308.75,20076.75,4464.0,44617.5,10.5,1.0,0.108964,,,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,2


In [10]:
#prevApplication.groupby('SK_ID_CURR').mean().head(5)
prevApplication[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()

Unnamed: 0_level_0,SK_ID_PREV
SK_ID_CURR,Unnamed: 1_level_1
100001,1
100002,1
100003,3
100004,1
100005,2
100006,9
100007,6
100008,5
100009,7
100010,1


In [101]:
## Pre-process data
# Get all data columns
dataCols = application_train.columns

# categoricalize
categoricalFeatures = [col for col in dataCols if data[col].dtype == 'object']

for column in categorical_columns:
    application_train[column] = application_train[column].astype('category')
    application_test[column] = application_test[column].astype('category')

In [135]:
ignoredFields = ['TARGET']
# Assemble into I/O dataset format
#   X - All fields other than 'TARGET'
#   Y - 'TARGET' fields
trainDataMask = [col for col in dataCols if col not in ignoredFields]
x = application_train[trainDataMask]
y = application_train['TARGET']

# Split training data randomly using train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, 
    test_size = 0.25,
    random_state = randSeed
)

# Create lgb dataset
lgb_train = lgb.Dataset(data=x_train, label=y_train)
lgb_test = lgb.Dataset(data=x_test, label=y_test)

In [165]:
params = {
    'task': 'train',
    'device' : 'cpu',
    'nthread': 4,            # [CPU] number of OpenMP threads
    'gpu_use_dp' : 'false',  # [GPU] set to 1 to enable 64bit float point
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_iteration': 1000,
    'num_leaves': 32,
    'metric': 'auc',
    'reg_alpha': 5,
    'reg_lambda': 10,
    'learning_rate': 0.05,
    'max_bin': 256,
    'max_depth' : 10,
    'min_data_in_leaf': 32,
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 5,
    'subsample_for_bin': 200,
    'subsample': 1,
    'subsample_freq': 1,
    'colsample_bytree': 0.6,
    'verbose': 0
}

gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round = 50,
    valid_sets = lgb_test,
    early_stopping_rounds = 50,
    verbose_eval = 50
)


Found `num_iteration` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds.
[50]	valid_0's auc: 0.744373
[100]	valid_0's auc: 0.75339
[150]	valid_0's auc: 0.759146
[200]	valid_0's auc: 0.76165
[250]	valid_0's auc: 0.762997
[300]	valid_0's auc: 0.763662
[350]	valid_0's auc: 0.763995
[400]	valid_0's auc: 0.764199
[450]	valid_0's auc: 0.764599
Early stopping, best iteration is:
[468]	valid_0's auc: 0.764712
