In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use plotly offline for fancy plots
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
# use cufflinks to bind plotly to pandas
import cufflinks as cf 
from os import listdir
# for display control
from IPython.display import display
# Gradient boosting using LightBGM
import lightgbm as lgb
from sklearn.model_selection import train_test_split
# Garbage collection
import gc
gc.enable()
# Lock pseudo-number seed
randSeed = 1
np.random.seed(randSeed)

In [2]:
# Global verbose control
PREVIEW_DATASET = 0

In [3]:
# Utility function to transform all catagorical fields using one hot ending
def oneHotEncoding(df):
    # Get list categorical features
    catFeatures = [col for col in df.columns if df[col].dtype == 'object']
    # Convert to one hot encoding
    ohe = pd.get_dummies(df, columns=catFeatures)
    return ohe

# Utility function to add descriptive statistics as secondary fields in the dataframe
def addStatsFields(df, field):
    df['MEAN_' + field] = (
        df[['SK_ID_CURR', field]]
            .groupby('SK_ID_CURR')
            .mean()[field]
    )
    df['MEDIAN_' + field] = (
        df[['SK_ID_CURR', field]]
            .groupby('SK_ID_CURR')
            .median()[field]
    )
    df['MAX_' + field] = (
        df[['SK_ID_CURR', field]]
            .groupby('SK_ID_CURR')
            .max()[field]
    )
    df['MIN_' + field] = (
        df[['SK_ID_CURR', field]]
            .groupby('SK_ID_CURR')
            .min()[field]
    )
    df['SUM_' + field] = (
        df[['SK_ID_CURR', field]]
            .groupby('SK_ID_CURR')
            .sum()[field]
    )
    df['VAR_' + field] = (
        df[['SK_ID_CURR', field]]
            .groupby('SK_ID_CURR')
            .var()[field]
    )
    return df

In [4]:
# Load all data
dataFiles = listdir("../data/home-credit-default-risk/")
for filename in dataFiles:
    print(f'loading {filename} ...')
    if '.csv.zip' in filename:
        # compressed data file
        locals()[filename.rstrip('.csv.zip')] = pd.read_csv(
            f'../data/home-credit-default-risk/{filename}',
            compression='zip', 
            header=0, 
            sep=',', 
            quotechar='"'
        )

dataTrain = oneHotEncoding(application_train)
dataTest = oneHotEncoding(application_test)
print('done')

loading application_train.csv.zip ...
loading previous_application.csv.zip ...
loading lowpass_submission_v1.csv ...
loading bureau.csv.zip ...
loading credit_card_balance.csv.zip ...
loading application_test.csv.zip ...
loading installments_payments.csv.zip ...
loading bureau_balance.csv.zip ...
loading POS_CASH_balance.csv.zip ...
loading HomeCredit_columns_description.csv ...
loading sample_submission.csv.zip ...
done


In [5]:
## Preprocess bureau datasets
if PREVIEW_DATASET:
    print('Raw bureau_balance dataset')
    display(bureau_balance.head(5))

# Count by status
bureauBalance = bureau_balance.groupby('SK_ID_BUREAU')['STATUS'].value_counts(normalize = False)
# Pivot into table of status values
bureauBalance = bureauBalance.unstack('STATUS')
# Add months balance data as new fileds
bureauBalance['MONTHS_COUNT'] = (
    bureau_balance
        .groupby('SK_ID_BUREAU') 
        .MONTHS_BALANCE          
        .size()
)
bureauBalance['MONTHS_MAX'] = (
    bureau_balance
        .groupby('SK_ID_BUREAU')
        .MONTHS_BALANCE
        .max()
)
bureauBalance['MONTHS_MIN'] = (
    bureau_balance
        .groupby('SK_ID_BUREAU')
        .MONTHS_BALANCE
        .min()
)
if PREVIEW_DATASET:
    print('Formatted')
    display(bureauBalance.head(5))

# Finally, merge the two bureau table together 
bureauData = bureau.join(bureauBalance, how='left', on='SK_ID_BUREAU')

# Transform features
bureauData = oneHotEncoding(bureauData).groupby('SK_ID_CURR').mean()
if PREVIEW_DATASET:
    print('Merged and transformed')
    display(bureauData.head(5))

# Merge features into main training dataset
featureCntBefore = len(dataTrain.keys()) - 1
dataTrain = dataTrain.merge(
    right = bureauData,
    how = 'left',
    on = 'SK_ID_CURR'
)
dataTest = dataTest.merge(
    right = bureauData,
    how = 'left',
    on = 'SK_ID_CURR'
)
featureCntAfter = len(dataTrain.keys()) - 1
newFeatureCnt = featureCntAfter - featureCntBefore
# Show stats
print(f'bureau datasets processed, {newFeatureCnt} new features added')

# Remove temporary variables and clean up memory
del bureauBalance
del bureauData
del bureau_balance
del bureau
gc.collect();

bureau datasets processed, 47 new features added


In [6]:
## Preproces previous_application
# Transform with one hot encoding
prevApplication = oneHotEncoding(previous_application)

# Compute number of previous applications by counting SK_ID_PREV
prevApplicationCnt = (
    prevApplication[['SK_ID_CURR', 'SK_ID_PREV']]
        .groupby('SK_ID_CURR')
        .count()
)
# Remove the SK_ID_PREV feature because the average of it is meaningless
del prevApplication['SK_ID_PREV']
# Group by mean
prevApplication = prevApplication.groupby('SK_ID_CURR').mean()
# Add back the number of previous applications
prevApplication['PREV_APPLICATION_CNT'] = prevApplicationCnt['SK_ID_PREV']

# Display merged dataset
if PREVIEW_DATASET:
    print('Merged dataset')
    display(prevApplication.head(5))

# Merge features into main training dataset
featureCntBefore = len(dataTrain.keys()) - 1
dataTrain = dataTrain.merge(
    right = prevApplication,
    how = 'left',
    on = 'SK_ID_CURR'
)
dataTest = dataTest.merge(
    right = prevApplication,
    how = 'left',
    on = 'SK_ID_CURR'
)
featureCntAfter = len(dataTrain.keys()) - 1
newFeatureCnt = featureCntAfter - featureCntBefore
# Show stats
print(f'previous_application datasets processed, {newFeatureCnt} new features added')

# Remove temporary variables and clean up memory
del prevApplicationCnt
del prevApplication
del previous_application
gc.collect();

previous_application datasets processed, 163 new features added


In [7]:
## Preproces POS_CASH_balance
# Transform with one hot encoding
posCashBal = oneHotEncoding(POS_CASH_balance)

# Add some secondary features
posCashBal = addStatsFields(posCashBal, 'SK_DPD')
posCashBal = addStatsFields(posCashBal, 'SK_DPD_DEF')
posCashBal = addStatsFields(posCashBal, 'CNT_INSTALMENT_FUTURE')
posCashBal = addStatsFields(posCashBal, 'CNT_INSTALMENT')

# Group by UserID
del posCashBal['SK_ID_PREV']
posCashBal = posCashBal.groupby('SK_ID_CURR').mean()

# Display merged dataset
if PREVIEW_DATASET:
    print('Processed dataset')
    display(posCashBal.head())
    
# Merge features into main training dataset
featureCntBefore = len(dataTrain.keys()) - 1
dataTrain = dataTrain.merge(
    right = posCashBal,
    how = 'left',
    on = 'SK_ID_CURR'
)
dataTest = dataTest.merge(
    right = posCashBal,
    how = 'left',
    on = 'SK_ID_CURR'
)
featureCntAfter = len(dataTrain.keys()) - 1
newFeatureCnt = featureCntAfter - featureCntBefore
# Show stats
print(f'POS_CASH_balance dataset processed, {newFeatureCnt} new features added')

# Remove temporary variables
del posCashBal
del POS_CASH_balance
gc.collect();    

POS_CASH_balance dataset processed, 38 new features added


In [8]:
## Preproces credit_card_balance
# Transform with one hot encoding
# Rename fields that have already used in POS_CASH_balance
creditCardBal = credit_card_balance.rename(
    index = str, 
    columns = {
        'NAME_CONTRACT_STATUS' : 'NAME_CONTRACT_STATUS_CREDIT',
        'SK_DPD' : 'SK_DPD_CREDIT',
        'SK_DPD_DEF' : 'SK_DPD_DEFCREDIT'
    }
)
creditCardBal = oneHotEncoding(creditCardBal)

# Add some secondary features
creditCardBal = addStatsFields(creditCardBal, 'AMT_BALANCE')
creditCardBal = addStatsFields(creditCardBal, 'AMT_CREDIT_LIMIT_ACTUAL')
creditCardBal = addStatsFields(creditCardBal, 'AMT_DRAWINGS_ATM_CURRENT')
creditCardBal = addStatsFields(creditCardBal, 'AMT_DRAWINGS_CURRENT')
creditCardBal = addStatsFields(creditCardBal, 'AMT_DRAWINGS_OTHER_CURRENT')
creditCardBal = addStatsFields(creditCardBal, 'AMT_DRAWINGS_POS_CURRENT')
creditCardBal = addStatsFields(creditCardBal, 'AMT_INST_MIN_REGULARITY')
creditCardBal = addStatsFields(creditCardBal, 'AMT_PAYMENT_CURRENT')
creditCardBal = addStatsFields(creditCardBal, 'AMT_PAYMENT_TOTAL_CURRENT')
creditCardBal = addStatsFields(creditCardBal, 'AMT_RECEIVABLE_PRINCIPAL')
creditCardBal = addStatsFields(creditCardBal, 'AMT_RECIVABLE')
creditCardBal = addStatsFields(creditCardBal, 'AMT_TOTAL_RECEIVABLE')
creditCardBal = addStatsFields(creditCardBal, 'CNT_DRAWINGS_ATM_CURRENT')
creditCardBal = addStatsFields(creditCardBal, 'CNT_DRAWINGS_CURRENT')
creditCardBal = addStatsFields(creditCardBal, 'CNT_DRAWINGS_OTHER_CURRENT')
creditCardBal = addStatsFields(creditCardBal, 'CNT_DRAWINGS_POS_CURRENT')
creditCardBal = addStatsFields(creditCardBal, 'CNT_INSTALMENT_MATURE_CUM')

# Group by UserID
del creditCardBal['SK_ID_PREV']
creditCardBal = creditCardBal.groupby('SK_ID_CURR').mean()

# Display merged dataset
if PREVIEW_DATASET:
    print('Processed dataset')
    display(creditCardBal.head())
    
# Merge features into main training dataset
featureCntBefore = len(dataTrain.keys()) - 1
dataTrain = dataTrain.merge(
    right = creditCardBal,
    how = 'left',
    on = 'SK_ID_CURR'
)
dataTest = dataTest.merge(
    right = creditCardBal,
    how = 'left',
    on = 'SK_ID_CURR'
)
featureCntAfter = len(dataTrain.keys()) - 1
newFeatureCnt = featureCntAfter - featureCntBefore
# Show stats
print(f'credit_card_balance dataset processed, {newFeatureCnt} new features added')

# Remove temporary variables
del creditCardBal
del credit_card_balance
gc.collect();

POS_CASH_balance dataset processed, 129 new features added


In [29]:
colDictMean = {}
colDictMin = {}
colDictMax = {}
colDictMedian = {}
colDictVar = {}
for col in installments_payment.columns:
    if col not in ['SK_ID_CURR', 'SK_ID_PREV']:
        colDictMean[col] = 'MEAN_' + col
        colDictMin[col] = 'MIN_' + col
        colDictMax[col] = 'MAX_' + col
        colDictMedian[col] = 'MEDIAN_' + col
        colDictVar[col] = 'VAR_' + col

# Add mean
installPayment = installments_payment.groupby('SK_ID_CURR').mean()
del installPayment['SK_ID_PREV']
installPayment = installPayment.rename(
    index = str, 
    columns = colDictMean
)
# Add min
installPaymentMin = installments_payment.groupby('SK_ID_CURR').min()
del installPaymentMin['SK_ID_PREV']
installPaymentMin = installPaymentMin.rename(
    index = str, 
    columns = colDictMin
)
installPayment = installPayment.merge(
    right = installPaymentMin,
    how = 'left',
    on = 'SK_ID_CURR'
)
del installPaymentMin
# Add max
installPaymentMax = installments_payment.groupby('SK_ID_CURR').max()
del installPaymentMax['SK_ID_PREV']
installPaymentMax = installPaymentMax.rename(
    index = str, 
    columns = colDictMax
)
installPayment = installPayment.merge(
    right = installPaymentMax,
    how = 'left',
    on = 'SK_ID_CURR'
)
del installPaymentMax
# Add median
installPaymentMedian = installments_payment.groupby('SK_ID_CURR').median()
del installPaymentMedian['SK_ID_PREV']
installPaymentMedian = installPaymentMedian.rename(
    index = str, 
    columns = colDictMedian
)
installPayment = installPayment.merge(
    right = installPaymentMedian,
    how = 'left',
    on = 'SK_ID_CURR'
)
del installPaymentMedian
# Add variance
installPaymentVar = installments_payment.groupby('SK_ID_CURR').var()
del installPaymentVar['SK_ID_PREV']
installPaymentVar = installPaymentVar.rename(
    index = str, 
    columns = colDictVar
)
installPayment = installPayment.merge(
    right = installPaymentVar,
    how = 'left',
    on = 'SK_ID_CURR'
)
del installPaymentVar

# Merge features into main training dataset
featureCntBefore = len(dataTrain.keys()) - 1
dataTrain = dataTrain.merge(
    right = installPayment,
    how = 'left',
    on = 'SK_ID_CURR'
)
dataTest = dataTest.merge(
    right = installPayment,
    how = 'left',
    on = 'SK_ID_CURR'
)
featureCntAfter = len(dataTrain.keys()) - 1
newFeatureCnt = featureCntAfter - featureCntBefore
# Show stats
print(f'installments_payment dataset processed, {newFeatureCnt} new features added')

# Remove temporary variables
del installPayment
del installments_payment
gc.collect();

ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

In [42]:
installments_payment.groupby('SK_ID_CURR').var()

Unnamed: 0_level_0,SK_ID_PREV,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100001,6.645846e+10,0.142857,1.238095,4.277026e+05,4.146127e+05,2.577265e+07,2.577265e+07
100002,0.000000e+00,0.052632,31.666667,2.850000e+04,2.960426e+04,1.011641e+08,1.011641e+08
100003,1.027132e+11,0.040000,9.826667,5.737359e+05,5.735418e+05,1.221966e+10,1.221966e+10
100004,0.000000e+00,0.333333,1.000000,9.000000e+02,1.157333e+03,9.071372e+06,9.071372e+06
100005,0.000000e+00,0.111111,7.500000,6.750000e+03,8.200028e+03,1.832709e+07,1.832709e+07
100006,4.017569e+09,0.116667,8.395833,3.181700e+04,3.922212e+04,2.825681e+10,2.825681e+10
100007,6.411097e+10,0.141026,16.751748,4.652980e+05,4.619340e+05,6.166821e+07,6.477128e+07
100008,2.209274e+11,0.028571,7.937815,9.619617e+05,9.569290e+05,4.989257e+09,5.004286e+09
100009,2.557923e+11,0.000000,7.972549,6.279699e+05,6.262159e+05,9.411493e+06,9.411493e+06
100010,0.000000e+00,0.000000,9.166667,8.250000e+03,9.268544e+03,2.016968e+03,2.016968e+03


In [10]:
## Preproces installments_payments
# Display merged dataset
if PREVIEW_DATASET:
    print('Processed dataset')
    display(installPayment.head())
    
# Merge features into main training dataset
featureCntBefore = len(dataTrain.keys()) - 1
dataTrain = dataTrain.merge(
    right = installPayment,
    how = 'left',
    on = 'SK_ID_CURR'
)
dataTest = dataTest.merge(
    right = installPayment,
    how = 'left',
    on = 'SK_ID_CURR'
)
featureCntAfter = len(dataTrain.keys()) - 1
newFeatureCnt = featureCntAfter - featureCntBefore
# Show stats
print(f'POS_CASH_balance dataset processed, {newFeatureCnt} new features added')

# Remove temporary variables
del installPayment
del installments_payment
gc.collect();

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.360,6948.360
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.000,25425.000
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.130,24350.130
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.040,2160.585
5,1137312,164489,1.0,12,-1384.0,-1417.0,5970.375,5970.375
6,2234264,184693,4.0,11,-349.0,-352.0,29432.295,29432.295
7,1818599,111420,2.0,4,-968.0,-994.0,17862.165,17862.165
8,2723183,112102,0.0,14,-197.0,-197.0,70.740,70.740
9,1413990,109741,1.0,4,-570.0,-609.0,14308.470,14308.470


In [11]:
## Preproces POS_CASH_balance
# Transform with one hot encoding
#posCashBal = oneHotEncoding(POS_CASH_balance)
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
#display(POS_CASH_balance.head(5))
#oneHotEncoding(POS_CASH_balance).head(5)
#POS_CASH_balance['NAME_CONTRACT_STATUS'].astype(str)
posCashBal = POS_CASH_balance
le = LabelEncoder()
posCashBal.NAME_CONTRACT_STATUS = le.fit_transform(posCashBal['NAME_CONTRACT_STATUS'].astype(str))
posCashBal['NUM_UNIQUE_STATUS'] = (
    posCashBal[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']]
        .groupby('SK_ID_CURR')
        .nunique()
        .NAME_CONTRACT_STATUS
)
posCashBal['NUM_MAX_STATUS'] = (
    posCashBal[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']]
        .groupby('SK_ID_CURR')
        .max()
        .NAME_CONTRACT_STATUS
)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF,NUM_UNIQUE_STATUS,NUM_MAX_STATUS
0,1803195,182943,-31,48.0,45.0,0,0,0,,
1,1715348,367990,-33,36.0,35.0,0,0,0,,
2,1784872,397406,-32,12.0,9.0,0,0,0,,
3,1903291,269225,-35,48.0,42.0,0,0,0,,
4,2341044,334279,-35,36.0,35.0,0,0,0,,


In [41]:
ignoredFields = ['TARGET']
# Assemble into I/O dataset format
#   X - All fields other than 'TARGET'
#   Y - 'TARGET' fields
dataCols = dataTrain.columns
trainDataMask = [col for col in dataCols if col not in ignoredFields]
x = application_train[trainDataMask]
y = application_train['TARGET']

# Split training data randomly using train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, 
    test_size = 0.10,
    random_state = randSeed
)

# Create lgb dataset
lgb_train = lgb.Dataset(data=x_train, label=y_train)
lgb_test = lgb.Dataset(data=x_test, label=y_test)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-41-215170bfd797>", line 5, in <module>
    trainDataMask = [col for col in dataCols if col not in ignoredFields]
NameError: name 'dataCols' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 1863, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'NameError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/IPython/core/ultratb.py", line 1095, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_

NameError: name 'dataCols' is not defined

In [165]:
params = {
    'task': 'train',
    'device' : 'cpu',
    'nthread': 8,            # [CPU] number of OpenMP threads
    'gpu_use_dp' : 'false',  # [GPU] set to 1 to enable 64bit float point
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_iteration': 1000,
    'num_leaves': 32,
    'metric': 'auc',
    'reg_alpha': 5,
    'reg_lambda': 10,
    'learning_rate': 0.05,
    'max_bin': 256,
    'max_depth' : 10,
    'min_data_in_leaf': 32,
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 5,
    'subsample_for_bin': 200,
    'subsample': 1,
    'subsample_freq': 1,
    'colsample_bytree': 0.6,
    'verbose': 0
}

gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round = 50,
    valid_sets = lgb_test,
    early_stopping_rounds = 50,
    verbose_eval = 50
)


Found `num_iteration` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds.
[50]	valid_0's auc: 0.744373
[100]	valid_0's auc: 0.75339
[150]	valid_0's auc: 0.759146
[200]	valid_0's auc: 0.76165
[250]	valid_0's auc: 0.762997
[300]	valid_0's auc: 0.763662
[350]	valid_0's auc: 0.763995
[400]	valid_0's auc: 0.764199
[450]	valid_0's auc: 0.764599
Early stopping, best iteration is:
[468]	valid_0's auc: 0.764712
