In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use plotly offline for fancy plots
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
# use cufflinks to bind plotly to pandas
import cufflinks as cf 
from os import listdir
# for display control
from IPython.display import display
# Gradient boosting using LightBGM
import lightgbm as lgb
from sklearn.model_selection import train_test_split
# Garbage collection
import gc
gc.enable()
# Lock pseudo-number seed
randSeed = 1
np.random.seed(randSeed)

In [2]:
# Global verbose control
PREVIEW_DATASET = 0

In [3]:
# Utility function to transform all catagorical fields using one hot ending
def oneHotEncoding(df):
    # Get list categorical features
    catFeatures = [col for col in df.columns if df[col].dtype == 'object']
    # Convert to one hot encoding
    ohe = pd.get_dummies(df, columns=catFeatures)
    return ohe

In [4]:
# Load all data
dataFiles = listdir("../data/home-credit-default-risk/")
for filename in dataFiles:
    print(f'loading {filename} ...')
    if '.csv.zip' in filename:
        # compressed data file
        locals()[filename.rstrip('.csv.zip')] = pd.read_csv(
            f'../data/home-credit-default-risk/{filename}',
            compression='zip', 
            header=0, 
            sep=',', 
            quotechar='"'
        )

dataTrain = oneHotEncoding(application_train)
dataTest = oneHotEncoding(application_test)
print('done')

loading application_train.csv.zip ...
loading previous_application.csv.zip ...
loading lowpass_submission_v1.csv ...
loading bureau.csv.zip ...
loading credit_card_balance.csv.zip ...
loading application_test.csv.zip ...
loading installments_payments.csv.zip ...
loading bureau_balance.csv.zip ...
loading POS_CASH_balance.csv.zip ...
loading HomeCredit_columns_description.csv ...
loading sample_submission.csv.zip ...
done


In [5]:
## Preprocess bureau datasets
if PREVIEW_DATASET:
    print('Raw bureau_balance dataset')
    display(bureau_balance.head(5))

# Count by status
bureauBalance = bureau_balance.groupby('SK_ID_BUREAU')['STATUS'].value_counts(normalize = False)
# Pivot into table of status values
bureauBalance = bureauBalance.unstack('STATUS')
# Add months balance data as new fileds
bureauBalance['MONTHS_COUNT'] = (
    bureau_balance
        .groupby('SK_ID_BUREAU') 
        .MONTHS_BALANCE          
        .size()
)
bureauBalance['MONTHS_MAX'] = (
    bureau_balance
        .groupby('SK_ID_BUREAU')
        .MONTHS_BALANCE
        .max()
)
bureauBalance['MONTHS_MIN'] = (
    bureau_balance
        .groupby('SK_ID_BUREAU')
        .MONTHS_BALANCE
        .min()
)
if PREVIEW_DATASET:
    print('Formatted')
    display(bureauBalance.head(5))

# Finally, merge the two bureau table together 
bureauData = bureau.join(bureauBalance, how='left', on='SK_ID_BUREAU')

# Transform features
bureauData = oneHotEncoding(bureauData).groupby('SK_ID_CURR').mean()
if PREVIEW_DATASET:
    print('Merged and transformed')
    display(bureauData.head(5))

# Merge features into main training dataset
featureCntBefore = len(dataTrain.keys()) - 1
dataTrain = dataTrain.merge(
    right = bureauData,
    how = 'left',
    on = 'SK_ID_CURR'
)
dataTest = dataTest.merge(
    right = bureauData,
    how = 'left',
    on = 'SK_ID_CURR'
)
featureCntAfter = len(dataTrain.keys()) - 1
newFeatureCnt = featureCntAfter - featureCntBefore
# Show stats
print(f'bureau datasets processed, {newFeatureCnt} new features added')

# Remove temporary variables and clean up memory
del bureauBalance
del bureauData
del bureau_balance
del bureau
gc.collect();

bureau datasets preprocessed, 47 new features added


In [6]:
## Preproces previous_application
# Transform with one hot encoding
prevApplication = oneHotEncoding(previous_application)

# Compute number of previous applications by counting SK_ID_PREV
prevApplicationCnt = (
    prevApplication[['SK_ID_CURR', 'SK_ID_PREV']]
        .groupby('SK_ID_CURR')
        .count()
)
# Remove the SK_ID_PREV feature because the average of it is meaningless
del prevApplication['SK_ID_PREV']
# Group by mean
prevApplication = prevApplication.groupby('SK_ID_CURR').mean()
# Add back the number of previous applications
prevApplication['PREV_APPLICATION_CNT'] = prevApplicationCnt['SK_ID_PREV']

# Display merged dataset
if PREVIEW_DATASET:
    print('Merged dataset')
    display(prevApplication.head(5))

# Merge features into main training dataset
featureCntBefore = len(dataTrain.keys()) - 1
dataTrain = dataTrain.merge(
    right = prevApplication,
    how = 'left',
    on = 'SK_ID_CURR'
)
dataTest = dataTest.merge(
    right = prevApplication,
    how = 'left',
    on = 'SK_ID_CURR'
)
featureCntAfter = len(dataTrain.keys()) - 1
newFeatureCnt = featureCntAfter - featureCntBefore
# Show stats
print(f'previous_application datasets processed, {newFeatureCnt} new features added')

# Remove temporary variables and clean up memory
del prevApplicationCnt
del prevApplication
del previous_application
gc.collect();

Unnamed: 0_level_0,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,HOUR_APPR_PROCESS_START,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,...,PRODUCT_COMBINATION_Cash X-Sell: middle,PRODUCT_COMBINATION_POS household with interest,PRODUCT_COMBINATION_POS household without interest,PRODUCT_COMBINATION_POS industry with interest,PRODUCT_COMBINATION_POS industry without interest,PRODUCT_COMBINATION_POS mobile with interest,PRODUCT_COMBINATION_POS mobile without interest,PRODUCT_COMBINATION_POS other with interest,PRODUCT_COMBINATION_POS others without interest,PREV_APPLICATION_CNT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,3951.0,24835.5,23787.0,2520.0,24835.5,13.0,1.0,0.104326,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
100002,9251.775,179055.0,179055.0,0.0,179055.0,9.0,1.0,0.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
100003,56553.99,435436.5,484191.0,3442.5,435436.5,14.666667,1.0,0.05003,,,...,0.0,0.333333,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,3
100004,5357.25,24282.0,20106.0,4860.0,24282.0,5.0,1.0,0.212008,,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
100005,4813.2,22308.75,20076.75,4464.0,44617.5,10.5,1.0,0.108964,,,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,2


previous_application datasets preprocessed, 163 new features added


In [15]:
## Preproces POS_CASH_balance
# Transform with one hot encoding
posCashBal = oneHotEncoding(POS_CASH_balance)

# Add some secondary features
posCashBal['MAX_DPD'] = (
    posCashBal[['SK_ID_CURR', 'SK_DPD']]
        .groupby('SK_ID_CURR')
        .max()
        .SK_DPD
)
posCashBal['MEAN_DPD'] = (
    posCashBal[['SK_ID_CURR', 'SK_DPD']]
        .groupby('SK_ID_CURR')
        .mean()
        .SK_DPD
)
posCashBal['MIN_DPD'] = (
    posCashBal[['SK_ID_CURR', 'SK_DPD']]
        .groupby('SK_ID_CURR')
        .min()
        .SK_DPD
)
posCashBal['MAX_DPD_DEF'] = (
    posCashBal[['SK_ID_CURR', 'SK_DPD_DEF']]
        .groupby('SK_ID_CURR')
        .max()
        .SK_DPD_DEF
)
posCashBal['MEAN_DPD_DEF'] = (
    posCashBal[['SK_ID_CURR', 'SK_DPD_DEF']]
        .groupby('SK_ID_CURR')
        .mean()
        .SK_DPD_DEF
)
posCashBal['MIN_DPD_DEF'] = (
    posCashBal[['SK_ID_CURR', 'SK_DPD_DEF']]
        .groupby('SK_ID_CURR')
        .min()
        .SK_DPD_DEF
)
posCashBal['MAX_CNT_INSTALMENT_FUTURE'] = (
    posCashBal[['SK_ID_CURR', 'CNT_INSTALMENT_FUTURE']]
        .groupby('SK_ID_CURR')
        .max()
        .CNT_INSTALMENT_FUTURE
)
posCashBal['MEAN_CNT_INSTALMENT_FUTURE'] = (
    posCashBal[['SK_ID_CURR', 'CNT_INSTALMENT_FUTURE']]
        .groupby('SK_ID_CURR')
        .mean()
        .CNT_INSTALMENT_FUTURE
)
posCashBal['MIN_CNT_INSTALMENT_FUTURE'] = (
    posCashBal[['SK_ID_CURR', 'CNT_INSTALMENT_FUTURE']]
        .groupby('SK_ID_CURR')
        .min()
        .CNT_INSTALMENT_FUTURE
)
posCashBal['MAX_CNT_INSTALMENT'] = (
    posCashBal[['SK_ID_CURR', 'CNT_INSTALMENT']]
        .groupby('SK_ID_CURR')
        .max()
        .CNT_INSTALMENT
)
posCashBal['MEAN_CNT_INSTALMENT'] = (
    posCashBal[['SK_ID_CURR', 'CNT_INSTALMENT']]
        .groupby('SK_ID_CURR')
        .mean()
        .CNT_INSTALMENT
)
posCashBal['MIN_CNT_INSTALMENT'] = (
    posCashBal[['SK_ID_CURR', 'CNT_INSTALMENT']]
        .groupby('SK_ID_CURR')
        .min()
        .CNT_INSTALMENT
)

# Group by UserID
posCashBal = posCashBal.groupby('SK_ID_CURR')
del posCashBal['SK_ID_PREV']

# Display merged dataset
if PREVIEW_DATASET:
    print('Processed dataset')
display(posCashBal.head())

# Merge features into main training dataset
featureCntBefore = len(dataTrain.keys()) - 1
dataTrain = dataTrain.merge(
    right = posCashBal,
    how = 'left',
    on = 'SK_ID_CURR'
)
dataTest = dataTest.merge(
    right = posCashBal,
    how = 'left',
    on = 'SK_ID_CURR'
)
featureCntAfter = len(dataTrain.keys()) - 1
newFeatureCnt = featureCntAfter - featureCntBefore
# Show stats
print(f'POS_CASH_balance dataset processed, {newFeatureCnt} new features added')

# Remove temporary variables
del posCashBal
del POS_CASH_balance
gc.collect();

MemoryError: 

In [16]:
gc.collect()

902

In [11]:
## Preproces POS_CASH_balance
# Transform with one hot encoding
#posCashBal = oneHotEncoding(POS_CASH_balance)
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
#display(POS_CASH_balance.head(5))
#oneHotEncoding(POS_CASH_balance).head(5)
#POS_CASH_balance['NAME_CONTRACT_STATUS'].astype(str)
posCashBal = POS_CASH_balance
le = LabelEncoder()
posCashBal.NAME_CONTRACT_STATUS = le.fit_transform(posCashBal['NAME_CONTRACT_STATUS'].astype(str))
posCashBal['NUM_UNIQUE_STATUS'] = (
    posCashBal[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']]
        .groupby('SK_ID_CURR')
        .nunique()
        .NAME_CONTRACT_STATUS
)
posCashBal['NUM_MAX_STATUS'] = (
    posCashBal[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']]
        .groupby('SK_ID_CURR')
        .max()
        .NAME_CONTRACT_STATUS
)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF,NUM_UNIQUE_STATUS,NUM_MAX_STATUS
0,1803195,182943,-31,48.0,45.0,0,0,0,,
1,1715348,367990,-33,36.0,35.0,0,0,0,,
2,1784872,397406,-32,12.0,9.0,0,0,0,,
3,1903291,269225,-35,48.0,42.0,0,0,0,,
4,2341044,334279,-35,36.0,35.0,0,0,0,,


In [20]:
(
    posCashBal[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']]
        .groupby('SK_ID_CURR')
        .max()
        .NAME_CONTRACT_STATUS
)

SK_ID_CURR
100001    4
100002    0
100003    4
100004    4
100005    7
100006    6
100007    7
100008    7
100009    4
100010    4
100011    4
100012    4
100013    7
100014    4
100015    4
100016    4
100017    4
100018    4
100019    4
100020    4
100021    4
100022    0
100023    4
100025    4
100026    4
100027    4
100028    4
100029    4
100030    4
100032    4
         ..
456225    4
456226    7
456227    7
456228    7
456230    4
456231    4
456232    7
456233    0
456234    4
456235    4
456236    7
456237    4
456238    7
456239    4
456240    4
456241    7
456242    4
456243    4
456244    4
456245    4
456246    4
456247    4
456248    4
456249    4
456250    4
456251    7
456252    4
456253    4
456254    0
456255    4
Name: NAME_CONTRACT_STATUS, Length: 337252, dtype: int64

In [101]:
## Pre-process data
# Get all data columns
dataCols = application_train.columns

# categoricalize
categoricalFeatures = [col for col in dataCols if data[col].dtype == 'object']

for column in categorical_columns:
    application_train[column] = application_train[column].astype('category')
    application_test[column] = application_test[column].astype('category')

In [41]:
ignoredFields = ['TARGET']
# Assemble into I/O dataset format
#   X - All fields other than 'TARGET'
#   Y - 'TARGET' fields
trainDataMask = [col for col in dataCols if col not in ignoredFields]
x = application_train[trainDataMask]
y = application_train['TARGET']

# Split training data randomly using train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, 
    test_size = 0.10,
    random_state = randSeed
)

# Create lgb dataset
lgb_train = lgb.Dataset(data=x_train, label=y_train)
lgb_test = lgb.Dataset(data=x_test, label=y_test)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-41-215170bfd797>", line 5, in <module>
    trainDataMask = [col for col in dataCols if col not in ignoredFields]
NameError: name 'dataCols' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 1863, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'NameError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/lib/python3.6/site-packages/IPython/core/ultratb.py", line 1095, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_

NameError: name 'dataCols' is not defined

In [165]:
params = {
    'task': 'train',
    'device' : 'cpu',
    'nthread': 8,            # [CPU] number of OpenMP threads
    'gpu_use_dp' : 'false',  # [GPU] set to 1 to enable 64bit float point
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_iteration': 1000,
    'num_leaves': 32,
    'metric': 'auc',
    'reg_alpha': 5,
    'reg_lambda': 10,
    'learning_rate': 0.05,
    'max_bin': 256,
    'max_depth' : 10,
    'min_data_in_leaf': 32,
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 5,
    'subsample_for_bin': 200,
    'subsample': 1,
    'subsample_freq': 1,
    'colsample_bytree': 0.6,
    'verbose': 0
}

gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round = 50,
    valid_sets = lgb_test,
    early_stopping_rounds = 50,
    verbose_eval = 50
)


Found `num_iteration` in params. Will use it instead of argument



Training until validation scores don't improve for 10 rounds.
[50]	valid_0's auc: 0.744373
[100]	valid_0's auc: 0.75339
[150]	valid_0's auc: 0.759146
[200]	valid_0's auc: 0.76165
[250]	valid_0's auc: 0.762997
[300]	valid_0's auc: 0.763662
[350]	valid_0's auc: 0.763995
[400]	valid_0's auc: 0.764199
[450]	valid_0's auc: 0.764599
Early stopping, best iteration is:
[468]	valid_0's auc: 0.764712
