In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use plotly in offline (notebook-embedded) mode for interactive plots
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
# Use cufflinks to bind plotly to pandas
import cufflinks as cf 
import os
from os import listdir
# For display control (rich rendering of DataFrames from inside a cell)
from IPython.display import display
# Gradient boosting (LightGBM)
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
# LabelEncoder: the author notes it seemed to yield better results
# NOTE(review): the cells visible here use one-hot encoding (pd.get_dummies) instead
from sklearn.preprocessing import LabelEncoder
# For parallel multi-processing
from multiprocessing import Pool, cpu_count, Array
# Garbage collection
import gc
gc.enable()
# Fix the NumPy pseudo-random number seed so runs are repeatable
randSeed = 1
np.random.seed(randSeed)

In [26]:
# Global verbosity / feature switches (truthy = enabled)
# PREVIEW_DATASET gates the `if PREVIEW_DATASET:` preview print/display calls
PREVIEW_DATASET = 1
# NOTE(review): ADD_STATS_FEATURES is not referenced in the cells visible here;
# presumably it toggles extra statistical features further down - confirm.
ADD_STATS_FEATURES = 1

In [69]:
# Load all data
# Every `<name>.csv.zip` file in dataPath is read into a DataFrame and bound
# as a notebook-level variable called `<name>`; other files are skipped.
dataPath = '../data/home-credit-default-risk/'
zipSuffix = '.csv.zip'
dataFiles = listdir(dataPath)
for filename in dataFiles:
    print(f'loading {filename} ...')
    if filename.endswith(zipSuffix):
        # compressed data file
        # Slice the suffix off explicitly: str.rstrip() treats its argument as
        # a SET of characters, so 'installments_payments.csv.zip' used to be
        # bound as 'installments_payment' (the trailing 's' was stripped too).
        tableName = filename[:-len(zipSuffix)]
        table = pd.read_csv(
            os.path.join(dataPath, filename),
            compression='zip',
            header=0,
            sep=',',
            quotechar='"'
        )
        globals()[tableName] = table
        # Backward compatibility: keep the old (mis-derived) name as an alias
        # to the same DataFrame in case later cells still refer to it.
        legacyName = filename.rstrip(zipSuffix)
        if legacyName != tableName:
            globals()[legacyName] = table
    else:
        print('---> Not loaded!')

# Get output label and remove it from feature list
dataTrain = application_train
dataTest = application_test
y = dataTrain['TARGET']

# Transform using One Hot Encoding
# (using only the training dataset features as reference)
catFeatures = [
    col
    for col in dataTrain.columns
    if dataTrain[col].dtype == 'object'
]
# Encode train and test together so both end up with the same dummy columns,
# then split them back apart by row position.
numTrainRows = dataTrain.shape[0]
ohe = pd.concat([dataTrain, dataTest], sort=False)
ohe = pd.get_dummies(ohe, columns=catFeatures)
# .drop() returns new frames, so nothing is deleted from a slice of `ohe`.
dataTrain = ohe.iloc[:numTrainRows, :].drop(columns='TARGET')
dataTest = ohe.iloc[numTrainRows:, :].drop(columns='TARGET')

# Summarize dataset
# NOTE(review): the "- 1" presumably excludes the SK_ID_CURR key column from
# the feature count - confirm.
for datasetLabel, dataset in (('Training', dataTrain), ('Testing', dataTest)):
    featureCnt = len(dataset.keys()) - 1
    numSamples = len(dataset)
    print(f'{datasetLabel} dataset has {numSamples} samples, and {featureCnt} features')

print('done')

loading bureau.csv.zip ...
loading bureau_balance.csv.zip ...
loading previous_application.csv.zip ...
loading sample_submission.csv.zip ...
loading HomeCredit_columns_description.csv ...
---> Not loaded!
loading POS_CASH_balance.csv.zip ...
loading credit_card_balance.csv.zip ...
loading installments_payments.csv.zip ...
loading application_train.csv.zip ...
loading application_test.csv.zip ...
Training dataset has 307511 samples, and 244 features
Testing dataset has 48744 samples, and 244 features
done


In [70]:
# Utility function to transform all categorical fields using one-hot encoding
def oneHotEncoding(df):
    """Return a copy of `df` with every object-dtype column one-hot encoded.

    Columns whose dtype compares equal to 'object' are expanded into dummy
    columns by `pd.get_dummies`; all other columns pass through unchanged.
    The input frame itself is not modified.
    """
    # Collect the names of the object-dtype (categorical) columns
    objectColumns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            objectColumns.append(name)
    # Expand only those columns into indicator columns
    return pd.get_dummies(df, columns=objectColumns)

In [71]:
## Preprocess bureau datasets
# Aggregates bureau_balance per SK_ID_BUREAU, joins it onto bureau, averages
# everything per SK_ID_CURR and left-merges the result into dataTrain/dataTest.
# NOTE: this cell reassigns dataTrain/dataTest, so running it twice on the same
# kernel merges the bureau columns a second time (pandas then adds _x/_y
# suffixes to the overlapping columns). Re-run the loading cell first.
if PREVIEW_DATASET:
    print('Raw bureau_balance dataset')
    display(bureau_balance.head(5))

# Count how often each STATUS value occurs per bureau credit, one column per
# status value, prefixed with 'STATUS_' to avoid name conflicts.
# NOTE(review): a status that never occurs for a credit is NaN here, not 0, and
# the later mean() skips NaN - confirm that is the intended feature semantics.
grouped_bureau_balance = bureau_balance.groupby('SK_ID_BUREAU')
bureauBalance = (
    grouped_bureau_balance
        .STATUS
        .value_counts(normalize=False)
        .unstack('STATUS')
        .add_prefix('STATUS_')
)

# Add months balance data as new features.
# String aggregator names are used instead of the builtins len/max/min:
# 'size' is the row count per group (same as len) without a Python-level call
# per group, and passing builtin reducers to agg is deprecated in recent pandas.
# NOTE(review): 'MOMTHS_COUNT' / 'MOMTHS_MAX' are misspelt ("MONTHS"), but they
# are feature names that later cells or saved models may rely on, so they are
# kept unchanged here.
monthsBalanceStats = (
    grouped_bureau_balance
        .MONTHS_BALANCE
        .agg(['size', 'max', 'min'])
        .rename(columns={'size': 'MOMTHS_COUNT',
                         'max': 'MOMTHS_MAX',
                         'min': 'MONTHS_MIN'})
)
bureauBalance = bureauBalance.join(monthsBalanceStats)

if PREVIEW_DATASET:
    print('Formatted')
    display(bureauBalance.head(5))

# Finally, merge the two bureau tables together
bureauData = bureau.join(bureauBalance, how='left', on='SK_ID_BUREAU')

# Transform features: one-hot encode, average per applicant (SK_ID_CURR), and
# add the number of bureau records per applicant as CNT_BURO.
bureauData = (
    oneHotEncoding(bureauData)
        .groupby('SK_ID_CURR')
        .mean()
        .assign(CNT_BURO=
                bureau
                    .groupby('SK_ID_CURR')
                    .SK_ID_BUREAU
                    .count()
               )
        # The averaged bureau id carries no meaning, so drop it
        .drop('SK_ID_BUREAU', axis=1)
)

if PREVIEW_DATASET:
    print('Merged and transformed')
    display(bureauData.head(5))

# Merge features into main training and testing datasets
featureCntBefore = len(dataTrain.keys()) - 1
# SK_ID_CURR is the index of bureauData; expose it as a column once and reuse
# the same frame for both merges.
bureauFeatures = bureauData.reset_index()
dataTrain = dataTrain.merge(
    right=bureauFeatures,
    how='left',
    on='SK_ID_CURR'
)
dataTest = dataTest.merge(
    right=bureauFeatures,
    how='left',
    on='SK_ID_CURR'
)
featureCntAfter = len(dataTrain.keys()) - 1
newFeatureCnt = featureCntAfter - featureCntBefore
# Show stats
print(f'bureau datasets processed, {newFeatureCnt} new features added')

# The intermediate frames (bureauBalance, bureauData, bureau_balance, bureau)
# are deliberately left bound; only a garbage-collection pass is run here.
gc.collect();

Raw bureau_balance dataset


Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


Formatted


Unnamed: 0_level_0,STATUS_0,STATUS_1,STATUS_2,STATUS_3,STATUS_4,STATUS_5,STATUS_C,STATUS_X,MOMTHS_COUNT,MOMTHS_MAX,MONTHS_MIN
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5001709,,,,,,,86.0,11.0,97,0,-96
5001710,5.0,,,,,,48.0,30.0,83,0,-82
5001711,3.0,,,,,,,1.0,4,0,-3
5001712,10.0,,,,,,9.0,,19,0,-18
5001713,,,,,,,,22.0,22,0,-21


Merged and transformed


Unnamed: 0_level_0,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,...,CREDIT_TYPE_Loan for business development,CREDIT_TYPE_Loan for purchase of shares (margin lending),CREDIT_TYPE_Loan for the purchase of equipment,CREDIT_TYPE_Loan for working capital replenishment,CREDIT_TYPE_Microloan,CREDIT_TYPE_Mobile operator loan,CREDIT_TYPE_Mortgage,CREDIT_TYPE_Real estate loan,CREDIT_TYPE_Unknown type of loan,CNT_BURO
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,-735.0,0.0,82.428571,-825.5,,0.0,207623.571429,85240.928571,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
100002,-874.0,0.0,-349.0,-697.5,1681.029,0.0,108131.945625,49156.2,7997.14125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8
100003,-1400.75,0.0,-544.5,-1097.333333,0.0,0.0,254350.125,0.0,202500.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
100004,-867.0,0.0,-488.5,-532.5,0.0,0.0,94518.9,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
100005,-190.666667,0.0,439.333333,-123.0,0.0,0.0,219042.0,189469.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


bureau datasets processed, 47 new features added
