In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use plotly offline for fancy plots
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
# use cufflinks to bind plotly to pandas
import cufflinks as cf

# Initialise plotly's notebook mode so that iplot() can render figures inline.
init_notebook_mode(connected=True)
# Switch cufflinks to offline mode too. DataFrame.iplot() is used further down
# and, unless offline mode is already set in the cufflinks config file, it
# would otherwise go through the online plotly service and ask for credentials.
cf.go_offline()

In [5]:
def loadZippedCSV(path):
    """Read a zip-compressed CSV file into a pandas DataFrame.

    The file is parsed as comma-separated text with double-quote quoting,
    and its first row is used as the column header.

    Parameters
    ----------
    path : path of the ``.zip`` archive, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame with the parsed contents.
    """
    frame = pd.read_csv(
        path,
        sep=',',
        quotechar='"',
        header=0,
        compression='zip',
    )
    return frame

# Read every raw table from its zipped CSV in the working directory.
# The archives are read one after another, in the order listed, and each
# resulting DataFrame is bound to the notebook-level name in the same position.
(
    application_train,
    POS_CASH_balance,
    bureau_balance,
    previous_application,
    installments_payments,
    bureau,
    credit_card_balance,
    application_test,
) = map(loadZippedCSV, (
    './application_train.csv.zip',
    './POS_CASH_balance.csv.zip',
    './bureau_balance.csv.zip',
    './previous_application.csv.zip',
    './installments_payments.csv.zip',
    './bureau.csv.zip',
    './credit_card_balance.csv.zip',
    './application_test.csv.zip',
))

In [11]:
# Bundle and save all raw tables into a single HDF5 file to allow faster loading.
# The store is used as a context manager so the file is closed even when one
# of the writes raises; a failed run therefore does not leave ./rawData.h5 open.
with pd.HDFStore('./rawData.h5') as rawDataStore:
    # Each frame is written under the '/rawData' group using the 'table' format.
    rawDataStore.put('/rawData/application_train', application_train, format='table')
    rawDataStore.put('/rawData/POS_CASH_balance', POS_CASH_balance, format='table')
    rawDataStore.put('/rawData/bureau_balance', bureau_balance, format='table')
    rawDataStore.put('/rawData/previous_application', previous_application, format='table')
    rawDataStore.put('/rawData/installments_payments', installments_payments, format='table')
    rawDataStore.put('/rawData/bureau', bureau, format='table')
    rawDataStore.put('/rawData/credit_card_balance', credit_card_balance, format='table')
    rawDataStore.put('/rawData/application_test', application_test, format='table')

In [21]:
# Load data from the HDF5 store into the workspace.
# The store is opened read-only (nothing is written here) and as a context
# manager, so the file handle is released once every table has been read
# instead of staying open for the rest of the kernel session.
with pd.HDFStore('./rawData.h5', mode='r') as rawDataStore:
    for key in rawDataStore.keys():
        print('loading {0} ...'.format(key))
        # The last path component of the key ('/rawData/<name>') becomes the
        # notebook-level variable name that the DataFrame is bound to.
        globals()[key.split('/')[-1]] = rawDataStore[key]
print('all data loaded')

loading /rawData/POS_CASH_balance ...
loading /rawData/application_test ...
loading /rawData/application_train ...
loading /rawData/bureau ...
loading /rawData/bureau_balance ...
loading /rawData/credit_card_balance ...
loading /rawData/installments_payments ...
loading /rawData/previous_application ...


In [40]:
# Preview the first three rows of the training table.
application_train.head(n=3)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Number of training rows per TARGET value.
targetCnt = application_train.loc[:, "TARGET"].value_counts()

# Small frame for the pie chart: one row per TARGET value, with the value
# cast to bool for the slice label and its row count as the slice size.
df = pd.DataFrame(dict(
    labels=targetCnt.index.astype('bool'),
    values=targetCnt.values,
))

# iplot is the plotting method cufflinks attaches to DataFrames.
df.iplot(
    kind='pie',
    labels='labels',
    values='values',
    title='Has Payment Difficulties (Target)',
)

In [64]:
# Value counts of the car / realty ownership flags, split by TARGET.
# Names ending in "T" cover the rows with TARGET == 1; the others cover
# the rows with TARGET == 0.
ownCarCntT = application_train["FLAG_OWN_CAR"][application_train["TARGET"] == 1].value_counts()
ownCarCnt = application_train["FLAG_OWN_CAR"][application_train["TARGET"] == 0].value_counts()
ownRealtyCntT = application_train["FLAG_OWN_REALTY"][application_train["TARGET"] == 1].value_counts()
ownRealtyCnt = application_train["FLAG_OWN_REALTY"][application_train["TARGET"] == 0].value_counts()

def _ownedPercent(counts):
    """Return the percentage of 'Y' among the 'Y'/'N' entries of a value_counts Series.

    A missing 'Y' or 'N' label counts as zero. If both are absent the result
    is NaN instead of a division error.
    """
    yes = counts.get('Y', 0)
    no = counts.get('N', 0)
    total = yes + no
    return 100.0 * yes / total if total else float('nan')


# One bar per ownership type, one trace per TARGET group.
# Each value is scaled to percent individually: multiplying the *list* by 100
# would repeat it 100 times (200 y values for 2 x categories) rather than
# scale the numbers, which contradicts the "Percentage (%)" axis.
trace1 = go.Bar(
    x = ['Own Car', 'Own Realty'],
    y = [_ownedPercent(ownCarCntT), _ownedPercent(ownRealtyCntT)],
    name='Target'
)
trace2 = go.Bar(
    x = ['Own Car', 'Own Realty'],
    y = [_ownedPercent(ownCarCnt), _ownedPercent(ownRealtyCnt)],
    name='Non-target'
)

# Combine both traces into one figure and render it inline.
data = [trace1, trace2]

layout = go.Layout(
    title="Target vs. non-Target in car and realty ownership",
    width=800,
    xaxis={
        'title': 'Ownership Type',
        'tickfont': {'size': 14},
    },
    yaxis={
        'title': 'Percentage (%)',
        'titlefont': {'size': 16},
        'tickfont': {'size': 14},
    },
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)