In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
from utils import discrete_plot

warnings.simplefilter('ignore')
pd.options.display.max_rows = 100
init_notebook_mode(connected=True)
%matplotlib inline

# Load Dataset

In [2]:
# Read the cleaned train / test splits from the local ./data directory.
train = pd.read_csv('./data/train_clean.csv')
test = pd.read_csv('./data/test_clean.csv')

# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() would also print "None".
print('Train:')
train.info(verbose=False)
print()
print('Test:')
test.info(verbose=False)

Train:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 722142 entries, 0 to 722141
Columns: 67 entries, loan_amnt to loan_to_inc
dtypes: float64(17), int64(41), object(9)
memory usage: 369.1+ MB
None 

Test:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98727 entries, 0 to 98726
Columns: 67 entries, loan_amnt to loan_to_inc
dtypes: float64(17), int64(41), object(9)
memory usage: 50.5+ MB
None


# Basic Data Information

In [3]:
# Class balance of the target column (the dataset is imbalanced)
labels = train['target']
n_rows = len(train)

target1 = labels.sum()          # rows with target == 1
target0 = (1 - labels).sum()    # rows with target == 0 (assumes a 0/1 encoded target)

# absolute count and share of each class, then the 0-to-1 ratio
print('Target 0:\t', target0, '\t', np.round(target0 / n_rows, 4))
print('Target 1:\t', target1, '\t', np.round(target1 / n_rows, 4))
print('0/1 Ratio:\t', np.round(target0 / target1, 4))

Target 0:	 549951 	 0.7616
Target 1:	 172191 	 0.2384
0/1 Ratio:	 3.1938


In [4]:
# Bar chart of the number of loans in each target class
data = [
    go.Bar(x=['status 0'], y=[target0], name='Status 0'),
    go.Bar(x=['status 1'], y=[target1], name='Status 1'),
]

margin = go.layout.Margin(l=50, r=50, b=30, t=40, pad=4)
legend = {'orientation': 'v', 'xanchor': 'auto'}

# fixed-size figure (autosize disabled) so the plot renders at 700x400
layout = go.Layout(
    title='Loan Status Count Plot',
    xaxis={'title': 'Loan Status'},
    yaxis={'title': 'Count'},
    autosize=False,
    width=700,
    height=400,
    margin=margin,
    legend=legend,
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

# Visualization

In [5]:
# categorical feature names (order is kept as-is; it determines the order in `features`)
cat_features = [
    'term',
    'home_ownership',
    'verification_status',
    'purpose',
    'title',
    'addr_state',
    'initial_list_status',
    'application_type',
    'grade',
    'sub_grade',
]

# numerical feature names (order is kept as-is; it determines the order in `features`)
num_features = [
    'loan_amnt', 'loan_to_inc', 'int_rate', 'installment_ratio',
    'emp_length', 'annual_inc', 'dti',
    'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec',
    'revol_bal', 'revol_util', 'total_acc',
    'collections_12_mths_ex_med', 'acc_now_delinq',
    'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim',
    'acc_open_past_24mths', 'avg_cur_bal',
    'bc_open_to_buy', 'bc_util',
    'chargeoff_within_12_mths', 'delinq_amnt',
    'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op',
    'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
    'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_inq',
    'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
    'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
    'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats',
    'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',
    'num_tl_op_past_12m',
    'pct_tl_nvr_dlq', 'percent_bc_gt_75',
    'pub_rec_bankruptcies', 'tax_liens',
    'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit',
    'total_il_high_credit_limit', 'credit_length',
]

# full feature list: categorical names first, numerical names after
features = cat_features + num_features

# report how many features fall into each group
for label, group in (('Categorical feature:\t', cat_features),
                     ('Numerical feature:\t', num_features),
                     ('Total feature:\t\t', features)):
    print(label, len(group))

Categorical feature:	 10
Numerical feature:	 55
Total feature:		 65


# Categorical Features

In [6]:
# term: figure built by discrete_plot (imported from utils) at 1000x450, rendered with iplot
feature = 'term'
fig = discrete_plot(data=train, feature=feature, width=1000, height=450)
iplot(fig)

In [7]:
# home_ownership: figure built by discrete_plot (imported from utils) at 1000x450, rendered with iplot
feature = 'home_ownership'
fig = discrete_plot(data=train, feature=feature, width=1000, height=450)
iplot(fig)

In [8]:
# verification_status: figure built by discrete_plot (imported from utils) at 1000x450, rendered with iplot
feature = 'verification_status'
fig = discrete_plot(data=train, feature=feature, width=1000, height=450)
iplot(fig)

In [9]:
# purpose: figure built by discrete_plot (imported from utils) at 1000x600, rendered with iplot
feature = 'purpose'
fig = discrete_plot(data=train, feature=feature, width=1000, height=600)
iplot(fig)

In [10]:
# title: figure built by discrete_plot (imported from utils) at 1000x600, rendered with iplot
feature = 'title'
fig = discrete_plot(data=train, feature=feature, width=1000, height=600)
iplot(fig)

In [11]:
# addr_state: number of loans per state (count of non-null `target` values in each group),
# ordered from the most to the fewest loans
state_count = train.groupby('addr_state')['target'].count().reset_index()
state_count = state_count.sort_values(by='target', ascending=False)

# visualization
# light-to-dark purple colour scale, given as [position, colour] stops
scl = [[0.0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'], 
       [0.4, 'rgb(188,189,220)'], [0.6, 'rgb(158,154,200)'],
       [0.8, 'rgb(117,107,177)'], [1.0, 'rgb(84,39,143)']]

# one choropleth trace coloured by the per-state count;
# assumes addr_state holds two-letter US state codes, as 'USA-states' expects - TODO confirm
data = [dict(type='choropleth', colorscale=scl, autocolorscale=False,
        locations=state_count['addr_state'], z=state_count['target'],
        locationmode='USA-states', colorbar=dict(title='Counts'), 
        marker=dict(line=dict(color = 'rgb(255,255,255)', width=2)))]

geo = dict(scope='usa', projection=dict(type='albers usa'), 
           showlakes=True, lakecolor='rgb(255, 255, 255)')

# go.layout.Margin is used instead of the deprecated go.Margin alias, matching
# how the margin of the loan-status bar chart is built earlier in this notebook
layout = dict(title='Loan Count Distribution by State', geo=geo, 
              margin=go.layout.Margin(l=50, r=50, b=50, t=40, pad=4), 
              width=1000, height=600)

fig = dict(data=data, layout=layout)
iplot(fig)

In [12]:
# addr_state: mean of `target` per state (the default rate when target is 0/1 encoded),
# ordered from the highest to the lowest rate
state_rate = train.groupby('addr_state')['target'].mean().reset_index()
state_rate = state_rate.sort_values(by='target', ascending=False)

# visualization
# light-to-dark purple colour scale, given as [position, colour] stops
scl = [[0.0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'], 
       [0.4, 'rgb(188,189,220)'], [0.6, 'rgb(158,154,200)'],
       [0.8, 'rgb(117,107,177)'], [1.0, 'rgb(84,39,143)']]

# one choropleth trace coloured by the per-state mean of target;
# assumes addr_state holds two-letter US state codes, as 'USA-states' expects - TODO confirm
data = [dict(type='choropleth', colorscale=scl, autocolorscale=False,
        locations=state_rate['addr_state'], z=state_rate['target'],
        locationmode='USA-states', colorbar=dict(title='Default Rate'),
        marker=dict(line=dict(color = 'rgb(255,255,255)', width=2)))]

geo = dict(scope='usa', projection=dict(type='albers usa'), 
           showlakes=True, lakecolor='rgb(255, 255, 255)')

# go.layout.Margin is used instead of the deprecated go.Margin alias, matching
# how the margin of the loan-status bar chart is built earlier in this notebook
layout = dict(title='Loan Default Rate Distribution by State', geo=geo, 
              margin=go.layout.Margin(l=50, r=50, b=50, t=40, pad=4), 
              width=1000, height=600)

fig = dict(data=data, layout=layout)
iplot(fig)

In [13]:
# initial_list_status: figure built by discrete_plot (imported from utils) at 1000x450, rendered with iplot
feature = 'initial_list_status'
fig = discrete_plot(data=train, feature=feature, width=1000, height=450)
iplot(fig)

In [14]:
# application_type: figure built by discrete_plot (imported from utils) at 1000x450, rendered with iplot
feature = 'application_type'
fig = discrete_plot(data=train, feature=feature, width=1000, height=450)
iplot(fig)

In [15]:
# grade: figure built by discrete_plot (imported from utils) at 1000x450, rendered with iplot
feature = 'grade'
fig = discrete_plot(data=train, feature=feature, width=1000, height=450)
iplot(fig)

In [16]:
# sub_grade: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'sub_grade'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

# Discrete Features

In [17]:
# emp_length: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'emp_length'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [18]:
# delinq_2yrs: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'delinq_2yrs'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [19]:
# inq_last_6mths: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'inq_last_6mths'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [20]:
# pub_rec: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'pub_rec'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [21]:
# collections_12_mths_ex_med: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'collections_12_mths_ex_med'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [22]:
# acc_now_delinq: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'acc_now_delinq'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [23]:
# acc_open_past_24mths: figure built by discrete_plot (imported from utils) at 1000x600, rendered with iplot
feature = 'acc_open_past_24mths'
fig = discrete_plot(data=train, feature=feature, width=1000, height=600)
iplot(fig)

In [24]:
# chargeoff_within_12_mths: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'chargeoff_within_12_mths'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [25]:
# mort_acc: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'mort_acc'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [26]:
# mths_since_recent_inq: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'mths_since_recent_inq'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [27]:
# num_accts_ever_120_pd: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'num_accts_ever_120_pd'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [28]:
# num_actv_bc_tl: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'num_actv_bc_tl'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [29]:
# num_actv_rev_tl: figure built by discrete_plot (imported from utils) at 1000x600, rendered with iplot
feature = 'num_actv_rev_tl'
fig = discrete_plot(data=train, feature=feature, width=1000, height=600)
iplot(fig)

In [30]:
# num_bc_sats: figure built by discrete_plot (imported from utils) at 1000x600, rendered with iplot
feature = 'num_bc_sats'
fig = discrete_plot(data=train, feature=feature, width=1000, height=600)
iplot(fig)

In [31]:
# num_bc_tl: figure built by discrete_plot (imported from utils) at 1000x600, rendered with iplot
feature = 'num_bc_tl'
fig = discrete_plot(data=train, feature=feature, width=1000, height=600)
iplot(fig)

In [32]:
# num_il_tl: figure built by discrete_plot (imported from utils) at 1000x700, rendered with iplot
feature = 'num_il_tl'
fig = discrete_plot(data=train, feature=feature, width=1000, height=700)
iplot(fig)

In [33]:
# num_op_rev_tl: figure built by discrete_plot (imported from utils) at 1000x600, rendered with iplot
feature = 'num_op_rev_tl'
fig = discrete_plot(data=train, feature=feature, width=1000, height=600)
iplot(fig)

In [34]:
# num_rev_accts: figure built by discrete_plot (imported from utils) at 1000x600, rendered with iplot
feature = 'num_rev_accts'
fig = discrete_plot(data=train, feature=feature, width=1000, height=600)
iplot(fig)

In [35]:
# num_rev_tl_bal_gt_0: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'num_rev_tl_bal_gt_0'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [36]:
# num_sats: figure built by discrete_plot (imported from utils) at 1000x600, rendered with iplot
feature = 'num_sats'
fig = discrete_plot(data=train, feature=feature, width=1000, height=600)
iplot(fig)

In [37]:
# num_tl_120dpd_2m: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'num_tl_120dpd_2m'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [38]:
# num_tl_30dpd: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'num_tl_30dpd'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [39]:
# num_tl_90g_dpd_24m: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'num_tl_90g_dpd_24m'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [40]:
# num_tl_op_past_12m: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'num_tl_op_past_12m'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [41]:
# pub_rec_bankruptcies: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'pub_rec_bankruptcies'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [42]:
# tax_liens: figure built by discrete_plot (imported from utils) at 1000x500, rendered with iplot
feature = 'tax_liens'
fig = discrete_plot(data=train, feature=feature, width=1000, height=500)
iplot(fig)

In [43]:
# credit_length: figure built by discrete_plot (imported from utils) at 1000x600, rendered with iplot
feature = 'credit_length'
fig = discrete_plot(data=train, feature=feature, width=1000, height=600)
iplot(fig)