In [None]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

from plotly.offline import init_notebook_mode, iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly import tools

from utils import discrete_plot
from utils import numerical_plot

# Notebook-wide configuration.
# Suppress every warning category for the remainder of the session.
warnings.simplefilter('ignore')
# Let pandas display up to 100 rows before truncating output.
pd.options.display.max_rows = 100
# Enable plotly's offline rendering inside the notebook.
init_notebook_mode(connected=True)
# Render matplotlib figures inline in cell output.
%matplotlib inline

# Load Dataset

In [None]:
# Load the train and test CSV files and print a compact summary of each frame.
train = pd.read_csv('./data/train_clean.csv')
test = pd.read_csv('./data/test_clean.csv')

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None".
print('Train:')
train.info(verbose=False)
print()  # blank line between the two summaries
print('Test:')
test.info(verbose=False)

# Basic Data Information

In [None]:
# Class balance of the `target` column: count of 1s, count of 0s,
# each class's share of the training rows, and the 0-to-1 ratio.
target_col = train['target']
target1 = target_col.sum()
target0 = (1 - target_col).sum()
n_rows = len(train)

for label, count in (('Target 0:\t', target0), ('Target 1:\t', target1)):
    print(label, count, '\t', np.round(count / n_rows, 4))
print('0/1 Ratio:\t', np.round(target0 / target1, 4))

In [None]:
# Bar chart of the two target counts (target0 / target1 computed earlier).
bar_specs = [('status 0', target0, 'Status 0'),
             ('status 1', target1, 'Status 1')]
data = [go.Bar(x=[tick], y=[count], name=trace_name)
        for tick, count, trace_name in bar_specs]

# Figure chrome: fixed 700x400 canvas, horizontal legend anchored at y=-0.2.
legend = dict(orientation='h', xanchor='auto', y=-0.2)
margin = go.layout.Margin(l=50, r=50, b=30, t=40, pad=4)

layout = go.Layout(
    title='Loan Status Count Plot',
    xaxis=dict(title='Loan Status'),
    yaxis=dict(title='Count'),
    autosize=False,
    width=700,
    height=400,
    margin=margin,
    legend=legend,
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

# Visualization

In [None]:
# Column names grouped by kind: categorical first, then numerical.
cat_features = [
    'term', 'home_ownership', 'verification_status', 'purpose', 'title',
    'addr_state', 'initial_list_status', 'application_type', 'grade',
    'sub_grade',
]

num_features = [
    'loan_amnt', 'loan_to_inc', 'int_rate', 'installment_ratio',
    'emp_length', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths',
    'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
    'collections_12_mths_ex_med', 'acc_now_delinq', 'tot_coll_amt',
    'tot_cur_bal', 'total_rev_hi_lim', 'acc_open_past_24mths',
    'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
    'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
    'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
    'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_inq',
    'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
    'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
    'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats',
    'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',
    'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75',
    'pub_rec_bankruptcies', 'tax_liens', 'tot_hi_cred_lim',
    'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit',
    'credit_length',
]

# Combined list, categorical columns followed by numerical ones.
features = [*cat_features, *num_features]

# Report how many columns fall in each group.
for label, columns in (('Categorical feature:\t', cat_features),
                       ('Numerical feature:\t', num_features),
                       ('Total feature:\t\t', features)):
    print(label, len(columns))

### 2. Numerical Variables

In [None]:
# loan_amnt: show the utils.numerical_plot figure for this column (log=False).
feature = 'loan_amnt'
plot_options = dict(hist_bins=40, scatter_bins=100, log=False, w=1000, h=450)
iplot(numerical_plot(train, feature, **plot_options))

In [None]:
# loan_to_inc: show the utils.numerical_plot figure for this column (log=True).
feature = 'loan_to_inc'
plot_options = dict(hist_bins=40, scatter_bins=100, log=True, w=1000, h=450)
iplot(numerical_plot(train, feature, **plot_options))

In [None]:
# int_rate: show the utils.numerical_plot figure for this column (log=False).
feature = 'int_rate'
plot_options = dict(hist_bins=40, scatter_bins=100, log=False, w=1000, h=450)
iplot(numerical_plot(train, feature, **plot_options))

In [None]:
# installment_ratio: show the utils.numerical_plot figure for this column (log=False).
feature = 'installment_ratio'
plot_options = dict(hist_bins=40, scatter_bins=100, log=False, w=1000, h=450)
iplot(numerical_plot(train, feature, **plot_options))

In [None]:
# annual_inc: show the utils.numerical_plot figure for this column (log=True).
feature = 'annual_inc'
plot_options = dict(hist_bins=40, scatter_bins=100, log=True, w=1000, h=450)
iplot(numerical_plot(train, feature, **plot_options))

In [None]:
# dti: show the utils.numerical_plot figure for this column (log=True).
feature = 'dti'
plot_options = dict(hist_bins=40, scatter_bins=100, log=True, w=1000, h=450)
iplot(numerical_plot(train, feature, **plot_options))

In [None]:
# open_acc: show the utils.numerical_plot figure for this column (log=False).
feature = 'open_acc'
plot_options = dict(hist_bins=40, scatter_bins=100, log=False, w=1000, h=450)
iplot(numerical_plot(train, feature, **plot_options))

In [None]:
# revol_bal: show the utils.numerical_plot figure for this column (log=True).
feature = 'revol_bal'
plot_options = dict(hist_bins=40, scatter_bins=100, log=True, w=1000, h=450)
iplot(numerical_plot(train, feature, **plot_options))

In [None]:
# revol_util: show the utils.numerical_plot figure for this column (log=False).
feature = 'revol_util'
plot_options = dict(hist_bins=40, scatter_bins=100, log=False, w=1000, h=450)
iplot(numerical_plot(train, feature, **plot_options))

In [None]:
# total_acc
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'total_acc'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# tot_coll_amt
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'tot_coll_amt'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# tot_cur_bal
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'tot_cur_bal'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# total_rev_hi_lim
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'total_rev_hi_lim'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# avg_cur_bal
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'avg_cur_bal'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# bc_open_to_buy
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'bc_open_to_buy'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# bc_util
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'bc_util'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# delinq_amnt
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'delinq_amnt'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# mo_sin_old_il_acct
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'mo_sin_old_il_acct'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# mo_sin_old_rev_tl_op
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'mo_sin_old_rev_tl_op'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# mo_sin_rcnt_rev_tl_op
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'mo_sin_rcnt_rev_tl_op'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# mo_sin_rcnt_tl
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'mo_sin_rcnt_tl'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# mths_since_recent_bc
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'mths_since_recent_bc'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# pct_tl_nvr_dlq
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'pct_tl_nvr_dlq'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# percent_bc_gt_75
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'percent_bc_gt_75'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# tot_hi_cred_lim
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'tot_hi_cred_lim'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# total_bal_ex_mort
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'total_bal_ex_mort'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# total_bc_limit
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'total_bc_limit'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))

In [None]:
# total_il_high_credit_limit
# Plot from the `train` DataFrame: at this point `data` is the list of go.Bar
# traces built for the target-count chart, not the dataset.
feature = 'total_il_high_credit_limit'
iplot(numerical_plot(train, feature, hist_bins=40, scatter_bins=100, log=False, w=1000, h=450))