In [1]:
# Notebook setup: imports plus display / plotting configuration.
import warnings
# NOTE(review): this silences *every* warning for the whole session, including
# pandas/numpy deprecation and dtype warnings — consider narrowing the filter.
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

# Let pandas render up to 100 rows before truncating a displayed frame.
pd.options.display.max_rows = 100
# Enable plotly rendering inside the notebook.
# NOTE(review): connected=True presumably pulls plotly.js from a CDN, so the
# figures need network access to render — confirm for offline use.
init_notebook_mode(connected=True)
# Render matplotlib figures inline in the cell output.
%matplotlib inline

# Load Dataset

In [2]:
# Load the train and test CSV files from the local ./data directory.
train = pd.read_csv('./data/train_clean.csv')
test = pd.read_csv('./data/test_clean.csv')

# DataFrame.info() writes its summary straight to stdout and returns None, so
# it is called directly; wrapping it in print() would echo a stray "None".
print('Train:')
train.info(verbose=False)
print()  # blank line between the two summaries

print('Test:')
test.info(verbose=False)

Train:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 722142 entries, 0 to 722141
Columns: 68 entries, loan_amnt to credit_length
dtypes: float64(16), int64(41), object(11)
memory usage: 374.6+ MB
None 

Test:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98727 entries, 0 to 98726
Columns: 68 entries, loan_amnt to credit_length
dtypes: float64(16), int64(41), object(11)
memory usage: 51.2+ MB
None


# Basic Data Information

In [5]:
# Class balance check — assumes `target` is a 0/1 indicator (TODO confirm).
# target1 / target0 are the per-class totals; the ratio is class 0 over class 1.
n_rows = len(train)
target1 = train['target'].sum()
target0 = (1 - train['target']).sum()

for label, value in (
    ('Target 1:\t', target1 / n_rows),
    ('Target 0:\t', target0 / n_rows),
    ('Ratio:\t\t', target0 / target1),
):
    print(label, np.round(value, 4))

Target 1:	 0.2384
Target 0:	 0.7616
Ratio:		 3.1938


In [6]:
# Bar chart of the two class totals (target0 vs. target1), one colour per bar.
data = [
    go.Bar(
        x=['status 0', 'status 1'],
        y=[target0, target1],
        marker={'color': ['rgba(55, 128, 191, 0.7)', 'rgba(219, 64, 82, 0.7)']},
    )
]

# Fixed-size figure (autosize disabled) with a title and labelled axes.
layout = go.Layout(
    title='Loan Status Count Distribution',
    xaxis={'title': 'Target'},
    yaxis={'title': 'Count'},
    autosize=False,
    width=800,
    height=500,
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

# Visualization

In [7]:
# Column names grouped by how they are treated in the visualisations below:
# categorical, discrete and continuous.
cat_features = [
    'term', 'home_ownership', 'verification_status', 'purpose',
    'title', 'addr_state', 'initial_list_status', 'application_type',
    'hardship_flag', 'debt_settlement_flag',
]

dis_features = [
    'grade', 'sub_grade', 'emp_length', 'delinq_2yrs',
    'inq_last_6mths', 'open_acc', 'pub_rec', 'total_acc',
    'acc_now_delinq', 'acc_open_past_24mths', 'chargeoff_within_12_mths', 'mort_acc',
    'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats',
    'num_bc_tl', 'num_il_tl', 'num_op_rev_tl', 'num_rev_accts',
    'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd',
    'num_tl_90g_dpd_24m', 'num_tl_op_past_12m', 'pub_rec_bankruptcies', 'tax_liens',
]

con_features = [
    'loan_amnt', 'int_rate', 'installment', 'annual_inc',
    'dti', 'revol_bal', 'revol_util', 'collections_12_mths_ex_med',
    'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim', 'avg_cur_bal',
    'bc_open_to_buy', 'bc_util', 'delinq_amnt', 'mo_sin_old_il_acct',
    'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mths_since_recent_bc',
    'mths_since_recent_inq', 'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'tot_hi_cred_lim',
    'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit', 'credit_length',
]

# Full feature list, in categorical -> discrete -> continuous order.
features = [*cat_features, *dis_features, *con_features]

# Report how many columns fall into each group, plus the overall total.
for label, group in (
    ('Categorical:\t', cat_features),
    ('Discrete:\t', dis_features),
    ('Continuous:\t', con_features),
    ('Total:\t\t', features),
):
    print(label, len(group))

Categorical:	 10
Discrete:	 28
Continuous:	 28
Total:		 66


### 1. Categorical Variables

### 2. Discrete Variables

### 3. Continuous Variables