In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import warnings
# Install an "ignore" filter with no category or message restriction, so
# warnings raised after this point are not displayed in cell output.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# libraries used below — consider narrowing it to a specific category.
warnings.filterwarnings("ignore")

In [3]:
# Load the first 50,000 data rows of the loan file.
# Replace the path with the correct path for your data.
y2015 = pd.read_csv(
    'LoanStats3d.csv',
    skipinitialspace=True,
    header=1,      # column labels are taken from row index 1, not row 0
    nrows=50000,
)

# Convert ID and Interest Rate to numeric. The interest rate is stored as text
# with a '%' sign, which is stripped first; any value that still cannot be
# parsed becomes NaN because of errors='coerce'.
y2015['id'] = pd.to_numeric(y2015['id'], errors='coerce')
y2015['int_rate'] = pd.to_numeric(y2015['int_rate'].str.strip('%'), errors='coerce')

# Drop other columns with many unique values.
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally (`drop(labels, 1)`) is deprecated and is rejected by pandas 2.x.
# Reassigning instead of `inplace=True` keeps the step explicit.
y2015 = y2015.drop(columns=[
    'url', 'emp_title', 'zip_code', 'earliest_cr_line', 'revol_util',
    'sub_grade', 'addr_state', 'desc',
])

In [4]:
# Peek at the first five rows of the cleaned frame.
y2015.head(n=5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,emp_length,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,68009401,72868139,16000,16000,16000,60 months,14.85,379.39,C,10+ years,...,0,2,78.9,0.0,0,2,298100,31329,281300,13400
1,68354783,73244544,9600,9600,9600,36 months,7.49,298.58,A,8 years,...,0,2,100.0,66.7,0,0,88635,55387,12500,75635
2,68466916,73356753,25000,25000,25000,36 months,7.49,777.55,A,10+ years,...,0,0,100.0,20.0,0,0,373572,68056,38400,82117
3,68466961,73356799,28000,28000,28000,36 months,6.49,858.05,A,10+ years,...,0,0,91.7,22.2,0,0,304003,74920,41500,42503
4,68495092,73384866,8650,8650,8650,36 months,19.89,320.99,E,8 years,...,0,12,100.0,50.0,1,0,38998,18926,2750,18248


In [5]:
# Preview the one-hot encoded frame; the result is only displayed, not stored.
pd.get_dummies(data=y2015)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,delinq_2yrs,...,last_credit_pull_d_May-2016,last_credit_pull_d_Nov-2015,last_credit_pull_d_Nov-2016,last_credit_pull_d_Oct-2016,last_credit_pull_d_Sep-2016,application_type_INDIVIDUAL,application_type_JOINT,verification_status_joint_Not Verified,verification_status_joint_Source Verified,verification_status_joint_Verified
0,68009401,72868139,16000,16000,16000,14.85,379.39,48000.0,33.18,0,...,0,0,0,0,0,1,0,0,0,0
1,68354783,73244544,9600,9600,9600,7.49,298.58,60000.0,22.44,0,...,0,0,0,0,0,1,0,0,0,0
2,68466916,73356753,25000,25000,25000,7.49,777.55,109000.0,26.02,0,...,0,0,0,0,0,1,0,0,0,0
3,68466961,73356799,28000,28000,28000,6.49,858.05,92000.0,21.60,0,...,0,0,0,0,0,1,0,0,0,0
4,68495092,73384866,8650,8650,8650,19.89,320.99,55000.0,25.49,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,65674101,70306867,20000,20000,20000,12.05,664.77,80000.0,14.39,0,...,0,0,0,0,0,1,0,0,0,0
49996,65344445,69931230,8400,8400,8400,6.89,258.95,88600.0,2.60,0,...,0,0,0,0,0,1,0,0,0,0
49997,65654000,70286779,4500,4500,4500,9.99,145.19,75000.0,6.29,1,...,0,0,0,0,0,1,0,0,0,0
49998,65833004,70465761,4450,4450,4400,16.55,157.66,94300.0,38.22,0,...,0,0,0,0,0,1,0,0,0,0


In [7]:

from sklearn import ensemble
from sklearn.model_selection import cross_val_score

# Fixed seed so the cross-validation scores are reproducible between runs.
rfc = ensemble.RandomForestClassifier(random_state=42)

# Features are every column except the target. `columns=` replaces the
# positional axis argument (`drop('loan_status', 1)`), which pandas 2.x rejects.
X = y2015.drop(columns='loan_status')
Y = y2015['loan_status']

# One-hot encode the remaining text columns, then discard every column that
# still contains a missing value.
X = pd.get_dummies(X)
X = X.dropna(axis=1)

# Baseline: 5-fold cross-validated score on the full encoded feature set.
cross_val_score(rfc, X, Y, cv=5)

array([0.98210358, 0.98110189, 0.98140186, 0.9803    , 0.96978792])

In [8]:
from sklearn.decomposition import PCA

# Project the encoded features onto 10 principal components.
# random_state pins the result when the automatically selected SVD solver is a
# randomized one, so X_pca is reproducible between runs.
# NOTE(review): X is not standardized before PCA, so the components will be
# dominated by the columns with the largest variance — confirm this is intended.
pca = PCA(n_components=10, random_state=42)

# fit_transform fits the model and returns the projected data in one call.
X_pca = pca.fit_transform(X)

In [9]:
# Re-evaluate a fresh forest on the 10 PCA components with 10-fold CV.
# Fixed seed so the scores are reproducible between runs.
rfc = ensemble.RandomForestClassifier(random_state=42)
cross_val_score(rfc, X_pca, Y, cv=10)

array([0.93502599, 0.93342663, 0.93702519, 0.93881224, 0.94181164,
       0.9386    , 0.9389878 , 0.93838768, 0.93558712, 0.93893894])

In [10]:
# Columns removed from the feature set before refitting.
# NOTE(review): judging by their names these are payment/outstanding-principal
# fields that presumably reflect the loan's outcome — confirm that leakage is
# the reason they are excluded.
columns = ['out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
           'total_rec_prncp', 'total_rec_int', 'last_pymnt_amnt']

# drop() returns a new frame, so X itself is left untouched.
X_trim = X.drop(columns=columns)

# Rerun model with reduced data set (10-fold CV). Fixed seed so the scores are
# reproducible between runs.
rfc = ensemble.RandomForestClassifier(random_state=42)
cross_val_score(rfc, X_trim, Y, cv=10)

array([0.94922031, 0.95141943, 0.95461815, 0.95740852, 0.9560088 ,
       0.9572    , 0.95759152, 0.95519104, 0.9559912 , 0.95475475])