In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

import warnings 
warnings.filterwarnings('ignore')

pd.set_option('max.column', None)

import plotly.express as px
import category_encoders as ce

In [27]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# import StandardScaler to perform scaling
from sklearn.preprocessing import StandardScaler 

# import various functions from sklearn
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from collections import Counter
from sklearn.feature_selection import RFE
from sklearn.model_selection import RandomizedSearchCV
# import the functions for visualizing the decision tree
# import pydotplus
# from IPython.display import Image  

In [28]:
#!pip install imbalanced-learn --user

In [29]:
from imblearn.over_sampling import SMOTE

In [30]:
# read dataset

df = pd.read_csv('Lending Club 2016_2018.csv')

In [31]:
df.head()

Unnamed: 0,id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,emp_length,home_ownership,annual_inc,verification_status,purpose,addr_state,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,collections_12_mths_ex_med,mths_since_last_major_derog,tot_coll_amt,tot_cur_bal,issue_d,chargeoff_within_12_mths,acc_now_delinq,mort_acc,mths_since_recent_bc_dlq,num_accts_ever_120_pd,pct_tl_nvr_dlq,pub_rec_bankruptcies,tot_hi_cred_lim,loan_status
0,130956066,3000.0,3000.0,3000.0,36 months,7.34,93.1,A,9 years,RENT,52000.0,Source Verified,major_purchase,WA,0.58,0.0,0.0,26.0,,7.0,0.0,141.0,0.5,30.0,w,0.0,,0.0,150592.0,Mar-2018,0.0,0.0,4.0,,0.0,96.7,0.0,191216.0,0
1,130968727,5000.0,5000.0,5000.0,36 months,11.98,166.03,B,10+ years,OWN,55000.0,Not Verified,other,GA,14.18,0.0,0.0,74.0,82.0,14.0,1.0,11449.0,33.9,24.0,w,0.0,74.0,0.0,28880.0,Mar-2018,0.0,0.0,0.0,,1.0,95.7,1.0,61551.0,0
2,130910225,7000.0,7000.0,7000.0,36 months,11.98,232.44,B,< 1 year,MORTGAGE,40000.0,Verified,home_improvement,TX,20.25,0.0,0.0,60.0,,13.0,0.0,5004.0,36.0,29.0,w,0.0,60.0,0.0,131726.0,Mar-2018,0.0,0.0,0.0,64.0,3.0,89.7,0.0,132817.0,0
3,130966492,30000.0,30000.0,30000.0,36 months,21.85,1143.39,D,10+ years,OWN,57000.0,Verified,debt_consolidation,FL,27.58,0.0,1.0,68.0,,11.0,0.0,29222.0,53.2,26.0,w,0.0,68.0,0.0,157566.0,Mar-2018,0.0,0.0,2.0,,1.0,96.0,0.0,188780.0,0
4,130942737,21000.0,21000.0,21000.0,60 months,20.39,560.94,D,10+ years,OWN,85000.0,Source Verified,house,NY,15.76,1.0,0.0,2.0,,15.0,0.0,14591.0,34.2,27.0,w,0.0,,0.0,128270.0,Mar-2018,0.0,0.0,3.0,,0.0,92.6,0.0,172433.0,0


In [32]:
df.shape

(518706, 39)

In [33]:
df_copy = df.copy()

In [34]:
df_copy.drop(['id','issue_d','funded_amnt','funded_amnt_inv'],axis=1,inplace=True)

In [35]:
for i in ['loan_amnt', 'int_rate',
       'installment', 'annual_inc', 'dti', 'inq_last_6mths',
       'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'revol_util', 'total_acc']:
    Q1 = df_copy[i].quantile(0.25)
    Q3 = df_copy[i].quantile(0.75)

    IQR = Q3-Q1
    df_temp = df_copy[(df_copy[i] > (Q1 - 1.5*IQR)) & (df_copy[i] < (Q3 + 1.5*IQR))]

In [36]:
for i in ['collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'tot_cur_bal', 'chargeoff_within_12_mths',
       'acc_now_delinq', 'mort_acc', 'mths_since_recent_bc_dlq',
       'num_accts_ever_120_pd','tot_hi_cred_lim']:
    Q1 = df_temp[i].quantile(0.25)
    Q3 = df_temp[i].quantile(0.75)

    IQR = Q3-Q1
    df_no_out = df_temp[(df_temp[i] > (Q1 - 1.5*IQR)) & (df_temp[i] < (Q3 + 1.5*IQR))]

In [37]:
df_no_out.dropna(subset = ['emp_length','dti','revol_util','pct_tl_nvr_dlq','inq_last_6mths'], inplace = True)


In [38]:
cols = ['mths_since_last_major_derog','mths_since_recent_bc_dlq','mths_since_last_record','mths_since_last_delinq']
for i in cols:
    df_no_out[i] = pd.cut(df[i], 
           bins = np.arange(0,262,36), 
           labels=['0-3','4-6','7-9','10-12','13-15','16-18','18+'])

    df_no_out[i] = df_no_out[i].astype('object')

    df_no_out[i].replace(np.nan, 'None', inplace = True)

In [39]:
df_no_out.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,emp_length,home_ownership,annual_inc,verification_status,purpose,addr_state,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,collections_12_mths_ex_med,mths_since_last_major_derog,tot_coll_amt,tot_cur_bal,chargeoff_within_12_mths,acc_now_delinq,mort_acc,mths_since_recent_bc_dlq,num_accts_ever_120_pd,pct_tl_nvr_dlq,pub_rec_bankruptcies,tot_hi_cred_lim,loan_status
0,3000.0,36 months,7.34,93.1,A,9 years,RENT,52000.0,Source Verified,major_purchase,WA,0.58,0.0,0.0,0-3,,7.0,0.0,141.0,0.5,30.0,w,0.0,,0.0,150592.0,0.0,0.0,4.0,,0.0,96.7,0.0,191216.0,0
1,5000.0,36 months,11.98,166.03,B,10+ years,OWN,55000.0,Not Verified,other,GA,14.18,0.0,0.0,7-9,7-9,14.0,1.0,11449.0,33.9,24.0,w,0.0,7-9,0.0,28880.0,0.0,0.0,0.0,,1.0,95.7,1.0,61551.0,0
2,7000.0,36 months,11.98,232.44,B,< 1 year,MORTGAGE,40000.0,Verified,home_improvement,TX,20.25,0.0,0.0,4-6,,13.0,0.0,5004.0,36.0,29.0,w,0.0,4-6,0.0,131726.0,0.0,0.0,0.0,4-6,3.0,89.7,0.0,132817.0,0
3,30000.0,36 months,21.85,1143.39,D,10+ years,OWN,57000.0,Verified,debt_consolidation,FL,27.58,0.0,1.0,4-6,,11.0,0.0,29222.0,53.2,26.0,w,0.0,4-6,0.0,157566.0,0.0,0.0,2.0,,1.0,96.0,0.0,188780.0,0
4,21000.0,60 months,20.39,560.94,D,10+ years,OWN,85000.0,Source Verified,house,NY,15.76,1.0,0.0,0-3,,15.0,0.0,14591.0,34.2,27.0,w,0.0,,0.0,128270.0,0.0,0.0,3.0,,0.0,92.6,0.0,172433.0,0


In [40]:
df_no_out.shape

(453227, 35)

#### VIF logic starts

In [41]:
# plt.figure(figsize=(25,10))
# sns.heatmap(df_no_out.corr()[np.abs(df_no_out.corr()) > 0.7], annot = True, annot_kws = {"size": 13}, cmap = 'Blues')

From the above heatmap, we can see (installment and loan_amount), (tot_hi_cred_lim & tot_cur_bal) have high collinearity

In [42]:
def vif(df_numeric_features_vif):
    # for each numeric variable, calculate VIF and save it in a dataframe 'vif'

    # use for loop to iterate the VIF function 
    for ind in range(len(df_numeric_features_vif.columns)):

        # create an empty dataframe
        vif = pd.DataFrame()

        # calculate VIF using list comprehension
        # use for loop to access each variable 
        # calculate VIF for each variable and create a column 'VIF_Factor' to store the values 
        vif["VIF_Factor"] = [variance_inflation_factor(df_numeric_features_vif.values, i) for i in range(df_numeric_features_vif.shape[1])]

        # create a column of variable names
        vif["Features"] = df_numeric_features_vif.columns

        # filter the variables with VIF greater than 10 and store it in a dataframe 'multi' 
        # one can choose the threshold other than 10 (it depends on the business requirements)
        multi = vif[vif['VIF_Factor'] > 10]

        # if dataframe 'multi' is not empty, then sort the dataframe by VIF values
        # if dataframe 'multi' is empty (i.e. all VIF <= 10), then print the dataframe 'vif' and break the for loop using 'break' 
        # 'by' sorts the data using given variable(s)
        # 'ascending = False' sorts the data in descending order
        if(multi.empty == False):
            df_sorted = multi.sort_values(by = 'VIF_Factor', ascending = False)
        else:
            print(vif)
            break

        # use if-else to drop the variable with the highest VIF
        # if  dataframe 'df_sorted' is not empty, then drop the first entry in the column 'Features' from the numeric variables
        # select the variable using 'iloc[]'
        # 'axis=1' drops the corresponding column
        #  else print the final dataframe 'vif' with all values after removal of variables with VIF less than 10  
        if (df_sorted.empty == False):
            df_numeric_features_vif = df_numeric_features_vif.drop(df_sorted.Features.iloc[0], axis=1)
        else:
            print(vif)
            
    return df_numeric_features_vif

In [43]:
df_num = df_no_out.select_dtypes(include=[np.number])
df_numeric_features_vif = vif(df_num)

    VIF_Factor                    Features
0     7.876274                    int_rate
1     4.324333                 installment
2     2.361169                  annual_inc
3     4.130222                         dti
4     1.244398                 delinq_2yrs
5     1.588155              inq_last_6mths
6     5.801907                    open_acc
7     2.007157                     pub_rec
8     2.580833                   revol_bal
9     5.313950                  revol_util
10    1.028327  collections_12_mths_ex_med
11    1.001401                tot_coll_amt
12    3.532774                 tot_cur_bal
13    1.038555    chargeoff_within_12_mths
14    1.020454              acc_now_delinq
15    2.512151                    mort_acc
16    1.240229       num_accts_ever_120_pd
17    2.030819        pub_rec_bankruptcies
18    1.398649                 loan_status


In [44]:
df_numeric_features_vif.shape

(453227, 19)

#### Conacatinating vif columns and categorical columns to create new df

In [45]:
col = list(df_numeric_features_vif.columns)

In [46]:
df_no_out_num_vif = df_no_out[col]

In [47]:
df_no_out_num_vif.shape

(453227, 19)

In [48]:
df_no_out_cat = df_no_out.select_dtypes(include='object')

In [49]:
df_no_out_cat.shape

(453227, 12)

In [50]:
df_no_out_num_vif.reset_index(drop=True,inplace=True)
df_no_out_cat.reset_index(drop=True,inplace=True)

In [51]:
df_no_out_vif = pd.concat([df_no_out_num_vif,df_no_out_cat],axis=1)

In [52]:
df_no_out_vif.head()

Unnamed: 0,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,collections_12_mths_ex_med,tot_coll_amt,tot_cur_bal,chargeoff_within_12_mths,acc_now_delinq,mort_acc,num_accts_ever_120_pd,pub_rec_bankruptcies,loan_status,term,grade,emp_length,home_ownership,verification_status,purpose,addr_state,mths_since_last_delinq,mths_since_last_record,initial_list_status,mths_since_last_major_derog,mths_since_recent_bc_dlq
0,7.34,93.1,52000.0,0.58,0.0,0.0,7.0,0.0,141.0,0.5,0.0,0.0,150592.0,0.0,0.0,4.0,0.0,0.0,0,36 months,A,9 years,RENT,Source Verified,major_purchase,WA,0-3,,w,,
1,11.98,166.03,55000.0,14.18,0.0,0.0,14.0,1.0,11449.0,33.9,0.0,0.0,28880.0,0.0,0.0,0.0,1.0,1.0,0,36 months,B,10+ years,OWN,Not Verified,other,GA,7-9,7-9,w,7-9,
2,11.98,232.44,40000.0,20.25,0.0,0.0,13.0,0.0,5004.0,36.0,0.0,0.0,131726.0,0.0,0.0,0.0,3.0,0.0,0,36 months,B,< 1 year,MORTGAGE,Verified,home_improvement,TX,4-6,,w,4-6,4-6
3,21.85,1143.39,57000.0,27.58,0.0,1.0,11.0,0.0,29222.0,53.2,0.0,0.0,157566.0,0.0,0.0,2.0,1.0,0.0,0,36 months,D,10+ years,OWN,Verified,debt_consolidation,FL,4-6,,w,4-6,
4,20.39,560.94,85000.0,15.76,1.0,0.0,15.0,0.0,14591.0,34.2,0.0,0.0,128270.0,0.0,0.0,3.0,0.0,0.0,0,60 months,D,10+ years,OWN,Source Verified,house,NY,0-3,,w,,


### VIF logic ends

In [53]:
df_no_out_vif.select_dtypes(include='object').head()

Unnamed: 0,term,grade,emp_length,home_ownership,verification_status,purpose,addr_state,mths_since_last_delinq,mths_since_last_record,initial_list_status,mths_since_last_major_derog,mths_since_recent_bc_dlq
0,36 months,A,9 years,RENT,Source Verified,major_purchase,WA,0-3,,w,,
1,36 months,B,10+ years,OWN,Not Verified,other,GA,7-9,7-9,w,7-9,
2,36 months,B,< 1 year,MORTGAGE,Verified,home_improvement,TX,4-6,,w,4-6,4-6
3,36 months,D,10+ years,OWN,Verified,debt_consolidation,FL,4-6,,w,4-6,
4,60 months,D,10+ years,OWN,Source Verified,house,NY,0-3,,w,,


In [54]:
term_values = {' 36 months': 36, ' 60 months': 60}
df_no_out_vif['term'] = df_no_out_vif.term.map(term_values)

In [55]:
LE = LabelEncoder()
df_no_out_vif['emp_length']= LE.fit_transform(df_no_out_vif['emp_length'])

In [56]:
dummies = ['grade','home_ownership','verification_status','purpose','initial_list_status','mths_since_last_delinq','mths_since_last_record','mths_since_last_major_derog','mths_since_recent_bc_dlq']

In [57]:
data = pd.get_dummies(df_no_out_vif, columns=dummies, drop_first=True)

In [58]:
data.head()

Unnamed: 0,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,collections_12_mths_ex_med,tot_coll_amt,tot_cur_bal,chargeoff_within_12_mths,acc_now_delinq,mort_acc,num_accts_ever_120_pd,pub_rec_bankruptcies,loan_status,term,emp_length,addr_state,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,purpose_credit_card,purpose_debt_consolidation,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,initial_list_status_w,mths_since_last_delinq_10-12,mths_since_last_delinq_13-15,mths_since_last_delinq_16-18,mths_since_last_delinq_18+,mths_since_last_delinq_4-6,mths_since_last_delinq_7-9,mths_since_last_delinq_None,mths_since_last_record_10-12,mths_since_last_record_4-6,mths_since_last_record_7-9,mths_since_last_record_None,mths_since_last_major_derog_10-12,mths_since_last_major_derog_13-15,mths_since_last_major_derog_16-18,mths_since_last_major_derog_18+,mths_since_last_major_derog_4-6,mths_since_last_major_derog_7-9,mths_since_last_major_derog_None,mths_since_recent_bc_dlq_10-12,mths_since_recent_bc_dlq_13-15,mths_since_recent_bc_dlq_16-18,mths_since_recent_bc_dlq_4-6,mths_since_recent_bc_dlq_7-9,mths_since_recent_bc_dlq_None
0,7.34,93.1,52000.0,0.58,0.0,0.0,7.0,0.0,141.0,0.5,0.0,0.0,150592.0,0.0,0.0,4.0,0.0,0.0,0,36,9,WA,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1
1,11.98,166.03,55000.0,14.18,0.0,0.0,14.0,1.0,11449.0,33.9,0.0,0.0,28880.0,0.0,0.0,0.0,1.0,1.0,0,36,1,GA,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1
2,11.98,232.44,40000.0,20.25,0.0,0.0,13.0,0.0,5004.0,36.0,0.0,0.0,131726.0,0.0,0.0,0.0,3.0,0.0,0,36,10,TX,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0
3,21.85,1143.39,57000.0,27.58,0.0,1.0,11.0,0.0,29222.0,53.2,0.0,0.0,157566.0,0.0,0.0,2.0,1.0,0.0,0,36,1,FL,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
4,20.39,560.94,85000.0,15.76,1.0,0.0,15.0,0.0,14591.0,34.2,0.0,0.0,128270.0,0.0,0.0,3.0,0.0,0.0,0,60,1,NY,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1


In [59]:
te_encoder = ce.target_encoder.TargetEncoder()

data['addr_state'] = te_encoder.fit_transform(data.addr_state,data.loan_status)

In [60]:
data.head()

Unnamed: 0,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,collections_12_mths_ex_med,tot_coll_amt,tot_cur_bal,chargeoff_within_12_mths,acc_now_delinq,mort_acc,num_accts_ever_120_pd,pub_rec_bankruptcies,loan_status,term,emp_length,addr_state,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,purpose_credit_card,purpose_debt_consolidation,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,initial_list_status_w,mths_since_last_delinq_10-12,mths_since_last_delinq_13-15,mths_since_last_delinq_16-18,mths_since_last_delinq_18+,mths_since_last_delinq_4-6,mths_since_last_delinq_7-9,mths_since_last_delinq_None,mths_since_last_record_10-12,mths_since_last_record_4-6,mths_since_last_record_7-9,mths_since_last_record_None,mths_since_last_major_derog_10-12,mths_since_last_major_derog_13-15,mths_since_last_major_derog_16-18,mths_since_last_major_derog_18+,mths_since_last_major_derog_4-6,mths_since_last_major_derog_7-9,mths_since_last_major_derog_None,mths_since_recent_bc_dlq_10-12,mths_since_recent_bc_dlq_13-15,mths_since_recent_bc_dlq_16-18,mths_since_recent_bc_dlq_4-6,mths_since_recent_bc_dlq_7-9,mths_since_recent_bc_dlq_None
0,7.34,93.1,52000.0,0.58,0.0,0.0,7.0,0.0,141.0,0.5,0.0,0.0,150592.0,0.0,0.0,4.0,0.0,0.0,0,36,9,0.165564,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1
1,11.98,166.03,55000.0,14.18,0.0,0.0,14.0,1.0,11449.0,33.9,0.0,0.0,28880.0,0.0,0.0,0.0,1.0,1.0,0,36,1,0.204373,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1
2,11.98,232.44,40000.0,20.25,0.0,0.0,13.0,0.0,5004.0,36.0,0.0,0.0,131726.0,0.0,0.0,0.0,3.0,0.0,0,36,10,0.225365,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0
3,21.85,1143.39,57000.0,27.58,0.0,1.0,11.0,0.0,29222.0,53.2,0.0,0.0,157566.0,0.0,0.0,2.0,1.0,0.0,0,36,1,0.237443,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
4,20.39,560.94,85000.0,15.76,1.0,0.0,15.0,0.0,14591.0,34.2,0.0,0.0,128270.0,0.0,0.0,3.0,0.0,0.0,0,60,1,0.254741,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1


In [61]:
# set of independent variables
# drop the target variable using 'drop()'
# 'axis = 1' drops the specified column
X = data.drop('loan_status', axis = 1)
 
# consider the dependent variable
y = data['loan_status']

# split data into train subset and test subset
# set 'random_state' to generate the same dataset each time you run the code 
# 'test_size' returns the proportion of data to be included in the testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 10)

# initiate linear regression model to use in feature selection
rf_classification = RandomForestClassifier(n_estimators = 10, random_state = 10)

# build the RFE model
# pass the regression model to 'estimator'
# pass number of required features to 'n_features_to_select'
# if we do not pass the number of features, RFE considers half of the features
rfe_model = RFE(estimator=rf_classification)

# fit the RFE model on the training dataset using fit()
rfe_model = rfe_model.fit(X_train, y_train)

# create a series containing feature and its corresponding rank obtained from RFE
# 'ranking_' returns the rank of each variable after applying RFE
# pass the ranks as the 'data' of a series
# 'index' assigns feature names as index of a series 
feat_index = pd.Series(data = rfe_model.ranking_, index = X_train.columns)

# select the features with rank = 1
# 'index' returns the indices of a series (i.e. features with rank=1) 
signi_feat_rfe = feat_index[feat_index==1].index

# print the significant features obtained from RFE
print(signi_feat_rfe)

Index(['int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs',
       'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
       'tot_coll_amt', 'tot_cur_bal', 'mort_acc', 'num_accts_ever_120_pd',
       'pub_rec_bankruptcies', 'term', 'emp_length', 'addr_state', 'grade_C',
       'grade_D', 'home_ownership_MORTGAGE', 'home_ownership_RENT',
       'verification_status_Source Verified', 'verification_status_Verified',
       'purpose_credit_card', 'purpose_debt_consolidation', 'purpose_other',
       'initial_list_status_w', 'mths_since_last_delinq_4-6',
       'mths_since_last_delinq_None', 'mths_since_last_record_None',
       'mths_since_last_major_derog_4-6', 'mths_since_last_major_derog_None',
       'mths_since_recent_bc_dlq_4-6', 'mths_since_recent_bc_dlq_None'],
      dtype='object')


In [62]:
data = data[['int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs',
       'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
       'tot_coll_amt', 'tot_cur_bal', 'mort_acc', 'num_accts_ever_120_pd',
       'pub_rec_bankruptcies', 'term', 'emp_length', 'addr_state', 'grade_C',
       'grade_D', 'home_ownership_MORTGAGE', 'home_ownership_RENT',
       'verification_status_Source Verified', 'verification_status_Verified',
       'purpose_credit_card', 'purpose_debt_consolidation', 'purpose_other',
       'initial_list_status_w', 'mths_since_last_delinq_4-6',
       'mths_since_last_delinq_None', 'mths_since_last_record_None',
       'mths_since_last_major_derog_4-6', 'mths_since_last_major_derog_None',
       'mths_since_recent_bc_dlq_4-6', 'mths_since_recent_bc_dlq_None','loan_status']]

In [63]:
## CHECK WHICH FEATURES WERE DROPPED AND SEE IF THEY WERE IMP
## SHOW EVERYTHIN IN A GRAPH..


In [64]:
df_target = data['loan_status']
X = data.drop('loan_status', axis = 1)

#### SMOTE

In [65]:
Counter(df_target)

Counter({0: 352797, 1: 100430})

In [66]:
sm = SMOTE(random_state=10)

In [67]:
X_res, y_res = sm.fit_resample(X, df_target)

In [68]:
Counter(y_res)

Counter({0: 352797, 1: 352797})

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, random_state = 10, test_size = 0.2)

print('X_train', X_train.shape)
print('y_train', y_train.shape)

print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (564475, 35)
y_train (564475,)
X_test (141119, 35)
y_test (141119,)


In [199]:
# tuned_paramaters = [{'criterion': ['entropy', 'gini'],
#                      'n_estimators': [10, 30, 50, 70, 90],
#                      'max_depth': [10, 15, 20],
#                      'max_features': ['sqrt', 'log2'],
#                      'min_samples_split': [2, 5, 8, 11],
#                      'min_samples_leaf': [1, 5, 9],
#                      'max_leaf_nodes': [2, 5, 8, 11]}]
 

# random_forest_classification = RandomForestClassifier(random_state = 10)

# rf_grid = GridSearchCV(estimator = random_forest_classification, 
#                        param_grid = tuned_paramaters,
#                        cv = 5)

# # use fit() to fit the model on the train set
# rf_grid_model = rf_grid.fit(X_train, y_train)

# # get the best parameters
# print('Best parameters for random forest classifier: ', rf_grid_model.best_params_, '\n')


Best parameters for random forest classifier:  {'n_estimators': 30, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_leaf_nodes': 11, 'max_features': 'sqrt', 'max_depth': 10, 'criterion': 'gini'} 



In [201]:
# rf_model = RandomForestClassifier(criterion = rf_grid_model.best_params_.get('criterion'), 
#                                   n_estimators = rf_grid_model.best_params_.get('n_estimators'),
#                                   max_depth = rf_grid_model.best_params_.get('max_depth'),
#                                   max_features = rf_grid_model.best_params_.get('max_features'),
#                                   max_leaf_nodes = rf_grid_model.best_params_.get('max_leaf_nodes'),
#                                   min_samples_leaf = rf_grid_model.best_params_.get('min_samples_leaf'),
#                                   min_samples_split = rf_grid_model.best_params_.get('min_samples_split'),
#                                   random_state = 10)

# # use fit() to fit the model on the train set
# rf_model = rf_model.fit(X_train, y_train)

In [71]:
rf_classification = RandomForestClassifier(n_estimators = 10, random_state = 10)

# use fit() to fit the model on the train set
rf_model = rf_classification.fit(X_train, y_train)

In [72]:
y_train_pred = rf_model.predict(X_train)

print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    282147
           1       1.00      0.98      0.99    282328

    accuracy                           0.99    564475
   macro avg       0.99      0.99      0.99    564475
weighted avg       0.99      0.99      0.99    564475



In [73]:
y_test_pred = rf_model.predict(X_test)

print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.79      0.93      0.85     70650
           1       0.91      0.75      0.82     70469

    accuracy                           0.84    141119
   macro avg       0.85      0.84      0.84    141119
weighted avg       0.85      0.84      0.84    141119

