# Importing data and exploring data

In [30]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_curve
from sklearn.model_selection import train_test_split

In [31]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [32]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [33]:
train_df.columns

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'home_ownership', 'annual_inc', 'verification_status', 'loan_status',
       'pymnt_plan', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'policy_code', 'application_type',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_ol

In [34]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [35]:
test_df.columns

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'home_ownership', 'annual_inc', 'verification_status', 'loan_status',
       'pymnt_plan', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'policy_code', 'application_type',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_ol

## Preprocessing Converting categorical data to numeric

In [54]:
# Convert categorical data to numeric and separate target feature for training data
X = train_df.drop('loan_status', axis =1)
X_train = pd.get_dummies(X)
X_train = X_train.drop('Unnamed: 0',axis=1)
X_train.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,...,0,1,0,1,1,0,1,0,1,0
1,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,...,0,1,0,1,1,0,1,0,1,0
2,321143,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,...,0,1,0,1,1,0,1,0,1,0
3,11778,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,...,0,1,0,1,1,0,1,0,1,0
4,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,...,0,1,0,1,1,0,1,0,1,0


In [55]:
y = pd.get_dummies(train_df['loan_status'])
y_train = y.drop('high_risk', axis= 1)
y_train = y_train.rename(columns={'low_risk':'loan_status'})
y_train = y_train.values.ravel()
y_train                                

array([1, 1, 1, ..., 0, 0, 0], dtype=uint8)

In [56]:
# Convert categorical data to numeric and separate target feature for testing data
X_test = test_df.drop('loan_status', axis =1)
X_tester = pd.get_dummies(X_test)
X_tester = X_tester.drop('Unnamed: 0',axis=1)
X_tester.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,...,0,0,1,0,1,1,0,1,0,1
1,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,...,0,0,1,0,1,1,0,1,0,1
2,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,...,0,0,1,0,1,1,0,1,0,1
3,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,...,0,0,1,0,1,1,0,1,0,1
4,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,...,0,0,1,0,1,1,0,1,0,1


In [57]:
y_test = pd.get_dummies(test_df['loan_status'])
y_tester = y_test.drop('high_risk', axis= 1)
y_tester = y_tester.rename(columns={'low_risk':'loan_status'})
y_tester.head()

Unnamed: 0,loan_status
0,1
1,1
2,1
3,1
4,1


In [60]:
# missing column check
ck_mtch_col = set(X_train.columns) - set (X_tester.columns)
ck_mtch_col

{'debt_settlement_flag_Y'}

In [62]:
# add missing dummy variables to testing set
for c in ck_mtch_col:
    X_tester['debt_settlement_flag_Y'] = 0
X_tester = X_tester[X_train.columns]
X_tester.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,...,0,1,0,1,1,0,1,0,1,0
1,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,...,0,1,0,1,1,0,1,0,1,0
2,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,...,0,1,0,1,1,0,1,0,1,0
3,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,...,0,1,0,1,1,0,1,0,1,0
4,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,...,0,1,0,1,1,0,1,0,1,0


In [63]:
print(X_train.shape, y_train.shape)
print(X_tester.shape, y_tester.shape)

(12180, 93) (12180,)
(4702, 93) (4702, 1)


# Predictions as to which model I think will perform better: 

 I believe Random Forest Classifier will better modal to use as it controls overfitting.

# Logistic Regression Model

In [64]:
# Train the Logistic Regression model on the unscaled data and print the model score
log_mod = LogisticRegression(max_iter= 19000, random_state=5).fit(X_train,y_train)
print("Logistic Regression Training Score: ", log_mod.score(X_train, y_train))
print("Logistic Regression Testing Score: ", log_mod.score(X_tester, y_tester))

Logistic Regression Training Score:  0.6979474548440066
Logistic Regression Testing Score:  0.5618885580603998


# Random Forest Classifier Model

In [65]:
# Train a Random Forest Classifier model and print the model score
rando_class = RandomForestClassifier(random_state=1)
rando_class.fit(X_train, y_train)
y_predict = rando_class.predict(X_tester)
print(classification_report(y_tester, y_predict))

              precision    recall  f1-score   support

           0       0.75      0.50      0.60      2351
           1       0.63      0.83      0.71      2351

    accuracy                           0.67      4702
   macro avg       0.69      0.67      0.66      4702
weighted avg       0.69      0.67      0.66      4702



In [66]:
print(f"Random Forest Classifier Training Score: {rando_class.score(X_train, y_train)}")
print(f"Random Forest Classifier Testing Score: {rando_class.score(X_tester, y_tester)}")

Random Forest Classifier Training Score: 1.0
Random Forest Classifier Testing Score: 0.6671629094002552


# Results: Random Forest Classifier did do better then logistic regression. 

# Scaling the data : 

In [67]:
# Scale the data
scaled_data = StandardScaler()
scaled_data.fit(X_train)
X_train_scaled = scaled_data.transform(X_train)
X_test_scaled = scaled_data.transform(X_tester)

# Logistics Regression with Scaled Data:

In [68]:
# Train the Logistic Regression model on the scaled data and print the model score
log_scaled_mod = LogisticRegression(max_iter = 19000, random_state=5).fit(X_train_scaled,y_train)
print("Logistic Regression Scaled Score: ", log_scaled_mod.score(X_test_scaled, y_tester))

Logistic Regression Scaled Score:  0.7205444491705657


# Random Forest Classifier with Scaled Data: 

In [69]:
# Train a Random Forest Classifier model on the scaled data and print the model score
rando_scaled_mod = RandomForestClassifier(random_state = 5, n_estimators=50).fit(X_train_scaled,y_train)
print("Random Forest Classifier Scaled Score:", rando_scaled_mod.score(X_test_scaled, y_tester))

Random Forest Classifier Scaled Score: 0.6295193534666099


# Final Results and Thoughts: 
Original prediction was correct on the unscaled data as Random Forest Classifier gave better results. 
Scaled Data decreased score under Random Forest Classifier, Logistics Regression returned better results under scaled data. 