In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [4]:
#Dropping uncessary data column 'index'
X_2019 = train_df.drop(['index', 'loan_status'], axis = 1)
y_2019 = train_df['loan_status']
X_2019.head()

Unnamed: 0.1,Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,n,29.99,0.0,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,n,11.26,2.0,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,n,11.28,0.0,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,n,18.08,0.0,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,n,27.77,0.0,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [5]:
# Convert categorical data to numeric and separate target feature for training data
X_2019_dummies = pd.get_dummies(X_2019)
print(X_2019_dummies.columns)

Index(['Unnamed: 0', 'loan_amnt', 'int_rate', 'installment', 'annual_inc',
       'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec',
       'revol_bal', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_s

In [6]:
X_2019_dummies.head()

Unnamed: 0.1,Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,...,0,1,0,1,1,0,1,0,1,0
1,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,...,0,1,0,1,1,0,1,0,1,0
2,321143,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,...,0,1,0,1,1,0,1,0,1,0
3,11778,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,...,0,1,0,1,1,0,1,0,1,0
4,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,...,0,1,0,1,1,0,1,0,1,0


In [7]:
#Dropping target data 'loan_status' and uncessary data column 'index'
X_2020 = test_df.drop(['loan_status','index'], axis = 1)
y_2020 = test_df['loan_status']
X_2020

Unnamed: 0.1,Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,40000.0,0.0819,814.70,MORTGAGE,140000.0,Not Verified,n,19.75,0.0,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,6000.0,0.1524,208.70,RENT,55000.0,Not Verified,n,11.52,2.0,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,n,6.74,0.0,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,n,12.13,0.0,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,3600.0,0.1240,120.27,RENT,50000.0,Not Verified,n,16.08,0.0,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,30000.0,0.1240,673.42,RENT,140480.0,Source Verified,n,15.74,0.0,...,100.0,28.6,0.0,0.0,159688.0,110873.0,48400.0,107388.0,N,N
4698,77291,24000.0,0.0756,747.22,RENT,50000.0,Not Verified,n,26.81,0.0,...,100.0,0.0,0.0,0.0,62375.0,18928.0,13300.0,30775.0,N,N
4699,77292,10000.0,0.2305,387.36,RENT,33000.0,Verified,n,38.51,0.0,...,100.0,0.0,0.0,0.0,43250.0,33022.0,8500.0,29550.0,N,N
4700,77297,8000.0,0.1862,205.86,RENT,38000.0,Source Verified,n,16.36,0.0,...,95.0,0.0,1.0,0.0,31357.0,19595.0,1500.0,9657.0,N,N


# Prediction Write-up

Based on the data as is, we should see better performance with scaled data than with unscaled data.  Scaling data prior to training will normalize it by removing any units associated with the data.  Normalizing the data will also take away any assumptions that the larger numbers within the dataset will have more weight or have more significant "value;" for example, in this dataset the machine learning model will not see a significant difference between the principal "loan amount" value and the value presented in the dataset for the person's credit limit value.

In addition, we should also see better performance in our Random Forest Classifier than the Logistical Regression model.  The Random Forest Classifier looks at the data as "1's" or "0's" whereas the Logistical Regression model is based on probability of the "observation falling into a category." The two datasets are in the same format and report relatively the same type of data.  However, because we are looking to determine the best predictor for loans based on various categories; by randomly choosing the categories for our training set, our test prediction should show better performance using the Random Forest Classifier model.  The Logistical Regression Model works best in weighted datasets, in which our dataset is better suited not to be weighted for better predictive performance.

Therefore, in this dataset, we should see better performance with the scaled dataset as to unscaled dataset.   

In [8]:
# Convert categorical data to numeric and separate target feature for testing data
X_2020_dummies = pd.get_dummies(X_2020)
print(X_2020_dummies.columns)

Index(['Unnamed: 0', 'loan_amnt', 'int_rate', 'installment', 'annual_inc',
       'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec',
       'revol_bal', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_s

In [9]:
X_2020_dummies.head()

Unnamed: 0.1,Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,...,0,0,1,0,1,1,0,1,0,1
1,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,...,0,0,1,0,1,1,0,1,0,1
2,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,...,0,0,1,0,1,1,0,1,0,1
3,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,...,0,0,1,0,1,1,0,1,0,1
4,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,...,0,0,1,0,1,1,0,1,0,1


In [10]:
# Add missing dummy variables to testing set
X_2020_dummies['debt_settlement_flag_Y'] = 0
X_2020_dummies.head()

Unnamed: 0.1,Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,...,0,1,0,1,1,0,1,0,1,0
1,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,...,0,1,0,1,1,0,1,0,1,0
2,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,...,0,1,0,1,1,0,1,0,1,0
3,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,...,0,1,0,1,1,0,1,0,1,0
4,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,...,0,1,0,1,1,0,1,0,1,0


In [11]:
# Train the Logistic Regression model on the unscaled data and print the model score using 2019 DATA
X_train, X_test, y_train, y_test = train_test_split(X_2019_dummies, y_2019, random_state=1)
clf = LogisticRegression().fit(X_train, y_train)
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

Training Score: 0.6553913519430761
Testing Score: 0.6469622331691297


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# Train the Logistic Regression model on the unscaled data and print the model score using 2020 DATA
X_train, X_test, y_train, y_test = train_test_split(X_2020_dummies, y_2020, random_state=1)
clf1 = LogisticRegression().fit(X_train, y_train)
print(f'Training Score: {clf1.score(X_train, y_train)}')
print(f'Testing Score: {clf1.score(X_test, y_test)}')

Training Score: 0.8275666477595008
Testing Score: 0.8290816326530612


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# Train a Random Forest Classifier model and print the model score using 2019 DATA
clf_2019_rf = RandomForestClassifier(random_state=1, n_estimators=100).fit(X_2019_dummies, y_2019)
print(f'Training Score: {clf_2019_rf.score(X_train, y_train)}')
print(f'Testing Score: {clf_2019_rf.score(X_test, y_test)}')

Training Score: 0.660805445263755
Testing Score: 0.6862244897959183


In [14]:
#  Train a Random Forest Classifier model and pring the model score using 2020 DATA
clf_2020_rf = RandomForestClassifier(random_state=1, n_estimators=100).fit(X_2020_dummies, y_2020)
print(f'Training Score: {clf_2020_rf.score(X_train, y_train)}')
print(f'Testing Score: {clf_2020_rf.score(X_test, y_test)}')

Training Score: 1.0
Testing Score: 1.0


In [15]:
# Scale the data using 2019 DATA
scaler = StandardScaler().fit(X_2019_dummies)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
# Train a Logistic Regression model on the scaled data and print the model score using 2019 DATA
clf_2019_scaled = LogisticRegression().fit(X_train_scaled, y_train)
print(f'Training Score: {clf_2019_scaled.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf_2019_scaled.score(X_test_scaled, y_test)}')

Training Score: 0.8905275099262621
Testing Score: 0.8869047619047619


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# Train a Random Forest Classifier model on the scaled data and print the model score using 2019 DATA
clf_2019_rf_scaled = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Training Score: {clf_2019_rf_scaled.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf_2019_rf_scaled.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.8971088435374149


In [19]:
# Scale the data using 2020 DATA
scaler_2020 = StandardScaler().fit(X_2020_dummies)
X_train_scaled = scaler_2020.transform(X_train)
X_test_scaled = scaler_2020.transform(X_test)

In [20]:
# Train a Logistic Regression model on the scaled data and print the model score using 2020 DATA
clf_2020_lr_scaled = LogisticRegression().fit(X_train_scaled, y_train)
print(f'Training Score: {clf_2020_lr_scaled.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf_2020_lr_scaled.score(X_test_scaled, y_test)}')

Training Score: 0.8910947249007374
Testing Score: 0.8869047619047619


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# Train a Random Forest Classifier model on the scaled data and print the model score using 2020 DATA
clf_2020_scaled = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Training Score: {clf_2020_scaled.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf_2020_scaled.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.8979591836734694
