In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

# Convert categorical data to numeric and separate target feature for training data

In [3]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [4]:
# check data types
train_df.dtypes

Unnamed: 0                      int64
index                           int64
loan_amnt                     float64
int_rate                      float64
installment                   float64
                               ...   
total_bal_ex_mort             float64
total_bc_limit                float64
total_il_high_credit_limit    float64
hardship_flag                  object
debt_settlement_flag           object
Length: 86, dtype: object

In [5]:
# view data 
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [6]:
# Separate Target Feature
# get target 
y_train = train_df[['loan_status']]
y_test = test_df[['loan_status']]
y_train.head()

Unnamed: 0,loan_status
0,low_risk
1,low_risk
2,low_risk
3,low_risk
4,low_risk


In [7]:
#get x and drop unnamed,index column
X_train = train_df.drop(['loan_status','Unnamed: 0','index'],axis=1)
X_test = test_df.drop(['loan_status','Unnamed: 0','index'],axis=1)
X_train.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,n,29.99,0.0,0.0,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,n,11.26,2.0,0.0,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,n,11.28,0.0,0.0,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,3000.0,0.124,100.22,RENT,45000.0,Not Verified,n,18.08,0.0,0.0,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,n,27.77,0.0,2.0,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [8]:
#find non numeric columns 
cat_df = X_train.select_dtypes(exclude=['number'])
cat_df.head()

Unnamed: 0,home_ownership,verification_status,pymnt_plan,initial_list_status,application_type,hardship_flag,debt_settlement_flag
0,MORTGAGE,Not Verified,n,w,Individual,N,N
1,MORTGAGE,Source Verified,n,w,Individual,N,N
2,MORTGAGE,Source Verified,n,w,Individual,N,N
3,RENT,Not Verified,n,w,Individual,N,N
4,MORTGAGE,Source Verified,n,w,Individual,N,N


In [9]:
# One-hot encoding the entire dataframe
train_dummies = pd.get_dummies(X_train)
test_dummies = pd.get_dummies(X_test)
print(train_dummies.columns)


Index(['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_in

In [10]:
train_dummies.dtypes

loan_amnt                     float64
int_rate                      float64
installment                   float64
annual_inc                    float64
dti                           float64
                               ...   
application_type_Joint App      uint8
hardship_flag_N                 uint8
hardship_flag_Y                 uint8
debt_settlement_flag_N          uint8
debt_settlement_flag_Y          uint8
Length: 92, dtype: object

In [11]:
y_train['loan_status'] = y_train['loan_status'].map({'low_risk': 1, 'high_risk': 0})
y_test['loan_status'] = y_test['loan_status'].map({'low_risk': 1, 'high_risk': 0})
y_train.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train['loan_status'] = y_train['loan_status'].map({'low_risk': 1, 'high_risk': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['loan_status'] = y_test['loan_status'].map({'low_risk': 1, 'high_risk': 0})


loan_status
0              6090
1              6090
dtype: int64

In [12]:
test_df["debt_settlement_flag"].value_counts()
# no one was flagged for debt settlement flag as Y in test data frame , 
# will have to add column to make test dummies df match train dummies df

N    4702
Name: debt_settlement_flag, dtype: int64

In [13]:
test_dummies['debt_settlement_flag_Y']=0

# Predictions on model behavior

# Scaled vs Unscaled  
between a model that uses scaled data and unscaled data, the scaled data set should produce better results since it will ensure that each variable is given the same level of importance vs an unscaled data is likely to give more weightage to values that are 'higher' generically

# Logistic vs Random Forest
between a logistic and random forest model, the random forest should perform better as it should be superior at handling a higher level of variables in a model compared to a logistic regression

# Scaled Logistic Regression vs Unscaled Logistic Regression
Logistic regression works on distances between points therefore, theoretically scaled data should produce better results as it will normalize all variables and make the distances comparable across different variables. 


# Scaled Random Forest vs Unscaled Random Forest
Unlike logistic regression random forest is not baed on the distance between points and therofore theoretically scaling the variables should not make a significant difference

In [14]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', max_iter=10000)
classifier.fit(train_dummies, y_train.values.ravel())
print(f"Training Data Score: {classifier.score(train_dummies, y_train.values.ravel())}")
print(f"Testing Data Score: {classifier.score(test_dummies,y_test.values.ravel())}")

Training Data Score: 0.7083743842364532
Testing Data Score: 0.5803913228413441


In [15]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = classifier.predict(test_dummies)
confusion_matrix(y_true, y_pred)

array([[ 854, 1497],
       [ 476, 1875]], dtype=int64)

In [16]:
tp, fn, fp, tn = confusion_matrix(y_true, y_pred).ravel()
fp_rate = (fp) / (fp + tn)
print(f"False Positive rate: {round(fp_rate*100)}%")

False Positive rate: 20%


In [17]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier().fit(train_dummies, y_train.values.ravel())
print(f'Training Score: {clf.score(train_dummies, y_train.values.ravel())}')
print(f'Training Score: {clf.score(test_dummies, y_test.values.ravel())}')

Training Score: 1.0
Training Score: 0.6373883453849426


In [18]:
y_true = y_test
y_pred = clf.predict(test_dummies)
confusion_matrix(y_true, y_pred)

array([[1893,  458],
       [1247, 1104]], dtype=int64)

In [19]:
tp, fn, fp, tn = confusion_matrix(y_true, y_pred).ravel()
fp_rate = (fp) / (fp + tn)
print(f"False Positive rate: {round(fp_rate*100)}%")

False Positive rate: 53%


In [20]:
# Scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(train_dummies)
X_train_scaled = scaler.transform(train_dummies)
X_test_scaled = scaler.transform(test_dummies)

In [21]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier = LogisticRegression(solver='lbfgs', max_iter=10000)
classifier.fit(X_train_scaled, y_train.values.ravel())
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train.values.ravel())}")
print(f"Test Data Score: {classifier.score(X_test_scaled, y_test.values.ravel())}")


Training Data Score: 0.7079638752052545
Test Data Score: 0.7677584006805614


In [22]:
y_true = y_test
y_pred = classifier.predict(X_test_scaled)
confusion_matrix(y_true, y_pred)

array([[1788,  563],
       [ 529, 1822]], dtype=int64)

In [23]:
tp, fn, fp, tn = confusion_matrix(y_true, y_pred).ravel()
fp_rate = (fp) / (fp + tn)
print(f"False Positive rate: {round(fp_rate*100)}%")

False Positive rate: 23%


In [24]:
# Train a Random Forest Classifier model on the scaled data and print the model 
clf = RandomForestClassifier().fit(X_train_scaled, y_train.values.ravel())
print(f'Training Score: {clf.score(X_train_scaled, y_train.values.ravel())}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test.values.ravel())}')

Training Score: 1.0
Testing Score: 0.6237771161207997


In [25]:
y_true = y_test
y_pred = clf.predict(X_test_scaled)
confusion_matrix(y_true, y_pred)

array([[1850,  501],
       [1268, 1083]], dtype=int64)

In [26]:
tp, fn, fp, tn = confusion_matrix(y_true, y_pred).ravel()
fp_rate = (fp) / (fp + tn)
print(f"False Positive rate: {round(fp_rate*100)}%")

False Positive rate: 54%


# For this data set logistic regression with scaled data set performed the best 

-true positive in this case = is a low risk and identified it as a low risk

-true negative in this case = is a high risk and identified it as high risk

-false positive in this case = is a high risk but identified as low risk

-false negative in this case = is a low risk but identified as high risk 

i believe the model that has the lowest false positives with a decent accuracy rate is the least risky option for the lending club . In this scenario that is the logistic regression with scaled data with almost 77% accuracy and a 23% False Positive Rate