In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# Read the 2019 loans into the training frame and the 2020 Q1 loans into the testing frame.
train_path = Path('Resources/2019loans.csv')
test_path = Path('Resources/2020Q1loans.csv')
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

## 1. Convert categorical data to numeric and separate target feature for training data

In [3]:
# Preview the first five rows of the training frame
train_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [4]:
# Build the training feature frame `X`: everything in train_df except the
# "Unnamed: 0" and "index" columns and the "loan_status" target column.
non_feature_cols = ["Unnamed: 0", "index", "loan_status"]
X = train_df.drop(columns=non_feature_cols)
X.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,n,29.99,0.0,0.0,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,n,11.26,2.0,0.0,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,n,11.28,0.0,0.0,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,3000.0,0.124,100.22,RENT,45000.0,Not Verified,n,18.08,0.0,0.0,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,n,27.77,0.0,2.0,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [5]:
# One-hot encode the categorical columns of the training features
X_dummies_train = pd.get_dummies(data=X)
X_dummies_train.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,39728.0,...,0,1,0,1,1,0,1,0,1,0
1,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,9585.0,...,0,1,0,1,1,0,1,0,1,0
2,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,16708.0,...,0,1,0,1,1,0,1,0,1,0
3,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,8809.0,...,0,1,0,1,1,0,1,0,1,0
4,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,65420.0,...,0,1,0,1,1,0,1,0,1,0


In [6]:
# Pull the "loan_status" target column out of the training frame
Y = train_df.loc[:, "loan_status"]
Y.head()

0    low_risk
1    low_risk
2    low_risk
3    low_risk
4    low_risk
Name: loan_status, dtype: object

## 2. Convert categorical data to numeric and separate target feature for testing data

In [7]:
# Build the testing features the same way as the training features:
# drop the non-feature columns, then one-hot encode what remains.
X_test = test_df.drop(["Unnamed: 0", "index", "loan_status"], axis=1)
X_dummies_test = pd.get_dummies(data=X_test)
X_dummies_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,9471.0,...,0,0,1,0,1,1,0,1,0,1
1,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,1280.0,...,0,0,1,0,1,1,0,1,0,1
2,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,4757.0,...,0,0,1,0,1,1,0,1,0,1
3,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,12731.0,...,0,0,1,0,1,1,0,1,0,1
4,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,10413.0,...,0,0,1,0,1,1,0,1,0,1


In [8]:
# Pull the "loan_status" target column out of the testing frame
Y_test = test_df.loc[:, "loan_status"]
Y_test.head()

0    low_risk
1    low_risk
2    low_risk
3    low_risk
4    low_risk
Name: loan_status, dtype: object

## 3. Add missing dummy variables to testing set

In [9]:
# Align the encoded testing frame with the encoded training frame.
# get_dummies only creates a column for a category value that actually occurs
# in the data it is given, so the two encoded frames can differ in columns.
missing_cols = [
    col for col in X_dummies_train.columns if col not in X_dummies_test.columns
]
for col in missing_cols:
    print(f"Missing: {col}")

# Fix the *encoded* test frame (the one passed to the models), not the raw
# X_test frame. reindex adds every missing training column filled with 0,
# drops any column the training frame does not have, and puts the columns in
# the training order so both frames expose identical features.
X_dummies_test = X_dummies_test.reindex(
    columns=X_dummies_train.columns, fill_value=0
)

Missing: debt_settlement_flag_Y


In [10]:
# Give the encoded testing frame a "debt_settlement_flag_Y" column set to 0 for every row
X_dummies_test = X_dummies_test.assign(debt_settlement_flag_Y=0)

## 4. Train the Logistic Regression model on the unscaled data and print the model score

In [11]:
# Import relevant library
from sklearn.linear_model import LogisticRegression

# Build the model
# NOTE(review): this default-configured estimator is not referenced by any
# later cell visible here; the model that gets fitted and scored is the
# separate `log_reg` instance. Confirm whether `model` is still needed.
model = LogisticRegression()

In [12]:
# Configure a liblinear logistic regression with a fixed seed and fit it
# on the encoded (unscaled) training data.
log_reg_params = {"solver": "liblinear", "random_state": 0}
log_reg = LogisticRegression(**log_reg_params)
log_reg.fit(X=X_dummies_train, y=Y)

LogisticRegression(random_state=0, solver='liblinear')

In [13]:
# Report the logistic regression score on the training data, then on the testing data
log_reg_train_score = log_reg.score(X_dummies_train, Y)
print(f"Trained Data: {log_reg_train_score}")
log_reg_test_score = log_reg.score(X_dummies_test, Y_test)
print(f"Tested Data: {log_reg_test_score}")

Trained Data: 0.7082922824302135
Tested Data: 0.5772011909825606


## 5. Train a Random Forest Classifier model and print the model score

In [14]:
# Import relevant library
from sklearn.ensemble import RandomForestClassifier

# Configure a 500-estimator random forest with a fixed seed and fit it
# on the encoded (unscaled) training data.
forest_params = {"n_estimators": 500, "random_state": 0}
forest = RandomForestClassifier(**forest_params).fit(X_dummies_train, Y)

In [15]:
# Report the random forest score on the training data, then on the testing data
forest_train_score = forest.score(X_dummies_train, Y)
print(f"Trained Data: {forest_train_score}")
forest_test_score = forest.score(X_dummies_test, Y_test)
print(f"Tested Data: {forest_test_score}")

Trained Data: 1.0
Tested Data: 0.6501488728200766


## 6. Scale the data

In [16]:
# Import relevant library
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaler to both the training and the testing features.
scaler = StandardScaler()
scaled_x = scaler.fit_transform(X_dummies_train)
scaled_test = scaler.transform(X_dummies_test)

## 7. Train the Logistic Regression model on the scaled data and print the model score

## 8. Train a Random Forest Classifier model on the scaled data and print the model score