In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

I am predicting that the Logistic Regression model will perform better than the RandomForestRegressor model based upon inforomation I read about <a href="https://scholar.smu.edu/datasciencereview/vol1/iss3/9">HERE In This SMU Data Science Review Article</a> in which the authors describe Logistical Regression performing overal better than Random Forest in cases with fewer than 1000 arguments, and that makes sense to me. Also, they backed this up with compelling study and tested their models with various underlying structural changes.

Kirasich, Kaitlin; Smith, Trace; and Sadler, Bivin (2018) "Random Forest vs Logistic Regression: Binary
Classification for Heterogeneous Datasets," SMU Data Science Review: Vol. 1: No. 3, Article 9. 

In [3]:
# Import the data
file_path = Path("Resources/lending_data.csv")
df = pd.read_csv(file_path)
df

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1


In [5]:
df.isnull().values.sum()

0

In [6]:
df["loan_status"].value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

In [8]:
y = df["loan_status"].values
X = df.drop("loan_status", axis=1)

In [9]:
from sklearn.model_selection import train_test_split
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=61)

##### Create a logistic regression model

In [10]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

Fit (train) our model by using the training data

In [11]:
# Train a Logistic Regression model print the model score
classifier.fit(X_train, y_train)

LogisticRegression()

In [42]:
# Check Confusion Matrix Totals FIX TO Corrected SET
from sklearn.metrics import confusion_matrix, classification_report

y_true = y_test
y_predict = classifier.predict(X_test)
c_matrix = confusion_matrix(y_true, y_predict, labels=[1,0])
c_matrix

array([[  552,    70],
       [   88, 18674]], dtype=int64)

In [43]:
tp, fn, fp, tn = c_matrix.ravel()
prec = tp / (tp + fp)
acc = tp + tn/(tp + tn + fp + fn)
sens = tp / (tp + fn)
f_one = 2 * (prec * sens)/(prec + sens)

In [44]:
print(f"True Positives: {tp}")
print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"---")
print(f"Precision: {prec}")
print(f"F1 Score: {f_one}")

True Positives: 552
True Negatives: 18674
False Positives: 88
False Negatives: 70
---
Precision: 0.8625
F1 Score: 0.8748019017432647


In [45]:
print(classification_report(y_true, y_predict))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18762
           1       0.86      0.89      0.87       622

    accuracy                           0.99     19384
   macro avg       0.93      0.94      0.94     19384
weighted avg       0.99      0.99      0.99     19384



Cognitive Check: Individual Precision and F1 Score Match the Classification Report = True

Validate the model by using the test data

In [46]:
print(f"Model: LogisticRegression")
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Model: LogisticRegression
Training Data Score: 0.9917973586463062
Testing Data Score: 0.9918489475856377


##### Create a random forest classifier model

# <span style="color: blue;">CURRENTLY</span> <span style="color: green;">WORKING</span> <span style="color: red;">HERE</span>

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

Fit (train) our model by using the training data

In [22]:
# Train a Random Forest Classifier model and print the model score
def forest_model(model, data):
    X_train_scaled, X_test_scaled, y_train, y_test = data
    reg = model.fit(X_train_scaled, y_train)
    print(f'Model: {type(reg).__name__}')
    print(f'Train Score: {reg.score(X_train_scaled, y_train)}')
    print(f'Test Score: {reg.score(X_test_scaled, y_test)}\n')

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=61)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
data = [X_train_scaled, X_test_scaled, y_train, y_test]

Validate the model by using the test data

In [24]:
forest_model(RandomForestRegressor(), data)

Model: RandomForestRegressor
Train Score: 0.9382759530370518
Test Score: 0.7753304623263003



Kaitlin Kirasich, Trace Smith, and Bivin Sadler, PhD findings guided me to make a prediction which is supported by this credit risk evaluation sample of the two models.

In [109]:
# Train a Logistic Regression model print the model score

0.9908171687990095

In [110]:
# Train a Random Forest Classifier model and print the model score

0.9910751134956666