In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import os

# Prework predictions

Before running the models for this assignment (logistic regression and a random forest classifier), my prediction is that the logistic regression model will perform better. The target is a relatively simple binary categorical variable - whether a loan is approved or not - and logistic regression is designed for this type of task. Random forest models are more effective when there are more categories to classify, so I believe that logistic regression will perform better.

In [2]:
# Import the data
# Pass the directory and file name as separate components so os.path.join
# actually assembles the path with the platform's separator (with a single
# argument it has nothing to join).
test_file = os.path.join("Resources", "lending_data.csv")
loan_df = pd.read_csv(test_file)
# Show the loaded frame as the cell output.
loan_df

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1


In [3]:
# Separate the features from the target:
#   X - every column except "loan_status" (kept as a DataFrame)
#   y - the "loan_status" column as a NumPy array
target_column = "loan_status"
X = loan_df.drop(columns=target_column)
y = loan_df[target_column].to_numpy()

In [4]:
# Sanity check: the feature matrix and the target must have the same number of rows.
print("Shape: ", *(part.shape for part in (X, y)))

Shape:  (77536, 7) (77536,)


In [5]:
# Hold out a test set. test_size is left at its default (25% of the rows) and
# the seed is fixed so the same split is produced on every run.
# NOTE(review): the recorded confusion matrix suggests the classes are heavily
# imbalanced - consider whether a stratified split (stratify=y) is wanted.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

# Train a Logistic Regression model and print the model score

In [6]:
# Logistic regression baseline, left on scikit-learn's default hyperparameters.
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
# Echo the (still unfitted) estimator as the cell output.
classifier

LogisticRegression()

In [7]:
# Train the logistic regression on the training split only; the test split
# stays unseen until evaluation. fit() returns the estimator, which the cell displays.
classifier.fit(X=X_train, y=y_train)

LogisticRegression()

In [8]:
# Score the fitted model on the data it was trained on and on the held-out
# split; a large gap between the two would point to overfitting.
train_accuracy = classifier.score(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)
print(f'Training Data Score: {train_accuracy}')
print(f'Testing Data Score: {test_accuracy}')

Training Data Score: 0.9921240885954051
Testing Data Score: 0.9918489475856377


In [18]:
#Predictions
# Spot-check: the first few held-out labels next to the model's predictions
# for the same rows.
# .tolist() converts the NumPy integers to plain Python ints, so the printed
# lists read "[0, 0, ...]" no matter how the installed NumPy version reprs
# its scalar types. .iloc makes the positional row slice explicit.
preview_rows = 15
print(f"Actual:\t\t{y_test[:preview_rows].tolist()}")
print(f"Predictions:\t{classifier.predict(X_test.iloc[:preview_rows]).tolist()}")

Actual:		[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0]
Predictions:	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0]


In [9]:
# Confusion matrix of the logistic regression over the whole test split
# (rows: actual class, columns: predicted class).
from sklearn.metrics import confusion_matrix

y_pred = classifier.predict(X_test)
y_true = y_test  # alias so the call below reads as (truth, prediction)
confusion_matrix(y_true=y_true, y_pred=y_pred)

array([[18663,   102],
       [   56,   563]], dtype=int64)

In [12]:
# Flatten the 2x2 confusion matrix row by row and name its four cells:
# true negatives, false positives, false negatives, true positives.
flat_counts = confusion_matrix(y_true, y_pred).ravel()
tn, fp, fn, tp = flat_counts
print(f"TN: {tn}", f"FP: {fp}", f"FN: {fn}", f"TP: {tp}", sep="\n")

TN: 18663
FP: 102
FN: 56
TP: 563


In [13]:
# Accuracy by hand: correctly classified rows divided by all rows in the
# confusion matrix.
correct_predictions = tp + tn
all_predictions = tp + fp + tn + fn
accuracy = correct_predictions / all_predictions
print(f"Accuracy: {accuracy}")

Accuracy: 0.9918489475856377


# Train a Random Forest Classifier model and print the model score

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [20]:
# Fit the scaler on the training split only, so nothing about the test rows
# influences the scaling parameters.
scaler = StandardScaler()
scaler = scaler.fit(X_train)  # fit() returns the scaler itself

In [21]:
# Apply the scaling learned from the training split to both splits.
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [22]:
# Random forest with 500 trees and a fixed seed, trained on the scaled features.
# NOTE(review): the logistic regression earlier in this notebook was fit on the
# unscaled X_train, while this model uses the scaled copy - confirm that the
# two scores are meant to be compared on differently preprocessed inputs.
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf = clf.fit(X_train_scaled, y_train)  # fit() returns the estimator itself

forest_train_score = clf.score(X_train_scaled, y_train)
forest_test_score = clf.score(X_test_scaled, y_test)
print(f"Training Score: {forest_train_score}")
print(f"Test Score: {forest_test_score}")

Training Score: 0.9975409272252029
Test Score: 0.9917457697069748


# Postwork Analysis

After importing the data and running the models, the results show very similar scores for both the logistic regression and the random forest classifier. The test scores of the two models differed by only about 0.0001, so the results are nearly identical. As such, I think that both models work well when there is a binary choice in results, such as yes or no, and neither is the wrong choice.