In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path

I predict that the Logistic Regression model will perform better than the RandomForestClassifier model, based on information I read in <a href="https://scholar.smu.edu/datasciencereview/vol1/iss3/9">this SMU Data Science Review article</a>, in which the authors describe logistic regression performing better overall than random forest in cases with fewer than 1000 arguments, and that makes sense to me. The authors also backed this up with a compelling study and tested their models under various underlying structural changes, so I expect this dataset to behave similarly.

Kirasich, Kaitlin; Smith, Trace; and Sadler, Bivin (2018) "Random Forest vs Logistic Regression: Binary
Classification for Heterogeneous Datasets," SMU Data Science Review: Vol. 1: No. 3, Article 9. 

In [None]:
# Load the lending dataset from the Resources folder, then display it
file_path = Path("Resources") / "lending_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
df

In [None]:
# Total number of missing cells across the entire frame
df.isna().to_numpy().sum()

In [None]:
# Class balance: number of rows for each distinct loan_status value
df.loc[:, "loan_status"].value_counts()

In [None]:
# Features (X) are every column except the label; the target (y) is the loan_status column
X = df.drop(columns=["loan_status"])
y = df.loc[:, "loan_status"].values

In [None]:
# Partition the features and target into train/test subsets; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=61,
)

In [None]:
# Preview the first rows of the training features to confirm loan_status is not among the columns
X_train.head(n=5)

##### Create a logistic regression model as it is currently represented

In [None]:
# Build an untrained logistic-regression classifier using scikit-learn's default settings
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()

Fit (train) our model by using the training data

In [None]:
# Train the logistic-regression classifier on the training partition
classifier.fit(X=X_train, y=y_train)

In [None]:
# Build the confusion matrix for the model's predictions on the test partition
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_true = y_test
y_predict = classifier.predict(X_test)

# labels=[1, 0] puts class 1 first on both axes, so with 1 as the positive
# class the matrix reads [[tp, fn], [fp, tn]]
c_matrix = confusion_matrix(y_true=y_true, y_pred=y_predict, labels=[1, 0])
c_matrix

In [None]:
# Unpack the confusion-matrix cells and derive the summary metrics.
# c_matrix was built with labels=[1, 0], so ravel() yields tp, fn, fp, tn in that order.
tp, fn, fp, tn = c_matrix.ravel()

# precision: share of predicted positives that are truly positive
prec = tp / (tp + fp)
# accuracy: the numerator must be parenthesized; without the parentheses the
# expression evaluates as tp + (tn / total) because division binds tighter than addition
acc = (tp + tn) / (tp + tn + fp + fn)
# sensitivity (recall): share of actual positives that were predicted positive
sens = tp / (tp + fn)
# F1: harmonic mean of precision and sensitivity
f_one = (2 * (prec * sens)) / (prec + sens)

In [None]:
# Report the raw confusion-matrix counts followed by the derived precision and F1 score
print(
    f"True Positives: {tp}",
    f"True Negatives: {tn}",
    f"False Positives: {fp}",
    f"False Negatives: {fn}",
    "---",
    f"Precision: {prec}",
    f"F1 Score: {f_one}",
    sep="\n",
)

In [None]:
# scikit-learn's per-class summary, used to cross-check the hand-computed metrics
print(classification_report(y_true=y_true, y_pred=y_predict))

Cognitive Check: Individual Precision and F1 Score Match the Classification Report = True

Validate the model by using the test data

In [None]:
# Compare the classifier's score on the training partition against the held-out test partition
print(
    "Model: LogisticRegression",
    f"Training Data Score: {classifier.score(X_train, y_train)}",
    f"Testing Data Score: {classifier.score(X_test, y_test)}",
    sep="\n",
)

##### Create a random forest classifier model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

Fit (train) our model by using scaled training data

In [None]:
# Re-create the train/test partitions with the same seed, then standardize the features.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=61,
)

# The scaler is fitted on the training features only and then applied to both partitions
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Bundle the scaled features with their targets
data = [X_train_scaled, X_test_scaled, y_train, y_test]

Validate the model by using the test data

In [None]:
# Train a 500-tree random forest on the scaled training features, then report its
# score on both partitions. random_state pins the forest's randomness for repeatable runs.
clf = RandomForestClassifier(random_state=1, n_estimators=500)
clf.fit(X_train_scaled, y_train)

# The label names the actual estimator class (the previous text misspelled it)
print("Model: RandomForestClassifier")
print(f'Training Data Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Data Score: {clf.score(X_test_scaled, y_test)}')

The findings of Kaitlin Kirasich, Trace Smith, and Bivin Sadler, PhD, guided my prediction, which is supported by this credit risk evaluation sample of the two models.