# Predicting Credit Risk with Supervised Machine Learning

In [1]:
# Importing sklearn and regression model dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load the lending dataset from the Resources folder into a single DataFrame
lending_csv_path = Path('Resources/lending_data.csv')
df = pd.read_csv(lending_csv_path)


In [3]:
# Preview the first five rows to check the column names and sample values
df.head(5)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [4]:
# Separate the target column (loan_status) from the remaining feature columns
y = df['loan_status']
X = df.drop(columns='loan_status')

In [5]:
# Preview the first five rows of the feature matrix (loan_status removed)
X.head(5)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [6]:
# Preview the first five values of the target series
y.head(5)

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [7]:
# Split features and target into training and testing sets;
# random_state is fixed so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

# Prediction

Before you create, fit, and score the models, make a prediction as to which model you think will perform better. You do not need to be correct! Write down (in markdown cells in your Jupyter Notebook or in a separate document) your prediction, and provide justification for your educated guess.

I predict that the Logistic Regression model will produce the better final score. Logistic Regression models are typically balanced and parsimonious compared with a Random Forest, which is prone to overfitting and sensitive to outliers, and whose final model depends on the strength of the individual trees in the forest and the correlation between them.

In [8]:
# Align the test features with the training features.
# Any column present in X_train but absent from X_test is added and filled
# with 0, and the columns are put in the same order as X_train so the fitted
# models see the features in the layout they were trained on.
# reindex returns a new DataFrame, so the frame produced by train_test_split
# is not modified in place.
# NOTE: a column that exists only in X_test is dropped by this alignment.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [9]:
# Baseline: fit Logistic Regression on the raw (unscaled) features,
# then display its score on the held-out test set
reg = LogisticRegression()
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

0.9918489475856377

In [10]:
# Baseline: fit a Random Forest on the raw (unscaled) features
# (seeded with random_state=0) and print its score on the test set
randomForestClass = RandomForestClassifier(random_state=0).fit(X_train, y_train)
print("RandomForestClassifier score: ", randomForestClass.score(X_test, y_test))

RandomForestClassifier score:  0.9915394139496492


In [11]:
# Standardise the features: the scaler is fitted on the training set only,
# and that same fitted transformation is then applied to both the training
# and the testing features
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train, X_test)
)

# Model Results 

In [12]:
# Logistic Regression on the scaled features: fit on the training set,
# predict the test set, then print the per-class report and both scores
clf = LogisticRegression().fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(f"Logistic Regression Training Data Score: {clf.score(X_train_scaled, y_train)}")
print(f"Logistic Regression Testing Data Score: {clf.score(X_test_scaled, y_test)}")

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.98      0.91       619

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384

Logistic Regression Training Data Score: 0.9942908240473243
Logistic Regression Testing Data Score: 0.9936545604622369


In [13]:
# Random Forest on the scaled features (seeded with random_state=1).
# Note: this rebinds `clf` and `y_pred`, replacing the Logistic Regression
# model and predictions that previously held those names.
clf = RandomForestClassifier(random_state=1)
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(f'Training Data Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Data Score: {clf.score(X_test_scaled, y_test)}')

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.85      0.89      0.87       619

    accuracy                           0.99     19384
   macro avg       0.92      0.94      0.93     19384
weighted avg       0.99      0.99      0.99     19384

Training Data Score: 0.9975409272252029
Testing Data Score: 0.9914878250103177


# Final Results

Which model performed better? How does that compare to your prediction? Write down your results and thoughts.

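The final results show that both models performed well, within a fraction of a percentage point of one another, with the Logistic Regression model just edging out the Random Forest Classifier (testing scores of roughly 0.9937 versus 0.9915 on the scaled data). This result reflects my initial prediction and hypothesis that the Logistic Regression model would perform better. It is also seen in the differences between the training and testing scores: the Random Forest's score drops from about 0.9975 on the training data to about 0.9915 on the testing data, while the Logistic Regression scores stay nearly identical (about 0.9943 versus 0.9937). There may have been more categorical data used than initially expected.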