# Predicting Credit Risk with Supervised Machine Learning

In [1]:
# Importing sklearn and regression model dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Retrieving the data by reading in our csv under test and training dataframes respectively
train_df = pd.read_csv(Path('Resources/lending_data.csv'))
test_df = pd.read_csv(Path('Resources/lending_data.csv'))

In [3]:
# Training df head to view columns and data 
train_df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [4]:
# Testing df head to view columns and data 
test_df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [5]:
# Parsing target feature for training data 
y_train = train_df["loan_status"]
X_train = train_df.drop(columns = ["loan_status"])
X_train = pd.get_dummies(X_train)
X_train.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [6]:
# Parsing target feature for testing data while also converting any cvategorical data to numeric
y_test = test_df["loan_status"]
X_test = test_df.drop(columns = ["loan_status"])
X_test = pd.get_dummies(X_test)
X_test.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


# Prediction

Before you create, fit, and score the models, make a prediction as to which model you think will perform better. You do not need to be correct! Write down (in markdown cells in your Jupyter Notebook or in a separate document) your prediction, and provide justification for your educated guess.

I predict that the Logistic Regression model will have a better final outcome number and model with Logistic Regression are typically balanced and parasimonious when compared to that of the Random Forest whose prone to overfitting and outliers and the final model depends on the strength of the trees in the forest and the correlation between them.

In [7]:
# Training data only we are adding missing dummy variables to the testing set
for column in X_train.columns:
    if column not in X_test.columns:
        X_test[column] = 0

In [8]:
# Training the Logistic Regression model on unscaled data and obtaining score 
reg = LogisticRegression().fit(X_train, y_train)
reg.score(X_test, y_test)

0.9920037144036319

In [9]:
# Training Random Forest Classifer model and obtaining the score 
randomForestClass = RandomForestClassifier(random_state=0)
randomForestClass.fit(X_train, y_train)
print("RandomForestClassifier score: ", randomForestClass.score(X_test,y_test))

RandomForestClassifier score:  0.9969304581097813


In [10]:
# Here we are scaling the data 
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Results 

In [11]:
# Creating a Logistic Regression model and fitting the data 
logisticRegr_scaled = LogisticRegression(
    solver='lbfgs',
    max_iter=100,
    random_state=0
)
logisticRegr_scaled.fit(X_train_scaled, y_train)
print("Logistic Regression Scaled Score: ", logisticRegr_scaled.score(X_test_scaled, y_test))

Logistic Regression Scaled Score:  0.9941317581510524


In [12]:
# Creating a Random Forest model and fitting the data 
randomForestClass_scaled = RandomForestClassifier(random_state=0)
randomForestClass_scaled.fit(X_train_scaled, y_train)
print("Random Forest Classifier Scaled score: ", randomForestClass_scaled.score(X_test_scaled, y_test))

Random Forest Classifier Scaled score:  0.9969304581097813


Which model performed better? How does that compare to your prediction? Write down your results and thoughts.

The final model results showed that they both performed well within a fraction of a decimal to one another, with the Random Forest Classifier just edging out the Logistic Regression Model. The Final result is the opposite of my initial prediction and hypothesis that the Logicstic Regression Model would perform better. There may have been more categorical data used than initially expected.