In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [2]:
# Read the lending dataset from the relative Resources/ folder and preview it
file_path = Path("Resources") / "lending_data.csv"
df_lending = pd.read_csv(file_path)
df_lending.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
# Number of rows in the raw dataset
print(df_lending.shape[0])

77536


In [4]:
# Report how many missing values each column contains
null_counts = df_lending.isnull().sum()
for column, null_count in null_counts.items():
    print(f"Column {column} has {null_count} null values")

Column loan_size has 0 null values
Column interest_rate has 0 null values
Column borrower_income has 0 null values
Column debt_to_income has 0 null values
Column num_of_accounts has 0 null values
Column derogatory_marks has 0 null values
Column total_debt has 0 null values
Column loan_status has 0 null values


In [5]:
# Count the rows that exactly repeat an earlier row
duplicate_count = df_lending.duplicated().sum()
print(f"Duplicate entries: {duplicate_count}")

Duplicate entries: 72307


In [6]:
# Keep only the first occurrence of each distinct row.
# NOTE(review): the frame has no loan identifier column, so rows flagged as
# duplicates may be distinct loans that share identical feature values —
# confirm that discarding them is intended.
clean_df_lending = df_lending.drop_duplicates(keep="first")
clean_df_lending.shape

(5229, 8)

In [7]:
# Class balance of the target after de-duplication
clean_df_lending.loc[:, "loan_status"].value_counts()

0    3653
1    1576
Name: loan_status, dtype: int64

In [8]:
# Target vector: the loan_status column (takes the values 0 and 1)
target = clean_df_lending.loc[:, "loan_status"]
# Display labels for the two classes, listed in class order (0, then 1)
target_names = ["negative", "positive"]

In [9]:
# Feature matrix: every column except the loan_status target
data = clean_df_lending.drop(columns=["loan_status"])
feature_names = data.columns
data.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


## Prediction: 
#### Logistic regression will perform better than the random forest classifier on this dataset, because logistic regression is designed to predict a binary outcome from a set of independent variables; here the target, loan status, is either 0 or 1.


In [10]:
# Hold out a test set. test_size=0.25 is scikit-learn's default, spelled out
# here for the reader; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.25,
    random_state=42,
)

In [11]:
# Peek at the first rows of the training features
X_train.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
5036,7900.0,6.488,41600,0.278846,2,0,11600
457,13500.0,8.848,63900,0.530516,7,1,33900
49687,12200.0,8.314,58800,0.489796,6,1,28800
8441,8300.0,6.658,43200,0.305556,2,0,13200
76946,20600.0,11.887,92500,0.675676,14,3,62500


# Logistic Regression - Unscaled

In [12]:
# Instantiate the logistic regression estimator with the iteration cap raised
# to 1000 for the unscaled features
model = LogisticRegression(max_iter=1_000)
model

LogisticRegression(max_iter=1000)

In [13]:
# Train the logistic regression on the unscaled training split
model.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=1000)

In [14]:
# Mean accuracy of the unscaled logistic regression on each split
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.8923743942871717
Testing Data Score: 0.918960244648318


In [15]:
# Predicted class labels for the held-out test features
y_pred = model.predict(X=X_test)

In [16]:
# Display the predicted labels (long arrays are abbreviated in the repr)
y_pred

array([0, 0, 1, ..., 0, 0, 1], dtype=int64)

In [17]:
# Confusion matrix for the unscaled logistic regression on the test split.
# scikit-learn lays it out with rows = actual class, columns = predicted class:
#   [[TN, FP],
#    [FN, TP]]
# `confusion_matrix` is imported in the import cell at the top of the notebook.
confusion_matrix(y_test, y_pred)


array([[833,  73],
       [ 33, 369]], dtype=int64)

In [18]:
# Unpack the 2x2 confusion matrix (row-major: TN, FP, FN, TP) and derive
# accuracy as correct predictions over all predictions
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
accuracy = (tn + tp) / cm.sum()
print(f"Accuracy: {accuracy}")

Accuracy: 0.918960244648318


In [19]:
# Per-class precision, recall and F1 for the unscaled logistic regression.
# `classification_report` is imported in the import cell at the top of the notebook.
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

    negative       0.96      0.92      0.94       906
    positive       0.83      0.92      0.87       402

    accuracy                           0.92      1308
   macro avg       0.90      0.92      0.91      1308
weighted avg       0.92      0.92      0.92      1308



# Random Forest Classifier - Unscaled

In [20]:
# Create the random forest classifier.
# random_state pins the forest's internal randomness (bootstrap samples and
# per-split feature selection) so the scores are reproducible on
# Restart & Run All; 42 matches the seed already used for train_test_split.
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [21]:
# Fit the random forest on the unscaled training split
classifier.fit(X=X_train, y=y_train)

RandomForestClassifier()

In [22]:
# Mean accuracy of the unscaled random forest on each split
rf_train_score = classifier.score(X_train, y_train)
rf_test_score = classifier.score(X_test, y_test)
print(f"Training Data Score: {rf_train_score}")
print(f"Testing Data Score: {rf_test_score}")

Training Data Score: 0.9655700076511095
Testing Data Score: 0.8707951070336392


In [23]:
# Predict on the test set with the random forest
# (this rebinds y_pred, which previously held the logistic regression output)
y_pred = classifier.predict(X=X_test)

In [24]:
# Fraction of test samples the random forest labelled correctly
rf_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", rf_accuracy)

Accuracy: 0.8707951070336392


In [25]:
# Confusion matrix for the random forest (rows = actual, columns = predicted)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[828,  78],
       [ 91, 311]], dtype=int64)

In [26]:
# Recompute the random forest accuracy by hand from the confusion matrix
# cells (row-major order: TN, FP, FN, TP)
cm_rf = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm_rf.ravel()
accuracy = (tn + tp) / cm_rf.sum()
print(f"Accuracy: {accuracy}")

Accuracy: 0.8707951070336392


In [27]:
# Per-class precision, recall and F1 for the unscaled random forest.
# `classification_report` is already in scope, so it is not imported again here.
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

    negative       0.90      0.91      0.91       906
    positive       0.80      0.77      0.79       402

    accuracy                           0.87      1308
   macro avg       0.85      0.84      0.85      1308
weighted avg       0.87      0.87      0.87      1308



## Scale Data

In [28]:
# Standardise the features. The scaler learns its statistics from the training
# split only and then applies the same transform to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Logistic Regression - Scaled

In [29]:
# Logistic regression with default settings, trained on the standardised features
model2 = LogisticRegression().fit(X_train_scaled, y_train)
scaled_train_score = model2.score(X_train_scaled, y_train)
scaled_test_score = model2.score(X_test_scaled, y_test)
print(f"Training Data Score: {scaled_train_score}")
print(f"Testing Data Score: {scaled_test_score}")

Training Data Score: 0.9155827595001275
Testing Data Score: 0.9327217125382263


In [30]:
# Test-set predictions and confusion matrix for the scaled logistic regression
y_pred_scaled = model2.predict(X=X_test_scaled)
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_scaled)

array([[827,  79],
       [  9, 393]], dtype=int64)

In [31]:
# Per-class metrics for the scaled logistic regression
report_scaled = classification_report(y_test, y_pred_scaled, target_names=target_names)
print(report_scaled)

              precision    recall  f1-score   support

    negative       0.99      0.91      0.95       906
    positive       0.83      0.98      0.90       402

    accuracy                           0.93      1308
   macro avg       0.91      0.95      0.92      1308
weighted avg       0.94      0.93      0.93      1308



# Random Forest Classifier - Scaled

In [32]:
# Random forest trained on the standardised features.
# random_state pins the forest's internal randomness (bootstrap samples and
# per-split feature selection) so the scores are reproducible on
# Restart & Run All; 42 matches the seed already used for train_test_split.
classifier2 = RandomForestClassifier(n_estimators=100, random_state=42)
classifier2.fit(X_train_scaled, y_train)
print(f"Training Data Score: {classifier2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier2.score(X_test_scaled, y_test)}")

Training Data Score: 0.9658250446314716
Testing Data Score: 0.867737003058104


In [33]:
# Predict on the scaled test set with the random forest
# (this rebinds y_pred_scaled, which previously held the logistic regression output)
y_pred_scaled = classifier2.predict(X=X_test_scaled)
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_scaled)

array([[828,  78],
       [ 95, 307]], dtype=int64)

In [34]:
# Per-class metrics for the scaled random forest
# (y_pred_scaled holds the random forest predictions at this point)
rf_report_scaled = classification_report(y_test, y_pred_scaled, target_names=target_names)
print(rf_report_scaled)

              precision    recall  f1-score   support

    negative       0.90      0.91      0.91       906
    positive       0.80      0.76      0.78       402

    accuracy                           0.87      1308
   macro avg       0.85      0.84      0.84      1308
weighted avg       0.87      0.87      0.87      1308



## Results - Unscaled
Logistic Regression Unscaled: Training Data Score: 0.8923743942871717; Testing Data Score: 0.918960244648318

Random Forest Classifier Unscaled: Training Data Score: 0.9655700076511095; Testing Data Score: 0.8707951070336392

On unscaled data, Logistic Regression performed better, with a testing accuracy of about 92% versus about 87% for the Random Forest Classifier.

## Results - Scaled
Logistic Regression Scaled: Training Data Score: 0.9155827595001275; Testing Data Score: 0.9327217125382263

Random Forest Classifier Scaled: Training Data Score: 0.9658250446314716; Testing Data Score: 0.867737003058104

On scaled data, Logistic Regression again performed better, with a testing accuracy of about 93% versus about 87% for the Random Forest Classifier.

## Final Results: 
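Overall, scaling improved the testing score of the Logistic Regression model from 0.92 to 0.93, and it outperformed the Random Forest Classifier both before and after scaling. The testing score of the Random Forest Classifier stayed at about 0.87 after scaling, while its training score of about 0.97 was well above its testing score, which suggests the forest is overfitting the training data.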

These results concur with my prediction that "Logistic Regression will perform better than Random Forest Classifier model for this dataset".
