In [81]:
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [54]:
# Load the lending dataset from a CSV path given relative to the current
# working directory, then preview the first rows of the raw frame.
file_path = Path("Resources/lending_data.csv")
df_lending = pd.read_csv(file_path)
df_lending.head(25)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
5,10100.0,7.438,50600,0.407115,4,1,20600,0
6,10300.0,7.49,51100,0.412916,4,1,21100,0
7,8800.0,6.857,45100,0.334812,3,0,15100,0
8,9300.0,7.096,47400,0.367089,3,0,17400,0
9,9700.0,7.248,48800,0.385246,4,0,18800,0


In [55]:
# Number of rows in the raw dataset.
print(df_lending.shape[0])

77536


In [56]:
# Count missing values for all columns in one vectorised pass, then report
# each column on its own line.
null_counts = df_lending.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values")

Column loan_size has 0 null values
Column interest_rate has 0 null values
Column borrower_income has 0 null values
Column debt_to_income has 0 null values
Column num_of_accounts has 0 null values
Column derogatory_marks has 0 null values
Column total_debt has 0 null values
Column loan_status has 0 null values


In [57]:
# Count rows that exactly repeat an earlier row across every column.
n_duplicates = df_lending.duplicated().sum()
print(f"Duplicate entries: {n_duplicates}")

Duplicate entries: 72307


In [58]:
# Keep only the first occurrence of each distinct row; the raw frame is left
# untouched so this cell can be re-run safely.
# NOTE(review): this removes 72,307 of the 77,536 rows. The frame has no loan
# identifier, so identical rows may be distinct loans — confirm that dropping
# them (and the resulting shift in class balance) is intended.
clean_df_lending = df_lending.drop_duplicates(keep="first")
clean_df_lending.shape

(5229, 8)

In [59]:
# Class balance of the label after de-duplication.
clean_df_lending["loan_status"].value_counts()

0    3653
1    1576
Name: loan_status, dtype: int64

In [60]:
# Label vector for the classifiers: the binary `loan_status` column (values 0 and 1).
target = clean_df_lending["loan_status"]
# Display names handed to classification_report: "negative" for class 0,
# "positive" for class 1.
# NOTE(review): what 0 and 1 mean in business terms is not shown here —
# confirm before interpreting the reports.
target_names = ["negative", "positive"]

In [61]:
# Feature matrix: every column of the cleaned frame except the label.
data = clean_df_lending.drop(columns="loan_status")
feature_names = data.columns
data.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [62]:
# Split the data into X_train, X_test, y_train, y_test.
# random_state=42 fixes the shuffle so the same rows land in each split on
# every run; no test_size is given, so scikit-learn's default applies.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [63]:
# Peek at the shuffled training features (head() shows five rows by default).
X_train.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
5036,7900.0,6.488,41600,0.278846,2,0,11600
457,13500.0,8.848,63900,0.530516,7,1,33900
49687,12200.0,8.314,58800,0.489796,6,1,28800
8441,8300.0,6.658,43200,0.305556,2,0,13200
76946,20600.0,11.887,92500,0.675676,14,3,62500


# Logistic Regression

In [64]:
# Create the logistic-regression classifier; max_iter=1000 raises the solver's
# iteration cap above scikit-learn's default.
# NOTE(review): the features are passed to the model unscaled even though
# StandardScaler is imported at the top of the notebook — confirm whether
# scaling was intended.
model = LogisticRegression(max_iter=1000)
model

LogisticRegression(max_iter=1000)

In [65]:
# Fit the logistic-regression model on the training split.
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [66]:
# Mean accuracy of the logistic model on the training and held-out test splits.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print(f"Training Data Score: {train_accuracy}")
print(f"Testing Data Score: {test_accuracy}")

Training Data Score: 0.8923743942871717
Testing Data Score: 0.918960244648318


In [67]:
# Predict class labels for the held-out test features with the fitted logistic model.
y_pred = model.predict(X_test)

In [68]:
# Display the predicted labels (NumPy abbreviates the array repr).
y_pred

array([0, 0, 1, ..., 0, 0, 1], dtype=int64)

In [71]:
# Confusion matrix for the logistic-regression predictions
# (rows: actual class, columns: predicted class).
# `confusion_matrix` is imported in the notebook's first cell, so the metric
# cells do not depend on this cell having been run.
confusion_matrix(y_test, y_pred)


array([[833,  73],
       [ 33, 369]], dtype=int64)

In [72]:
# Flatten the 2x2 confusion matrix row by row (tn, fp, fn, tp) and derive the
# accuracy by hand as correct predictions over all predictions.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
n_correct = tp + tn
n_total = tp + fp + tn + fn
accuracy = n_correct / n_total
print(f"Accuracy: {accuracy}")

Accuracy: 0.918960244648318


In [73]:
# Per-class precision, recall and F1 for the logistic-regression predictions.
# `classification_report` is imported in the notebook's first cell.
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

    negative       0.96      0.92      0.94       906
    positive       0.83      0.92      0.87       402

    accuracy                           0.92      1308
   macro avg       0.90      0.92      0.91      1308
weighted avg       0.92      0.92      0.92      1308



# Random Forest Classifier

In [74]:
# Create the random-forest classifier.
# random_state pins the forest's internal randomness (bootstrap samples and
# per-split feature subsets) so a Restart-and-Run-All reproduces the same
# model and metrics; 42 matches the seed used for the train/test split.
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [75]:
# Fit the random forest on the same training split used for the logistic model.
classifier.fit(X_train, y_train)

RandomForestClassifier()

In [76]:
# Predict on the test set with the random forest.
# Note: this rebinds `y_pred`, replacing the logistic-regression predictions,
# so the metric cells that follow describe the random forest.
y_pred = classifier.predict(X_test)

In [77]:
# Share of test rows the random forest labels correctly.
rf_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", rf_accuracy)

Accuracy: 0.8669724770642202


In [85]:
# Confusion matrix for the random-forest predictions
# (rows: actual class, columns: predicted class).
confusion_matrix(y_test, y_pred)

array([[827,  79],
       [ 95, 307]], dtype=int64)

In [86]:
# Cross-check the random forest's accuracy by hand from the flattened
# confusion matrix (row-major order: tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
n_correct = tp + tn
n_total = tp + fp + tn + fn
accuracy = n_correct / n_total
print(f"Accuracy: {accuracy}")

Accuracy: 0.8669724770642202


In [78]:
# Per-class precision, recall and F1 for the random-forest predictions.
# `classification_report` is imported in the notebook's first cell.
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

    negative       0.90      0.91      0.90       906
    positive       0.80      0.76      0.78       402

    accuracy                           0.87      1308
   macro avg       0.85      0.84      0.84      1308
weighted avg       0.87      0.87      0.87      1308

