In [1]:
#Import dependenices
import csv
import pandas as pd
from pathlib import Path
import hvplot.pandas
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
%matplotlib inline





# Machine Learning Models

### Ensemble Model

In [2]:
# Step 1: Read the Clean_train_scaled.csv data from the Resources folder into a Pandas DataFrame.

In [3]:
# Specify the path to your CSV file
csv_file_path = '../Resources/Clean_train_scaled.csv'

# Read the CSV file into a DataFrame
loan_data = pd.read_csv(csv_file_path)

#Print size of data set
print(loan_data.shape)

# Display the DataFrame
loan_data.head(5)

(65072, 15)


Unnamed: 0,Current loan amount,Credit score,Annual income,Monthly debt,Years of credit history,Number of open accounts,Number of credit problems,Current credit balance,Maximum open credit,Long term,Short term,Home mortgage,Own home,Rent,Bankrupt
0,-0.40305,-0.248996,-0.195065,-1.08806,-0.139697,-1.023574,1.699378,-0.175312,-0.046075,0,1,1,0,0,1
1,-0.405859,-0.240861,-0.528623,-0.799685,-0.881886,-0.426898,-0.348918,-0.101568,-0.050396,0,1,0,1,0,0
2,-0.409913,4.212629,-0.445444,-0.176207,-0.125424,-1.023574,-0.348918,-0.209072,-0.067003,1,0,0,1,0,0
3,-0.40008,-0.270013,1.092393,0.011228,0.631038,-1.421359,-0.348918,0.372367,-0.026019,0,1,1,0,0,0
4,-0.409645,-0.228658,0.070678,1.696882,-0.610702,1.760918,-0.348918,0.981391,0.041627,0,1,1,0,0,1


In [4]:
# Step 2: Create the labels set (y) from the “Rent” column, and then create the features (X) DataFrame from the remaining columns.

In [5]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = loan_data['Bankrupt']

# Separate the X variable, the features
X = loan_data.drop(columns=['Bankrupt'])

In [6]:
# Review the y variable Series
y.head()

0    1
1    0
2    0
3    0
4    1
Name: Bankrupt, dtype: int64

In [7]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,Current loan amount,Credit score,Annual income,Monthly debt,Years of credit history,Number of open accounts,Number of credit problems,Current credit balance,Maximum open credit,Long term,Short term,Home mortgage,Own home,Rent
0,-0.40305,-0.248996,-0.195065,-1.08806,-0.139697,-1.023574,1.699378,-0.175312,-0.046075,0,1,1,0,0
1,-0.405859,-0.240861,-0.528623,-0.799685,-0.881886,-0.426898,-0.348918,-0.101568,-0.050396,0,1,0,1,0
2,-0.409913,4.212629,-0.445444,-0.176207,-0.125424,-1.023574,-0.348918,-0.209072,-0.067003,1,0,0,1,0
3,-0.40008,-0.270013,1.092393,0.011228,0.631038,-1.421359,-0.348918,0.372367,-0.026019,0,1,1,0,0
4,-0.409645,-0.228658,0.070678,1.696882,-0.610702,1.760918,-0.348918,0.981391,0.041627,0,1,1,0,0


In [8]:
# Check the balance of our target values
y.value_counts()

0    57817
1     7255
Name: Bankrupt, dtype: int64

In [9]:
# Step 3: Split the data into training and testing datasets by using train_test_split.

In [10]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [11]:
# Create a Logistic Regression Model with the Original Data
# Step 1:Fit a logistic regression model by using the training data (X_train and y_train).

In [12]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
lr_model = logistic_regression_model.fit(X_train, y_train)

In [13]:
# Step 2: Save the predictions on the testing data labels by using the testing feature data (X_test) and the fitted model.

In [14]:
# Make a prediction using the testing data
LR_predictions = lr_model.predict(X_test)

In [15]:
# Make a prediction using the testing data
test_predictions = logistic_regression_model.predict(X_test)
pd.DataFrame({'Predictions': test_predictions, 'Actual': y_test})

Unnamed: 0,Predictions,Actual
52624,0,0
43771,0,0
34703,0,0
63592,0,0
64962,0,0
...,...,...
52237,0,0
47472,0,0
50780,0,0
20425,0,0


In [16]:
# Step 3: Evaluate the model’s performance by doing the following:
# 1. Generate Confusion Matrix
# 2. Print Classification report

In [17]:
# Generate a confusion matrix for the model
cf_test_matrix = confusion_matrix(y_test, test_predictions)
cf_test_matrix 

array([[14475,     0],
       [ 1793,     0]], dtype=int64)

In [18]:
cm_imbalanced = confusion_matrix(y_test, LR_predictions)
cm_imbalanced_df = pd.DataFrame(cm_imbalanced, 
                                index = ['Actual Healthy Loans (low-risk)', 
                                'Actual Non-Healthy Loans (high-risk)'], 
                                columns = ['Predicted Healthy Loans (low-risk)', 'Predicted Non-Healthy Loans (high-risk)']
                              )
cm_imbalanced_df

Unnamed: 0,Predicted Healthy Loans (low-risk),Predicted Non-Healthy Loans (high-risk)
Actual Healthy Loans (low-risk),14475,0
Actual Non-Healthy Loans (high-risk),1793,0


In [19]:
# Print the classification report for the model
testing_report = classification_report(y_test, test_predictions)
print(testing_report)

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     14475
           1       0.00      0.00      0.00      1793

    accuracy                           0.89     16268
   macro avg       0.44      0.50      0.47     16268
weighted avg       0.79      0.89      0.84     16268



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# My balanced accuracy score is 55%!  It takes into account both sensitivity (true positive rate) and specificity 
#(true negative rate) to measure the performance of my model, especially when dealing with imbalanced datasets. With 80,240 rejected
#loans and only 443 accepted ones. It’s important to have metric that considers this imbalance.

# And guess what? My model’s accuracy score is a whopping 99% That means it correctly predicted 99% of all loans in the dataset.That’s 
# pretty incredible right?

#Looking at the confusion matrix my model did a great job by accepting 20,045 healthy loans and rejecting 10 high-risk loans accurately.
#However, they were a few cases where it made some mistakes like rejecting 30 healthy loans and accepting 86 high-risk loans. But 
#overall, it seems like my model is doing a fantastic job.

# Precision and recall are also important measures. My model has a precision score of 25% for default loans and perfect 100% for healthy 
#loans. This shows how well my prediction matched reality. And when it comes to recall which looks at how well reality matched the 
#predictions my model scored 10% for default loans and 100% for healthy loans.

# Finally, the F1 score. It’s the weighted average of precision and recall scores, and it is the great way to evaluate the overall 
# performance of my model. F1 score is 15% for default loans 100% for healthy loans.

In [22]:
# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
ROS_model = RandomOverSampler(random_state = 1)

In [23]:
# Fit the original training data to the random_oversampler model
X_oversampled, y_oversampled = ROS_model.fit_resample(X_train, y_train)

In [24]:
# Count the distinct values of the resampled labels data
y_oversampled.value_counts()

0    43342
1    43342
Name: Bankrupt, dtype: int64

In [25]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
LR_oversampled_model = LogisticRegression(random_state = 1)

In [26]:
# Fit the model using the resampled training data
LR_oversampled_model.fit(X_oversampled, y_oversampled)

In [27]:
# Make a prediction using the testing data
LR_oversampled_pred = LR_oversampled_model.predict(X_test)

In [28]:
# Print the balanced_accuracy score of the model 
balanced_accuracy_score(y_test, LR_oversampled_pred)

0.5117416512305097

In [29]:
# Generate a confusion matrix for the model
cm_oversampled = confusion_matrix(y_test, LR_oversampled_pred)
cm_oversampled_df = pd.DataFrame(cm_oversampled, 
                                index = ['Actual Healthy Loans (low-risk)', 
                                'Actual Non-Healthy Loans (high-risk)'], 
                                columns = ['Predicted Healthy Loans (low-risk)', 'Predicted Non-Healthy Loans (high-risk)']
                              )
cm_oversampled_df

Unnamed: 0,Predicted Healthy Loans (low-risk),Predicted Non-Healthy Loans (high-risk)
Actual Healthy Loans (low-risk),5321,9154
Actual Non-Healthy Loans (high-risk),617,1176


In [30]:
'''
-- There were 19,286 loan status's that are healthy, and 96 high risk non healthy loans which were predicted correctly. 

-- 249 loan status's that are non-healthy (low-risk), the model 
   predicted predicted wrongly. '''

"\n-- There were 19,286 loan status's that are healthy, and 96 high risk non healthy loans which were predicted correctly. \n\n-- 249 loan status's that are non-healthy (low-risk), the model \n   predicted predicted wrongly. "

In [31]:
# Print the classification report for the model
print(classification_report(y_test, LR_oversampled_pred))

              precision    recall  f1-score   support

           0       0.90      0.37      0.52     14475
           1       0.11      0.66      0.19      1793

    accuracy                           0.40     16268
   macro avg       0.50      0.51      0.36     16268
weighted avg       0.81      0.40      0.49     16268



In [32]:
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

  warn(
  warn(


In [33]:
y_pred = rf_model.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print(acc_score)

0.5168428817310057


In [34]:
matrix = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    matrix, index=["Actual High-Risk", "Actual Low-Risk"], columns=["Predicted High_Risk", "Prediceted Low_Risk"])
cm_df

Unnamed: 0,Predicted High_Risk,Prediceted Low_Risk
Actual High-Risk,7537,6938
Actual Low-Risk,922,871


In [35]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.52      0.49      0.66      0.50      0.25     14475
          1       0.11      0.49      0.52      0.18      0.50      0.25      1793

avg / total       0.81      0.52      0.49      0.60      0.50      0.25     16268



In [36]:
# List the features sorted in descending order by feature importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.12918476554506575, 'Annual income'),
 (0.12893784843980505, 'Maximum open credit'),
 (0.12841595526200028, 'Monthly debt'),
 (0.12709648919123637, 'Current credit balance'),
 (0.12043545484097758, 'Years of credit history'),
 (0.11635559614027689, 'Current loan amount'),
 (0.1151395908899691, 'Credit score'),
 (0.08370366463159516, 'Number of open accounts'),
 (0.01371003142176484, 'Number of credit problems'),
 (0.012298957997739345, 'Home mortgage'),
 (0.011672385408330639, 'Rent'),
 (0.007428618429780446, 'Own home'),
 (0.002881718674058854, 'Short term'),
 (0.0027389231273996922, 'Long term')]

In [37]:
#Easy Ensemble AdaBoost Classifier

In [38]:
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)

In [39]:
y_pred = eec.predict(X_test)
acc_score2 = accuracy_score(y_test, y_pred)
print(acc_score2)

0.46004425866732235


In [40]:
cm_df = pd.DataFrame(
    matrix, index=["Actual High-Risk", "Actual Low-Risk"], columns=["Predicted High_Risk", "Prediceted Low_Risk"])
cm_df

Unnamed: 0,Predicted High_Risk,Prediceted Low_Risk
Actual High-Risk,7537,6938
Actual Low-Risk,922,871


In [41]:
# Print the imbalanced classification report

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.45      0.57      0.60      0.51      0.25     14475
          1       0.11      0.57      0.45      0.19      0.51      0.26      1793

avg / total       0.81      0.46      0.56      0.55      0.51      0.25     16268

