# Credit Risk Resampling Techniques

In [48]:
import warnings
warnings.filterwarnings('ignore')

In [49]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

# Read the CSV into DataFrame

In [50]:
# Load the data
file_path = Path('Resources/lending_data.csv')
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


# Split the Data into Training and Testing

In [51]:
# Create our features
X = df.drop(columns=["loan_status", "homeowner"])

# Create our target
y = df.loan_status.to_frame()

In [52]:
X.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0


In [53]:
# Check the balance of our target values
y['loan_status'].value_counts()

low_risk     75036
high_risk     2500
Name: loan_status, dtype: int64

In [54]:
# Create X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [56]:
len(X_train)

62028

In [57]:
X_train

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
48847,9600.0,7.223,48600,0.382716,4,0,18600
19655,10000.0,7.376,50000,0.400000,4,1,20000
27757,9300.0,7.073,47200,0.364407,3,0,17200
8627,11600.0,8.070,56500,0.469027,5,1,26500
5099,10300.0,7.503,51200,0.414062,4,1,21200
...,...,...,...,...,...,...,...
62999,9900.0,7.346,49700,0.396378,4,0,19700
52171,10200.0,7.454,50700,0.408284,4,1,20700
60916,9600.0,7.220,48500,0.381443,4,0,18500
6270,9700.0,7.231,48600,0.382716,4,0,18600


## Data Pre-Processing

Scale the training and testing data using the `StandardScaler` from `sklearn`. Remember that when scaling the data, you only scale the features data (`X_train` and `X_testing`).

In [58]:
# Create the StandardScaler instance
from sklearn.preprocessing import StandardScaler
scale_data = StandardScaler()

In [59]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset
scale_data.fit(X_train, X_test)

StandardScaler()

In [60]:
# Scale the training and testing data

# X Data
X_scale = scale_data.transform(X_train)
X_train[:5]

X_scale = scale_data.transform(X_test)
X_test[:5]


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
41946,11300.0,7.93,55200,0.456522,5,1,25200
28187,8600.0,6.78,44400,0.324324,3,0,14400
28437,11000.0,7.787,53900,0.443414,5,1,23900
62874,9000.0,6.938,45900,0.346405,3,0,15900
47108,10100.0,7.434,50600,0.407115,4,1,20600


# Simple Logistic Regression

In [61]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [62]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_scale)
balanced_accuracy_score(y_test, y_pred)

0.8062083610062025

In [63]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 487,    4],
       [5698, 9319]])

In [64]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.08      0.99      0.62      0.15      0.78      0.64       491
   low_risk       1.00      0.62      0.99      0.77      0.78      0.59     15017

avg / total       0.97      0.63      0.98      0.75      0.78      0.59     15508



In [65]:
print(len(y_test), len(y_pred))

15508 15508


In [66]:
print(y_pred)

['high_risk' 'low_risk' 'high_risk' ... 'low_risk' 'low_risk' 'high_risk']


In [67]:
print(y_test)

      loan_status
41946    low_risk
28187    low_risk
28437    low_risk
62874    low_risk
47108    low_risk
...           ...
20526    low_risk
17000    low_risk
24684    low_risk
44320    low_risk
37378    low_risk

[15508 rows x 1 columns]


# Oversampling

In this section, you will compare two oversampling algorithms to determine which algorithm results in the best performance. You will oversample the data using the naive random oversampling algorithm and the SMOTE algorithm. For each algorithm, be sure to complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

### Naive Random Oversampling

In [68]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resample, y_resample = ros.fit_resample(X_train, y_train)

# View the count of target classes with Counter
Counter(y_resample)
y_resample.shape

(120038, 1)

In [69]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

lgmodel = LogisticRegression(solver='lbfgs', random_state=1)
lgmodel.fit(X_resample, y_resample)
y_pred = model.predict(X_scale)

In [70]:
print(y_pred)

['high_risk' 'low_risk' 'high_risk' ... 'low_risk' 'low_risk' 'high_risk']


In [71]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.8062083610062025

In [72]:
# Import Confusion Matrix
from sklearn.metrics import confusion_matrix

In [73]:
# Display the confusion matrix
cfm = confusion_matrix(y_test, y_pred)
print(cfm)

[[ 487    4]
 [5698 9319]]


In [74]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.08      0.99      0.62      0.15      0.78      0.64       491
   low_risk       1.00      0.62      0.99      0.77      0.78      0.59     15017

avg / total       0.97      0.63      0.98      0.75      0.78      0.59     15508



### SMOTE Oversampling

In [75]:
# imports SMOTE
from imblearn.over_sampling import SMOTE

In [76]:
# Resample the training data with SMOTE
X_resample, y_resample = SMOTE(random_state=1, sampling_strategy=1.0).fit_resample(
    X_train, y_train
)

# View the count of target classes with Counter
Counter(y_resample)

Counter({'loan_status': 1})

In [77]:
# Train the Logistic Regression model using the resampled data
lgmodel.fit(X_resample, y_resample)

LogisticRegression(random_state=1)

In [78]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_resample, y_resample)

1.0

In [79]:
# Display the confusion matrix
cfm = confusion_matrix(y_test, y_pred)
print(cfm)

[[ 487    4]
 [5698 9319]]


In [80]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.08      0.99      0.62      0.15      0.78      0.64       491
   low_risk       1.00      0.62      0.99      0.77      0.78      0.59     15017

avg / total       0.97      0.63      0.98      0.75      0.78      0.59     15508



# Undersampling

In this section, you will test an undersampling algorithm to determine which algorithm results in the best performance compared to the oversampling algorithms above. You will undersample the data using the Cluster Centroids algorithm and complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Display the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [81]:
# Imports 
from imblearn.under_sampling import ClusterCentroids

In [82]:
# Resample the data using the ClusterCentroids resampler
ccs=ClusterCentroids(random_state=1)
X_resample, y_resample = ccs.fit_resample(X_train, y_train)
# View the count of target classes with Counter
Counter(y_resample)

Counter({'loan_status': 1})

In [83]:
# Train the Logistic Regression model using the resampled data
lgmodel.fit(X_resample, y_resample)

LogisticRegression(random_state=1)

In [84]:
# Calculate the balanced accuracy score
balanced_accuracy_score(y_resample, y_resample)

1.0

In [85]:
# Display the confusion matrix
cfm = confusion_matrix(y_test, y_pred)
print(cfm)

[[ 487    4]
 [5698 9319]]


In [86]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.08      0.99      0.62      0.15      0.78      0.64       491
   low_risk       1.00      0.62      0.99      0.77      0.78      0.59     15017

avg / total       0.97      0.63      0.98      0.75      0.78      0.59     15508



# Combination (Over and Under) Sampling

In this section, you will test a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. You will resample the data using the SMOTEENN algorithm and complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Display the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [87]:
# Resample the training data with SMOTEENN
# YOUR CODE HERE

# View the count of target classes with Counter
# YOUR CODE HERE

In [88]:
# Train the Logistic Regression model using the resampled data
# YOUR CODE HERE

In [89]:
# Calculate the balanced accuracy score
# YOUR CODE HERE

In [90]:
# Display the confusion matrix
# YOUR CODE HERE

In [91]:
# Print the imbalanced classification report
# YOUR CODE HERE

# Final Questions

1. Which model had the best balanced accuracy score?

   I think somewhere in the last 2 i messed up the re-sampling, they all look very similar, so I am unsure

2. Which model had the best recall score?


3. Which model had the best geometric mean score?

