# RICE-VIRT-DATA-PT-05-2022-U-B-MW Final Project

## Code Summary
- **Purpose  :** Evaluation of Crime Trends Machine Learning Resampling Algorithms 
- **Created  :** 2022 Sept 29 22:25:12 UTC (Meghan E. Hull)

## Dependencies

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [None]:
import re
from sqlalchemy import create_engine
import psycopg2

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

## Version Check

In [4]:
!python --version

Python 3.7.13


In [5]:
!conda list | findstr numpy

numpy                     1.21.5           py37h7a0a035_1  
numpy-base                1.21.5           py37hca35cd5_1  
numpydoc                  1.2                pyhd3eb1b0_0  


In [6]:
!conda list | findstr pandas

pandas                    1.3.5            py37h6214cd6_0  


In [7]:
!conda list | findstr scipy

scipy                     1.7.3            py37h0a974cb_0  


In [8]:
!conda list | findstr scikit-learn

scikit-learn              1.0.2            py37hf11a4ad_1  
scikit-learn-intelex      2021.5.0         py37haa95532_0  


In [9]:
!conda list | findstr imbalanced-learn

imbalanced-learn          0.9.0                    pypi_0    pypi


## Institate a Linear Regression Model

In [9]:
model = LinearRegression()

## Report Tables

In [10]:
summary_df = pd.DataFrame(columns=['Balanced Accuracy Score', 
                                   'High Risk Precision Score', 
                                   'Low Risk Precision Score', 
                                   'High Risk Recall Score', 
                                   'Low Risk Recall Score',
                                   'High Risk F1 Score', 
                                   'Low Risk F1 Score'])

# 1. Import & Prep Client Data

## 1.1 Import & Initial Cleaning of Data

In [None]:
# Import database password
from config import db_password

# Define connection string
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/Crime_AnalysisDB"

In [11]:
# File name
file_path = Path('Data/LoanStats_2019Q1.csv')
file_path

WindowsPath('Data/LoanStats_2019Q1.csv')

## 1.2 Split into Training & Testing
The data has been split along the following parameters:
- Target values are in the column "loan_status"
- Features values are converted from strings to numbers using the `get_dummies()` method, excluding the columns:
  - Text columns - "home_ownership", "verification_status", "initial_list_status", "application_type"
  - Date columns - "issue_d", "next_pymnt_d"
  - Boolean / "Y/N" columns - "pymnt_plan", "hardship_flag", "debt_settlement_flag"

In [15]:
# Create target
y = pd.DataFrame(df["loan_status"])

# Create features
X = pd.get_dummies(df, columns=['home_ownership', 
                                'verification_status', 
                                'issue_d', 
                                'pymnt_plan',
                                'initial_list_status', 
                                'next_pymnt_d', 
                                'application_type', 
                                'hardship_flag', 
                                'debt_settlement_flag']).drop('loan_status', axis=1)


In [16]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,issue_d_Mar-2019,pymnt_plan_n,initial_list_status_f,initial_list_status_w,next_pymnt_d_Apr-2019,next_pymnt_d_May-2019,application_type_Individual,application_type_Joint App,hardship_flag_N,debt_settlement_flag_N
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,21.778153,0.217766,0.497697,12.58734,0.12603,17604.142828,...,0.177238,1.0,0.123879,0.876121,0.383161,0.616839,0.86034,0.13966,1.0,1.0
std,10277.34859,0.04813,288.062432,115580.0,20.199244,0.718367,0.758122,6.022869,0.336797,21835.8804,...,0.381873,0.0,0.329446,0.329446,0.486161,0.486161,0.346637,0.346637,0.0,0.0
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,9000.0,0.0881,265.73,50000.0,13.89,0.0,0.0,8.0,0.0,6293.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
50%,15000.0,0.118,404.56,73000.0,19.76,0.0,0.0,11.0,0.0,12068.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
75%,24000.0,0.1557,648.1,104000.0,26.66,0.0,1.0,16.0,0.0,21735.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
max,40000.0,0.3084,1676.23,8797500.0,999.0,18.0,5.0,72.0,4.0,587191.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
y.head(10)

Unnamed: 0,loan_status
0,low_risk
1,low_risk
2,low_risk
3,low_risk
4,low_risk
5,low_risk
6,low_risk
7,low_risk
8,low_risk
9,low_risk


In [18]:
# Check the balance of target values
y['loan_status'].value_counts()

low_risk     68470
high_risk      347
Name: loan_status, dtype: int64

In [19]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
print('Feature (X) Data Sets')
print(f'X_train Shape: {X_train.shape}')
print(f'X_test Shape: {X_test.shape}')
print('')
print('Target (y) Data Sets')
print(f'y_train Shape: {y_train.shape}')
print(f'y_test Shape: {y_test.shape}')
print('')
print('Target (y) Counters')
print(f'y_train: {Counter(y_train)}')
print(f'y_test: {Counter(y_test)}')

Feature (X) Data Sets
X_train Shape: (51612, 95)
X_test Shape: (17205, 95)

Target (y) Data Sets
y_train Shape: (51612, 1)
y_test Shape: (17205, 1)

Target (y) Counters
y_train: Counter({'loan_status': 1})
y_test: Counter({'loan_status': 1})


# 2. Oversampling
## 2.1 Overview
In this section, two oversampling algorithms are compared to determine which algorithm results in the best performance: the naive random oversampling algorithm and the SMOTE algorithm. Each algorithm uses the folliowing steps:

1. Resample data with algorithm & view the count of the target classes using `Counter` from the collections library. 
3. Train a logistic regression model with the resampled data.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

**Note:** A random state of 1 is used for each sampling algorithm to ensure consistency between tests

## 2.2 Naive Random Oversampling (`RandomOverSampler`)

### 2.2.1 Resample training data with algorithm

In [20]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

# Instantiate the model
ros = RandomOverSampler(random_state=1)

# Resample the targets
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'loan_status': 1})

### 2.2.2 Train model & predict test target values

In [21]:
# Train the Logistic Regression model using the resampled data
classifier.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=200, random_state=1)

In [22]:
# Generate predicted target values
y_pred = classifier.predict(X_test)

### 2.2.3 Balanced Accuracy Score

In [23]:
# Calculated the balanced accuracy score
modelBAScore = balanced_accuracy_score(y_test, y_pred)
modelBAScore

0.6285327805778149

### 2.2.4 Confusion Matrix

In [24]:
# Display the confusion matrix
myCM = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
myCM_df = pd.DataFrame(
    myCM, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
myCM_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,52,35
Actual low_risk,5831,11287


### 2.2.5 Classification Report

In [25]:
# Print the imbalanced classification report
myReport = classification_report_imbalanced(y_test, y_pred, output_dict=True)
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      0.60      0.66      0.02      0.63      0.39        87
   low_risk       1.00      0.66      0.60      0.79      0.63      0.40     17118

avg / total       0.99      0.66      0.60      0.79      0.63      0.40     17205



In [26]:
summary_df.loc['Naive Random Oversampling'] = [modelBAScore,
                                               myReport['high_risk']['pre'],
                                               myReport['low_risk']['pre'],
                                               myReport['high_risk']['rec'],
                                               myReport['low_risk']['rec'],
                                               myReport['high_risk']['f1'],
                                               myReport['low_risk']['f1']]
summary_df

Unnamed: 0,Balanced Accuracy Score,High Risk Precision Score,Low Risk Precision Score,High Risk Recall Score,Low Risk Recall Score,High Risk F1 Score,Low Risk F1 Score
Naive Random Oversampling,0.628533,0.008839,0.996909,0.597701,0.659364,0.01742,0.793741


## 2.3 SMOTE Oversampling

### 2.3.1 Resample training data with algorithm

In [27]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

# Instantiate the model
mySMOTE = SMOTE(random_state=1)

# Resample the targets
X_resampled, y_resampled = mySMOTE.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'loan_status': 1})

### 2.3.2 Train model & predict test target values

In [28]:
# Train the Logistic Regression model using the resampled data
classifier.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=200, random_state=1)

In [29]:
# Generate predicted target values
y_pred = classifier.predict(X_test)

### 2.3.3 Balanced Accuracy Score

In [30]:
# Calculated the balanced accuracy score
modelBAScore = balanced_accuracy_score(y_test, y_pred)
modelBAScore

0.6417211565966053

### 2.3.4 Confusion Matrix

In [31]:
# Display the confusion matrix
myCM = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
myCM_df = pd.DataFrame(
    myCM, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
myCM_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,54,33
Actual low_risk,5773,11345


### 2.3.5 Classification Report

In [32]:
# Print the imbalanced classification report
myReport = classification_report_imbalanced(y_test, y_pred, output_dict=True)
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      0.62      0.66      0.02      0.64      0.41        87
   low_risk       1.00      0.66      0.62      0.80      0.64      0.41     17118

avg / total       0.99      0.66      0.62      0.79      0.64      0.41     17205



In [33]:
summary_df.loc['SMOTE Oversampling'] = [modelBAScore,
                                               myReport['high_risk']['pre'],
                                               myReport['low_risk']['pre'],
                                               myReport['high_risk']['rec'],
                                               myReport['low_risk']['rec'],
                                               myReport['high_risk']['f1'],
                                               myReport['low_risk']['f1']]
summary_df

Unnamed: 0,Balanced Accuracy Score,High Risk Precision Score,Low Risk Precision Score,High Risk Recall Score,Low Risk Recall Score,High Risk F1 Score,Low Risk F1 Score
Naive Random Oversampling,0.628533,0.008839,0.996909,0.597701,0.659364,0.01742,0.793741
SMOTE Oversampling,0.641721,0.009267,0.9971,0.62069,0.662753,0.018262,0.796252


# 3. Undersampling

## 3.1 Overview
In this section, the undersampling algorithm Cluster Centroids is utilized. The same steps are used as for oversampling:

1. Resample data with algorithm & view the count of the target classes using `Counter` from the collections library. 
3. Train a logistic regression model with the resampled data.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

**Note:** A random state of 1 is used for each sampling algorithm to ensure consistency between tests

## 3.2 Cluster Centroids

### 3.2.1 Resample training data with algorithm

In [34]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids

# Instantiate the model
myCC = ClusterCentroids(random_state=1)

# Resample the targets
X_resampled, y_resampled = myCC.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'loan_status': 1})

### 3.2.2 Train model & predict test target values

In [35]:
# Train the Logistic Regression model using the resampled data
classifier.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=200, random_state=1)

In [36]:
# Generate predicted target values
y_pred = classifier.predict(X_test)

### 3.2.3 Balanced Accuracy Score

In [37]:
# Calculated the balanced accuracy score
modelBAScore = balanced_accuracy_score(y_test, y_pred)
modelBAScore

0.5300701822239949

### 3.2.4 Confusion Matrix

In [38]:
# Display the confusion matrix
myCM = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
myCM_df = pd.DataFrame(
    myCM, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
myCM_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,56,31
Actual low_risk,9989,7129


### 3.2.5 Classification Report

In [39]:
# Print the imbalanced classification report
myReport = classification_report_imbalanced(y_test, y_pred, output_dict=True)
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      0.64      0.42      0.01      0.52      0.27        87
   low_risk       1.00      0.42      0.64      0.59      0.52      0.26     17118

avg / total       0.99      0.42      0.64      0.58      0.52      0.26     17205



In [40]:
summary_df.loc['Cluster Centroids Undersampling'] = [modelBAScore,
                                               myReport['high_risk']['pre'],
                                               myReport['low_risk']['pre'],
                                               myReport['high_risk']['rec'],
                                               myReport['low_risk']['rec'],
                                               myReport['high_risk']['f1'],
                                               myReport['low_risk']['f1']]
summary_df

Unnamed: 0,Balanced Accuracy Score,High Risk Precision Score,Low Risk Precision Score,High Risk Recall Score,Low Risk Recall Score,High Risk F1 Score,Low Risk F1 Score
Naive Random Oversampling,0.628533,0.008839,0.996909,0.597701,0.659364,0.01742,0.793741
SMOTE Oversampling,0.641721,0.009267,0.9971,0.62069,0.662753,0.018262,0.796252
Cluster Centroids Undersampling,0.53007,0.005575,0.99567,0.643678,0.416462,0.011054,0.587281


# 4. Combination (Over and Under) Sampling
## 4.1 Overview
In this section, a combination over- and under-sampling algorithm SMOTEENN is utilized. The same steps are used as for oversampling:

1. Resample data with algorithm & view the count of the target classes using `Counter` from the collections library. 
3. Train a logistic regression model with the resampled data.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

**Note:** A random state of 1 is used for each sampling algorithm to ensure consistency between tests

## 4.2 SMOTEENN

### 4.2.1 Resample training data with algorithm

In [41]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

# Instantiate the model
mySMOTEENN = SMOTEENN(random_state=1)

# Resample the targets
X_resampled, y_resampled = mySMOTEENN.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'loan_status': 1})

### 4.2.2 Train model & predict test target values

In [42]:
# Train the Logistic Regression model using the resampled data
classifier.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=200, random_state=1)

In [43]:
# Generate predicted target values
y_pred = classifier.predict(X_test)

### 4.2.3 Balanced Accuracy Score

In [44]:
# Calculated the balanced accuracy score
modelBAScore = balanced_accuracy_score(y_test, y_pred)
modelBAScore

0.651763016143523

### 4.2.4 Confusion Matrix

In [45]:
# Display the confusion matrix
myCM = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
myCM_df = pd.DataFrame(
    myCM, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
myCM_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,59,28
Actual low_risk,6413,10705


### 4.2.5 Classification Report

In [46]:
# Print the imbalanced classification report
myReport = classification_report_imbalanced(y_test, y_pred, output_dict=True)
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      0.68      0.63      0.02      0.65      0.43        87
   low_risk       1.00      0.63      0.68      0.77      0.65      0.42     17118

avg / total       0.99      0.63      0.68      0.76      0.65      0.42     17205



In [47]:
summary_df.loc['SMOTEENN Hybrid Sampling'] = [modelBAScore,
                                               myReport['high_risk']['pre'],
                                               myReport['low_risk']['pre'],
                                               myReport['high_risk']['rec'],
                                               myReport['low_risk']['rec'],
                                               myReport['high_risk']['f1'],
                                               myReport['low_risk']['f1']]
summary_df

Unnamed: 0,Balanced Accuracy Score,High Risk Precision Score,Low Risk Precision Score,High Risk Recall Score,Low Risk Recall Score,High Risk F1 Score,Low Risk F1 Score
Naive Random Oversampling,0.628533,0.008839,0.996909,0.597701,0.659364,0.01742,0.793741
SMOTE Oversampling,0.641721,0.009267,0.9971,0.62069,0.662753,0.018262,0.796252
Cluster Centroids Undersampling,0.53007,0.005575,0.99567,0.643678,0.416462,0.011054,0.587281
SMOTEENN Hybrid Sampling,0.651763,0.009116,0.997391,0.678161,0.625365,0.017991,0.768734


In [48]:
filepath = Path('Data/Resampling_Summary.csv')
summary_df.to_csv(filepath)