In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report,f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef
from imblearn.under_sampling import TomekLinks
from collections import Counter
from imblearn.under_sampling import NearMiss
from joblib import dump
from joblib import load
from sklearn.metrics import precision_recall_curve, auc
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN
import pickle
from sklearn.ensemble import RandomForestClassifier
from imblearn.metrics import geometric_mean_score


DATA PREPARATION

In [None]:
# Load the credit-card transactions and standardise the two raw-valued columns.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later, so test rows influence the scaling
# statistics - consider fitting on the training portion only.
df = pd.read_csv('creditcard.csv')
scaler = StandardScaler()

# The same scaler instance is re-fitted for each column, so after the loop it
# only holds the statistics of the last column ('Time').
for column in ('Amount', 'Time'):
    df[column] = scaler.fit_transform(df[[column]])

# 'Class' is the target; every remaining column is used as a feature.
y = df['Class']
X = df.drop('Class', axis=1)

TRAIN/TEST SPLIT (80% / 20%), HYPERPARAMETER TUNING FOR RANDOM FOREST, STRATIFIED K-FOLD FOR VALIDATION

In [None]:
# Hold out 20% of the rows as the test set; stratify on the target so both
# partitions keep the class ratio of the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Search space used by the grid searches on the (re)sampled training data.
# max depth / min sample split were tried as well but led to overfitting,
# so they are left out of the grid.
param_grid = {
    'n_estimators': [200, 400, 600],
    'criterion': ["gini", "entropy"],
    'max_features': ['sqrt', 'log2'],
}

# Cost-sensitive search space: the same forest settings plus candidate class
# weights (order of the candidates is kept as originally listed).
param_grid_cost = {
    **param_grid,
    'class_weight': ["balanced"] + [
        {0: 1, 1: weight} for weight in (1.5, 1, 2, 10, 3, 4, 5, 15, 20)
    ],
}

# Empty grid: a search over it evaluates only the estimator's own settings.
param_grid_default = {}

# Shuffled, stratified 5-fold splitter shared by every grid search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


FUNCTION TO EVALUATE PERFORMANCE 

In [None]:
def evaluate_model_performance(logistic, X_test, y_test):
    """Print hold-out classification metrics for a fitted classifier.

    Parameters
    ----------
    logistic : fitted estimator
        Only its ``predict`` method is called, so despite the parameter name
        any fitted classifier can be passed.
    X_test : features to predict on.
    y_test : true labels aligned with ``X_test``.

    Returns
    -------
    None
        Precision, recall, F1, the confusion matrix, the classification
        report and the geometric mean are printed, not returned.
    """
    predictions = logistic.predict(X_test)

    # Every metric is computed before anything is printed; the list order is
    # the order in which the lines appear in the output.
    report_lines = [
        f"Precision: {precision_score(y_test, predictions)}",
        f"Recall: {recall_score(y_test, predictions)}",
        f"f1-score: {f1_score(y_test, predictions)}",
        f"Confusion Matrix:\n{confusion_matrix(y_test, predictions)}",
        "\n",
        f"Classification Report:\n{classification_report(y_test, predictions)}",
        f"Geometric Mean: {geometric_mean_score(y_test, predictions)}",
    ]
    for line in report_lines:
        print(line)

FUNCTION FOR PRECISION-RECALL (PR) CURVE:

In [None]:
def PRCurve(model, X_test, y_test, title):
    """Plot the precision-recall curve of a fitted classifier and print its AUPRC.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``
        Column 1 of its output is used as the score of the positive class.
    X_test : features to score.
    y_test : true binary labels aligned with ``X_test``.
    title : str
        Title drawn above the plot.

    Returns
    -------
    None
        The AUPRC is printed and the figure is shown.
    """
    y_scores = model.predict_proba(X_test)[:, 1]

    precision, recall, _ = precision_recall_curve(y_test, y_scores)

    # Area under the (recall, precision) points as computed by sklearn's auc().
    auprc = auc(recall, precision)
    print(f"Area Under the Precision-Recall Curve (AUPRC): {auprc}")

    fig, ax = plt.subplots()
    ax.plot(recall, precision, label='Precision-Recall curve', color='red')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title(title)
    # The curve carries a label; without a legend it would never be displayed.
    ax.legend()
    plt.show()

In [None]:
# Baseline: tune the forest on the untouched training split, selecting the
# candidate with the best F1 across the stratified folds.
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Last expression of the cell: show the winning hyperparameters.
grid_search.best_params_

In [None]:
# Keep the refit winner of the baseline search and persist it to disk.
rf_baseline = grid_search.best_estimator_
dump(value=rf_baseline, filename='Models/rf_Baseline.joblib')

In [None]:
# Reload the persisted baseline model and report its hold-out metrics.
rf_baseline = load(filename='Models/rf_Baseline.joblib')
evaluate_model_performance(rf_baseline, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve of the baseline model on the hold-out set.
PRCurve(
    model=rf_baseline,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Hyperparameter Tuned Random Forest",
)

**UNDERSAMPLING TECHNIQUES**

Tomek Links:

In [None]:
# Reuse the Tomek-links resampled data produced in a previous notebook
# (recomputing it takes a long time).
# NOTE(review): pickle.load can execute arbitrary code - only load files you
# created yourself. Presumably this pickle was built from the same training
# split as X_train here; verify, otherwise hold-out rows could be part of it.
with open('resampled_Tomek.pkl', 'rb') as pkl_file:
    X_tomek, Y_tomek = pickle.load(pkl_file)

In [None]:
# Grid search on the Tomek-links resampled data loaded from disk.
# NOTE(review): the data was resampled before cross-validation, so the
# validation folds are resampled as well.
rf = RandomForestClassifier(random_state=42)
grid_search_tl = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search_tl.fit(X_tomek, Y_tomek)

# Show the winning hyperparameters.
grid_search_tl.best_params_

In [None]:
# Keep the refit winner of the Tomek-links search and persist it to disk.
rf_tl = grid_search_tl.best_estimator_
dump(value=rf_tl, filename='Models/rf_TomekLink.joblib')

In [None]:
# Reload the persisted Tomek-links model and report its hold-out metrics.
rf_tomek = load(filename='Models/rf_TomekLink.joblib')
evaluate_model_performance(rf_tomek, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve of the Tomek-links model on the hold-out set.
PRCurve(
    model=rf_tomek,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Random Forest Tomek LinK",
)

**Near Miss 3 Implementations:**

In [None]:
# NearMiss (version 1) undersampling of the training split, followed by the
# grid search on the reduced data.
# NOTE(review): resampling happens before cross-validation, so the validation
# folds are resampled too and the CV F1 may not reflect the original class
# distribution - consider putting the sampler inside an imblearn Pipeline.
nearMiss1 = NearMiss(version=1, n_jobs=-1)
X_resampled_nm1, Y_resampled_nm1 = nearMiss1.fit_resample(X_train, y_train)

rf = RandomForestClassifier(random_state=42)
grid_search_nm1 = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search_nm1.fit(X_resampled_nm1, Y_resampled_nm1)

# Class counts before and after undersampling.
print(f"Before resampling: {Counter(y_train)}")
print(f"After resampling with NearMiss-1: {Counter(Y_resampled_nm1)}")

# Show the winning hyperparameters.
grid_search_nm1.best_params_

In [None]:
# Keep the refit winner of the NearMiss-1 search and persist it to disk.
rf_NearMiss1 = grid_search_nm1.best_estimator_
dump(value=rf_NearMiss1, filename='Models/rf_NearMiss1.joblib')

In [None]:
# Reload the persisted NearMiss-1 model and report its hold-out metrics.
rf_NearMiss1 = load(filename='Models/rf_NearMiss1.joblib')
evaluate_model_performance(rf_NearMiss1, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve of the NearMiss-1 model on the hold-out set.
PRCurve(
    model=rf_NearMiss1,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Random Forest NearMiss(1)",
)

Near Miss 2:

In [None]:
# NearMiss (version 2) undersampling of the training split, followed by the
# grid search on the reduced data.
# NOTE(review): the data is resampled before cross-validation, so the
# validation folds are resampled as well.
rf = RandomForestClassifier(random_state=42)
nearMiss2 = NearMiss(version=2, n_jobs=-1)
X_resampled_nm2, Y_resampled_nm2 = nearMiss2.fit_resample(X_train, y_train)

grid_search_nm2 = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search_nm2.fit(X_resampled_nm2, Y_resampled_nm2)

# Show the winning hyperparameters.
grid_search_nm2.best_params_

In [None]:
# Keep the refit winner of the NearMiss-2 search and persist it to disk.
rf_NearMiss2 = grid_search_nm2.best_estimator_
dump(value=rf_NearMiss2, filename='Models/rf_NearMiss2.joblib')

In [None]:
# Reload the persisted NearMiss-2 model and report its hold-out metrics.
rf_NearMiss2 = load(filename='Models/rf_NearMiss2.joblib')
evaluate_model_performance(rf_NearMiss2, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve of the NearMiss-2 model on the hold-out set.
PRCurve(
    model=rf_NearMiss2,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Random Forest NearMiss(2)",
)

Near Miss 3 :

In [None]:
# NearMiss (version 3) undersampling of the training split, followed by the
# grid search on the reduced data.
# NOTE(review): the data is resampled before cross-validation, so the
# validation folds are resampled as well.
rf = RandomForestClassifier(random_state=42)
nearMiss3 = NearMiss(version=3, n_jobs=-1)
X_resampled_nm3, Y_resampled_nm3 = nearMiss3.fit_resample(X_train, y_train)

grid_search_nm3 = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search_nm3.fit(X_resampled_nm3, Y_resampled_nm3)

# Show the winning hyperparameters.
grid_search_nm3.best_params_

In [None]:
# Keep the refit winner of the NearMiss-3 search and persist it to disk.
rf_NearMiss3 = grid_search_nm3.best_estimator_
dump(value=rf_NearMiss3, filename='Models/rf_NearMiss3.joblib')

In [None]:
# Reload the persisted NearMiss-3 model and report its hold-out metrics.
rf_NearMiss3 = load(filename='Models/rf_NearMiss3.joblib')
evaluate_model_performance(rf_NearMiss3, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve of the NearMiss-3 model on the hold-out set.
PRCurve(
    model=rf_NearMiss3,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Random Forest NearMiss(3)",
)

**OVERSAMPLING TECHNIQUES:**

SMOTE:

In [None]:
# SMOTE oversampling of the training split, followed by the grid search on
# the oversampled data.
# n_jobs is deliberately not passed to SMOTE: imbalanced-learn deprecated that
# argument in 0.10 and newer releases no longer accept it. It only controlled
# parallelism, so the resampled data is unchanged.
# NOTE(review): resampling happens before cross-validation, so synthetic
# samples end up in the validation folds and the CV F1 is optimistic -
# consider putting the sampler inside an imblearn Pipeline.
print(f"Features shape: {X_train.shape}")
smote = SMOTE(random_state=42)

rf = RandomForestClassifier(random_state=42)

X_resampled_smote, Y_resampled_smote = smote.fit_resample(X_train, y_train)
print(f"Features shape: {X_resampled_smote.shape}")

grid_search_smote = GridSearchCV(rf, param_grid, cv=cv, scoring='f1', verbose=1, n_jobs=-1)
grid_search_smote.fit(X_resampled_smote, Y_resampled_smote)

# Show the winning hyperparameters.
grid_search_smote.best_params_

In [None]:
# Keep the refit winner of the SMOTE search and persist it to disk.
rf_smote = grid_search_smote.best_estimator_
dump(value=rf_smote, filename='Models/rf_Smote.joblib')

In [None]:
# Shapes of the training data before SMOTE ...
print(
    f"Features shape: {X_train.shape}",
    f"Target shape: {y_train.shape}",
    "\nAfter SMOTE:",
    sep="\n",
)

# ... and after it.
print(
    f"Features shape: {X_resampled_smote.shape}",
    f"Target shape: {Y_resampled_smote.shape}",
    sep="\n",
)

In [None]:
# Reload the persisted SMOTE model and report its hold-out metrics.
rf_SMOTE = load(filename='Models/rf_Smote.joblib')
evaluate_model_performance(rf_SMOTE, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve of the SMOTE model on the hold-out set.
PRCurve(
    model=rf_SMOTE,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Random Forest SMOTE",
)

ADASYN:

In [None]:
# ADASYN oversampling of the training split, followed by the grid search on
# the oversampled data.
# n_jobs is deliberately not passed to ADASYN: imbalanced-learn deprecated that
# argument in 0.10 and newer releases no longer accept it. It only controlled
# parallelism, so the resampled data is unchanged.
# NOTE(review): resampling happens before cross-validation, so synthetic
# samples end up in the validation folds.
print(f"Features shape: {X_train.shape}")
adasyn = ADASYN(random_state=42)
rf = RandomForestClassifier(random_state=42)

X_resampled_ad, Y_resampled_ad = adasyn.fit_resample(X_train, y_train)
print(f"Features shape: {X_resampled_ad.shape}")

grid_search_ad = GridSearchCV(rf, param_grid, cv=cv, scoring='f1', verbose=1, n_jobs=-1)
grid_search_ad.fit(X_resampled_ad, Y_resampled_ad)

# Show the winning hyperparameters.
grid_search_ad.best_params_

In [None]:
# Persist the refit winner of the ADASYN search.
dump(
    value=grid_search_ad.best_estimator_,
    filename='Models/rf_adasyn.joblib',
)

In [None]:
# Reload the persisted ADASYN model and report its hold-out metrics.
rf_adasyn = load(filename='Models/rf_adasyn.joblib')
evaluate_model_performance(rf_adasyn, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve of the ADASYN model on the hold-out set.
PRCurve(
    model=rf_adasyn,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Random Forest ADASYN",
)

**BORDERLINE SMOTE 2 APPROACHES**

BORDERLINE SMOTE 1

In [None]:
# BorderlineSMOTE (default kind) oversampling of the training split, followed
# by the grid search on the oversampled data.
# n_jobs is deliberately not passed to BorderlineSMOTE: imbalanced-learn
# deprecated that argument in 0.10 and newer releases no longer accept it. It
# only controlled parallelism, so the resampled data is unchanged.
# NOTE(review): resampling happens before cross-validation, so synthetic
# samples end up in the validation folds.
print(f"Features shape: {X_train.shape}")

borderline_smote = BorderlineSMOTE(random_state=42)

rf = RandomForestClassifier(random_state=42)
X_resampled_bs, Y_resampled_bs = borderline_smote.fit_resample(X_train, y_train)
print(f"Features shape: {X_resampled_bs.shape}")

grid_search_bs = GridSearchCV(rf, param_grid, cv=cv, scoring='f1', verbose=1, n_jobs=-1)
grid_search_bs.fit(X_resampled_bs, Y_resampled_bs)

# Show the winning hyperparameters.
grid_search_bs.best_params_

In [None]:
# Persist the refit winner of the BorderlineSMOTE-1 search.
dump(
    value=grid_search_bs.best_estimator_,
    filename='Models/rf_borderlinesmote1.joblib',
)

In [None]:
# Reload the persisted BorderlineSMOTE-1 model and report its hold-out metrics.
rf_Bsmote = load(filename='Models/rf_borderlinesmote1.joblib')
evaluate_model_performance(rf_Bsmote, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve of the BorderlineSMOTE-1 model on the hold-out set.
PRCurve(
    model=rf_Bsmote,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Random Forest BorderlineSMOTE(1)",
)

BORDERLINE SMOTE 2

In [None]:
# BorderlineSMOTE (kind='borderline-2') oversampling of the training split,
# followed by the grid search on the oversampled data.
# n_jobs is deliberately not passed to BorderlineSMOTE: imbalanced-learn
# deprecated that argument in 0.10 and newer releases no longer accept it. It
# only controlled parallelism, so the resampled data is unchanged.
# NOTE(review): resampling happens before cross-validation, so synthetic
# samples end up in the validation folds.
print(f"Features shape: {X_train.shape}")

borderline_smote2 = BorderlineSMOTE(random_state=42, kind='borderline-2')

rf = RandomForestClassifier(random_state=42)

X_resampled_bs2, Y_resampled_bs2 = borderline_smote2.fit_resample(X_train, y_train)
print(f"Features shape: {X_resampled_bs2.shape}")

grid_search_bs2 = GridSearchCV(rf, param_grid, cv=cv, scoring='f1', verbose=1, n_jobs=-1)
grid_search_bs2.fit(X_resampled_bs2, Y_resampled_bs2)

# Show the winning hyperparameters.
grid_search_bs2.best_params_

In [None]:
# Keep the refit winner of the BorderlineSMOTE-2 search and persist it to disk.
rf_borderlinesmote2 = grid_search_bs2.best_estimator_
dump(value=rf_borderlinesmote2, filename='Models/rf_borderlinesmote2.joblib')

In [None]:
# Reload the persisted BorderlineSMOTE-2 model and report its hold-out metrics.
rf_borderlinesmote2 = load(filename='Models/rf_borderlinesmote2.joblib')
evaluate_model_performance(rf_borderlinesmote2, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve of the BorderlineSMOTE-2 model on the hold-out set.
PRCurve(
    model=rf_borderlinesmote2,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Random Forest BorderlineSMOTE(2)",
)

**HYBRID APPROACHES :**

SMOTETomek

In [None]:
# Load previously pickled resampled data.
# NOTE(review): X_loaded / Y_loaded from this file are not used by the
# SMOTETomek cell that follows (it resamples X_train again), and both names are
# overwritten by the SMOTEENN load further down - confirm whether the cached
# data or the fresh resampling is intended.
# NOTE(review): pickle.load can execute arbitrary code - only load files you
# created yourself.
with open('resampled_data.pkl', 'rb') as pkl_file:
    X_loaded, Y_loaded = pickle.load(pkl_file)

In [None]:

# SMOTETomek (hybrid over/under-sampling) of the training split, followed by
# the grid search on the resampled data.
# NOTE(review): the data is resampled before cross-validation, so the
# validation folds are resampled as well.
rf = RandomForestClassifier(random_state=42)
smote_tomek = SMOTETomek(random_state=42, n_jobs=-1)

X_resampled_st, Y_resampled_st = smote_tomek.fit_resample(X_train, y_train)
print(f"Features shape: {X_resampled_st.shape}")

grid_search_st = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search_st.fit(X_resampled_st, Y_resampled_st)

# Show the winning hyperparameters.
grid_search_st.best_params_

In [None]:
# Keep the refit winner of the SMOTETomek search and persist it to disk.
rf_smoteTomek = grid_search_st.best_estimator_
dump(value=rf_smoteTomek, filename='Models/rf_smoteTtomek.joblib')

In [None]:
# Reload the persisted SMOTETomek model and report its hold-out metrics.
# The model is bound to rf_smoteTomek - the name the PR-curve cell below
# uses - so that cell also works when the training cells were skipped and
# only the saved model is available.
rf_smoteTomek = load('Models/rf_smoteTtomek.joblib')
evaluate_model_performance(rf_smoteTomek, X_test, y_test)

In [None]:
# Precision-recall curve of the SMOTETomek model on the hold-out set.
PRCurve(
    model=rf_smoteTomek,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Random Forest SMOTETomek",
)

In [None]:
# Score the refit winner of the SMOTETomek search straight from the search
# object. best_estimator_ (a fitted model) is needed here: best_params_ is a
# plain dict of hyperparameters and has no predict() method.
evaluate_model_performance(grid_search_st.best_estimator_, X_test, y_test)

SMOTEENN:

In [None]:
# Load the pickled resampled data used by the SMOTEENN grid search below
# (presumably SMOTEENN output of the training split, judging by the file
# name - verify against the notebook that produced it).
# NOTE(review): pickle.load can execute arbitrary code - only load files you
# created yourself.
with open('resampled_dataSmoteE.pkl', 'rb') as pkl_file:
    X_loaded, Y_loaded = pickle.load(pkl_file)

In [None]:

# Grid search on the resampled data loaded from 'resampled_dataSmoteE.pkl'.
# NOTE(review): the data was resampled before cross-validation, so the
# validation folds are resampled as well.
rf = RandomForestClassifier(random_state=42)
grid_search_se = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search_se.fit(X_loaded, Y_loaded)

# Show the winning hyperparameters.
grid_search_se.best_params_

In [None]:
# Shapes recorded for the SMOTEENN experiment (kept as a note, not computed here):
# Features shape: (227845, 30)
#
# Features shape after SMOTEENN: (454505, 30)

In [None]:
# Keep the refit winner of the SMOTEENN search and persist it to disk.
rf_se = grid_search_se.best_estimator_
dump(value=rf_se, filename='Models/rf_smoteE.joblib')

In [None]:
# Reload the persisted SMOTEENN model and report its hold-out metrics.
rf_se = load(filename='Models/rf_smoteE.joblib')
evaluate_model_performance(rf_se, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve of the SMOTEENN model on the hold-out set.
PRCurve(
    model=rf_se,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Random Forest SMOTEENN",
)

**COST SENSITIVE LEARNING**

In [None]:

# Cost-sensitive learning: no resampling; the grid additionally searches over
# class_weight candidates on the original training split.
rf = RandomForestClassifier(random_state=42)
grid_search_cost = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_cost,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
)
grid_search_cost.fit(X_train, y_train)

# Show the winning hyperparameters.
grid_search_cost.best_params_

In [None]:
# Keep the refit winner of the cost-sensitive search and persist it to disk.
rf_cost = grid_search_cost.best_estimator_
dump(value=rf_cost, filename='Models/rf_costsensitive.joblib')

In [None]:
# Reload the persisted cost-sensitive model from disk.
rf_cost = load(filename='Models/rf_costsensitive.joblib')

In [None]:
# Hold-out metrics of the cost-sensitive model.
evaluate_model_performance(rf_cost, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve of the cost-sensitive model on the hold-out set.
PRCurve(
    model=rf_cost,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Random Forest Cost Sensitive Learning ",
)