In [None]:
import os
import pickle
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import TomekLinks
from joblib import dump
from joblib import load
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report,f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler


DATA PREPARATION

In [None]:
df = pd.read_csv('creditcard.csv')

# Fit the scalers on the training rows only, so that no statistics from the
# held-out test rows leak into preprocessing. The positions below are drawn
# with the same arguments (test_size=0.2, random_state=42, stratified on
# 'Class') as the train_test_split call in the split cell, so they select the
# same training rows. Keep the two calls in sync.
train_positions, _ = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    random_state=42,
    stratify=df['Class'],
)

# One scaler per column: a single shared instance only remembers the
# statistics of the column it was fitted on last.
amount_scaler = StandardScaler().fit(df[['Amount']].iloc[train_positions])
scaler = StandardScaler().fit(df[['Time']].iloc[train_positions])  # 'Time' scaler; original name kept

df['Amount'] = amount_scaler.transform(df[['Amount']])
df['Time'] = scaler.transform(df[['Time']])

# Feature matrix (every column except the label) and label vector.
X = df.drop('Class', axis=1)
y = df['Class']

TRAIN/TEST SPLIT (80% / 20%), HYPERPARAMETER GRIDS FOR LOGISTIC REGRESSION, STRATIFIED K-FOLD FOR VALIDATION

In [None]:
# Stratified 80/20 hold-out split of the prepared features and labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Base search space shared by every resampling experiment.
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
}

# Cost-sensitive search space: the base grid plus candidate class weights
# (same candidates, in the same order, as before).
param_grid_cost = {
    **{name: list(candidates) for name, candidates in param_grid.items()},
    'class_weight': [
        "balanced",
        *[{0: 1, 1: positive_weight} for positive_weight in (1.5, 1, 2, 10, 3, 4, 5, 15, 20)],
    ],
}

param_grid_empty = dict()

# Shuffled, seeded 5-fold stratified splitter used by all grid searches.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


FUNCTION TO EVALUATE PERFORMANCE 

In [None]:
def evaluate_model_performance(logistic, X_test, y_test):
    """Print classification metrics for a fitted model on the given data.

    Parameters:
        logistic: fitted estimator exposing ``predict``.
        X_test: features passed to ``logistic.predict``.
        y_test: true labels compared against the predictions.

    Prints precision, recall, F1-score, the confusion matrix, the
    classification report and the geometric mean. Returns nothing.
    """
    predicted = logistic.predict(X_test)

    # Compute every metric first, then print them in a fixed order.
    prec = precision_score(y_test, predicted)
    rec = recall_score(y_test, predicted)
    f1_value = f1_score(y_test, predicted)
    matrix = confusion_matrix(y_test, predicted)
    report_text = classification_report(y_test, predicted)
    g_mean = geometric_mean_score(y_test, predicted)

    print(f"Precision: {prec}")
    print(f"Recall: {rec}")
    print(f"f1-score: {f1_value}")
    print(f"Confusion Matrix:\n{matrix}")
    print("\n")
    print(f"Classification Report:\n{report_text}")
    print(f"Geometric Mean: {g_mean}")

FUNCTION FOR THE PRECISION-RECALL (PR) CURVE:

In [None]:
def PRCurve(model,X_test,y_test,title):
    """Print the area under the precision-recall curve and plot the curve.

    Parameters:
        model: fitted estimator exposing ``predict_proba``; column 1 of its
            output is used as the score.
        X_test: features passed to ``model.predict_proba``.
        y_test: true labels.
        title: title placed on the figure.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    curve_precision, curve_recall, _ = precision_recall_curve(y_test, positive_scores)

    # Area under the curve, with recall on the x-axis.
    area = auc(curve_recall, curve_precision)
    print(f"Area Under the Precision-Recall Curve (AUPRC): {area}")

    _, ax = plt.subplots()
    ax.plot(curve_recall, curve_precision, label='Precision-Recall curve', color='red')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title(title)
    plt.show()

DUMMY CLASSIFIER BASELINE

In [None]:
# Baseline that predicts classes uniformly at random (seeded for repeatability).
dummy_clf = DummyClassifier(
    strategy="uniform",
    random_state=42,
)
dummy_clf.fit(X_train, y_train)

In [None]:
# Test-set metrics for the random baseline.
evaluate_model_performance(logistic=dummy_clf, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve for the random baseline.
PRCurve(
    model=dummy_clf,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve DummyClassifier Baseline",
)

In [None]:
# Grid search over the base grid on the untouched training data, scored by F1.
log = LogisticRegression(random_state=42)

grid_search = GridSearchCV(
    estimator=log,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

grid_search.best_params_

In [None]:
logistic_baseline = grid_search.best_estimator_

# Create the output directory up front: on a fresh checkout it may not exist,
# and dump() would then fail when opening the target file. Every later cell
# that saves a model writes into this same directory.
os.makedirs('Models', exist_ok=True)

dump(logistic_baseline, 'Models/lr_Baseline.joblib')

In [None]:
# Reload the saved baseline model and report its test-set metrics.
logistic_baseline = load(filename='Models/lr_Baseline.joblib')
evaluate_model_performance(logistic=logistic_baseline, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve for the tuned baseline model.
PRCurve(
    model=logistic_baseline,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Hyperparameter Tuned Logistic Regression",
)

**UNDERSAMPLING TECHNIQUES**

Tomek Links:

In [None]:
# Undersample the training set with Tomek Links and report the size change.
samples_before = X_train.shape[0]
print(f'Before Tomek Links undersampling: {samples_before} samples')

tl = TomekLinks(n_jobs=-1)
X_res, y_res = tl.fit_resample(X_train, y_train)

samples_after = X_res.shape[0]
print(f'After Tomek Links undersampling: {samples_after} samples')

In [None]:
# Grid search on the Tomek-Links-resampled training data.
log = LogisticRegression(random_state=42)
grid_search_tl = GridSearchCV(
    estimator=log,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search_tl.fit(X_res, y_res)

grid_search_tl.best_params_

In [None]:
# Save the Tomek-Links-resampled training data to disk.
with open('resampled_Tomek.pkl', 'wb') as tomek_cache:
    pickle.dump((X_res, y_res), tomek_cache)

In [None]:
# Save the best estimator found on the Tomek-Links data.
Logistic_TomekLinks = grid_search_tl.best_estimator_
dump(value=Logistic_TomekLinks, filename='Models/lr_TomekLink.joblib')

In [None]:
# Reload the Tomek-Links model and report its test-set metrics.
logistic_Tomek = load(filename='Models/lr_TomekLink.joblib')
evaluate_model_performance(logistic=logistic_Tomek, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve for the Tomek-Links model.
PRCurve(
    model=logistic_Tomek,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Logistic Regression Tomek LinK",
)

**NearMiss, three versions (NearMiss 1 first):**

In [None]:
# NearMiss version 1 undersampling followed by a grid search.
nearMiss1 = NearMiss(version=1,n_jobs=-1,n_neighbors=5)
# Define the estimator here, as the sibling experiment cells do, instead of
# relying on a `log` variable left over from an earlier cell.
log = LogisticRegression(random_state=42)

X_resampled_nm1, Y_resampled_nm1 = nearMiss1.fit_resample(X_train, y_train)

grid_search_nm1 = GridSearchCV(log, param_grid, cv=cv, scoring='f1', verbose=1,n_jobs=-1)
grid_search_nm1.fit(X_resampled_nm1,Y_resampled_nm1)

# Class counts before and after resampling.
print(f"Before resampling: {Counter(y_train)}")
print(f"After resampling with NearMiss-1: {Counter(Y_resampled_nm1)}")

grid_search_nm1.best_params_

In [None]:
# Save the best NearMiss-1 estimator.
logistic_NearMiss1 = grid_search_nm1.best_estimator_
dump(value=logistic_NearMiss1, filename='Models/lr_NearMiss1.joblib')

In [None]:
# Reload the NearMiss-1 model and report its test-set metrics.
logistic_NearMiss1 = load(filename='Models/lr_NearMiss1.joblib')
evaluate_model_performance(logistic=logistic_NearMiss1, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve for the NearMiss-1 model.
PRCurve(
    model=logistic_NearMiss1,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Logistic Regression NearMiss(1)",
)

Near Miss 2:

In [None]:
# NearMiss version 2 undersampling followed by a grid search.
log = LogisticRegression(random_state=42)
nearMiss2 = NearMiss(version=2, n_neighbors=5, n_jobs=-1)

X_resampled_nm2, Y_resampled_nm2 = nearMiss2.fit_resample(X_train, y_train)

grid_search_nm2 = GridSearchCV(
    estimator=log,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search_nm2.fit(X_resampled_nm2, Y_resampled_nm2)

grid_search_nm2.best_params_

In [None]:
# Save the best NearMiss-2 estimator.
logistic_NearMiss2 = grid_search_nm2.best_estimator_
dump(value=logistic_NearMiss2, filename='Models/lr_NearMiss2.joblib')

In [None]:
# Reload the NearMiss-2 model and report its test-set metrics.
logistic_NearMiss2 = load(filename='Models/lr_NearMiss2.joblib')
evaluate_model_performance(logistic=logistic_NearMiss2, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve for the NearMiss-2 model.
PRCurve(
    model=logistic_NearMiss2,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Logistic Regression NearMiss(2)",
)

Near Miss 3 :

In [None]:
# NearMiss version 3 undersampling followed by a grid search.
log = LogisticRegression(random_state=42)
nearMiss3 = NearMiss(version=3, n_neighbors=5, n_jobs=-1)

X_resampled_nm3, Y_resampled_nm3 = nearMiss3.fit_resample(X_train, y_train)

grid_search_nm3 = GridSearchCV(
    estimator=log,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search_nm3.fit(X_resampled_nm3, Y_resampled_nm3)

grid_search_nm3.best_params_

In [None]:
# Save the best NearMiss-3 estimator.
logistic_NearMiss3 = grid_search_nm3.best_estimator_
dump(value=logistic_NearMiss3, filename='Models/lr_NearMiss3.joblib')

In [None]:
# Reload the NearMiss-3 model and report its test-set metrics.
logistic_NearMiss3 = load(filename='Models/lr_NearMiss3.joblib')
evaluate_model_performance(logistic=logistic_NearMiss3, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve for the NearMiss-3 model.
PRCurve(
    model=logistic_NearMiss3,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Logistic Regression NearMiss(3)",
)

**OVERSAMPLING TECHNIQUES:**

SMOTE:

In [None]:
# SMOTE oversampling of the training set followed by a grid search.
print(f"Features shape: {X_train.shape}")
# n_jobs is intentionally not passed: imbalanced-learn deprecated it for SMOTE
# in version 0.10, and newer releases no longer accept it. It only affected
# nearest-neighbour parallelism, not the generated samples.
smote = SMOTE(random_state=42)

log = LogisticRegression(random_state=42)

X_resampled_smote, Y_resampled_smote = smote.fit_resample(X_train, y_train)
print(f"Features shape: {X_resampled_smote.shape}")

grid_search_smote = GridSearchCV(log, param_grid, cv=cv, scoring='f1', verbose=1,n_jobs=-1)
grid_search_smote.fit(X_resampled_smote,Y_resampled_smote)
grid_search_smote.best_params_

In [None]:
# Save the best SMOTE estimator.
logistic_smote = grid_search_smote.best_estimator_
dump(value=logistic_smote, filename='Models/lr_Smote.joblib')

In [None]:
# Shapes of the training data before and after SMOTE.
print(f"Features shape: {X_train.shape}", f"Target shape: {y_train.shape}", sep="\n")
print("\nAfter SMOTE:")

print(
    f"Features shape: {X_resampled_smote.shape}",
    f"Target shape: {Y_resampled_smote.shape}",
    sep="\n",
)

In [None]:
# Reload the SMOTE model and report its test-set metrics.
logistic_SMOTE = load(filename='Models/lr_Smote.joblib')
evaluate_model_performance(logistic=logistic_SMOTE, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve for the SMOTE model.
PRCurve(
    model=logistic_SMOTE,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Logistic Regression SMOTE",
)

ADASYN:

In [None]:
# ADASYN oversampling of the training set followed by a grid search.
print(f"Features shape: {X_train.shape}")
# n_jobs is intentionally not passed: imbalanced-learn deprecated it for ADASYN
# in version 0.10, and newer releases no longer accept it.
adasyn = ADASYN(random_state=42)
log = LogisticRegression(random_state=42)

X_resampled_ad, Y_resampled_ad = adasyn.fit_resample(X_train, y_train)
print(f"Features shape: {X_resampled_ad.shape}")

grid_search_ad = GridSearchCV(log, param_grid, cv=cv, scoring='f1', verbose=1,n_jobs=-1)
grid_search_ad.fit(X_resampled_ad,Y_resampled_ad)
grid_search_ad.best_params_

In [None]:
# Save the best ADASYN estimator.
dump(value=grid_search_ad.best_estimator_, filename='Models/lr_adasyn.joblib')

In [None]:
# Reload the ADASYN model and report its test-set metrics.
logistic_adasyn = load(filename='Models/lr_adasyn.joblib')
evaluate_model_performance(logistic=logistic_adasyn, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve for the ADASYN model.
PRCurve(
    model=logistic_adasyn,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Logistic Regression ADASYN",
)

THIS IS AN EXAMPLE OF HOW YOU CAN USE THE PR CURVE TO ANALYSE YOUR MODEL. In this example, choosing the decision threshold on the precision-recall curve that maximises the F1-score gives a well-balanced model. In this case, however, we have to take into account that recall is much more important, since false negatives (FN) are much more costly. I will discuss this in my thesis.

In [None]:
# Positive-class probabilities of the ADASYN model on the test set.
y_scores = logistic_adasyn.predict_proba(X_test)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_test, y_scores)

# F1 for every threshold, computed directly from the curve instead of calling
# f1_score once per threshold (which is quadratic in the number of samples).
# precision_recall_curve returns one more (precision, recall) pair than
# thresholds; dropping that final pair aligns entry i with the prediction
# rule `y_scores >= thresholds[i]`.
curve_precision = precisions[:-1]
curve_recall = recalls[:-1]
pr_sum = curve_precision + curve_recall
f1_scores = np.divide(
    2 * curve_precision * curve_recall,
    pr_sum,
    out=np.zeros_like(pr_sum),  # F1 is defined as 0 where precision + recall == 0
    where=pr_sum > 0,
)

best_index = np.argmax(f1_scores)

best_threshold = thresholds[best_index]

# Use >= so the predictions match the rule the curve was computed with.
# NOTE: the threshold is selected on the test set, so the metrics reported at
# this threshold are optimistic; use a validation split for an unbiased figure.
predictions = y_scores >= best_threshold


In [None]:
# Metrics for the predictions made at the selected threshold.
report = classification_report(
    y_test,
    predictions,
    target_names=['Negative', 'Positive'],
)
conf_matrix = confusion_matrix(y_test, predictions)
mcc = matthews_corrcoef(y_test, predictions)

print('Best Threshold:', best_threshold)
print(report)
print(conf_matrix)
print(mcc)

**BORDERLINE SMOTE: TWO VARIANTS**

BORDERLINE SMOTE 1

In [None]:
# BorderlineSMOTE (default kind) oversampling followed by a grid search.
print(f"Features shape: {X_train.shape}")

# n_jobs is intentionally not passed: imbalanced-learn deprecated it for
# BorderlineSMOTE in version 0.10, and newer releases no longer accept it.
borderline_smote = BorderlineSMOTE(random_state=42)

log = LogisticRegression(random_state=42)
X_resampled_bs, Y_resampled_bs = borderline_smote.fit_resample(X_train, y_train)
print(f"Features shape: {X_resampled_bs.shape}")

grid_search_bs = GridSearchCV(log, param_grid, cv=cv, scoring='f1', verbose=1, n_jobs=-1)
grid_search_bs.fit(X_resampled_bs, Y_resampled_bs)
grid_search_bs.best_params_

In [None]:
# Save the best BorderlineSMOTE-1 estimator.
dump(value=grid_search_bs.best_estimator_, filename='Models/lr_borderlinesmote1.joblib')

In [None]:
# Reload the BorderlineSMOTE-1 model and report its test-set metrics.
logistic_Bsmote = load(filename='Models/lr_borderlinesmote1.joblib')
evaluate_model_performance(logistic=logistic_Bsmote, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve for the BorderlineSMOTE-1 model.
PRCurve(
    model=logistic_Bsmote,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Logistic Regression BorderlineSMOTE(1)",
)

BORDERLINE SMOTE 2

In [None]:
# BorderlineSMOTE (kind='borderline-2') oversampling followed by a grid search.
print(f"Features shape: {X_train.shape}")

# n_jobs is intentionally not passed: imbalanced-learn deprecated it for
# BorderlineSMOTE in version 0.10, and newer releases no longer accept it.
borderline_smote2 = BorderlineSMOTE(random_state=42, kind='borderline-2')

log = LogisticRegression(random_state=42)

X_resampled_bs2, Y_resampled_bs2 = borderline_smote2.fit_resample(X_train, y_train)
print(f"Features shape: {X_resampled_bs2.shape}")

grid_search_bs2 = GridSearchCV(log, param_grid, cv=cv, scoring='f1', verbose=1, n_jobs=-1)
grid_search_bs2.fit(X_resampled_bs2, Y_resampled_bs2)
grid_search_bs2.best_params_

In [None]:
# Save the best BorderlineSMOTE-2 estimator.
dump(value=grid_search_bs2.best_estimator_, filename='Models/lr_borderlinesmote2.joblib')

In [None]:
# Reload the BorderlineSMOTE-2 model and report its test-set metrics.
logistic_Bsmote2 = load(filename='Models/lr_borderlinesmote2.joblib')
evaluate_model_performance(logistic=logistic_Bsmote2, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve for the BorderlineSMOTE-2 model.
PRCurve(
    model=logistic_Bsmote2,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Logistic Regression BorderlineSMOTE(2)",
)

**HYBRID APPROACHES :**

SMOTETomek

In [None]:
# SMOTETomek (combined over- and undersampling) followed by a grid search.
print(f"Features shape: {X_train.shape}")

log = LogisticRegression(random_state=42)
smote_tomek = SMOTETomek(random_state=42, n_jobs=-1)

X_resampled_st, Y_resampled_st = smote_tomek.fit_resample(X_train, y_train)
print(f"Features shape: {X_resampled_st.shape}")

grid_search_st = GridSearchCV(
    estimator=log,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search_st.fit(X_resampled_st, Y_resampled_st)

grid_search_st.best_params_

Store the resampled data, since it takes a long time to compute (about one hour).

In [None]:
# Save the SMOTETomek-resampled training data to disk.
with open('resampled_data.pkl', 'wb') as cache_file:
    pickle.dump((X_resampled_st, Y_resampled_st), cache_file)

# To read the data back:
# with open('resampled_data.pkl', 'rb') as cache_file:
#     X_loaded, Y_loaded = pickle.load(cache_file)

In [None]:
# Save the best SMOTETomek estimator.
dump(value=grid_search_st.best_estimator_, filename='Models/lr_smotetomek.joblib')

In [None]:
# Reload the SMOTETomek model and report its test-set metrics.
logistic_smoketomek = load(filename='Models/lr_smotetomek.joblib')
evaluate_model_performance(logistic=logistic_smoketomek, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve for the SMOTETomek model.
PRCurve(
    model=logistic_smoketomek,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Logistic Regression SMOTETomek",
)

In [None]:
# Read the pickled resampled data back from disk.
# Only unpickle files you created yourself: pickle can execute code on load.
with open('resampled_data.pkl', 'rb') as cache_file:
    X_loaded, Y_loaded = pickle.load(cache_file)

SMOTEENN:

In [None]:
# SMOTEENN (combined over- and undersampling) followed by a grid search.
print(f"Features shape: {X_train.shape}")

log = LogisticRegression(random_state=42)
smote_enn = SMOTEENN(random_state=42, n_jobs=-1)

X_resampled_se, Y_resampled_se = smote_enn.fit_resample(X_train, y_train)
print(f"Features shape after SMOTEENN: {X_resampled_se.shape}")

grid_search_se = GridSearchCV(
    estimator=log,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search_se.fit(X_resampled_se, Y_resampled_se)

grid_search_se.best_params_

In [None]:
# Save the SMOTEENN-resampled training data to disk.
with open('resampled_dataSmoteE.pkl', 'wb') as cache_file:
    pickle.dump((X_resampled_se, Y_resampled_se), cache_file)

In [None]:
# Save the best SMOTEENN estimator.
dump(value=grid_search_se.best_estimator_, filename='Models/lr_smoteE.joblib')

In [None]:
# Reload the SMOTEENN model and report its test-set metrics.
linear_smotee = load(filename='Models/lr_smoteE.joblib')
evaluate_model_performance(logistic=linear_smotee, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve for the SMOTEENN model.
PRCurve(
    model=linear_smotee,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Logistic Regression SMOTEENN",
)

**COST SENSITIVE LEARNING**

In [None]:

# Cost-sensitive grid search: same training data, with class_weight added to
# the search space.
log = LogisticRegression(random_state=42)

grid_search_cost = GridSearchCV(
    estimator=log,
    param_grid=param_grid_cost,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
)
grid_search_cost.fit(X_train, y_train)

grid_search_cost.best_params_

In [None]:
# Save the best cost-sensitive estimator.
dump(value=grid_search_cost.best_estimator_, filename='Models/lr_costsensitive.joblib')

In [None]:
# Reload the saved cost-sensitive model.
linear_costSensitive = load(filename='Models/lr_costsensitive.joblib')

In [None]:
# Test-set metrics for the cost-sensitive model.
evaluate_model_performance(logistic=linear_costSensitive, X_test=X_test, y_test=y_test)

In [None]:
# Precision-recall curve for the cost-sensitive model.
PRCurve(
    model=linear_costSensitive,
    X_test=X_test,
    y_test=y_test,
    title="PR Curve Logistic Regression Cost Sensitive Learning ",
)

In [None]:
# Sanity check: reproduce SMOTETomek manually (SMOTE followed by Tomek Links).

In [None]:


# Manual two-step resampling of the training set, printing the class counts
# after each step.

# Step 1: Apply SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# Step 2: Apply Tomek Links
tl = TomekLinks()
X_tl, y_tl = tl.fit_resample(X_smote, y_smote)

# The "before" counts come from y_train, which is what was resampled.
# Counting the full dataset `y` here would not be comparable with the
# two lines that follow.
print(f"Original training set shape: {Counter(y_train)}")
print(f"After SMOTE: {Counter(y_smote)}")
print(f"After Tomek Links: {Counter(y_tl)}")