In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier


seed_value = 7
split_ratio = 0.30

dataset_link = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
df = pd.read_csv(dataset_link)

features = df.drop(columns=["Class"])
target = df["Class"]

sampling_dict = {
    "Sampling1": RandomOverSampler(random_state=seed_value),
    "Sampling2": RandomUnderSampler(random_state=seed_value),
    "Sampling3": SMOTE(random_state=seed_value),
    "Sampling4": SMOTEENN(random_state=seed_value),
    "Sampling5": "balanced_rf"
}

model_dict = {
    "M1": LogisticRegression(max_iter=2000, n_jobs=-1),
    "M2": DecisionTreeClassifier(random_state=seed_value),
    "M3": RandomForestClassifier(n_estimators=150, random_state=seed_value, n_jobs=-1),
    "M4": SVC(),
    "M5": GaussianNB()
}

performance_table = pd.DataFrame(index=model_dict.keys(), columns=sampling_dict.keys())

for samp_name, samp_method in sampling_dict.items():

    if samp_name == "Sampling5":
        X_sampled, y_sampled = features, target
    else:
        X_sampled, y_sampled = samp_method.fit_resample(features, target)

    X_train, X_test, y_train, y_test = train_test_split(
        X_sampled,
        y_sampled,
        test_size=split_ratio,
        random_state=seed_value,
        stratify=y_sampled
    )

    for model_name, base_model in model_dict.items():

        if samp_name == "Sampling5":
            classifier = BalancedRandomForestClassifier(
                n_estimators=150,
                random_state=seed_value,
                n_jobs=-1
            )
        else:
            classifier = base_model

        classifier.fit(X_train, y_train)
        predictions = classifier.predict(X_test)

        acc_value = accuracy_score(y_test, predictions) * 100
        performance_table.loc[model_name, samp_name] = acc_value


performance_table = performance_table.astype(float)

top_score = performance_table.values.max()
best_pair = performance_table.stack().idxmax()

print("\n================ PERFORMANCE MATRIX ================\n")
print(performance_table.round(2))

print("\n================ OPTIMAL RESULT ====================\n")
print("Best Model  :", best_pair[0])
print("Best Method :", best_pair[1])
print("Accuracy %  :", round(top_score, 2))



    Sampling1  Sampling2  Sampling3  Sampling4  Sampling5
M1      94.10      50.00      93.01      96.74      89.66
M2      99.13      66.67      97.38      97.92      89.66
M3     100.00     100.00      99.34      99.70      89.66
M4      71.18      33.33      66.81      78.04      89.66
M5      84.72      16.67      83.41      89.91      89.66


Best Model  : M3
Best Method : Sampling1
Accuracy %  : 100.0
