In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.utils import resample



In [3]:
data = pd.read_csv("/content/Creditcard_data (1).csv")

X = data.drop("Class", axis=1)
y = data["Class"]

print("Original dataset shape:", data.shape)
print("Class distribution:\n", y.value_counts())


Original dataset shape: (772, 31)
Class distribution:
 Class
0    763
1      9
Name: count, dtype: int64


In [4]:
data_majority = data[data.Class == 0]
data_minority = data[data.Class == 1]

data_majority_downsampled = resample(
    data_majority,
    replace=False,
    n_samples=len(data_minority),
    random_state=42
)

balanced_data = pd.concat([data_majority_downsampled, data_minority])

X_bal = balanced_data.drop("Class", axis=1)
y_bal = balanced_data["Class"]

print("\nBalanced dataset shape:", balanced_data.shape)
print("Balanced class distribution:\n", y_bal.value_counts())



Balanced dataset shape: (18, 31)
Balanced class distribution:
 Class
0    9
1    9
Name: count, dtype: int64


In [5]:
samples = []

for i in range(5):
    X_s, _, y_s, _ = train_test_split(
        X_bal, y_bal,
        train_size=0.6,
        random_state=42+i,
        stratify=y_bal
    )
    samples.append((X_s, y_s))


In [6]:
def simple_random_sampling(X, y):
    return resample(X, y, replace=False, n_samples=len(y), random_state=1)

def systematic_sampling(X, y):
    step = 2
    idx = np.arange(0, len(X), step)
    return X.iloc[idx], y.iloc[idx]

def stratified_sampling(X, y):
    return train_test_split(X, y, train_size=0.7, stratify=y, random_state=1)[:2]

def cluster_sampling(X, y):
    return X.iloc[::3], y.iloc[::3]

def bootstrap_sampling(X, y):
    return resample(X, y, replace=True, n_samples=len(y), random_state=1)


In [7]:
sampling_methods = [
    simple_random_sampling,
    systematic_sampling,
    stratified_sampling,
    cluster_sampling,
    bootstrap_sampling
]


In [8]:
models = {
    "M1_Logistic": LogisticRegression(max_iter=1000),
    "M2_DecisionTree": DecisionTreeClassifier(),
    "M3_RandomForest": RandomForestClassifier(),
    "M4_SVM": SVC(),
    "M5_GradientBoost": GradientBoostingClassifier()
}


In [13]:
results = pd.DataFrame(
    index=models.keys(),
    columns=["Sampling1", "Sampling2", "Sampling3", "Sampling4", "Sampling5"]
)


def stratified_sampling_corrected(X, y):
    X_train_sampled, _, y_train_sampled, _ = train_test_split(X, y, train_size=0.7, stratify=y, random_state=1)
    return X_train_sampled, y_train_sampled


local_sampling_methods = list(sampling_methods)
local_sampling_methods[2] = stratified_sampling_corrected

for i, (X_s, y_s) in enumerate(samples):
    for j, sampling_func in enumerate(local_sampling_methods):
        X_sampled, y_sampled = sampling_func(X_s, y_s)


        if y_sampled.nunique() < 2 or np.bincount(y_sampled).min() < 2:

            X_train, X_test, y_train, y_test = train_test_split(
                X_sampled, y_sampled,
                test_size=0.3,
                random_state=42
            )
        else:

            X_train, X_test, y_train, y_test = train_test_split(
                X_sampled, y_sampled,
                test_size=0.3,
                random_state=42,
                stratify=y_sampled
            )


        if y_train.nunique() < 2:
            for model_name in models.keys():
                results.loc[model_name, f"Sampling{j+1}"] = 0.0
            continue

        for model_name, model in models.items():
            model.fit(X_train, y_train)


            if len(y_test) > 0 and y_test.nunique() >= 2:
                preds = model.predict(X_test)
                acc = accuracy_score(y_test, preds) * 100
                results.loc[model_name, f"Sampling{j+1}"] = round(acc, 2)
            else:
                results.loc[model_name, f"Sampling{j+1}"] = 0.0


In [14]:
print("\nFinal Accuracy Table:\n")
print(results)



Final Accuracy Table:

                 Sampling1 Sampling2 Sampling3 Sampling4 Sampling5
M1_Logistic            0.0      50.0     100.0     100.0     66.67
M2_DecisionTree        0.0     100.0     66.67       0.0     66.67
M3_RandomForest        0.0      50.0     66.67     100.0     66.67
M4_SVM               33.33      50.0     100.0     100.0     66.67
M5_GradientBoost       0.0      50.0     66.67     100.0     100.0


In [15]:
best_sampling = results.idxmax(axis=1)
print("\nBest Sampling Technique for Each Model:\n")
print(best_sampling)



Best Sampling Technique for Each Model:

M1_Logistic         Sampling3
M2_DecisionTree     Sampling2
M3_RandomForest     Sampling4
M4_SVM              Sampling3
M5_GradientBoost    Sampling4
dtype: object
