In [2]:
import pandas as pd
import numpy as np
import math

data = pd.read_csv('Creditcard_data.csv')

class_0 = data[data['Class'] == 0]
class_1 = data[data['Class'] == 1]

class_1_over = class_1.sample(len(class_0), replace=True, random_state=42)
balanced_data = pd.concat([class_0, class_1_over], axis=0)

print(f"New Balanced Data Shape: {balanced_data.shape}")
print(f"Class Distribution:\n{balanced_data['Class'].value_counts()}")

Z = 1.96
p = 0.5
E = 0.05

n = math.ceil((Z**2 * p * (1 - p)) / (E**2))
print(f"\nCalculated Sample Size (n): {n}")

samples = {}

samples['Simple Random'] = balanced_data.sample(n=n, random_state=42)

k = int(len(balanced_data) / n)
indices = np.arange(0, len(balanced_data), k)[:n]
samples['Systematic'] = balanced_data.iloc[indices]

from sklearn.model_selection import train_test_split
stratified_sample, _ = train_test_split(
    balanced_data,
    train_size=n,
    stratify=balanced_data['Class'],
    random_state=42
)
samples['Stratified'] = stratified_sample

balanced_data_cluster = balanced_data.copy()
num_clusters = len(balanced_data) // 20
balanced_data_cluster['cluster_id'] = np.random.randint(0, num_clusters, size=len(balanced_data))

unique_clusters = balanced_data_cluster['cluster_id'].unique()
selected_clusters = np.random.choice(
    unique_clusters,
    size=int(len(unique_clusters) * 0.25),
    replace=False
)
cluster_sample = balanced_data_cluster[
    balanced_data_cluster['cluster_id'].isin(selected_clusters)
]
samples['Cluster'] = cluster_sample.drop(columns=['cluster_id'])

samples['Bootstrap'] = balanced_data.sample(n=n, replace=True, random_state=42)

print("\nSamples created successfully!")
for name, df in samples.items():
    print(f"{name} Sample Size: {len(df)}")


New Balanced Data Shape: (1526, 31)
Class Distribution:
Class
0    763
1    763
Name: count, dtype: int64

Calculated Sample Size (n): 385

Samples created successfully!
Simple Random Sample Size: 385
Systematic Sample Size: 385
Stratified Sample Size: 385
Cluster Sample Size: 388
Bootstrap Sample Size: 385


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

models = {
    'M1 (Logistic Regression)': LogisticRegression(random_state=42, max_iter=1000),
    'M2 (Decision Tree)': DecisionTreeClassifier(random_state=42),
    'M3 (Random Forest)': RandomForestClassifier(random_state=42),
    'M4 (SVM)': SVC(random_state=42),
    'M5 (KNN)': KNeighborsClassifier()
}

X = balanced_data.drop('Class', axis=1)
y = balanced_data['Class']
_, X_test, _, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

results = {}

print("Training models...\n")

for sample_name, sample_df in samples.items():
    print(f"Processing {sample_name}...")

    X_train = sample_df.drop('Class', axis=1)
    y_train = sample_df['Class']

    sample_scores = {}

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        sample_scores[model_name] = round(acc * 100, 2)

    results[sample_name] = sample_scores

results_df = pd.DataFrame(results)

print("       FINAL ACCURACY RESULTS (%)         ")
print(results_df)

results_df.to_csv("sampling_accuracy_table.csv")


Training models...

Processing Simple Random...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Processing Systematic...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Processing Stratified...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Processing Cluster...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Processing Bootstrap...
       FINAL ACCURACY RESULTS (%)         
                          Simple Random  Systematic  Stratified  Cluster  \
M1 (Logistic Regression)          92.41       86.91       89.27    91.36   
M2 (Decision Tree)               100.00       99.48       97.38    99.48   
M3 (Random Forest)               100.00      100.00       99.74    99.74   
M4 (SVM)                          65.18       63.87       66.49    66.23   
M5 (KNN)                          96.07       96.86       95.81    95.55   

                          Bootstrap  
M1 (Logistic Regression)      91.36  
M2 (Decision Tree)            95.55  
M3 (Random Forest)            99.21  
M4 (SVM)                      76.70  
M5 (KNN)                      95.55  


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

X = balanced_data.drop('Class', axis=1)
y = balanced_data['Class']

X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

train_data = pd.concat([X_train_full, y_train_full], axis=1)

print(f"Total Training Data: {len(train_data)}")
print(f"Total Test Data: {len(X_test)}")

sample_size = min(n, len(train_data))

samples = {}
samples['Simple Random'] = train_data.sample(n=sample_size, random_state=42)

k = int(len(train_data) / sample_size)
indices = np.arange(0, len(train_data), k)[:sample_size]
samples['Systematic'] = train_data.iloc[indices]

samples['Stratified'], _ = train_test_split(
    train_data,
    train_size=sample_size,
    stratify=train_data['Class'],
    random_state=42
)

train_data_cluster = train_data.copy()
num_clusters = 10
train_data_cluster['cluster_id'] = np.random.randint(
    0, num_clusters, size=len(train_data)
)
selected_clusters = np.random.choice(range(num_clusters), size=3, replace=False)
cluster_sample = train_data_cluster[
    train_data_cluster['cluster_id'].isin(selected_clusters)
]
samples['Cluster'] = cluster_sample.drop(columns=['cluster_id'])

samples['Bootstrap'] = train_data.sample(
    n=sample_size, replace=True, random_state=42
)

models = {
    'M1 (Logistic Regression)': LogisticRegression(random_state=42, max_iter=5000),
    'M2 (Decision Tree)': DecisionTreeClassifier(random_state=42),
    'M3 (Random Forest)': RandomForestClassifier(random_state=42),
    'M4 (SVM)': SVC(random_state=42),
    'M5 (KNN)': KNeighborsClassifier()
}

results = {}

print("Training on Full Dataset (Baseline)...")
full_scores = {}
for name, model in models.items():
    model.fit(X_train_full, y_train_full)
    acc = accuracy_score(y_test, model.predict(X_test))
    full_scores[name] = round(acc * 100, 2)
results['No Sampling (Full Data)'] = full_scores

for sample_name, sample_df in samples.items():
    print(f"Training on {sample_name}...")
    X_sample = sample_df.drop('Class', axis=1)
    y_sample = sample_df['Class']

    sample_scores = {}
    for name, model in models.items():
        model.fit(X_sample, y_sample)
        acc = accuracy_score(y_test, model.predict(X_test))
        sample_scores[name] = round(acc * 100, 2)
    results[sample_name] = sample_scores

results_df = pd.DataFrame(results)
print("\nAccuracy Comparison Table (%):")
print(results_df)

results_df.to_csv("sampling_results.csv")


Total Training Data: 1144
Total Test Data: 382
Training on Full Dataset (Baseline)...
Training on Simple Random...
Training on Systematic...
Training on Stratified...
Training on Cluster...
Training on Bootstrap...

Accuracy Comparison Table (%):
                          No Sampling (Full Data)  Simple Random  Systematic  \
M1 (Logistic Regression)                    91.36          92.15       89.79   
M2 (Decision Tree)                          99.48          98.69       99.48   
M3 (Random Forest)                         100.00          99.21       99.74   
M4 (SVM)                                    67.54          67.02       66.75   
M5 (KNN)                                    98.17          95.55       96.07   

                          Stratified  Cluster  Bootstrap  
M1 (Logistic Regression)       91.10    90.31      92.41  
M2 (Decision Tree)             98.17    97.12      96.86  
M3 (Random Forest)             99.74    99.21      99.21  
M4 (SVM)                       70.16