In [10]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import RandomOverSampler
df = pd.read_csv("Creditcard_data.csv")
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [7]:
print("Original Dataset Shape:", df.shape)
print(df['Class'].value_counts())

target_col = "Class"
X = df.drop('Class', axis=1)
Y = df['Class']
ros = RandomOverSampler(random_state=42)
X_sampled, Y_sampled = ros.fit_resample(X, Y)
balanced_df = pd.concat([X_sampled, Y_sampled], axis=1)

print(balanced_df['Class'].value_counts())

Original Dataset Shape: (772, 31)
Class
0    763
1      9
Name: count, dtype: int64
Class
0    763
1    763
Name: count, dtype: int64


In [3]:

models = {
    "M1_LogisticRegression": LogisticRegression(max_iter=1000),
    "M2_DecisionTree": DecisionTreeClassifier(),
    "M3_RandomForest": RandomForestClassifier(),
    "M4_SVM": SVC(),
    "M5_KNN": KNeighborsClassifier()
}

In [12]:

population_size = balanced_df.shape[0]
error_margin = 0.05
sample_size = int(population_size / (1 + population_size * error_margin**2))

# -----------------------------
# Sampling functions
# -----------------------------

def simple_random_sampling(df, n):
    return df.sample(n=n, random_state=10)


def bootstrap_sampling(df, n):
    return df.sample(n=n, replace=True, random_state=10)


def stratified_sampling(df, n):
    class0 = df[df['Class'] == 0].sample(n=n//2, random_state=10)
    class1 = df[df['Class'] == 1].sample(n=n//2, random_state=10)
    return pd.concat([class0, class1])


def systematic_sampling(df, n):
    step = df.shape[0] // n
    return df.iloc[::step].head(n)


def cluster_sampling(df, n):
    df_temp = df.copy()
    df_temp['cluster'] = pd.cut(df_temp['V1'], bins=4)
    selected_cluster = df_temp['cluster'].unique()[1]
    return df_temp[df_temp['cluster'] == selected_cluster].drop('cluster', axis=1).head(n)

samples = {
    "Simple Random": simple_random_sampling(balanced_df, sample_size),
    "Bootstrap": bootstrap_sampling(balanced_df, sample_size),
    "Stratified": stratified_sampling(balanced_df, sample_size),
    "Systematic": systematic_sampling(balanced_df, sample_size),
    "Cluster": cluster_sampling(balanced_df, sample_size)
}

In [13]:
final_results = {}

for name, df in samples.items():
    features = df.drop(columns=['Class'])
    target = df['Class']

    X_train, X_test, Y_train, Y_test = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=42,
        stratify=target
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model_scores = {}

    for algo, clf in models.items():
        if algo in ("SVM", "KNN", "Logistic Regression"):
            clf.fit(X_train_scaled, Y_train)
            Y_pred = clf.predict(X_test_scaled)
        else:
            clf.fit(X_train, Y_train)
            Y_pred = clf.predict(X_test)

        model_scores[algo] = round(accuracy_score(Y_test, Y_pred) * 100, 2)

    final_results[name] = model_scores

accuracy_table = pd.DataFrame(final_results)
print(accuracy_table)

best_models = accuracy_table.idxmax(axis=1)
print(best_models)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                       Simple Random  Bootstrap  Stratified  Systematic  \
M1_LogisticRegression          90.62      85.94       92.19       92.19   
M2_DecisionTree               100.00      93.75       95.31       98.44   
M3_RandomForest                98.44      95.31      100.00      100.00   
M4_SVM                         81.25      71.88       73.44       70.31   
M5_KNN                         95.31      89.06       98.44       89.06   

                       Cluster  
M1_LogisticRegression    98.44  
M2_DecisionTree          98.44  
M3_RandomForest          98.44  
M4_SVM                   98.44  
M5_KNN                   98.44  
M1_LogisticRegression          Cluster
M2_DecisionTree          Simple Random
M3_RandomForest             Stratified
M4_SVM                         Cluster
M5_KNN                      Stratified
dtype: object
