In [23]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

from imblearn.over_sampling import RandomOverSampler
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

In [25]:
# read data
df = pd.read_csv('/content/Creditcard_data.csv')

print("Original data shape:", df.shape)
print(df['Class'].value_counts())

# seperate x and y
X = df.drop('Class', axis=1)
y = df['Class']

# oversample minority class
ros = RandomOverSampler(random_state=42)
X_bal, y_bal = ros.fit_resample(X, y)

# create df
bal_df = pd.DataFrame(X_bal)
bal_df.columns = X.columns
bal_df['Class'] = y_bal

print("\nAfter balancing:", bal_df.shape)
print(bal_df['Class'].value_counts())

# calc sample size
total = len(bal_df)
n = int(total * 0.1)
print(f"\nSample size: {n}")
test_sz = 0.3

# random sampling
def random_sample(df, size):
    samp = df.sample(n=size, random_state=42)
    return samp

# bootstrap
def bootstrap_sample(df, size):
    boot = df.sample(n=size, replace=True, random_state=123)
    return boot

# systematic
def systematic_sample(dataframe, sample_size):
    k = len(dataframe) // sample_size
    idx = []

    # tried using np.arange but this worked better
    for i in range(0, len(dataframe), k):
        idx.append(i)

    if len(idx) > sample_size:
        idx = idx[:sample_size]

    result = dataframe.iloc[idx]
    result = result.reset_index(drop=True)
    return result

# stratified
def stratified_sample(df, size):
    frac = size / len(df)

    # keeps class ratio
    strat = df.groupby('Class', group_keys=False).apply(
        lambda grp: grp.sample(frac=frac, random_state=99)
    )

    strat = strat.reset_index(drop=True)
    return strat

# cluster based
def cluster_sample(data, n_samples):
    np.random.seed(55)

    n_clusters = 10
    per_clust = n_samples // n_clusters

    # shuffle data
    shuf = data.sample(frac=1, random_state=77)
    shuf = shuf.reset_index(drop=True)

    clust_len = len(shuf) // n_clusters

    all_samp = []

    # sample from each cluster
    for i in range(n_clusters):
        start = i * clust_len
        end = start + clust_len

        clust_data = shuf.iloc[start:end]

        n_take = per_clust
        if len(clust_data) < n_take:
            n_take = len(clust_data)

        samp = clust_data.sample(n=n_take, random_state=88)

        all_samp.append(samp)
        # print(f"cluster {i}: {len(samp)}")

    # combine all
    combined = pd.concat(all_samp, ignore_index=True)
    return combined

# create samples
s1 = random_sample(bal_df, n)

s2 = systematic_sample(bal_df, n)

s3 = stratified_sample(bal_df, n)

s4 = cluster_sample(bal_df, n)

s5 = bootstrap_sample(bal_df, n)


samps = [s1, s2, s3, s4, s5]
samp_names = ['Sampling1', 'Sampling2', 'Sampling3', 'Sampling4', 'Sampling5']
# print(f"created {len(samps)} samples")

# models
lr = LogisticRegression(max_iter=1000, random_state=42)

dt = DecisionTreeClassifier(random_state=123)

rf = RandomForestClassifier(n_estimators=100, random_state=456)

sv = SVC(kernel='linear', random_state=789)

knn = KNeighborsClassifier(n_neighbors=5)


models = [lr, dt, rf, sv, knn]
mod_names = ['M1', 'M2', 'M3', 'M4', 'M5']

# store results
results = []

# train on each sample
for s in samps:

    # seperate features and target
    X_s = s.drop('Class', axis=1)
    y_s = s['Class']

    # split
    X_tr, X_te, y_tr, y_te = train_test_split(X_s, y_s, test_size=test_sz, random_state=42)

    samp_res = []

    # train each model
    for m in models:

        # train
        m.fit(X_tr, y_tr)

        # predict
        preds = m.predict(X_te)

        # get accuracy
        acc = accuracy_score(y_te, preds)

        # convert to percent
        acc = acc * 100

        samp_res.append(acc)
        # print(f"acc: {acc}")

    results.append(samp_res)

# reorganize results
res_dict = {}

for i in range(len(mod_names)):

    name = mod_names[i]

    accs = []

    for r in results:

        accs.append(r[i])

    res_dict[name] = accs

# make df
df_res = pd.DataFrame(res_dict)
df_res.index = samp_names

# transpose
df_res = df_res.T

print("\n\nAccuracy Results:")
print(df_res.round(2))

print("\nBest sampling for each model:")

for m in df_res.index:

    best = df_res.loc[m].idxmax()

    best_val = df_res.loc[m].max()

    print(f"{m}: {best} ({best_val:.2f}%)")

# save
df_res.to_csv('results.csv')

print("\nSaved to results.csv")


Original data shape: (772, 31)
Class
0    763
1      9
Name: count, dtype: int64

After balancing: (1526, 31)
Class
0    763
1    763
Name: count, dtype: int64

Sample size: 152


Accuracy Results:
    Sampling1  Sampling2  Sampling3  Sampling4  Sampling5
M1      86.96      80.43      91.30      86.67      93.48
M2      93.48     100.00      93.48      91.11      86.96
M3      97.83     100.00     100.00      93.33      95.65
M4      86.96      80.43      91.30      91.11      95.65
M5      73.91      82.61      78.26      73.33      97.83

Best sampling for each model:
M1: Sampling5 (93.48%)
M2: Sampling2 (100.00%)
M3: Sampling2 (100.00%)
M4: Sampling5 (95.65%)
M5: Sampling5 (97.83%)

Saved to results.csv


In [None]:
#These results show that different sampling techniques for different models give different accuracies.
#For each model best performing sampling technique ko upar identify kiya gaya hai, which provides us with the maximum accuracy.