In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

In [5]:
# Load the raw dataset; the 'Class' column holds the binary target.
data = pd.read_csv('/content/Creditcard_data.csv')

# Partition the rows by label.
class_0 = data.loc[data['Class'] == 0]
class_1 = data.loc[data['Class'] == 1]

# Random oversampling: draw class-1 rows with replacement until there are
# as many of them as class-0 rows.
class_1_over = class_1.sample(n=len(class_0), replace=True, random_state=42)

# Stack both classes, then shuffle the rows and renumber the index from 0.
balanced_data = (
    pd.concat([class_0, class_1_over])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [6]:
# Sample size for estimating a proportion: n = z^2 * p * (1 - p) / e^2,
# rounded up to the next whole row.
#   z - z-score (1.96 is the usual value for 95% confidence)
#   p - assumed proportion
#   e - margin of error
z, p, e = 1.96, 0.5, 0.05
n = math.ceil(z**2 * p * (1 - p) / e**2)

# Registry of sampled frames, keyed by technique name.
# Sampling1: simple random sampling without replacement.
samples = {'Sampling1': balanced_data.sample(n=n, random_state=42)}

In [7]:
# Build the remaining samples (Sampling2..Sampling5) from the balanced frame.
# Each entry of `samples` is a DataFrame that still contains the 'Class' column.

# Sampling2: systematic sampling - every `step`-th row, capped at n rows.
# max(1, ...) keeps the step positive when the frame has fewer than n rows
# (np.arange cannot take a zero step).
step = max(1, len(balanced_data) // n)
indices = np.arange(0, len(balanced_data), step=step)
samples['Sampling2'] = balanced_data.iloc[indices[:n]]

# Sampling3: stratified sampling - the same number of rows from every class.
# Iterating over the groups (instead of groupby.apply) keeps the 'Class'
# column in the result on every pandas version and draws the same rows,
# because each group is still sampled with its own random_state=42.
per_class = n // 2
samples['Sampling3'] = pd.concat(
    [group.sample(per_class, random_state=42) for _, group in balanced_data.groupby('Class')]
)

# Sampling4: cluster sampling - cluster the feature columns with KMeans, pick a
# few clusters at random, then draw n rows (with replacement) from those clusters.
# The labels are kept in a separate array so `balanced_data` is never mutated
# and the cell can be re-run safely.
n_clusters = 10
n_selected_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(balanced_data.drop('Class', axis=1))
# Seeded generator: the cluster choice is now reproducible like every other step.
selected_clusters = np.random.default_rng(42).choice(n_clusters, n_selected_clusters, replace=False)
cluster_sample = balanced_data[np.isin(cluster_labels, selected_clusters)]
# NOTE(review): the chosen clusters can contain rows of a single class (the
# recorded run skipped Sampling4 for exactly that reason). Confirm whether the
# features should be scaled before clustering or the selection constrained.
samples['Sampling4'] = cluster_sample.sample(n=n, replace=True, random_state=42)

# Sampling5: bootstrap sampling - n rows drawn with replacement.
samples['Sampling5'] = balanced_data.sample(n=n, replace=True, random_state=42)


In [8]:
# Classifiers under comparison, keyed by a short id (M1..M5).
models = dict(
    M1=LogisticRegression(max_iter=5000),
    M2=DecisionTreeClassifier(random_state=42),
    M3=RandomForestClassifier(random_state=42),
    M4=KNeighborsClassifier(),
    M5=GaussianNB(),
)

# results[model id][sample name] -> accuracy; one independent dict per model.
results = dict((model_name, {}) for model_name in models)

In [10]:
# Train every model on every sample and record its hold-out accuracy in
# results[model id][sample name].
#
# The feature/target extraction, the single-class check and the train/test
# split depend only on the sample, so they are done once per sample rather
# than once per (model, sample) pair. The split is seeded, so every model is
# evaluated on the same partition as before.
#
# NOTE(review): the samples are drawn from a frame that was oversampled before
# splitting, so copies of the same row can presumably land in both the train
# and the test part; accuracies may be optimistic - confirm against the
# balancing cell.
for sample_name, sample_df in samples.items():
    X = sample_df.drop('Class', axis=1)
    y = sample_df['Class']

    # A classifier cannot be fitted (nor a stratified split made) with a single
    # class: record NaN for every model and move on to the next sample.
    if y.nunique() < 2:
        for model_name in models:
            print(f"Skipping training for {model_name} with {sample_name} because target 'Class' has only one unique value.")
            results[model_name][sample_name] = np.nan
        continue

    # 75/25 split that preserves the class ratio in both parts.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results[model_name][sample_name] = accuracy_score(y_test, y_pred)

# Rows = models, columns = sampling techniques.
final_table = pd.DataFrame(results).transpose()
print(final_table)
final_table.to_csv('sampling_results.csv')

Skipping training for M1 with Sampling4 because target 'Class' has only one unique value.
Skipping training for M2 with Sampling4 because target 'Class' has only one unique value.
Skipping training for M3 with Sampling4 because target 'Class' has only one unique value.
Skipping training for M4 with Sampling4 because target 'Class' has only one unique value.
Skipping training for M5 with Sampling4 because target 'Class' has only one unique value.
    Sampling1  Sampling2  Sampling3  Sampling4  Sampling5
M1   0.783505   0.938144   0.875000        NaN   0.927835
M2   0.958763   0.958763   0.979167        NaN   0.989691
M3   1.000000   1.000000   1.000000        NaN   1.000000
M4   0.969072   0.907216   0.958333        NaN   0.927835
M5   0.824742   0.649485   0.729167        NaN   0.742268
