In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("Creditcard_data.csv")
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
print(df.shape)
df.info()

(772, 31)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 772 entries, 0 to 771
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    772 non-null    int64  
 1   V1      772 non-null    float64
 2   V2      772 non-null    float64
 3   V3      772 non-null    float64
 4   V4      772 non-null    float64
 5   V5      772 non-null    float64
 6   V6      772 non-null    float64
 7   V7      772 non-null    float64
 8   V8      772 non-null    float64
 9   V9      772 non-null    float64
 10  V10     772 non-null    float64
 11  V11     772 non-null    float64
 12  V12     772 non-null    float64
 13  V13     772 non-null    float64
 14  V14     772 non-null    float64
 15  V15     772 non-null    float64
 16  V16     772 non-null    float64
 17  V17     772 non-null    float64
 18  V18     772 non-null    float64
 19  V19     772 non-null    float64
 20  V20     772 non-null    float64
 21  V21     772 non-null    float

In [4]:
df["Class"].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,763
1,9


In [5]:
!pip install imbalanced-learn



In [6]:
from imblearn.over_sampling import RandomOverSampler

X = df.drop("Class", axis=1)
y = df["Class"]

In [7]:
ros = RandomOverSampler(random_state=42)
X_bal, y_bal = ros.fit_resample(X, y)

In [8]:
pd.Series(y_bal).value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,763
1,763


In [9]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split

In [10]:
samplers = {
    "RandomOver": RandomOverSampler(random_state=42),
    "SMOTE": SMOTE(random_state=42),
    "RandomUnder": RandomUnderSampler(random_state=42),
    "SMOTEENN": SMOTEENN(random_state=42),
    "StratifiedSplit": None  # handled separately
}

In [11]:
sampled_data = {}

for name, sampler in samplers.items():
    if sampler is not None:
        X_s, y_s = sampler.fit_resample(X_bal, y_bal)
        sampled_data[name] = (X_s, y_s)
    else:
        X_s, X_t, y_s, y_t = train_test_split(
            X_bal, y_bal, test_size=0.2, stratify=y_bal, random_state=42
        )
        sampled_data[name] = (X_s, y_s)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score

In [13]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(),
    "DecisionTree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC()
}

In [None]:
results = pd.DataFrame(index=models.keys(), columns=sampled_data.keys())

for sample_name, (X_s, y_s) in sampled_data.items():

    X_train, X_test, y_train, y_test = train_test_split(
        X_s, y_s, test_size=0.2, random_state=42
    )

    for model_name, model in models.items():

        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        acc = accuracy_score(y_test, preds)
        results.loc[model_name, sample_name] = acc

In [15]:
results

Unnamed: 0,RandomOver,SMOTE,RandomUnder,SMOTEENN,StratifiedSplit
LogisticRegression,0.934641,0.934641,0.905229,0.923841,0.946721
RandomForest,1.0,1.0,1.0,1.0,1.0
DecisionTree,0.993464,1.0,1.0,1.0,0.983607
KNN,0.986928,0.986928,0.980392,0.986755,0.979508
SVM,0.669935,0.669935,0.689542,0.708609,0.729508


In [16]:
results.idxmax(axis=1)

Unnamed: 0,0
LogisticRegression,StratifiedSplit
RandomForest,RandomOver
DecisionTree,SMOTE
KNN,RandomOver
SVM,StratifiedSplit
