In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [7]:
# Load the source CSV into a DataFrame and report its (rows, columns) shape
data = pd.read_csv(filepath_or_buffer='creditcard_dataset.csv')
print(
    "Original dataset shape:",
    data.shape,
)

Original dataset shape: (284807, 31)


In [8]:
# 3️⃣ Separate the label column from the predictor columns
# `y` is the 'Class' column; `X` is every other column of `data`.
y = data['Class']
X = data.drop(columns='Class')

In [9]:
# 4️⃣ Hold out 20% of the rows as a test set
# stratify=y keeps the class proportions the same in both splits;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Per-class row counts for each split, before any resampling
print(
    "Class distribution in training set:",
    Counter(y_train),
)
print(
    "Class distribution in test set:",
    Counter(y_test),
)

Class distribution in training set: Counter({0: 227451, 1: 394})
Class distribution in test set: Counter({0: 56864, 1: 98})


In [11]:
# 5️⃣ Configure Random Undersampling (RUS)
# sampling_strategy='auto' balances the classes down to the minority class
# size; random_state pins which majority rows are kept.
rus = RandomUnderSampler(random_state=42, sampling_strategy='auto')

In [12]:
# Resample the training split only; X_test / y_test are not passed to the sampler
X_res, y_res = rus.fit_resample(X=X_train, y=y_train)

In [13]:
# Shape and per-class row counts of the undersampled training data
print(
    "Resampled dataset shape:",
    X_res.shape,
)
print(
    "Class distribution after RUS:",
    Counter(y_res),
)

Resampled dataset shape: (788, 30)
Class distribution after RUS: Counter({0: 394, 1: 394})


In [14]:
# 6️⃣ Combine resampled features and labels into a single frame
# Re-wrapping with the original column names keeps this cell working whether
# the sampler returned a DataFrame or a bare array.
balanced_data = pd.DataFrame(X_res, columns=X.columns)
# np.asarray accepts either a pandas Series or an ndarray (a plain ndarray has
# no `.values`), and assigning an array attaches the labels by position rather
# than by index alignment.
balanced_data['Class'] = np.asarray(y_res)

In [15]:
# 7️⃣ Write the balanced frame to disk
# index=False leaves the row index out of the CSV.
balanced_data.to_csv(path_or_buf='creditcard_dataset_rus.csv', index=False)

In [16]:
# Confirmation message; printed unconditionally once this cell runs
print(
    "✅ Balanced dataset after Random Undersampling saved as creditcard_dataset_rus.csv"
)

✅ Balanced dataset after Random Undersampling saved as creditcard_dataset_rus.csv
