In [1]:
# ==============================================
# Create a SMOTE-balanced training set and save it as CSV
# ==============================================

In [2]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

import os
# Set the LOKY_MAX_CPU_COUNT environment variable to "4" for this process.
# NOTE(review): presumably this caps the core count that joblib's loky backend
# detects (and silences its core-detection warning) — confirm the intent.
os.environ.update({"LOKY_MAX_CPU_COUNT": "4"})

In [3]:
# 1️⃣ Load the base dataset
# The CSV is read from the notebook's working directory; the 'Class' column
# it contains is used as the target in the following cell.
data = pd.read_csv(filepath_or_buffer="creditcard_dataset.csv")

In [4]:
# 2️⃣ Split into features and target
# y is the 'Class' column; X is every remaining column of `data`.
y = data['Class']
X = data.drop(columns='Class')

In [5]:
# 3️⃣ Split into training and test sets (80/20)
# The split is stratified on y and seeded so it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# 4️⃣ Apply SMOTE on training data
# Only the training split is resampled; X_test / y_test are not touched here.
# Per the recorded output of the summary cell, both classes end up with
# 227451 rows after resampling.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

In [7]:
# 5️⃣ Combine resampled data into a single DataFrame
# Features keep the column labels of X; the target is placed in a single
# trailing 'Class' column.
# NOTE(review): the target frame presumably relies on y_res still carrying
# the name 'Class' from data['Class'] — confirm if the target is ever renamed.
balanced_data = pd.concat(
    [
        pd.DataFrame(X_res, columns=X.columns),
        pd.DataFrame(y_res, columns=['Class']),
    ],
    axis="columns",
)


In [8]:
# 6️⃣ Save to CSV
# The row index is omitted from the file (index=False).
# NOTE(review): only the resampled training rows are written here; X_test and
# y_test are not persisted by this cell — confirm downstream evaluation uses
# an untouched hold-out rather than a re-split of this file.
balanced_data.to_csv(path_or_buf="creditcard_dataset_smote.csv", index=False)


In [9]:
# Report what was written, then the shape and per-class row counts before and
# after resampling. "Original" here refers to the training split
# (X_train / y_train) that was fed to SMOTE, not to the full CSV.
print("✅ SMOTE dataset created and saved as 'creditcard_dataset_smote.csv'")
for split_label, split_X, split_y in (
    ("Original dataset shape:", X_train, y_train),
    ("Resampled dataset shape:", X_res, y_res),
):
    print(split_label, split_X.shape, split_y.value_counts().to_dict())

✅ SMOTE dataset created and saved as 'creditcard_dataset_smote.csv'
Original dataset shape: (227845, 30) {0: 227451, 1: 394}
Resampled dataset shape: (454902, 30) {0: 227451, 1: 227451}
