In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import joblib
import os

In [2]:

# Read the raw credit-card CSV into a DataFrame named `df`
df = pd.read_csv(filepath_or_buffer='../data/creditcard.csv')

In [3]:
# Separate features and target
X = df.drop(columns=['Class'])
y = df['Class']

# Scale 'Time' and 'Amount'
scaler = StandardScaler()
X[['Time', 'Amount']] = scaler.fit_transform(X[['Time', 'Amount']])

In [4]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [5]:
# Oversample with SMOTE using the training split only (seeded for repeatability)
smote = SMOTE(random_state=42)
(X_train_resampled, y_train_resampled) = smote.fit_resample(
    X_train,
    y_train,
)

In [6]:
# Persist the resampled training data, the test split, and the fitted scaler
# together as a single tuple in one pickle file.
os.makedirs('../data', exist_ok=True)
joblib.dump(
    (
        X_train_resampled,
        y_train_resampled,
        X_test,
        y_test,
        scaler,
    ),
    '../data/creditcard_preprocessed.pkl',
)

['../data/creditcard_preprocessed.pkl']