## 3. Resampling Experiments (SMOTE, ADASYN, Tomek Links)

In [None]:
# 3_resampling_experiments.ipynb

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 3.1 Read the engineered dataset (parquet) into a DataFrame
df = pd.read_parquet(path="creditcard_engineered.parquet")

In [None]:
# Separate the feature matrix from the target column.
X = df.drop(columns="Class")
y = df["Class"]

# 3.2 Split into train/test (70:30, per proposal) BEFORE scaling, so that the
#     held-out rows never contribute to the scaler's mean/variance.
#     stratify=y keeps the class ratio the same in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# 3.3 Standardize features (especially Amount, TimeDelta, rolling stats).
#     The scaler is fitted on the training partition only; the test partition
#     is transformed with those training statistics (no data leakage).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any later cell that still refers to X_scaled keeps working; it is
# the full feature matrix scaled with the train-only statistics above.
X_scaled = scaler.transform(X)

In [None]:
# 3.4 Apply resampling techniques on training set only
#     (only X_train / y_train are passed to the samplers below).
#     Build the three samplers first; the two over-samplers use a fixed seed.
smote = SMOTE(random_state=42)    # 3.4.1 SMOTE
adasyn = ADASYN(random_state=42)  # 3.4.2 ADASYN
tomek = TomekLinks()              # 3.4.3 Tomek Links (undersampling majority)

#     Every sampler is fitted independently on the same training split.
X_smote, y_smote = smote.fit_resample(X_train, y_train)
X_adasyn, y_adasyn = adasyn.fit_resample(X_train, y_train)
X_tomek, y_tomek = tomek.fit_resample(X_train, y_train)

In [None]:
# 3.5 Print resulting class distributions
def print_distribution(name, y_arr):
    """Print how many samples of each class label ``y_arr`` contains.

    Args:
        name: Text placed at the start of the printed line.
        y_arr: Array-like of class labels (anything ``np.unique`` accepts).

    Prints a single line of the form
    ``"<name> distribution: {label: count, ...}"``.
    """
    unique, counts = np.unique(y_arr, return_counts=True)
    # .tolist() turns NumPy scalars into built-in Python values; without it
    # NumPy >= 2 renders the dict as {np.int64(0): np.int64(123), ...}.
    distribution = dict(zip(unique.tolist(), counts.tolist()))
    print(f"{name} distribution: {distribution}")

# Report the class balance of the original training split and of each
# resampled variant, in the same order the samplers were applied.
for split_label, split_targets in (
    ("Original train", y_train),
    ("SMOTE train", y_smote),
    ("ADASYN train", y_adasyn),
    ("Tomek train", y_tomek),
):
    print_distribution(split_label, split_targets)

In [None]:
# 3.6 Persist every split to parquet
def _save_split(X_part, y_part, x_path, y_path, feature_names):
    """Write one feature-matrix / target pair to two parquet files.

    Args:
        X_part: 2-D array-like of features (one row per sample).
        y_part: 1-D array-like of class labels, same length as ``X_part``.
        x_path: Destination path for the feature file.
        y_path: Destination path for the target file (single "Class" column).
        feature_names: Column names to attach to the feature matrix.

    Raises:
        ValueError: If ``X_part`` and ``y_part`` have different row counts.
    """
    # np.asarray drops any pandas index, so both frames get the same
    # positional index and line up row-for-row when they are reloaded.
    features = pd.DataFrame(np.asarray(X_part), columns=feature_names)
    target = pd.DataFrame({"Class": np.asarray(y_part)})
    if len(features) != len(target):
        raise ValueError(
            f"Row count mismatch for {x_path}: "
            f"{len(features)} feature rows vs {len(target)} labels"
        )
    features.to_parquet(x_path)
    target.to_parquet(y_path)


# Re-attach the original feature names: a bare ndarray would be written with
# integer column labels, which loses the names (and some parquet writers
# reject non-string column names).
feature_names = X.columns.astype(str)

# SMOTE
_save_split(X_smote, y_smote,
            "X_train_smote.parquet", "y_train_smote.parquet", feature_names)

# ADASYN
_save_split(X_adasyn, y_adasyn,
            "X_train_adasyn.parquet", "y_train_adasyn.parquet", feature_names)

# Tomek Links
_save_split(X_tomek, y_tomek,
            "X_train_tomek.parquet", "y_train_tomek.parquet", feature_names)

# Also save the original (unresampled) splits
_save_split(X_train, y_train,
            "X_train_orig.parquet", "y_train_orig.parquet", feature_names)

_save_split(X_test, y_test,
            "X_test.parquet", "y_test.parquet", feature_names)