# Phase 3 — Addressing Class Imbalance

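Use RandomUnderSampler + SMOTE (pipeline)
Rebalance the training set by first undersampling the majority class (label 0, BENIGN) to at most 100,000 rows, then applying SMOTE so that every other class has at least 20,000 rows. Classes that already exceed 20,000 rows are left unchanged, so the result is less imbalanced rather than perfectly balanced.
Undersampling before oversampling keeps the dataset size manageable.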

In [9]:
from pathlib import Path
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter

# --- Paths -------------------------------------------------------------------
DATA_DIR = Path("../data")
PROC_DIR = DATA_DIR / "processed" / "ml_ready"    # inputs: scaled features + labels
OUT_DIR = DATA_DIR / "processed" / "ml_balance"   # outputs: resampled train set + test copy
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Resampling parameters ---------------------------------------------------
MAJORITY_CLASS = 0          # label id that gets capped (BENIGN)
max_majority = 100_000      # upper bound on rows kept for MAJORITY_CLASS
MIN_MINORITY = 20_000       # every other class is raised to at least this many rows
RANDOM_STATE = 42           # shared seed so both samplers are reproducible
DEFAULT_K_NEIGHBORS = 5     # same value as SMOTE's own default

# --- Load standard-scaled training data --------------------------------------
X_train = pd.read_csv(PROC_DIR / "X_train_standard.csv").astype("float32")
y_train = pd.read_csv(PROC_DIR / "y_train.csv")

# The samplers and Counter below need a 1-D label Series. A multi-column label
# file would make Counter count column names instead of labels, so fail loudly
# instead of continuing with a DataFrame.
if y_train.shape[1] != 1:
    raise ValueError(
        f"Expected exactly one label column in y_train.csv, got {y_train.shape[1]}"
    )
y_train = y_train.iloc[:, 0]

# Count labels once; the result is reused for reporting and for both strategies.
class_counts = Counter(y_train)

print("Original train label distribution:\n", y_train.value_counts())
print("\nOriginal distribution:", class_counts)

# --- Strategy: cap BENIGN, boost small classes -------------------------------
# The majority class is capped at max_majority (never raised above its real
# count); every other class is raised to MIN_MINORITY or kept if already larger.
target_distribution = {
    cls: (
        min(count, max_majority)
        if cls == MAJORITY_CLASS
        else max(count, MIN_MINORITY)
    )
    for cls, count in class_counts.items()
}

print("Target distribution:", target_distribution)

# SMOTE interpolates between a sample and its k nearest same-class neighbours,
# so k must be smaller than the size of the smallest class being oversampled.
smallest_oversampled = min(
    (
        count
        for cls, count in class_counts.items()
        if cls != MAJORITY_CLASS and count < MIN_MINORITY
    ),
    default=None,
)
if smallest_oversampled is None:
    # Nothing needs synthesising; keep the library default.
    k_neighbors = DEFAULT_K_NEIGHBORS
elif smallest_oversampled < 2:
    raise ValueError(
        "SMOTE needs at least 2 samples per oversampled class; "
        f"the smallest class has {smallest_oversampled}"
    )
else:
    k_neighbors = min(DEFAULT_K_NEIGHBORS, smallest_oversampled - 1)

# --- Build and run the resampling pipeline -----------------------------------
steps = []
if MAJORITY_CLASS in class_counts:
    # Use the already-capped target so the under-sampler is never asked for
    # more rows than the majority class actually has.
    steps.append(
        (
            "under",
            RandomUnderSampler(
                sampling_strategy={MAJORITY_CLASS: target_distribution[MAJORITY_CLASS]},
                random_state=RANDOM_STATE,
            ),
        )
    )
steps.append(
    (
        "smote",
        SMOTE(
            random_state=RANDOM_STATE,
            sampling_strategy=target_distribution,
            k_neighbors=k_neighbors,
        ),
    )
)
pipeline = Pipeline(steps)

X_res, y_res = pipeline.fit_resample(X_train, y_train)

print("\nAfter resampling:", Counter(y_res))

# --- Save balanced training set ----------------------------------------------
pd.DataFrame(X_res, columns=X_train.columns).to_csv(OUT_DIR / "train_balanced.csv", index=False)
pd.Series(y_res).to_csv(OUT_DIR / "train_balanced_labels.csv", index=False)

# --- Also copy over test set -------------------------------------------------
# The test set is written out without resampling, next to the balanced train set.
X_test = pd.read_csv(PROC_DIR / "X_test_standard.csv").astype("float32")
y_test = pd.read_csv(PROC_DIR / "y_test.csv")
if y_test.shape[1] == 1:
    y_test = y_test.iloc[:, 0]

X_test.to_csv(OUT_DIR / "test.csv", index=False)
y_test.to_csv(OUT_DIR / "test_labels.csv", index=False)

print("Saved balanced training and test sets.")


Original train label distribution:
 Label_ID
0     1817055
4      184099
10     127043
2      102420
3        8234
7        6348
11       4717
6        4637
5        4399
1        1565
12       1206
14        522
9          29
13         17
8           9
Name: count, dtype: int64
Original distribution: Counter({0: 1817055, 4: 184099, 10: 127043, 2: 102420, 3: 8234, 7: 6348, 11: 4717, 6: 4637, 5: 4399, 1: 1565, 12: 1206, 14: 522, 9: 29, 13: 17, 8: 9})
Target distribution: {0: 100000, 5: 20000, 2: 102420, 7: 20000, 4: 184099, 10: 127043, 12: 20000, 3: 20000, 11: 20000, 6: 20000, 14: 20000, 1: 20000, 13: 20000, 9: 20000, 8: 20000}
After resampling: Counter({4: 184099, 10: 127043, 2: 102420, 0: 100000, 1: 20000, 3: 20000, 5: 20000, 6: 20000, 7: 20000, 8: 20000, 9: 20000, 11: 20000, 12: 20000, 13: 20000, 14: 20000})
Saved balanced training and test sets.
