# Phase 3 — Addressing Class Imbalance

Multiple strategies:
1. **Class weights** - Simpler, faster, preserves all real data
2. **Moderate SMOTE** - Conservative oversampling for extremely rare classes
3. **Focal Loss** - Advanced loss function for deep learning models

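We'll create one set of training artifacts per model type: the original data plus class weights, a moderately resampled training set, and a focal-loss module for deep learning.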

In [None]:
from pathlib import Path
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import joblib

# Input/output locations for this phase; the output folder is created on demand.
DATA_DIR = Path("../data")
PROC_DIR = DATA_DIR / "processed" / "ml_ready"
OUT_DIR = DATA_DIR / "processed" / "ml_balance"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Read the standard-scaled training features (cast to float32) and their labels.
train_features_path = PROC_DIR / "X_train_standard.csv"
train_labels_path = PROC_DIR / "y_train.csv"
X_train = pd.read_csv(train_features_path).astype("float32")
y_train = pd.read_csv(train_labels_path)

# A one-column label frame is collapsed to a Series; anything wider is left as-is.
n_label_columns = y_train.shape[1]
if n_label_columns == 1:
    y_train = y_train.iloc[:, 0]

# Show the class balance twice: as a pandas table and as a plain Counter.
print("Original train label distribution:\n", y_train.value_counts())
print("\nOriginal distribution:", Counter(y_train))

# ===============================================
# Strategy 1: Compute Class Weights (for tree-based models)
# ===============================================
print("\n=== Computing Class Weights ===")

# 'balanced' weights are computed by scikit-learn from the label frequencies;
# one weight is produced per distinct label, in the order of `classes`.
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight_dict = {label: value for label, value in zip(classes, class_weights)}

print("Class weights:")
for cls, weight in zip(classes, class_weights):
    print(f"  Class {cls}: {weight:.4f}")

# Persist the label -> weight mapping so training notebooks can reload it.
weights_path = OUT_DIR / "class_weights.pkl"
joblib.dump(class_weight_dict, weights_path)
print("Saved class weights to", weights_path)

# ===============================================
# Strategy 2: Moderate SMOTE (only for rare classes)
# ===============================================
print("\n=== Applying Moderate SMOTE ===")

# Per-class target sizes:
#   * label 0 (BENIGN) is capped at 200,000 rows to reduce its dominance
#   * classes with fewer than 1,000 rows are raised to 5,000
#   * classes with fewer than 10,000 rows are raised to 10,000
#   * every other class keeps its original size
# The counts are computed once and reused below.
train_counts = Counter(y_train)
target_distribution = {}
for cls, count in train_counts.items():
    if cls == 0:  # BENIGN
        target_distribution[cls] = min(count, 200_000)
    elif count < 1000:  # Very rare classes
        target_distribution[cls] = 5_000
    elif count < 10_000:  # Moderately rare
        target_distribution[cls] = 10_000
    else:  # Keep as is
        target_distribution[cls] = count

print("Target distribution (moderate approach):", target_distribution)

# SMOTE looks up k_neighbors neighbours inside each class it oversamples, so
# that class must contain at least k_neighbors + 1 rows. Shrink k for tiny
# classes instead of failing with an opaque nearest-neighbours error.
smote_k_neighbors = 3
oversampled_counts = [
    count for cls, count in train_counts.items()
    if target_distribution[cls] > count
]
if oversampled_counts:
    smallest_class = min(oversampled_counts)
    if smallest_class < 2:
        raise ValueError(
            "SMOTE needs at least 2 training rows in every class it oversamples; "
            f"the smallest such class has {smallest_class}."
        )
    if smallest_class - 1 < smote_k_neighbors:
        smote_k_neighbors = smallest_class - 1
        print(f"Reducing SMOTE k_neighbors to {smote_k_neighbors} (smallest class has {smallest_class} rows)")

# Apply moderate resampling: first cap BENIGN, then synthesise the rare classes.
pipeline = Pipeline([
    ("under", RandomUnderSampler(sampling_strategy={0: target_distribution[0]}, random_state=42)),
    ("smote", SMOTE(random_state=42, sampling_strategy=target_distribution, k_neighbors=smote_k_neighbors))
])

X_res, y_res = pipeline.fit_resample(X_train, y_train)

print("\nAfter moderate resampling:", Counter(y_res))

# Save balanced training set (for models that need resampled data)
pd.DataFrame(X_res, columns=X_train.columns).to_csv(OUT_DIR / "train_balanced.csv", index=False)
pd.Series(y_res).to_csv(OUT_DIR / "train_balanced_labels.csv", index=False)

# ===============================================
# Strategy 3: Save original training data (for class weight approach)
# ===============================================
print("\n=== Saving Original Data (for class weight training) ===")
for train_part, train_filename in (
    (X_train, "train_original.csv"),
    (y_train, "train_original_labels.csv"),
):
    train_part.to_csv(OUT_DIR / train_filename, index=False)

# Copy over test set (features cast to float32, labels collapsed to a Series
# when the label file has a single column).
X_test = pd.read_csv(PROC_DIR / "X_test_standard.csv").astype("float32")
y_test = pd.read_csv(PROC_DIR / "y_test.csv")
n_test_label_columns = y_test.shape[1]
if n_test_label_columns == 1:
    y_test = y_test.iloc[:, 0]

for test_part, test_filename in (
    (X_test, "test.csv"),
    (y_test, "test_labels.csv"),
):
    test_part.to_csv(OUT_DIR / test_filename, index=False)

# Recap of what was written and which training approach each file is for.
print("\nSaved all variants:")
for summary_line in (
    "  - train_original.csv: Use with class_weights",
    "  - train_balanced.csv: Use with moderate SMOTE",
    "  - class_weights.pkl: For tree-based models",
    "  - test.csv: Original test set (never resample!)",
):
    print(summary_line)

# ===============================================
# Strategy 4: Create Focal Loss for Deep Learning
# ===============================================
print("\n=== Creating Focal Loss Implementation ===")

# The text below is not executed in this notebook: it is the source of a small
# standalone module that is later written verbatim to focal_loss.py.
#
# What the generated module does:
#   * focal_loss(gamma, alpha) returns a Keras-style loss(y_true, y_pred)
#   * predictions are clipped to [epsilon, 1 - epsilon] before taking the log
#   * each term is alpha * (1 - p)^gamma * (-y_true * log(p)), summed over the
#     last axis and averaged over the remaining ones
#   * alpha is one scalar applied to every class, not a per-class weight
#
# NOTE(review): the element-wise product with y_true suggests the labels must
# be one-hot encoded with the same shape as y_pred — confirm against the
# training code before using integer labels.
focal_loss_code = '''
import tensorflow as tf
from tensorflow.keras import backend as K

def focal_loss(gamma=2.0, alpha=0.25):
    """
    Focal Loss for multi-class classification
    
    Args:
        gamma: Focusing parameter (default 2.0)
        alpha: Balancing parameter (default 0.25)
    
    Returns:
        Loss function
    """
    def focal_loss_fixed(y_true, y_pred):
        epsilon = K.epsilon()
        y_pred = K.clip(y_pred, epsilon, 1.0 - epsilon)
        
        # Compute cross entropy
        cross_entropy = -y_true * K.log(y_pred)
        
        # Compute focal loss
        loss = alpha * K.pow(1 - y_pred, gamma) * cross_entropy
        
        return K.mean(K.sum(loss, axis=-1))
    
    return focal_loss_fixed

# Usage in model compilation:
# model.compile(optimizer='adam', loss=focal_loss(gamma=2.0, alpha=0.25), metrics=['accuracy'])
'''

# Save focal loss code as an importable module next to the other artifacts.
# The encoding is pinned so the written file does not depend on the locale.
with open(OUT_DIR / "focal_loss.py", "w", encoding="utf-8") as f:
    f.write(focal_loss_code)

print("Saved focal_loss.py for deep learning models")
print("\nRecommendations:")
print("  • Tree models (RF/XGB): Use train_original.csv + class_weights")
print("  • Deep Learning: Use train_original.csv + focal_loss.py")
print("  • Baseline comparison: Use train_balanced.csv")

# This import was fused onto the end of the print call above, which made the
# line a syntax error; it now sits on its own line.
import psutil
import gc


In [None]:
# ===================================================================
# Memory Optimization Utilities
# ===================================================================
import psutil
import gc

def get_memory_usage():
    """Return the resident set size of the current process in GB (1024**3 bytes)."""
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / 1024**3

def optimize_dtypes(df):
    """Shrink numeric columns of ``df`` to the narrowest dtype that fits.

    Only plain NumPy integer and float columns are changed:

    * signed integers are narrowed to int8 / int16 / int32 when every value
      fits (bounds are inclusive);
    * unsigned integers are narrowed to uint8 / uint16 / uint32 the same way,
      rather than being converted to floating point;
    * floats wider than 32 bits are cast to float32 when their min and max lie
      strictly inside the float32 range. This trades precision for memory.

    Bool, datetime, object, categorical and pandas extension columns are left
    untouched, as are columns that are already as narrow as the candidate.

    Args:
        df: DataFrame to optimize. Columns are reassigned on this object.

    Returns:
        The same DataFrame object, after conversion.
    """
    print("\nOptimizing data types...")
    start_mem = df.memory_usage(deep=True).sum() / 1024**3
    print(f"  Initial memory: {start_mem:.2f} GB")

    signed_candidates = (np.int8, np.int16, np.int32)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32)
    float32_limits = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (nullable ints, categoricals, ...) are not NumPy
        # dtypes; min/max or astype may not be meaningful for them.
        if not isinstance(col_type, np.dtype):
            continue
        if col_type.kind not in ("i", "u", "f"):
            continue

        if col_type.kind == "f":
            # float16/float32 are already at most 4 bytes wide; never widen.
            if col_type.itemsize <= 4:
                continue
            c_min = df[col].min()
            c_max = df[col].max()
            # NaN or infinite extremes fail these comparisons and skip the cast.
            if c_min > float32_limits.min and c_max < float32_limits.max:
                df[col] = df[col].astype(np.float32)
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        candidates = signed_candidates if col_type.kind == "i" else unsigned_candidates
        for candidate in candidates:
            # Stop once the candidate is no narrower than the current dtype.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = np.iinfo(candidate)
            # An empty column yields NaN extremes, which fail both comparisons.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage(deep=True).sum() / 1024**3
    saved = start_mem - end_mem
    # Avoid a NaN percentage if the frame reports zero bytes.
    saved_pct = 100 * saved / start_mem if start_mem > 0 else 0.0
    print(f"  Final memory: {end_mem:.2f} GB")
    print(f"  Saved: {saved:.2f} GB ({saved_pct:.1f}%)")

    return df

# Report machine-wide RAM and this process's footprint, all in GB (1024**3 bytes).
total_ram_gb = psutil.virtual_memory().total / 1024**3
print(f"System RAM: {total_ram_gb:.1f} GB")
available_ram_gb = psutil.virtual_memory().available / 1024**3
print(f"Available RAM: {available_ram_gb:.1f} GB")
process_mem_gb = get_memory_usage()
print(f"Current process memory: {process_mem_gb:.2f} GB")

In [None]:
# Visualization: Compare distributions
import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Per-class counts before and after resampling, ordered by class label.
y_train_counts = y_train.value_counts().sort_index()
y_res_counts = pd.Series(y_res).value_counts().sort_index()

# One log-scaled bar chart per panel: (axis, counts, bar colour, title).
panels = (
    (axes[0], y_train_counts, 'steelblue', 'Original Distribution (with class weights)'),
    (axes[1], y_res_counts, 'coral', 'After Moderate SMOTE'),
)
for ax, counts, bar_color, panel_title in panels:
    ax.bar(counts.index, counts.values, color=bar_color)
    ax.set_xlabel('Class')
    ax.set_ylabel('Count')
    ax.set_title(panel_title)
    ax.set_yscale('log')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Row counts before/after and the relative change in training-set size.
n_original = len(y_train)
n_resampled = len(y_res)
print("\nSummary:")
print(f"Original samples: {n_original:,}")
print(f"After SMOTE: {n_resampled:,}")
print(f"Size increase: {(n_resampled / n_original - 1) * 100:.1f}%")