# Phase 6 — Hyperparameter Tuning and Optimization

**Improved Approach:**
1. Use **Optuna** for efficient Bayesian optimization (better than RandomizedSearchCV)
2. Increase search iterations (20-30 trials)
3. Use class weights for better handling of imbalanced data
4. Cross-validation with 5 folds
5. Optimize for F1-weighted score (better for imbalanced datasets)

In [None]:
import pandas as pd
import joblib
import numpy as np
from pathlib import Path
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import optuna
from optuna.samplers import TPESampler

# Paths (relative, so they resolve against the current working directory)
DATA_DIR = Path("../data/processed/ml_balance")
MODEL_DIR = Path("../trained_models")
# Create the output directory up front so the later joblib.dump calls cannot
# fail on a missing folder; exist_ok makes re-running the cell harmless.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Load the ORIGINAL (non-resampled) training data; class imbalance is handled
# with class weights during fitting rather than by resampling.
print("Loading original training data (for class weight approach)...")
X = pd.read_csv(DATA_DIR / "train_original.csv")
y = pd.read_csv(DATA_DIR / "train_original_labels.csv")

# read_csv always returns a DataFrame; collapse a single-column label file to
# a Series so that y.value_counts() and y.map(...) below operate per label.
# A multi-column label file is left as a DataFrame.
if y.shape[1] == 1:
    y = y.iloc[:, 0]

# Load class weights.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load artifacts produced by this project.
# NOTE(review): presumably a mapping of class label -> weight, since it is
# later passed to y.map(); confirm against the step that wrote the file.
class_weights = joblib.load(DATA_DIR / "class_weights.pkl")
print(f"Loaded class weights for {len(class_weights)} classes")

# Define F1-weighted scorer (shared by every cross_val_score call below)
f1_scorer = make_scorer(f1_score, average='weighted')

print(f"Dataset shape: {X.shape}")
print(f"Class distribution:\n{y.value_counts()}")

# ============================================================
# Random Forest Optimization with Optuna
# ============================================================
print("\n" + "=" * 60)
print("Random Forest Optimization with Optuna")
print("=" * 60)

def objective_rf(trial):
    """Optuna objective for the Random Forest search.

    Draws one hyperparameter configuration from ``trial``, builds a
    ``RandomForestClassifier`` with ``class_weight='balanced'`` and returns
    the mean score of a 5-fold cross-validation on the module-level ``X`` /
    ``y`` using the module-level ``f1_scorer``.

    Args:
        trial: The Optuna trial used to suggest hyperparameter values.

    Returns:
        Mean cross-validation score over the 5 folds (higher is better).
    """
    # The suggest_* calls stay in this exact order so a seeded sampler
    # produces the same sequence of configurations.
    n_estimators = trial.suggest_int('n_estimators', 200, 600)
    max_depth = trial.suggest_int('max_depth', 10, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
    )

    # 5-fold CV, folds evaluated in parallel
    fold_scores = cross_val_score(model, X, y, cv=5, scoring=f1_scorer, n_jobs=-1)
    return fold_scores.mean()

# Seeded TPE sampler so the search is reproducible between runs.
study_rf = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42),
    study_name='RandomForest_Tuning'
)

study_rf.optimize(objective_rf, n_trials=25, show_progress_bar=True)

print("\n--- Best RF Parameters ---")
print(study_rf.best_params)
print(f"Best F1 Score: {study_rf.best_value:.4f}")

# Train final model with best params.
# study_rf.best_params only contains the values suggested inside the
# objective, so the fixed settings used during tuning must be repeated here.
# In particular class_weight='balanced' has to be passed again, otherwise the
# saved model is trained unweighted and does not match the tuned configuration.
best_rf = RandomForestClassifier(
    **study_rf.best_params,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
best_rf.fit(X, y)
joblib.dump(best_rf, MODEL_DIR / "final_rf_optuna.pkl")
print(f"Saved to {MODEL_DIR / 'final_rf_optuna.pkl'}")

# ============================================================
# XGBoost Optimization with Optuna
# ============================================================
print("\n" + "=" * 60)
print("XGBoost Optimization with Optuna")
print("=" * 60)

# Convert class weights to per-row sample weights for XGBoost
sample_weights = y.map(class_weights).values

# Series.map yields NaN for any label that has no entry in class_weights;
# fail loudly here instead of handing NaN weights to the booster.
if pd.isna(sample_weights).any():
    raise ValueError(
        "class_weights has no entry for some labels in y: "
        f"{sorted(set(y[pd.isna(sample_weights)]), key=str)}"
    )

def objective_xgb(trial):
    """Optuna objective for the XGBoost search.

    Draws one hyperparameter configuration from ``trial``, builds an
    ``XGBClassifier`` and returns the mean score of a 5-fold cross-validation
    on the module-level ``X`` / ``y`` using the module-level ``f1_scorer``.
    The module-level ``sample_weights`` array is forwarded to ``fit`` so each
    fold is trained with per-row weights.

    Args:
        trial: The Optuna trial used to suggest hyperparameter values.

    Returns:
        Mean cross-validation score over the 5 folds (higher is better).
    """
    import inspect

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 800),
        'max_depth': trial.suggest_int('max_depth', 4, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'random_state': 42,
        'tree_method': 'hist',
        'eval_metric': 'mlogloss'
    }

    xgb = XGBClassifier(**params)

    # scikit-learn 1.4 introduced ``params`` as the replacement for
    # ``fit_params`` and 1.6 removed the old keyword, so passing
    # ``fit_params`` unconditionally raises TypeError on current releases.
    # Pick whichever keyword the installed version accepts.
    accepted = inspect.signature(cross_val_score).parameters
    weights_kwarg = 'params' if 'params' in accepted else 'fit_params'

    # 5-fold CV with sample weights (sliced per fold by scikit-learn)
    scores = cross_val_score(
        xgb, X, y,
        cv=5,
        scoring=f1_scorer,
        n_jobs=-1,
        **{weights_kwarg: {'sample_weight': sample_weights}},
    )
    return scores.mean()

# Seeded TPE sampler so the search is reproducible between runs.
study_xgb = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42),
    study_name='XGBoost_Tuning'
)

study_xgb.optimize(objective_xgb, n_trials=30, show_progress_bar=True)

print("\n--- Best XGBoost Parameters ---")
print(study_xgb.best_params)
print(f"Best F1 Score: {study_xgb.best_value:.4f}")

# Train final model with best params.
# best_params only holds the values suggested inside objective_xgb, so the
# fixed settings (random_state, tree_method, eval_metric) are repeated here
# and the same sample weights are passed to fit, matching the tuned setup.
best_xgb = XGBClassifier(**study_xgb.best_params, random_state=42, tree_method='hist', eval_metric='mlogloss')
best_xgb.fit(X, y, sample_weight=sample_weights)
joblib.dump(best_xgb, MODEL_DIR / "final_xgb_optuna.pkl")
print(f"Saved to {MODEL_DIR / 'final_xgb_optuna.pkl'}")

# ============================================================
# LightGBM Optimization with Optuna
# ============================================================
print("\n" + "=" * 60)
print("LightGBM Optimization with Optuna")
print("=" * 60)

# Settings that are not tuned. They are shared by the objective and the final
# fit so the saved model is trained exactly like the best trial was.
LGBM_FIXED_PARAMS = {
    # LightGBM ignores ``subsample`` unless bagging is enabled with a
    # non-zero ``subsample_freq``; without this the tuned ``subsample``
    # value has no effect on the model.
    'subsample_freq': 1,
    'class_weight': 'balanced',
    'random_state': 42,
    'verbose': -1,
}


def objective_lgbm(trial):
    """Optuna objective for the LightGBM search.

    Draws one hyperparameter configuration from ``trial``, combines it with
    ``LGBM_FIXED_PARAMS``, and returns the mean score of a 5-fold
    cross-validation on the module-level ``X`` / ``y`` using the module-level
    ``f1_scorer``.

    Args:
        trial: The Optuna trial used to suggest hyperparameter values.

    Returns:
        Mean cross-validation score over the 5 folds (higher is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 800),
        'max_depth': trial.suggest_int('max_depth', 4, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        **LGBM_FIXED_PARAMS,
    }

    lgbm = LGBMClassifier(**params)

    # 5-fold CV
    scores = cross_val_score(lgbm, X, y, cv=5, scoring=f1_scorer, n_jobs=-1)
    return scores.mean()


# Seeded TPE sampler so the search is reproducible between runs.
study_lgbm = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42),
    study_name='LightGBM_Tuning'
)

study_lgbm.optimize(objective_lgbm, n_trials=25, show_progress_bar=True)

print("\n--- Best LightGBM Parameters ---")
print(study_lgbm.best_params)
print(f"Best F1 Score: {study_lgbm.best_value:.4f}")

# Train final model with best params.
# best_params only holds the suggested values, so the fixed settings
# (including class_weight='balanced') must be supplied again; otherwise the
# saved model would be trained unweighted, unlike every tuning trial.
best_lgbm = LGBMClassifier(**study_lgbm.best_params, **LGBM_FIXED_PARAMS)
best_lgbm.fit(X, y)
joblib.dump(best_lgbm, MODEL_DIR / "final_lgbm_optuna.pkl")
print(f"Saved to {MODEL_DIR / 'final_lgbm_optuna.pkl'}")

print("\n" + "=" * 60)
print("Optimization Complete!")
print("=" * 60)
print("\nBest F1 Scores:")
print(f"  Random Forest: {study_rf.best_value:.4f}")
print(f"  XGBoost:       {study_xgb.best_value:.4f}")
print(f"  LightGBM:      {study_lgbm.best_value:.4f}")

# Imports for the memory-optimization utilities that follow. They must be
# separate statements: appended to the print call above they are a SyntaxError.
import psutil
import gc


In [None]:
# ===================================================================
# Memory Optimization Utilities
# ===================================================================
import psutil
import gc

def get_memory_usage():
    """Return the resident set size of the current process.

    Returns:
        RSS in units of 1024**3 bytes (GiB, reported elsewhere as "GB").
    """
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / 1024**3

def optimize_dtypes(df):
    """Downcast numeric columns of ``df`` in place to reduce memory usage.

    * Signed and unsigned integer columns are narrowed to the smallest
      integer type of the same signedness whose range contains the column's
      minimum and maximum. This is lossless.
    * Float columns wider than 32 bits are converted to ``float32`` when
      their range fits. This is lossy: values are rounded to float32
      precision.
    * Every other column (object/string, bool, datetime, timedelta,
      categorical and other extension dtypes) is left untouched.

    Memory figures are printed before and after, computed in units of
    1024**3 bytes and labelled "GB".

    Args:
        df: DataFrame to optimize. It is modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    print("\nOptimizing data types...")
    start_mem = df.memory_usage(deep=True).sum() / 1024**3
    print(f"  Initial memory: {start_mem:.2f} GB")

    # Candidate targets, smallest first.
    signed_targets = (np.int8, np.int16, np.int32)
    unsigned_targets = (np.uint8, np.uint16, np.uint32)
    float32_size = np.dtype(np.float32).itemsize

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (categorical, nullable integers, strings, tz-aware
        # datetimes) are not NumPy dtypes; min/max or astype may not be
        # meaningful for them, so they are skipped.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in ('i', 'u'):
            c_min = df[col].min()
            c_max = df[col].max()
            targets = signed_targets if col_type.kind == 'i' else unsigned_targets
            for target in targets:
                # Only ever narrow; stop once candidates are as wide as the column.
                if np.dtype(target).itemsize >= col_type.itemsize:
                    break
                limits = np.iinfo(target)
                # Inclusive bounds: the limits themselves are representable.
                # An empty column has NaN min/max, so both comparisons are
                # False and the column is left as is.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        elif col_type.kind == 'f' and col_type.itemsize > float32_size:
            c_min = df[col].min()
            c_max = df[col].max()
            limits = np.finfo(np.float32)
            # Columns containing +/-inf or only NaN fail this test and keep
            # their original dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage(deep=True).sum() / 1024**3
    saved = start_mem - end_mem
    # Guard the percentage against a zero-byte frame.
    saved_pct = 100 * saved / start_mem if start_mem else 0.0
    print(f"  Final memory: {end_mem:.2f} GB")
    print(f"  Saved: {saved:.2f} GB ({saved_pct:.1f}%)")

    return df

# Report total and currently available system RAM plus this process's
# resident memory. All figures are divided by 1024**3 and labelled "GB".
print(f"System RAM: {psutil.virtual_memory().total / 1024**3:.1f} GB")
print(f"Available RAM: {psutil.virtual_memory().available / 1024**3:.1f} GB")
print(f"Current process memory: {get_memory_usage():.2f} GB")