In [None]:
"""
SVR + LSSVR hyperparameter sweep with CV (multiprocessing).

Install:
    pip install numpy pandas scikit-learn

For SVR (scikit-learn): included in scikit-learn

For LSSVR:
    pip install lssvr
    (If 'lssvr' is not available on your system, comment out LSSVR usage or replace with your own implementation.)

Usage:
    python svr_lssvr_sweep.py

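Outputs:
    - results_svr_lssvr_all_runs.csv  (one row per job: mean %RMSE across CV folds for each run)
    - results_svr_lssvr_summary.csv   (mean %RMSE averaged over runs for each hyperparameter setting)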
"""

from __future__ import annotations

import multiprocessing as mp
from dataclasses import dataclass
from itertools import product
from typing import Literal, Optional

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR

# LSSVR is optional: the import is always attempted, and on any failure LSSVR
# is set to None so the rest of the script can detect that it is unavailable.
try:
    from lssvr import LSSVR  # type: ignore
except Exception:
    LSSVR = None


# Model identifiers accepted by make_model() and listed in Config.models.
ModelType = Literal["svr", "lssvr"]


# ----------------------------
# Config
# ----------------------------
@dataclass(frozen=True)
class Config:
    """Immutable settings for the SVR/LSSVR hyperparameter sweep."""

    # CSV file read by main() with pandas.read_csv.
    data_path: str = "./../Data/complete_data.csv"

    # Subsets to evaluate: "All" keeps every row; any other value is matched
    # against the "Gender" column in main().
    sexes: tuple[str, ...] = ("All", "Male", "Female")
    # Columns used as regression targets; each one is swept separately.
    target_features: tuple[str, ...] = ("ALM", "% fat - Total", "BMD - Total")

    # Columns dropped before building the feature matrix. Every target column
    # is listed here, so no target is ever used as an input feature.
    excluded_columns: tuple[str, ...] = ("PPT ID", "Site", "Gender", "Race", "BMD - Total", "ALM", "% fat - Total")

    # Number of repeated CV runs; the run index (0..n_runs-1) is used as the
    # KFold random_state, so each run shuffles differently but reproducibly.
    n_runs: int = 10
    # Number of folds in each KFold split.
    n_splits: int = 5

    # RBF kernel settings (used by both SVR and LSSVR)
    gammas: tuple[float, ...] = (0.0001, 0.001, 0.01, 0.1, 0.5, 1.0)

    # Regularization parameter
    Cs: tuple[float, ...] = (0.01, 0.1, 1, 10, 25, 50, 75, 100, 250, 500, 1000, 10000)

    # For scikit SVR only (epsilon-insensitive loss width)
    epsilons: tuple[float, ...] = (0.01, 0.1)

    # Which models to run
    models: tuple[ModelType, ...] = ("svr", "lssvr")

    # Worker-process count for the multiprocessing pool. The default is
    # evaluated once, when the class body executes: all CPUs but one, minimum 1.
    processes: int = max(1, mp.cpu_count() - 1)


# Module-wide configuration instance read by main().
CFG = Config()


# ----------------------------
# Metrics
# ----------------------------
def percent_rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the RMSE of ``y_pred`` as a percentage of the RMS of ``y_true``.

    A tiny constant (1e-12) is added to the RMS of ``y_true`` so that an
    all-zero target does not cause a division by zero.
    """
    mse = mean_squared_error(y_true, y_pred)
    rms_of_truth = np.sqrt(np.mean(y_true**2))
    return 100.0 * np.sqrt(mse) / (rms_of_truth + 1e-12)


# ----------------------------
# Model factory
# ----------------------------
def make_model(model_type: ModelType, gamma: float, C: float, epsilon: Optional[float] = None):
    """Build an unfitted RBF-kernel regressor of the requested type.

    Args:
        model_type: ``"svr"`` for scikit-learn's SVR or ``"lssvr"`` for LSSVR.
        gamma: RBF kernel coefficient passed to the estimator.
        C: Regularization parameter passed to the estimator.
        epsilon: Epsilon passed to SVR; ``None`` falls back to 0.1. Ignored
            for LSSVR.

    Raises:
        ValueError: If ``model_type`` is neither ``"svr"`` nor ``"lssvr"``.
        ImportError: If LSSVR is requested but the ``lssvr`` import failed.
    """
    if model_type not in ("svr", "lssvr"):
        raise ValueError(f"Unknown model_type: {model_type}")

    if model_type == "svr":
        # epsilon only exists on sklearn's SVR; substitute the default when unset
        svr_epsilon = 0.1 if epsilon is None else epsilon
        return SVR(kernel="rbf", gamma=gamma, C=C, epsilon=svr_epsilon)

    # Only "lssvr" can reach this point.
    if LSSVR is None:
        raise ImportError("LSSVR requested but `lssvr` could not be imported. Run `pip install lssvr`.")
    # lssvr API: LSSVR(kernel='rbf', gamma=..., C=...)
    return LSSVR(kernel="rbf", gamma=gamma, C=C)


# ----------------------------
# Single job: (model, sex, target, gamma, C, epsilon?, run)
# ----------------------------
def run_one_job(args):
    """Score one hyperparameter setting with a single shuffled K-fold CV run.

    ``args`` is a 10-tuple ``(model_type, sex, target, gamma, C, epsilon,
    run_seed, X_all, y_all, n_splits)``. ``sex`` and ``target`` are part of
    the tuple but are not used inside this function. ``run_seed`` is the
    ``random_state`` of the KFold shuffle.

    Returns:
        The mean %RMSE over the ``n_splits`` folds, as a Python float.
    """
    model_type, _sex, _target, gamma, C, epsilon, run_seed, X_all, y_all, n_splits = args

    def _score_fold(train_idx, test_idx):
        # Fit the scaler on the training fold only, then apply it to the
        # held-out fold, so test statistics never influence the scaling.
        scaler = StandardScaler()
        train_features = scaler.fit_transform(X_all[train_idx])
        test_features = scaler.transform(X_all[test_idx])

        # A fresh estimator per fold keeps the folds independent.
        estimator = make_model(model_type, gamma=gamma, C=C, epsilon=epsilon)
        estimator.fit(train_features, y_all[train_idx])
        predictions = estimator.predict(test_features)
        return percent_rmse(y_all[test_idx], predictions)

    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=run_seed)
    per_fold = [_score_fold(train_idx, test_idx) for train_idx, test_idx in splitter.split(X_all)]
    return float(np.mean(per_fold))


def progress_bar(done: int, total: int, bar_len: int = 40) -> None:
    """Print a one-line text progress bar to stdout, overwriting the last one.

    Args:
        done: Number of completed items.
        total: Total number of items. A non-positive total is drawn as a
            full bar instead of raising ZeroDivisionError.
        bar_len: Width of the bar in characters.
    """
    if total > 0:
        # Clamp so that done < 0 or done > total cannot distort the bar width.
        filled = max(0, min(bar_len, int(bar_len * done / total)))
    else:
        filled = bar_len
    bar = "#" * filled + "_" * (bar_len - filled)
    # The line ends in "\r", not "\n", so line-buffered stdout would not flush
    # it on its own; flush explicitly so the bar is actually visible.
    print(f"[{bar}] {done}/{total}", end="\r", flush=True)


# ----------------------------
# Main
# ----------------------------
def _build_combos() -> list:
    """List every (model_type, sex, target, gamma, C, epsilon, run_seed) job to run."""
    combos = []
    for model_type in CFG.models:
        if model_type == "lssvr" and LSSVR is None:
            print("Warning: LSSVR not available (import failed). Skipping LSSVR runs.")
            continue

        # epsilon is swept for SVR only; other models get a single None placeholder.
        eps_values = CFG.epsilons if model_type == "svr" else (None,)

        for sex, target, gamma, C, run_seed in product(
            CFG.sexes, CFG.target_features, CFG.gammas, CFG.Cs, range(CFG.n_runs)
        ):
            for eps in eps_values:
                combos.append((model_type, sex, target, gamma, C, eps, run_seed))
    return combos


def _build_datasets(df) -> dict:
    """Return {(sex, target): (X, y)} float arrays for every configured sex/target pair."""
    datasets = {}
    for sex in CFG.sexes:
        df_f = df if sex == "All" else df[df["Gender"] == sex]
        # The feature matrix depends only on the sex filter, so build it once per sex.
        X = df_f.drop(columns=list(CFG.excluded_columns), errors="ignore").to_numpy(dtype=float)
        for target in CFG.target_features:
            datasets[(sex, target)] = (X, df_f[target].to_numpy(dtype=float))
    return datasets


def main() -> None:
    """Run the full sweep in a process pool and write the per-run and summary CSVs.

    Loads the data from ``CFG.data_path``, evaluates every job with
    ``run_one_job``, writes ``results_svr_lssvr_all_runs.csv`` and
    ``results_svr_lssvr_summary.csv`` to the working directory, and prints the
    best configuration for each (model, sex, target). Returns early, without
    writing anything, when there are no jobs to run.
    """
    print("Loading data...")
    df = pd.read_csv(CFG.data_path)
    print("Data loaded.")

    combos = _build_combos()
    total = len(combos)
    print(f"Total jobs: {total}  | processes={CFG.processes}")

    if total == 0:
        # Without this guard the empty results frame has no columns and the
        # groupby below fails with a KeyError.
        print("No jobs to run; nothing to save.")
        return

    # Build each (X, y) pair once instead of re-filtering the DataFrame for
    # every job; all jobs for a given (sex, target) share the same arrays.
    datasets = _build_datasets(df)

    results_rows = []

    # Multiprocessing pool
    with mp.Pool(processes=CFG.processes) as pool:
        pending = []
        for combo in combos:
            model_type, sex, target, gamma, C, eps, run_seed = combo
            X, y = datasets[(sex, target)]
            job_args = (model_type, sex, target, gamma, C, eps, run_seed, X, y, CFG.n_splits)
            pending.append((combo, pool.apply_async(run_one_job, (job_args,))))

        print("\nCollecting results...")

        # apply_async returns immediately, so progress is reported here, where
        # get() blocks until each job has actually finished.
        for done, (combo, ar) in enumerate(pending, start=1):
            model_type, sex, target, gamma, C, eps, run_seed = combo
            score = ar.get()

            results_rows.append(
                {
                    "model": model_type,
                    "sex": sex,
                    "target": target,
                    "gamma": gamma,
                    "C": C,
                    "epsilon": eps,
                    "run": run_seed,
                    "mean_percent_rmse": score,
                }
            )
            progress_bar(done, total)

    results = pd.DataFrame(results_rows)

    # Aggregate over runs (mean across runs). dropna=False keeps the rows whose
    # epsilon is missing (models that do not use epsilon).
    summary = (
        results.groupby(["model", "sex", "target", "gamma", "C", "epsilon"], dropna=False)["mean_percent_rmse"]
        .mean()
        .reset_index()
        .rename(columns={"mean_percent_rmse": "avg_percent_rmse_over_runs"})
        .sort_values(["model", "sex", "target", "avg_percent_rmse_over_runs"])
    )

    results.to_csv("results_svr_lssvr_all_runs.csv", index=False)
    summary.to_csv("results_svr_lssvr_summary.csv", index=False)

    print("\nSaved:")
    print(" - results_svr_lssvr_all_runs.csv")
    print(" - results_svr_lssvr_summary.csv")

    # Print best config per (model, sex, target). summary is sorted by
    # ascending error within each group, so the first row is the best one.
    print("\nBest configs:")
    for (model, sex, target), g in summary.groupby(["model", "sex", "target"]):
        best = g.iloc[0]
        print(
            f"{model.upper():5s} | {sex:6s} | {target:12s} | "
            f"%RMSE={best['avg_percent_rmse_over_runs']:.3f} | "
            f"gamma={best['gamma']} C={best['C']} epsilon={best['epsilon']}"
        )


if __name__ == "__main__":
    main()
