# In [None]:
"""
Polynomial regression baselines (Linear, Lasso, Ridge, BayesianRidge)
with K-Fold CV across multiple runs.

Install:
    pip install numpy pandas scikit-learn

Outputs:
    - poly_results_all_runs.csv
    - poly_results_summary.csv
"""

from __future__ import annotations

from dataclasses import dataclass
from math import sqrt
from typing import Sequence

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge
from sklearn.metrics import mean_squared_error


# ----------------------------
# Config
# ----------------------------
@dataclass(frozen=True)
class Config:
    file_paths: dict[str, str] = None  # set below
    target_col: str = "ALM"

    n_runs: int = 10
    n_splits: int = 5
    degrees: tuple[int, ...] = (1, 2, 3)

    excluded_columns: tuple[str, ...] = ("0", "PPT ID", "Site", "Gender", "BMD - Total", "ALM", "% fat - Total", "Race")

    # If you want fewer prints
    verbose: bool = True


# Module-wide experiment configuration. Only the dataset paths are overridden;
# every other field keeps its Config default.
# The paths are relative, so they resolve against the current working directory.
CFG = Config(
    file_paths={
        "Male Data": "./../Data/male.csv",
        "Female Data": "./../Data/female.csv",
        "Complete Data": "./../Data/complete_data.csv",
    }
)

# (name, estimator) pairs evaluated at every polynomial degree.
# The name is what appears in the "model" column of the output CSVs.
MODELS = [
    ("LinearRegression", LinearRegression()),
    # NOTE(review): max_iter is set explicitly here, presumably to help
    # coordinate descent converge on the expanded polynomial features — confirm.
    ("Lasso", Lasso(max_iter=20000)),
    ("Ridge", Ridge()),
    ("BayesianRidge", BayesianRidge()),
]


# ----------------------------
# Metric
# ----------------------------
def percent_rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the RMSE as a percentage of the root-mean-square of ``y_true``.

    Args:
        y_true: Observed target values (any array-like of numbers).
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        ``100 * RMSE(y_true, y_pred) / (RMS(y_true) + 1e-12)``.
    """
    # Cast to float before squaring: an integer-typed target would otherwise be
    # squared in its own dtype (and can wrap for small integer dtypes), and a
    # plain list does not support ``**`` at all.
    y_true = np.asarray(y_true, dtype=float)

    rmse = sqrt(mean_squared_error(y_true, y_pred))
    # The tiny epsilon keeps the ratio finite when y_true is all zeros.
    denom = np.sqrt(np.mean(y_true**2)) + 1e-12
    return float(100.0 * rmse / denom)


# ----------------------------
# Core evaluation
# ----------------------------
def evaluate_models(
    file_path: str,
    target_col: str,
    n_runs: int,
    n_splits: int,
    degrees: Sequence[int],
    models,
    excluded_columns: Sequence[str],
    verbose: bool = False,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Score every (degree, model) combination with repeated shuffled K-fold CV.

    For each run a ``KFold`` with ``random_state=run`` is built. Within each
    fold the features are standardized using statistics from the training rows
    only, expanded to polynomial features of each requested degree, and every
    model is fitted and scored with ``percent_rmse`` on the held-out rows.

    Args:
        file_path: CSV file to load.
        target_col: Name of the column to predict.
        n_runs: Number of repeated CV runs.
        n_splits: Number of folds per run.
        degrees: Polynomial degrees to evaluate.
        models: Sized sequence of ``(name, estimator)`` pairs. The estimators
            are copied before fitting, so the passed instances stay unfitted.
        excluded_columns: Columns removed from the feature matrix; names that
            are not present in the file are ignored.
        verbose: Print per-run / per-fold progress.

    Returns:
        A 5-tuple ``(prmses, mean_over_folds, std_over_folds, overall_mean,
        overall_std)`` where ``prmses`` has shape
        ``(n_runs, n_splits, len(degrees), len(models))``, the fold summaries
        have shape ``(n_runs, len(degrees), len(models))`` and the overall
        summaries have shape ``(len(degrees), len(models))``.

    Raises:
        ValueError: If ``target_col`` is missing from the file, or no feature
            columns remain after the exclusions.
    """
    # Imported locally so this function brings its own dependency into scope.
    from sklearn.base import clone

    df = pd.read_csv(file_path)

    if target_col not in df.columns:
        raise ValueError(f"Target column '{target_col}' not found in {file_path}")

    # Features/target do not depend on the split, so build them once.
    # The target is always dropped from the features, even when the caller
    # forgot to list it in excluded_columns; otherwise it would leak into X.
    drop_cols = list(dict.fromkeys([*excluded_columns, target_col]))
    features = df.drop(columns=drop_cols, errors="ignore")
    target = df[target_col].to_numpy(dtype=float)

    if features.shape[1] == 0:
        raise ValueError(f"No feature columns left in {file_path} after exclusions")

    # Shape: (run, fold, degree_idx, model_idx)
    prmses = np.zeros((n_runs, n_splits, len(degrees), len(models)), dtype=float)

    for run in range(n_runs):
        if verbose:
            print(f"Run {run+1}/{n_runs} | {file_path}")

        # Seeding with the run index makes each run a different but
        # reproducible shuffle.
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=run)

        for fold, (train_idx, test_idx) in enumerate(kf.split(features)):
            if verbose:
                print(f"  Fold {fold+1}/{n_splits}")

            y_train = target[train_idx]
            y_test = target[test_idx]

            # Standardize ONLY using train fold
            scaler = StandardScaler()
            X_train_s = scaler.fit_transform(features.iloc[train_idx])
            X_test_s = scaler.transform(features.iloc[test_idx])

            for deg_idx, deg in enumerate(degrees):
                # The expansion is identical for every model, so compute it
                # once per degree rather than once per (model, degree) pair.
                poly = PolynomialFeatures(degree=deg, include_bias=False)
                X_train_p = poly.fit_transform(X_train_s)
                X_test_p = poly.transform(X_test_s)

                for model_idx, (_model_name, base_model) in enumerate(models):
                    # Fit a fresh copy so the caller's estimator is not mutated;
                    # safe=False falls back to a deep copy for objects that are
                    # not scikit-learn estimators.
                    reg = clone(base_model, safe=False)
                    reg.fit(X_train_p, y_train)
                    y_pred = reg.predict(X_test_p)

                    prmses[run, fold, deg_idx, model_idx] = percent_rmse(y_test, y_pred)

    # Per-run summary over folds: shape (run, degree, model)
    mean_over_folds = prmses.mean(axis=1)
    std_over_folds = prmses.std(axis=1)

    # Overall summary across runs and folds: shape (degree, model)
    overall_mean = prmses.mean(axis=(0, 1))
    overall_std = prmses.std(axis=(0, 1))

    return prmses, mean_over_folds, std_over_folds, overall_mean, overall_std


# ----------------------------
# Run + save
# ----------------------------
def main():
    """Evaluate every configured dataset, write both result CSVs, print a summary.

    Writes ``poly_results_all_runs.csv`` (one row per
    dataset/run/fold/degree/model) and ``poly_results_summary.csv`` (mean and
    std of %RMSE per dataset/degree/model) to the current working directory.

    Raises:
        ValueError: If ``CFG.file_paths`` is ``None`` or empty.
    """
    # Fail early with a clear message: a None mapping has no .items(), and an
    # empty one would only fail later when sorting a column-less summary frame.
    if not CFG.file_paths:
        raise ValueError("CFG.file_paths is empty; configure at least one dataset path")

    all_rows = []
    summary_rows = []
    results = {}

    model_names = [name for name, _ in MODELS]

    for dataset_name, path in CFG.file_paths.items():
        prmses, mean_folds, std_folds, overall_mean, overall_std = evaluate_models(
            file_path=path,
            target_col=CFG.target_col,
            n_runs=CFG.n_runs,
            n_splits=CFG.n_splits,
            degrees=CFG.degrees,
            models=MODELS,
            excluded_columns=CFG.excluded_columns,
            verbose=CFG.verbose,
        )
        results[dataset_name] = (prmses, mean_folds, std_folds)

        # Tidy "all runs" rows. np.ndindex walks the score array in row-major
        # order: run, then fold, then degree, then model.
        for run, fold, deg_idx, model_idx in np.ndindex(prmses.shape):
            all_rows.append(
                {
                    "dataset": dataset_name,
                    "target": CFG.target_col,
                    "run": run,
                    "fold": fold,
                    "degree": CFG.degrees[deg_idx],
                    "model": model_names[model_idx],
                    "percent_rmse": float(prmses[run, fold, deg_idx, model_idx]),
                }
            )

        # Summary per dataset/degree/model
        for deg_idx, model_idx in np.ndindex(overall_mean.shape):
            summary_rows.append(
                {
                    "dataset": dataset_name,
                    "target": CFG.target_col,
                    "degree": CFG.degrees[deg_idx],
                    "model": model_names[model_idx],
                    "mean_percent_rmse": float(overall_mean[deg_idx, model_idx]),
                    "std_percent_rmse": float(overall_std[deg_idx, model_idx]),
                }
            )

    all_df = pd.DataFrame(all_rows)
    summary_df = pd.DataFrame(summary_rows).sort_values(["dataset", "degree", "mean_percent_rmse"])

    all_df.to_csv("poly_results_all_runs.csv", index=False)
    summary_df.to_csv("poly_results_summary.csv", index=False)

    # Compact per-dataset view of the fold-level summaries
    for name, (_prmses, mean_folds, std_folds) in results.items():
        print(f"\nResults for {name}:")
        print("Mean %RMSE over folds (shape: run × degree × model):")
        print(mean_folds)
        print("\nStd %RMSE over folds (shape: run × degree × model):")
        print(std_folds)

    print("\nSaved:")
    print(" - poly_results_all_runs.csv")
    print(" - poly_results_summary.csv")


# Run the experiment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
