# In [None]:
"""
Random Forest + XGBoost grid sweep with K-Fold CV.

Install:
    pip install numpy pandas scikit-learn

For XGBoost:
    pip install xgboost

Optional (only for heatmaps):
    pip install matplotlib seaborn
"""

from __future__ import annotations

from dataclasses import dataclass
from math import sqrt
from typing import Literal

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

try:
    from xgboost import XGBRegressor
except Exception:
    XGBRegressor = None

# plotting is optional
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
except Exception:
    plt, sns = None, None


# Identifiers of the supported regressors, as dispatched by make_model():
# "rf" -> RandomForestRegressor, "xgb" -> XGBRegressor.
ModelName = Literal["rf", "xgb"]


# ----------------------------
# Config
# ----------------------------
@dataclass(frozen=True)
class Config:
    """Immutable settings for the grid sweep; every field has a default."""

    # CSV file that main() loads with pandas.
    data_path: str = "./../Data/complete_data.csv"

    # Subsets to evaluate: "All" keeps every row, any other value is matched
    # against the "Gender" column.
    sexes: tuple[str, ...] = ("All", "Male", "Female")
    # Columns predicted, one model per target.
    targets: tuple[str, ...] = ("ALM", "% fat - Total", "BMD - Total")

    # Columns removed before building the feature matrix (names that are not
    # present in the data are ignored).
    excluded_columns: tuple[str, ...] = (
        "0",
        "PPT ID",
        "Site",
        "Gender",
        "BMD - Total",
        "ALM",
        "% fat - Total",
        "Race",
    )

    # Hyper-parameter grid.
    trees: tuple[int, ...] = (10, 15)  # n_estimators
    depths: tuple[int, ...] = (10, 25)  # max_depth

    # Repeated K-fold: n_runs shuffles (seeded 0..n_runs-1), n_splits folds each.
    n_runs: int = 2
    n_splits: int = 5

    # Which models to run
    models: tuple[ModelName, ...] = ("rf", "xgb")

    # Plot heatmaps at the end?
    plot: bool = True


# Module-level configuration instance (all defaults); main() reads its settings from here.
CFG = Config()


# ----------------------------
# Utilities
# ----------------------------
def update_progress_bar(done: int, total: int, bar_length: int = 40) -> None:
    """Redraw a one-line text progress bar on stdout.

    The line is terminated with a carriage return instead of a newline, so
    the next call overwrites it in place.

    Args:
        done: Number of work items finished so far.
        total: Number of work items expected overall. A non-positive value
            draws an empty bar instead of raising ``ZeroDivisionError``.
        bar_length: Width of the bar in characters, brackets excluded.
    """
    if total > 0:
        # Clamp so an out-of-range ``done`` (negative or beyond ``total``)
        # can never make the bar wider than ``bar_length``.
        filled = min(max(int(bar_length * done / total), 0), bar_length)
    else:
        filled = 0

    bar = "#" * filled + "_" * (bar_length - filled)
    # The line has no trailing newline, so a line-buffered stdout would hold
    # it back; flush explicitly to make the progress visible right away.
    print(f"[{bar}] {done}/{total}", end="\r", flush=True)


def percent_rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the RMSE of *y_pred* as a percentage of the RMS of *y_true*.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, aligned with *y_true*.

    Returns:
        ``100 * RMSE(y_true, y_pred) / RMS(y_true)``.
    """
    error_rms = sqrt(mean_squared_error(y_true, y_pred))
    # The tiny epsilon keeps the ratio finite when y_true is all zeros.
    target_rms = np.sqrt(np.mean(np.square(y_true))) + 1e-12
    return 100.0 * error_rms / target_rms


def make_model(model: ModelName, n_estimators: int, max_depth: int, seed: int):
    """Build an unfitted regressor of the requested kind.

    Args:
        model: ``"rf"`` for a random forest or ``"xgb"`` for XGBoost.
        n_estimators: Number of trees.
        max_depth: Maximum depth of each tree.
        seed: Value passed as the estimator's ``random_state``.

    Returns:
        A ``RandomForestRegressor`` or ``XGBRegressor`` instance.

    Raises:
        ImportError: If ``"xgb"`` is requested but xgboost could not be imported.
        ValueError: If *model* is neither ``"rf"`` nor ``"xgb"``.
    """
    # Settings that both estimators receive unchanged.
    shared = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "random_state": seed,
        "n_jobs": -1,
    }

    if model == "rf":
        return RandomForestRegressor(**shared)

    if model != "xgb":
        raise ValueError(f"Unknown model: {model}")

    if XGBRegressor is None:
        raise ImportError("xgboost is not installed. Run: pip install xgboost")

    return XGBRegressor(tree_method="auto", verbosity=0, **shared)


# ----------------------------
# Main experiment
# ----------------------------
def _plot_heatmaps(results: pd.DataFrame) -> None:
    """Show one heatmap of mean %RMSE per (model, sex, target) in *results*.

    Each heatmap has ``n_estimators`` on the rows and ``max_depth`` on the
    columns. Combinations without any result rows are skipped.
    """
    for model in CFG.models:
        for sex in CFG.sexes:
            for target in CFG.targets:
                sub = results[(results["model"] == model) & (results["sex"] == sex) & (results["target"] == target)]
                if sub.empty:
                    continue

                # pivot to a grid: rows=n_estimators, cols=max_depth
                grid = sub.pivot(index="n_estimators", columns="max_depth", values="mean_percent_rmse")
                plt.figure(figsize=(6, 4))
                sns.heatmap(grid, annot=True, fmt=".2f", cmap="viridis")
                plt.title(f"{model.upper()} | {sex} | {target} (%RMSE)")
                plt.xlabel("max_depth")
                plt.ylabel("n_estimators")
                plt.tight_layout()
                plt.show()


def main() -> None:
    """Run the full grid sweep, save the results as CSV and optionally plot them.

    For every (model, sex, target, n_estimators, max_depth) combination taken
    from ``CFG``, the model is evaluated with ``CFG.n_runs`` shuffled K-fold
    runs of ``CFG.n_splits`` folds each, and the percent RMSE is averaged
    over all runs and folds.

    Two files are written relative to the current working directory:
    ``rf_xgb_results.csv`` (one row per combination) and
    ``rf_xgb_best_configs.csv`` (the lowest-error row per model/sex/target).
    If ``CFG.plot`` is set and matplotlib/seaborn are importable, one heatmap
    per (model, sex, target) is shown afterwards.

    Raises:
        ImportError: If ``"xgb"`` is listed in ``CFG.models`` but xgboost
            could not be imported.
    """
    # Fail fast: results are only written once the whole sweep has finished,
    # so discovering a missing xgboost after the earlier models have been
    # trained would throw all of that work away.
    if "xgb" in CFG.models and XGBRegressor is None:
        raise ImportError("xgboost is not installed. Run: pip install xgboost")

    print("Loading data...")
    df = pd.read_csv(CFG.data_path)

    rows = []

    # Total iterations: model × sex × target × tree × depth × run × fold
    total_iters = (
        len(CFG.models) * len(CFG.sexes) * len(CFG.targets) * len(CFG.trees) * len(CFG.depths) * CFG.n_runs * CFG.n_splits
    )
    done = 0

    for model in CFG.models:
        for sex in CFG.sexes:
            df_f = df if sex == "All" else df[df["Gender"] == sex]

            for target in CFG.targets:
                # Always drop the current target from the features, even when
                # it is not listed in excluded_columns, so the label can never
                # leak into X. dict.fromkeys de-duplicates while keeping order.
                drop_cols = list(dict.fromkeys((*CFG.excluded_columns, target)))
                X = df_f.drop(columns=drop_cols, errors="ignore")
                y = df_f[target].to_numpy(dtype=float)

                # Use numpy arrays for faster indexing
                X_np = X.to_numpy(dtype=float)

                for n_estimators in CFG.trees:
                    for max_depth in CFG.depths:
                        fold_scores = []

                        for run in range(CFG.n_runs):
                            # The run index seeds both the shuffle and the model,
                            # so each run is reproducible but differs from the others.
                            kf = KFold(n_splits=CFG.n_splits, shuffle=True, random_state=run)

                            for train_idx, test_idx in kf.split(X_np):
                                X_train, X_test = X_np[train_idx], X_np[test_idx]
                                y_train, y_test = y[train_idx], y[test_idx]

                                mdl = make_model(model, n_estimators=n_estimators, max_depth=max_depth, seed=run)
                                mdl.fit(X_train, y_train)
                                y_pred = mdl.predict(X_test)

                                score = percent_rmse(y_test, y_pred)
                                fold_scores.append(score)

                                done += 1
                                update_progress_bar(done, total_iters)

                        # One summary row per grid point: mean over all runs and folds.
                        mean_score = float(np.mean(fold_scores))
                        rows.append(
                            {
                                "model": model,
                                "sex": sex,
                                "target": target,
                                "n_estimators": n_estimators,
                                "max_depth": max_depth,
                                "mean_percent_rmse": mean_score,
                            }
                        )

    print("\nAll iterations completed.")

    results = pd.DataFrame(rows)
    results.to_csv("rf_xgb_results.csv", index=False)

    # Best config per (model, sex, target): after sorting by error, the first
    # row of each group is the one with the lowest mean %RMSE.
    best = (
        results.sort_values("mean_percent_rmse")
        .groupby(["model", "sex", "target"], as_index=False)
        .first()
    )
    best.to_csv("rf_xgb_best_configs.csv", index=False)

    print("Saved:")
    print(" - rf_xgb_results.csv")
    print(" - rf_xgb_best_configs.csv")

    # Optional heatmaps (one heatmap per (model, sex, target))
    if not CFG.plot:
        return
    if plt is None or sns is None:
        print("Plotting requested but matplotlib/seaborn not installed. Skipping plots.")
        return
    _plot_heatmaps(results)


# Run the sweep only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
