# In [None]:
"""
MLP baseline with K-Fold CV over multiple runs and epoch sweep.

Install:
    pip install numpy pandas scikit-learn torch

Optional (if you want GPU):
    pip install torch --index-url https://download.pytorch.org/whl/cu121
    (choose the CUDA version that matches your system)
"""

from __future__ import annotations

import pickle
import random
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, max_error
from torch.utils.data import DataLoader, TensorDataset


# ----------------------------
# Config
# ----------------------------
@dataclass(frozen=True)
class Config:
    """Immutable bundle of data, training and sweep settings for the MLP baseline."""

    # --- data ---
    data_path: str = "./complete_data.csv"
    # NOTE(review): every entry of target_columns also appears in this tuple,
    # and get_xy raises ValueError when its target is among the dropped columns,
    # so callers presumably remove the active target before passing this on.
    excluded_columns: tuple[str, ...] = (
        "0",
        "PPT ID",
        "Site",
        "BMD - Total",
        "Race",
        "% fat - Total",
        "Gender",
        "ALM",
    )
    target_columns: tuple[str, ...] = (
        "BMD - Total",
        "% fat - Total",
        "ALM",
    )

    # --- optimisation / cross-validation ---
    batch_size: int = 16
    lr: float = 0.01
    n_splits: int = 5
    n_runs: int = 10

    # --- epoch sweep and network size ---
    epoch_grid: tuple[int, ...] = (50, 100, 150, 200, 250, 300, 350, 400)
    hidden1: int = 10
    hidden2: int = 8
    # set >0 if you want multiprocessing dataloading
    num_workers: int = 0


# Module-level configuration instance with every field at its declared default.
CFG = Config()


# ----------------------------
# Reproducibility
# ----------------------------
def set_seed(seed: int) -> None:
    """Seed Python's ``random``, NumPy's global RNG and PyTorch (CPU + CUDA) with ``seed``."""
    # Order is kept fixed: stdlib, NumPy, torch CPU, then torch CUDA.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# ----------------------------
# Model
# ----------------------------
class MLP(nn.Module):
    """Fully connected network: two ReLU hidden layers followed by a 1-unit linear output."""

    def __init__(self, n_inputs: int, h1: int = 10, h2: int = 8):
        super().__init__()

        # Build Linear -> ReLU pairs for the hidden layers, then the output layer.
        # Layers are created strictly front-to-back so the RNG is consumed in the
        # same order and the Sequential indices (0, 2, 4 for the Linear layers)
        # stay stable for state_dict keys.
        widths = (n_inputs, h1, h2)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.append(nn.Linear(h2, 1))
        self.net = nn.Sequential(*stack)

        # Re-initialise only the weights of the linear layers with Xavier-uniform;
        # biases keep PyTorch's default initialisation.
        for layer in self.net:
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the layer stack to ``x`` and return its output."""
        out = self.net(x)
        return out


# ----------------------------
# Data prep (NO leakage)
# ----------------------------
def get_xy(df: pd.DataFrame, excluded_cols: list[str], target_col: str) -> tuple[np.ndarray, np.ndarray]:
    """
    Returns raw (unscaled) X, y arrays.
    Note: We do NOT scale here to avoid leakage; scaling happens per fold on train only.

    Args:
        df: Source table. It is not modified.
        excluded_cols: Column names to drop before building X. Names that are
            not present in ``df`` are ignored. Any iterable of names is accepted
            (list, tuple, ...); a single string is treated as one column name.
        target_col: Name of the column used as y. It must still be present
            after the excluded columns are dropped.

    Returns:
        ``(X, y)`` as float32 arrays. ``X`` holds every remaining column except
        ``target_col``; ``y`` holds the values of ``target_col``.

    Raises:
        ValueError: If ``target_col`` is missing after dropping ``excluded_cols``.
    """
    # pandas treats a tuple passed to ``drop`` as ONE (MultiIndex-style) label,
    # so with errors="ignore" a tuple of names would silently drop nothing.
    # Normalising to a list makes a tuple of names behave exactly like a list.
    if isinstance(excluded_cols, str):
        drop_names = [excluded_cols]
    else:
        drop_names = list(excluded_cols)

    # ``drop`` already returns a new frame, so no extra copy is needed.
    df_use = df.drop(columns=drop_names, errors="ignore")

    if target_col not in df_use.columns:
        raise ValueError(f"Target column '{target_col}' not found after dropping excluded columns.")

    y = df_use[target_col].to_numpy(dtype=np.float32)
    X = df_use.drop(columns=[target_col]).to_numpy(dtype=np.float32)
    return X, y


def make_loaders(
    X: np.ndarray,
    y: np.ndarray,
    train_idx: np.ndarray,
    test_idx: np.ndarray,
    batch_size: int,
    num_workers: int = 0,
) -> tuple[DataLoader, DataLoader, int, np.ndarray]:
    """
    Fit scaler on train fold ONLY, transform train/test, return DataLoaders and n_inputs.
    Also returns y_test (numpy) for normalization.

    Args:
        X: Feature matrix, indexed by row with ``train_idx`` / ``test_idx``.
        y: Target values, indexed the same way as ``X``.
        train_idx: Row indices of the training fold.
        test_idx: Row indices of the held-out fold.
        batch_size: Batch size used for both loaders.
        num_workers: Passed through to both ``DataLoader`` instances.

    Returns:
        ``(train_dl, test_dl, n_inputs, y_test)`` where ``train_dl`` shuffles and
        ``test_dl`` does not, ``n_inputs`` is the number of feature columns, and
        ``y_test`` is a 1-D float32 array of the held-out targets (always 1-D,
        including when the fold contains a single sample).
    """
    scaler = MinMaxScaler()
    # Min/max statistics come from the training rows only; the test rows are
    # transformed with those statistics.
    # Coerce to float32 so the tensors match the default dtype of the model
    # parameters; this is a no-op when the input is already float32.
    X_train = np.asarray(scaler.fit_transform(X[train_idx]), dtype=np.float32)
    X_test = np.asarray(scaler.transform(X[test_idx]), dtype=np.float32)

    # Targets become column vectors so they line up with the model's (N, 1) output.
    y_train = np.asarray(y[train_idx], dtype=np.float32).reshape(-1, 1)
    y_test = np.asarray(y[test_idx], dtype=np.float32).reshape(-1, 1)

    train_ds = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
    test_ds = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    n_inputs = X_train.shape[1]
    # reshape(-1) instead of squeeze(): squeeze() would collapse a one-sample
    # fold to a 0-d array, while reshape(-1) always yields shape (N,).
    return train_dl, test_dl, n_inputs, y_test.reshape(-1)


# ----------------------------
# Train / Eval
# ----------------------------
def train_model(model: nn.Module, train_dl: DataLoader, optimizer: optim.Optimizer, criterion: nn.Module, epochs: int) -> None:
    """
    Train ``model`` in place for ``epochs`` passes over ``train_dl``.

    The model is put into training mode first. Each batch triggers one
    gradient reset, forward pass, backward pass and optimizer step.
    """
    model.train()

    def _fit_batch(features: torch.Tensor, targets: torch.Tensor) -> None:
        # Gradients are cleared before every batch so they never accumulate.
        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        optimizer.step()

    for _epoch in range(epochs):
        for features, targets in train_dl:
            _fit_batch(features, targets)


@torch.no_grad()
def evaluate_model(model: nn.Module, test_dl: DataLoader) -> tuple[float, float, float, np.ndarray, np.ndarray]:
    """
    Run ``model`` over ``test_dl`` without gradient tracking and score it.

    The model is switched to eval mode and left in that mode.

    Returns:
        ``(mse, mae, max_err, y_pred, y_true)`` where the first three are the
        mean squared, mean absolute and maximum absolute errors, and the last
        two are the flattened predictions and targets concatenated over all
        batches in loader order.
    """
    model.eval()

    pred_chunks: list[np.ndarray] = []
    true_chunks: list[np.ndarray] = []
    for features, targets in test_dl:
        # Flatten each batch to 1-D before moving it to NumPy.
        pred_chunks.append(model(features).view(-1).cpu().numpy())
        true_chunks.append(targets.view(-1).cpu().numpy())

    y_pred = np.concatenate(pred_chunks)
    y_true = np.concatenate(true_chunks)

    return (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        max_error(y_true, y_pred),
        y_pred,
        y_true,
    )


def percent_rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the RMSE as a percentage of the root-mean-square of ``y_true``."""
    root_mse = np.sqrt(mean_squared_error(y_true, y_pred))
    true_rms = np.sqrt(np.mean(y_true**2))
    # The small epsilon keeps the division defined when y_true is all zeros.
    return 100.0 * root_mse / (true_rms + 1e-12)


def percent_mae(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the MAE as a percentage of the mean absolute value of ``y_true``."""
    abs_err = mean_absolute_error(y_true, y_pred)
    mean_abs_true = np.mean(np.abs(y_true))
    # The small epsilon keeps the division defined when y_true is all zeros.
    return 100.0 * abs_err / (mean_abs_true + 1e-12)


# ----------------------------
# Experiment per target
# ----------------------------
def run_experiment(df: pd.DataFrame, target_col: str, cfg: Config)
