In [None]:
# model_comparison_rmse.py
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# XGBoost (may require installation)
try:
    from xgboost import XGBRegressor
except Exception as e:
    raise ImportError("Please install xgboost first: pip install xgboost") from e

# =======================
# Parameters (modifiable)
# =======================
# --- Input ---
# Excel workbook to load; a relative path is resolved against the working directory.
DATA_FILE: str = "Data.xlsx"
# Name of the column used as the regression target.
TARGET_COL: str = "Hv"

# --- Cross-validation protocol ---
# Number of folds in each K-fold split.
N_SPLITS: int = 5
# How many times the whole K-fold evaluation is repeated.
N_REPEATS: int = 20
# Seed given to the models; repetition i shuffles its folds with this value + i.
RANDOM_STATE_BASE: int = 42

# --- Output ---
# Summary table (one row per model).
OUT_CSV: str = "Model_Comparison_RMSE.csv"
# Bar chart of the summary table.
OUT_PNG: str = "Model_Comparison_RMSE.png"

# =======================
# Read data
# =======================
# Fail early with an actionable message. The message is built from DATA_FILE
# so it stays correct when the configured path is changed.
if not os.path.exists(DATA_FILE):
    raise FileNotFoundError(
        f"Cannot find {DATA_FILE}. Please put {DATA_FILE} in the directory this "
        f"script is run from and ensure it contains a column named '{TARGET_COL}'."
    )

df = pd.read_excel(DATA_FILE)

# The target may sit in any column position; only its name matters.
if TARGET_COL not in df.columns:
    raise ValueError(
        f"Target column '{TARGET_COL}' not found in data. "
        f"Available columns: {list(df.columns)}"
    )

# Every column other than the target is used as a feature.
X = df.drop(columns=[TARGET_COL]).to_numpy()
y = df[TARGET_COL].to_numpy()

# Without this check a target-only workbook would fail later, inside the
# estimators, with a less specific message.
if X.shape[1] == 0:
    raise ValueError(
        f"{DATA_FILE} contains no feature columns besides the target '{TARGET_COL}'."
    )

# =======================
# Define model dictionary
# =======================
# Candidate regressors keyed by the short label used in the result tables and
# the plot. Every estimator constructed with a seed receives RANDOM_STATE_BASE.
models = {
    # --- Tree-based models ---
    "GDB": GradientBoostingRegressor(
        random_state=RANDOM_STATE_BASE,
    ),
    "RF": RandomForestRegressor(
        n_estimators=200,
        random_state=RANDOM_STATE_BASE,
        n_jobs=-1,
    ),
    "XGB": XGBRegressor(
        n_estimators=200,
        random_state=RANDOM_STATE_BASE,
        verbosity=0,
    ),
    "ADB": AdaBoostRegressor(
        random_state=RANDOM_STATE_BASE,
    ),
    "DT": DecisionTreeRegressor(
        random_state=RANDOM_STATE_BASE,
    ),
    # --- Support vector regression, one entry per kernel ---
    "SVM-lin": SVR(
        kernel="linear",
    ),
    "SVM-poly": SVR(
        kernel="poly",
        degree=3,
    ),
    "SVM-rbf": SVR(
        kernel="rbf",
    ),
    # --- Nearest neighbours ---
    "KNN": KNeighborsRegressor(
        n_neighbors=5,
        n_jobs=-1,
    ),
    # --- Neural network with a single hidden layer ---
    "MLP": MLPRegressor(
        hidden_layer_sizes=(100,),
        max_iter=1000,
        random_state=RANDOM_STATE_BASE,
    ),
}

def make_pipeline(model):
    """Wrap ``model`` in a scikit-learn ``Pipeline``.

    Tree-based regressors (random forest, gradient boosting, XGBoost,
    AdaBoost, decision tree) are wrapped as-is, without a scaling step.
    Every other estimator is preceded by a ``StandardScaler`` step.

    Args:
        model: Estimator instance to use as the final pipeline step.

    Returns:
        A ``Pipeline`` whose last step is named ``"model"``. For non-tree
        estimators it also has a leading step named ``"scaler"``.
    """
    tree_based = (
        RandomForestRegressor,
        GradientBoostingRegressor,
        XGBRegressor,
        AdaBoostRegressor,
        DecisionTreeRegressor,
    )
    steps = [("model", model)]
    if not isinstance(model, tree_based):
        # The scaler is part of the pipeline, so it is fitted together with
        # the model on whatever data the pipeline is fitted on.
        steps.insert(0, ("scaler", StandardScaler()))
    return Pipeline(steps)

# =======================
# Main loop: Repeat N_REPEATS times, each with a different random seed for KFold
# =======================
# results[label] collects one score per repetition: the mean of that
# repetition's per-fold RMSE values.
results = {name: [] for name in models}

for rep in range(N_REPEATS):
    # Offsetting the base seed gives every repetition its own fold shuffling.
    seed = RANDOM_STATE_BASE + rep
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)
    print(f"Repetition {rep+1}/{N_REPEATS}  (seed={seed})")

    for label, estimator in models.items():
        neg_mse = cross_val_score(
            make_pipeline(estimator),
            X,
            y,
            cv=kf,
            scoring="neg_mean_squared_error",
            n_jobs=-1,
        )
        # The scorer returns negated MSE per fold: flip the sign, take the
        # square root to get per-fold RMSE, then average over the folds.
        results[label].append(np.sqrt(-neg_mse).mean())

# =======================
# Summary: compute mean and std across 20 repetitions
# =======================
# One row per model: mean and sample standard deviation (ddof=1) of the
# repetition-level RMSEs, plus the raw values as a comma-separated string.
summary_rows = [
    {
        "model": label,
        "mean_RMSE": rep_scores.mean(),
        "std_RMSE": rep_scores.std(ddof=1),
        "all_repetition_RMSEs": ",".join(f"{v:.6f}" for v in rep_scores),
    }
    for label, rep_scores in (
        (key, np.array(raw)) for key, raw in results.items()
    )
]

# Lowest mean RMSE first.
summary_df = pd.DataFrame(summary_rows)
summary_df = summary_df.sort_values("mean_RMSE")
summary_df = summary_df.reset_index(drop=True)

# Write the summary table, including the RMSEs of all repetitions.
summary_df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
print(f"✅ Evaluation results saved as: {OUT_CSV}")

# =======================
# Plot: bar chart with error bars
# =======================
# Bar heights and error bars come straight from the summary table, so the
# bars appear in the table's order.
x_pos = np.arange(len(summary_df))
means = summary_df["mean_RMSE"].values
stds = summary_df["std_RMSE"].values

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(x_pos, means, yerr=stds, capsize=6)

# One tick per model, labelled with the model's short name.
ax.set_xticks(x_pos)
ax.set_xticklabels(summary_df["model"], rotation=45, ha="right")
ax.set_ylabel("RMSE")
ax.set_title(f"Model comparison (RMSE) — {N_SPLITS}-fold CV × {N_REPEATS} repeats")

fig.tight_layout()
# Save before showing the figure.
fig.savefig(OUT_PNG, dpi=300)
plt.show()
print(f"✅ Comparison plot saved as: {OUT_PNG}")

# =======================
# Save a more detailed result (each repetition as a column)
# =======================
# Build DataFrame: rows = models, columns = rep_1 ... rep_N, plus mean and std
# Wide layout: one row per model, columns rep_1 ... rep_N holding the
# repetition-level RMSEs, followed by their mean and sample std (ddof=1).
detailed = [
    {
        "model": label,
        **{f"rep_{idx}": score for idx, score in enumerate(rep_scores, 1)},
        "mean_RMSE": np.mean(rep_scores),
        "std_RMSE": np.std(rep_scores, ddof=1),
    }
    for label, rep_scores in results.items()
]

detailed_df = pd.DataFrame(detailed)
detailed_df = detailed_df.set_index("model")
detailed_df.to_excel("Model_Comparison_Detailed_RMSE.xlsx")
print("✅ Detailed results saved as: Model_Comparison_Detailed_RMSE.xlsx")

# =======================
# Print summary (concise)
# =======================
print("\n=== Summary (ordered by mean_RMSE) ===")
# Show only the headline columns; the per-repetition values are in the saved files.
summary_view = summary_df[["model", "mean_RMSE", "std_RMSE"]]
print(summary_view.to_string(index=False, float_format='%.6f'))