In [None]:
# feature_importance_gdb.py
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")

# ===============================
# Parameter Settings
# ===============================
# Excel workbook read with pandas.read_excel; path is relative to the
# current working directory.
DATA_FILE = "Data.xlsx"
# Column used as the regression target; every other column is a feature.
TARGET_COL = "Hv"
# Number of folds for each KFold cross-validation run.
N_SPLITS = 5
# Number of times the CV + permutation-importance evaluation is repeated.
N_REPEATS = 10
# Seed for the gradient-boosting model; repeat i uses RANDOM_BASE + i for
# its KFold shuffle and its permutation-importance shuffles.
RANDOM_BASE = 42

# Output files: per-feature importance table and the bar chart.
OUT_CSV = "Feature_Importance_GDB.csv"
OUT_PNG = "Feature_Importance_GDB.png"

# ===============================
# Step 1. Load Data
# ===============================
# Fail early with an explicit message instead of pandas' lower-level error
# when the workbook is missing.
if not os.path.exists(DATA_FILE):
    raise FileNotFoundError(f"❌ File not found: {DATA_FILE}")

df = pd.read_excel(DATA_FILE)
if TARGET_COL not in df.columns:
    raise ValueError(f"❌ Target column '{TARGET_COL}' not found in data.")

# Every column except the target is treated as a feature. X stays a
# DataFrame so the column names remain available; y is a plain NumPy array.
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL].values
feature_names = X.columns.tolist()

# ===============================
# Step 2. Initialize Model and Scaler
# ===============================
# Gradient-boosting regressor with a fixed seed, wrapped in a pipeline so
# that scaling is re-fitted on whatever data the pipeline is fitted on.
base_model = GradientBoostingRegressor(random_state=RANDOM_BASE)
model = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("gdb", base_model),
    ]
)

# ===============================
# Step 3. Multiple Repeats + Cross-Validation + Permutation Importance
# ===============================
# One column per repeat, one row per feature; each column is overwritten
# with that repeat's mean permutation importance.
rep_columns = [f"rep_{i+1}" for i in range(N_REPEATS)]
all_importances = pd.DataFrame(0.0, index=feature_names, columns=rep_columns)
all_rmse = []

# The pipeline has a fixed random_state and always sees the same (X, y), so
# the full-data fit is identical on every repeat. Fit it once here instead of
# refitting inside the loop. cross_val_score works on clones of the
# estimator, so it does not disturb this fitted pipeline.
# NOTE: permutation importance below is evaluated on the same data the model
# was fitted on, and the only thing that changes between repeats is the
# shuffling seed (CV split and permutations), not the model itself.
model.fit(X, y)

for rep in range(N_REPEATS):
    print(f"🔁 Evaluation {rep+1}/{N_REPEATS} in progress...")

    # A different seed per repeat gives a different fold assignment and a
    # different set of permutations.
    seed = RANDOM_BASE + rep
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)

    # scikit-learn returns negated MSE per fold; convert each fold to RMSE
    # and average the folds.
    scores = cross_val_score(model, X, y, cv=kf, scoring="neg_mean_squared_error", n_jobs=-1)
    all_rmse.append(np.sqrt(-scores).mean())

    # Mean importance over the 10 permutations of each feature.
    r = permutation_importance(model, X, y, scoring="neg_mean_squared_error", n_repeats=10, random_state=seed)
    all_importances[f"rep_{rep+1}"] = r.importances_mean

# ===============================
# Step 4. Compute Average Results
# ===============================
# Capture the per-repeat columns BEFORE adding any summary column. Computing
# the std on the frame after "mean_importance" has been appended would treat
# the mean as an extra observation and understate the spread.
rep_cols = list(all_importances.columns)
all_importances["mean_importance"] = all_importances[rep_cols].mean(axis=1)
all_importances["std_importance"] = all_importances[rep_cols].std(axis=1)
all_importances = all_importances.sort_values("mean_importance", ascending=False)

# Save results ("utf-8-sig" writes a BOM at the start of the file)
all_importances.to_csv(OUT_CSV, encoding="utf-8-sig")
print(f"✅ Feature importance results saved as: {OUT_CSV}")

# ===============================
# Step 5. Plot
# ===============================
# Horizontal bar chart of mean importance with the across-repeat std as
# error bars. The frame is already sorted descending, so inverting the y-axis
# puts the most important feature at the top.
plt.figure(figsize=(10, 6))
plt.barh(
    all_importances.index,
    all_importances["mean_importance"],
    xerr=all_importances["std_importance"],
    capsize=5,
)
plt.gca().invert_yaxis()
# Labels are derived from the settings so they stay correct if N_SPLITS or
# N_REPEATS are changed.
plt.xlabel(f"Feature Importance (average over {N_REPEATS} repeats)")
plt.ylabel("Features")
plt.title(f"Feature Importance (Gradient Boosting, {N_SPLITS}-fold CV × {N_REPEATS} repeats)")
plt.tight_layout()
# Save before show(): keep this order so the figure is written to disk
# before the interactive window takes over.
plt.savefig(OUT_PNG, dpi=300)
plt.show()
print(f"✅ Plot saved as: {OUT_PNG}")

# ===============================
# Step 6. Print Summary
# ===============================
# Ranked table of mean/std importance, followed by the cross-validated RMSE
# averaged over all repeats (± its standard deviation across repeats).
print("\n=== Average Feature Importance Ranking ===")
print(all_importances[["mean_importance", "std_importance"]].to_string(float_format="%.6f"))

print(f"\n📉 Average RMSE (mean over {N_REPEATS} repeats): {np.mean(all_rmse):.4f} ± {np.std(all_rmse):.4f}")