In [None]:
# rfe_forward_gdb.py
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score

import warnings
# Silence every warning category for the rest of the run (no category or
# module filter is given), so deprecation and convergence notices from the
# libraries used below will not be shown.
warnings.filterwarnings("ignore")

# ============================
# Parameters (Adjustable)
# ============================
# Input files. These are relative paths, so they are resolved against the
# current working directory; all three must exist or the file check aborts.
DATA_FILE = "Data.xlsx"                           # Original data table; must contain the TARGET_COL column
PCC_SELECTED_FILE = "Selected_Features_PCC.xlsx"  # PCC-filtered feature matrix; its column labels define the candidate features
IMPORTANCE_FILE = "Feature_Importance_GDB.csv"    # Feature importance CSV (feature-name column + importance column) used to rank features
TARGET_COL = "Hv"                                 # Name of the regression target column in DATA_FILE

# Cross-validation settings.
N_SPLITS = 5      # Number of KFold splits per repetition
N_REPEATS = 3     # Number of repetitions; repetition i (0-based) uses seed RANDOM_BASE + i
RANDOM_BASE = 42  # Base seed for both the KFold shuffling and the model

# Output files (written to the current working directory).
OUT_CSV = "RFE_Forward_Results.csv"       # Summary: mean & std of RMSE across repetitions, one row per feature count
OUT_DETAILED = "RFE_Forward_Detailed.csv" # Detailed: RMSE of each repetition for each feature count k
OUT_PNG = "RFE_Forward_Plot.png"          # Plot save path

# ============================
# File Checks
# ============================
# Fail fast: abort on the first required input (checked in the order listed)
# that does not exist, before any expensive loading or training starts.
required_inputs = (DATA_FILE, PCC_SELECTED_FILE, IMPORTANCE_FILE)
first_missing = next(
    (path for path in required_inputs if not os.path.exists(path)),
    None,
)
if first_missing is not None:
    raise FileNotFoundError(f"File not found: {first_missing}. Please ensure all files are in the same directory as the script.")

# ============================
# Load Data
# ============================
# Read the full data table and split it into the feature frame (X_all) and
# the target vector (y, a NumPy array).
df = pd.read_excel(DATA_FILE)
if TARGET_COL not in df.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found in data.")

X_all = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL].values

# Load PCC-filtered matrix and take its columns as remaining features.
# The first spreadsheet column is consumed as the index, so it is never a feature.
pcc_df = pd.read_excel(PCC_SELECTED_FILE, index_col=0)

# The target must never be used as a predictor. It is also absent from X_all,
# so keeping it here would only surface later as a KeyError inside the CV loop.
remaining_features = [col for col in pcc_df.columns if col != TARGET_COL]
if len(remaining_features) != len(pcc_df.columns):
    print(f"Note: target column '{TARGET_COL}' found in {PCC_SELECTED_FILE}; excluded from the feature list.")

if not remaining_features:
    raise ValueError("No remaining feature columns found in the PCC-filtered file. Please check Selected_Features_PCC.xlsx format.")

# Validate up front that every candidate feature exists in the data table,
# instead of failing part-way through the (slow) cross-validation loop.
missing_features = [col for col in remaining_features if col not in X_all.columns]
if missing_features:
    raise ValueError(
        f"{len(missing_features)} PCC-filtered feature(s) are not columns of {DATA_FILE}: "
        f"{missing_features}. Please ensure consistent naming."
    )

# Load feature importance file and parse columns
imp_df = pd.read_csv(IMPORTANCE_FILE, encoding="utf-8-sig")

# Header names (compared case-insensitively) recognised for each role.
# The two alias sets are disjoint, so a column can never match both roles by name.
_feature_aliases = ("feature", "feature_name", "index", "unnamed: 0", "0")
_importance_aliases = ("mean_importance", "importance", "mean_score", "score")

imp_columns = list(imp_df.columns)
possible_feature_cols = [c for c in imp_columns if c.lower() in _feature_aliases]
possible_imp_cols = [c for c in imp_columns if c.lower() in _importance_aliases]

# Resolve by name first; positional fallbacks are applied afterwards so they
# can avoid a column that has already been claimed for the other role.
feature_col = possible_feature_cols[0] if possible_feature_cols else None
imp_col = possible_imp_cols[0] if possible_imp_cols else None

if feature_col is None:
    # fallback: use the first column that is not the importance column
    # (this is the first column whenever there is no clash)
    feature_col = next((c for c in imp_columns if c != imp_col), None)
    if feature_col is None:
        raise ValueError("Failed to detect feature column. Please check Feature_Importance_GDB.csv format.")

if imp_col is None:
    # fallback: prefer the second column; if that one is the feature column,
    # take the next free column (wrapping round to the first column last)
    positional_order = imp_columns[1:] + imp_columns[:1]
    imp_col = next((c for c in positional_order if c != feature_col), None)
    if imp_col is None:
        raise ValueError("Failed to detect importance column. Please check Feature_Importance_GDB.csv format.")

# Normalise to a two-column frame ("feature", "importance"), with feature
# names as strings, ordered from most to least important.
imp_df = imp_df[[feature_col, imp_col]].rename(columns={feature_col: "feature", imp_col: "importance"})
imp_df["feature"] = imp_df["feature"].astype(str)
imp_df = imp_df.sort_values("importance", ascending=False).reset_index(drop=True)

# Keep only features that are in remaining_features, sorted by importance.
# dict.fromkeys drops repeated names while keeping the first (highest-ranked)
# occurrence: a feature listed twice in the importance file would otherwise
# appear twice in the ranking, duplicating its column in the design matrix
# and inflating n_features.
remaining_lookup = set(remaining_features)  # O(1) membership tests
ranked_features = list(dict.fromkeys(
    name for name in imp_df["feature"].tolist() if name in remaining_lookup
))
if not ranked_features:
    raise ValueError("No features in the importance file match those remaining after PCC filtering. Please ensure consistent naming.")

print(f"Detected {len(remaining_features)} PCC-filtered features, "
      f"matched {len(ranked_features)} features in importance file (used for ranking).")

# Append unmatched remaining_features at the end (preserve original order).
# Names already ranked keep their position; only new names are added.
ranked_features = list(dict.fromkeys(ranked_features + list(remaining_features)))

n_features = len(ranked_features)

# ============================
# Main Process: Forward Feature Addition
# For every repetition, score the models built on the top-1, top-2, ...,
# top-n_features ranked features and store one column of RMSE values.
# ============================
def _mean_fold_rmse(top_n, splitter, seed):
    """Return the mean per-fold RMSE of a GDB model trained on the top_n ranked features."""
    subset = X_all[ranked_features[:top_n]]
    design = np.array(subset).reshape(len(subset), -1)  # force a 2-D (samples, features) array

    estimator = GradientBoostingRegressor(n_estimators=200, random_state=seed)
    neg_mse_per_fold = cross_val_score(
        estimator,
        design,
        y,
        cv=splitter,
        scoring="neg_mean_squared_error",
        n_jobs=-1,
    )
    # The scorer returns negated MSE; flip the sign, take the root per fold, then average.
    return np.sqrt(-neg_mse_per_fold).mean()


subset_sizes = range(1, n_features + 1)
detailed = pd.DataFrame(index=[f"top_{size}" for size in subset_sizes])

for rep_idx in range(N_REPEATS):
    # Each repetition gets its own seed, shared by the fold shuffling and the model.
    rep_seed = RANDOM_BASE + rep_idx
    fold_splitter = KFold(n_splits=N_SPLITS, shuffle=True, random_state=rep_seed)
    print(f"Repetition {rep_idx + 1}/{N_REPEATS} (seed={rep_seed})")

    detailed[f"rep_{rep_idx + 1}"] = [
        _mean_fold_rmse(size, fold_splitter, rep_seed) for size in subset_sizes
    ]

# Replace index with numeric feature counts
detailed.index = list(range(1, n_features + 1))

# Compute mean and standard deviation across repetitions.
# Both statistics are taken over the per-repetition columns only. Computing
# the std on the whole frame after "mean_RMSE" has been added would treat the
# mean as an extra sample and understate the spread (with ddof=1, by a factor
# of sqrt((n - 1) / n) for n repetitions).
# Note: with a single repetition the sample std (ddof=1) is undefined and is
# reported as NaN.
rep_columns = [col for col in detailed.columns if str(col).startswith("rep_")]
rep_scores_frame = detailed[rep_columns]
detailed["mean_RMSE"] = rep_scores_frame.mean(axis=1)
detailed["std_RMSE"] = rep_scores_frame.std(axis=1, ddof=1)

# Save detailed results (RMSE of each repetition plus the two statistics)
detailed.to_csv(OUT_DETAILED, index_label="num_features")
print(f"✅ Detailed results saved to: {OUT_DETAILED}")

# Save summary results (mean & std only); the unnamed index becomes the
# "num_features" column.
summary = detailed[["mean_RMSE", "std_RMSE"]].reset_index().rename(columns={"index": "num_features"})
summary.to_csv(OUT_CSV, index=False)
print(f"✅ Summary results saved to: {OUT_CSV}")

# ============================
# Plot: Mean RMSE with Error Band
# ============================
# Dedicated names are used for the plotted arrays so the module-level target
# vector `y` is not overwritten by the RMSE curve.
plt.figure(figsize=(10, 6))
feature_counts = summary["num_features"].to_numpy()
mean_rmse = summary["mean_RMSE"].to_numpy()
rmse_std = summary["std_RMSE"].to_numpy()

plt.plot(feature_counts, mean_rmse, marker='o', linestyle='-', linewidth=2)
# Shaded band: mean ± one standard deviation across repetitions.
plt.fill_between(feature_counts, mean_rmse - rmse_std, mean_rmse + rmse_std, alpha=0.25)
plt.xticks(feature_counts)
plt.xlabel("Number of top features used")
# Take the fold count from the parameter so the label stays correct when N_SPLITS changes.
plt.ylabel(f"RMSE ({N_SPLITS}-fold CV mean)")
plt.title(f"Forward Feature Addition (GDB) — {N_SPLITS}-fold CV × {N_REPEATS} repeats")
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
# Save before show(): the figure is written to disk before any window is displayed.
plt.savefig(OUT_PNG, dpi=300)
plt.show()
print(f"✅ Plot saved as: {OUT_PNG}")

# ============================
# Additional Output: Save Feature Ranking
# ============================
# One row per feature: its 1-based position in the evaluation order, then its name.
ranking_rows = list(zip(range(1, n_features + 1), ranked_features))
feat_order_df = pd.DataFrame(ranking_rows, columns=["rank", "feature"])
feat_order_df.to_csv(
    "Selected_Feature_Ranking.csv",
    index=False,
    encoding="utf-8-sig",
)
print("✅ Feature ranking saved as: Selected_Feature_Ranking.csv")

# ============================
# Completion Summary
# ============================
# Final console report: the feature count and where each artefact was written.
completion_report = (
    "\n=== Completed ===",
    f"Total features retained: {n_features}",
    f"Detailed results: {OUT_DETAILED}",
    f"Summary results: {OUT_CSV}",
    f"Plot file: {OUT_PNG}",
    "If you’d like the plot to show each repetition as a separate line (to visualize variance), "
    "or label RMSE values directly with feature names, let me know.",
)
for report_line in completion_report:
    print(report_line)