In [None]:
# feature_selection_by_importance_pcc.py
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

# ============================
# Parameter Settings
# ============================
# --- Input files ---
# Spreadsheet holding the original dataset.
DATA_FILE = "Data.xlsx"
# CSV holding the feature importance scores.
IMPORTANCE_FILE = "Feature_Importance_GDB.csv"

# --- Selection settings ---
# Name of the target column inside the dataset.
TARGET_COL = "Hv"
# Absolute Pearson correlation at or above which two features are treated
# as highly correlated.
PCC_THRESHOLD = 0.95

# --- Output files ---
# Excel file that receives the PCC matrix of the selected features.
OUT_SELECTED_EXCEL = "Selected_Features_PCC.xlsx"
# Image file that receives the heatmap of that matrix.
OUT_PLOT = "Selected_Features_PCC.png"

# ============================
# Step 1. Load Data and Importance File
# ============================
# Fail fast, before any parsing, if either input file is missing.
# The data file is checked first, then the importance file.
for required_path, missing_message in (
    (DATA_FILE, f"❌ Data file not found: {DATA_FILE}"),
    (IMPORTANCE_FILE, f"❌ Feature importance file not found: {IMPORTANCE_FILE}"),
):
    if not os.path.exists(required_path):
        raise FileNotFoundError(missing_message)

# Read the dataset and make sure the target column is present.
df = pd.read_excel(DATA_FILE)
target_present = TARGET_COL in df.columns
if not target_present:
    raise ValueError(f"❌ Target column '{TARGET_COL}' not found in data.")

# Every column other than the target is treated as a candidate feature.
features = df.drop(TARGET_COL, axis=1)

# Load the feature importance table and normalise it to exactly two columns,
# "feature" and "importance", ordered from most to least important.
imp_df = pd.read_csv(IMPORTANCE_FILE)
if "mean_importance" in imp_df.columns:
    # Select the name column by label so that extra columns are dropped and
    # the name column always ends up first, ahead of "mean_importance".
    if "Unnamed: 0" in imp_df.columns:
        imp_df = imp_df[["Unnamed: 0", "mean_importance"]]
    elif "model" in imp_df.columns:
        imp_df = imp_df[["model", "mean_importance"]]
    elif imp_df.shape[1] == 2:
        # Unrecognised name column: it is the one that is not
        # "mean_importance". Reordering here matters because the rename
        # below is positional; a file listing "mean_importance" first would
        # otherwise have its two columns mislabelled.
        name_col = next(col for col in imp_df.columns if col != "mean_importance")
        imp_df = imp_df[[name_col, "mean_importance"]]
# Validate column count
if imp_df.shape[1] != 2:
    raise ValueError("❌ The feature importance file should contain two columns: feature name and mean_importance.")
imp_df.columns = ["feature", "importance"]
# A stable sort keeps features with equal importance in their file order,
# so the order in which tied features are visited by the selection step does
# not depend on the sort implementation.
imp_df = imp_df.sort_values(
    "importance", ascending=False, kind="mergesort"
).reset_index(drop=True)

# ============================
# Step 2. Compute PCC Between Features
# ============================
# Pearson correlation is only defined for numeric data, so the matrix is
# built from the numeric (and boolean) columns alone. Text columns would make
# corr() raise on pandas >= 2.0; older versions leave them out of the matrix,
# which would turn the per-feature lookup in the loop below into a KeyError.
numeric_features = features.select_dtypes(include=["number", "bool"])
skipped_columns = [
    col for col in features.columns if col not in numeric_features.columns
]
if skipped_columns:
    print(f"⚠️ Non-numeric columns ignored for PCC: {skipped_columns}")
corr_matrix = numeric_features.corr(method="pearson").abs()

# ============================
# Step 3. Select Features by Importance: Remove Highly Correlated Features
# ============================
# Greedy pass in the order given by imp_df: a feature is kept unless an
# earlier kept feature already flagged it as highly correlated, and every
# feature whose absolute PCC with a kept feature reaches PCC_THRESHOLD is
# flagged for removal.
selected_features = []
removed_features = set()
seen_features = set()

for feat in imp_df["feature"]:
    if feat in seen_features:
        continue  # Duplicate row in the importance file; handle each name once
    seen_features.add(feat)
    if feat not in corr_matrix.columns:
        continue  # Not in the dataset, or not a column PCC can be computed for
    if feat in removed_features:
        continue
    # Keep this feature
    selected_features.append(feat)
    # Flag everything highly correlated with it. NaN correlations (for
    # example from a constant column) compare as False and are not flagged.
    feat_corr = corr_matrix[feat]
    correlated = feat_corr.index[feat_corr >= PCC_THRESHOLD]
    removed_features.update(name for name in correlated if name != feat)

print(f"✅ Initial number of features: {features.shape[1]}")
print(f"✅ Number of highly correlated features removed: {len(removed_features)}")
print(f"✅ Final number of selected features: {len(selected_features)}")
print(f"✅ Selected features: {selected_features}")

# Stop here with a clear message rather than carrying an empty selection into
# the correlation-matrix and heatmap steps.
if not selected_features:
    raise ValueError(
        "❌ No feature from the importance file matches a numeric column in the data."
    )

# ============================
# Step 4. PCC Matrix of Selected Features
# ============================
# Recompute the signed Pearson matrix on the retained columns only.
retained = features.loc[:, selected_features]
filtered_corr = retained.corr(method="pearson")

# Write the matrix to Excel and report where it went.
filtered_corr.to_excel(OUT_SELECTED_EXCEL)
print(f"✅ Filtered PCC matrix saved as: {OUT_SELECTED_EXCEL}")

# ============================
# Step 5. Plot Heatmap of Selected Features
# ============================
# Draw the correlation heatmap on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 8))
heatmap_options = {
    "cmap": "coolwarm",
    "annot": False,
    "square": True,
    "cbar_kws": {"label": "Pearson Correlation Coefficient"},
    "linewidths": 0.5,
}
sns.heatmap(filtered_corr, ax=ax, **heatmap_options)
ax.set_title("PCC Heatmap of Selected Features", fontsize=14)
fig.tight_layout()

# Write the image to disk first, then display it and report the path.
fig.savefig(OUT_PLOT, dpi=300)
plt.show()
print(f"✅ Filtered heatmap saved as: {OUT_PLOT}")