# In [None]:  (Jupyter cell marker left over from the notebook export)
# Feature Engineering
# ==============================================================================
# --- Configuration Section (modify all tunable parameters here) ---
# ==============================================================================
# Every constant below is a module-level global that the functions further down
# read directly; nothing is passed in as an argument.

# --- 1. File Path Configuration ---
INPUT_FILE = 'Original dataset-BQ.xlsx'  # Input Excel filename (a dummy file is generated by the script entry point when it is missing)
OUTPUT_DIR = 'feature_engineering_BQ_output' # Directory that receives every exported .xlsx result

# --- 2. Core Algorithm Parameters ---
RANDOM_STATE = 0  # Seed shared by the K-Fold splitter and the Random Forest models, for reproducibility.
PEARSON_CORR_THRESHOLD = 0.8  # Stage 1 threshold on |Pearson r| between two features; strictly greater values mark the pair as redundant.

# --- 3. Random Forest Model Hyperparameters ---
RF_N_ESTIMATORS = 300  # Number of trees in the forest. A higher value makes the model more stable but increases computation time.
RF_MIN_SAMPLES_LEAF = 5  # The minimum number of samples required to be at a leaf node.
RF_MIN_SAMPLES_SPLIT = 10  # The minimum number of samples required to split an internal node.
RF_MAX_FEATURES = 'sqrt'  # The number of features to consider when looking for the best split. 'sqrt' uses the square root of the total number of features.
RF_MAX_DEPTH = None  # The maximum depth of each tree; None leaves depth unbounded. NOTE(review): confirm the model constructors actually receive this value as max_depth.
RF_N_JOBS = -1  # The number of jobs to run in parallel. -1 means using all available CPU cores to speed up computation.

# --- 4. SHAP Value and Iterative Selection Parameters ---
PERFORMANCE_METRIC = 'r2'  # Metric for Stage 3. Options: 'mae' (Mean Absolute Error, lower is better) or 'r2' (R-squared, higher is better).
SHAP_COARSE_SELECTION_PERCENT = 0.8  # Fraction of features kept by Stage 2. 1.0 keeps 100%, 0.8 keeps the top 80% (count is truncated to an integer).
EARLY_STOPPING_PATIENCE = 5 # Number of consecutive Stage 3 iterations without improvement before the loop stops.

# --- 5. Cross-Validation Parameters ---
KFOLD_SPLITS = 10 # Number of folds (k) for K-Fold cross-validation. Common values are 5 or 10.

# --- 6. Dataset Splitting Parameters ---
# Slice rule for feature columns, written as a 'start:end' string with Python
# slice semantics. '1:-1' means from the second column through the
# second-to-last. NOTE(review): the parser reads only the start and end parts.
FEATURE_COLUMN_SLICE = '1:-1'
# Positional index of the target column (integer). -1 means the last column.
TARGET_COLUMN_INDEX = -1

# ==============================================================================
# --- Core Logic (Usually no modification is needed below this line) ---
# ==============================================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import shap
from tqdm import tqdm
import os
import time
from collections import defaultdict
import textwrap

def load_data_from_excel():
    """
    Load the Excel file named by INPUT_FILE and split it into features and target.

    Feature columns are chosen positionally with FEATURE_COLUMN_SLICE
    ('start:end' or 'start:end:step', Python slice semantics) and the target
    with TARGET_COLUMN_INDEX. Every selected column is coerced to numeric
    (unparseable cells become NaN) and rows containing any NaN are dropped.

    Returns:
        tuple: (X_cleaned, y_cleaned, df_original) where X_cleaned is the
        numeric feature DataFrame, y_cleaned the numeric target Series, and
        df_original the raw, un-coerced frame restricted to the surviving rows.

    Raises:
        SystemExit: with status 1 when the file is missing, the split
        parameters are invalid, the target column is also a feature column,
        or column extraction fails for any other reason. The explanatory
        message is printed before exiting.
    """
    print(f"Loading data from '{INPUT_FILE}'...")
    if not os.path.exists(INPUT_FILE):
        print(f"Error: File '{INPUT_FILE}' not found.")
        # SystemExit is raised directly: the site-provided exit() helper is
        # not guaranteed to exist and reports success (status 0).
        raise SystemExit(1)

    df = pd.read_excel(INPUT_FILE)
    df_original = df.copy()

    try:
        all_col_names = df.columns

        try:
            slice_parts = FEATURE_COLUMN_SLICE.split(':')
            if len(slice_parts) > 3:
                raise ValueError("expected 'start:end' or 'start:end:step'")
            # An empty component (as in ':-1' or '1:') means "open-ended".
            bounds = [int(part) if part.strip() else None for part in slice_parts]
            start = bounds[0]
            end = bounds[1] if len(bounds) > 1 else None
            step = bounds[2] if len(bounds) > 2 else None

            feature_col_names = all_col_names[slice(start, end, step)]
            target_col_name = all_col_names[TARGET_COLUMN_INDEX]
        except (ValueError, IndexError) as e:
            print(f"Error: Invalid dataset split parameters '{FEATURE_COLUMN_SLICE}' or '{TARGET_COLUMN_INDEX}'. Please check the configuration. Details: {e}")
            raise SystemExit(1)

        if len(feature_col_names) > 1:
            feature_range_str = f"from column '{feature_col_names[0]}' to '{feature_col_names[-1]}'"
        elif len(feature_col_names) == 1:
            feature_range_str = f"column '{feature_col_names[0]}'"
        else:
            feature_range_str = "no features"

        print(f"Extracting features ({feature_range_str}) and target '{target_col_name}'.")

        if target_col_name in feature_col_names:
            print(f"Critical Error: The target column '{target_col_name}' is also identified as a feature column. Please check the column order in your Excel file or the splitting parameters.")
            raise SystemExit(1)

        X = df[feature_col_names]
        Y = df[target_col_name]

    except Exception as e:
        # SystemExit is not an Exception subclass, so the deliberate exits
        # above pass straight through this handler.
        print(f"An unknown error occurred while extracting columns: {e}.")
        raise SystemExit(1)

    print(f"Extracted features X with shape: {X.shape}")
    print(f"Extracted target Y '{Y.name}' with shape: {Y.shape}")

    # Column-wise apply also works when the sheet has duplicate header names,
    # where label-based assignment (combined_df[col] = ...) would fail.
    combined_df = pd.concat([X, Y], axis=1).apply(pd.to_numeric, errors='coerce')

    original_rows = len(combined_df)
    combined_df_cleaned = combined_df.dropna()

    if len(combined_df_cleaned) < original_rows:
        print(f"Warning: Dropped {original_rows - len(combined_df_cleaned)} rows containing missing values.")
        # Keep the raw frame row-aligned with the cleaned numeric data.
        df_original = df_original.loc[combined_df_cleaned.index]

    X_cleaned = combined_df_cleaned.drop(columns=[Y.name])
    y_cleaned = combined_df_cleaned[Y.name]

    print(f"Data loading complete. Final shapes: X={X_cleaned.shape}, y={y_cleaned.shape}")
    return X_cleaned, y_cleaned, df_original

def step1_filter_high_correlated_features(X, y, threshold=None):
    """
    Stage 1: Filter out highly correlated features using the Pearson correlation coefficient.

    Features are visited in descending order of |Pearson r| with the target.
    Each visited feature is kept, and every not-yet-dropped feature whose
    absolute correlation with it is strictly greater than the threshold is
    dropped. A feature whose target correlation is undefined (e.g. a constant
    column) is ranked last, so it can never displace an informative feature.

    Args:
        X: DataFrame of numeric features.
        y: Target Series aligned with X.
        threshold: Correlation threshold; defaults to PEARSON_CORR_THRESHOLD
            when None.

    Returns:
        tuple: (X_filtered, to_drop_list) where X_filtered is X without the
        dropped columns (original column order preserved) and to_drop_list
        names the dropped features in the order they were eliminated.
    """
    if threshold is None:
        threshold = PEARSON_CORR_THRESHOLD

    print("\n--- Stage 1: Filtering Highly Correlated Features ---")

    if X.shape[1] <= 1:
        print("Insufficient number of features (<=1). Skipping correlation filtering.")
        return X.copy(), []
    if X.shape[0] <= 1:
        print("Insufficient number of samples (<=1). Skipping correlation filtering.")
        return X.copy(), []

    corr_matrix = X.corr(method='pearson').abs()

    # Zero-variance columns yield NaN here; silence the resulting floating
    # point warnings and map NaN to -1 so the sort key is always comparable.
    feature_y_corr = {}
    with np.errstate(invalid='ignore', divide='ignore'):
        for col in X.columns:
            relevance = abs(X[col].corr(y))
            feature_y_corr[col] = -1.0 if pd.isna(relevance) else relevance
    sorted_features = sorted(X.columns, key=lambda col: feature_y_corr[col], reverse=True)

    dropped_lookup = set()   # O(1) membership checks
    to_drop_list = []        # deterministic, in elimination order
    retained_to_dropped_map = defaultdict(list)

    for current_feature in sorted_features:
        if current_feature in dropped_lookup:
            continue
        for other_feature in sorted_features:
            if other_feature == current_feature or other_feature in dropped_lookup:
                continue
            # NaN correlations compare False, so such pairs are never dropped.
            if corr_matrix.loc[current_feature, other_feature] > threshold:
                dropped_lookup.add(other_feature)
                to_drop_list.append(other_feature)
                retained_to_dropped_map[current_feature].append(other_feature)

    if to_drop_list:
        print(f"Based on Pearson correlation (threshold > {threshold}), the following features were removed:")
        output_data = [
            {'retained_feature': kept_feat, 'dropped_features': ", ".join(sorted(dropped_list))}
            for kept_feat, dropped_list in retained_to_dropped_map.items()
            if dropped_list
        ]
        if output_data:
            max_kept_len = max(len(row['retained_feature']) for row in output_data)
            col1_header = 'Retained Feature'
            col1_width = max(max_kept_len, len(col1_header)) + 4
            col2_header = 'Dropped Features (due to high correlation)'
            terminal_width = 120
            # textwrap.wrap rejects non-positive widths; very long feature
            # names must not be able to push the second column below a floor.
            col2_width = max(terminal_width - col1_width, 20)
            print(f"\n{col1_header:<{col1_width}}{col2_header}")
            print(f"{'-' * (col1_width - 1)} {'-' * len(col2_header)}")
            for row in output_data:
                wrapped_lines = textwrap.wrap(row['dropped_features'], width=col2_width)
                first_line = wrapped_lines[0] if wrapped_lines else ''
                print(f"{row['retained_feature']:<{col1_width}}{first_line}")
                for continuation in wrapped_lines[1:]:
                    print(f"{'':<{col1_width}}{continuation}")
        X_filtered = X.drop(columns=to_drop_list)
    else:
        print("No feature pairs exceeded the Pearson correlation threshold. No features were removed.")
        X_filtered = X.copy()

    print(f"\nNumber of features after filtering: {X_filtered.shape[1]}")
    return X_filtered, to_drop_list


def step2_embed_shap_coarse_selection(X_filtered, y):
    """
    Stage 2: Coarse feature selection using an embedded method based on SHAP values.

    For each K-Fold training split a Random Forest (configured by the RF_*
    globals, including RF_MAX_DEPTH) is fitted on standardized features and
    the mean absolute SHAP value per feature is computed on that training
    split. Importances are averaged over the folds that succeeded and the top
    SHAP_COARSE_SELECTION_PERCENT fraction of features (at least one) is kept.

    Args:
        X_filtered: DataFrame of candidate features.
        y: Target Series; rows are selected positionally with the same fold
            indices as X_filtered.

    Returns:
        tuple: (X_shap_coarse, features_to_drop). When selection cannot run
        (no features, too few samples, or every fold failed) a copy of the
        input and an empty list are returned. features_to_drop follows the
        column order of X_filtered.
    """
    print(f"\n--- Stage 2: SHAP Coarse Feature Selection ({KFOLD_SPLITS}-Fold CV) ---")

    if X_filtered.shape[1] == 0:
        print("No features available for SHAP coarse selection.")
        return X_filtered.copy(), []
    if X_filtered.shape[0] < KFOLD_SPLITS:
        print(f"Number of samples ({X_filtered.shape[0]}) is less than k_folds ({KFOLD_SPLITS}). Cannot perform cross-validation.")
        return X_filtered.copy(), []

    kf = KFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    avg_abs_shap_values_per_fold = []

    print(f"Calculating SHAP values (using {KFOLD_SPLITS}-Fold Cross-Validation)...")

    train_size_per_fold = X_filtered.shape[0] * (KFOLD_SPLITS - 1) // KFOLD_SPLITS
    min_samples_for_rf = max(RF_MIN_SAMPLES_LEAF, RF_MIN_SAMPLES_SPLIT)
    if train_size_per_fold < min_samples_for_rf:
        print(f"Warning: Training samples per fold ({train_size_per_fold}) is less than the minimum required by Random Forest ({min_samples_for_rf}). Skipping SHAP selection.")
        return X_filtered.copy(), []

    # Only the training split is used: importances are measured on the data
    # each model was fitted on, so the validation indices are ignored.
    for train_idx, _ in tqdm(kf.split(X_filtered), total=KFOLD_SPLITS, desc="SHAP Coarse Selection"):
        X_fold_train, y_fold_train = X_filtered.iloc[train_idx], y.iloc[train_idx]
        X_fold_train_scaled = StandardScaler().fit_transform(X_fold_train)
        rf_model = RandomForestRegressor(
            n_estimators=RF_N_ESTIMATORS,
            max_depth=RF_MAX_DEPTH,  # previously configured but never applied
            max_features=RF_MAX_FEATURES,
            min_samples_leaf=RF_MIN_SAMPLES_LEAF,
            min_samples_split=RF_MIN_SAMPLES_SPLIT,
            random_state=RANDOM_STATE,
            n_jobs=RF_N_JOBS,
        )
        try:
            rf_model.fit(X_fold_train_scaled, y_fold_train)
            explainer = shap.TreeExplainer(rf_model)
            shap_values_fold = explainer.shap_values(X_fold_train_scaled)
            if shap_values_fold.ndim == 1:
                avg_abs_shap_values_per_fold.append(np.abs(shap_values_fold))
            else:
                avg_abs_shap_values_per_fold.append(np.abs(shap_values_fold).mean(axis=0))
        except Exception as e:
            # Best effort: one failing fold must not abort the whole stage.
            print(f"\nWarning: Model training or SHAP calculation failed during coarse selection: {e}")
            continue

    if not avg_abs_shap_values_per_fold:
        print("Warning: Failed to calculate any SHAP values. Skipping SHAP coarse selection.")
        return X_filtered.copy(), []

    feature_importances_shap = pd.Series(np.mean(avg_abs_shap_values_per_fold, axis=0), index=X_filtered.columns).sort_values(ascending=False)
    print("\nFeature Importances (based on mean absolute SHAP values from CV):")
    print(feature_importances_shap)

    # Truncate to an integer count, then clamp into [1, n_features].
    total_features = len(feature_importances_shap)
    num_features_to_keep = int(total_features * SHAP_COARSE_SELECTION_PERCENT)
    num_features_to_keep = max(1, min(total_features, num_features_to_keep))

    features_to_keep = feature_importances_shap.index[:num_features_to_keep].tolist()
    kept_lookup = set(features_to_keep)
    # Preserve input column order so the report is reproducible across runs.
    features_to_drop = [col for col in X_filtered.columns if col not in kept_lookup]

    if features_to_drop:
        print(f"\nBased on SHAP coarse selection (keeping top {SHAP_COARSE_SELECTION_PERCENT * 100:.1f}%), the following features were removed: {features_to_drop}")
        X_shap_coarse = X_filtered[features_to_keep]
    else:
        print("\nNo features were removed after SHAP coarse selection.")
        X_shap_coarse = X_filtered.copy()

    print(f"Number of features after SHAP coarse selection: {X_shap_coarse.shape[1]}")
    return X_shap_coarse, features_to_drop


def _step3_new_forest():
    """Build a RandomForestRegressor from the module-level RF_* settings."""
    return RandomForestRegressor(
        n_estimators=RF_N_ESTIMATORS,
        max_depth=RF_MAX_DEPTH,  # previously configured but never applied
        max_features=RF_MAX_FEATURES,
        min_samples_leaf=RF_MIN_SAMPLES_LEAF,
        min_samples_split=RF_MIN_SAMPLES_SPLIT,
        random_state=RANDOM_STATE,
        n_jobs=RF_N_JOBS,
    )


def _step3_cv_scores(X, y, features, kf, score_fn, desc, stage_label):
    """Return the per-fold validation scores of `features`; failed folds are reported and skipped."""
    fold_scores = []
    for train_idx, val_idx in tqdm(kf.split(X), total=KFOLD_SPLITS, desc=desc):
        X_fold_train, y_fold_train = X.iloc[train_idx][features], y.iloc[train_idx]
        X_fold_val, y_fold_val = X.iloc[val_idx][features], y.iloc[val_idx]
        # The scaler is fitted on the training split only, then reused for validation.
        scaler_fold = StandardScaler().fit(X_fold_train)
        X_fold_train_scaled = scaler_fold.transform(X_fold_train)
        X_fold_val_scaled = scaler_fold.transform(X_fold_val)
        rf_model = _step3_new_forest()
        try:
            rf_model.fit(X_fold_train_scaled, y_fold_train)
            fold_scores.append(score_fn(y_fold_val, rf_model.predict(X_fold_val_scaled)))
        except Exception as e:
            print(f"\nWarning: Model training or prediction failed during {stage_label}: {e}")
            continue
    return fold_scores


def _step3_cv_mean_abs_shap(X, y, features, kf):
    """Return one mean-|SHAP| vector per successful fold, computed on each training split."""
    per_fold_importance = []
    for train_idx, _ in kf.split(X):
        X_fold_train, y_fold_train = X.iloc[train_idx][features], y.iloc[train_idx]
        X_fold_train_scaled = StandardScaler().fit_transform(X_fold_train)
        rf_model = _step3_new_forest()
        try:
            rf_model.fit(X_fold_train_scaled, y_fold_train)
            shap_values_fold = shap.TreeExplainer(rf_model).shap_values(X_fold_train_scaled)
            if shap_values_fold.ndim == 1:
                per_fold_importance.append(np.abs(shap_values_fold))
            else:
                per_fold_importance.append(np.abs(shap_values_fold).mean(axis=0))
        except Exception as e:
            print(f"\nWarning: Model training or SHAP calculation failed during iterative selection: {e}")
            continue
    return per_fold_importance


def step3_wrapper_shap_iterative_selection(X_shap_coarse, y):
    """
    Stage 3: Iterative fine-grained feature selection using a wrapper method with an early stopping mechanism.

    Starting from all columns of X_shap_coarse, each iteration removes the
    feature with the smallest cross-validated mean absolute SHAP value and
    re-scores the remaining set with K-Fold cross-validation. The best-scoring
    set seen so far is remembered. Removal continues even after a
    non-improving step, until EARLY_STOPPING_PATIENCE consecutive
    non-improving iterations occur or a single feature remains.

    Args:
        X_shap_coarse: DataFrame of candidate features.
        y: Target Series; rows are selected positionally with the fold indices.

    Returns:
        tuple: (best_features, performance_history). performance_history is a
        list of dicts with keys 'num_features' and PERFORMANCE_METRIC, one per
        evaluated feature set. When selection cannot run, all input columns
        and an empty history are returned.

    Raises:
        ValueError: if PERFORMANCE_METRIC is neither 'r2' nor 'mae'.
    """
    print(f"\n--- Stage 3: Iterative Feature Fine-Tuning ({KFOLD_SPLITS}-Fold CV) ---")

    if X_shap_coarse.shape[1] <= 1:
        print("Insufficient number of features (<=1). Skipping iterative selection.")
        return list(X_shap_coarse.columns), []
    if X_shap_coarse.shape[0] < KFOLD_SPLITS:
        print(f"Number of samples ({X_shap_coarse.shape[0]}) is less than k_folds ({KFOLD_SPLITS}). Cannot perform cross-validation.")
        return list(X_shap_coarse.columns), []

    if PERFORMANCE_METRIC == 'r2':
        score_fn = r2_score
        is_better = lambda current, best: current > best
    elif PERFORMANCE_METRIC == 'mae':
        score_fn = mean_absolute_error
        is_better = lambda current, best: current < best
    else:
        raise ValueError("Unsupported performance metric. Please choose 'r2' or 'mae'.")

    train_size_per_fold = X_shap_coarse.shape[0] * (KFOLD_SPLITS - 1) // KFOLD_SPLITS
    min_samples_for_rf = max(RF_MIN_SAMPLES_LEAF, RF_MIN_SAMPLES_SPLIT)
    if train_size_per_fold < min_samples_for_rf:
        print(f"Warning: Training samples per fold ({train_size_per_fold}) is less than the minimum required by Random Forest ({min_samples_for_rf}). Skipping iterative selection.")
        return list(X_shap_coarse.columns), []

    # A fixed integer random_state makes every kf.split() call yield the same
    # folds, so scores of successive feature sets are directly comparable.
    kf = KFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    current_features = list(X_shap_coarse.columns)
    best_features = list(current_features)
    performance_history = []

    print("Calculating baseline performance for the initial feature set...")
    initial_fold_scores = _step3_cv_scores(
        X_shap_coarse, y, current_features, kf, score_fn,
        desc="Baseline Performance", stage_label="baseline calculation",
    )
    if not initial_fold_scores:
        print("Warning: Failed to produce any valid scores for the initial feature set. Skipping iterative selection.")
        return list(X_shap_coarse.columns), []

    best_score = np.mean(initial_fold_scores)
    performance_history.append({'num_features': len(current_features), PERFORMANCE_METRIC: best_score})
    print(f"  Initial feature set ({len(current_features)} features), Average {PERFORMANCE_METRIC.upper()}: {best_score:.4f}")

    print("Starting iterative feature removal...")

    non_improvement_streak = 0

    while len(current_features) > 1:
        shap_per_fold = _step3_cv_mean_abs_shap(X_shap_coarse, y, current_features, kf)
        if not shap_per_fold:
            print(f"  Warning: Iteration failed, could not calculate SHAP values. Stopping.")
            break

        avg_iteration_shap = np.mean(shap_per_fold, axis=0)
        feature_importances_current = pd.Series(avg_iteration_shap, index=current_features).sort_values(ascending=True)
        feature_to_drop = feature_importances_current.index[0]
        current_features.remove(feature_to_drop)
        print(f"  Attempting to remove least important feature: {feature_to_drop}")

        fold_scores_after_drop = _step3_cv_scores(
            X_shap_coarse, y, current_features, kf, score_fn,
            desc=f"Evaluating performance", stage_label="feature evaluation",
        )
        if not fold_scores_after_drop:
            print(f"  Warning: Could not calculate valid scores after removing feature. Stopping.")
            break

        current_avg_score = np.mean(fold_scores_after_drop)
        performance_history.append({'num_features': len(current_features), PERFORMANCE_METRIC: current_avg_score})
        print(f"  Features: {len(current_features)}, Average {PERFORMANCE_METRIC.upper()}: {current_avg_score:.4f}")

        if is_better(current_avg_score, best_score):
            best_score, best_features = current_avg_score, list(current_features)
            print(f"    -> Performance improved. Resetting patience counter. Best set size: {len(best_features)}")
            non_improvement_streak = 0
        else:
            non_improvement_streak += 1
            print(f"    -> No improvement (Streak: {non_improvement_streak}/{EARLY_STOPPING_PATIENCE})")
            if non_improvement_streak >= EARLY_STOPPING_PATIENCE:
                print(f"    -> Early stopping triggered after {EARLY_STOPPING_PATIENCE} iterations with no improvement.")
                break

    print(f"\nIterative selection complete. Best feature set ({len(best_features)} features): {best_features}")
    return best_features, performance_history


if __name__ == "__main__":
    # Script entry point: run the three selection stages in order and export
    # correlation matrices, a stage summary, the Stage 3 score history and the
    # reduced dataset as .xlsx files under OUTPUT_DIR.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # One timestamp per run, so every exported file of a run shares a suffix
    # even when the run crosses a second boundary between exports.
    run_stamp = time.strftime("%Y%m%d_%H%M%S")

    def _output_path(stem):
        """Return the timestamped .xlsx path for an output file stem."""
        return os.path.join(OUTPUT_DIR, f'{stem}_{run_stamp}.xlsx')

    if not os.path.exists(INPUT_FILE):
        print(f"'{INPUT_FILE}' not found. Creating a dummy dataset for demonstration...")
        num_samples, feature_cols = 70, 162
        # Seeded generator: the demonstration data is reproducible like the
        # rest of the pipeline and the global NumPy RNG state is untouched.
        rng = np.random.default_rng(RANDOM_STATE)
        all_feature_names = [f'F{i+1}' for i in range(feature_cols)]
        df_dummy = pd.DataFrame(rng.random((num_samples, feature_cols)), columns=all_feature_names)
        df_dummy.insert(0, 'SampleID', [f'Sample_{i+1}' for i in range(num_samples)])
        # Target depends on F1 and F2 only, plus Gaussian noise.
        df_dummy['Target'] = 5 * df_dummy['F1'] + 3 * df_dummy['F2']**2 + rng.standard_normal(num_samples) * 0.5
        df_dummy.to_excel(INPUT_FILE, index=False)
        print(f"Dummy data saved to '{INPUT_FILE}'.")

    X_full, y_full, df_original = load_data_from_excel()
    if not X_full.empty:
        original_corr_path = _output_path('original_feature_correlation_matrix')
        X_full.corr(method='pearson').to_excel(original_corr_path, index=True)
        print(f"Saved: Original feature correlation matrix -> {original_corr_path}")

    X_filtered, dropped_pearson_list = step1_filter_high_correlated_features(X_full, y_full)
    if not X_filtered.empty and X_filtered.shape[1] > 1:
        filtered_corr_path = _output_path('stage1_filtered_correlation_matrix')
        X_filtered.corr(method='pearson').to_excel(filtered_corr_path, index=True)
        print(f"Saved: Stage 1 filtered correlation matrix -> {filtered_corr_path}")

    X_shap_coarse, dropped_shap = step2_embed_shap_coarse_selection(X_filtered, y_full)
    final_features, performance_history = step3_wrapper_shap_iterative_selection(X_shap_coarse, y_full)

    # Sort the final features according to their original column order
    if final_features:
        print("\n--- Reordering final features based on original column order ---")
        selected_lookup = set(final_features)
        final_features = [col for col in X_full.columns if col in selected_lookup]
        print(f"Successfully reordered the final features.")

    print(f"\n--- Exporting final results to the '{OUTPUT_DIR}' directory ---")
    summary_df = pd.DataFrame({
        'Stage': ['Initial Features', 'Stage 1: Pearson Correlation Filter', 'Stage 2: SHAP Coarse Selection', 'Stage 3: SHAP Iterative Selection'],
        'Number of Features': [X_full.shape[1], X_filtered.shape[1], X_shap_coarse.shape[1], len(final_features)],
        'Features Removed': [0, len(dropped_pearson_list), len(dropped_shap), X_shap_coarse.shape[1] - len(final_features)]
    })
    summary_path = _output_path('feature_engineering_summary')
    summary_df.to_excel(summary_path, index=False)
    print(f"Saved: Feature engineering summary -> {summary_path}")

    history_path = _output_path('performance_iteration_history')
    pd.DataFrame(performance_history).to_excel(history_path, index=False)
    print(f"Saved: Performance iteration history -> {history_path}")

    if len(final_features) > 1:
        # Correlate the cleaned numeric features: the raw frame can still hold
        # object-dtype columns (a column with a stray text cell in a dropped
        # row), which DataFrame.corr cannot process reliably.
        final_corr_matrix = X_full[final_features].corr(method='pearson')
        final_corr_path = _output_path('final_features_correlation_matrix')
        final_corr_matrix.to_excel(final_corr_path, index=True)
        print(f"Saved: Final features correlation matrix -> {final_corr_path}")

    if final_features:
        cols_to_export = []
        # Assume the first column is a sample ID
        first_col = df_original.columns[0] if not df_original.empty else None
        if first_col is not None and first_col not in final_features and first_col != y_full.name:
            cols_to_export.append(first_col)

        cols_to_export.extend(final_features)  # already in original column order
        if y_full.name in df_original.columns:
            cols_to_export.append(y_full.name)

        # Remove duplicates (order-preserving) in case the ID column was also selected as a feature
        unique_cols_to_export = list(dict.fromkeys(cols_to_export))

        dataset_path = _output_path('final_selected_dataset')
        df_original[unique_cols_to_export].to_excel(dataset_path, index=False)
        print(f"Saved: Final selected dataset -> {dataset_path}")
    else:
        print("No features were selected in the final set. The final dataset was not saved.")

    print("\nAll processing is complete.")