Hyperparameter tuning

In [None]:
# ==============================================================================
# --- GLOBAL CONFIGURATION & PARAMETERS ---
# ==============================================================================
# Every user-adjustable setting of the hyperparameter-tuning cell lives here.

# --- 1. Data Source and Feature Engineering ---
# Excel workbook the dataset is read from. To run against the alternative
# workbook, swap the active assignment with the commented-out one.
EXCEL_FILE_PATH: str = 'Final_engineered_dataset-FGA-Eb.xlsx'
# EXCEL_FILE_PATH = 'Final_engineered_dataset-FGA-NVOA.xlsx'

# Positional (iloc-style) column selectors for the features (X) and target (Y).
# A slice is half-open: `start` is included, `stop` is excluded.
X_COLS_SLICE: slice = slice(1, -1)  # Column 1 up to, but not including, the last column.
Y_COLS_SLICE: int = -1              # The last column.

# --- 2. Cross-Validation Strategy ---
CV_N_SPLITS: int = 10         # K in K-Fold cross-validation.
CV_SHUFFLE: bool = True       # Shuffle the rows before they are split into folds.
CV_RANDOM_STATE: int = 100    # Seed that makes the shuffled folds reproducible.

# --- 3. Bayesian Optimization Configuration ---
N_ITER_BAYESIAN: int = 30     # How many parameter settings each search samples.

# --- 4. Global Model Initialization ---
DEFAULT_MODEL_RANDOM_STATE: int = 0  # Seed passed to every estimator and to the search itself.

# --- 5. Active Model Selection ---
# IDs of the estimators that take part in this run.
# Leave a line active to include that model; comment it out to exclude it.
ENABLED_MODELS = [
    'XGBR',   # XGBoost Regressor
    'RF',     # Random Forest Regressor
    'GBRT',   # Gradient Boosting Regressor
    'HGBR',   # Histogram-based Gradient Boosting Regressor
    'ETR',    # Extra Trees Regressor
    'CBR',    # CatBoost Regressor
    'LGBM',   # LightGBM Regressor
]


# ==============================================================================
# --- HYPERPARAMETER SEARCH SPACES ---
# ==============================================================================
# One search-space dict per estimator ID. Keys are estimator constructor
# arguments; values are skopt dimensions sampled by the Bayesian search.
from skopt.space import Real, Integer, Categorical

# XGBoost Regressor
parameter_XGBR = {
    'n_estimators': Integer(10, 500),
    'learning_rate': Real(0.01, 0.5, prior='log-uniform'),
    'max_depth': Integer(1, 10),
    'subsample': Real(0.5, 0.9, prior='uniform'),
    'colsample_bytree': Real(0.5, 1.0, prior='uniform'),
    'reg_alpha': Real(0.1, 1.0, prior='log-uniform'),
    'reg_lambda': Real(0.1, 10.0, prior='log-uniform'),
}

# Random Forest Regressor
parameter_RF = {
    'n_estimators': Integer(10, 200),
    'max_depth': Integer(1, 20),
    'max_features': Categorical(['sqrt', 'log2', 1.0]),
    'min_samples_leaf': Integer(1, 10),
    'min_samples_split': Integer(2, 10),
}

# CatBoost Regressor
parameter_CBR = {
    'iterations': Integer(10, 500),
    'learning_rate': Real(0.01, 0.5, prior='log-uniform'),
    'depth': Integer(1, 16),
    'l2_leaf_reg': Real(0.1, 10.0, prior='log-uniform'),
    'subsample': Real(0.5, 0.9, prior='uniform'),
    'rsm': Real(0.5, 1.0, prior='uniform'),
}

# LightGBM Regressor
parameter_LGBM = {
    'n_estimators': Integer(10, 500),
    'learning_rate': Real(0.01, 0.8, prior='log-uniform'),
    'max_depth': Integer(1, 10),
    'num_leaves': Integer(5, 50),
    'subsample': Real(0.5, 0.9, prior='uniform'),
    'colsample_bytree': Real(0.5, 1.0, prior='uniform'),
    'reg_alpha': Real(0.1, 10.0, prior='log-uniform'),
    'reg_lambda': Real(0.1, 10.0, prior='log-uniform'),
}

# Gradient Boosting Regressor
parameter_GBRT = {
    'n_estimators': Integer(10, 500),
    'learning_rate': Real(0.01, 0.5, prior='log-uniform'),
    'max_depth': Integer(1, 10),
    'max_features': Categorical(['sqrt', 'log2', 1.0]),
    'min_samples_split': Integer(2, 10),
    'min_samples_leaf': Integer(1, 10),
    'subsample': Real(0.5, 0.9, prior='uniform'),
}

# Histogram-based Gradient Boosting Regressor
parameter_HGBR = {
    'learning_rate': Real(0.01, 0.5, prior='log-uniform'),
    'max_iter': Integer(10, 500),
    'max_depth': Integer(1, 10),
    'min_samples_leaf': Integer(2, 10),
    'l2_regularization': Real(0.1, 10.0, prior='log-uniform'),
}

# Extra Trees Regressor
parameter_ETR = {
    'n_estimators': Integer(10, 200),
    'max_depth': Integer(1, 10),
    'min_samples_split': Integer(2, 10),
    'min_samples_leaf': Integer(1, 10),
    'max_features': Categorical(['sqrt', 'log2', 1.0]),
}

# ==============================================================================
# --- MAIN EXECUTION PIPELINE ---
# ==============================================================================

# --- 1. Environment Initialization ---
import time
import pandas as pd
import numpy as np
import warnings
import re
import xgboost as XGB
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, ExtraTreesRegressor
from sklearn.model_selection import KFold
from skopt import BayesSearchCV

print("--- Pipeline Initialized ---")

# CatBoost and LightGBM are optional: record whether each one can be imported
# so later code can leave the corresponding model out instead of crashing.
try:
    from catboost import CatBoostRegressor
except ImportError:
    catboost_available = False
else:
    catboost_available = True

try:
    from lightgbm import LGBMRegressor
except ImportError:
    lightgbm_available = False
else:
    lightgbm_available = True

print("Dependencies verified.")


# --- 2. Data Ingestion and Preprocessing ---
# Reads the workbook, splits it into features (X) and target (Y) by position,
# and rewrites feature names that contain characters some estimators reject.
# Any failure is reported on stdout and terminates the run via exit().

# Empty placeholders so both names are bound even before the read succeeds.
X, Y = pd.DataFrame(), pd.Series()
try:
    print(f"\nImporting dataset: {EXCEL_FILE_PATH}")
    df = pd.read_excel(EXCEL_FILE_PATH)
    print("Dataset imported successfully.")

    # Coarse sanity check: at least two columns, and at least as many as the
    # largest absolute bound used by the feature slice.
    min_cols_required = 2
    if isinstance(X_COLS_SLICE, slice):
        min_cols_required = max(min_cols_required, abs(X_COLS_SLICE.start or 0), abs(X_COLS_SLICE.stop or 0))
    if df.shape[1] < min_cols_required:
        raise ValueError(f"CRITICAL: Insufficient column count for operation X_COLS_SLICE={X_COLS_SLICE}.")

    X = df.iloc[:, X_COLS_SLICE]
    Y = df.iloc[:, Y_COLS_SLICE]

    # The check above cannot detect a slice that is in range but selects nothing
    # (e.g. slice(1, -1) on a two-column sheet). Fail here with a clear message
    # rather than letting the estimators fail later on a feature-less matrix.
    if X.ndim == 2 and X.shape[1] == 0:
        raise ValueError(
            f"CRITICAL: X_COLS_SLICE={X_COLS_SLICE} selects no feature columns "
            f"from a dataset with {df.shape[1]} column(s)."
        )
    print(f"Feature/Target extraction complete. X shape={X.shape}, Y shape={Y.shape} (Target: '{Y.name}')")

    print("\nSanitizing feature column names for estimator compatibility...")
    # Replace '[', ']' and '<' with '_'. str() keeps non-string headers
    # (e.g. numeric column labels) from raising TypeError inside re.sub.
    X.columns = [re.sub(r'\[|\]|<', '_', str(col)) for col in X.columns]
    print("Column sanitization complete.")

except FileNotFoundError:
    print(f"\n!!! CRITICAL: File '{EXCEL_FILE_PATH}' not found. Check configuration.")
    exit()
except Exception as e:
    # Top-level boundary of the script: report the cause and stop.
    print(f"\n!!! CRITICAL: Data ingestion failure: {e}")
    exit()

# --- 3. Model Instantiation and Validation Strategy ---
# The same K-Fold splitter is handed to every search.
cross_Valid = KFold(
    n_splits=CV_N_SPLITS,
    shuffle=CV_SHUFFLE,
    random_state=CV_RANDOM_STATE,
)

# Registry: estimator ID -> untuned estimator instance (always-available models first).
ALL_POSSIBLE_ESTIMATORS = {
    'XGBR': XGB.XGBRegressor(random_state=DEFAULT_MODEL_RANDOM_STATE, objective='reg:squarederror'),
    'RF': RandomForestRegressor(random_state=DEFAULT_MODEL_RANDOM_STATE),
    'GBRT': GradientBoostingRegressor(random_state=DEFAULT_MODEL_RANDOM_STATE),
    'HGBR': HistGradientBoostingRegressor(random_state=DEFAULT_MODEL_RANDOM_STATE),
    'ETR': ExtraTreesRegressor(random_state=DEFAULT_MODEL_RANDOM_STATE),
}

# Registry: estimator ID -> search space.
params_mapping = {
    'XGBR': parameter_XGBR,
    'RF': parameter_RF,
    'GBRT': parameter_GBRT,
    'HGBR': parameter_HGBR,
    'ETR': parameter_ETR,
}

# Optional libraries are registered in both tables only when they imported.
if catboost_available:
    ALL_POSSIBLE_ESTIMATORS['CBR'] = CatBoostRegressor(
        verbose=False,
        random_state=DEFAULT_MODEL_RANDOM_STATE,
        allow_writing_files=False,
    )
    params_mapping['CBR'] = parameter_CBR
else:
    print("Info: CatBoost module not found; related configurations ignored.")

if lightgbm_available:
    ALL_POSSIBLE_ESTIMATORS['LGBM'] = LGBMRegressor(
        random_state=DEFAULT_MODEL_RANDOM_STATE,
        verbosity=-1,
        objective='regression',
    )
    params_mapping['LGBM'] = parameter_LGBM
else:
    print("Info: LightGBM module not found; related configurations ignored.")

# Keep only the enabled IDs that are actually registered, in ENABLED_MODELS order.
estimators_for_bayes = {
    model_id: ALL_POSSIBLE_ESTIMATORS[model_id]
    for model_id in ENABLED_MODELS
    if model_id in ALL_POSSIBLE_ESTIMATORS
}

if not estimators_for_bayes:
    print("\n!!! CRITICAL: No active models configured. Verify 'ENABLED_MODELS' in configuration.")
    exit()

print(f"\nPipeline ready. Starting optimization ({cross_Valid.get_n_splits()}-fold CV) for models: {list(estimators_for_bayes)}")


# --- 4. Bayesian Optimization Loop ---
# Runs one BayesSearchCV per active estimator. grid_searches maps each model ID
# to its fitted search object, or to None when the model was skipped or failed.
grid_searches = {}
print(f"\nInitiating BayesSearchCV (Iterations={N_ITER_BAYESIAN})...")

for name, estimator in estimators_for_bayes.items():
    start_time = time.time()
    print(f"\n--- Optimizing {name} ---")

    if name in params_mapping:
        bayes_search = BayesSearchCV(
            estimator=estimator,
            search_spaces=params_mapping[name],
            n_iter=N_ITER_BAYESIAN,
            scoring='r2',
            cv=cross_Valid,
            n_jobs=-1,
            random_state=DEFAULT_MODEL_RANDOM_STATE,
            verbose=1,
        )
        try:
            bayes_search.fit(X, Y)
            # Stop the clock right after fitting so reporting time is excluded.
            duration = time.time() - start_time
            grid_searches[name] = bayes_search
            print(f"--- {name} Optimization Complete ---")
            print(f"  Best Score (CV R²): {bayes_search.best_score_:.4f}")
            print(f"  Best Parameters: {dict(bayes_search.best_params_)}")
            print(f"  Runtime: {duration:.2f} s")
        except Exception as e:
            # One failing model must not abort the remaining searches.
            duration = time.time() - start_time
            print(f"\n!!! ERROR: Optimization failed for {name}: {e}")
            print(f"  Runtime before failure: {duration:.2f} s")
            grid_searches[name] = None
    else:
        # Nothing to search over for this model: record it as skipped.
        print(f"Warning: No hyperparameter space defined for {name}. Skipping.")
        grid_searches[name] = None

# ==============================================================================
# --- 5. RESULTS SUMMARY ---
# ==============================================================================
print("\n\n==============================================================================")
print("--- OPTIMIZATION REPORT ---")
print("==============================================================================")

# Model ID -> best hyperparameters, filled only for searches that succeeded.
all_best_params_for_export = {}

# One report section per model, in the order the searches were run.
for name, search_result in grid_searches.items():
    print(f"\n--- Model: {name} ---")

    # Skipped and failed searches are stored as None.
    if not search_result:
        print("  Status: Failed or Skipped.")
        continue

    best_score = search_result.best_score_
    best_params = dict(search_result.best_params_)
    all_best_params_for_export[name] = best_params

    print(f"  Best R² Score (CV): {best_score:.4f}")
    print("  Optimal Hyperparameters:")
    for param, value in best_params.items():
        # Floats are shown with six decimals; everything else is printed as-is.
        print(f"    - {param}: " + (f"{value:.6f}" if isinstance(value, float) else f"{value}"))

print("\n--- Pipeline Execution Finished ---")

Model stacking, robustness verification, SHAP analysis

In [None]:
# ==============================================================================
# --- USER CONFIGURATION AREA ---
# ==============================================================================
# All adjustable parameters of the stacking / SHAP cell are defined here.

# --- 1. Evaluation Protocol Settings ---
# Intended number of seeds for repeating the outer cross-validation.
# NOTE(review): not referenced by the main section of this cell as shown - confirm.
N_SEEDS_FOR_EVALUATION: int = 100
# Folds of the outer cross-validation that scores the base learners and the stack.
N_SPLITS_OUTER_CV: int = 10
# Intended strength of Gaussian-noise augmentation (0 disables it).
# NOTE(review): not referenced by the main section of this cell as shown - confirm.
NOISE_SCALE_FACTOR = 0

# --- 2. Meta-Learner Tuning Configuration ---
# Bayesian-optimization iterations for the one-time Meta-Learner tuning.
META_LEARNER_N_ITER_BAYESIAN: int = 50
# Folds used for the global OOF predictions and for the Meta-Learner search CV.
META_LEARNER_N_SPLITS_CV: int = 10
# Intended number of repeats of the Meta-Learner's internal cross-validation.
# NOTE(review): not referenced by the main section of this cell as shown - confirm.
META_LEARNER_N_REPEATS_CV: int = 10

# --- 3. Visualization and Export Settings ---
# Excel workbook that receives every exported sheet.
OUTPUT_EXCEL_FILENAME: str = 'SHAP_Analysis_Results.xlsx'
# How base-model SHAP values are weighted within one split:
# '1/RMSE', 'uniform' or 'exponential'.
WEIGHTING_METHOD: str = '1/RMSE'
# How many of the top-ranked features appear in the plots.
N_FEATURES_TO_PLOT: int = 30
# Intended switch for the SHAP beeswarm plots (they can be slow to draw).
# NOTE(review): not referenced by the main section of this cell as shown - confirm.
PLOT_SHAP_SWARM_PLOT: bool = True
# Intended cap on the samples drawn in a beeswarm plot; None means no cap.
# NOTE(review): not referenced by the main section of this cell as shown - confirm.
SHAP_SWARM_SAMPLES_LIMIT = 5000

# --- 4. Base Learner Selection ---
# IDs of the models that act as Base Learners in the stack.
# Leave a line active to ENABLE that model; comment it out to DISABLE it.
ENABLED_BASE_LEARNERS = [
    'XGBR',   # XGBoost Regressor
    'RF',     # Random Forest Regressor
    'GBRT',   # Gradient Boosting Regressor
    'HGBR',   # Histogram-based Gradient Boosting Regressor
    'ETR',    # Extra Trees Regressor
    'CBR',    # CatBoost Regressor
    'LGBM',   # LightGBM Regressor
]

# --- 5. Global Random State Setting ---
# Seed shared by the estimators and the cross-validation splitters.
DEFAULT_MODEL_RANDOM_STATE: int = 0


# ==============================================================================
# --- MAIN SCRIPT EXECUTION (Do not modify this section) ---
# ==============================================================================

# --- 1. Library Imports and Environment Check ---
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import shap
import itertools
from math import comb
import scipy.stats as st

import xgboost as XGB
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, LeaveOneOut
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.base import clone
from skopt import BayesSearchCV
from skopt.space import Real

# Silence the two noisiest warning categories for the rest of the session.
for _ignored_category in (FutureWarning, UserWarning):
    warnings.filterwarnings('ignore', category=_ignored_category)

print("--- Script execution started ---")

# CatBoost and LightGBM are optional: remember whether each import worked.
try:
    from catboost import CatBoostRegressor
except ImportError:
    catboost_available = False
else:
    catboost_available = True

try:
    from lightgbm import LGBMRegressor
except ImportError:
    lightgbm_available = False
else:
    lightgbm_available = True

print("Library imports and environment check completed.")

# --- 2. Core Variable Loading and Preparation ---
# X, Y and grid_searches are expected to come from the hyperparameter search.
# When X is missing, a synthetic regression problem is substituted so the rest
# of the cell can still be exercised; every enabled model then has no search
# result (None).
if not ('X' in locals() and 'X' in globals()):
    print("Error: Variable 'X' is undefined. Please run the hyperparameter search script first.")
    print("Warning: 'X' and 'Y' not found. Generating dummy data for demonstration.")
    from sklearn.datasets import make_regression
    X_dummy, Y_dummy = make_regression(
        n_samples=100,
        n_features=10,
        n_informative=5,
        random_state=DEFAULT_MODEL_RANDOM_STATE,
    )
    X = pd.DataFrame(X_dummy, columns=[f'Feature_{idx}' for idx in range(1, 11)])
    Y = pd.Series(Y_dummy)
    grid_searches = dict.fromkeys(ENABLED_BASE_LEARNERS)

# The remaining prerequisites have no fallback: stop if either is absent.
if not ('Y' in locals() and 'Y' in globals()):
    print("Error: Variable 'Y' is undefined. Please run the hyperparameter search script first.")
    exit()
if not ('grid_searches' in locals() and 'grid_searches' in globals()):
    print("Error: Variable 'grid_searches' is undefined. Please run the hyperparameter search script first.")
    exit()

# Keep only the search results whose model ID is enabled for stacking.
filtered_grid_searches = {name: gs for name, gs in grid_searches.items() if name in ENABLED_BASE_LEARNERS}

# Untuned fallback estimators, used when a model has no search result.
# The entry is None when the optional library behind it could not be imported.
model_mapping = {
    'XGBR': XGB.XGBRegressor(random_state=DEFAULT_MODEL_RANDOM_STATE), 'RF': RandomForestRegressor(random_state=DEFAULT_MODEL_RANDOM_STATE),
    'GBRT': GradientBoostingRegressor(random_state=DEFAULT_MODEL_RANDOM_STATE), 'HGBR': HistGradientBoostingRegressor(random_state=DEFAULT_MODEL_RANDOM_STATE),
    'ETR': ExtraTreesRegressor(random_state=DEFAULT_MODEL_RANDOM_STATE),
    'CBR': CatBoostRegressor(random_state=DEFAULT_MODEL_RANDOM_STATE, verbose=0) if catboost_available else None,
    'LGBM': LGBMRegressor(random_state=DEFAULT_MODEL_RANDOM_STATE, verbosity=-1) if lightgbm_available else None,
}


class MockSearchResult:
    """Minimal stand-in for a fitted search object.

    Exposes only ``best_estimator_``, the single attribute the rest of this
    cell reads from a search result.
    """

    def __init__(self, estimator):
        self.best_estimator_ = estimator


# Iterate over a snapshot because unusable entries are deleted inside the loop.
for name, gs in list(filtered_grid_searches.items()):
    if gs is not None:
        continue
    default_estimator = model_mapping.get(name)
    if default_estimator is None:
        # No tuned result and no usable default (unknown ID, or the optional
        # library is not installed). Wrapping None would only postpone the
        # failure to the first clone() call, so drop the model here.
        print(f"Warning: No default estimator is available for {name}. Excluding this model.")
        del filtered_grid_searches[name]
        continue
    print(f"Warning: Optimization results for {name} not found. Using default model configuration.")
    filtered_grid_searches[name] = MockSearchResult(default_estimator)

model_names_available = list(filtered_grid_searches.keys())
print(f"\nModel filtering completed: Selected {len(filtered_grid_searches)} enabled models: {model_names_available}")
if not filtered_grid_searches:
    print("\n!!! FATAL ERROR: No valid Base Learners selected.")
    exit()
feature_names_list = X.columns.tolist()

# --- 3. Core Function Definitions ---
def initialize_best_estimators(grid_searches_dict):
    """Return a fresh, unfitted copy of each model's best estimator.

    Args:
        grid_searches_dict: Mapping of model ID to a search-result object.
            Only objects that have a ``best_estimator_`` attribute are used.

    Returns:
        dict: Model ID -> ``clone`` of that result's ``best_estimator_``.
        Entries without the attribute are left out and reported on stdout.
    """
    print("Initializing models...")
    cloned_estimators = {}
    for model_id, result in grid_searches_dict.items():
        if not hasattr(result, 'best_estimator_'):
            print(f"  Warning: Optimization results for {model_id} not found. Skipping this model.")
            continue
        cloned_estimators[model_id] = clone(result.best_estimator_)
        print(f"  Successfully initialized {model_id}.")
    return cloned_estimators

def calculate_weights(scores, method='1/RMSE'):
    """Turn error scores into non-negative weights that sum to 1.

    Args:
        scores: Sequence of error values (lower is better), e.g. RMSEs.
            Non-finite entries and entries below 1e-9 are replaced by 1e-9.
        method: 'uniform' gives equal weights, 'exponential' gives
            exp(-score); any other value (including the default '1/RMSE')
            gives 1/score.

    Returns:
        numpy.ndarray: Normalized weights, one per score. Falls back to equal
        weights when the raw weights sum to (almost) zero.
    """
    # Always work on a float copy. With integer input the 1e-9 floor below would
    # be truncated to 0, and the 1/score branch would then produce inf/nan.
    scores = np.array(scores, dtype=float)
    scores[~np.isfinite(scores) | (scores < 1e-9)] = 1e-9

    if method == 'uniform':
        weights = np.ones_like(scores)
    elif method == 'exponential':
        weights = np.exp(-scores)
    else:
        weights = 1.0 / scores

    total_weight = np.sum(weights)
    if total_weight > 1e-9:
        return weights / total_weight
    # All raw weights vanished (e.g. exp(-score) underflow): use equal weights.
    return np.ones_like(scores) / len(scores)

# =================================================================================
# --- 4. Optimization of Meta-Learner Hyperparameters (One-time Execution) ---
# =================================================================================
# Builds out-of-fold (OOF) predictions of every base learner over the whole
# dataset, then tunes an ElasticNet pipeline on those predictions. Only the
# best hyperparameters are kept; the outer CV below re-fits the Meta-Learner.
# NOTE(review): the tuning sees OOF predictions for all rows of X, including
# rows that later serve as outer validation folds, so the hyperparameter
# choice is not independent of the outer evaluation.
print("\n--- Step 1: Optimization of Meta-Learner Hyperparameters ---")
print("  Generating global OOF predictions for Meta-Learner tuning...")
# One row per sample, one column per available base learner.
oof_preds_full = np.zeros((len(X), len(model_names_available)))
kf_for_meta = KFold(n_splits=META_LEARNER_N_SPLITS_CV, shuffle=True, random_state=DEFAULT_MODEL_RANDOM_STATE)
base_estimators_for_meta = initialize_best_estimators(filtered_grid_searches)

for i, (name, estimator) in enumerate(base_estimators_for_meta.items()):
    fold_preds = np.zeros(len(X))
    for train_idx, val_idx in kf_for_meta.split(X):
        X_train_fold, X_val_fold, y_train_fold = X.iloc[train_idx], X.iloc[val_idx], Y.iloc[train_idx]
        # A fresh clone per fold: each prediction comes from a model that never saw that row.
        est_clone = clone(estimator); est_clone.fit(X_train_fold, y_train_fold)
        fold_preds[val_idx] = est_clone.predict(X_val_fold)
    oof_preds_full[:, i] = fold_preds

print("  Performing Bayesian optimization on global OOF predictions...")
# Search space and template for the Meta-Learner: standardize the base
# predictions, then fit an ElasticNet. Parameter names use the pipeline's
# '<step>__<param>' convention.
meta_learner_params = {'elasticnet__alpha': Real(1e-5, 10.0, prior='log-uniform'), 'elasticnet__l1_ratio': Real(0.0, 1.0, prior='uniform')}
meta_pipeline_template = Pipeline([('scaler', StandardScaler()), ('elasticnet', ElasticNet(random_state=DEFAULT_MODEL_RANDOM_STATE, max_iter=2000))])

# The search CV uses the same fold count and seed as kf_for_meta above.
meta_bayes_search = BayesSearchCV(
    estimator=meta_pipeline_template, 
    search_spaces=meta_learner_params, 
    n_iter=META_LEARNER_N_ITER_BAYESIAN, 
    scoring='neg_root_mean_squared_error', 
    cv=KFold(n_splits=META_LEARNER_N_SPLITS_CV, shuffle=True, random_state=DEFAULT_MODEL_RANDOM_STATE), 
    n_jobs=-1, 
    random_state=DEFAULT_MODEL_RANDOM_STATE, 
    verbose=0
)

# UserWarnings raised during the search are suppressed for this fit only.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", UserWarning)
    meta_bayes_search.fit(oof_preds_full, Y)

# Store the optimal parameters for subsequent independent training.
best_meta_learner_params = meta_bayes_search.best_params_
# The scorer is negated RMSE, so the sign is flipped for display.
print(f"  Meta-Learner tuning completed. Best CV RMSE: {-meta_bayes_search.best_score_:.4f}")
print(f"  Optimal parameters found: {best_meta_learner_params}")
print(f"  These parameters will be used for independent training in each split.")


# --- 5. Main Execution Pipeline: Outer Cross-Validation (Unbiased Stacking Evaluation) ---
# For every outer fold: fit each base learner on the training part, score it
# and compute its SHAP values on the validation part, then fit a Meta-Learner
# on inner-OOF predictions of the training part and score the stacked prediction.
# Outputs: all_split_results (one dict per fold) and all_fold_scores (flat
# list of {'Split', 'Model', 'Metric', 'Value'} records).
outer_cv = KFold(n_splits=N_SPLITS_OUTER_CV, shuffle=True, random_state=DEFAULT_MODEL_RANDOM_STATE)
print(f"\n--- Step 2: Initiating Outer Cross-Validation (Folds={N_SPLITS_OUTER_CV}) ---")

all_split_results, all_fold_scores = [], []
base_estimators = initialize_best_estimators(filtered_grid_searches)
split_counter = 0

for train_idx, val_idx in outer_cv.split(X, Y):
    split_counter += 1
    print(f"  --- Processing Fold {split_counter}/{N_SPLITS_OUTER_CV} ---")
    X_train, X_val, y_train, y_val = X.iloc[train_idx], X.iloc[val_idx], Y.iloc[train_idx], Y.iloc[val_idx]
    
    # Base-learner predictions on the validation fold: one column per model.
    oof_preds_val = np.zeros((len(y_val), len(model_names_available)))
    # X_val is kept so SHAP values can later be paired with their feature values.
    split_result = {'split_index': split_counter, 'base_model_scores': {}, 'base_model_shap': {}, 'X_val': X_val}
    
    # Train Base Models
    for i, (name, estimator) in enumerate(base_estimators.items()):
        est_clone = clone(estimator); est_clone.fit(X_train, y_train)
        preds_on_val = est_clone.predict(X_val); oof_preds_val[:, i] = preds_on_val
        rmse = np.sqrt(mean_squared_error(y_val, preds_on_val)); mae = mean_absolute_error(y_val, preds_on_val)
        split_result['base_model_scores'][name] = {'rmse': rmse, 'mae': mae}
        print(f"    - {name}: RMSE={rmse:.4f}, MAE={mae:.4f}")
        all_fold_scores.extend([{'Split': split_counter, 'Model': name, 'Metric': 'RMSE', 'Value': rmse}, 
                                {'Split': split_counter, 'Model': name, 'Metric': 'MAE', 'Value': mae}])
        try:
            explainer = shap.TreeExplainer(est_clone)
            # CatBoost ('CBR') goes through shap_values(); all other models use the callable explainer.
            shap_values = explainer.shap_values(X_val) if name == 'CBR' else explainer(X_val).values
            split_result['base_model_shap'][name] = {'shap_values': shap_values, 'expected_value': explainer.expected_value}
        except Exception as e:
            # Best effort: on failure this model simply has no SHAP entry for the split.
            print(f"      Warning: SHAP calculation failed for {name}: {e}")

    # --- Train Meta-Learner independently for current split (Prevent Data Leakage) ---
    # 1. Generate OOF predictions internally within the current external training set (X_train).
    oof_preds_train_for_meta = np.zeros((len(X_train), len(model_names_available)))
    # NOTE(review): the inner CV is fixed at 5 folds here; it does not read
    # META_LEARNER_N_SPLITS_CV.
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=DEFAULT_MODEL_RANDOM_STATE) 
    for inner_train_idx, inner_val_idx in inner_cv.split(X_train):
        X_inner_train, X_inner_val, y_inner_train = X_train.iloc[inner_train_idx], X_train.iloc[inner_val_idx], y_train.iloc[inner_train_idx]
        for i, (name, estimator) in enumerate(base_estimators.items()):
            est_inner_clone = clone(estimator); est_inner_clone.fit(X_inner_train, y_inner_train)
            oof_preds_train_for_meta[inner_val_idx, i] = est_inner_clone.predict(X_inner_val)

    # 2. Instantiate a new Meta-Learner using the optimal hyperparameters found.
    meta_learner_for_split = clone(meta_pipeline_template)
    meta_learner_for_split.set_params(**best_meta_learner_params)
    meta_learner_for_split.fit(oof_preds_train_for_meta, y_train)
    
    # 3. Predict and evaluate using this unbiased Meta-Learner.
    # Its inputs are the predictions of the base learners fitted on all of X_train.
    y_pred_stacking = meta_learner_for_split.predict(oof_preds_val)
    stacking_rmse = np.sqrt(mean_squared_error(y_val, y_pred_stacking)); stacking_mae = mean_absolute_error(y_val, y_pred_stacking)
    split_result['stacking_score'] = {'rmse': stacking_rmse, 'mae': stacking_mae}
    print(f"    - Stacking: RMSE={stacking_rmse:.4f}, MAE={stacking_mae:.4f}")
    all_fold_scores.extend([{'Split': split_counter, 'Model': 'Stacking', 'Metric': 'RMSE', 'Value': stacking_rmse},
                            {'Split': split_counter, 'Model': 'Stacking', 'Metric': 'MAE', 'Value': stacking_mae}])

    all_split_results.append(split_result)
print("  All data splits processed.")


# --- 6. Result Aggregation and SHAP Analysis (Two-Level Weighting Logic) ---
# For every base learner and for the 'Stacking' pseudo-model: pool the per-fold
# SHAP values, derive a global feature-importance ranking, export both to the
# Excel workbook and draw a beeswarm plot next to an importance bar chart.
# Per-fold metrics and a cross-model importance summary are written to the
# same workbook, which stays open for the whole `with` block.
print("\n--- Step 3: Aggregating Results and Performing SHAP Analysis ---")
all_model_names = model_names_available + ['Stacking']
final_analysis = {name: {} for name in all_model_names}

with pd.ExcelWriter(OUTPUT_EXCEL_FILENAME, engine='openpyxl') as writer:
    for model_name in all_model_names:
        print(f"\n--- Analyzing Model: {model_name} ---")
        all_shap_values_list, all_x_val_list = [], []
        
        if model_name == 'Stacking':
            # No explainer is run on the Meta-Learner itself. The stack's SHAP
            # values are approximated by a weighted sum of the base learners'
            # SHAP values (level 1); folds are then weighted by stacking RMSE (level 2).
            print("  Applying [Two-Level Weighting] logic...")
            level1_shap_matrices, level2_raw_weights = [], []
            for res in all_split_results:
                base_rmses = [res['base_model_scores'][bn]['rmse'] for bn in model_names_available]
                level1_weights = calculate_weights(base_rmses, WEIGHTING_METHOD)
                
                # Fetch SHAP values from the first model to initialize shape
                # NOTE(review): a fold is dropped entirely when the first model has
                # no SHAP data, even if the other models do.
                first_shap_data = res['base_model_shap'].get(model_names_available[0])
                if first_shap_data is None: continue
                
                weighted_shap_for_split = np.zeros_like(first_shap_data['shap_values'])
                
                # Weighted Sum of Base Models (Level 1)
                # NOTE(review): models without SHAP data are skipped, but the
                # remaining weights are not re-normalized.
                for i, bn in enumerate(model_names_available):
                    if (shap_data := res['base_model_shap'].get(bn)):
                        weighted_shap_for_split += shap_data['shap_values'] * level1_weights[i]
                
                level1_shap_matrices.append(weighted_shap_for_split)
                all_x_val_list.append(res['X_val'])
                level2_raw_weights.append(res['stacking_score']['rmse'])
            
            if not level1_shap_matrices:
                print(f"  Model {model_name} has no valid SHAP data, skipping analysis."); continue
            
            combined_shap_df = pd.DataFrame(np.vstack(level1_shap_matrices), columns=feature_names_list)
            
            # Weighted Average across Splits (Level 2)
            # The level-2 method is fixed to '1/RMSE'; WEIGHTING_METHOD only affects level 1.
            level2_weights = calculate_weights(level2_raw_weights, method='1/RMSE')
            # Every sample inherits the weight of the fold it was validated in.
            expanded_sample_weights = np.repeat(level2_weights, [len(x) for x in all_x_val_list])
            
            global_importance = pd.Series(np.average(combined_shap_df.abs().values, axis=0, weights=expanded_sample_weights), index=feature_names_list).sort_values(ascending=False)
        
        else:
            # Base learner: concatenate the fold-wise SHAP values and take the
            # unweighted mean of their absolute values per feature.
            print("  Applying [Arithmetic Mean] logic...")
            for res in all_split_results:
                if (shap_data := res['base_model_shap'].get(model_name)):
                    all_shap_values_list.append(pd.DataFrame(shap_data['shap_values'], columns=feature_names_list))
                    all_x_val_list.append(res['X_val'])
            
            if not all_shap_values_list:
                print(f"  Model {model_name} has no valid SHAP data, skipping analysis."); continue
            
            combined_shap_df = pd.concat(all_shap_values_list, ignore_index=True)
            global_importance = pd.Series(np.mean(combined_shap_df.abs().values, axis=0), index=feature_names_list).sort_values(ascending=False)
        
        if 'combined_shap_df' not in locals() or combined_shap_df.empty: continue
        combined_x_val_df = pd.concat(all_x_val_list, ignore_index=True)
        final_analysis[model_name]['global_importance'] = global_importance
        
        # Export data
        global_importance.to_excel(writer, sheet_name=f'{model_name}_GlobalImportance')
        # Feature values and their 'SHAP_'-prefixed SHAP values side by side,
        # with columns ordered by descending global importance.
        plot_df = pd.concat([combined_x_val_df.reset_index(drop=True), combined_shap_df.reset_index(drop=True).add_prefix('SHAP_')], axis=1)
        plot_df = plot_df[list(global_importance.index) + [f'SHAP_{f}' for f in global_importance.index]]
        plot_df.to_excel(writer, sheet_name=f'{model_name}_SwarmPlotData', index=False)
        
        # Visualization
        # NOTE(review): the beeswarm plot is always drawn with all pooled samples;
        # PLOT_SHAP_SWARM_PLOT and SHAP_SWARM_SAMPLES_LIMIT are not consulted here.
        print("  Generating SHAP Beeswarm Plots and Feature Importance Bar Charts...")
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 9))
        feature_order = global_importance.head(N_FEATURES_TO_PLOT).index.tolist()
        
        # Make the left axes current before calling summary_plot.
        plt.sca(ax1)
        shap.summary_plot(combined_shap_df[feature_order].values, combined_x_val_df[feature_order], feature_names=feature_order, show=False)
        ax1.set_title(f'SHAP Beeswarm Plot')
        
        top_features_for_plot = global_importance.reindex(feature_order)
        sns.barplot(x=top_features_for_plot.values, y=top_features_for_plot.index, ax=ax2, orient='h', palette='viridis')
        ax2.set_title(f'Global Feature Importance')
        ax2.set_xlabel(f'Mean(|SHAP Value|) - {"Two-Level Weighted" if model_name == "Stacking" else "Arithmetic Mean"}')
        ax2.set_ylabel('')
        
        fig.suptitle(f'SHAP Analysis for {model_name} (CV Folds={N_SPLITS_OUTER_CV})', fontsize=16)
        plt.tight_layout(rect=[0, 0, 1, 0.96])
        plt.show()

    # --- 7. Performance Metrics Processing and Export ---
    print("\n--- Step 4: Processing and Exporting Performance Metrics ---")
    if all_fold_scores:
        scores_df = pd.DataFrame(all_fold_scores)
        try:
            print("  Exporting per-fold performance metrics to Excel...")
            # One row per fold; the (Model, Metric) column pairs are flattened to 'Metric_Model'.
            scores_pivot_df = scores_df.pivot_table(index='Split', columns=['Model', 'Metric'], values='Value')
            scores_pivot_df.columns = [f'{col[1]}_{col[0]}' for col in scores_pivot_df.columns]
            scores_pivot_df.to_excel(writer, sheet_name='Fold_Performance_Metrics')
            print(f"  Successfully exported to sheet 'Fold_Performance_Metrics'.")
        except Exception as e:
            print(f"  !!! Error exporting performance metrics: {e}")

        print("\n--- Step 5: Final Average Performance (Arithmetic Mean) ---")
        # NOTE(review): scores_pivot_df is only bound inside the try above; if
        # pivot_table itself raises, the next line fails with NameError.
        mean_scores_summary = scores_pivot_df.mean().reset_index()
        mean_scores_summary.columns = ['Metric_Model', 'Mean_Value']
        # NOTE(review): splitting on every '_' assumes model IDs contain no
        # underscore; otherwise the split yields more than two columns.
        mean_scores_summary[['Metric', 'Model']] = mean_scores_summary['Metric_Model'].str.split('_', expand=True)
        final_summary_table = mean_scores_summary.pivot_table(index='Model', columns='Metric', values='Mean_Value')
        if 'RMSE' in final_summary_table.columns and 'MAE' in final_summary_table.columns:
            final_summary_table = final_summary_table[['RMSE', 'MAE']]
        print(final_summary_table.to_string())
    else:
        print("  No performance metrics collected.")

    # --- 8. Global Feature Importance Summary Generation ---
    # One column per model that produced a non-empty importance ranking.
    print("\n--- Step 6: Generating Global Feature Importance Summary for All Models ---")
    if (importance_series_list := [res['global_importance'].rename(name) for name, res in final_analysis.items() if 'global_importance' in res and not res['global_importance'].empty]):
        summary_importance_df = pd.concat(importance_series_list, axis=1).sort_index()
        print("\nGlobal Feature Importance Summary:")
        print(summary_importance_df.to_string())
        summary_importance_df.to_excel(writer, sheet_name='Global_Importance_Summary')
    else:
        print("No feature importance data collected.")

print(f"\n--- Analysis Completed. Results exported to {OUTPUT_EXCEL_FILENAME} ---")
print(f"--- Script execution finished ---")

Prediction

In [None]:
# ==============================================================================
# --- USER CONFIGURATION AREA ---
# ==============================================================================
# All adjustable parameters of the prediction cell are defined here.

# --- 1. Prediction Data Settings ---
# Excel workbook containing the unknown samples to predict.
UNKNOWN_DATA_FILE: str = 'prediction-FGA-Eb.xlsx'
# UNKNOWN_DATA_FILE = 'prediction-FGA-NVOA.xlsx'
# (row selector, column selector) pair that defines the feature matrix X_new.
# slice(None) keeps every row; slice(1, None) keeps the columns from index 1
# onward, i.e. the first column (index 0) of the workbook is skipped.
UNKNOWN_DATA_FILE_COLUMN_RANGE = (slice(None), slice(1, None))

# --- 2. Model Reuse and Training Configuration ---
# True: try to reuse the Stacking Model trained by the previous script.
# False: retrain it (the PREDICT_* settings below then apply).
REUSE_PRETRAINED_STACKING_MODEL: bool = False

# Retraining-mode settings (REUSE_PRETRAINED_STACKING_MODEL = False only).
PREDICT_CV_N_SPLITS: int = 10                   # Folds for generating OOF predictions.
PREDICT_META_LEARNER_N_SPLITS_CV: int = 10      # Folds of the Meta-Learner's internal CV.
PREDICT_META_LEARNER_N_REPEATS_CV: int = 10     # Repeats of the Meta-Learner's internal CV.
PREDICT_META_LEARNER_N_ITER_BAYESIAN: int = 50  # Bayesian optimization iterations for the Meta-Learner.

# --- 3. Base Learner Selection ---
# !!! IMPORTANT: keep this list identical to the one configured in
# 'optimize_base_learners.py' and 'evaluate_stacking_model.py' !!!
ENABLED_BASE_LEARNERS = [
    'XGBR',    # XGBoost Regressor
    'RF',      # Random Forest Regressor
    'GBRT',    # Gradient Boosting Regressor
    'HGBR',    # Histogram-based Gradient Boosting Regressor
    'ETR',     # Extra Trees Regressor
    'CBR',     # CatBoost Regressor
    'LGBM',    # LightGBM Regressor
]

# --- 4. Result Export Settings ---
PREDICTION_OUTPUT_FILENAME_PREFIX: str = 'unknown_predictions'
PREDICTION_EXPORT_TO_EXCEL: bool = True
PREDICTION_EXPORT_TO_CSV: bool = False

# --- 5. Global Random State Setting ---
# !!! IMPORTANT: must equal DEFAULT_MODEL_RANDOM_STATE in the other scripts !!!
DEFAULT_MODEL_RANDOM_STATE: int = 0


# ==============================================================================
# --- MAIN SCRIPT EXECUTION (Do not modify this section) ---
# ==============================================================================

# --- 1. Library Imports and Environment Check ---
import pandas as pd
import numpy as np
import time
import warnings
import os
import re
import sys 

import xgboost as XGB
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.base import clone
from skopt import BayesSearchCV
from skopt.space import Real

# Announce the start of the prediction stage.
print("--- Initiating Prediction on Unknown Data ---")

# CatBoost and LightGBM are optional back-ends. Each flag records whether the
# corresponding regressor class could be imported; the flags decide further
# down whether 'CBR' / 'LGBM' are registered as available models.
try:
    from catboost import CatBoostRegressor
except ImportError:
    catboost_available = False
else:
    catboost_available = True

try:
    from lightgbm import LGBMRegressor
except ImportError:
    lightgbm_available = False
else:
    lightgbm_available = True

print("Library imports and environment check completed.")


# --- 2. Core Variable Loading and Preparation ---
# The upstream cells must already have created the training data and the tuned
# searches; stop with the matching message as soon as one of them is absent.
for _required_name, _missing_message in (
    ('X', "Error: Variable 'X' (Training Features) is undefined. Please ensure the data preparation and optimization scripts have been executed."),
    ('Y', "Error: Variable 'Y' (Training Targets) is undefined. Please ensure the data preparation and optimization scripts have been executed."),
    ('grid_searches', "Error: Variable 'grid_searches' is undefined. Please ensure the optimization script has been executed."),
):
    if not (_required_name in locals() or _required_name in globals()):
        print(_missing_message)
        sys.exit(1)

# Keep only the tuned searches whose model ID is enabled in this script's configuration.
filtered_grid_searches = {
    model_id: search
    for model_id, search in grid_searches.items()
    if model_id in ENABLED_BASE_LEARNERS
}
print(f"\nModel filtering completed: Selected {len(filtered_grid_searches)} enabled models from {len(grid_searches)} available: {list(filtered_grid_searches.keys())}")
if len(filtered_grid_searches) == 0:
    print("\n!!! FATAL ERROR: No Base Learners selected. Please check the ENABLED_BASE_LEARNERS list in the configuration.")
    sys.exit(1)

# Replace '[', ']' and '<' in the training column names with '_' (in place on X);
# such characters cause errors in models like XGBoost.
# The sanitized names are kept so the prediction dataset can be given the same ones.
X.columns = [re.sub(r'\[|\]|<', '_', raw_name) for raw_name in list(X.columns)]
feature_names_list = list(X.columns)

print(f"\nAttempting to load unknown prediction dataset from '{UNKNOWN_DATA_FILE}'...")

# Guard clause: nothing can be predicted without the input workbook.
if not os.path.exists(UNKNOWN_DATA_FILE):
    print(f"Error: Prediction file '{UNKNOWN_DATA_FILE}' does not exist. Please verify the file path.")
    sys.exit(1)

try:
    full_data_from_excel = pd.read_excel(UNKNOWN_DATA_FILE)
    # Keep only the rows/columns selected by UNKNOWN_DATA_FILE_COLUMN_RANGE.
    X_new = full_data_from_excel.iloc[UNKNOWN_DATA_FILE_COLUMN_RANGE].copy()
except Exception as e:
    print(f"Error: Failed to load or process '{UNKNOWN_DATA_FILE}': {e}. Please check file content and column range settings.")
    sys.exit(1)

# Remember the row labels so the exported results line up with the input rows.
original_X_new_index = X_new.index

# --------------------------------------------------------------------------------
# --- Column Alignment Logic: Positional Mapping ---
# --------------------------------------------------------------------------------
# The prediction set must expose exactly as many feature columns as the training set.
if X_new.shape[1] != len(feature_names_list):
    print(f"Error: Prediction set X_new has {X_new.shape[1]} columns, but training set has {len(feature_names_list)} columns. Feature count mismatch.")
    print("Please verify 'UNKNOWN_DATA_FILE_COLUMN_RANGE' to ensure the extracted feature count matches the training data.")
    sys.exit(1)

# Columns are matched purely by position: the training feature names overwrite
# whatever headers the new file carries, so small naming differences cannot
# break prediction. The column ORDER of the new file is assumed to equal the
# training order.
X_new.columns = feature_names_list

# Report (but do not repair) missing values in the prediction data.
_nan_flag_per_column = X_new.isnull().any()
if _nan_flag_per_column.any():
    missing_cols_in_raw_X_new = X_new.columns[_nan_flag_per_column].tolist()
    print(f"Warning: Prediction dataset X_new contains missing values (NaN). Affected columns: {missing_cols_in_raw_X_new}")
    # Optional zero-filling, disabled by default:
    # X_new = X_new.fillna(0)
# --------------------------------------------------------------------------------

print(f"\nTraining Data X shape: {X.shape}")
print(f"Prediction Data X_new shape (after column alignment): {X_new.shape}")


# --- 3. Core Functions and Model Training ---
def initialize_best_estimators(grid_searches_dict):
    """Build the ``{model ID: estimator}`` mapping used as Base Learners.

    For every key of ``grid_searches_dict`` that is a supported model ID, the
    search object's ``best_estimator_`` is used when that attribute exists.
    Otherwise the model class is instantiated with a small set of fixed
    default parameters. Progress and problems are reported via ``print``;
    a model that cannot be initialized is skipped rather than raising.

    Args:
        grid_searches_dict: Mapping of model ID (e.g. ``'XGBR'``, ``'RF'``)
            to a search object exposing ``best_estimator_``, or to ``None``.

    Returns:
        dict: Model ID -> estimator, in the iteration order of
        ``grid_searches_dict``. A ``best_estimator_`` is returned as-is (not
        copied), so calling ``fit`` on a returned estimator refits the object
        held by the search; ``clone`` it first if that must be avoided.
        The dict is empty when nothing could be initialized.
    """
    # Model class plus the fixed constructor arguments used ONLY by the
    # default-parameter fallback below.
    available_models = {
        'XGBR': (XGB.XGBRegressor, {'objective': 'reg:squarederror', 'random_state': DEFAULT_MODEL_RANDOM_STATE}),
        'RF': (RandomForestRegressor, {'random_state': DEFAULT_MODEL_RANDOM_STATE}),
        'GBRT': (GradientBoostingRegressor, {'random_state': DEFAULT_MODEL_RANDOM_STATE}),
        'ETR': (ExtraTreesRegressor, {'random_state': DEFAULT_MODEL_RANDOM_STATE}),
        'HGBR': (HistGradientBoostingRegressor, {'random_state': DEFAULT_MODEL_RANDOM_STATE})
    }
    # Optional back-ends are registered only when their import succeeded.
    if catboost_available:
        available_models['CBR'] = (CatBoostRegressor, {'verbose': False, 'random_state': DEFAULT_MODEL_RANDOM_STATE, 'allow_writing_files': False})
    if lightgbm_available:
        available_models['LGBM'] = (LGBMRegressor, {'random_state': DEFAULT_MODEL_RANDOM_STATE, 'verbosity': -1, 'objective': 'regression'})

    estimators_init = {}
    print("Initializing Base Learners...")
    for name, search in grid_searches_dict.items():
        if name not in available_models:
            print(f"  Warning: Model {name} is not in the available models list, skipping initialization.")
            continue
        model_class, fixed_params = available_models[name]
        if search is not None and hasattr(search, 'best_estimator_'):
            try:
                estimators_init[name] = search.best_estimator_
                print(f"  Successfully initialized {name} using optimized parameters.")
            except Exception as e:
                print(f"  Error initializing {name} (from best_estimator_): {e}. Skipping this model.")
        else:
            print(f"  Warning: Optimization results for {name} not found. Attempting initialization with default parameters.")
            try:
                estimators_init[name] = model_class(**fixed_params)
                print(f"  Successfully initialized {name} using default parameters.")
            except Exception as e:
                print(f"  Error initializing {name} (using defaults): {e}. Skipping this model.")

    # Filter on identity, not truthiness: scikit-learn ensembles (RF, ETR, GBRT)
    # define __len__ as len(self.estimators_), so bool() of an UNFITTED instance
    # from the default-parameter fallback raises AttributeError.
    initialized_estimators = {k: v for k, v in estimators_init.items() if v is not None}
    if not initialized_estimators:
        print("Warning: No models were successfully initialized!")
    return initialized_estimators

# Search space of the ElasticNet Meta-Learner; the keys address the
# 'elasticnet' step of the scaler + ElasticNet pipeline.
meta_learner_params = {}
meta_learner_params['elasticnet__alpha'] = Real(1e-5, 10.0, prior='log-uniform', name='elasticnet__alpha')
meta_learner_params['elasticnet__l1_ratio'] = Real(0.0, 1.0, prior='uniform', name='elasticnet__l1_ratio')

# Both are filled in below; None means "not built yet".
final_meta_learner = None
base_estimators_for_final_training = None

if REUSE_PRETRAINED_STACKING_MODEL:
    print("\n--- Attempting to reuse pre-trained Stacking Model ---")
    # A reusable Meta-Learner must exist, be non-None and expose 'predict'.
    # Anything else switches the flag off so the retraining branch runs.
    if 'global_best_meta_learner' not in locals() or global_best_meta_learner is None or not hasattr(global_best_meta_learner, 'predict'):
        print("Warning: Valid 'global_best_meta_learner' not found. Fallback to retraining mode.")
        REUSE_PRETRAINED_STACKING_MODEL = False
    else:
        final_meta_learner = global_best_meta_learner
        print("Successfully reused the pre-trained Meta-Learner.")

if not REUSE_PRETRAINED_STACKING_MODEL:
    print("\n--- Retraining the final Stacked Model ---")
    base_estimators_for_oof = initialize_best_estimators(filtered_grid_searches)
    if len(base_estimators_for_oof) == 0:
        print("Error: No Base Learners available for Stacking Model training.")
        sys.exit(1)

    # One OOF column per Base Learner, in the order of 'model_names_available'.
    model_names_available = list(base_estimators_for_oof.keys())
    oof_preds_for_meta_training = np.zeros((len(X), len(model_names_available)))

    kf_for_oof = KFold(
        n_splits=PREDICT_CV_N_SPLITS,
        shuffle=True,
        random_state=DEFAULT_MODEL_RANDOM_STATE,
    )

    print(f"Starting {PREDICT_CV_N_SPLITS}-fold Cross-Validation to generate OOF predictions (Random Seed: {DEFAULT_MODEL_RANDOM_STATE})...")
    for fold_idx, (train_idx, val_idx) in enumerate(kf_for_oof.split(X, Y)):
        print(f"  Fold {fold_idx+1}/{PREDICT_CV_N_SPLITS}")
        X_train_fold = X.iloc[train_idx]
        X_val_fold = X.iloc[val_idx]
        Y_train_fold = Y.iloc[train_idx]
        for i, name in enumerate(model_names_available):
            estimator_template = base_estimators_for_oof[name]
            # Fit a fresh copy per fold so the template itself is never refit here.
            estimator_fold = clone(estimator_template)
            try:
                estimator_fold.fit(X_train_fold, Y_train_fold)
                oof_preds_for_meta_training[val_idx, i] = estimator_fold.predict(X_val_fold)
            except Exception as e:
                # The affected rows of this model's column keep their initial zeros.
                print(f"    Warning: Model {name} failed during training or prediction: {e}. OOF predictions will remain zero.")

    # Reject a matrix that is entirely zero (nothing was predicted) or contains NaN.
    if not oof_preds_for_meta_training.any() or np.isnan(oof_preds_for_meta_training).any():
        print("Error: Invalid OOF predictions; unable to train Meta-Learner. Please verify Base Learner training.")
        sys.exit(1)

    print("Initiating Bayesian Optimization and training for Meta-Learner...")
    meta_bayes_search = BayesSearchCV(
        estimator=Pipeline([
            ('scaler', StandardScaler()),
            ('elasticnet', ElasticNet(random_state=DEFAULT_MODEL_RANDOM_STATE, max_iter=2000)),
        ]),
        search_spaces=meta_learner_params,
        n_iter=PREDICT_META_LEARNER_N_ITER_BAYESIAN,
        scoring='r2',
        cv=RepeatedKFold(
            n_splits=PREDICT_META_LEARNER_N_SPLITS_CV,
            n_repeats=PREDICT_META_LEARNER_N_REPEATS_CV,
            random_state=DEFAULT_MODEL_RANDOM_STATE,
        ),
        n_jobs=-1,
        random_state=DEFAULT_MODEL_RANDOM_STATE,
        verbose=1,
    )
    try:
        meta_bayes_search.fit(oof_preds_for_meta_training, Y)
        final_meta_learner = meta_bayes_search.best_estimator_
        print("\n--- Meta-Learner Training Completed ---")
        print(f"Meta-Learner Best Score (CV R² on Full OOF): {meta_bayes_search.best_score_:.4f}")
        print(f"Meta-Learner Best Parameters: {dict(meta_bayes_search.best_params_)}")

        # Report the weight the ElasticNet assigns to each Base Learner column.
        if final_meta_learner and 'elasticnet' in final_meta_learner.named_steps:
            elastic_net_model = final_meta_learner.named_steps['elasticnet']
            if not hasattr(elastic_net_model, 'coef_'):
                print("Warning: Meta-Learner (ElasticNet) lacks 'coef_' attribute; cannot display coefficients.")
            else:
                print("\n--- Base Learner Coefficients in Meta-Learner (ElasticNet) ---")
                for i, name in enumerate(model_names_available):
                    if i >= len(elastic_net_model.coef_):
                        print(f"  Warning: Coefficient for {name} not found (index out of bounds).")
                    else:
                        print(f"  {name}: {elastic_net_model.coef_[i]:.6f}")
        else:
            print("Warning: Meta-Learner or ElasticNet component not found; cannot display coefficients.")

    except Exception as e:
        print(f"\n!!! Meta-Learner training failed: {e}. Prediction cannot proceed.")
        sys.exit(1)

# Refit every Base Learner on the complete training set; these fitted models
# produce the Meta-Learner inputs for the unknown data.
print("\n--- Retraining Final Base Learners on Full Dataset ---")
base_estimators_for_final_training = initialize_best_estimators(filtered_grid_searches)
for name in list(base_estimators_for_final_training):
    estimator = base_estimators_for_final_training[name]
    print(f"  Training final {name} model...")
    try:
        estimator.fit(X, Y)
    except Exception as e:
        print(f"    Warning: Failed to train final {name} model: {e}. It will be removed from the prediction pipeline.")
        # Mark the failed model; it is dropped by the filter below.
        base_estimators_for_final_training[name] = None

# Keep only the models that survived training.
base_estimators_for_final_training = {
    model_id: fitted_model
    for model_id, fitted_model in base_estimators_for_final_training.items()
    if fitted_model
}


# --- 4. Prediction on Unknown Data ---
# Both halves of the stack (Meta-Learner and at least one Base Learner) are required.
if not (final_meta_learner and base_estimators_for_final_training):
    print("FATAL ERROR: Stacking Model construction failed. Unable to proceed with prediction.")
    sys.exit(1)

print("\n--- Performing Stacked Prediction on Unknown Data ---")
# Parallel lists: predictions[k] was produced by the model named names[k].
base_predictions_on_new_data = []
base_model_names_for_prediction = []
print("Generating Base Learner predictions on unknown data...")
for name in base_estimators_for_final_training:
    estimator = base_estimators_for_final_training[name]
    try:
        pred = estimator.predict(X_new)
        base_predictions_on_new_data.append(pred)
        base_model_names_for_prediction.append(name)
        print(f"  {name} prediction completed.")
    except Exception as e:
        print(f"  Error: Model {name} failed during prediction on unknown data: {e}. Skipping this model.")

if len(base_predictions_on_new_data) == 0:
    print("Error: All Base Learners failed to predict on unknown data.")
    sys.exit(1)

# One column per Base Learner that predicted successfully.
meta_features_for_prediction = np.column_stack(base_predictions_on_new_data)
print(f"Base Learner Prediction Shape (Meta-Learner Input): {meta_features_for_prediction.shape}")

print("Executing final Stacked Prediction using Meta-Learner...")
try:
    final_stacked_predictions = final_meta_learner.predict(meta_features_for_prediction)
    print("Final Stacked Prediction completed.")
except Exception as e:
    print(f"Error during Meta-Learner final prediction: {e}")
    sys.exit(1)


# --- 5. Result Compilation and Export ---
print("\n--- Compiling and Exporting Prediction Results ---")
# Rows carry the index of the loaded prediction data; one column per
# Base Learner plus the final stacked prediction.
results_df = pd.DataFrame(index=original_X_new_index)
for name, model_predictions in zip(base_model_names_for_prediction, base_predictions_on_new_data):
    results_df[f'{name}_Prediction'] = model_predictions
results_df['Stacked_Prediction'] = final_stacked_predictions

# Both output formats share one timestamped base name.
current_timestamp = time.strftime("%Y%m%d_%H%M%S")
output_filename_base = f"{PREDICTION_OUTPUT_FILENAME_PREFIX}_{current_timestamp}"

if PREDICTION_EXPORT_TO_EXCEL:
    excel_filename = f"{output_filename_base}.xlsx"
    try:
        results_df.to_excel(excel_filename, index=True, engine='openpyxl')
        print(f"Prediction results successfully exported to Excel: {excel_filename}")
    except Exception as e:
        if isinstance(e, ImportError):
            print("!!! 'openpyxl' library required for Excel export. Falling back to CSV export.")
        else:
            print(f"!!! Error exporting results to Excel: {e}")
        # Clearing the flag routes the results through the CSV branch below.
        PREDICTION_EXPORT_TO_EXCEL = False

# CSV is written on request, and also whenever Excel export is off or has failed.
if PREDICTION_EXPORT_TO_CSV or not PREDICTION_EXPORT_TO_EXCEL:
    csv_filename = f"{output_filename_base}.csv"
    try:
        results_df.to_csv(csv_filename, index=True)
        print(f"Prediction results successfully exported to CSV: {csv_filename}")
    except Exception as e:
        print(f"!!! Error exporting results to CSV: {e}")

print("\nUnknown Data Prediction Script Execution Finished.")