In [None]:
# Load the Google Colab Drive helper and mount Google Drive at /content/drive.
# This cell only works inside a Colab runtime, because `google.colab` is
# provided by that environment.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# ======================= #
#     Start of Classification Codes    #
# ======================= #

# ## **Step 1: Automated Package Installation**

# - **Input:**
#   - List of required packages for the analysis.

# - **Process:**
#   - **1.1** Import necessary modules: `sys`, `subprocess`.
#   - **1.2** Define the `install(package)` function to install missing packages using `pip`.
#   - **1.3** Create a list of required packages:
#     - `'skrebate'`, `'rulefit'`, `'scikit-learn'`, `'pandas'`, `'numpy'`, `'matplotlib'`, `'seaborn'`, `'xgboost'`, `'lightgbm'`, `'catboost'`.
#   - **1.4** Iterate through the list of required packages:
#     - Try to import each package using `__import__(package)`.
#     - If an `ImportError` occurs, print a message and call `install(package)` to install it.

# - **Output:**
#   - All required packages are installed and available for use in the environment.

# ---

# ## **Step 2: Importing Modules**

# - **Input:**
#   - None (standard and third-party libraries).

# - **Process:**
#   - **2.1** Import essential libraries for system operations and utility functions:
#     - `os`, `shutil`, `inspect`, `datetime`.
#   - **2.2** Import data manipulation and numerical computation libraries:
#     - `pandas` as `pd`, `numpy` as `np`.
#   - **2.3** Import visualization libraries:
#     - `matplotlib.pyplot` as `plt`, `seaborn` as `sns`.
#   - **2.4** Import scikit-learn modules for data preprocessing:
#     - Imputers: `SimpleImputer`, `KNNImputer`, `IterativeImputer` (after enabling experimental features).
#     - Scalers: `MinMaxScaler`, `StandardScaler`, `RobustScaler`, `Normalizer`, `MaxAbsScaler`.
#   - **2.5** Import scikit-learn modules for model selection and evaluation:
#     - `train_test_split`, `StratifiedKFold`, `GridSearchCV`, `RandomizedSearchCV`.
#     - Metrics: `accuracy_score`, `precision_score`, `recall_score`, `f1_score`, `confusion_matrix`.
#   - **2.6** Import feature selection methods:
#     - `SelectKBest`, `chi2`, `f_classif`, `mutual_info_classif`.
#   - **2.7** Import machine learning classifiers:
#     - Linear models: `LogisticRegression`, `Lasso`.
#     - Tree-based models: `DecisionTreeClassifier`, `RandomForestClassifier`.
#     - Ensemble methods: `StackingClassifier`.
#     - Support Vector Machine: `SVC`.
#     - Neural Networks: `MLPClassifier`.
#     - Discriminant Analysis: `LinearDiscriminantAnalysis`.
#     - Nearest Neighbors: `KNeighborsClassifier`.
#     - Naive Bayes: `GaussianNB`.
#     - Gradient Boosting models: `XGBClassifier`, `LGBMClassifier`, `CatBoostClassifier`.
#     - Rule-based model: `RuleFit`.
#   - **2.8** Enable experimental features if necessary (e.g., `enable_iterative_imputer`).

# - **Output:**
#   - All necessary libraries are imported and ready for use in the script.

# ---

# ## **Step 3: Parameters Configuration**

# - **Input:**
#   - None (parameters are defined within the script).

# - **Process:**
#   - **3.1 General Settings:**
#     - **3.1.1** Set random seed for reproducibility:
#       - `RANDOM_SEED = 11`.
#     - **3.1.2** Define the number of top features to select:
#       - `NOF = 5`.
#     - **3.1.3** Set the number of folds for cross-validation:
#       - `N_FOLDS = 5`.
#     - **3.1.4** Define test size for train-test split:
#       - `TEST_SIZE = 0.15` (i.e., 15% of data for testing).
#   - **3.2 Classes Selection and Mapping:**
#     - **3.2.1** Select specific classes to include:
#       - `SELECTED_CLASSES = [0, 1, 2, 3, 4, 5]` (adjust as needed).
#     - **3.2.2** Define class mapping to consolidate classes:
#       - Map classes `0`, `1`, `2`, `3` to `0` (negative class).
#       - Map classes `4`, `5` to `1` (positive class).
#       - `CLASS_MAPPING = {0: 0, 1: 0, 2: 0, 3: 0, 4: 1, 5: 1}`.
#   - **3.3 Class Selection Percentages:**
#     - **3.3.1** Define the percentage of patients to select from each class before mapping:
#       - `CLASS_SELECTION_PERCENT = {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1}` (100% selection for all classes).
#   - **3.4 Data Paths:**
#     - **3.4.1** Define the base directory for saving results:
#       - `BASE_RESULTS_DIRECTORY = r'Add the path of the folder that you would like to store the results'`.
#     - **3.4.2** Specify the file path for the input data:
#       - `FILE_PATH = r'Add a excel file with .xlsx format'`.
#     - **3.4.3** Set the sheet name for Excel files:
#       - `SHEET_NAME = 0`.
#   - **3.5 Imputation Strategy:**
#     - **3.5.1** Specify the strategy for handling missing values:
#       - `IMPUTATION_STRATEGY = 'SimpleImputer_mean'`.
#   - **3.6 Scaling Method:**
#     - **3.6.1** Specify the method for scaling features:
#       - `SCALING_METHOD = 'MinMaxScaler'`.
#   - **3.7 Feature Selectors:**
#     - **3.7.1** Define feature selection methods and their parameters in `INVOLVED_FEATURE_SELECTORS`:
#       - Examples include:
#         - 'Chi-Square Test (CST)'
#         - 'Correlation Coefficient (CC)'
#         - 'Mutual Information (MI)'
#         - 'Variance Threshold (VT)'
#         - 'ANOVA F-test (AFT)'
#         - 'Information Gain (IG)'
#         - 'Univariate Feature Selection (UFS)'
#         - 'Fisher Score (FS)'
#         - 'LASSO'
#   - **3.8 Classifiers:**
#     - **3.8.1** Define classifiers, their initial parameters, and hyperparameter grids in `INVOLVED_CLASSIFIERS`:
#       - Classifiers include:
#         - **Decision Tree Classification (DTC)**
#         - **Logistic Regression (LR)**
#         - **Linear Discriminant Analysis (LDA)**
#         - **Naive Bayes Classifier (NBC)**
#         - **K-Nearest Neighbors (KNN)**
#         - **Random Forest Classifier (RFC)**
#         - **Support Vector Machine (SVM)**
#         - **XGBoost Classifier**
#         - **LightGBM Classifier**
#         - **CatBoost Classifier**
#         - **Stacking Classifier**
#         - **MLP Classifier (MLP)**
#         - **RuleFit Classifier (RUC)**
#       - Each classifier has:
#         - A model class.
#         - Initial parameters (`params`).
#         - A hyperparameter grid (`param_grid`) for tuning.
#   - **3.9 Grid Search Configuration:**
#     - **3.9.1** Set the grid search mode:
#       - `GRID_SEARCH_MODE = 'randomized'`.
#     - **3.9.2** Define the number of iterations for randomized search:
#       - `GRID_SEARCH_ITER = 5`.

# - **Output:**
#   - All configuration parameters are set and ready for use in the analysis.

# ---

# ## **Step 4: Data Reading, Shuffling, Class Selection, and Mapping**

# - **Input:**
#   - The data file specified by `FILE_PATH` (supports `.xlsx`, `.xls`, `.csv` formats).

# - **Process:**
#   - **4.1 Data Reading:**
#     - **4.1.1** Read the data file using `pandas` based on the file extension.
#       - If the file is Excel (`.xlsx`, `.xls`), use `pd.read_excel(FILE_PATH, sheet_name=SHEET_NAME)`.
#       - If the file is CSV (`.csv`), use `pd.read_csv(FILE_PATH)`.
#     - **4.1.2** Extract components from the data:
#       - `Patient_ID`: The first column.
#       - `Data`: All columns from the second to the second-to-last.
#       - `Outcome`: The last column.
#   - **4.2 Diagnostics Before Mapping:**
#     - **4.2.1** Print unique values in `Outcome` before filtering and mapping.
#     - **4.2.2** Check and print the number of missing values in `Outcome`.
#   - **4.3 Data Filtering and Shuffling:**
#     - **4.3.1** Filter the dataset to include only `SELECTED_CLASSES`.
#       - Create a mask and apply it to the data.
#     - **4.3.2** Shuffle the filtered data if necessary to ensure randomness.
#     - **4.3.3** Update `Patient_ID`, `Data`, and `Outcome` after filtering and shuffling.
#   - **4.4 Class Selection by Percentage:**
#     - **4.4.1** Define a function `select_percentage_per_class` to select a specified percentage of patients from each class.
#     - **4.4.2** Apply the function to select data as per `CLASS_SELECTION_PERCENT`.
#     - **4.4.3** Update `Patient_ID`, `Data`, and `Outcome` after selection.
#     - **4.4.4** Verify the selection by printing class distribution after selection.
#   - **4.5 Class Mapping:**
#     - **4.5.1** Map original classes to new classes using `CLASS_MAPPING`.
#       - Use `Outcome_mapped = Outcome.map(CLASS_MAPPING)`.
#     - **4.5.2** Ensure no missing values after mapping.
#       - If missing values are found (unmapped classes), raise a `ValueError`.
#     - **4.5.3** Update `Outcome` with the mapped classes.
#   - **4.6 Feature Separation:**
#     - **4.6.1** Identify numeric columns using `Data.select_dtypes(include=[np.number])`.
#     - **4.6.2** Identify non-numeric columns using `Data.select_dtypes(exclude=[np.number])`.
#     - **4.6.3** Separate numeric and non-numeric data.

# - **Output:**
#   - Prepared `Patient_ID`, `Data`, and `Outcome` variables for further processing.

# ---

# ## **Step 5: Preprocessing**

# - **Input:**
#   - `Data`, `Outcome`.

# - **Process:**
#   - **5.1 Stratified Splitting:**
#     - **5.1.1** Perform a stratified train-test split using `train_test_split` to maintain class distribution.
#       - Split data into:
#         - `X_train_num`, `X_test_num`: Numeric features.
#         - `y_train`, `y_test`: Target variable.
#         - `Patient_ID_train`, `Patient_ID_test`: Patient identifiers.
#       - Parameters:
#         - `test_size=TEST_SIZE`
#         - `random_state=RANDOM_SEED`
#         - `stratify=Outcome`
#   - **5.2 Imputation:**
#     - **5.2.1** Initialize the imputer based on `IMPUTATION_STRATEGY`.
#       - For `'SimpleImputer_mean'`, use `SimpleImputer(strategy='mean')`.
#     - **5.2.2** Fit the imputer on `X_train_num` and transform both training and testing data:
#       - `X_train_num_imputed = imputer.fit_transform(X_train_num)`
#       - `X_test_num_imputed = imputer.transform(X_test_num)`
#   - **5.3 Scaling:**
#     - **5.3.1** Initialize the scaler based on `SCALING_METHOD`.
#       - For `'MinMaxScaler'`, use `MinMaxScaler()`.
#     - **5.3.2** Fit the scaler on the imputed training data and transform both training and testing data:
#       - `X_train_scaled = scaler.fit_transform(X_train_num_imputed)`
#       - `X_test_scaled = scaler.transform(X_test_num_imputed)`

# - **Output:**
#   - `X_train_scaled` and `X_test_scaled`: Preprocessed feature matrices ready for feature selection and modeling.

# ---

# ## **Step 6: Feature Selection**

# - **Input:**
#   - `X_train_scaled`, `y_train`, feature selectors, `NOF`.

# - **Process:**
#   - **6.1 Define Feature Selector Functions:**
#     - Implement functions for each feature selection method:
#       - **6.1.1** `apply_correlation_coefficient`:
#         - Calculate the absolute correlation between each feature and the target.
#         - Select the top `NOF` features with the highest correlation.
#       - **6.1.2** `apply_chi_square`:
#         - Use `SelectKBest` with `score_func=chi2` to select top `NOF` features.
#       - **6.1.3** `apply_mutual_information`:
#         - Use `SelectKBest` with `score_func=mutual_info_classif` to select top `NOF` features.
#       - **6.1.4** `apply_variance_threshold`:
#         - Calculate variance of each feature and select top `NOF` features with highest variance.
#       - **6.1.5** `apply_anova_f_test`:
#         - Use `SelectKBest` with `score_func=f_classif` to select top `NOF` features.
#       - **6.1.6** `apply_information_gain`:
#         - Equivalent to mutual information; use `apply_mutual_information`.
#       - **6.1.7** `apply_univariate_feature_selection`:
#         - Use `SelectKBest` with a specified `score_func` to select top `NOF` features.
#       - **6.1.8** `apply_fisher_score`:
#         - Manually compute Fisher Scores and select top `NOF` features.
#       - **6.1.9** `apply_lasso`:
#         - Use `Lasso` regression to select features with non-zero coefficients.
#         - Adjust for cases where fewer than `NOF` features are selected.
#   - **6.2 Map Functions:**
#     - **6.2.1** Create a dictionary `feature_selector_functions` mapping function names to the actual functions.

# - **Output:**
#   - Feature selector functions are ready to be applied.

# ---

# ## **Step 7: Applying Feature Selection and Classifiers**

# - **Input:**
#   - Preprocessed data (`X_train_scaled`, `X_test_scaled`), `y_train`, `y_test`, `Patient_ID_train`, `Patient_ID_test`, feature selectors, classifiers, grid search configuration.

# - **Process:**
#   - **7.1 Initialize Storage Structures:**
#     - **7.1.1** Initialize lists and dictionaries to store results, selected features, confusion matrices, and best hyperparameters.
#     - **7.1.2** Create a timestamped `results_directory` to store outputs.
#     - **7.1.3** Create subdirectories:
#       - `Predicted_Outcome/Fivefold Cross Validation`
#       - `Predicted_Outcome/External Testing`
#       - `Tuning_Hyperparameters`
#   - **7.2 Initialize Stratified K-Fold:**
#     - **7.2.1** Set up `StratifiedKFold` with `n_splits=N_FOLDS`, `shuffle=True`, `random_state=RANDOM_SEED`.
#   - **7.3 Iterate Over Feature Selectors:**
#     - For each feature selector in `feature_selectors`:
#       - **7.3.1 Apply Feature Selection:**
#         - **7.3.1.1** Retrieve the function and parameters from `INVOLVED_FEATURE_SELECTORS`.
#         - **7.3.1.2** Apply the feature selection function to `X_train_scaled` and `y_train`.
#         - **7.3.1.3** Obtain selected feature indices and names.
#         - **7.3.1.4** Apply the same feature selection to `X_test_scaled`.
#         - **7.3.1.5** Store selected features for later reference.
#       - **7.3.2 Iterate Over Classifiers:**
#         - For each classifier in `classifiers`:
#           - **7.3.2.1 Hyperparameter Tuning:**
#             - **7.3.2.1.1** Retrieve the model class, initial parameters, and hyperparameter grid from `INVOLVED_CLASSIFIERS`.
#             - **7.3.2.1.2** Initialize the classifier with initial parameters.
#             - **7.3.2.1.3** Use `RandomizedSearchCV` or `GridSearchCV` for hyperparameter tuning:
#               - If `GRID_SEARCH_MODE == 'randomized'`, use `RandomizedSearchCV` with `n_iter=GRID_SEARCH_ITER`.
#               - Handle cases where `param_grid` is a list (switch to `GridSearchCV`).
#             - **7.3.2.1.4** Fit the search object on `X_train_selected` and `y_train`.
#             - **7.3.2.1.5** Retrieve and store the best hyperparameters.
#           - **7.3.2.2 K-Fold Cross-Validation:**
#             - **7.3.2.2.1** Initialize metrics storage for each fold.
#             - **7.3.2.2.2** For each fold in `StratifiedKFold`:
#               - **7.3.2.2.2.1** Split `X_train_selected` and `y_train` into training and validation sets.
#               - **7.3.2.2.2.2** Instantiate a new classifier with the best hyperparameters.
#               - **7.3.2.2.2.3** Train the classifier on the training fold.
#               - **7.3.2.2.2.4** Predict on the validation fold.
#               - **7.3.2.2.2.5** Compute evaluation metrics: Accuracy, Precision, Recall, F1-Score.
#               - **7.3.2.2.2.6** Store predictions, true labels, patient IDs, and fold numbers.
#               - **7.3.2.2.2.7** Predict on the external test set.
#               - **7.3.2.2.2.8** Compute test metrics and store predictions.
#               - **7.3.2.2.2.9** Compute and store confusion matrices for both validation and test sets.
#             - **7.3.2.2.3** Aggregate metrics across folds and compute means and standard deviations.
#             - **7.3.2.2.4** Save cross-validation and test predictions to CSV files in their respective directories.
#           - **7.3.2.3 Save Best Hyperparameters:**
#             - **7.3.2.3.1** Save the best hyperparameters for the classifier to a CSV file in `Tuning_Hyperparameters`.
#       - **7.3.3 Save Selected Features:**
#         - **7.3.3.1** Save the list of selected features for the feature selector to `selected_features.csv`.

# - **Output:**
#   - Results including evaluation metrics, selected features, confusion matrices, and best hyperparameters are collected and ready for saving.

# ---

# ## **Step 8: Saving and Aggregating Results**

# - **Input:**
#   - Collected results from Step 7.

# - **Process:**
#   - **8.1 Aggregate Metrics:**
#     - **8.1.1** Concatenate results from all classifiers and feature selectors into a single DataFrame (`results_df`).
#     - **8.1.2** Compute average metrics (mean and standard deviation) grouped by `Feature Selector` and `Classifier`:
#       - Metrics include:
#         - Validation and Test Accuracy
#         - Precision
#         - Recall
#         - F1-Score
#   - **8.2 Save Metrics:**
#     - **8.2.1** Save detailed evaluation metrics to `evaluation_metrics.csv` in `results_directory`.
#     - **8.2.2** Save average metrics to `average_metrics.csv`.
#     - **8.2.3** Extract and save standard deviations to `STD_metrics.csv`.
#   - **8.3 Save Confusion Matrices:**
#     - **8.3.1** Save all confusion matrices to `confusion_matrices.csv`.
#   - **8.4 Copy Original Data File:**
#     - **8.4.1** Copy the original data file to the `results_directory` for reference.
#   - **8.5 Save Workflow and Code:**
#     - **8.5.1** Save the workflow description as `Workflow_Ver18.txt`.
#     - **8.5.2** Save the script code as `Code_Ver18.py`:
#       - Use `inspect.getsource` to retrieve the code.
#       - Handle exceptions if `inspect` cannot retrieve the code.
#   - **8.6 Save Best Hyperparameters:**
#     - **8.6.1** Combine best hyperparameters from all classifiers and feature selectors.
#     - **8.6.2** Save to `best_parameters.csv` in `Tuning_Hyperparameters`.

# - **Output:**
#   - All results are saved in the organized `results_directory`.

# ---

# ## **Step 9: Final Output Messages**

# - **Input:**
#   - None.

# - **Process:**
#   - **9.1** Print messages summarizing the completion of the script and locations of saved files:
#     - Results files (evaluation metrics, average metrics, standard deviations).
#     - Selected features.
#     - Confusion matrices.
#     - Predictions from cross-validation and testing.
#     - Best hyperparameters.
#     - Original data file.
#     - Workflow description and code.
#   - **9.2** If no results are available, inform the user to check data and parameters.

# - **Output:**
#   - User is informed about the successful completion and where to find all outputs.

# # ======================= #
# #    End of Classification Codes      #
# # ======================= #


In [None]:
# ======================= #
#      Start of Classification Codes         #
# ======================= #

# ----------------------- #
# 1. Automated Package Installation
# ----------------------- #
import sys
import subprocess

def install(package):
    """Install ``package`` with pip, using the interpreter running this script.

    Parameters
    ----------
    package : str
        Name passed verbatim to ``pip install``.

    Raises
    ------
    subprocess.CalledProcessError
        If the pip process exits with a non-zero status.
    """
    # Going through `sys.executable -m pip` targets the same environment
    # that is executing this code.
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# List of required packages (pip distribution names)
required_packages = [
    'skrebate',
    'rulefit',
    'scikit-learn',  # imported as `sklearn` (see IMPORT_NAME_OVERRIDES)
    'pandas',
    'numpy',
    'matplotlib',
    'seaborn',
    'xgboost',       # XGBoost
    'lightgbm',      # LightGBM
    'catboost',      # CatBoost
]

# pip distribution names whose importable module name differs from the
# distribution name. Without this, `__import__('scikit-learn')` always fails
# and pip is invoked on every run even when scikit-learn is already installed.
IMPORT_NAME_OVERRIDES = {
    'scikit-learn': 'sklearn',
}

# Install missing packages: probe each one by importing it, and fall back to
# pip only when the import fails.
for package in required_packages:
    module_name = IMPORT_NAME_OVERRIDES.get(package, package)
    try:
        __import__(module_name)
    except ImportError:
        print(f"Installing package: {package}")
        install(package)

# ----------------------- #
# 2. Importing Modules
# ----------------------- #
import os
import shutil
import inspect  # For getting the source code
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt  # For plotting
import seaborn as sns  # For plotting
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier  # Importing XGBoost
from lightgbm import LGBMClassifier  # Importing LightGBM
from catboost import CatBoostClassifier  # Importing CatBoost
from sklearn.ensemble import RandomForestClassifier, StackingClassifier  # Random Forest and Stacking
from sklearn.svm import SVC  # Support Vector Machine
from sklearn.neural_network import MLPClassifier  # Importing MLPClassifier

# Enable IterativeImputer before importing it
from sklearn.experimental import enable_iterative_imputer  # Enable the experimental feature
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer  # Now import IterativeImputer
from sklearn.preprocessing import (
    MinMaxScaler, StandardScaler, RobustScaler, Normalizer, MaxAbsScaler
)
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from rulefit import RuleFit
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

# ----------------------- #
# 3. Parameters Configuration
# ----------------------- #

### PARAMETERS ###

# --------------------- #
# General Settings
# --------------------- #
# Random seed for reproducibility (also the default seed of
# select_percentage_per_class and the seed given to the seeded classifiers)
RANDOM_SEED = 11

# Number of top features to select
NOF = 5

# Number of folds for cross-validation
N_FOLDS = 5

# Fraction of the data held out by the train-test split
TEST_SIZE = 0.15   # Train size = 1 - TEST_SIZE

# Classes Selection and Mapping
# Original outcome labels to keep; rows with any other label are filtered out
SELECTED_CLASSES = [0, 1, 2, 3, 4, 5]  # Adjust as needed

# Define class mapping (original label -> consolidated label):
# classes 0, 1, 2 and 3 map to 0, and classes 4 & 5 map to 1.
# Every label that survives filtering must have an entry here; the mapping
# step raises ValueError for unmapped labels.
CLASS_MAPPING = {
    0: 0,
    1: 0,
    2: 0,
    3: 0,  # Example: You might want to adjust based on your specific needs
    4: 1,
    5: 1
}
print("Selected Classes:", SELECTED_CLASSES)
print("CLASS_MAPPING:", CLASS_MAPPING)

# --------------------- #
# Class Selection Percentages
# --------------------- #
# Define the fraction of patients to select from each ORIGINAL class
# (i.e. before CLASS_MAPPING is applied).
# Values should be between 0 and 1 (1 means 100%).
# Classes missing from this dict are not selected at all.
CLASS_SELECTION_PERCENT = {
    0: 1,    # Select 100% of patients from class 0
    1: 1,    # Select 100% of patients from class 1
    2: 1,    # Select 100% of patients from class 2
    3: 1, # Select 100% of patients from class 3
    4: 1, # Select 100% of patients from class 4
    5: 1  # Select 100% of patients from class 5
}

print("Class Selection Percentages:", CLASS_SELECTION_PERCENT)

# --------------------- #
# Data Paths
# --------------------- #
# Base directory for saving all results.
# The value below is a placeholder and must be replaced with a real folder path.
BASE_RESULTS_DIRECTORY = r'Add the path of the folder that you would like to store the results'  # Saving path directory

# File path for input data.
# The value below is a placeholder and must be replaced with a real file path;
# the reading step only accepts paths ending in .xlsx, .xls or .csv.
# Expected layout: first column = patient ID, last column = true outcome,
# everything in between = features.
FILE_PATH = r'Add a excel file with .xlsx formt'  # The first column is patient ID and the last column is true outcomes

# Sheet name (if applicable)
SHEET_NAME = 0  # Set to the sheet name or index if reading from an Excel file

# --------------------- #
# Imputation Strategy
# --------------------- #
# Define the imputation strategy for handling missing values.
# Options:
# - 'SimpleImputer_mean'
# - 'SimpleImputer_median'
# - 'SimpleImputer_most_frequent'
# - 'SimpleImputer_constant'
# - 'KNNImputer'
# - 'IterativeImputer'
IMPUTATION_STRATEGY = 'SimpleImputer_mean'  # Adjust as needed

# --------------------- #
# Scaling Method
# --------------------- #
# Define the scaling method for preprocessing.
# Options:
# - 'MinMaxScaler'
# - 'StandardScaler'
# - 'RobustScaler'
# - 'Normalizer'
# - 'MaxAbsScaler'
# - None (No scaling)
SCALING_METHOD = 'MinMaxScaler'  # Adjust as needed

# --------------------- #
# Feature Selectors
# --------------------- #
# Define all feature selectors to be included.
# Each key is a descriptive name used to refer to the selector; each value holds:
#   'function': the NAME (a string) of the selector function to call, and
#   'params':   extra keyword parameters for that function (may be empty).
# NOTE(review): the selector functions themselves are defined later in the
# script; how 'params' is passed to them should be confirmed there.
INVOLVED_FEATURE_SELECTORS = {
    'Chi-Square Test (CST)': {
        'function': 'apply_chi_square',
        'params': {'score_func': chi2},
    },
    'Correlation Coefficient (CC)': {
        'function': 'apply_correlation_coefficient',
        'params': {},
    },
    'Mutual Information (MI)': {
        'function': 'apply_mutual_information',
        'params': {},
    },
    'Variance Threshold (VT)': {
        'function': 'apply_variance_threshold',
        'params': {},
    },
    'ANOVA F-test (AFT)': {
        'function': 'apply_anova_f_test',
        'params': {},
    },
    'Information Gain (IG)': {
        'function': 'apply_information_gain',
        'params': {},
    },
    'Univariate Feature Selection (UFS)': {
        'function': 'apply_univariate_feature_selection',
        'params': {'score_func': mutual_info_classif},  # Example using MI
    },
    'Fisher Score (FS)': {
        'function': 'apply_fisher_score',
        'params': {},
    },
    'LASSO': {
        'function': 'apply_lasso',
        'params': {'alpha': 0.01},  # Adjust alpha as needed
    },
}

# --------------------- #
# Classifiers
# --------------------- #
# Define all classifiers to be involved.
# Each key is a descriptive name; each value holds:
#   'model':      the estimator CLASS (not an instance),
#   'params':     initial constructor keyword arguments,
#   'param_grid': the hyperparameter search space (a dict, or a list of dicts).
# Estimators that accept a seed are given RANDOM_SEED for reproducibility.
INVOLVED_CLASSIFIERS = {
    'Decision Tree Classification (DTC)': {
        'model': DecisionTreeClassifier,
        'params': {'random_state': RANDOM_SEED},
        'param_grid': {
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'criterion': ['gini', 'entropy']
        }
    },
    'Logistic Regression (LR)': {
        'model': LogisticRegression,
        'params': {'max_iter': 1000, 'random_state': RANDOM_SEED},
        'param_grid': {
            'C': [0.01, 0.1, 1, 10, 100],
            'penalty': ['l2'],
            'solver': ['lbfgs', 'saga']
        }
    },
    'Linear Discriminant Analysis (LDA)': {
        'model': LinearDiscriminantAnalysis,
        'params': {},
        # A list of grids (rather than one dict) so that 'shrinkage' is only
        # ever combined with the 'lsqr' and 'eigen' solvers, never with 'svd'.
        'param_grid': [
            {'solver': ['svd']},
            {'solver': ['lsqr', 'eigen'], 'shrinkage': [None, 'auto']}
        ]
    },
    'Naive Bayes Classifier (NBC)': {
        'model': GaussianNB,
        'params': {},
        'param_grid': {
            'var_smoothing': [1e-09, 1e-08, 1e-07]
        }
    },
    'K-Nearest Neighbors (KNN)': {
        'model': KNeighborsClassifier,
        'params': {'n_neighbors': 5},
        'param_grid': {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan', 'minkowski']
        }
    },
    'Random Forest Classifier (RFC)': {
        'model': RandomForestClassifier,
        'params': {'random_state': RANDOM_SEED},
        'param_grid': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    },
    'Support Vector Machine (SVM)': {
        'model': SVC,
        'params': {'random_state': RANDOM_SEED},
        'param_grid': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf', 'poly'],
            'gamma': ['scale', 'auto']
        }
    },
    'XGBoost Classifier': {
        'model': XGBClassifier,
        # NOTE(review): 'use_label_encoder' may be deprecated or ignored in
        # recent xgboost releases - confirm against the installed version.
        'params': {'random_state': RANDOM_SEED, 'use_label_encoder': False, 'eval_metric': 'logloss'},
        'param_grid': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7]
        }
    },
    'LightGBM Classifier': {
        'model': LGBMClassifier,
        'params': {'random_state': RANDOM_SEED},
        'param_grid': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [-1, 10, 20]
        }
    },
    'CatBoost Classifier': {
        'model': CatBoostClassifier,
        'params': {'random_state': RANDOM_SEED, 'verbose': 0},  # Silent mode for CatBoost
        'param_grid': {
            'iterations': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'depth': [3, 5, 7]
        }
    },
    'Stacking Classifier': {
        'model': StackingClassifier,
        'params': {
            # Base learners are pre-built instances (unlike 'model', which is a class).
            'estimators': [
                ('rf', RandomForestClassifier(random_state=RANDOM_SEED)),
                ('svc', SVC(probability=True, random_state=RANDOM_SEED))
            ],
            'final_estimator': LogisticRegression(),
        },
        # Only the meta-learner's regularisation strength is tuned.
        'param_grid': {
            'final_estimator__C': [0.1, 1, 10]
        }
    },
    # --------------------- #
    # MLP Classifier (MLP)
    # --------------------- #
    'MLP Classifier (MLP)': {
        'model': MLPClassifier,
        'params': {'random_state': RANDOM_SEED},
        'param_grid': {
            'hidden_layer_sizes': [(100,), (50, 50), (100, 50, 25)],
            'activation': ['relu', 'tanh', 'logistic'],
            'solver': ['adam', 'sgd'],
            'alpha': [0.0001, 0.001, 0.01],
            'learning_rate': ['constant', 'adaptive'],
        }
    },
    # --------------------- #
    # RuleFit Classifier (RUC)
    # --------------------- #
    'RuleFit Classifier (RUC)': {
        'model': RuleFit,  # Use RuleFit directly without calibration
        'params': {
            'random_state': RANDOM_SEED,
            'max_rules': 2000,
            'memory_par': 0.01,
            'tree_size': 4,
            'lin_trim_quantile': 0.025,
            'lin_standardise': True,
            'rfmode': 'classify',
            # 'n_jobs' and other unsupported parameters are intentionally omitted
        },
        'param_grid': {
            'max_rules': [1000, 2000],
            'tree_size': [3, 4, 5],
            'memory_par': [0.01, 0.05],
            'lin_trim_quantile': [0.025, 0.05],
            'lin_standardise': [True, False]
        }
    },
}

# --------------------- #
# Define the available feature selectors and classifiers with labels
# --------------------- #
# Names of the feature selectors to use. Each entry must be a key of
# INVOLVED_FEATURE_SELECTORS; the trailing FSA<n> tags are short labels.
feature_selectors = [
    "Chi-Square Test (CST)",  # FSA1
    "Correlation Coefficient (CC)",  # FSA2
    "Mutual Information (MI)",  # FSA3
    "Variance Threshold (VT)",  # FSA4
    "ANOVA F-test (AFT)",  # FSA5
    "Information Gain (IG)",  # FSA6
    "Univariate Feature Selection (UFS)",  # FSA7
    "Fisher Score (FS)", # FSA8
    "LASSO"  # FSA9
]

# Names of the classifiers to use. Each entry must be a key of
# INVOLVED_CLASSIFIERS; the trailing C<n> tags are short labels.
classifiers = [
    "Decision Tree Classification (DTC)",  # C1
    "Logistic Regression (LR)",  # C2
    "Linear Discriminant Analysis (LDA)",  # C3
    "Naive Bayes Classifier (NBC)",  # C4
    "K-Nearest Neighbors (KNN)",  # C5
    "Random Forest Classifier (RFC)",  # C6
    "Support Vector Machine (SVM)",  # C7
    "XGBoost Classifier",  # C8
    "LightGBM Classifier",  # C9
    "CatBoost Classifier",  # C10
    "Stacking Classifier",  # C11
    "MLP Classifier (MLP)",  # C12
    "RuleFit Classifier (RUC)" # C13
]

# --------------------- #
# Grid Search Configuration
# --------------------- #
# Options for GRID_SEARCH_MODE: 'exhaustive' or 'randomized'
GRID_SEARCH_MODE = 'randomized'  # Currently using randomized search
GRID_SEARCH_ITER = 5  # Number of parameter settings sampled in RandomizedSearchCV

### END OF PARAMETERS ###

# ----------------------- #
# 4. Data Reading, Shuffling, Class Selection, and Mapping
# ----------------------- #

# Read the data file, choosing the pandas reader from the file extension.
# The extension is compared case-insensitively so that e.g. 'DATA.XLSX' or
# 'data.CSV' is accepted as well.
if FILE_PATH.lower().endswith(('.xlsx', '.xls')):
    Org_Data = pd.read_excel(FILE_PATH, sheet_name=SHEET_NAME)
elif FILE_PATH.lower().endswith('.csv'):
    Org_Data = pd.read_csv(FILE_PATH)
else:
    raise ValueError("Unsupported file format. Please provide a .xlsx, .xls, or .csv file.")

# Column layout of Org_Data: first column = patient ID, last column = outcome,
# everything in between = features.
# Only the outcome column is needed before filtering (for diagnostics and for
# building the class mask).
Outcome = Org_Data.iloc[:, -1]

# **Diagnostics Start**
print("Unique values in Outcome before filtering and mapping:", Outcome.unique())
print("Number of missing values in Outcome:", Outcome.isnull().sum())

# Filter the dataset to include only SELECTED_CLASSES
# (rows with a missing outcome are dropped here too, since NaN is not in the list)
mask = Outcome.isin(SELECTED_CLASSES)
filtered_data = Org_Data[mask].copy()

# No shuffling is performed at this point: the per-class selection below draws
# rows with DataFrame.sample, which already returns them in seeded random order.

# Derive Patient_ID, Data, and Outcome from the filtered rows
Patient_ID = filtered_data.iloc[:, 0]
Data = filtered_data.iloc[:, 1:-1]   # Data (from second to second-to-last column)
Outcome = filtered_data.iloc[:, -1]  # Outcome (last column)

print(f"After filtering, number of samples: {len(Outcome)}")
print("Unique values in Outcome after filtering (before mapping):", Outcome.unique())

# Identify values that passed the filter but have no entry in CLASS_MAPPING.
# This is non-empty only when SELECTED_CLASSES contains a class that
# CLASS_MAPPING does not cover.
expected_classes = list(CLASS_MAPPING.keys())
unexpected_values = Outcome[~Outcome.isin(expected_classes)].unique()
print("Unexpected Outcome values not in CLASS_MAPPING after filtering:", unexpected_values)
# **Diagnostics End**

### CLASS SELECTION BY PERCENTAGE ###
# Select a specific percentage of patients from each class
def select_percentage_per_class(df, class_column, selection_percent, random_state=RANDOM_SEED):
    """Randomly sample a given fraction of the rows of each class.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to sample from.
    class_column : hashable
        Label of the column in ``df`` that holds the class of each row.
    selection_percent : dict
        Maps class label -> fraction of that class's rows to keep
        (expected to lie between 0 and 1). Classes that are not keys of this
        dict are not sampled at all.
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` for every class.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, grouped class by class in the iteration order of
        ``selection_percent``. If nothing is selected, an empty frame with the
        same columns as ``df`` is returned.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when a fraction above 1 requests more
        rows than the class contains.
    """
    # Collect the per-class samples and concatenate once at the end. Growing a
    # frame with pd.concat inside the loop copies the data repeatedly, and
    # seeding it with an empty DataFrame() relies on pandas' deprecated
    # handling of empty frames in concat.
    sampled_parts = []
    for cls, percent in selection_percent.items():
        cls_df = df[df[class_column] == cls]
        n_samples = int(len(cls_df) * percent)
        if n_samples == 0 and len(cls_df) > 0:
            n_samples = 1  # Ensure at least one sample is selected if possible
        if n_samples > 0:
            sampled_parts.append(cls_df.sample(n=n_samples, random_state=random_state))

    if not sampled_parts:
        # Keep the column schema so positional column access still works.
        return df.iloc[0:0].copy()
    return pd.concat(sampled_parts, axis=0)

# Apply selection: keep the configured fraction of rows of every class.
selected_data = select_percentage_per_class(
    filtered_data,
    class_column=Outcome.name,
    selection_percent=CLASS_SELECTION_PERCENT,
    random_state=RANDOM_SEED
)
print(f"After class-wise selection, number of samples: {len(selected_data)}")

# Update Patient_ID, Data, and Outcome after selection
Patient_ID = selected_data.iloc[:, 0]
Data = selected_data.iloc[:, 1:-1]
Outcome = selected_data.iloc[:, -1]

# Verify the selection
print("Class distribution after selection:")
print(Outcome.value_counts(normalize=True))

### CLASS MAPPING ###
# Map original classes to new classes as per CLASS_MAPPING.
# Labels without a CLASS_MAPPING entry become NaN and are rejected below.
Outcome_mapped = Outcome.map(CLASS_MAPPING)

# Ensure no missing values after mapping
if Outcome_mapped.isnull().any():
    raise ValueError("Some classes in Outcome do not have a mapping.")

# Update Outcome variable
Outcome = Outcome_mapped

# Separate numeric and non-numeric columns
numeric_columns = Data.select_dtypes(include=[np.number]).columns
non_numeric_columns = Data.select_dtypes(exclude=[np.number]).columns

numeric_data = Data[numeric_columns]
# Hold the non-numeric *data* here (mirroring numeric_data); the column
# labels are already available as non_numeric_columns.
non_numeric_data = Data[non_numeric_columns]

# ----------------------- #
# 5. Preprocessing
# ----------------------- #

# Stratified train-test split for external testing.
# Patient_ID is split alongside so that predictions can be traced back to patients.
X_train_num, X_test_num, y_train, y_test, Patient_ID_train, Patient_ID_test = train_test_split(
    numeric_data,
    Outcome,
    Patient_ID,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=Outcome,
)

# Impute only numeric data.
# IMPUTATION_STRATEGY is either 'SimpleImputer_<strategy>', 'KNNImputer' or
# 'IterativeImputer'.
if IMPUTATION_STRATEGY.startswith('SimpleImputer'):
    # Split on the FIRST underscore only, so that strategies which contain an
    # underscore themselves (e.g. 'most_frequent') are kept intact.
    strategy = IMPUTATION_STRATEGY.partition('_')[2]
    if not strategy:
        raise ValueError(f"Unsupported imputation strategy: {IMPUTATION_STRATEGY}")
    if strategy == 'constant':
        imputer = SimpleImputer(strategy=strategy, fill_value=0)  # Example fill value
    else:
        imputer = SimpleImputer(strategy=strategy)
elif IMPUTATION_STRATEGY == 'KNNImputer':
    imputer = KNNImputer()
elif IMPUTATION_STRATEGY == 'IterativeImputer':
    imputer = IterativeImputer(random_state=RANDOM_SEED)
else:
    raise ValueError(f"Unsupported imputation strategy: {IMPUTATION_STRATEGY}")

# Fit on the training split only; the test split is transformed with the
# statistics learned from training data to avoid leakage.
X_train_num_imputed = imputer.fit_transform(X_train_num)
X_test_num_imputed = imputer.transform(X_test_num)

# Scale only numeric data. SCALING_METHOD is the scaler's class name, or None
# to skip scaling.
_scaler_classes = {
    'MinMaxScaler': MinMaxScaler,
    'StandardScaler': StandardScaler,
    'RobustScaler': RobustScaler,
    'Normalizer': Normalizer,
    'MaxAbsScaler': MaxAbsScaler,
}
if SCALING_METHOD is None:
    scaler = None
elif SCALING_METHOD in _scaler_classes:
    scaler = _scaler_classes[SCALING_METHOD]()
else:
    raise ValueError(f"Unsupported scaling method: {SCALING_METHOD}")

if scaler is not None:
    X_train_scaled = scaler.fit_transform(X_train_num_imputed)
    X_test_scaled = scaler.transform(X_test_num_imputed)
else:
    X_train_scaled = X_train_num_imputed
    X_test_scaled = X_test_num_imputed

# ----------------------- #
# 6. Feature Selection
# ----------------------- #

# Define feature selector functions

# --------------------- #1 apply_correlation_coefficient# ---------------------
def apply_correlation_coefficient(X_train, y_train, NOF, **kwargs):
    """Select the NOF features with the largest absolute Pearson correlation to the target.

    Parameters
    ----------
    X_train : numpy.ndarray
        2-D feature matrix, one column per feature.
    y_train : array-like
        Target values (pandas Series, list or ndarray); flattened to 1-D.
    NOF : int
        Number of features to keep.
    **kwargs
        Accepted for signature compatibility with the other selectors; unused.

    Returns
    -------
    tuple
        ``(X_train[:, top_k_idx], top_k_idx)`` where ``top_k_idx`` lists the
        selected column indices in ascending order of absolute correlation.
    """
    # np.asarray works for Series, lists and arrays alike and avoids relying
    # on Series.ravel(), which recent pandas versions deprecate.
    y_train_np = np.asarray(y_train).ravel()

    n_features = X_train.shape[1]
    corrs = np.zeros(n_features)
    # A constant column has zero variance, which makes corrcoef divide by
    # zero and return NaN; such features keep a score of 0. The expected
    # floating-point warnings are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        for i in range(n_features):
            corr = np.corrcoef(X_train[:, i], y_train_np)[0, 1]
            if not np.isnan(corr):
                corrs[i] = abs(corr)

    top_k_idx = np.argsort(corrs)[-NOF:]
    X_train_corr = X_train[:, top_k_idx]
    return X_train_corr, top_k_idx

# --------------------- #2 apply_chi_square# ---------------------
def apply_chi_square(X_train, y_train, NOF, **kwargs):
    """Keep the NOF best features according to a univariate score (chi-squared by default).

    An alternative scoring function can be supplied through the
    ``score_func`` keyword. Returns the reduced training matrix together
    with the fitted ``SelectKBest`` object, which is later used to transform
    the test set.

    NOTE(review): scikit-learn's ``chi2`` rejects negative feature values, so
    this selector presumably has to be paired with a scaler that produces
    non-negative output - confirm against the configured SCALING_METHOD.
    """
    scorer = kwargs.get('score_func', chi2)
    selector = SelectKBest(score_func=scorer, k=NOF).fit(X_train, y_train)
    return selector.transform(X_train), selector

# --------------------- #3 apply_mutual_information# ---------------------
def apply_mutual_information(X_train, y_train, NOF, **kwargs):
    """Keep the NOF features with the highest mutual information with the target.

    ``mutual_info_classif`` adds small random noise to continuous features,
    so the estimator is seeded to make the ranking reproducible between runs.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training labels.
    NOF : int
        Number of features to keep.
    **kwargs
        ``random_state`` (optional) overrides the default seed ``RANDOM_SEED``.

    Returns
    -------
    tuple
        ``(reduced training matrix, fitted SelectKBest)``.
    """
    from functools import partial

    seeded_score = partial(
        mutual_info_classif,
        random_state=kwargs.get('random_state', RANDOM_SEED),
    )
    mi_selector = SelectKBest(score_func=seeded_score, k=NOF)
    X_train_mi = mi_selector.fit_transform(X_train, y_train)
    return X_train_mi, mi_selector

# --------------------- #4 apply_variance_threshold# ---------------------
def apply_variance_threshold(X_train, y_train, NOF, **kwargs):
    """Keep the NOF columns of ``X_train`` that have the largest variance.

    No threshold is applied: the columns are ranked by their population
    variance and the NOF highest-ranked ones are kept. ``y_train`` and
    ``kwargs`` are accepted for signature compatibility and are not used.

    Returns ``(X_train[:, top_k_idx], top_k_idx)``; the indices are ordered
    by ascending variance.
    """
    ranking = np.argsort(np.var(X_train, axis=0))
    top_k_idx = ranking[-NOF:]
    return X_train[:, top_k_idx], top_k_idx

# --------------------- #5 apply_anova_f_test# ---------------------
def apply_anova_f_test(X_train, y_train, NOF, **kwargs):
    """Keep the NOF features with the highest ANOVA F-score (``f_classif``).

    Returns the reduced training matrix and the fitted ``SelectKBest``
    object. ``kwargs`` is accepted for signature compatibility and ignored.
    """
    selector = SelectKBest(score_func=f_classif, k=NOF).fit(X_train, y_train)
    return selector.transform(X_train), selector

# --------------------- #6 apply_information_gain# ---------------------
def apply_information_gain(X_train, y_train, NOF, **kwargs):
    """Keep the NOF features with the highest information gain.

    Information Gain is equivalent to Mutual Information for classification
    tasks, so the ranking is computed with ``mutual_info_classif``. That
    estimator adds small random noise to continuous features; it is seeded
    here so that the selected features are reproducible between runs.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training labels.
    NOF : int
        Number of features to keep.
    **kwargs
        ``random_state`` (optional) overrides the default seed ``RANDOM_SEED``.

    Returns
    -------
    tuple
        ``(reduced training matrix, fitted SelectKBest)``.
    """
    from functools import partial

    seeded_score = partial(
        mutual_info_classif,
        random_state=kwargs.get('random_state', RANDOM_SEED),
    )
    ig_selector = SelectKBest(score_func=seeded_score, k=NOF)
    X_train_ig = ig_selector.fit_transform(X_train, y_train)
    return X_train_ig, ig_selector

# --------------------- #7 apply_univariate_feature_selection# ---------------------
def apply_univariate_feature_selection(X_train, y_train, NOF, **kwargs):
    """Keep the NOF best features according to a univariate scoring function.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training labels.
    NOF : int
        Number of features to keep.
    **kwargs
        ``score_func`` (optional): scoring callable handed to ``SelectKBest``.
        When it is not given, ``mutual_info_classif`` is used, seeded with
        ``RANDOM_SEED`` because that estimator is otherwise non-deterministic.
        An explicitly supplied ``score_func`` is used unchanged.

    Returns
    -------
    tuple
        ``(reduced training matrix, fitted SelectKBest)``.
    """
    from functools import partial

    score_func = kwargs.get('score_func')
    if score_func is None:
        score_func = partial(mutual_info_classif, random_state=RANDOM_SEED)
    ufs_selector = SelectKBest(score_func=score_func, k=NOF)
    X_train_ufs = ufs_selector.fit_transform(X_train, y_train)
    return X_train_ufs, ufs_selector

# --------------------- #8 apply_fisher_score# ---------------------
def apply_fisher_score(X_train, y_train, NOF, **kwargs):
    """
    Rank features by a hand-computed Fisher score and keep the best NOF.

    For each feature the score is the size-weighted squared distance of the
    class means from the overall mean, divided by the size-weighted sum of
    the within-class variances. Returns ``(X_train[:, top_k_idx], top_k_idx)``
    with the indices ordered by ascending score. ``kwargs`` is unused.
    """
    overall_mean = np.mean(X_train, axis=0)
    between_class = 0
    within_class = 0
    for label in np.unique(y_train):
        members = X_train[y_train == label]
        count = members.shape[0]
        offset = np.mean(members, axis=0) - overall_mean
        between_class += count * offset ** 2
        within_class += count * np.var(members, axis=0)

    # A feature that is constant inside every class has zero within-class
    # variance; substitute a tiny value so the division stays finite.
    safe_within = np.where(within_class == 0, 1e-10, within_class)
    ranking = np.argsort(between_class / safe_within)
    top_k_idx = ranking[-NOF:]
    return X_train[:, top_k_idx], top_k_idx

# --------------------- #9 apply_lasso# ---------------------
def apply_lasso(X_train, y_train, NOF, **kwargs):
    """Select features from the coefficients of a fitted Lasso model.

    The regularisation strength is read from the ``alpha`` keyword
    (default 0.01). Two cases are distinguished:

    * at least NOF coefficients are non-zero: the NOF with the largest
      absolute value are kept, ordered from largest to smallest;
    * fewer than NOF are non-zero: the non-zero ones are merged with the
      NOF largest-by-magnitude coefficients; the merged indices come back
      sorted in ascending index order.

    Returns ``(X_train[:, selected_idx], selected_idx)``.
    """
    model = Lasso(alpha=kwargs.get('alpha', 0.01), random_state=RANDOM_SEED)
    model.fit(X_train, y_train)
    weights = model.coef_

    nonzero_idx = np.nonzero(weights != 0)[0]

    if len(nonzero_idx) >= NOF:
        # Enough informative features: rank them by magnitude, strongest first.
        by_magnitude = np.argsort(np.abs(weights[nonzero_idx]))[::-1]
        chosen = nonzero_idx[by_magnitude][:NOF]
    else:
        # Too few survivors: pad with the overall top-NOF by magnitude.
        top_by_magnitude = np.argsort(np.abs(weights))[-NOF:]
        chosen = np.union1d(nonzero_idx, top_by_magnitude)

    selected_idx = np.array(chosen)
    return X_train[:, selected_idx], selected_idx

# Registry used to resolve the 'function' entry of a feature-selector
# configuration to the callable defined above; each key is the function's
# own name.
feature_selector_functions = {
    selector_fn.__name__: selector_fn
    for selector_fn in (
        apply_chi_square,
        apply_correlation_coefficient,
        apply_mutual_information,
        apply_variance_threshold,
        apply_anova_f_test,
        apply_information_gain,
        apply_univariate_feature_selection,
        apply_fisher_score,
        apply_lasso,
    )
}

# ----------------------- #
# 7. Applying Feature Selection and Classifiers
# ----------------------- #

# Accumulators filled while iterating over feature selectors and classifiers:
# per-fold metric frames, selected feature names, confusion matrices and the
# best hyperparameters found by the search.
results, selected_features_all, confusion_matrices_all, best_parameters_all = [], [], [], []

# Every run writes into its own time-stamped directory below BASE_RESULTS_DIRECTORY.
current_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
results_directory = os.path.join(BASE_RESULTS_DIRECTORY, f"results_{current_time}")
os.makedirs(results_directory, exist_ok=True)

# Output locations for predictions (cross-validation / external test) and for
# the hyperparameter tuning results.
predicted_outcome_directory = os.path.join(results_directory, 'Predicted_Outcome')
fivefold_cv_directory = os.path.join(predicted_outcome_directory, 'Fivefold Cross Validation')
external_test_directory = os.path.join(predicted_outcome_directory, 'External Testing')
tuning_hyperparameters_directory = os.path.join(results_directory, 'Tuning_Hyperparameters')

for _output_directory in (
    fivefold_cv_directory,
    external_test_directory,
    tuning_hyperparameters_directory,
):
    os.makedirs(_output_directory, exist_ok=True)

# Stratified, shuffled and seeded splitter shared by all cross-validation loops.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)

# Modify the loop to iterate through the 'feature_selectors' list
for feature_selector_name in feature_selectors:
    if feature_selector_name not in INVOLVED_FEATURE_SELECTORS:
        print(f"Feature Selector '{feature_selector_name}' not found in INVOLVED_FEATURE_SELECTORS. Skipping.")
        continue  # Skip if not defined

    fs_info = INVOLVED_FEATURE_SELECTORS[feature_selector_name]
    print(f"\n\033[1m--- Applying Feature Selector: {feature_selector_name} ---\033[0m")

    fs_function_name = fs_info['function']
    fs_params = fs_info['params']
    feature_selector_function = feature_selector_functions.get(fs_function_name)

    if feature_selector_function is None:
        print(f"Feature selector function '{fs_function_name}' not found. Skipping '{feature_selector_name}'.")
        continue  # Skip if function not defined

    X_train_selected, selector_obj = feature_selector_function(X_train_scaled, y_train, NOF, **fs_params)

    # Resolve the names of the selected columns. A fitted SelectKBest exposes
    # a boolean support mask; every other selector hands back integer column
    # indices (ndarray or list), which are looked up positionally.
    if isinstance(selector_obj, SelectKBest):
        selected_feature_names = numeric_columns[selector_obj.get_support()].tolist()
    else:
        selected_feature_names = numeric_columns.take(selector_obj).tolist()

    # Convert to list of strings if necessary
    selected_feature_names = [str(feature) for feature in selected_feature_names]

    print(f"Selected Features: {selected_feature_names}")

    # Check for duplicates
    if len(selected_feature_names) != len(set(selected_feature_names)):
        print("Warning: Duplicate features found in selected features.")
        selected_feature_names = list(dict.fromkeys(selected_feature_names))  # Remove duplicates
        print(f"Duplicate features have been removed. Updated Selected Features: {selected_feature_names}")

    selected_features_all.append({
        'Feature Selector': feature_selector_name,
        'Selected Features': selected_feature_names,
    })

    # Reduce the test matrix to the same columns as the training matrix:
    # fitted selector objects are applied through transform(), while index
    # based selectors are used to slice the columns directly.
    X_test_selected = (
        selector_obj.transform(X_test_scaled)
        if hasattr(selector_obj, 'transform')
        else X_test_scaled[:, selector_obj]
    )

    # --------------------- #
    # 7.1. Iterate Through Classifiers
    # --------------------- #

    # To store best parameters for each classifier
    best_parameters = {}

    for clf_label in classifiers:
        clf_name = clf_label  # Since classifiers list contains the full name
        if clf_name not in INVOLVED_CLASSIFIERS:
            print(f"Classifier '{clf_name}' not found in INVOLVED_CLASSIFIERS. Skipping.")
            continue  # Skip classifiers not defined

        clf_info = INVOLVED_CLASSIFIERS[clf_name]
        print(f"\n\033[1m--- Performing Grid Search for {clf_name} ---\033[0m")

        model_class = clf_info['model']
        model_initial_params = clf_info['params'].copy()  # Use a copy to prevent mutation
        param_grid = clf_info.get('param_grid', {})

        # Instantiate the classifier with initial parameters
        try:
            model = model_class(**model_initial_params)
        except TypeError as e:
            print(f"Error initializing model for {clf_name}: {e}")
            continue  # Skip this classifier

        # Initialize GridSearchCV or RandomizedSearchCV based on GRID_SEARCH_MODE
        if GRID_SEARCH_MODE == 'exhaustive':
            search = GridSearchCV(
                estimator=model,
                param_grid=param_grid,
                cv=3,  # 3-fold cross-validation for grid search
                scoring='accuracy',  # You can change this to other metrics if desired
                n_jobs=-1,
                verbose=1
            )
        elif GRID_SEARCH_MODE == 'randomized':
            # Ensure param_grid is a dictionary
            if isinstance(param_grid, dict):
                param_distributions = param_grid
            elif isinstance(param_grid, list):
                # If param_grid is a list of dicts, RandomizedSearchCV cannot handle it directly
                # Therefore, we can use GridSearchCV in this case
                print(f"Param grid for {clf_name} is a list. Switching to GridSearchCV.")
                search = GridSearchCV(
                    estimator=model,
                    param_grid=param_grid,
                    cv=3,
                    scoring='accuracy',
                    n_jobs=-1,
                    verbose=1
                )
                try:
                    search.fit(X_train_selected, y_train)
                    best_params = search.best_params_
                    best_score = search.best_score_
                    print(f"Best parameters for {clf_name}: {best_params}")
                    print(f"Best cross-validation accuracy: {best_score:.4f}")
                    best_parameters[clf_name] = best_params

                    # Save the best parameters to CSV
                    best_params_df = pd.DataFrame([best_params])
                    tuning_save_path = os.path.join(
                        tuning_hyperparameters_directory,
                        f'best_parameters_{feature_selector_name}_{clf_name}.csv'
                    )
                    best_params_df.to_csv(tuning_save_path, index=False)
                    print(f"Best parameters saved to: {tuning_save_path}")

                except Exception as e:
                    print(f"Grid search failed for {clf_name} with error: {e}")
                    best_parameters[clf_name] = model_initial_params  # Fallback to initial params
                continue  # Move to the next classifier

            search = RandomizedSearchCV(
                estimator=model,
                param_distributions=param_distributions,
                n_iter=GRID_SEARCH_ITER,  # Now set to 5
                cv=3,  # 3-fold cross-validation for randomized search
                scoring='accuracy',  # You can change this to other metrics if desired
                random_state=RANDOM_SEED,
                n_jobs=-1,
                verbose=1
            )
        else:
            raise ValueError("Invalid GRID_SEARCH_MODE. Choose 'exhaustive' or 'randomized'.")

        # Fit GridSearchCV or RandomizedSearchCV on the entire selected training data
        try:
            search.fit(X_train_selected, y_train)
            best_params = search.best_params_
            best_score = search.best_score_
            print(f"Best parameters for {clf_name}: {best_params}")
            print(f"Best cross-validation accuracy: {best_score:.4f}")
            best_parameters[clf_name] = best_params

            # Save the best parameters to CSV
            best_params_df = pd.DataFrame([best_params])
            tuning_save_path = os.path.join(
                tuning_hyperparameters_directory,
                f'best_parameters_{feature_selector_name}_{clf_name}.csv'
            )
            best_params_df.to_csv(tuning_save_path, index=False)
            print(f"Best parameters saved to: {tuning_save_path}")

        except Exception as e:
            print(f"Grid search failed for {clf_name} with error: {e}")
            best_parameters[clf_name] = model_initial_params  # Fallback to initial params

    # Save all best parameters for the current feature selector
    best_parameters_all.append({
        'Feature Selector': feature_selector_name,
        'Best Parameters': best_parameters
    })

    # --------------------- #
    # 7.2. K-Fold Cross-Validation with Tuned Hyperparameters
    # --------------------- #

    for clf_label in classifiers:
        clf_name = clf_label
        if clf_name not in INVOLVED_CLASSIFIERS:
            print(f"Classifier '{clf_name}' not found in INVOLVED_CLASSIFIERS. Skipping.")
            continue  # Skip classifiers not defined

        clf_info = INVOLVED_CLASSIFIERS[clf_name]
        print(f"\n\033[1m--- {N_FOLDS}-Fold Cross-Validation with {clf_name} ---\033[0m")

        model_class = clf_info['model']
        model_initial_params = clf_info['params'].copy()
        best_params = best_parameters.get(clf_name, model_initial_params).copy()

        # Instantiate the model with best hyperparameters
        try:
            model = model_class(**{**model_initial_params, **best_params})
        except TypeError as e:
            print(f"Error initializing {clf_name} with parameters {best_params}: {e}")
            print("Falling back to initial parameters.")
            try:
                model = model_class(**model_initial_params)
            except Exception as e_inner:
                print(f"Failed to initialize {clf_name} with initial parameters: {e_inner}")
                continue  # Skip this classifier

        # Initialize metrics
        fold_metrics = {
            'fold': [], 'validation_accuracy': [], 'validation_precision': [], 'validation_recall': [],
            'validation_f1_score': [], 'test_accuracy': [], 'test_precision': [],
            'test_recall': [], 'test_f1_score': [],
        }
        confusion_matrices = []

        # Initialize lists to store predictions
        cv_true_labels = []
        cv_predicted_labels = []
        cv_patient_ids = []
        cv_fold_numbers = []

        test_true_labels = []
        test_predicted_labels = []
        test_patient_ids = []
        test_fold_numbers = []

        validation_results = []
        test_results = []

        # Loop through the folds of cross-validation
        for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_selected, y_train), 1):
            # Instantiate a fresh model for each fold to prevent estimator accumulation
            try:
                model_fold = model_class(**{**model_initial_params, **best_params})
            except TypeError as e:
                print(f"Error initializing {clf_name} for fold {fold} with parameters {best_params}: {e}")
                print("Falling back to initial parameters.")
                try:
                    model_fold = model_class(**model_initial_params)
                except Exception as e_inner:
                    print(f"Failed to initialize {clf_name} for fold {fold}: {e_inner}")
                    continue  # Skip this fold

            X_train_fold, X_val_fold = X_train_selected[train_idx], X_train_selected[val_idx]
            y_train_fold, y_val_fold = y_train.values[train_idx], y_train.values[val_idx]
            Patient_ID_train_fold = Patient_ID_train.iloc[train_idx]
            Patient_ID_val_fold = Patient_ID_train.iloc[val_idx]

            # Fit the model on the current fold
            try:
                model_fold.fit(X_train_fold, y_train_fold)
            except Exception as e:
                print(f"Error fitting model {clf_name} on fold {fold}: {e}")
                continue

            y_val_pred = model_fold.predict(X_val_fold)

            # Append validation predictions and true labels along with Patient IDs
            cv_true_labels.extend(y_val_fold)
            cv_predicted_labels.extend(y_val_pred)
            cv_patient_ids.extend(Patient_ID_val_fold)
            cv_fold_numbers.extend([fold] * len(y_val_fold))

            # Validation metrics
            val_accuracy = accuracy_score(y_val_fold, y_val_pred)
            val_precision = precision_score(y_val_fold, y_val_pred, average='weighted', zero_division=0)
            val_recall = recall_score(y_val_fold, y_val_pred, average='weighted', zero_division=0)
            val_f1 = f1_score(y_val_fold, y_val_pred, average='weighted', zero_division=0)

            validation_results.append({
                'Fold': fold, 'Validation Accuracy': val_accuracy, 'Validation Precision': val_precision,
                'Validation Recall': val_recall, 'Validation F1-Score': val_f1,
            })

            # Test set predictions
            y_test_pred = model_fold.predict(X_test_selected)

            # Append test predictions and true labels along with Patient IDs
            test_true_labels.extend(y_test)
            test_predicted_labels.extend(y_test_pred)
            test_patient_ids.extend(Patient_ID_test)
            test_fold_numbers.extend([fold] * len(y_test))  # Use the same fold number for consistency

            test_accuracy = accuracy_score(y_test, y_test_pred)
            test_precision = precision_score(y_test, y_test_pred, average='weighted', zero_division=0)
            test_recall = recall_score(y_test, y_test_pred, average='weighted', zero_division=0)
            test_f1 = f1_score(y_test, y_test_pred, average='weighted', zero_division=0)

            test_results.append({
                'Fold': fold, 'Test Accuracy': test_accuracy, 'Test Precision': test_precision,
                'Test Recall': test_recall, 'Test F1-Score': test_f1,
            })

            fold_metrics['fold'].append(fold)
            fold_metrics['validation_accuracy'].append(val_accuracy)
            fold_metrics['validation_precision'].append(val_precision)
            fold_metrics['validation_recall'].append(val_recall)
            fold_metrics['validation_f1_score'].append(val_f1)
            fold_metrics['test_accuracy'].append(test_accuracy)
            fold_metrics['test_precision'].append(test_precision)
            fold_metrics['test_recall'].append(test_recall)
            fold_metrics['test_f1_score'].append(test_f1)

            confusion_val = confusion_matrix(y_val_fold, y_val_pred)
            confusion_matrices.append({
                'Feature Selector': feature_selector_name,
                'Classifier': clf_name,
                'Fold': fold,
                'Type': 'Validation',
                'Confusion Matrix': confusion_val.tolist(),
            })

            confusion_test = confusion_matrix(y_test, y_test_pred)
            confusion_matrices.append({
                'Feature Selector': feature_selector_name,
                'Classifier': clf_name,
                'Fold': fold,
                'Type': 'External Test',
                'Confusion Matrix': confusion_test.tolist(),
            })

        print(f"\n\033[1mResults of {N_FOLDS} Validation Folds:\033[0m")
        for res in validation_results:
            print(f"Fold {res['Fold']} - Validation Accuracy: {res['Validation Accuracy']:.4f}, "
                  f"Precision: {res['Validation Precision']:.4f}, Recall: {res['Validation Recall']:.4f}, "
                  f"F1-Score: {res['Validation F1-Score']:.4f}")

        val_acc_mean = np.mean(fold_metrics['validation_accuracy'])
        val_acc_std = np.std(fold_metrics['validation_accuracy'])
        val_prec_mean = np.mean(fold_metrics['validation_precision'])
        val_prec_std = np.std(fold_metrics['validation_precision'])
        val_rec_mean = np.mean(fold_metrics['validation_recall'])
        val_rec_std = np.std(fold_metrics['validation_recall'])
        val_f1_mean = np.mean(fold_metrics['validation_f1_score'])
        val_f1_std = np.std(fold_metrics['validation_f1_score'])

        print(f"\n\033[1mAverage Validation Metrics:\033[0m")
        print(f"Accuracy: {val_acc_mean:.4f} ± {val_acc_std:.4f}, "
              f"Precision: {val_prec_mean:.4f} ± {val_prec_std:.4f}, "
              f"Recall: {val_rec_mean:.4f} ± {val_rec_std:.4f}, "
              f"F1-Score: {val_f1_mean:.4f} ± {val_f1_std:.4f}")

        print(f"\n\033[1mResults of {N_FOLDS} External Testing:\033[0m")
        for res in test_results:
            print(f"Fold {res['Fold']} - Test Accuracy: {res['Test Accuracy']:.4f}, "
                  f"Precision: {res['Test Precision']:.4f}, Recall: {res['Test Recall']:.4f}, "
                  f"F1-Score: {res['Test F1-Score']:.4f}")

        test_acc_mean = np.mean(fold_metrics['test_accuracy'])
        test_acc_std = np.std(fold_metrics['test_accuracy'])
        test_prec_mean = np.mean(fold_metrics['test_precision'])
        test_prec_std = np.std(fold_metrics['test_precision'])
        test_rec_mean = np.mean(fold_metrics['test_recall'])
        test_rec_std = np.std(fold_metrics['test_recall'])
        test_f1_mean = np.mean(fold_metrics['test_f1_score'])
        test_f1_std = np.std(fold_metrics['test_f1_score'])

        print(f"\n\033[1mAverage External Test Metrics:\033[0m")
        print(f"Accuracy: {test_acc_mean:.4f} ± {test_acc_std:.4f}, "
              f"Precision: {test_prec_mean:.4f} ± {test_prec_std:.4f}, "
              f"Recall: {test_rec_mean:.4f} ± {test_rec_std:.4f}, "
              f"F1-Score: {test_f1_mean:.4f} ± {test_f1_std:.4f}")

        fold_metrics['Feature Selector'] = [feature_selector_name] * N_FOLDS
        fold_metrics['Classifier'] = [clf_name] * N_FOLDS

        # Check if all lists in fold_metrics have the same length
        lengths = [len(v) for v in fold_metrics.values()]
        if len(set(lengths)) != 1:
            print("Warning: Not all metric lists have the same length. Skipping saving this fold's metrics.")
            continue  # Skip if lengths are inconsistent

        df_fold_metrics = pd.DataFrame(fold_metrics)
        results.append(df_fold_metrics)

        confusion_matrices_all.extend(confusion_matrices)

        # Save cross-validation predictions
        cv_results_df = pd.DataFrame({
            'Patient ID': cv_patient_ids,
            'Fold': cv_fold_numbers,
            'True Label': cv_true_labels,
            'Predicted Label': cv_predicted_labels
        })

        # Create subdirectory path
        cv_subdirectory = os.path.join(
            fivefold_cv_directory,
            f"{feature_selector_name}_{clf_name}"
        )
        os.makedirs(cv_subdirectory, exist_ok=True)

        cv_predictions_save_path = os.path.join(
            cv_subdirectory,
            f'cv_predictions_{feature_selector_name}_{clf_name}.csv'
        )

        cv_results_df.to_csv(cv_predictions_save_path, index=False)

        # Save external test predictions
        test_results_df = pd.DataFrame({
            'Patient ID': test_patient_ids,
            'Fold': test_fold_numbers,
            'True Label': test_true_labels,
            'Predicted Label': test_predicted_labels
        })

        # Create subdirectory path
        test_subdirectory = os.path.join(
            external_test_directory,
            f"{feature_selector_name}_{clf_name}"
        )
        os.makedirs(test_subdirectory, exist_ok=True)

        test_predictions_save_path = os.path.join(
            test_subdirectory,
            f'test_predictions_{feature_selector_name}_{clf_name}.csv'
        )

        test_results_df.to_csv(test_predictions_save_path, index=False)

    # Save selected features
    selected_features_df = pd.DataFrame(selected_features_all)

    # Expand the 'Selected Features' column into separate columns for clarity
    selected_features_expanded = selected_features_df['Selected Features'].apply(pd.Series)
    selected_features_expanded.columns = [f'Feature_{i+1}' for i in selected_features_expanded.columns]

    # Combine the feature selector names with the expanded features
    selected_features_combined = pd.concat([selected_features_df[['Feature Selector']], selected_features_expanded], axis=1)

    # Save to CSV
    selected_features_save_path = os.path.join(results_directory, 'selected_features.csv')
    selected_features_combined.to_csv(selected_features_save_path, index=False)
    print(f"Selected Features have been saved to: {selected_features_save_path}")

    # Save best parameters
    # Convert the list of dictionaries to a DataFrame
    best_parameters_records = []
    for record in best_parameters_all:
        feature_selector = record['Feature Selector']
        for clf, params in record['Best Parameters'].items():
            # Prefix classifier name to parameter keys to avoid duplicates
            params_flat = {f"{clf}_{k}": v for k, v in params.items()}
            params_flat['Feature Selector'] = feature_selector
            params_flat['Classifier'] = clf
            best_parameters_records.append(params_flat)

    best_parameters_df = pd.DataFrame(best_parameters_records)
    best_parameters_save_path = os.path.join(tuning_hyperparameters_directory, 'best_parameters.csv')
    best_parameters_df.to_csv(best_parameters_save_path, index=False)
    print(f"Best Hyperparameters have been saved to: {best_parameters_save_path}")

    # ----------------------- #
    # 8. Saving and Aggregating Results
    # ----------------------- #

    if results:
        results_df = pd.concat(results, ignore_index=True)
        average_metrics = results_df.groupby(['Feature Selector', 'Classifier']).agg({
            'validation_accuracy': ['mean', 'std'],
            'validation_precision': ['mean', 'std'],
            'validation_recall': ['mean', 'std'],
            'validation_f1_score': ['mean', 'std'],
            'test_accuracy': ['mean', 'std'],
            'test_precision': ['mean', 'std'],
            'test_recall': ['mean', 'std'],
            'test_f1_score': ['mean', 'std'],
        }).reset_index()

        average_metrics.columns = [' '.join(col).strip() if col[1] else col[0] for col in average_metrics.columns.values]

        # Define save paths
        results_save_path = os.path.join(results_directory, 'evaluation_metrics.csv')
        average_metrics_save_path = os.path.join(results_directory, 'average_metrics.csv')
        confusion_matrix_save_path = os.path.join(results_directory, 'confusion_matrices.csv')
        workflow_save_path = os.path.join(results_directory, 'Workflow_Ver18.txt')
        code_save_path = os.path.join(results_directory, 'Code_Ver18.py')  # Updated to Ver18

        # Save metrics
        results_df.to_csv(results_save_path, index=False)
        average_metrics.to_csv(average_metrics_save_path, index=False)
        print(f"Evaluation Metrics have been saved to: {results_save_path}")
        print(f"Average Metrics have been saved to: {average_metrics_save_path}")

        # Save standard deviations separately
        # Select the flattened column names that contain 'std' (e.g. 'test_accuracy std')
        std_columns = [col for col in average_metrics.columns if 'std' in col]
        std_metrics_df = average_metrics[['Feature Selector', 'Classifier'] + std_columns]
        std_metrics_save_path = os.path.join(results_directory, 'STD_metrics.csv')
        std_metrics_df.to_csv(std_metrics_save_path, index=False)
        print(f"Standard Deviations of evaluation metrics have been saved to: {std_metrics_save_path}")

        # Save confusion matrices
        confusion_matrices_df = pd.DataFrame(confusion_matrices_all)
        confusion_matrices_df.to_csv(confusion_matrix_save_path, index=False)
        print(f"Confusion Matrices have been saved to: {confusion_matrix_save_path}")

        # Copy original data file to results directory
        original_data_filename = os.path.basename(FILE_PATH)
        original_data_filename_with_label = f"Original_{original_data_filename}"
        destination_path = os.path.join(results_directory, original_data_filename_with_label)
        shutil.copyfile(FILE_PATH, destination_path)
        print(f"Original data file has been copied to: {destination_path}")

        # Save workflow description (placeholder)
        workflow_text = """
    [Workflow_Ver18 content goes here]
    """
        with open(workflow_save_path, 'w') as workflow_file:
            workflow_file.write(workflow_text)
        print(f"Workflow has been saved to: {workflow_save_path}")

        # Save the current script's code
        try:
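            # inspect.getsource() on a frame returns the source of that frame's
            # code object and raises OSError when the source cannot be retrieved;
            # any failure is reported by the except branch instead of aborting.
            # NOTE(review): if this code runs inside a function, only that
            # function's source is saved, not the whole script - confirm intent.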
            code_text = inspect.getsource(inspect.currentframe())
            with open(code_save_path, 'w') as code_file:
                code_file.write(code_text)
            print(f"Code has been saved to: {code_save_path}")
        except Exception as e:
            print(f"Unable to save the code: {e}")

    else:
        print("No results to save. Please check your data and parameters.")

# ======================= #
#       End of Script or Classification Codes    #
# ======================= #
