# 2. Model Exploration and Hyperparameter Tuning

## üìù Overview
This notebook is the second step in the dementia prediction pipeline. Its purpose is to:
1.  **Load** the pre-cleaned and split data from `1_dataset_analysis.ipynb`.
2.  Define a **preprocessing pipeline** to handle scaling and encoding.
3.  Use **SMOTE** to address class imbalance in the training data.
4.  Train a variety of machine learning models using **GridSearchCV** to find the best hyperparameters for each.
5.  **Save** the trained models and their performance results for the final implementation phase.

## Libraries

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
import warnings
warnings.filterwarnings('ignore')
import joblib
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFECV, RFE
from sklearn.decomposition import PCA
from sklearn.ensemble import  RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import label_binarize
from sklearn import metrics
import xgboost as xgb
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

## ML Model Results Storage Framework

In [2]:
# ML Model Result Storage Framework
precision = []
roc_auc = []

# Function to call for storing the results
def store_results(model, config, acc, f1_score, rec, prec, roc):
    """
    Store model performance results.
    """
    ML_Model.append(model)
    ML_Config.append(config)
    accuracy.append(round(acc, 6))
    f1.append(round(f1_score, 6))
    recall.append(round(rec, 6))
    precision.append(round(prec, 6))
    roc_auc.append(round(roc, 6))

# Function to display and save results
def display_and_save_results(filename_prefix='model_exploration'):
    """
    Create dataframe from results, display, and save to CSV in the 'AnalysisMain/results' directory.
    """
    # Creating the dataframe
    result = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Recall': recall,
        'Precision': precision,
        'ROC_AUC': roc_auc,
    })
    
    # Remove duplicates if any
    result.drop_duplicates(subset=["ML Model", "Configuration"], inplace=True)
    
    print("\n" + "="*100)
    print("MODEL PERFORMANCE RESULTS")
    print("="*100)
    print(result.to_string(index=False))
    
    # Saving the result to a CSV file
    save_path = os.path.join(results_dir, f'{filename_prefix}_results.csv')
    result.to_csv(save_path, index=False)
    print(f"\nResults saved to {save_path}")
    
    # Sorting the dataframe on F1 Score and Accuracy
    sorted_result = result.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)
    
    print("\n" + "="*100)
    print("SORTED MODEL PERFORMANCE RESULTS (by F1 Score and Accuracy)")
    print("="*100)
    print(sorted_result.to_string(index=False))
    
    # Saving the sorted result to a CSV file
    sorted_save_path = os.path.join(results_dir, f'sorted_{filename_prefix}_results.csv')
    sorted_result.to_csv(sorted_save_path, index=False)
    print(f"\nSorted results saved to {sorted_save_path}")
    
    return result, sorted_result

# Function to clear results
def clear_results():
    """Clear all stored results."""
    global ML_Model, ML_Config, accuracy, f1, recall, precision, roc_auc
    ML_Model.clear()
    ML_Config.clear()
    accuracy.clear()
    f1.clear()
    recall.clear()
    precision.clear()
    roc_auc.clear()
    print("Results cleared!")

# Function to plot model comparison
def plot_model_comparison(result_df, plot_filename="model_performance_comparison.png"):
    """
    Create visualization comparing model performances and save to 'AnalysisMain/plots'.
    """
    # Convert scores to percentages for plotting
    metrics_cols = ['Accuracy', 'F1 Score', 'Recall', 'Precision', 'ROC_AUC']
    plot_df = result_df.copy()
    
    for col in metrics_cols:
        plot_df[col] = plot_df[col] * 100
    
    # Create subplot for each metric
    fig, axes = plt.subplots(2, 3, figsize=(20, 12))
    axes = axes.ravel()
    
    for idx, metric in enumerate(metrics_cols):
        # Group by model and get mean performance across configurations
        model_performance = plot_df.groupby('ML Model')[metric].mean().sort_values(ascending=False)
        
        # Create bar plot
        ax = axes[idx]
        bars = sns.barplot(x=model_performance.index, y=model_performance.values, ax=ax, palette='Blues_r')
        
        ax.set_title(f'Average {metric}', fontweight='bold')
        ax.set_ylabel(f'{metric} (%)')
        ax.set_xlabel('')
        ax.tick_params(axis='x', rotation=45)
        ax.grid(axis='y', alpha=0.5)
        
        # Add value labels on bars
        for bar in bars.patches:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                   f'{height:.2f}', ha='center', va='bottom', fontsize=9)
    
    # Hide the last subplot if we have 5 metrics
    if len(metrics_cols) < 6:
        axes[5].set_visible(False)
    
    plt.suptitle('Model Performance Comparison', fontsize=20, fontweight='bold')
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    
    # Save the plot
    save_path = os.path.join(plots_dir, plot_filename)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()
    print(f"Comparison plot saved to: {save_path}")

print("Model results storage framework loaded successfully!")
print("This framework will save results, plots, and models to the 'ModelExploration' directory.")

Model results storage framework loaded successfully!
This framework will save results, plots, and models to the 'ModelExploration' directory.


# Define Preprocessing Pipeline

Before training the models, we need to create a preprocessing pipeline. This pipeline will handle:
1.  **Scaling**: Applying `StandardScaler` to all numerical features to standardize their range.
2.  **Encoding**: Applying `OneHotEncoder` to the categorical feature (`M/F`) to convert it into a numerical format.

We use a `ColumnTransformer` to apply these different transformations to the correct columns. This ensures that the same steps are consistently applied during both training and validation.

In [3]:
# Define Preprocessing Pipeline

# Define the directory where the processed data was saved from the previous notebook
processed_data_dir = 'Analysis/processed_data'

# Load the training and validation sets
X_train = joblib.load(os.path.join(processed_data_dir, 'X_train.joblib'))
X_val = joblib.load(os.path.join(processed_data_dir, 'X_val.joblib'))
y_train = joblib.load(os.path.join(processed_data_dir, 'y_train.joblib'))
y_val = joblib.load(os.path.join(processed_data_dir, 'y_val.joblib'))

print("Data loaded successfully from 'processed_data' directory!")
print("-" * 50)
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"\nTraining target distribution:\n{y_train.value_counts(normalize=True)}")
print(f"\nValidation target distribution:\n{y_val.value_counts(normalize=True)}")

# Display the first few rows of the training data to confirm
print("\nFirst 5 rows of X_train:")
X_train.head()

Data loaded successfully from 'processed_data' directory!
--------------------------------------------------
X_train shape: (647, 12)
X_val shape: (162, 12)

Training target distribution:
Group
Nondemented    0.774343
Demented       0.225657
Name: proportion, dtype: float64

Validation target distribution:
Group
Nondemented    0.771605
Demented       0.228395
Name: proportion, dtype: float64

First 5 rows of X_train:


Unnamed: 0,M/F,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay,Visit,MR Delay
764,M,84,14.0,2.0,22.0,0.5,1550,0.665,1.132,2.0,2.0,621.0
213,M,75,5.0,2.0,29.0,0.0,1534,0.771,1.144,2.0,1.0,0.0
382,F,69,4.0,3.0,29.0,0.0,1380,0.809,1.272,2.0,1.0,0.0
456,F,80,16.0,2.0,29.0,0.0,1323,0.738,1.326,2.0,2.0,730.0
393,F,50,12.0,2.0,30.0,0.0,1385,0.819,1.267,2.0,1.0,0.0


---

# SVM

### SVM with PCA 90

In [None]:

# =============================================================================
# SVM with Configuration-Specific Hyperparameter Grids
# =============================================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE, RFECV
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import metrics
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42

# Define preprocessor for categorical and numeric features
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")

# Create preprocessing transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Apply preprocessing to get fully numeric data first
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

# Get feature names after preprocessing
try:
    feature_names = (numeric_features + 
                    list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))
except:
    feature_names = [f'feature_{i}' for i in range(X_train_preprocessed.shape[1])]

X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=feature_names, index=X_train.index)
X_val_preprocessed = pd.DataFrame(X_val_preprocessed, columns=feature_names, index=X_val.index)

print(f"\nPreprocessed data shape: {X_train_preprocessed.shape}")
print(f"All features are now numeric: {X_train_preprocessed.select_dtypes(include=np.number).shape[1] == X_train_preprocessed.shape[1]}")

# Clear previous results
try:
    clear_results()
except:
    ML_Model = []
    ML_Config = []
    accuracy = []
    f1 = []
    recall = []
    precision = []
    roc_auc = []
    print("Initialized result storage lists.")

print("\n=== START: SVM Configuration Sweep with Custom Hyperparameters ===\n")

# =============================================================================
# Configuration-Specific Hyperparameter Grids
# =============================================================================

# Grid 1: Preprocessed Data - Focus on RBF and linear kernels
param_grid_1 = {
    'C': [0.01, 0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.001, 0.01],
    'kernel': ['rbf', 'linear'],
    'degree': [2],  # Not used for rbf/linear but required
    'coef0': [0.0]
}

# Grid 2: Normalized Data - Explore polynomial kernels
param_grid_2 = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 0.01, 0.1],
    'kernel': ['poly', 'rbf'],
    'degree': [2, 3, 4],
    'coef0': [0.0, 0.5, 1.0]
}

# Grid 3: SelectKBest - Focus on simpler models
param_grid_3 = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.01],
    'kernel': ['rbf', 'poly', 'linear'],
    'degree': [2, 3],
    'coef0': [0.0, 0.1]
}

# Grid 4: RFECV - Similar to SelectKBest but different ranges
param_grid_4 = {
    'C': [0.5, 1, 5, 10],
    'gamma': ['scale', 0.001, 0.01],
    'kernel': ['rbf', 'sigmoid'],
    'degree': [2, 3],
    'coef0': [0.0, 0.5]
}

# Grid 5: PCA - Focus on linear and sigmoid
param_grid_5 = {
    'C': [0.1, 1, 10, 50],
    'gamma': ['scale', 'auto', 0.01],
    'kernel': ['linear', 'sigmoid', 'rbf'],
    'degree': [2],
    'coef0': [0.0, 0.5, 1.0]
}

# Grid 6 & 7: SMOTE pipelines - Balanced approach (Oversampling)
param_grid_smote = {
    'model__C': [0.1, 1, 10, 100],
    'model__gamma': ['scale', 0.001, 0.01, 0.1],
    'model__kernel': ['rbf', 'poly', 'linear'],
    'model__degree': [2, 3],
    'model__coef0': [0.0, 0.5]
}

# Grid 8-11: Undersampling and Combined pipelines
param_grid_sampling = {
    'model__C': [0.1, 1, 10, 100],
    'model__gamma': ['scale', 0.001, 0.01, 0.1],
    'model__kernel': ['rbf', 'poly', 'linear'],
    'model__degree': [2, 3],
    'model__coef0': [0.0, 0.5]
}

# Map grids to configurations
hyperparameter_grids = {
    'Preprocessed Data': param_grid_1,
    'Normalized Data': param_grid_2,
    'SelectKBest': param_grid_3,
    'RFECV': param_grid_4,
    'PCA': param_grid_5,
    'SMOTE + StandardScaler': param_grid_smote,
    'SMOTE + GridSearchCV': param_grid_smote,
    'RandomUnderSampler': param_grid_sampling,
    'TomekLinks': param_grid_sampling,
    'NearMiss': param_grid_sampling,
    'SMOTE + Tomek': param_grid_sampling,
}

# Initialize configurations list
configurations = []

# --- Configuration 1: Preprocessed Data ---
configurations.append(('Preprocessed Data', 'array', X_train_preprocessed, X_val_preprocessed))
print("‚úì Configuration 1: Preprocessed Data")

# --- Configuration 2: Normalized Data (MinMax on preprocessed) ---
scaler_minmax = MinMaxScaler()
X_train_normalized = pd.DataFrame(
    scaler_minmax.fit_transform(X_train_preprocessed), 
    columns=X_train_preprocessed.columns, 
    index=X_train_preprocessed.index
)
X_val_normalized = pd.DataFrame(
    scaler_minmax.transform(X_val_preprocessed), 
    columns=X_val_preprocessed.columns, 
    index=X_val_preprocessed.index
)
configurations.append(('Normalized Data', 'array', X_train_normalized, X_val_normalized))
print("‚úì Configuration 2: Normalized Data (MinMax)")

# --- Configuration 3: SelectKBest ---
print("\n=== SelectKBest Feature Selection ===")
scores = []
max_features = min(X_train_normalized.shape[1], 20)

for k in range(1, max_features + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_tr_k = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        SVC(kernel='linear', random_state=RANDOM_STATE), 
        X_tr_k, y_train, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()
    scores.append(score)

optimal_k = int(np.argmax(scores) + 1)
print(f"Optimal number of features: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = pd.DataFrame(
    kbest.fit_transform(X_train_normalized, y_train), 
    columns=X_train_normalized.columns[kbest.get_support()]
)
X_val_kbest = pd.DataFrame(
    kbest.transform(X_val_normalized), 
    columns=X_train_kbest.columns
)
configurations.append(('SelectKBest', 'array', X_train_kbest, X_val_kbest))
print("‚úì Configuration 3: SelectKBest")

# --- Configuration 4: RFECV ---
print("\n=== RFECV Feature Selection ===")
svm_estimator = SVC(kernel='linear', random_state=RANDOM_STATE)
rfecv = RFECV(
    estimator=svm_estimator, 
    step=1, 
    cv=StratifiedKFold(5), 
    scoring='accuracy', 
    n_jobs=-1
)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=svm_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = pd.DataFrame(
    rfe.fit_transform(X_train_kbest, y_train), 
    columns=X_train_kbest.columns[rfe.get_support()]
)
X_val_rfe = pd.DataFrame(
    rfe.transform(X_val_kbest), 
    columns=X_train_rfe.columns
)
configurations.append(('RFECV', 'array', X_train_rfe, X_val_rfe))
print("‚úì Configuration 4: RFECV")

# --- Configuration 5: PCA ---
print("\n=== PCA Dimensionality Reduction ===")
pca_full = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
desired_variance = 0.90
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
n_components = max(2, n_components)
print(f'Number of components for {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_rfe), index=X_train_rfe.index)
X_val_pca = pd.DataFrame(pca.transform(X_val_rfe), index=X_val_rfe.index)
configurations.append(('PCA', 'array', X_train_pca, X_val_pca))
print("‚úì Configuration 5: PCA")

# --- Configuration 6: SMOTE + StandardScaler (Pipeline) ---
pipeline_smote_scaler = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + StandardScaler', 'pipeline', pipeline_smote_scaler, None))
print("‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)")

# --- Configuration 7: SMOTE + GridSearchCV (Pipeline) ---
pipeline_smote_grid = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + GridSearchCV', 'pipeline', pipeline_smote_grid, None))
print("‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)")

# =============================================================================
# UNDERSAMPLING CONFIGURATIONS
# =============================================================================
print("\n=== Adding Undersampling Configurations ===")

# --- Configuration 8: RandomUnderSampler ---
pipeline_rus = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', RandomUnderSampler(random_state=RANDOM_STATE)),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('RandomUnderSampler', 'pipeline', pipeline_rus, None))
print("‚úì Configuration 8: RandomUnderSampler (Undersampling)")

# --- Configuration 9: TomekLinks ---
pipeline_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', TomekLinks()),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('TomekLinks', 'pipeline', pipeline_tomek, None))
print("‚úì Configuration 9: TomekLinks (Undersampling - removes noisy samples)")

# --- Configuration 10: NearMiss ---
pipeline_nearmiss = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', NearMiss(version=1)),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('NearMiss', 'pipeline', pipeline_nearmiss, None))
print("‚úì Configuration 10: NearMiss (Undersampling - selective)")

# --- Configuration 11: SMOTE + Tomek (Combined) ---
pipeline_smote_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('sampler', SMOTETomek(random_state=RANDOM_STATE)),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + Tomek', 'pipeline', pipeline_smote_tomek, None))
print("‚úì Configuration 11: SMOTE + Tomek (Combined Over + Under)")

print(f"\nTotal configurations: {len(configurations)}")

# Safe ROC AUC helper
def safe_roc_auc(y_true, y_proba):
    try:
        if isinstance(y_proba, np.ndarray) and y_proba.shape[1] == 2:
            return metrics.roc_auc_score(y_true, y_proba[:, 1])
        else:
            return metrics.roc_auc_score(
                pd.get_dummies(y_true), y_proba, 
                multi_class='ovr', average='macro'
            )
    except Exception:
        return np.nan

# =============================================================================
# Run SVM with Configuration-Specific GridSearchCV
# =============================================================================
print("\n" + "="*80)
print("RUNNING SVM WITH CONFIGURATION-SPECIFIC HYPERPARAMETER TUNING")
print("="*80)

for name, kind, X_tr_cfg, X_val_cfg in configurations:
    print(f"\n{'='*80}")
    print(f"Configuration: {name}")
    print(f"{'='*80}")
    
    # Get the specific parameter grid for this configuration
    param_grid = hyperparameter_grids[name]
    
    print(f"Hyperparameter grid for '{name}':")
    for key, values in param_grid.items():
        print(f"  {key}: {values}")
    print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}")
    
    try:
        if kind == 'pipeline':
            pipeline = X_tr_cfg
            grid_search = GridSearchCV(
                pipeline, param_grid, 
                cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_train)
            y_val_pred = best_model.predict(X_val)
            y_train_proba = best_model.predict_proba(X_train)
            y_val_proba = best_model.predict_proba(X_val)
        else:
            grid_search = GridSearchCV(
                SVC(probability=True, random_state=RANDOM_STATE), 
                param_grid, cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_tr_cfg, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_tr_cfg)
            y_val_pred = best_model.predict(X_val_cfg)
            y_train_proba = best_model.predict_proba(X_tr_cfg)
            y_val_proba = best_model.predict_proba(X_val_cfg)

        # Calculate train-test gap for overfitting detection
        train_acc = metrics.accuracy_score(y_train, y_train_pred)
        test_acc = metrics.accuracy_score(y_val, y_val_pred)
        train_test_gap = train_acc - test_acc

        # Build metrics dictionary
        metrics_dict = {
            "Dataset": ["Training", "Test"],
            "Accuracy": [train_acc, test_acc],
            "F1 Score": [
                metrics.f1_score(y_train, y_train_pred, average='macro'),
                metrics.f1_score(y_val, y_val_pred, average='macro'),
            ],
            "Recall": [
                metrics.recall_score(y_train, y_train_pred, average='macro'),
                metrics.recall_score(y_val, y_val_pred, average='macro'),
            ],
            "Precision": [
                metrics.precision_score(y_train, y_train_pred, average='macro', zero_division=0),
                metrics.precision_score(y_val, y_val_pred, average='macro', zero_division=0),
            ],
            "AUC-ROC": [
                safe_roc_auc(y_train, y_train_proba),
                safe_roc_auc(y_val, y_val_proba),
            ]
        }

        df_metrics = pd.DataFrame(metrics_dict)
        pd.options.display.float_format = '{:.6f}'.format
        print("\nüìä Support Vector Machine Model Performance Metrics")
        print(df_metrics.to_string(index=False))

        # Overfitting warning
        if train_test_gap > 0.10:
            print(f"\n‚ö†Ô∏è  WARNING: Overfitting detected! Train-Test gap: {train_test_gap:.4f}")
        elif train_test_gap < 0.05:
            print(f"\n‚úì Good generalization. Train-Test gap: {train_test_gap:.4f}")
        else:
            print(f"\n‚Üí Acceptable gap: {train_test_gap:.4f}")

        # Store test metrics
        test_metrics = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
        try:
            store_results(
                'SVM',
                name,
                float(test_metrics['Accuracy']),
                float(test_metrics['F1 Score']),
                float(test_metrics['Recall']),
                float(test_metrics['Precision']),
                float(test_metrics['AUC-ROC'])
            )
        except:
            ML_Model.append('SVM')
            ML_Config.append(name)
            accuracy.append(round(float(test_metrics['Accuracy']), 6))
            f1.append(round(float(test_metrics['F1 Score']), 6))
            recall.append(round(float(test_metrics['Recall']), 6))
            precision.append(round(float(test_metrics['Precision']), 6))
            roc_auc.append(round(float(test_metrics['AUC-ROC']), 6))

        print("\nüéØ Best hyperparameters found:")
        best_params = grid_search.best_params_
        for param, value in best_params.items():
            print(f"  {param}: {value}")
        print(f"\nBest CV score: {grid_search.best_score_:.6f}")
        
    except Exception as e:
        print(f"‚ùå Error in configuration '{name}': {str(e)}")
        import traceback
        traceback.print_exc()
        continue

print("\n" + "="*80)
print("‚úÖ SVM evaluation complete for all configurations.")
print("="*80)

# Display final results
try:
    display_and_save_results('svm_all_configs')
except:
    result = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Recall': recall,
        'Precision': precision,
        'ROC_AUC': roc_auc,
    })
    print("\nüìà Final Results:")
    print(result.to_string(index=False))
    
    # Sort by F1 Score
    sorted_result = result.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)
    print("\nüèÜ Sorted Results (by F1 Score):")
    print(sorted_result.to_string(index=False))

    # Group by sampling technique
    print("\nüìä Performance by Sampling Technique:")
    print("-" * 80)
    sampling_groups = {
        'No Sampling': ['Preprocessed Data', 'Normalized Data', 'SelectKBest', 'RFECV', 'PCA'],
        'Oversampling': ['SMOTE + StandardScaler', 'SMOTE + GridSearchCV'],
        'Undersampling': ['RandomUnderSampler', 'TomekLinks', 'NearMiss'],
        'Combined': ['SMOTE + Tomek']
    }

    for group_name, configs in sampling_groups.items():
        group_data = result[result['Configuration'].isin(configs)]
        if not group_data.empty:
            print(f"\n{group_name}:")
            print(f"  Avg Accuracy: {group_data['Accuracy'].mean():.6f}")
            print(f"  Avg F1 Score: {group_data['F1 Score'].mean():.6f}")
            print(f"  Avg ROC-AUC: {group_data['ROC_AUC'].mean():.6f}")
            print(f"  Best Config: {group_data.loc[group_data['F1 Score'].idxmax(), 'Configuration']}")



Numeric features: ['Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF', 'Delay', 'Visit', 'MR Delay']
Categorical features: ['M/F']

Preprocessed data shape: (647, 13)
All features are now numeric: True
Initialized result storage lists.

=== START: SVM Configuration Sweep with Custom Hyperparameters ===

‚úì Configuration 1: Preprocessed Data
‚úì Configuration 2: Normalized Data (MinMax)

=== SelectKBest Feature Selection ===
Optimal number of features: 10
‚úì Configuration 3: SelectKBest

=== RFECV Feature Selection ===
Optimal number of features by RFECV: 3
‚úì Configuration 4: RFECV

=== PCA Dimensionality Reduction ===
Number of components for 90.0% variance: 3
‚úì Configuration 5: PCA
‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)
‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)

=== Adding Undersampling Configurations ===
‚úì Configuration 8: RandomUnderSampler (Undersampling)
‚úì Configuration 9: TomekLinks (Undersampling - removes noisy samples)
‚úì Configura

### SVM with PCA 95

In [5]:

# =============================================================================
# SVM with Configuration-Specific Hyperparameter Grids
# =============================================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE, RFECV
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import metrics
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42

# Define preprocessor for categorical and numeric features
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")

# Create preprocessing transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Apply preprocessing to get fully numeric data first
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

# Get feature names after preprocessing
try:
    feature_names = (numeric_features + 
                    list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))
except:
    feature_names = [f'feature_{i}' for i in range(X_train_preprocessed.shape[1])]

X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=feature_names, index=X_train.index)
X_val_preprocessed = pd.DataFrame(X_val_preprocessed, columns=feature_names, index=X_val.index)

print(f"\nPreprocessed data shape: {X_train_preprocessed.shape}")
print(f"All features are now numeric: {X_train_preprocessed.select_dtypes(include=np.number).shape[1] == X_train_preprocessed.shape[1]}")

# Clear previous results
try:
    clear_results()
except:
    ML_Model = []
    ML_Config = []
    accuracy = []
    f1 = []
    recall = []
    precision = []
    roc_auc = []
    print("Initialized result storage lists.")

print("\n=== START: SVM Configuration Sweep with Custom Hyperparameters ===\n")

# =============================================================================
# Configuration-Specific Hyperparameter Grids
# =============================================================================

# Grid 1: Preprocessed Data - Focus on RBF and linear kernels
param_grid_1 = {
    'C': [0.01, 0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.001, 0.01],
    'kernel': ['rbf', 'linear'],
    'degree': [2],  # Not used for rbf/linear but required
    'coef0': [0.0]
}

# Grid 2: Normalized Data - Explore polynomial kernels
param_grid_2 = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 0.01, 0.1],
    'kernel': ['poly', 'rbf'],
    'degree': [2, 3, 4],
    'coef0': [0.0, 0.5, 1.0]
}

# Grid 3: SelectKBest - Focus on simpler models
param_grid_3 = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.01],
    'kernel': ['rbf', 'poly', 'linear'],
    'degree': [2, 3],
    'coef0': [0.0, 0.1]
}

# Grid 4: RFECV - Similar to SelectKBest but different ranges
param_grid_4 = {
    'C': [0.5, 1, 5, 10],
    'gamma': ['scale', 0.001, 0.01],
    'kernel': ['rbf', 'sigmoid'],
    'degree': [2, 3],
    'coef0': [0.0, 0.5]
}

# Grid 5: PCA - Focus on linear and sigmoid
param_grid_5 = {
    'C': [0.1, 1, 10, 50],
    'gamma': ['scale', 'auto', 0.01],
    'kernel': ['linear', 'sigmoid', 'rbf'],
    'degree': [2],
    'coef0': [0.0, 0.5, 1.0]
}

# Grid 6 & 7: SMOTE pipelines - Balanced approach (Oversampling)
param_grid_smote = {
    'model__C': [0.1, 1, 10, 100],
    'model__gamma': ['scale', 0.001, 0.01, 0.1],
    'model__kernel': ['rbf', 'poly', 'linear'],
    'model__degree': [2, 3],
    'model__coef0': [0.0, 0.5]
}

# Grid 8-11: Undersampling and Combined pipelines
param_grid_sampling = {
    'model__C': [0.1, 1, 10, 100],
    'model__gamma': ['scale', 0.001, 0.01, 0.1],
    'model__kernel': ['rbf', 'poly', 'linear'],
    'model__degree': [2, 3],
    'model__coef0': [0.0, 0.5]
}

# Map grids to configurations
hyperparameter_grids = {
    'Preprocessed Data': param_grid_1,
    'Normalized Data': param_grid_2,
    'SelectKBest': param_grid_3,
    'RFECV': param_grid_4,
    'PCA': param_grid_5,
    'SMOTE + StandardScaler': param_grid_smote,
    'SMOTE + GridSearchCV': param_grid_smote,
    'RandomUnderSampler': param_grid_sampling,
    'TomekLinks': param_grid_sampling,
    'NearMiss': param_grid_sampling,
    'SMOTE + Tomek': param_grid_sampling,
}

# Initialize configurations list
configurations = []

# --- Configuration 1: Preprocessed Data ---
configurations.append(('Preprocessed Data', 'array', X_train_preprocessed, X_val_preprocessed))
print("‚úì Configuration 1: Preprocessed Data")

# --- Configuration 2: Normalized Data (MinMax on preprocessed) ---
scaler_minmax = MinMaxScaler()
X_train_normalized = pd.DataFrame(
    scaler_minmax.fit_transform(X_train_preprocessed), 
    columns=X_train_preprocessed.columns, 
    index=X_train_preprocessed.index
)
X_val_normalized = pd.DataFrame(
    scaler_minmax.transform(X_val_preprocessed), 
    columns=X_val_preprocessed.columns, 
    index=X_val_preprocessed.index
)
configurations.append(('Normalized Data', 'array', X_train_normalized, X_val_normalized))
print("‚úì Configuration 2: Normalized Data (MinMax)")

# --- Configuration 3: SelectKBest ---
print("\n=== SelectKBest Feature Selection ===")
scores = []
max_features = min(X_train_normalized.shape[1], 20)

for k in range(1, max_features + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_tr_k = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        SVC(kernel='linear', random_state=RANDOM_STATE), 
        X_tr_k, y_train, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()
    scores.append(score)

optimal_k = int(np.argmax(scores) + 1)
print(f"Optimal number of features: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = pd.DataFrame(
    kbest.fit_transform(X_train_normalized, y_train), 
    columns=X_train_normalized.columns[kbest.get_support()]
)
X_val_kbest = pd.DataFrame(
    kbest.transform(X_val_normalized), 
    columns=X_train_kbest.columns
)
configurations.append(('SelectKBest', 'array', X_train_kbest, X_val_kbest))
print("‚úì Configuration 3: SelectKBest")

# --- Configuration 4: RFECV ---
print("\n=== RFECV Feature Selection ===")
svm_estimator = SVC(kernel='linear', random_state=RANDOM_STATE)
rfecv = RFECV(
    estimator=svm_estimator, 
    step=1, 
    cv=StratifiedKFold(5), 
    scoring='accuracy', 
    n_jobs=-1
)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=svm_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = pd.DataFrame(
    rfe.fit_transform(X_train_kbest, y_train), 
    columns=X_train_kbest.columns[rfe.get_support()]
)
X_val_rfe = pd.DataFrame(
    rfe.transform(X_val_kbest), 
    columns=X_train_rfe.columns
)
configurations.append(('RFECV', 'array', X_train_rfe, X_val_rfe))
print("‚úì Configuration 4: RFECV")

# --- Configuration 5: PCA ---
print("\n=== PCA Dimensionality Reduction ===")
pca_full = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
desired_variance = 0.95
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
n_components = max(2, n_components)
print(f'Number of components for {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_rfe), index=X_train_rfe.index)
X_val_pca = pd.DataFrame(pca.transform(X_val_rfe), index=X_val_rfe.index)
configurations.append(('PCA', 'array', X_train_pca, X_val_pca))
print("‚úì Configuration 5: PCA")

# --- Configuration 6: SMOTE + StandardScaler (Pipeline) ---
pipeline_smote_scaler = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + StandardScaler', 'pipeline', pipeline_smote_scaler, None))
print("‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)")

# --- Configuration 7: SMOTE + GridSearchCV (Pipeline) ---
pipeline_smote_grid = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + GridSearchCV', 'pipeline', pipeline_smote_grid, None))
print("‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)")

# =============================================================================
# UNDERSAMPLING CONFIGURATIONS
# =============================================================================
print("\n=== Adding Undersampling Configurations ===")

# --- Configuration 8: RandomUnderSampler ---
pipeline_rus = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', RandomUnderSampler(random_state=RANDOM_STATE)),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('RandomUnderSampler', 'pipeline', pipeline_rus, None))
print("‚úì Configuration 8: RandomUnderSampler (Undersampling)")

# --- Configuration 9: TomekLinks ---
pipeline_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', TomekLinks()),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('TomekLinks', 'pipeline', pipeline_tomek, None))
print("‚úì Configuration 9: TomekLinks (Undersampling - removes noisy samples)")

# --- Configuration 10: NearMiss ---
pipeline_nearmiss = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', NearMiss(version=1)),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('NearMiss', 'pipeline', pipeline_nearmiss, None))
print("‚úì Configuration 10: NearMiss (Undersampling - selective)")

# --- Configuration 11: SMOTE + Tomek (Combined) ---
pipeline_smote_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('sampler', SMOTETomek(random_state=RANDOM_STATE)),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + Tomek', 'pipeline', pipeline_smote_tomek, None))
print("‚úì Configuration 11: SMOTE + Tomek (Combined Over + Under)")

print(f"\nTotal configurations: {len(configurations)}")

# Safe ROC AUC helper
def safe_roc_auc(y_true, y_proba):
    try:
        if isinstance(y_proba, np.ndarray) and y_proba.shape[1] == 2:
            return metrics.roc_auc_score(y_true, y_proba[:, 1])
        else:
            return metrics.roc_auc_score(
                pd.get_dummies(y_true), y_proba, 
                multi_class='ovr', average='macro'
            )
    except Exception:
        return np.nan

# =============================================================================
# Run SVM with Configuration-Specific GridSearchCV
# =============================================================================
print("\n" + "="*80)
print("RUNNING SVM WITH CONFIGURATION-SPECIFIC HYPERPARAMETER TUNING")
print("="*80)

for name, kind, X_tr_cfg, X_val_cfg in configurations:
    print(f"\n{'='*80}")
    print(f"Configuration: {name}")
    print(f"{'='*80}")
    
    # Get the specific parameter grid for this configuration
    param_grid = hyperparameter_grids[name]
    
    print(f"Hyperparameter grid for '{name}':")
    for key, values in param_grid.items():
        print(f"  {key}: {values}")
    print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}")
    
    try:
        if kind == 'pipeline':
            pipeline = X_tr_cfg
            grid_search = GridSearchCV(
                pipeline, param_grid, 
                cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_train)
            y_val_pred = best_model.predict(X_val)
            y_train_proba = best_model.predict_proba(X_train)
            y_val_proba = best_model.predict_proba(X_val)
        else:
            grid_search = GridSearchCV(
                SVC(probability=True, random_state=RANDOM_STATE), 
                param_grid, cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_tr_cfg, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_tr_cfg)
            y_val_pred = best_model.predict(X_val_cfg)
            y_train_proba = best_model.predict_proba(X_tr_cfg)
            y_val_proba = best_model.predict_proba(X_val_cfg)

        # Calculate train-test gap for overfitting detection
        train_acc = metrics.accuracy_score(y_train, y_train_pred)
        test_acc = metrics.accuracy_score(y_val, y_val_pred)
        train_test_gap = train_acc - test_acc

        # Build metrics dictionary
        metrics_dict = {
            "Dataset": ["Training", "Test"],
            "Accuracy": [train_acc, test_acc],
            "F1 Score": [
                metrics.f1_score(y_train, y_train_pred, average='macro'),
                metrics.f1_score(y_val, y_val_pred, average='macro'),
            ],
            "Recall": [
                metrics.recall_score(y_train, y_train_pred, average='macro'),
                metrics.recall_score(y_val, y_val_pred, average='macro'),
            ],
            "Precision": [
                metrics.precision_score(y_train, y_train_pred, average='macro', zero_division=0),
                metrics.precision_score(y_val, y_val_pred, average='macro', zero_division=0),
            ],
            "AUC-ROC": [
                safe_roc_auc(y_train, y_train_proba),
                safe_roc_auc(y_val, y_val_proba),
            ]
        }

        df_metrics = pd.DataFrame(metrics_dict)
        pd.options.display.float_format = '{:.6f}'.format
        print("\nüìä Support Vector Machine Model Performance Metrics")
        print(df_metrics.to_string(index=False))

        # Overfitting warning
        if train_test_gap > 0.10:
            print(f"\n‚ö†Ô∏è  WARNING: Overfitting detected! Train-Test gap: {train_test_gap:.4f}")
        elif train_test_gap < 0.05:
            print(f"\n‚úì Good generalization. Train-Test gap: {train_test_gap:.4f}")
        else:
            print(f"\n‚Üí Acceptable gap: {train_test_gap:.4f}")

        # Store test metrics
        test_metrics = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
        try:
            store_results(
                'SVM',
                name,
                float(test_metrics['Accuracy']),
                float(test_metrics['F1 Score']),
                float(test_metrics['Recall']),
                float(test_metrics['Precision']),
                float(test_metrics['AUC-ROC'])
            )
        except:
            ML_Model.append('SVM')
            ML_Config.append(name)
            accuracy.append(round(float(test_metrics['Accuracy']), 6))
            f1.append(round(float(test_metrics['F1 Score']), 6))
            recall.append(round(float(test_metrics['Recall']), 6))
            precision.append(round(float(test_metrics['Precision']), 6))
            roc_auc.append(round(float(test_metrics['AUC-ROC']), 6))

        print("\nüéØ Best hyperparameters found:")
        best_params = grid_search.best_params_
        for param, value in best_params.items():
            print(f"  {param}: {value}")
        print(f"\nBest CV score: {grid_search.best_score_:.6f}")
        
    except Exception as e:
        print(f"‚ùå Error in configuration '{name}': {str(e)}")
        import traceback
        traceback.print_exc()
        continue

print("\n" + "="*80)
print("‚úÖ SVM evaluation complete for all configurations.")
print("="*80)

# Display final results
try:
    display_and_save_results('svm_all_configs')
except:
    result = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Recall': recall,
        'Precision': precision,
        'ROC_AUC': roc_auc,
    })
    print("\nüìà Final Results:")
    print(result.to_string(index=False))
    
    # Sort by F1 Score
    sorted_result = result.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)
    print("\nüèÜ Sorted Results (by F1 Score):")
    print(sorted_result.to_string(index=False))

    # Group by sampling technique
    print("\nüìä Performance by Sampling Technique:")
    print("-" * 80)
    sampling_groups = {
        'No Sampling': ['Preprocessed Data', 'Normalized Data', 'SelectKBest', 'RFECV', 'PCA'],
        'Oversampling': ['SMOTE + StandardScaler', 'SMOTE + GridSearchCV'],
        'Undersampling': ['RandomUnderSampler', 'TomekLinks', 'NearMiss'],
        'Combined': ['SMOTE + Tomek']
    }

    for group_name, configs in sampling_groups.items():
        group_data = result[result['Configuration'].isin(configs)]
        if not group_data.empty:
            print(f"\n{group_name}:")
            print(f"  Avg Accuracy: {group_data['Accuracy'].mean():.6f}")
            print(f"  Avg F1 Score: {group_data['F1 Score'].mean():.6f}")
            print(f"  Avg ROC-AUC: {group_data['ROC_AUC'].mean():.6f}")
            print(f"  Best Config: {group_data.loc[group_data['F1 Score'].idxmax(), 'Configuration']}")



Numeric features: ['Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF', 'Delay', 'Visit', 'MR Delay']
Categorical features: ['M/F']

Preprocessed data shape: (647, 13)
All features are now numeric: True
Results cleared!

=== START: SVM Configuration Sweep with Custom Hyperparameters ===

‚úì Configuration 1: Preprocessed Data
‚úì Configuration 2: Normalized Data (MinMax)

=== SelectKBest Feature Selection ===
Optimal number of features: 10
‚úì Configuration 3: SelectKBest

=== RFECV Feature Selection ===
Optimal number of features by RFECV: 3
‚úì Configuration 4: RFECV

=== PCA Dimensionality Reduction ===
Number of components for 95.0% variance: 3
‚úì Configuration 5: PCA
‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)
‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)

=== Adding Undersampling Configurations ===
‚úì Configuration 8: RandomUnderSampler (Undersampling)
‚úì Configuration 9: TomekLinks (Undersampling - removes noisy samples)
‚úì Configuration 10: NearMiss

### SVM with PCA 99

In [6]:

# =============================================================================
# SVM with Configuration-Specific Hyperparameter Grids
# =============================================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE, RFECV
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import metrics
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42

# Define preprocessor for categorical and numeric features
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")

# Create preprocessing transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Apply preprocessing to get fully numeric data first
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

# Get feature names after preprocessing
try:
    feature_names = (numeric_features + 
                    list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))
except:
    feature_names = [f'feature_{i}' for i in range(X_train_preprocessed.shape[1])]

X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=feature_names, index=X_train.index)
X_val_preprocessed = pd.DataFrame(X_val_preprocessed, columns=feature_names, index=X_val.index)

print(f"\nPreprocessed data shape: {X_train_preprocessed.shape}")
print(f"All features are now numeric: {X_train_preprocessed.select_dtypes(include=np.number).shape[1] == X_train_preprocessed.shape[1]}")

# Clear previous results
try:
    clear_results()
except:
    ML_Model = []
    ML_Config = []
    accuracy = []
    f1 = []
    recall = []
    precision = []
    roc_auc = []
    print("Initialized result storage lists.")

print("\n=== START: SVM Configuration Sweep with Custom Hyperparameters ===\n")

# =============================================================================
# Configuration-Specific Hyperparameter Grids
# =============================================================================

# Grid 1: Preprocessed Data - Focus on RBF and linear kernels
param_grid_1 = {
    'C': [0.01, 0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.001, 0.01],
    'kernel': ['rbf', 'linear'],
    'degree': [2],  # Not used for rbf/linear but required
    'coef0': [0.0]
}

# Grid 2: Normalized Data - Explore polynomial kernels
param_grid_2 = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 0.01, 0.1],
    'kernel': ['poly', 'rbf'],
    'degree': [2, 3, 4],
    'coef0': [0.0, 0.5, 1.0]
}

# Grid 3: SelectKBest - Focus on simpler models
param_grid_3 = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.01],
    'kernel': ['rbf', 'poly', 'linear'],
    'degree': [2, 3],
    'coef0': [0.0, 0.1]
}

# Grid 4: RFECV - Similar to SelectKBest but different ranges
param_grid_4 = {
    'C': [0.5, 1, 5, 10],
    'gamma': ['scale', 0.001, 0.01],
    'kernel': ['rbf', 'sigmoid'],
    'degree': [2, 3],
    'coef0': [0.0, 0.5]
}

# Grid 5: PCA - Focus on linear and sigmoid
param_grid_5 = {
    'C': [0.1, 1, 10, 50],
    'gamma': ['scale', 'auto', 0.01],
    'kernel': ['linear', 'sigmoid', 'rbf'],
    'degree': [2],
    'coef0': [0.0, 0.5, 1.0]
}

# Grid 6 & 7: SMOTE pipelines - Balanced approach (Oversampling)
param_grid_smote = {
    'model__C': [0.1, 1, 10, 100],
    'model__gamma': ['scale', 0.001, 0.01, 0.1],
    'model__kernel': ['rbf', 'poly', 'linear'],
    'model__degree': [2, 3],
    'model__coef0': [0.0, 0.5]
}

# Grid 8-11: Undersampling and Combined pipelines
param_grid_sampling = {
    'model__C': [0.1, 1, 10, 100],
    'model__gamma': ['scale', 0.001, 0.01, 0.1],
    'model__kernel': ['rbf', 'poly', 'linear'],
    'model__degree': [2, 3],
    'model__coef0': [0.0, 0.5]
}

# Map grids to configurations
hyperparameter_grids = {
    'Preprocessed Data': param_grid_1,
    'Normalized Data': param_grid_2,
    'SelectKBest': param_grid_3,
    'RFECV': param_grid_4,
    'PCA': param_grid_5,
    'SMOTE + StandardScaler': param_grid_smote,
    'SMOTE + GridSearchCV': param_grid_smote,
    'RandomUnderSampler': param_grid_sampling,
    'TomekLinks': param_grid_sampling,
    'NearMiss': param_grid_sampling,
    'SMOTE + Tomek': param_grid_sampling,
}

# Initialize configurations list
configurations = []

# --- Configuration 1: Preprocessed Data ---
configurations.append(('Preprocessed Data', 'array', X_train_preprocessed, X_val_preprocessed))
print("‚úì Configuration 1: Preprocessed Data")

# --- Configuration 2: Normalized Data (MinMax on preprocessed) ---
scaler_minmax = MinMaxScaler()
X_train_normalized = pd.DataFrame(
    scaler_minmax.fit_transform(X_train_preprocessed), 
    columns=X_train_preprocessed.columns, 
    index=X_train_preprocessed.index
)
X_val_normalized = pd.DataFrame(
    scaler_minmax.transform(X_val_preprocessed), 
    columns=X_val_preprocessed.columns, 
    index=X_val_preprocessed.index
)
configurations.append(('Normalized Data', 'array', X_train_normalized, X_val_normalized))
print("‚úì Configuration 2: Normalized Data (MinMax)")

# --- Configuration 3: SelectKBest ---
print("\n=== SelectKBest Feature Selection ===")
scores = []
max_features = min(X_train_normalized.shape[1], 20)

for k in range(1, max_features + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_tr_k = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        SVC(kernel='linear', random_state=RANDOM_STATE), 
        X_tr_k, y_train, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()
    scores.append(score)

optimal_k = int(np.argmax(scores) + 1)
print(f"Optimal number of features: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = pd.DataFrame(
    kbest.fit_transform(X_train_normalized, y_train), 
    columns=X_train_normalized.columns[kbest.get_support()]
)
X_val_kbest = pd.DataFrame(
    kbest.transform(X_val_normalized), 
    columns=X_train_kbest.columns
)
configurations.append(('SelectKBest', 'array', X_train_kbest, X_val_kbest))
print("‚úì Configuration 3: SelectKBest")

# --- Configuration 4: RFECV ---
print("\n=== RFECV Feature Selection ===")
svm_estimator = SVC(kernel='linear', random_state=RANDOM_STATE)
rfecv = RFECV(
    estimator=svm_estimator, 
    step=1, 
    cv=StratifiedKFold(5), 
    scoring='accuracy', 
    n_jobs=-1
)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=svm_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = pd.DataFrame(
    rfe.fit_transform(X_train_kbest, y_train), 
    columns=X_train_kbest.columns[rfe.get_support()]
)
X_val_rfe = pd.DataFrame(
    rfe.transform(X_val_kbest), 
    columns=X_train_rfe.columns
)
configurations.append(('RFECV', 'array', X_train_rfe, X_val_rfe))
print("‚úì Configuration 4: RFECV")

# --- Configuration 5: PCA ---
print("\n=== PCA Dimensionality Reduction ===")
pca_full = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
desired_variance = 0.99
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
n_components = max(2, n_components)
print(f'Number of components for {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_rfe), index=X_train_rfe.index)
X_val_pca = pd.DataFrame(pca.transform(X_val_rfe), index=X_val_rfe.index)
configurations.append(('PCA', 'array', X_train_pca, X_val_pca))
print("‚úì Configuration 5: PCA")

# --- Configuration 6: SMOTE + StandardScaler (Pipeline) ---
pipeline_smote_scaler = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + StandardScaler', 'pipeline', pipeline_smote_scaler, None))
print("‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)")

# --- Configuration 7: SMOTE + GridSearchCV (Pipeline) ---
pipeline_smote_grid = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + GridSearchCV', 'pipeline', pipeline_smote_grid, None))
print("‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)")

# =============================================================================
# UNDERSAMPLING CONFIGURATIONS
# =============================================================================
print("\n=== Adding Undersampling Configurations ===")

# --- Configuration 8: RandomUnderSampler ---
pipeline_rus = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', RandomUnderSampler(random_state=RANDOM_STATE)),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('RandomUnderSampler', 'pipeline', pipeline_rus, None))
print("‚úì Configuration 8: RandomUnderSampler (Undersampling)")

# --- Configuration 9: TomekLinks ---
pipeline_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', TomekLinks()),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('TomekLinks', 'pipeline', pipeline_tomek, None))
print("‚úì Configuration 9: TomekLinks (Undersampling - removes noisy samples)")

# --- Configuration 10: NearMiss ---
pipeline_nearmiss = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', NearMiss(version=1)),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('NearMiss', 'pipeline', pipeline_nearmiss, None))
print("‚úì Configuration 10: NearMiss (Undersampling - selective)")

# --- Configuration 11: SMOTE + Tomek (Combined) ---
pipeline_smote_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('sampler', SMOTETomek(random_state=RANDOM_STATE)),
    ('model', SVC(probability=True, random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + Tomek', 'pipeline', pipeline_smote_tomek, None))
print("‚úì Configuration 11: SMOTE + Tomek (Combined Over + Under)")

print(f"\nTotal configurations: {len(configurations)}")

# Safe ROC AUC helper
def safe_roc_auc(y_true, y_proba):
    try:
        if isinstance(y_proba, np.ndarray) and y_proba.shape[1] == 2:
            return metrics.roc_auc_score(y_true, y_proba[:, 1])
        else:
            return metrics.roc_auc_score(
                pd.get_dummies(y_true), y_proba, 
                multi_class='ovr', average='macro'
            )
    except Exception:
        return np.nan

# =============================================================================
# Run SVM with Configuration-Specific GridSearchCV
# =============================================================================
print("\n" + "="*80)
print("RUNNING SVM WITH CONFIGURATION-SPECIFIC HYPERPARAMETER TUNING")
print("="*80)

for name, kind, X_tr_cfg, X_val_cfg in configurations:
    print(f"\n{'='*80}")
    print(f"Configuration: {name}")
    print(f"{'='*80}")
    
    # Get the specific parameter grid for this configuration
    param_grid = hyperparameter_grids[name]
    
    print(f"Hyperparameter grid for '{name}':")
    for key, values in param_grid.items():
        print(f"  {key}: {values}")
    print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}")
    
    try:
        if kind == 'pipeline':
            pipeline = X_tr_cfg
            grid_search = GridSearchCV(
                pipeline, param_grid, 
                cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_train)
            y_val_pred = best_model.predict(X_val)
            y_train_proba = best_model.predict_proba(X_train)
            y_val_proba = best_model.predict_proba(X_val)
        else:
            grid_search = GridSearchCV(
                SVC(probability=True, random_state=RANDOM_STATE), 
                param_grid, cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_tr_cfg, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_tr_cfg)
            y_val_pred = best_model.predict(X_val_cfg)
            y_train_proba = best_model.predict_proba(X_tr_cfg)
            y_val_proba = best_model.predict_proba(X_val_cfg)

        # Calculate train-test gap for overfitting detection
        train_acc = metrics.accuracy_score(y_train, y_train_pred)
        test_acc = metrics.accuracy_score(y_val, y_val_pred)
        train_test_gap = train_acc - test_acc

        # Build metrics dictionary
        metrics_dict = {
            "Dataset": ["Training", "Test"],
            "Accuracy": [train_acc, test_acc],
            "F1 Score": [
                metrics.f1_score(y_train, y_train_pred, average='macro'),
                metrics.f1_score(y_val, y_val_pred, average='macro'),
            ],
            "Recall": [
                metrics.recall_score(y_train, y_train_pred, average='macro'),
                metrics.recall_score(y_val, y_val_pred, average='macro'),
            ],
            "Precision": [
                metrics.precision_score(y_train, y_train_pred, average='macro', zero_division=0),
                metrics.precision_score(y_val, y_val_pred, average='macro', zero_division=0),
            ],
            "AUC-ROC": [
                safe_roc_auc(y_train, y_train_proba),
                safe_roc_auc(y_val, y_val_proba),
            ]
        }

        df_metrics = pd.DataFrame(metrics_dict)
        pd.options.display.float_format = '{:.6f}'.format
        print("\nüìä Support Vector Machine Model Performance Metrics")
        print(df_metrics.to_string(index=False))

        # Overfitting warning
        if train_test_gap > 0.10:
            print(f"\n‚ö†Ô∏è  WARNING: Overfitting detected! Train-Test gap: {train_test_gap:.4f}")
        elif train_test_gap < 0.05:
            print(f"\n‚úì Good generalization. Train-Test gap: {train_test_gap:.4f}")
        else:
            print(f"\n‚Üí Acceptable gap: {train_test_gap:.4f}")

        # Store test metrics
        test_metrics = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
        try:
            store_results(
                'SVM',
                name,
                float(test_metrics['Accuracy']),
                float(test_metrics['F1 Score']),
                float(test_metrics['Recall']),
                float(test_metrics['Precision']),
                float(test_metrics['AUC-ROC'])
            )
        except:
            ML_Model.append('SVM')
            ML_Config.append(name)
            accuracy.append(round(float(test_metrics['Accuracy']), 6))
            f1.append(round(float(test_metrics['F1 Score']), 6))
            recall.append(round(float(test_metrics['Recall']), 6))
            precision.append(round(float(test_metrics['Precision']), 6))
            roc_auc.append(round(float(test_metrics['AUC-ROC']), 6))

        print("\nüéØ Best hyperparameters found:")
        best_params = grid_search.best_params_
        for param, value in best_params.items():
            print(f"  {param}: {value}")
        print(f"\nBest CV score: {grid_search.best_score_:.6f}")
        
    except Exception as e:
        print(f"‚ùå Error in configuration '{name}': {str(e)}")
        import traceback
        traceback.print_exc()
        continue

print("\n" + "="*80)
print("‚úÖ SVM evaluation complete for all configurations.")
print("="*80)

# Display final results
try:
    display_and_save_results('svm_all_configs')
except:
    result = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Recall': recall,
        'Precision': precision,
        'ROC_AUC': roc_auc,
    })
    print("\nüìà Final Results:")
    print(result.to_string(index=False))
    
    # Sort by F1 Score
    sorted_result = result.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)
    print("\nüèÜ Sorted Results (by F1 Score):")
    print(sorted_result.to_string(index=False))

    # Group by sampling technique
    print("\nüìä Performance by Sampling Technique:")
    print("-" * 80)
    sampling_groups = {
        'No Sampling': ['Preprocessed Data', 'Normalized Data', 'SelectKBest', 'RFECV', 'PCA'],
        'Oversampling': ['SMOTE + StandardScaler', 'SMOTE + GridSearchCV'],
        'Undersampling': ['RandomUnderSampler', 'TomekLinks', 'NearMiss'],
        'Combined': ['SMOTE + Tomek']
    }

    for group_name, configs in sampling_groups.items():
        group_data = result[result['Configuration'].isin(configs)]
        if not group_data.empty:
            print(f"\n{group_name}:")
            print(f"  Avg Accuracy: {group_data['Accuracy'].mean():.6f}")
            print(f"  Avg F1 Score: {group_data['F1 Score'].mean():.6f}")
            print(f"  Avg ROC-AUC: {group_data['ROC_AUC'].mean():.6f}")
            print(f"  Best Config: {group_data.loc[group_data['F1 Score'].idxmax(), 'Configuration']}")



Numeric features: ['Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF', 'Delay', 'Visit', 'MR Delay']
Categorical features: ['M/F']

Preprocessed data shape: (647, 13)
All features are now numeric: True
Results cleared!

=== START: SVM Configuration Sweep with Custom Hyperparameters ===

‚úì Configuration 1: Preprocessed Data
‚úì Configuration 2: Normalized Data (MinMax)

=== SelectKBest Feature Selection ===
Optimal number of features: 10
‚úì Configuration 3: SelectKBest

=== RFECV Feature Selection ===
Optimal number of features by RFECV: 3
‚úì Configuration 4: RFECV

=== PCA Dimensionality Reduction ===
Number of components for 99.0% variance: 3
‚úì Configuration 5: PCA
‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)
‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)

=== Adding Undersampling Configurations ===
‚úì Configuration 8: RandomUnderSampler (Undersampling)
‚úì Configuration 9: TomekLinks (Undersampling - removes noisy samples)
‚úì Configuration 10: NearMiss

---

# Random Forest

### Random Forest with PCA 90

In [10]:
# =============================================================================
# Random Forest with Configuration-Specific Hyperparameter Grids
# =============================================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE, RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import metrics
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42

# Define preprocessor for categorical and numeric features
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")

# Create preprocessing transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Apply preprocessing to get fully numeric data first
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

# Get feature names after preprocessing
try:
    feature_names = (numeric_features + 
                    list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))
except:
    feature_names = [f'feature_{i}' for i in range(X_train_preprocessed.shape[1])]

X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=feature_names, index=X_train.index)
X_val_preprocessed = pd.DataFrame(X_val_preprocessed, columns=feature_names, index=X_val.index)

print(f"\nPreprocessed data shape: {X_train_preprocessed.shape}")
print(f"All features are now numeric: {X_train_preprocessed.select_dtypes(include=np.number).shape[1] == X_train_preprocessed.shape[1]}")

# Clear previous results
try:
    clear_results()
except:
    ML_Model = []
    ML_Config = []
    accuracy = []
    f1 = []
    recall = []
    precision = []
    roc_auc = []
    print("Initialized result storage lists.")

print("\n=== START: Random Forest Configuration Sweep with Custom Hyperparameters ===\n")

# =============================================================================
# Configuration-Specific Hyperparameter Grids for Random Forest
# =============================================================================

# Grid 1: Preprocessed Data - Balanced exploration
param_grid_1 = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True]
}

# Grid 2: Normalized Data - Focus on deeper trees
param_grid_2 = {
    'n_estimators': [100, 200, 300],
    'max_depth': [15, 25, 35, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2', None],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

# Grid 3: SelectKBest - More conservative (prevent overfitting)
param_grid_3 = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [2, 4, 6],
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini'],
    'bootstrap': [True]
}

# Grid 4: RFECV - Focus on feature importance
param_grid_4 = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2', None],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True]
}

# Grid 5: PCA - Simpler models for reduced dimensions
param_grid_5 = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [10, 15, 20, 25, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

# Grid 6 & 7: SMOTE pipelines - Handle imbalanced data (Oversampling)
param_grid_smote = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [10, 20, 30, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['sqrt', 'log2'],
    'model__criterion': ['gini', 'entropy'],
    'model__bootstrap': [True]
}

# Grid 8-11: Undersampling and Combined pipelines
param_grid_sampling = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [10, 20, 30, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['sqrt', 'log2'],
    'model__criterion': ['gini', 'entropy'],
    'model__bootstrap': [True]
}

# Map grids to configurations
hyperparameter_grids = {
    'Preprocessed Data': param_grid_1,
    'Normalized Data': param_grid_2,
    'SelectKBest': param_grid_3,
    'RFECV': param_grid_4,
    'PCA': param_grid_5,
    'SMOTE + StandardScaler': param_grid_smote,
    'SMOTE + GridSearchCV': param_grid_smote,
    'RandomUnderSampler': param_grid_sampling,
    'TomekLinks': param_grid_sampling,
    'NearMiss': param_grid_sampling,
    'SMOTE + Tomek': param_grid_sampling,
}

# Initialize configurations list
configurations = []

# --- Configuration 1: Preprocessed Data ---
configurations.append(('Preprocessed Data', 'array', X_train_preprocessed, X_val_preprocessed))
print("‚úì Configuration 1: Preprocessed Data")

# --- Configuration 2: Normalized Data (MinMax on preprocessed) ---
scaler_minmax = MinMaxScaler()
X_train_normalized = pd.DataFrame(
    scaler_minmax.fit_transform(X_train_preprocessed), 
    columns=X_train_preprocessed.columns, 
    index=X_train_preprocessed.index
)
X_val_normalized = pd.DataFrame(
    scaler_minmax.transform(X_val_preprocessed), 
    columns=X_val_preprocessed.columns, 
    index=X_val_preprocessed.index
)
configurations.append(('Normalized Data', 'array', X_train_normalized, X_val_normalized))
print("‚úì Configuration 2: Normalized Data (MinMax)")

# --- Configuration 3: SelectKBest ---
print("\n=== SelectKBest Feature Selection ===")
scores = []
max_features = min(X_train_normalized.shape[1], 20)

for k in range(1, max_features + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_tr_k = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        RandomForestClassifier(n_estimators=50, random_state=RANDOM_STATE), 
        X_tr_k, y_train, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()
    scores.append(score)

optimal_k = int(np.argmax(scores) + 1)
print(f"Optimal number of features: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = pd.DataFrame(
    kbest.fit_transform(X_train_normalized, y_train), 
    columns=X_train_normalized.columns[kbest.get_support()]
)
X_val_kbest = pd.DataFrame(
    kbest.transform(X_val_normalized), 
    columns=X_train_kbest.columns
)
configurations.append(('SelectKBest', 'array', X_train_kbest, X_val_kbest))
print("‚úì Configuration 3: SelectKBest")

# --- Configuration 4: RFECV ---
print("\n=== RFECV Feature Selection ===")
rf_estimator = RandomForestClassifier(n_estimators=50, random_state=RANDOM_STATE)
rfecv = RFECV(
    estimator=rf_estimator, 
    step=1, 
    cv=StratifiedKFold(5), 
    scoring='accuracy', 
    n_jobs=-1
)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=rf_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = pd.DataFrame(
    rfe.fit_transform(X_train_kbest, y_train), 
    columns=X_train_kbest.columns[rfe.get_support()]
)
X_val_rfe = pd.DataFrame(
    rfe.transform(X_val_kbest), 
    columns=X_train_rfe.columns
)
configurations.append(('RFECV', 'array', X_train_rfe, X_val_rfe))
print("‚úì Configuration 4: RFECV")

# --- Configuration 5: PCA ---
print("\n=== PCA Dimensionality Reduction ===")
pca_full = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
desired_variance = 0.90
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
n_components = max(2, n_components)
print(f'Number of components for {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_rfe), index=X_train_rfe.index)
X_val_pca = pd.DataFrame(pca.transform(X_val_rfe), index=X_val_rfe.index)
configurations.append(('PCA', 'array', X_train_pca, X_val_pca))
print("‚úì Configuration 5: PCA")

# --- Configuration 6: SMOTE + StandardScaler (Pipeline) ---
pipeline_smote_scaler = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', RandomForestClassifier(random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + StandardScaler', 'pipeline', pipeline_smote_scaler, None))
print("‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)")

# --- Configuration 7: SMOTE + GridSearchCV (Pipeline) ---
pipeline_smote_grid = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', RandomForestClassifier(random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + GridSearchCV', 'pipeline', pipeline_smote_grid, None))
print("‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)")

# =============================================================================
# UNDERSAMPLING CONFIGURATIONS
# =============================================================================
print("\n=== Adding Undersampling Configurations ===")

# --- Configuration 8: RandomUnderSampler ---
pipeline_rus = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', RandomUnderSampler(random_state=RANDOM_STATE)),
    ('model', RandomForestClassifier(random_state=RANDOM_STATE))
])
configurations.append(('RandomUnderSampler', 'pipeline', pipeline_rus, None))
print("‚úì Configuration 8: RandomUnderSampler (Undersampling)")

# --- Configuration 9: TomekLinks ---
pipeline_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', TomekLinks()),
    ('model', RandomForestClassifier(random_state=RANDOM_STATE))
])
configurations.append(('TomekLinks', 'pipeline', pipeline_tomek, None))
print("‚úì Configuration 9: TomekLinks (Undersampling - removes noisy samples)")

# --- Configuration 10: NearMiss ---
pipeline_nearmiss = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', NearMiss(version=1)),
    ('model', RandomForestClassifier(random_state=RANDOM_STATE))
])
configurations.append(('NearMiss', 'pipeline', pipeline_nearmiss, None))
print("‚úì Configuration 10: NearMiss (Undersampling - selective)")

# --- Configuration 11: SMOTE + Tomek (Combined) ---
pipeline_smote_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('sampler', SMOTETomek(random_state=RANDOM_STATE)),
    ('model', RandomForestClassifier(random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + Tomek', 'pipeline', pipeline_smote_tomek, None))
print("‚úì Configuration 11: SMOTE + Tomek (Combined Over + Under)")

print(f"\nTotal configurations: {len(configurations)}")

# Safe ROC AUC helper
def safe_roc_auc(y_true, y_proba):
    try:
        if isinstance(y_proba, np.ndarray) and y_proba.shape[1] == 2:
            return metrics.roc_auc_score(y_true, y_proba[:, 1])
        else:
            return metrics.roc_auc_score(
                pd.get_dummies(y_true), y_proba, 
                multi_class='ovr', average='macro'
            )
    except Exception:
        return np.nan

# =============================================================================
# Run Random Forest with Configuration-Specific GridSearchCV
# =============================================================================
print("\n" + "="*80)
print("RUNNING RANDOM FOREST WITH CONFIGURATION-SPECIFIC HYPERPARAMETER TUNING")
print("="*80)

for name, kind, X_tr_cfg, X_val_cfg in configurations:
    print(f"\n{'='*80}")
    print(f"Configuration: {name}")
    print(f"{'='*80}")
    
    # Get the specific parameter grid for this configuration
    param_grid = hyperparameter_grids[name]
    
    print(f"Hyperparameter grid for '{name}':")
    for key, values in param_grid.items():
        print(f"  {key}: {values}")
    print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}")
    
    try:
        if kind == 'pipeline':
            pipeline = X_tr_cfg
            grid_search = GridSearchCV(
                pipeline, param_grid, 
                cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_train)
            y_val_pred = best_model.predict(X_val)
            y_train_proba = best_model.predict_proba(X_train)
            y_val_proba = best_model.predict_proba(X_val)
        else:
            grid_search = GridSearchCV(
                RandomForestClassifier(random_state=RANDOM_STATE), 
                param_grid, cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_tr_cfg, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_tr_cfg)
            y_val_pred = best_model.predict(X_val_cfg)
            y_train_proba = best_model.predict_proba(X_tr_cfg)
            y_val_proba = best_model.predict_proba(X_val_cfg)

        # Calculate train-test gap for overfitting detection
        train_acc = metrics.accuracy_score(y_train, y_train_pred)
        test_acc = metrics.accuracy_score(y_val, y_val_pred)
        train_test_gap = train_acc - test_acc

        # Build metrics dictionary
        metrics_dict = {
            "Dataset": ["Training", "Test"],
            "Accuracy": [train_acc, test_acc],
            "F1 Score": [
                metrics.f1_score(y_train, y_train_pred, average='macro'),
                metrics.f1_score(y_val, y_val_pred, average='macro'),
            ],
            "Recall": [
                metrics.recall_score(y_train, y_train_pred, average='macro'),
                metrics.recall_score(y_val, y_val_pred, average='macro'),
            ],
            "Precision": [
                metrics.precision_score(y_train, y_train_pred, average='macro', zero_division=0),
                metrics.precision_score(y_val, y_val_pred, average='macro', zero_division=0),
            ],
            "AUC-ROC": [
                safe_roc_auc(y_train, y_train_proba),
                safe_roc_auc(y_val, y_val_proba),
            ]
        }

        df_metrics = pd.DataFrame(metrics_dict)
        pd.options.display.float_format = '{:.6f}'.format
        print("\nüìä Random Forest Model Performance Metrics")
        print(df_metrics.to_string(index=False))

        # Overfitting warning
        if train_test_gap > 0.10:
            print(f"\n‚ö†Ô∏è  WARNING: Overfitting detected! Train-Test gap: {train_test_gap:.4f}")
        elif train_test_gap < 0.05:
            print(f"\n‚úì Good generalization. Train-Test gap: {train_test_gap:.4f}")
        else:
            print(f"\n‚Üí Acceptable gap: {train_test_gap:.4f}")

        # Store test metrics
        test_metrics = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
        try:
            store_results(
                'Random Forest',
                name,
                float(test_metrics['Accuracy']),
                float(test_metrics['F1 Score']),
                float(test_metrics['Recall']),
                float(test_metrics['Precision']),
                float(test_metrics['AUC-ROC'])
            )
        except:
            ML_Model.append('Random Forest')
            ML_Config.append(name)
            accuracy.append(round(float(test_metrics['Accuracy']), 6))
            f1.append(round(float(test_metrics['F1 Score']), 6))
            recall.append(round(float(test_metrics['Recall']), 6))
            precision.append(round(float(test_metrics['Precision']), 6))
            roc_auc.append(round(float(test_metrics['AUC-ROC']), 6))

        print("\nüéØ Best hyperparameters found:")
        best_params = grid_search.best_params_
        for param, value in best_params.items():
            print(f"  {param}: {value}")
        print(f"\nBest CV score: {grid_search.best_score_:.6f}")
        
        # Feature importance for non-pipeline configurations
        if kind == 'array' and hasattr(best_model, 'feature_importances_'):
            print("\nüå≤ Top 10 Most Important Features:")
            importances = best_model.feature_importances_
            indices = np.argsort(importances)[::-1][:10]
            feature_cols = X_tr_cfg.columns if hasattr(X_tr_cfg, 'columns') else [f'Feature {i}' for i in range(len(importances))]
            for i, idx in enumerate(indices, 1):
                print(f"  {i}. {feature_cols[idx]}: {importances[idx]:.4f}")
        
    except Exception as e:
        print(f"‚ùå Error in configuration '{name}': {str(e)}")
        import traceback
        traceback.print_exc()
        continue

print("\n" + "="*80)
print("‚úÖ Random Forest evaluation complete for all configurations.")
print("="*80)

# Display final results
try:
    display_and_save_results('random_forest_all_configs')
except:
    result = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Recall': recall,
        'Precision': precision,
        'ROC_AUC': roc_auc,
    })
    print("\nüìà Final Results:")
    print(result.to_string(index=False))
    
    # Sort by F1 Score
    sorted_result = result.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)
    print("\nüèÜ Sorted Results (by F1 Score):")
    print(sorted_result.to_string(index=False))

    # Group by sampling technique
    print("\nüìä Performance by Sampling Technique:")
    print("-" * 80)
    sampling_groups = {
        'No Sampling': ['Preprocessed Data', 'Normalized Data', 'SelectKBest', 'RFECV', 'PCA'],
        'Oversampling': ['SMOTE + StandardScaler', 'SMOTE + GridSearchCV'],
        'Undersampling': ['RandomUnderSampler', 'TomekLinks', 'NearMiss'],
        'Combined': ['SMOTE + Tomek']
    }

    for group_name, configs in sampling_groups.items():
        group_data = result[result['Configuration'].isin(configs)]
        if not group_data.empty:
            print(f"\n{group_name}:")
            print(f"  Avg Accuracy: {group_data['Accuracy'].mean():.6f}")
            print(f"  Avg F1 Score: {group_data['F1 Score'].mean():.6f}")
            print(f"  Avg ROC-AUC: {group_data['ROC_AUC'].mean():.6f}")
            print(f"  Best Config: {group_data.loc[group_data['F1 Score'].idxmax(), 'Configuration']}")
    
    # Compare with baseline
    if len(result) > 0:
        best_idx = result['F1 Score'].idxmax()
        print(f"\nüèÖ Best Overall Configuration:")
        print(f"  Configuration: {result.loc[best_idx, 'Configuration']}")
        print(f"  F1 Score: {result.loc[best_idx, 'F1 Score']:.6f}")
        print(f"  Accuracy: {result.loc[best_idx, 'Accuracy']:.6f}")
        print(f"  ROC-AUC: {result.loc[best_idx, 'ROC_AUC']:.6f}")


Numeric features: ['Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF', 'Delay', 'Visit', 'MR Delay']
Categorical features: ['M/F']

Preprocessed data shape: (647, 13)
All features are now numeric: True
Results cleared!

=== START: Random Forest Configuration Sweep with Custom Hyperparameters ===

‚úì Configuration 1: Preprocessed Data
‚úì Configuration 2: Normalized Data (MinMax)

=== SelectKBest Feature Selection ===
Optimal number of features: 12
‚úì Configuration 3: SelectKBest

=== RFECV Feature Selection ===
Optimal number of features by RFECV: 2
‚úì Configuration 4: RFECV

=== PCA Dimensionality Reduction ===
Number of components for 90.0% variance: 2
‚úì Configuration 5: PCA
‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)
‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)

=== Adding Undersampling Configurations ===
‚úì Configuration 8: RandomUnderSampler (Undersampling)
‚úì Configuration 9: TomekLinks (Undersampling - removes noisy samples)
‚úì Configuration 10

---

# Gradient Boosting

### Gradient Boosting with PCA 90

In [None]:
# =============================================================================
# Gradient Boosting with Configuration-Specific Hyperparameter Grids
# =============================================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE, RFECV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import metrics
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42

# Define preprocessor for categorical and numeric features
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")

# Create preprocessing transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Apply preprocessing to get fully numeric data first
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

# Get feature names after preprocessing
try:
    feature_names = (numeric_features + 
                    list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))
except:
    feature_names = [f'feature_{i}' for i in range(X_train_preprocessed.shape[1])]

X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=feature_names, index=X_train.index)
X_val_preprocessed = pd.DataFrame(X_val_preprocessed, columns=feature_names, index=X_val.index)

print(f"\nPreprocessed data shape: {X_train_preprocessed.shape}")
print(f"All features are now numeric: {X_train_preprocessed.select_dtypes(include=np.number).shape[1] == X_train_preprocessed.shape[1]}")

# Clear previous results
try:
    clear_results()
except:
    ML_Model = []
    ML_Config = []
    accuracy = []
    f1 = []
    recall = []
    precision = []
    roc_auc = []
    print("Initialized result storage lists.")

print("\n=== START: Gradient Boosting Configuration Sweep with Custom Hyperparameters ===\n")

# =============================================================================
# Configuration-Specific Hyperparameter Grids for Gradient Boosting
# =============================================================================

# Grid 1: Preprocessed Data - Balanced exploration
param_grid_1 = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 1.0],
    'max_features': ['sqrt', 'log2']
}

# Grid 2: Normalized Data - Focus on learning rate and subsample
param_grid_2 = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'subsample': [0.6, 0.8, 1.0],
    'max_features': ['sqrt', 'log2', None]
}

# Grid 3: SelectKBest - More conservative (prevent overfitting)
param_grid_3 = {
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [2, 3, 4, 5],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [2, 4, 6],
    'subsample': [0.7, 0.8, 0.9],
    'max_features': ['sqrt', 'log2']
}

# Grid 4: RFECV - Focus on sequential boosting
param_grid_4 = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3],
    'subsample': [0.7, 0.8, 1.0],
    'max_features': ['sqrt', 'log2']
}

# Grid 5: PCA - Simpler models for reduced dimensions
param_grid_5 = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 4, 5, 6, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 3],
    'subsample': [0.8, 1.0],
    'max_features': ['sqrt', 'log2']
}

# Grid 6 & 7: SMOTE pipelines - Handle imbalanced data (Oversampling)
param_grid_smote = {
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [3, 4, 5],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__subsample': [0.7, 0.8, 1.0],
    'model__max_features': ['sqrt', 'log2']
}

# Grid 8-11: Undersampling and Combined pipelines
param_grid_sampling = {
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [3, 4, 5],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__subsample': [0.7, 0.8, 1.0],
    'model__max_features': ['sqrt', 'log2']
}

# Map grids to configurations
hyperparameter_grids = {
    'Preprocessed Data': param_grid_1,
    'Normalized Data': param_grid_2,
    'SelectKBest': param_grid_3,
    'RFECV': param_grid_4,
    'PCA': param_grid_5,
    'SMOTE + StandardScaler': param_grid_smote,
    'SMOTE + GridSearchCV': param_grid_smote,
    'RandomUnderSampler': param_grid_sampling,
    'TomekLinks': param_grid_sampling,
    'NearMiss': param_grid_sampling,
    'SMOTE + Tomek': param_grid_sampling,
}

# Initialize configurations list
configurations = []

# --- Configuration 1: Preprocessed Data ---
configurations.append(('Preprocessed Data', 'array', X_train_preprocessed, X_val_preprocessed))
print("‚úì Configuration 1: Preprocessed Data")

# --- Configuration 2: Normalized Data (MinMax on preprocessed) ---
scaler_minmax = MinMaxScaler()
X_train_normalized = pd.DataFrame(
    scaler_minmax.fit_transform(X_train_preprocessed), 
    columns=X_train_preprocessed.columns, 
    index=X_train_preprocessed.index
)
X_val_normalized = pd.DataFrame(
    scaler_minmax.transform(X_val_preprocessed), 
    columns=X_val_preprocessed.columns, 
    index=X_val_preprocessed.index
)
configurations.append(('Normalized Data', 'array', X_train_normalized, X_val_normalized))
print("‚úì Configuration 2: Normalized Data (MinMax)")

# --- Configuration 3: SelectKBest ---
print("\n=== SelectKBest Feature Selection ===")
scores = []
max_features = min(X_train_normalized.shape[1], 20)

for k in range(1, max_features + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_tr_k = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, random_state=RANDOM_STATE), 
        X_tr_k, y_train, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()
    scores.append(score)

optimal_k = int(np.argmax(scores) + 1)
print(f"Optimal number of features: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = pd.DataFrame(
    kbest.fit_transform(X_train_normalized, y_train), 
    columns=X_train_normalized.columns[kbest.get_support()]
)
X_val_kbest = pd.DataFrame(
    kbest.transform(X_val_normalized), 
    columns=X_train_kbest.columns
)
configurations.append(('SelectKBest', 'array', X_train_kbest, X_val_kbest))
print("‚úì Configuration 3: SelectKBest")

# --- Configuration 4: RFECV ---
print("\n=== RFECV Feature Selection ===")
gb_estimator = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, random_state=RANDOM_STATE)
rfecv = RFECV(
    estimator=gb_estimator, 
    step=1, 
    cv=StratifiedKFold(5), 
    scoring='accuracy', 
    n_jobs=-1
)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=gb_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = pd.DataFrame(
    rfe.fit_transform(X_train_kbest, y_train), 
    columns=X_train_kbest.columns[rfe.get_support()]
)
X_val_rfe = pd.DataFrame(
    rfe.transform(X_val_kbest), 
    columns=X_train_rfe.columns
)
configurations.append(('RFECV', 'array', X_train_rfe, X_val_rfe))
print("‚úì Configuration 4: RFECV")

# --- Configuration 5: PCA ---
print("\n=== PCA Dimensionality Reduction ===")
pca_full = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
desired_variance = 0.90
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
n_components = max(2, n_components)
print(f'Number of components for {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_rfe), index=X_train_rfe.index)
X_val_pca = pd.DataFrame(pca.transform(X_val_rfe), index=X_val_rfe.index)
configurations.append(('PCA', 'array', X_train_pca, X_val_pca))
print("‚úì Configuration 5: PCA")

# --- Configuration 6: SMOTE + StandardScaler (Pipeline) ---
pipeline_smote_scaler = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', GradientBoostingClassifier(random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + StandardScaler', 'pipeline', pipeline_smote_scaler, None))
print("‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)")

# --- Configuration 7: SMOTE + GridSearchCV (Pipeline) ---
pipeline_smote_grid = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', GradientBoostingClassifier(random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + GridSearchCV', 'pipeline', pipeline_smote_grid, None))
print("‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)")

# =============================================================================
# UNDERSAMPLING CONFIGURATIONS
# =============================================================================
print("\n=== Adding Undersampling Configurations ===")

# --- Configuration 8: RandomUnderSampler ---
pipeline_rus = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', RandomUnderSampler(random_state=RANDOM_STATE)),
    ('model', GradientBoostingClassifier(random_state=RANDOM_STATE))
])
configurations.append(('RandomUnderSampler', 'pipeline', pipeline_rus, None))
print("‚úì Configuration 8: RandomUnderSampler (Undersampling)")

# --- Configuration 9: TomekLinks ---
pipeline_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', TomekLinks()),
    ('model', GradientBoostingClassifier(random_state=RANDOM_STATE))
])
configurations.append(('TomekLinks', 'pipeline', pipeline_tomek, None))
print("‚úì Configuration 9: TomekLinks (Undersampling - removes noisy samples)")

# --- Configuration 10: NearMiss ---
pipeline_nearmiss = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', NearMiss(version=1)),
    ('model', GradientBoostingClassifier(random_state=RANDOM_STATE))
])
configurations.append(('NearMiss', 'pipeline', pipeline_nearmiss, None))
print("‚úì Configuration 10: NearMiss (Undersampling - selective)")

# --- Configuration 11: SMOTE + Tomek (Combined) ---
pipeline_smote_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('sampler', SMOTETomek(random_state=RANDOM_STATE)),
    ('model', GradientBoostingClassifier(random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + Tomek', 'pipeline', pipeline_smote_tomek, None))
print("‚úì Configuration 11: SMOTE + Tomek (Combined Over + Under)")

print(f"\nTotal configurations: {len(configurations)}")

# Safe ROC AUC helper
def safe_roc_auc(y_true, y_proba):
    try:
        if isinstance(y_proba, np.ndarray) and y_proba.shape[1] == 2:
            return metrics.roc_auc_score(y_true, y_proba[:, 1])
        else:
            return metrics.roc_auc_score(
                pd.get_dummies(y_true), y_proba, 
                multi_class='ovr', average='macro'
            )
    except Exception:
        return np.nan

# =============================================================================
# Run Gradient Boosting with Configuration-Specific GridSearchCV
# =============================================================================
print("\n" + "="*80)
print("RUNNING GRADIENT BOOSTING WITH CONFIGURATION-SPECIFIC HYPERPARAMETER TUNING")
print("="*80)

for name, kind, X_tr_cfg, X_val_cfg in configurations:
    print(f"\n{'='*80}")
    print(f"Configuration: {name}")
    print(f"{'='*80}")
    
    # Get the specific parameter grid for this configuration
    param_grid = hyperparameter_grids[name]
    
    print(f"Hyperparameter grid for '{name}':")
    for key, values in param_grid.items():
        print(f"  {key}: {values}")
    print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}")
    
    try:
        if kind == 'pipeline':
            pipeline = X_tr_cfg
            grid_search = GridSearchCV(
                pipeline, param_grid, 
                cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_train)
            y_val_pred = best_model.predict(X_val)
            y_train_proba = best_model.predict_proba(X_train)
            y_val_proba = best_model.predict_proba(X_val)
        else:
            grid_search = GridSearchCV(
                GradientBoostingClassifier(random_state=RANDOM_STATE), 
                param_grid, cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_tr_cfg, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_tr_cfg)
            y_val_pred = best_model.predict(X_val_cfg)
            y_train_proba = best_model.predict_proba(X_tr_cfg)
            y_val_proba = best_model.predict_proba(X_val_cfg)

        # Calculate train-test gap for overfitting detection
        train_acc = metrics.accuracy_score(y_train, y_train_pred)
        test_acc = metrics.accuracy_score(y_val, y_val_pred)
        train_test_gap = train_acc - test_acc

        # Build metrics dictionary
        metrics_dict = {
            "Dataset": ["Training", "Test"],
            "Accuracy": [train_acc, test_acc],
            "F1 Score": [
                metrics.f1_score(y_train, y_train_pred, average='macro'),
                metrics.f1_score(y_val, y_val_pred, average='macro'),
            ],
            "Recall": [
                metrics.recall_score(y_train, y_train_pred, average='macro'),
                metrics.recall_score(y_val, y_val_pred, average='macro'),
            ],
            "Precision": [
                metrics.precision_score(y_train, y_train_pred, average='macro', zero_division=0),
                metrics.precision_score(y_val, y_val_pred, average='macro', zero_division=0),
            ],
            "AUC-ROC": [
                safe_roc_auc(y_train, y_train_proba),
                safe_roc_auc(y_val, y_val_proba),
            ]
        }

        df_metrics = pd.DataFrame(metrics_dict)
        pd.options.display.float_format = '{:.6f}'.format
        print("\nüìä Gradient Boosting Model Performance Metrics")
        print(df_metrics.to_string(index=False))

        # Overfitting warning
        if train_test_gap > 0.10:
            print(f"\n‚ö†Ô∏è  WARNING: Overfitting detected! Train-Test gap: {train_test_gap:.4f}")
        elif train_test_gap < 0.05:
            print(f"\n‚úì Good generalization. Train-Test gap: {train_test_gap:.4f}")
        else:
            print(f"\n‚Üí Acceptable gap: {train_test_gap:.4f}")

        # Store test metrics
        test_metrics = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
        try:
            store_results(
                'Gradient Boosting',
                name,
                float(test_metrics['Accuracy']),
                float(test_metrics['F1 Score']),
                float(test_metrics['Recall']),
                float(test_metrics['Precision']),
                float(test_metrics['AUC-ROC'])
            )
        except:
            ML_Model.append('Gradient Boosting')
            ML_Config.append(name)
            accuracy.append(round(float(test_metrics['Accuracy']), 6))
            f1.append(round(float(test_metrics['F1 Score']), 6))
            recall.append(round(float(test_metrics['Recall']), 6))
            precision.append(round(float(test_metrics['Precision']), 6))
            roc_auc.append(round(float(test_metrics['AUC-ROC']), 6))

        print("\nüéØ Best hyperparameters found:")
        best_params = grid_search.best_params_
        for param, value in best_params.items():
            print(f"  {param}: {value}")
        print(f"\nBest CV score: {grid_search.best_score_:.6f}")
        
        # Feature importance for non-pipeline configurations
        if kind == 'array' and hasattr(best_model, 'feature_importances_'):
            print("\nüéØ Top 10 Most Important Features:")
            importances = best_model.feature_importances_
            indices = np.argsort(importances)[::-1][:10]
            feature_cols = X_tr_cfg.columns if hasattr(X_tr_cfg, 'columns') else [f'Feature {i}' for i in range(len(importances))]
            for i, idx in enumerate(indices, 1):
                print(f"  {i}. {feature_cols[idx]}: {importances[idx]:.4f}")
        
    except Exception as e:
        print(f"‚ùå Error in configuration '{name}': {str(e)}")
        import traceback
        traceback.print_exc()
        continue

print("\n" + "="*80)
print("‚úÖ Gradient Boosting evaluation complete for all configurations.")
print("="*80)

# Display final results
try:
    display_and_save_results('gradient_boosting_all_configs')
except:
    result = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Recall': recall,
        'Precision': precision,
        'ROC_AUC': roc_auc,
    })
    print("\nüìà Final Results:")
    print(result.to_string(index=False))
    
    # Sort by F1 Score
    sorted_result = result.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)
    print("\nüèÜ Sorted Results (by F1 Score):")
    print(sorted_result.to_string(index=False))

    # Group by sampling technique
    print("\nüìä Performance by Sampling Technique:")
    print("-" * 80)
    sampling_groups = {
        'No Sampling': ['Preprocessed Data', 'Normalized Data', 'SelectKBest', 'RFECV', 'PCA'],
        'Oversampling': ['SMOTE + StandardScaler', 'SMOTE + GridSearchCV'],
        'Undersampling': ['RandomUnderSampler', 'TomekLinks', 'NearMiss'],
        'Combined': ['SMOTE + Tomek']
    }

    for group_name, configs in sampling_groups.items():
        group_data = result[result['Configuration'].isin(configs)]
        if not group_data.empty:
            print(f"\n{group_name}:")
            print(f"  Avg Accuracy: {group_data['Accuracy'].mean():.6f}")
            print(f"  Avg F1 Score: {group_data['F1 Score'].mean():.6f}")
            print(f"  Avg ROC-AUC: {group_data['ROC_AUC'].mean():.6f}")
            print(f"  Best Config: {group_data.loc[group_data['F1 Score'].idxmax(), 'Configuration']}")
    
    # Compare with baseline
    if len(result) > 0:
        best_idx = result['F1 Score'].idxmax()
        print(f"\nüèÖ Best Overall Configuration:")
        print(f"  Configuration: {result.loc[best_idx, 'Configuration']}")
        print(f"  F1 Score: {result.loc[best_idx, 'F1 Score']:.6f}")
        print(f"  Accuracy: {result.loc[best_idx, 'Accuracy']:.6f}")
        print(f"  ROC-AUC: {result.loc[best_idx, 'ROC_AUC']:.6f}")


Numeric features: ['Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF', 'Delay', 'Visit', 'MR Delay']
Categorical features: ['M/F']

Preprocessed data shape: (647, 13)
All features are now numeric: True
Initialized result storage lists.

=== START: Gradient Boosting Configuration Sweep with Custom Hyperparameters ===

‚úì Configuration 1: Preprocessed Data
‚úì Configuration 2: Normalized Data (MinMax)

=== SelectKBest Feature Selection ===
Optimal number of features: 6
‚úì Configuration 3: SelectKBest

=== RFECV Feature Selection ===
Optimal number of features by RFECV: 2
‚úì Configuration 4: RFECV

=== PCA Dimensionality Reduction ===
Number of components for 90.0% variance: 2
‚úì Configuration 5: PCA
‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)
‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)

=== Adding Undersampling Configurations ===
‚úì Configuration 8: RandomUnderSampler (Undersampling)
‚úì Configuration 9: TomekLinks (Undersampling - removes noisy samples)


---

# Decision Tree

In [None]:
# =============================================================================
# Decision Tree with Configuration-Specific Hyperparameter Grids
# =============================================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE, RFECV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import metrics
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42

# Define preprocessor for categorical and numeric features
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")

# Create preprocessing transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Apply preprocessing to get fully numeric data first
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

# Get feature names after preprocessing
try:
    feature_names = (numeric_features + 
                    list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))
except:
    feature_names = [f'feature_{i}' for i in range(X_train_preprocessed.shape[1])]

X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=feature_names, index=X_train.index)
X_val_preprocessed = pd.DataFrame(X_val_preprocessed, columns=feature_names, index=X_val.index)

print(f"\nPreprocessed data shape: {X_train_preprocessed.shape}")
print(f"All features are now numeric: {X_train_preprocessed.select_dtypes(include=np.number).shape[1] == X_train_preprocessed.shape[1]}")

# Clear previous results
try:
    clear_results()
except:
    ML_Model = []
    ML_Config = []
    accuracy = []
    f1 = []
    recall = []
    precision = []
    roc_auc = []
    print("Initialized result storage lists.")

print("\n=== START: Decision Tree Configuration Sweep with Custom Hyperparameters ===\n")

# =============================================================================
# Configuration-Specific Hyperparameter Grids for Decision Tree
# =============================================================================

# Grid 1: Preprocessed Data - Balanced exploration with pruning
param_grid_1 = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [3, 5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2', None],
    'min_impurity_decrease': [0.0, 0.0001, 0.001],
    'ccp_alpha': [0.0, 0.001, 0.01]
}

# Grid 2: Normalized Data - Focus on depth and splitting criteria
param_grid_2 = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best'],
    'max_depth': [5, 10, 15, 20, 25, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'min_impurity_decrease': [0.0, 0.0001, 0.0005, 0.001],
    'ccp_alpha': [0.0]
}

# Grid 3: SelectKBest - More conservative (prevent overfitting)
param_grid_3 = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best'],
    'max_depth': [3, 5, 8, 10, 15],
    'min_samples_split': [5, 10, 15, 20],
    'min_samples_leaf': [2, 4, 6, 8, 10],
    'max_features': ['sqrt', 'log2'],
    'min_impurity_decrease': [0.0001, 0.001, 0.005],
    'ccp_alpha': [0.0, 0.005, 0.01, 0.02]
}

# Grid 4: RFECV - Explore complexity with pruning
param_grid_4 = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': ['sqrt', 'log2', None],
    'min_impurity_decrease': [0.0, 0.0005, 0.001, 0.005],
    'ccp_alpha': [0.0, 0.001, 0.005]
}

# Grid 5: PCA - Simpler trees for reduced dimensions
param_grid_5 = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best'],
    'max_depth': [5, 10, 15, 20, 25, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': ['sqrt', 'log2'],
    'min_impurity_decrease': [0.0, 0.0001, 0.001, 0.01],
    'ccp_alpha': [0.0, 0.001, 0.01]
}

# Grid 6 & 7: SMOTE pipelines - Handle imbalanced data (Oversampling)
param_grid_smote = {
    'model__criterion': ['gini', 'entropy'],
    'model__splitter': ['best'],
    'model__max_depth': [5, 10, 15, 20, None],
    'model__min_samples_split': [2, 5, 10, 20],
    'model__min_samples_leaf': [1, 2, 4, 8],
    'model__max_features': ['sqrt', 'log2'],
    'model__min_impurity_decrease': [0.0, 0.0001, 0.001],
    'model__ccp_alpha': [0.0, 0.001, 0.01]
}

# Grid 8-11: Undersampling and Combined pipelines
param_grid_sampling = {
    'model__criterion': ['gini', 'entropy'],
    'model__splitter': ['best'],
    'model__max_depth': [5, 10, 15, 20, None],
    'model__min_samples_split': [2, 5, 10, 20],
    'model__min_samples_leaf': [1, 2, 4, 8],
    'model__max_features': ['sqrt', 'log2'],
    'model__min_impurity_decrease': [0.0, 0.0001, 0.001],
    'model__ccp_alpha': [0.0, 0.001, 0.01]
}

# Map grids to configurations
hyperparameter_grids = {
    'Preprocessed Data': param_grid_1,
    'Normalized Data': param_grid_2,
    'SelectKBest': param_grid_3,
    'RFECV': param_grid_4,
    'PCA': param_grid_5,
    'SMOTE + StandardScaler': param_grid_smote,
    'SMOTE + GridSearchCV': param_grid_smote,
    'RandomUnderSampler': param_grid_sampling,
    'TomekLinks': param_grid_sampling,
    'NearMiss': param_grid_sampling,
    'SMOTE + Tomek': param_grid_sampling,
}

# Initialize configurations list
configurations = []

# --- Configuration 1: Preprocessed Data ---
configurations.append(('Preprocessed Data', 'array', X_train_preprocessed, X_val_preprocessed))
print("‚úì Configuration 1: Preprocessed Data")

# --- Configuration 2: Normalized Data (MinMax on preprocessed) ---
scaler_minmax = MinMaxScaler()
X_train_normalized = pd.DataFrame(
    scaler_minmax.fit_transform(X_train_preprocessed), 
    columns=X_train_preprocessed.columns, 
    index=X_train_preprocessed.index
)
X_val_normalized = pd.DataFrame(
    scaler_minmax.transform(X_val_preprocessed), 
    columns=X_val_preprocessed.columns, 
    index=X_val_preprocessed.index
)
configurations.append(('Normalized Data', 'array', X_train_normalized, X_val_normalized))
print("‚úì Configuration 2: Normalized Data (MinMax)")

# --- Configuration 3: SelectKBest ---
print("\n=== SelectKBest Feature Selection ===")
scores = []
max_features = min(X_train_normalized.shape[1], 20)

for k in range(1, max_features + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_tr_k = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        DecisionTreeClassifier(random_state=RANDOM_STATE), 
        X_tr_k, y_train, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()
    scores.append(score)

optimal_k = int(np.argmax(scores) + 1)
print(f"Optimal number of features: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = pd.DataFrame(
    kbest.fit_transform(X_train_normalized, y_train), 
    columns=X_train_normalized.columns[kbest.get_support()]
)
X_val_kbest = pd.DataFrame(
    kbest.transform(X_val_normalized), 
    columns=X_train_kbest.columns
)
configurations.append(('SelectKBest', 'array', X_train_kbest, X_val_kbest))
print("‚úì Configuration 3: SelectKBest")

# --- Configuration 4: RFECV ---
print("\n=== RFECV Feature Selection ===")
dt_estimator = DecisionTreeClassifier(random_state=RANDOM_STATE)
rfecv = RFECV(
    estimator=dt_estimator, 
    step=1, 
    cv=StratifiedKFold(5), 
    scoring='accuracy', 
    n_jobs=-1
)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=dt_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = pd.DataFrame(
    rfe.fit_transform(X_train_kbest, y_train), 
    columns=X_train_kbest.columns[rfe.get_support()]
)
X_val_rfe = pd.DataFrame(
    rfe.transform(X_val_kbest), 
    columns=X_train_rfe.columns
)
configurations.append(('RFECV', 'array', X_train_rfe, X_val_rfe))
print("‚úì Configuration 4: RFECV")

# --- Configuration 5: PCA ---
print("\n=== PCA Dimensionality Reduction ===")
pca_full = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
desired_variance = 0.90
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
n_components = max(2, n_components)
print(f'Number of components for {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_rfe), index=X_train_rfe.index)
X_val_pca = pd.DataFrame(pca.transform(X_val_rfe), index=X_val_rfe.index)
configurations.append(('PCA', 'array', X_train_pca, X_val_pca))
print("‚úì Configuration 5: PCA")

# --- Configuration 6: SMOTE + StandardScaler (Pipeline) ---
pipeline_smote_scaler = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', DecisionTreeClassifier(random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + StandardScaler', 'pipeline', pipeline_smote_scaler, None))
print("‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)")

# --- Configuration 7: SMOTE + GridSearchCV (Pipeline) ---
pipeline_smote_grid = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', DecisionTreeClassifier(random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + GridSearchCV', 'pipeline', pipeline_smote_grid, None))
print("‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)")

# =============================================================================
# UNDERSAMPLING CONFIGURATIONS
# =============================================================================
print("\n=== Adding Undersampling Configurations ===")

# --- Configuration 8: RandomUnderSampler ---
pipeline_rus = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', RandomUnderSampler(random_state=RANDOM_STATE)),
    ('model', DecisionTreeClassifier(random_state=RANDOM_STATE))
])
configurations.append(('RandomUnderSampler', 'pipeline', pipeline_rus, None))
print("‚úì Configuration 8: RandomUnderSampler (Undersampling)")

# --- Configuration 9: TomekLinks ---
pipeline_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', TomekLinks()),
    ('model', DecisionTreeClassifier(random_state=RANDOM_STATE))
])
configurations.append(('TomekLinks', 'pipeline', pipeline_tomek, None))
print("‚úì Configuration 9: TomekLinks (Undersampling - removes noisy samples)")

# --- Configuration 10: NearMiss ---
pipeline_nearmiss = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', NearMiss(version=1)),
    ('model', DecisionTreeClassifier(random_state=RANDOM_STATE))
])
configurations.append(('NearMiss', 'pipeline', pipeline_nearmiss, None))
print("‚úì Configuration 10: NearMiss (Undersampling - selective)")

# --- Configuration 11: SMOTE + Tomek (Combined) ---
pipeline_smote_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('sampler', SMOTETomek(random_state=RANDOM_STATE)),
    ('model', DecisionTreeClassifier(random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + Tomek', 'pipeline', pipeline_smote_tomek, None))
print("‚úì Configuration 11: SMOTE + Tomek (Combined Over + Under)")

print(f"\nTotal configurations: {len(configurations)}")

# Safe ROC AUC helper
def safe_roc_auc(y_true, y_proba):
    try:
        if isinstance(y_proba, np.ndarray) and y_proba.shape[1] == 2:
            return metrics.roc_auc_score(y_true, y_proba[:, 1])
        else:
            return metrics.roc_auc_score(
                pd.get_dummies(y_true), y_proba, 
                multi_class='ovr', average='macro'
            )
    except Exception:
        return np.nan

# =============================================================================
# Run Decision Tree with Configuration-Specific GridSearchCV
# =============================================================================
print("\n" + "="*80)
print("RUNNING DECISION TREE WITH CONFIGURATION-SPECIFIC HYPERPARAMETER TUNING")
print("="*80)

for name, kind, X_tr_cfg, X_val_cfg in configurations:
    print(f"\n{'='*80}")
    print(f"Configuration: {name}")
    print(f"{'='*80}")
    
    # Get the specific parameter grid for this configuration
    param_grid = hyperparameter_grids[name]
    
    print(f"Hyperparameter grid for '{name}':")
    for key, values in param_grid.items():
        print(f"  {key}: {values}")
    print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}")
    
    try:
        if kind == 'pipeline':
            pipeline = X_tr_cfg
            grid_search = GridSearchCV(
                pipeline, param_grid, 
                cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_train)
            y_val_pred = best_model.predict(X_val)
            y_train_proba = best_model.predict_proba(X_train)
            y_val_proba = best_model.predict_proba(X_val)
        else:
            grid_search = GridSearchCV(
                DecisionTreeClassifier(random_state=RANDOM_STATE), 
                param_grid, cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_tr_cfg, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_tr_cfg)
            y_val_pred = best_model.predict(X_val_cfg)
            y_train_proba = best_model.predict_proba(X_tr_cfg)
            y_val_proba = best_model.predict_proba(X_val_cfg)

        # Calculate train-test gap for overfitting detection
        train_acc = metrics.accuracy_score(y_train, y_train_pred)
        test_acc = metrics.accuracy_score(y_val, y_val_pred)
        train_test_gap = train_acc - test_acc

        # Build metrics dictionary
        metrics_dict = {
            "Dataset": ["Training", "Test"],
            "Accuracy": [train_acc, test_acc],
            "F1 Score": [
                metrics.f1_score(y_train, y_train_pred, average='macro'),
                metrics.f1_score(y_val, y_val_pred, average='macro'),
            ],
            "Recall": [
                metrics.recall_score(y_train, y_train_pred, average='macro'),
                metrics.recall_score(y_val, y_val_pred, average='macro'),
            ],
            "Precision": [
                metrics.precision_score(y_train, y_train_pred, average='macro', zero_division=0),
                metrics.precision_score(y_val, y_val_pred, average='macro', zero_division=0),
            ],
            "AUC-ROC": [
                safe_roc_auc(y_train, y_train_proba),
                safe_roc_auc(y_val, y_val_proba),
            ]
        }

        df_metrics = pd.DataFrame(metrics_dict)
        pd.options.display.float_format = '{:.6f}'.format
        print("\nüìä Decision Tree Model Performance Metrics")
        print(df_metrics.to_string(index=False))

        # Overfitting warning (Decision Trees are prone to overfitting)
        if train_test_gap > 0.10:
            print(f"\n‚ö†Ô∏è  WARNING: Overfitting detected! Train-Test gap: {train_test_gap:.4f}")
            print("   Consider: increasing min_samples_split, min_samples_leaf, or ccp_alpha")
        elif train_test_gap < 0.05:
            print(f"\n‚úì Excellent generalization. Train-Test gap: {train_test_gap:.4f}")
        else:
            print(f"\n‚Üí Acceptable gap: {train_test_gap:.4f}")

        # Store test metrics
        test_metrics = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
        try:
            store_results(
                'Decision Tree',
                name,
                float(test_metrics['Accuracy']),
                float(test_metrics['F1 Score']),
                float(test_metrics['Recall']),
                float(test_metrics['Precision']),
                float(test_metrics['AUC-ROC'])
            )
        except:
            ML_Model.append('Decision Tree')
            ML_Config.append(name)
            accuracy.append(round(float(test_metrics['Accuracy']), 6))
            f1.append(round(float(test_metrics['F1 Score']), 6))
            recall.append(round(float(test_metrics['Recall']), 6))
            precision.append(round(float(test_metrics['Precision']), 6))
            roc_auc.append(round(float(test_metrics['AUC-ROC']), 6))

        print("\nüéØ Best hyperparameters found:")
        best_params = grid_search.best_params_
        for param, value in best_params.items():
            print(f"  {param}: {value}")
        print(f"\nBest CV score: {grid_search.best_score_:.6f}")
        
        # Tree complexity metrics
        if kind == 'array':
            tree_model = best_model
        else:
            tree_model = best_model.named_steps['model']
        
        print(f"\nüå≥ Tree Complexity Metrics:")
        print(f"  Number of leaves: {tree_model.get_n_leaves()}")
        print(f"  Tree depth: {tree_model.get_depth()}")
        
        # Feature importance for non-pipeline configurations
        if kind == 'array' and hasattr(best_model, 'feature_importances_'):
            print("\nüìä Top 10 Most Important Features:")
            importances = best_model.feature_importances_
            indices = np.argsort(importances)[::-1][:10]
            feature_cols = X_tr_cfg.columns if hasattr(X_tr_cfg, 'columns') else [f'Feature {i}' for i in range(len(importances))]
            for i, idx in enumerate(indices, 1):
                print(f"  {i}. {feature_cols[idx]}: {importances[idx]:.4f}")
        
    except Exception as e:
        print(f"‚ùå Error in configuration '{name}': {str(e)}")
        import traceback
        traceback.print_exc()
        continue

print("\n" + "="*80)
print("‚úÖ Decision Tree evaluation complete for all configurations.")
print("="*80)

# Display final results
try:
    display_and_save_results('decision_tree_all_configs')
except:
    result = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Recall': recall,
        'Precision': precision,
        'ROC_AUC': roc_auc,
    })
    print("\nüìà Final Results:")
    print(result.to_string(index=False))
    
    # Sort by F1 Score
    sorted_result = result.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)
    print("\nüèÜ Sorted Results (by F1 Score):")
    print(sorted_result.to_string(index=False))

    # Group by sampling technique
    print("\nüìä Performance by Sampling Technique:")
    print("-" * 80)
    sampling_groups = {
        'No Sampling': ['Preprocessed Data', 'Normalized Data', 'SelectKBest', 'RFECV', 'PCA'],
        'Oversampling': ['SMOTE + StandardScaler', 'SMOTE + GridSearchCV'],
        'Undersampling': ['RandomUnderSampler', 'TomekLinks', 'NearMiss'],
        'Combined': ['SMOTE + Tomek']
    }

    for group_name, configs in sampling_groups.items():
        group_data = result[result['Configuration'].isin(configs)]
        if not group_data.empty:
            print(f"\n{group_name}:")
            print(f"  Avg Accuracy: {group_data['Accuracy'].mean():.6f}")
            print(f"  Avg F1 Score: {group_data['F1 Score'].mean():.6f}")
            print(f"  Avg ROC-AUC: {group_data['ROC_AUC'].mean():.6f}")
            print(f"  Best Config: {group_data.loc[group_data['F1 Score'].idxmax(), 'Configuration']}")
    
    # Compare with baseline
    if len(result) > 0:
        best_idx = result['F1 Score'].idxmax()
        print(f"\nüèÖ Best Overall Configuration:")
        print(f"  Configuration: {result.loc[best_idx, 'Configuration']}")
        print(f"  F1 Score: {result.loc[best_idx, 'F1 Score']:.6f}")
        print(f"  Accuracy: {result.loc[best_idx, 'Accuracy']:.6f}")
        print(f"  ROC-AUC: {result.loc[best_idx, 'ROC_AUC']:.6f}")


---

# LightGBM

In [None]:
# =============================================================================
# LightGBM with Configuration-Specific Hyperparameter Grids
# =============================================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE, RFECV
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import metrics
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42

# Define preprocessor for categorical and numeric features
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")

# Create preprocessing transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Apply preprocessing to get fully numeric data first
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

# Get feature names after preprocessing
try:
    feature_names = (numeric_features + 
                    list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))
except:
    feature_names = [f'feature_{i}' for i in range(X_train_preprocessed.shape[1])]

X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=feature_names, index=X_train.index)
X_val_preprocessed = pd.DataFrame(X_val_preprocessed, columns=feature_names, index=X_val.index)

print(f"\nPreprocessed data shape: {X_train_preprocessed.shape}")
print(f"All features are now numeric: {X_train_preprocessed.select_dtypes(include=np.number).shape[1] == X_train_preprocessed.shape[1]}")

# Clear previous results
try:
    clear_results()
except:
    ML_Model = []
    ML_Config = []
    accuracy = []
    f1 = []
    recall = []
    precision = []
    roc_auc = []
    print("Initialized result storage lists.")

print("\n=== START: LightGBM Configuration Sweep with Custom Hyperparameters ===\n")

# =============================================================================
# Configuration-Specific Hyperparameter Grids for LightGBM
# =============================================================================

# Grid 1: Preprocessed Data - Balanced exploration
param_grid_1 = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'num_leaves': [31, 50, 70],
    'max_depth': [-1, 5, 10],
    'min_child_samples': [10, 20, 30],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'reg_alpha': [0.0, 0.1],
    'reg_lambda': [0.0, 0.1]
}

# Grid 2: Normalized Data - Focus on tree complexity
param_grid_2 = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 500],
    'num_leaves': [31, 50, 70, 100],
    'max_depth': [-1, 7, 10],
    'min_child_samples': [10, 20, 40],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0.0, 0.1, 0.5],
    'reg_lambda': [0.0, 0.1, 0.5]
}

# Grid 3: SelectKBest - More conservative (prevent overfitting)
param_grid_3 = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'num_leaves': [20, 31, 50],
    'max_depth': [3, 5, 7],
    'min_child_samples': [20, 40, 60],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8],
    'reg_alpha': [0.1, 0.5, 1.0],
    'reg_lambda': [0.1, 0.5, 1.0]
}

# Grid 4: RFECV - Focus on feature importance
param_grid_4 = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300, 400],
    'num_leaves': [31, 50, 70],
    'max_depth': [-1, 5, 10, 15],
    'min_child_samples': [10, 20, 30],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'reg_alpha': [0.0, 0.1, 0.5],
    'reg_lambda': [0.0, 0.1, 0.5]
}

# Grid 5: PCA - Optimized for reduced dimensions
param_grid_5 = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 500],
    'num_leaves': [31, 50, 70, 100],
    'max_depth': [-1, 7, 10],
    'min_child_samples': [10, 20, 30],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'reg_alpha': [0.0, 0.1],
    'reg_lambda': [0.0, 0.1]
}

# Grid 6 & 7: SMOTE pipelines - Handle imbalanced data (Oversampling)
param_grid_smote = {
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__n_estimators': [100, 200, 300],
    'model__num_leaves': [31, 50, 70],
    'model__max_depth': [-1, 5, 10],
    'model__min_child_samples': [10, 20, 30],
    'model__subsample': [0.7, 0.8, 1.0],
    'model__colsample_bytree': [0.7, 0.8],
    'model__reg_alpha': [0.0, 0.1],
    'model__reg_lambda': [0.0, 0.1]
}

# Grid 8-11: Undersampling and Combined pipelines
param_grid_sampling = {
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__n_estimators': [100, 200, 300],
    'model__num_leaves': [31, 50, 70],
    'model__max_depth': [-1, 5, 10],
    'model__min_child_samples': [10, 20, 30],
    'model__subsample': [0.7, 0.8, 1.0],
    'model__colsample_bytree': [0.7, 0.8],
    'model__reg_alpha': [0.0, 0.1],
    'model__reg_lambda': [0.0, 0.1]
}

# Map grids to configurations
hyperparameter_grids = {
    'Preprocessed Data': param_grid_1,
    'Normalized Data': param_grid_2,
    'SelectKBest': param_grid_3,
    'RFECV': param_grid_4,
    'PCA': param_grid_5,
    'SMOTE + StandardScaler': param_grid_smote,
    'SMOTE + GridSearchCV': param_grid_smote,
    'RandomUnderSampler': param_grid_sampling,
    'TomekLinks': param_grid_sampling,
    'NearMiss': param_grid_sampling,
    'SMOTE + Tomek': param_grid_sampling,
}

# Initialize configurations list
configurations = []

# --- Configuration 1: Preprocessed Data ---
configurations.append(('Preprocessed Data', 'array', X_train_preprocessed, X_val_preprocessed))
print("‚úì Configuration 1: Preprocessed Data")

# --- Configuration 2: Normalized Data (MinMax on preprocessed) ---
scaler_minmax = MinMaxScaler()
X_train_normalized = pd.DataFrame(
    scaler_minmax.fit_transform(X_train_preprocessed), 
    columns=X_train_preprocessed.columns, 
    index=X_train_preprocessed.index
)
X_val_normalized = pd.DataFrame(
    scaler_minmax.transform(X_val_preprocessed), 
    columns=X_val_preprocessed.columns, 
    index=X_val_preprocessed.index
)
configurations.append(('Normalized Data', 'array', X_train_normalized, X_val_normalized))
print("‚úì Configuration 2: Normalized Data (MinMax)")

# --- Configuration 3: SelectKBest ---
print("\n=== SelectKBest Feature Selection ===")
scores = []
max_features = min(X_train_normalized.shape[1], 20)

for k in range(1, max_features + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_tr_k = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        LGBMClassifier(n_estimators=50, learning_rate=0.1, random_state=RANDOM_STATE, verbose=-1), 
        X_tr_k, y_train, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()
    scores.append(score)

optimal_k = int(np.argmax(scores) + 1)
print(f"Optimal number of features: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = pd.DataFrame(
    kbest.fit_transform(X_train_normalized, y_train), 
    columns=X_train_normalized.columns[kbest.get_support()]
)
X_val_kbest = pd.DataFrame(
    kbest.transform(X_val_normalized), 
    columns=X_train_kbest.columns
)
configurations.append(('SelectKBest', 'array', X_train_kbest, X_val_kbest))
print("‚úì Configuration 3: SelectKBest")

# --- Configuration 4: RFECV ---
print("\n=== RFECV Feature Selection ===")
lgbm_estimator = LGBMClassifier(n_estimators=50, learning_rate=0.1, random_state=RANDOM_STATE, verbose=-1)
rfecv = RFECV(
    estimator=lgbm_estimator, 
    step=1, 
    cv=StratifiedKFold(5), 
    scoring='accuracy', 
    n_jobs=-1
)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=lgbm_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = pd.DataFrame(
    rfe.fit_transform(X_train_kbest, y_train), 
    columns=X_train_kbest.columns[rfe.get_support()]
)
X_val_rfe = pd.DataFrame(
    rfe.transform(X_val_kbest), 
    columns=X_train_rfe.columns
)
configurations.append(('RFECV', 'array', X_train_rfe, X_val_rfe))
print("‚úì Configuration 4: RFECV")

# --- Configuration 5: PCA ---
print("\n=== PCA Dimensionality Reduction ===")
pca_full = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
desired_variance = 0.90
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
n_components = max(2, n_components)
print(f'Number of components for {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_rfe), index=X_train_rfe.index)
X_val_pca = pd.DataFrame(pca.transform(X_val_rfe), index=X_val_rfe.index)
configurations.append(('PCA', 'array', X_train_pca, X_val_pca))
print("‚úì Configuration 5: PCA")

# --- Configuration 6: SMOTE + StandardScaler (Pipeline) ---
pipeline_smote_scaler = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', LGBMClassifier(random_state=RANDOM_STATE, verbose=-1))
])
configurations.append(('SMOTE + StandardScaler', 'pipeline', pipeline_smote_scaler, None))
print("‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)")

# --- Configuration 7: SMOTE + GridSearchCV (Pipeline) ---
pipeline_smote_grid = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', LGBMClassifier(random_state=RANDOM_STATE, verbose=-1))
])
configurations.append(('SMOTE + GridSearchCV', 'pipeline', pipeline_smote_grid, None))
print("‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)")

# =============================================================================
# UNDERSAMPLING CONFIGURATIONS
# =============================================================================
print("\n=== Adding Undersampling Configurations ===")

# --- Configuration 8: RandomUnderSampler ---
pipeline_rus = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', RandomUnderSampler(random_state=RANDOM_STATE)),
    ('model', LGBMClassifier(random_state=RANDOM_STATE, verbose=-1))
])
configurations.append(('RandomUnderSampler', 'pipeline', pipeline_rus, None))
print("‚úì Configuration 8: RandomUnderSampler (Undersampling)")

# --- Configuration 9: TomekLinks ---
pipeline_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', TomekLinks()),
    ('model', LGBMClassifier(random_state=RANDOM_STATE, verbose=-1))
])
configurations.append(('TomekLinks', 'pipeline', pipeline_tomek, None))
print("‚úì Configuration 9: TomekLinks (Undersampling - removes noisy samples)")

# --- Configuration 10: NearMiss ---
pipeline_nearmiss = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', NearMiss(version=1)),
    ('model', LGBMClassifier(random_state=RANDOM_STATE, verbose=-1))
])
configurations.append(('NearMiss', 'pipeline', pipeline_nearmiss, None))
print("‚úì Configuration 10: NearMiss (Undersampling - selective)")

# --- Configuration 11: SMOTE + Tomek (Combined) ---
pipeline_smote_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('sampler', SMOTETomek(random_state=RANDOM_STATE)),
    ('model', LGBMClassifier(random_state=RANDOM_STATE, verbose=-1))
])
configurations.append(('SMOTE + Tomek', 'pipeline', pipeline_smote_tomek, None))
print("‚úì Configuration 11: SMOTE + Tomek (Combined Over + Under)")

print(f"\nTotal configurations: {len(configurations)}")

# Safe ROC AUC helper
def safe_roc_auc(y_true, y_proba):
    try:
        if isinstance(y_proba, np.ndarray) and y_proba.shape[1] == 2:
            return metrics.roc_auc_score(y_true, y_proba[:, 1])
        else:
            return metrics.roc_auc_score(
                pd.get_dummies(y_true), y_proba, 
                multi_class='ovr', average='macro'
            )
    except Exception:
        return np.nan

# =============================================================================
# Run LightGBM with Configuration-Specific GridSearchCV
# =============================================================================
print("\n" + "="*80)
print("RUNNING LIGHTGBM WITH CONFIGURATION-SPECIFIC HYPERPARAMETER TUNING")
print("="*80)

for name, kind, X_tr_cfg, X_val_cfg in configurations:
    print(f"\n{'='*80}")
    print(f"Configuration: {name}")
    print(f"{'='*80}")
    
    # Get the specific parameter grid for this configuration
    param_grid = hyperparameter_grids[name]
    
    print(f"Hyperparameter grid for '{name}':")
    for key, values in param_grid.items():
        print(f"  {key}: {values}")
    print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}")
    
    try:
        if kind == 'pipeline':
            pipeline = X_tr_cfg
            grid_search = GridSearchCV(
                pipeline, param_grid, 
                cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_train)
            y_val_pred = best_model.predict(X_val)
            y_train_proba = best_model.predict_proba(X_train)
            y_val_proba = best_model.predict_proba(X_val)
        else:
            grid_search = GridSearchCV(
                LGBMClassifier(random_state=RANDOM_STATE, verbose=-1), 
                param_grid, cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_tr_cfg, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_tr_cfg)
            y_val_pred = best_model.predict(X_val_cfg)
            y_train_proba = best_model.predict_proba(X_tr_cfg)
            y_val_proba = best_model.predict_proba(X_val_cfg)

        # Calculate train-test gap for overfitting detection
        train_acc = metrics.accuracy_score(y_train, y_train_pred)
        test_acc = metrics.accuracy_score(y_val, y_val_pred)
        train_test_gap = train_acc - test_acc

        # Build metrics dictionary
        metrics_dict = {
            "Dataset": ["Training", "Test"],
            "Accuracy": [train_acc, test_acc],
            "F1 Score": [
                metrics.f1_score(y_train, y_train_pred, average='macro'),
                metrics.f1_score(y_val, y_val_pred, average='macro'),
            ],
            "Recall": [
                metrics.recall_score(y_train, y_train_pred, average='macro'),
                metrics.recall_score(y_val, y_val_pred, average='macro'),
            ],
            "Precision": [
                metrics.precision_score(y_train, y_train_pred, average='macro', zero_division=0),
                metrics.precision_score(y_val, y_val_pred, average='macro', zero_division=0),
            ],
            "AUC-ROC": [
                safe_roc_auc(y_train, y_train_proba),
                safe_roc_auc(y_val, y_val_proba),
            ]
        }

        df_metrics = pd.DataFrame(metrics_dict)
        pd.options.display.float_format = '{:.6f}'.format
        print("\nüìä LightGBM Model Performance Metrics")
        print(df_metrics.to_string(index=False))

        # Overfitting warning
        if train_test_gap > 0.10:
            print(f"\n‚ö†Ô∏è  WARNING: Overfitting detected! Train-Test gap: {train_test_gap:.4f}")
            print("   Consider: increasing min_child_samples, reg_alpha/reg_lambda, or reducing num_leaves")
        elif train_test_gap < 0.05:
            print(f"\n‚úì Excellent generalization. Train-Test gap: {train_test_gap:.4f}")
        else:
            print(f"\n‚Üí Acceptable gap: {train_test_gap:.4f}")

        # Store test metrics
        test_metrics = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
        try:
            store_results(
                'LightGBM',
                name,
                float(test_metrics['Accuracy']),
                float(test_metrics['F1 Score']),
                float(test_metrics['Recall']),
                float(test_metrics['Precision']),
                float(test_metrics['AUC-ROC'])
            )
        except:
            ML_Model.append('LightGBM')
            ML_Config.append(name)
            accuracy.append(round(float(test_metrics['Accuracy']), 6))
            f1.append(round(float(test_metrics['F1 Score']), 6))
            recall.append(round(float(test_metrics['Recall']), 6))
            precision.append(round(float(test_metrics['Precision']), 6))
            roc_auc.append(round(float(test_metrics['AUC-ROC']), 6))

        print("\nüéØ Best hyperparameters found:")
        best_params = grid_search.best_params_
        for param, value in best_params.items():
            print(f"  {param}: {value}")
        print(f"\nBest CV score: {grid_search.best_score_:.6f}")
        
        # Feature importance for non-pipeline configurations
        if kind == 'array' and hasattr(best_model, 'feature_importances_'):
            print("\nüí° Top 10 Most Important Features:")
            importances = best_model.feature_importances_
            indices = np.argsort(importances)[::-1][:10]
            feature_cols = X_tr_cfg.columns if hasattr(X_tr_cfg, 'columns') else [f'Feature {i}' for i in range(len(importances))]
            for i, idx in enumerate(indices, 1):
                print(f"  {i}. {feature_cols[idx]}: {importances[idx]:.4f}")
        
    except Exception as e:
        print(f"‚ùå Error in configuration '{name}': {str(e)}")
        import traceback
        traceback.print_exc()
        continue

print("\n" + "="*80)
print("‚úÖ LightGBM evaluation complete for all configurations.")
print("="*80)

# Display final results
try:
    display_and_save_results('lightgbm_all_configs')
except:
    result = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Recall': recall,
        'Precision': precision,
        'ROC_AUC': roc_auc,
    })
    print("\nüìà Final Results:")
    print(result.to_string(index=False))
    
    # Sort by F1 Score
    sorted_result = result.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)
    print("\nüèÜ Sorted Results (by F1 Score):")
    print(sorted_result.to_string(index=False))

    # Group by sampling technique
    print("\nüìä Performance by Sampling Technique:")
    print("-" * 80)
    sampling_groups = {
        'No Sampling': ['Preprocessed Data', 'Normalized Data', 'SelectKBest', 'RFECV', 'PCA'],
        'Oversampling': ['SMOTE + StandardScaler', 'SMOTE + GridSearchCV'],
        'Undersampling': ['RandomUnderSampler', 'TomekLinks', 'NearMiss'],
        'Combined': ['SMOTE + Tomek']
    }

    for group_name, configs in sampling_groups.items():
        group_data = result[result['Configuration'].isin(configs)]
        if not group_data.empty:
            print(f"\n{group_name}:")
            print(f"  Avg Accuracy: {group_data['Accuracy'].mean():.6f}")
            print(f"  Avg F1 Score: {group_data['F1 Score'].mean():.6f}")
            print(f"  Avg ROC-AUC: {group_data['ROC_AUC'].mean():.6f}")
            print(f"  Best Config: {group_data.loc[group_data['F1 Score'].idxmax(), 'Configuration']}")
    
    # Compare with baseline
    if len(result) > 0:
        best_idx = result['F1 Score'].idxmax()
        print(f"\nüèÖ Best Overall Configuration:")
        print(f"  Configuration: {result.loc[best_idx, 'Configuration']}")
        print(f"  F1 Score: {result.loc[best_idx, 'F1 Score']:.6f}")
        print(f"  Accuracy: {result.loc[best_idx, 'Accuracy']:.6f}")
        print(f"  ROC-AUC: {result.loc[best_idx, 'ROC_AUC']:.6f}")


---

# CatBoost

In [None]:
# =============================================================================
# CatBoost with Configuration-Specific Hyperparameter Grids
# =============================================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE, RFECV
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import metrics
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42

# Define preprocessor for categorical and numeric features
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")

# Create preprocessing transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Apply preprocessing to get fully numeric data first
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

# Get feature names after preprocessing
try:
    feature_names = (numeric_features + 
                    list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))
except:
    feature_names = [f'feature_{i}' for i in range(X_train_preprocessed.shape[1])]

X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=feature_names, index=X_train.index)
X_val_preprocessed = pd.DataFrame(X_val_preprocessed, columns=feature_names, index=X_val.index)

print(f"\nPreprocessed data shape: {X_train_preprocessed.shape}")
print(f"All features are now numeric: {X_train_preprocessed.select_dtypes(include=np.number).shape[1] == X_train_preprocessed.shape[1]}")

# Clear previous results
try:
    clear_results()
except:
    ML_Model = []
    ML_Config = []
    accuracy = []
    f1 = []
    recall = []
    precision = []
    roc_auc = []
    print("Initialized result storage lists.")

print("\n=== START: CatBoost Configuration Sweep with Custom Hyperparameters ===\n")

# =============================================================================
# Configuration-Specific Hyperparameter Grids for CatBoost
# =============================================================================

# Grid 1: Preprocessed Data - Balanced exploration
param_grid_1 = {
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [100, 200, 300],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5],
    'bagging_temperature': [0, 0.5, 1],
    'random_strength': [0, 1, 5],
    'border_count': [32, 128, 254],
    'subsample': [0.7, 0.8, 1.0]
}

# Grid 2: Normalized Data - Focus on tree complexity
param_grid_2 = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'iterations': [100, 200, 300, 500],
    'depth': [4, 6, 8, 10],
    'l2_leaf_reg': [1, 3, 5, 10],
    'bagging_temperature': [0, 0.5, 1],
    'random_strength': [0, 1, 5, 10],
    'border_count': [32, 128],
    'subsample': [0.6, 0.8, 1.0]
}

# Grid 3: SelectKBest - More conservative (prevent overfitting)
param_grid_3 = {
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [100, 200, 300],
    'depth': [4, 5, 6],
    'l2_leaf_reg': [3, 5, 10, 20],
    'bagging_temperature': [0, 0.5],
    'random_strength': [1, 5, 10],
    'border_count': [32, 64],
    'subsample': [0.7, 0.8]
}

# Grid 4: RFECV - Focus on feature importance
param_grid_4 = {
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [100, 200, 300, 400],
    'depth': [4, 6, 8, 10],
    'l2_leaf_reg': [1, 3, 5],
    'bagging_temperature': [0, 0.5, 1],
    'random_strength': [0, 1, 5],
    'border_count': [32, 128],
    'subsample': [0.7, 0.8, 1.0]
}

# Grid 5: PCA - Optimized for reduced dimensions
param_grid_5 = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'iterations': [100, 200, 300, 500],
    'depth': [4, 6, 8, 10],
    'l2_leaf_reg': [1, 3, 5],
    'bagging_temperature': [0.5, 1],
    'random_strength': [0, 1],
    'border_count': [128, 254],
    'subsample': [0.8, 1.0]
}

# Grid 6 & 7: SMOTE pipelines - Handle imbalanced data (Oversampling)
param_grid_smote = {
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__iterations': [100, 200, 300],
    'model__depth': [4, 6, 8],
    'model__l2_leaf_reg': [1, 3, 5],
    'model__bagging_temperature': [0, 0.5, 1],
    'model__random_strength': [0, 1, 5],
    'model__border_count': [32, 128],
    'model__subsample': [0.7, 0.8, 1.0]
}

# Grid 8-11: Undersampling and Combined pipelines
param_grid_sampling = {
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__iterations': [100, 200, 300],
    'model__depth': [4, 6, 8],
    'model__l2_leaf_reg': [1, 3, 5],
    'model__bagging_temperature': [0, 0.5, 1],
    'model__random_strength': [0, 1, 5],
    'model__border_count': [32, 128],
    'model__subsample': [0.7, 0.8, 1.0]
}

# Map grids to configurations
hyperparameter_grids = {
    'Preprocessed Data': param_grid_1,
    'Normalized Data': param_grid_2,
    'SelectKBest': param_grid_3,
    'RFECV': param_grid_4,
    'PCA': param_grid_5,
    'SMOTE + StandardScaler': param_grid_smote,
    'SMOTE + GridSearchCV': param_grid_smote,
    'RandomUnderSampler': param_grid_sampling,
    'TomekLinks': param_grid_sampling,
    'NearMiss': param_grid_sampling,
    'SMOTE + Tomek': param_grid_sampling,
}

# Initialize configurations list
configurations = []

# --- Configuration 1: Preprocessed Data ---
configurations.append(('Preprocessed Data', 'array', X_train_preprocessed, X_val_preprocessed))
print("‚úì Configuration 1: Preprocessed Data")

# --- Configuration 2: Normalized Data (MinMax on preprocessed) ---
scaler_minmax = MinMaxScaler()
X_train_normalized = pd.DataFrame(
    scaler_minmax.fit_transform(X_train_preprocessed), 
    columns=X_train_preprocessed.columns, 
    index=X_train_preprocessed.index
)
X_val_normalized = pd.DataFrame(
    scaler_minmax.transform(X_val_preprocessed), 
    columns=X_val_preprocessed.columns, 
    index=X_val_preprocessed.index
)
configurations.append(('Normalized Data', 'array', X_train_normalized, X_val_normalized))
print("‚úì Configuration 2: Normalized Data (MinMax)")

# --- Configuration 3: SelectKBest ---
print("\n=== SelectKBest Feature Selection ===")
scores = []
max_features = min(X_train_normalized.shape[1], 20)

for k in range(1, max_features + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_tr_k = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        CatBoostClassifier(iterations=50, learning_rate=0.1, random_state=RANDOM_STATE, verbose=0), 
        X_tr_k, y_train, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()
    scores.append(score)

optimal_k = int(np.argmax(scores) + 1)
print(f"Optimal number of features: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = pd.DataFrame(
    kbest.fit_transform(X_train_normalized, y_train), 
    columns=X_train_normalized.columns[kbest.get_support()]
)
X_val_kbest = pd.DataFrame(
    kbest.transform(X_val_normalized), 
    columns=X_train_kbest.columns
)
configurations.append(('SelectKBest', 'array', X_train_kbest, X_val_kbest))
print("‚úì Configuration 3: SelectKBest")

# --- Configuration 4: RFECV ---
print("\n=== RFECV Feature Selection ===")
cb_estimator = CatBoostClassifier(iterations=50, learning_rate=0.1, random_state=RANDOM_STATE, verbose=0)
rfecv = RFECV(
    estimator=cb_estimator, 
    step=1, 
    cv=StratifiedKFold(5), 
    scoring='accuracy', 
    n_jobs=-1
)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=cb_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = pd.DataFrame(
    rfe.fit_transform(X_train_kbest, y_train), 
    columns=X_train_kbest.columns[rfe.get_support()]
)
X_val_rfe = pd.DataFrame(
    rfe.transform(X_val_kbest), 
    columns=X_train_rfe.columns
)
configurations.append(('RFECV', 'array', X_train_rfe, X_val_rfe))
print("‚úì Configuration 4: RFECV")

# --- Configuration 5: PCA ---
print("\n=== PCA Dimensionality Reduction ===")
pca_full = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
desired_variance = 0.90
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
n_components = max(2, n_components)
print(f'Number of components for {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_rfe), index=X_train_rfe.index)
X_val_pca = pd.DataFrame(pca.transform(X_val_rfe), index=X_val_rfe.index)
configurations.append(('PCA', 'array', X_train_pca, X_val_pca))
print("‚úì Configuration 5: PCA")

# --- Configuration 6: SMOTE + StandardScaler (Pipeline) ---
pipeline_smote_scaler = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', CatBoostClassifier(random_state=RANDOM_STATE, verbose=0))
])
configurations.append(('SMOTE + StandardScaler', 'pipeline', pipeline_smote_scaler, None))
print("‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)")

# --- Configuration 7: SMOTE + GridSearchCV (Pipeline) ---
pipeline_smote_grid = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', CatBoostClassifier(random_state=RANDOM_STATE, verbose=0))
])
configurations.append(('SMOTE + GridSearchCV', 'pipeline', pipeline_smote_grid, None))
print("‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)")

# =============================================================================
# UNDERSAMPLING CONFIGURATIONS
# =============================================================================
print("\n=== Adding Undersampling Configurations ===")

# --- Configuration 8: RandomUnderSampler ---
pipeline_rus = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', RandomUnderSampler(random_state=RANDOM_STATE)),
    ('model', CatBoostClassifier(random_state=RANDOM_STATE, verbose=0))
])
configurations.append(('RandomUnderSampler', 'pipeline', pipeline_rus, None))
print("‚úì Configuration 8: RandomUnderSampler (Undersampling)")

# --- Configuration 9: TomekLinks ---
pipeline_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', TomekLinks()),
    ('model', CatBoostClassifier(random_state=RANDOM_STATE, verbose=0))
])
configurations.append(('TomekLinks', 'pipeline', pipeline_tomek, None))
print("‚úì Configuration 9: TomekLinks (Undersampling - removes noisy samples)")

# --- Configuration 10: NearMiss ---
pipeline_nearmiss = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', NearMiss(version=1)),
    ('model', CatBoostClassifier(random_state=RANDOM_STATE, verbose=0))
])
configurations.append(('NearMiss', 'pipeline', pipeline_nearmiss, None))
print("‚úì Configuration 10: NearMiss (Undersampling - selective)")

# --- Configuration 11: SMOTE + Tomek (Combined) ---
pipeline_smote_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('sampler', SMOTETomek(random_state=RANDOM_STATE)),
    ('model', CatBoostClassifier(random_state=RANDOM_STATE, verbose=0))
])
configurations.append(('SMOTE + Tomek', 'pipeline', pipeline_smote_tomek, None))
print("‚úì Configuration 11: SMOTE + Tomek (Combined Over + Under)")

print(f"\nTotal configurations: {len(configurations)}")

# Safe ROC AUC helper
def safe_roc_auc(y_true, y_proba):
    try:
        if isinstance(y_proba, np.ndarray) and y_proba.shape[1] == 2:
            return metrics.roc_auc_score(y_true, y_proba[:, 1])
        else:
            return metrics.roc_auc_score(
                pd.get_dummies(y_true), y_proba, 
                multi_class='ovr', average='macro'
            )
    except Exception:
        return np.nan

# =============================================================================
# Run CatBoost with Configuration-Specific GridSearchCV
# =============================================================================
print("\n" + "="*80)
print("RUNNING CATBOOST WITH CONFIGURATION-SPECIFIC HYPERPARAMETER TUNING")
print("="*80)

for name, kind, X_tr_cfg, X_val_cfg in configurations:
    print(f"\n{'='*80}")
    print(f"Configuration: {name}")
    print(f"{'='*80}")
    
    # Get the specific parameter grid for this configuration
    param_grid = hyperparameter_grids[name]
    
    print(f"Hyperparameter grid for '{name}':")
    for key, values in param_grid.items():
        print(f"  {key}: {values}")
    print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}")
    
    try:
        if kind == 'pipeline':
            pipeline = X_tr_cfg
            grid_search = GridSearchCV(
                pipeline, param_grid, 
                cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_train)
            y_val_pred = best_model.predict(X_val)
            y_train_proba = best_model.predict_proba(X_train)
            y_val_proba = best_model.predict_proba(X_val)
        else:
            grid_search = GridSearchCV(
                CatBoostClassifier(random_state=RANDOM_STATE, verbose=0), 
                param_grid, cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_tr_cfg, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_tr_cfg)
            y_val_pred = best_model.predict(X_val_cfg)
            y_train_proba = best_model.predict_proba(X_tr_cfg)
            y_val_proba = best_model.predict_proba(X_val_cfg)

        # Calculate train-test gap for overfitting detection
        train_acc = metrics.accuracy_score(y_train, y_train_pred)
        test_acc = metrics.accuracy_score(y_val, y_val_pred)
        train_test_gap = train_acc - test_acc

        # Build metrics dictionary
        metrics_dict = {
            "Dataset": ["Training", "Test"],
            "Accuracy": [train_acc, test_acc],
            "F1 Score": [
                metrics.f1_score(y_train, y_train_pred, average='macro'),
                metrics.f1_score(y_val, y_val_pred, average='macro'),
            ],
            "Recall": [
                metrics.recall_score(y_train, y_train_pred, average='macro'),
                metrics.recall_score(y_val, y_val_pred, average='macro'),
            ],
            "Precision": [
                metrics.precision_score(y_train, y_train_pred, average='macro', zero_division=0),
                metrics.precision_score(y_val, y_val_pred, average='macro', zero_division=0),
            ],
            "AUC-ROC": [
                safe_roc_auc(y_train, y_train_proba),
                safe_roc_auc(y_val, y_val_proba),
            ]
        }

        df_metrics = pd.DataFrame(metrics_dict)
        pd.options.display.float_format = '{:.6f}'.format
        print("\nüìä CatBoost Model Performance Metrics")
        print(df_metrics.to_string(index=False))

        # Overfitting warning
        if train_test_gap > 0.10:
            print(f"\n‚ö†Ô∏è  WARNING: Overfitting detected! Train-Test gap: {train_test_gap:.4f}")
            print("   Consider: increasing l2_leaf_reg, reducing depth, or adjusting bagging_temperature")
        elif train_test_gap < 0.05:
            print(f"\n‚úì Excellent generalization. Train-Test gap: {train_test_gap:.4f}")
        else:
            print(f"\n‚Üí Acceptable gap: {train_test_gap:.4f}")

        # Store test metrics
        test_metrics = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
        try:
            store_results(
                'CatBoost',
                name,
                float(test_metrics['Accuracy']),
                float(test_metrics['F1 Score']),
                float(test_metrics['Recall']),
                float(test_metrics['Precision']),
                float(test_metrics['AUC-ROC'])
            )
        except:
            ML_Model.append('CatBoost')
            ML_Config.append(name)
            accuracy.append(round(float(test_metrics['Accuracy']), 6))
            f1.append(round(float(test_metrics['F1 Score']), 6))
            recall.append(round(float(test_metrics['Recall']), 6))
            precision.append(round(float(test_metrics['Precision']), 6))
            roc_auc.append(round(float(test_metrics['AUC-ROC']), 6))

        print("\nüéØ Best hyperparameters found:")
        best_params = grid_search.best_params_
        for param, value in best_params.items():
            print(f"  {param}: {value}")
        print(f"\nBest CV score: {grid_search.best_score_:.6f}")
        
        # Feature importance for non-pipeline configurations
        if kind == 'array' and hasattr(best_model, 'feature_importances_'):
            print("\nüèÜ Top 10 Most Important Features:")
            importances = best_model.feature_importances_
            indices = np.argsort(importances)[::-1][:10]
            feature_cols = X_tr_cfg.columns if hasattr(X_tr_cfg, 'columns') else [f'Feature {i}' for i in range(len(importances))]
            for i, idx in enumerate(indices, 1):
                print(f"  {i}. {feature_cols[idx]}: {importances[idx]:.4f}")
        
    except Exception as e:
        print(f"‚ùå Error in configuration '{name}': {str(e)}")
        import traceback
        traceback.print_exc()
        continue

print("\n" + "="*80)
print("‚úÖ CatBoost evaluation complete for all configurations.")
print("="*80)

# Display final results
try:
    display_and_save_results('catboost_all_configs')
except:
    result = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Recall': recall,
        'Precision': precision,
        'ROC_AUC': roc_auc,
    })
    print("\nüìà Final Results:")
    print(result.to_string(index=False))
    
    # Sort by F1 Score
    sorted_result = result.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)
    print("\nüèÜ Sorted Results (by F1 Score):")
    print(sorted_result.to_string(index=False))

    # Group by sampling technique
    print("\nüìä Performance by Sampling Technique:")
    print("-" * 80)
    sampling_groups = {
        'No Sampling': ['Preprocessed Data', 'Normalized Data', 'SelectKBest', 'RFECV', 'PCA'],
        'Oversampling': ['SMOTE + StandardScaler', 'SMOTE + GridSearchCV'],
        'Undersampling': ['RandomUnderSampler', 'TomekLinks', 'NearMiss'],
        'Combined': ['SMOTE + Tomek']
    }

    for group_name, configs in sampling_groups.items():
        group_data = result[result['Configuration'].isin(configs)]
        if not group_data.empty:
            print(f"\n{group_name}:")
            print(f"  Avg Accuracy: {group_data['Accuracy'].mean():.6f}")
            print(f"  Avg F1 Score: {group_data['F1 Score'].mean():.6f}")
            print(f"  Avg ROC-AUC: {group_data['ROC_AUC'].mean():.6f}")
            print(f"  Best Config: {group_data.loc[group_data['F1 Score'].idxmax(), 'Configuration']}")
    
    # Compare with baseline
    if len(result) > 0:
        best_idx = result['F1 Score'].idxmax()
        print(f"\nüèÖ Best Overall Configuration:")
        print(f"  Configuration: {result.loc[best_idx, 'Configuration']}")
        print(f"  F1 Score: {result.loc[best_idx, 'F1 Score']:.6f}")
        print(f"  Accuracy: {result.loc[best_idx, 'Accuracy']:.6f}")
        print(f"  ROC-AUC: {result.loc[best_idx, 'ROC_AUC']:.6f}")


---

# Adaboost

### Adaboost with PCA 90

In [None]:
# =============================================================================
# AdaBoost with Configuration-Specific Hyperparameter Grids
# =============================================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE, RFECV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import metrics
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42

# Define preprocessor for categorical and numeric features
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")

# Create preprocessing transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Apply preprocessing to get fully numeric data first
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

# Get feature names after preprocessing
try:
    feature_names = (numeric_features + 
                    list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))
except:
    feature_names = [f'feature_{i}' for i in range(X_train_preprocessed.shape[1])]

X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=feature_names, index=X_train.index)
X_val_preprocessed = pd.DataFrame(X_val_preprocessed, columns=feature_names, index=X_val.index)

print(f"\nPreprocessed data shape: {X_train_preprocessed.shape}")
print(f"All features are now numeric: {X_train_preprocessed.select_dtypes(include=np.number).shape[1] == X_train_preprocessed.shape[1]}")

# Clear previous results
try:
    clear_results()
except:
    ML_Model = []
    ML_Config = []
    accuracy = []
    f1 = []
    recall = []
    precision = []
    roc_auc = []
    print("Initialized result storage lists.")

print("\n=== START: AdaBoost Configuration Sweep with Custom Hyperparameters ===\n")

# =============================================================================
# Configuration-Specific Hyperparameter Grids for AdaBoost
# =============================================================================

# Grid 1: Preprocessed Data - Balanced exploration
param_grid_1 = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
    'algorithm': ['SAMME', 'SAMME.R'],
    'estimator__max_depth': [1, 2, 3],
    'estimator__min_samples_split': [2, 5],
    'estimator__min_samples_leaf': [1, 2]
}

# Grid 2: Normalized Data - Focus on more estimators
param_grid_2 = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.5],
    'algorithm': ['SAMME.R'],
    'estimator__max_depth': [1, 2, 3, 4],
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4]
}

# Grid 3: SelectKBest - More conservative (prevent overfitting)
param_grid_3 = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'algorithm': ['SAMME.R'],
    'estimator__max_depth': [1, 2],
    'estimator__min_samples_split': [5, 10, 20],
    'estimator__min_samples_leaf': [2, 4, 6]
}

# Grid 4: RFECV - Explore algorithm variations
param_grid_4 = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
    'algorithm': ['SAMME', 'SAMME.R'],
    'estimator__max_depth': [1, 2, 3],
    'estimator__min_samples_split': [2, 5],
    'estimator__min_samples_leaf': [1, 2, 4]
}

# Grid 5: PCA - Simpler weak learners
param_grid_5 = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
    'algorithm': ['SAMME.R'],
    'estimator__max_depth': [1, 2, 3],
    'estimator__min_samples_split': [2, 5],
    'estimator__min_samples_leaf': [1, 2, 3]
}

# Grid 6 & 7: SMOTE pipelines - Handle imbalanced data (Oversampling)
param_grid_smote = {
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.1, 0.5, 1.0],
    'model__algorithm': ['SAMME.R'],
    'model__estimator__max_depth': [1, 2, 3],
    'model__estimator__min_samples_split': [2, 5],
    'model__estimator__min_samples_leaf': [1, 2]
}

# Grid 8-11: Undersampling and Combined pipelines
param_grid_sampling = {
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.1, 0.5, 1.0],
    'model__algorithm': ['SAMME.R'],
    'model__estimator__max_depth': [1, 2, 3],
    'model__estimator__min_samples_split': [2, 5],
    'model__estimator__min_samples_leaf': [1, 2]
}

# Map grids to configurations
hyperparameter_grids = {
    'Preprocessed Data': param_grid_1,
    'Normalized Data': param_grid_2,
    'SelectKBest': param_grid_3,
    'RFECV': param_grid_4,
    'PCA': param_grid_5,
    'SMOTE + StandardScaler': param_grid_smote,
    'SMOTE + GridSearchCV': param_grid_smote,
    'RandomUnderSampler': param_grid_sampling,
    'TomekLinks': param_grid_sampling,
    'NearMiss': param_grid_sampling,
    'SMOTE + Tomek': param_grid_sampling,
}

# Initialize configurations list
configurations = []

# --- Configuration 1: Preprocessed Data ---
configurations.append(('Preprocessed Data', 'array', X_train_preprocessed, X_val_preprocessed))
print("‚úì Configuration 1: Preprocessed Data")

# --- Configuration 2: Normalized Data (MinMax on preprocessed) ---
scaler_minmax = MinMaxScaler()
X_train_normalized = pd.DataFrame(
    scaler_minmax.fit_transform(X_train_preprocessed), 
    columns=X_train_preprocessed.columns, 
    index=X_train_preprocessed.index
)
X_val_normalized = pd.DataFrame(
    scaler_minmax.transform(X_val_preprocessed), 
    columns=X_val_preprocessed.columns, 
    index=X_val_preprocessed.index
)
configurations.append(('Normalized Data', 'array', X_train_normalized, X_val_normalized))
print("‚úì Configuration 2: Normalized Data (MinMax)")

# --- Configuration 3: SelectKBest ---
print("\n=== SelectKBest Feature Selection ===")
scores = []
max_features = min(X_train_normalized.shape[1], 20)

for k in range(1, max_features + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_tr_k = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1), n_estimators=50, random_state=RANDOM_STATE), 
        X_tr_k, y_train, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()
    scores.append(score)

optimal_k = int(np.argmax(scores) + 1)
print(f"Optimal number of features: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = pd.DataFrame(
    kbest.fit_transform(X_train_normalized, y_train), 
    columns=X_train_normalized.columns[kbest.get_support()]
)
X_val_kbest = pd.DataFrame(
    kbest.transform(X_val_normalized), 
    columns=X_train_kbest.columns
)
configurations.append(('SelectKBest', 'array', X_train_kbest, X_val_kbest))
print("‚úì Configuration 3: SelectKBest")

# --- Configuration 4: RFECV ---
print("\n=== RFECV Feature Selection ===")
ada_estimator = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1), n_estimators=50, random_state=RANDOM_STATE)
rfecv = RFECV(
    estimator=ada_estimator, 
    step=1, 
    cv=StratifiedKFold(5), 
    scoring='accuracy', 
    n_jobs=-1
)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=ada_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = pd.DataFrame(
    rfe.fit_transform(X_train_kbest, y_train), 
    columns=X_train_kbest.columns[rfe.get_support()]
)
X_val_rfe = pd.DataFrame(
    rfe.transform(X_val_kbest), 
    columns=X_train_rfe.columns
)
configurations.append(('RFECV', 'array', X_train_rfe, X_val_rfe))
print("‚úì Configuration 4: RFECV")

# --- Configuration 5: PCA ---
print("\n=== PCA Dimensionality Reduction ===")
pca_full = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
desired_variance = 0.90
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
n_components = max(2, n_components)
print(f'Number of components for {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_rfe), index=X_train_rfe.index)
X_val_pca = pd.DataFrame(pca.transform(X_val_rfe), index=X_val_rfe.index)
configurations.append(('PCA', 'array', X_train_pca, X_val_pca))
print("‚úì Configuration 5: PCA")

# --- Configuration 6: SMOTE + StandardScaler (Pipeline) ---
pipeline_smote_scaler = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=RANDOM_STATE), random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + StandardScaler', 'pipeline', pipeline_smote_scaler, None))
print("‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)")

# --- Configuration 7: SMOTE + GridSearchCV (Pipeline) ---
pipeline_smote_grid = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=RANDOM_STATE), random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + GridSearchCV', 'pipeline', pipeline_smote_grid, None))
print("‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)")

# =============================================================================
# UNDERSAMPLING CONFIGURATIONS
# =============================================================================
print("\n=== Adding Undersampling Configurations ===")

# --- Configuration 8: RandomUnderSampler ---
pipeline_rus = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', RandomUnderSampler(random_state=RANDOM_STATE)),
    ('model', AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=RANDOM_STATE), random_state=RANDOM_STATE))
])
configurations.append(('RandomUnderSampler', 'pipeline', pipeline_rus, None))
print("‚úì Configuration 8: RandomUnderSampler (Undersampling)")

# --- Configuration 9: TomekLinks ---
pipeline_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', TomekLinks()),
    ('model', AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=RANDOM_STATE), random_state=RANDOM_STATE))
])
configurations.append(('TomekLinks', 'pipeline', pipeline_tomek, None))
print("‚úì Configuration 9: TomekLinks (Undersampling - removes noisy samples)")

# --- Configuration 10: NearMiss ---
pipeline_nearmiss = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', NearMiss(version=1)),
    ('model', AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=RANDOM_STATE), random_state=RANDOM_STATE))
])
configurations.append(('NearMiss', 'pipeline', pipeline_nearmiss, None))
print("‚úì Configuration 10: NearMiss (Undersampling - selective)")

# --- Configuration 11: SMOTE + Tomek (Combined) ---
pipeline_smote_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('sampler', SMOTETomek(random_state=RANDOM_STATE)),
    ('model', AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=RANDOM_STATE), random_state=RANDOM_STATE))
])
configurations.append(('SMOTE + Tomek', 'pipeline', pipeline_smote_tomek, None))
print("‚úì Configuration 11: SMOTE + Tomek (Combined Over + Under)")

print(f"\nTotal configurations: {len(configurations)}")

# Safe ROC AUC helper
def safe_roc_auc(y_true, y_proba):
    try:
        if isinstance(y_proba, np.ndarray) and y_proba.shape[1] == 2:
            return metrics.roc_auc_score(y_true, y_proba[:, 1])
        else:
            return metrics.roc_auc_score(
                pd.get_dummies(y_true), y_proba, 
                multi_class='ovr', average='macro'
            )
    except Exception:
        return np.nan

# =============================================================================
# Run AdaBoost with Configuration-Specific GridSearchCV
# =============================================================================
print("\n" + "="*80)
print("RUNNING ADABOOST WITH CONFIGURATION-SPECIFIC HYPERPARAMETER TUNING")
print("="*80)

for name, kind, X_tr_cfg, X_val_cfg in configurations:
    print(f"\n{'='*80}")
    print(f"Configuration: {name}")
    print(f"{'='*80}")
    
    # Get the specific parameter grid for this configuration
    param_grid = hyperparameter_grids[name]
    
    print(f"Hyperparameter grid for '{name}':")
    for key, values in param_grid.items():
        print(f"  {key}: {values}")
    print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}")
    
    try:
        if kind == 'pipeline':
            pipeline = X_tr_cfg
            grid_search = GridSearchCV(
                pipeline, param_grid, 
                cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_train)
            y_val_pred = best_model.predict(X_val)
            y_train_proba = best_model.predict_proba(X_train)
            y_val_proba = best_model.predict_proba(X_val)
        else:
            grid_search = GridSearchCV(
                AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=RANDOM_STATE), random_state=RANDOM_STATE), 
                param_grid, cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_tr_cfg, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_tr_cfg)
            y_val_pred = best_model.predict(X_val_cfg)
            y_train_proba = best_model.predict_proba(X_tr_cfg)
            y_val_proba = best_model.predict_proba(X_val_cfg)

        # Calculate train-test gap for overfitting detection
        train_acc = metrics.accuracy_score(y_train, y_train_pred)
        test_acc = metrics.accuracy_score(y_val, y_val_pred)
        train_test_gap = train_acc - test_acc

        # Build metrics dictionary
        metrics_dict = {
            "Dataset": ["Training", "Test"],
            "Accuracy": [train_acc, test_acc],
            "F1 Score": [
                metrics.f1_score(y_train, y_train_pred, average='macro'),
                metrics.f1_score(y_val, y_val_pred, average='macro'),
            ],
            "Recall": [
                metrics.recall_score(y_train, y_train_pred, average='macro'),
                metrics.recall_score(y_val, y_val_pred, average='macro'),
            ],
            "Precision": [
                metrics.precision_score(y_train, y_train_pred, average='macro', zero_division=0),
                metrics.precision_score(y_val, y_val_pred, average='macro', zero_division=0),
            ],
            "AUC-ROC": [
                safe_roc_auc(y_train, y_train_proba),
                safe_roc_auc(y_val, y_val_proba),
            ]
        }

        df_metrics = pd.DataFrame(metrics_dict)
        pd.options.display.float_format = '{:.6f}'.format
        print("\nüìä AdaBoost Model Performance Metrics")
        print(df_metrics.to_string(index=False))

        # Overfitting warning
        if train_test_gap > 0.10:
            print(f"\n‚ö†Ô∏è  WARNING: Overfitting detected! Train-Test gap: {train_test_gap:.4f}")
            print("   Consider: reducing n_estimators, lowering learning_rate, or increasing min_samples_split")
        elif train_test_gap < 0.05:
            print(f"\n‚úì Excellent generalization. Train-Test gap: {train_test_gap:.4f}")
        else:
            print(f"\n‚Üí Acceptable gap: {train_test_gap:.4f}")

        # Store test metrics
        test_metrics = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
        try:
            store_results(
                'AdaBoost',
                name,
                float(test_metrics['Accuracy']),
                float(test_metrics['F1 Score']),
                float(test_metrics['Recall']),
                float(test_metrics['Precision']),
                float(test_metrics['AUC-ROC'])
            )
        except:
            ML_Model.append('AdaBoost')
            ML_Config.append(name)
            accuracy.append(round(float(test_metrics['Accuracy']), 6))
            f1.append(round(float(test_metrics['F1 Score']), 6))
            recall.append(round(float(test_metrics['Recall']), 6))
            precision.append(round(float(test_metrics['Precision']), 6))
            roc_auc.append(round(float(test_metrics['AUC-ROC']), 6))

        print("\nüéØ Best hyperparameters found:")
        best_params = grid_search.best_params_
        for param, value in best_params.items():
            print(f"  {param}: {value}")
        print(f"\nBest CV score: {grid_search.best_score_:.6f}")
        
        # Display number of estimators used
        print(f"\nüìà Model Complexity:")
        print(f"  Number of estimators: {best_model.n_estimators}")
        if hasattr(best_model, 'estimators_'):
            print(f"  Actual estimators used: {len(best_model.estimators_)}")
        
    except Exception as e:
        print(f"‚ùå Error in configuration '{name}': {str(e)}")
        import traceback
        traceback.print_exc()
        continue

print("\n" + "="*80)
print("‚úÖ AdaBoost evaluation complete for all configurations.")
print("="*80)

# Display final results
try:
    display_and_save_results('adaboost_all_configs')
except:
    result = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Recall': recall,
        'Precision': precision,
        'ROC_AUC': roc_auc,
    })
    print("\nüìà Final Results:")
    print(result.to_string(index=False))
    
    # Sort by F1 Score
    sorted_result = result.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)
    print("\nüèÜ Sorted Results (by F1 Score):")
    print(sorted_result.to_string(index=False))

    # Group by sampling technique
    print("\nüìä Performance by Sampling Technique:")
    print("-" * 80)
    sampling_groups = {
        'No Sampling': ['Preprocessed Data', 'Normalized Data', 'SelectKBest', 'RFECV', 'PCA'],
        'Oversampling': ['SMOTE + StandardScaler', 'SMOTE + GridSearchCV'],
        'Undersampling': ['RandomUnderSampler', 'TomekLinks', 'NearMiss'],
        'Combined': ['SMOTE + Tomek']
    }

    for group_name, configs in sampling_groups.items():
        group_data = result[result['Configuration'].isin(configs)]
        if not group_data.empty:
            print(f"\n{group_name}:")
            print(f"  Avg Accuracy: {group_data['Accuracy'].mean():.6f}")
            print(f"  Avg F1 Score: {group_data['F1 Score'].mean():.6f}")
            print(f"  Avg ROC-AUC: {group_data['ROC_AUC'].mean():.6f}")
            print(f"  Best Config: {group_data.loc[group_data['F1 Score'].idxmax(), 'Configuration']}")
    
    # Compare with baseline
    if len(result) > 0:
        best_idx = result['F1 Score'].idxmax()
        print(f"\nüèÖ Best Overall Configuration:")
        print(f"  Configuration: {result.loc[best_idx, 'Configuration']}")
        print(f"  F1 Score: {result.loc[best_idx, 'F1 Score']:.6f}")
        print(f"  Accuracy: {result.loc[best_idx, 'Accuracy']:.6f}")
        print(f"  ROC-AUC: {result.loc[best_idx, 'ROC_AUC']:.6f}")



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 25

=== RFECV Feature Selection with AdaBoost ===
Optimal number of features selected by RFECV: 11

=== PCA Dimensionality Reduction ===
Number of components that explain 90.0% variance: 10

=== AdaBoost Model Performance with Hyperparameter Tuning ===

Running AdaBoost with Original Data configuration...
Fitting 10 folds for each of 72 candidates, totalling 720 fits

AdaBoost Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.966825  0.860668 0.805082   0.942229 0.992096
Best hyperparameters found by GridSearchCV:
{'algorithm': 'SAMME', 'estimator': DecisionTreeClassifier(max_depth=3), 'learning_rate': 1, 'n_estimators': 200}

Running AdaBoost with Normalized Data configuration...
Fitting 10 folds for each of 72 candidates, totalling 720 fits

AdaBoost Model Performance Metrics
 Dataset  Ac

---

# XGBoost

### XGBoost with PCA 90

In [None]:
# =============================================================================
# XGBoost with Configuration-Specific Hyperparameter Grids
# =============================================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE, RFECV
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import metrics
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42

# Define preprocessor for categorical and numeric features
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")

# Create preprocessing transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Apply preprocessing to get fully numeric data first
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

# Get feature names after preprocessing
try:
    feature_names = (numeric_features + 
                    list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))
except:
    feature_names = [f'feature_{i}' for i in range(X_train_preprocessed.shape[1])]

X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=feature_names, index=X_train.index)
X_val_preprocessed = pd.DataFrame(X_val_preprocessed, columns=feature_names, index=X_val.index)

print(f"\nPreprocessed data shape: {X_train_preprocessed.shape}")
print(f"All features are now numeric: {X_train_preprocessed.select_dtypes(include=np.number).shape[1] == X_train_preprocessed.shape[1]}")

# Clear previous results
try:
    clear_results()
except:
    ML_Model = []
    ML_Config = []
    accuracy = []
    f1 = []
    recall = []
    precision = []
    roc_auc = []
    print("Initialized result storage lists.")

print("\n=== START: XGBoost Configuration Sweep with Custom Hyperparameters ===\n")

# =============================================================================
# Configuration-Specific Hyperparameter Grids for XGBoost
# =============================================================================

# Grid 1: Preprocessed Data - Balanced exploration
param_grid_1 = {
    'learning_rate': [0.01, 0.05, 0.1, 0.3],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.5],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [1, 1.5]
}

# Grid 2: Normalized Data - Focus on tree complexity
param_grid_2 = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

# Grid 3: SelectKBest - More conservative (prevent overfitting)
param_grid_3 = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_child_weight': [3, 5, 7],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8],
    'gamma': [0.1, 0.5, 1.0],
    'reg_alpha': [0.1, 0.5, 1.0],
    'reg_lambda': [1, 2, 3]
}

# Grid 4: RFECV - Focus on feature importance
param_grid_4 = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.5],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5]
}

# Grid 5: PCA - Optimized for reduced dimensions
param_grid_5 = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [1, 1.5]
}

# Grid 6 & 7: SMOTE pipelines - Handle imbalanced data (Oversampling)
param_grid_smote = {
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [3, 5, 7],
    'model__min_child_weight': [1, 3, 5],
    'model__subsample': [0.7, 0.8, 1.0],
    'model__colsample_bytree': [0.7, 0.8],
    'model__gamma': [0, 0.1, 0.5],
    'model__reg_alpha': [0, 0.1],
    'model__reg_lambda': [1, 1.5]
}

# Grid 8-11: Undersampling and Combined pipelines
param_grid_sampling = {
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [3, 5, 7],
    'model__min_child_weight': [1, 3, 5],
    'model__subsample': [0.7, 0.8, 1.0],
    'model__colsample_bytree': [0.7, 0.8],
    'model__gamma': [0, 0.1, 0.5],
    'model__reg_alpha': [0, 0.1],
    'model__reg_lambda': [1, 1.5]
}

# Map grids to configurations
hyperparameter_grids = {
    'Preprocessed Data': param_grid_1,
    'Normalized Data': param_grid_2,
    'SelectKBest': param_grid_3,
    'RFECV': param_grid_4,
    'PCA': param_grid_5,
    'SMOTE + StandardScaler': param_grid_smote,
    'SMOTE + GridSearchCV': param_grid_smote,
    'RandomUnderSampler': param_grid_sampling,
    'TomekLinks': param_grid_sampling,
    'NearMiss': param_grid_sampling,
    'SMOTE + Tomek': param_grid_sampling,
}

# Initialize configurations list
configurations = []

# --- Configuration 1: Preprocessed Data ---
configurations.append(('Preprocessed Data', 'array', X_train_preprocessed, X_val_preprocessed))
print("‚úì Configuration 1: Preprocessed Data")

# --- Configuration 2: Normalized Data (MinMax on preprocessed) ---
scaler_minmax = MinMaxScaler()
X_train_normalized = pd.DataFrame(
    scaler_minmax.fit_transform(X_train_preprocessed), 
    columns=X_train_preprocessed.columns, 
    index=X_train_preprocessed.index
)
X_val_normalized = pd.DataFrame(
    scaler_minmax.transform(X_val_preprocessed), 
    columns=X_val_preprocessed.columns, 
    index=X_val_preprocessed.index
)
configurations.append(('Normalized Data', 'array', X_train_normalized, X_val_normalized))
print("‚úì Configuration 2: Normalized Data (MinMax)")

# --- Configuration 3: SelectKBest ---
print("\n=== SelectKBest Feature Selection ===")
scores = []
max_features = min(X_train_normalized.shape[1], 20)

for k in range(1, max_features + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_tr_k = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        XGBClassifier(n_estimators=50, learning_rate=0.1, random_state=RANDOM_STATE, eval_metric='logloss'), 
        X_tr_k, y_train, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()
    scores.append(score)

optimal_k = int(np.argmax(scores) + 1)
print(f"Optimal number of features: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = pd.DataFrame(
    kbest.fit_transform(X_train_normalized, y_train), 
    columns=X_train_normalized.columns[kbest.get_support()]
)
X_val_kbest = pd.DataFrame(
    kbest.transform(X_val_normalized), 
    columns=X_train_kbest.columns
)
configurations.append(('SelectKBest', 'array', X_train_kbest, X_val_kbest))
print("‚úì Configuration 3: SelectKBest")

# --- Configuration 4: RFECV ---
print("\n=== RFECV Feature Selection ===")
xgb_estimator = XGBClassifier(n_estimators=50, learning_rate=0.1, random_state=RANDOM_STATE, eval_metric='logloss')
rfecv = RFECV(
    estimator=xgb_estimator, 
    step=1, 
    cv=StratifiedKFold(5), 
    scoring='accuracy', 
    n_jobs=-1
)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=xgb_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = pd.DataFrame(
    rfe.fit_transform(X_train_kbest, y_train), 
    columns=X_train_kbest.columns[rfe.get_support()]
)
X_val_rfe = pd.DataFrame(
    rfe.transform(X_val_kbest), 
    columns=X_train_rfe.columns
)
configurations.append(('RFECV', 'array', X_train_rfe, X_val_rfe))
print("‚úì Configuration 4: RFECV")

# --- Configuration 5: PCA ---
print("\n=== PCA Dimensionality Reduction ===")
pca_full = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
desired_variance = 0.90
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
n_components = max(2, n_components)
print(f'Number of components for {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_rfe), index=X_train_rfe.index)
X_val_pca = pd.DataFrame(pca.transform(X_val_rfe), index=X_val_rfe.index)
configurations.append(('PCA', 'array', X_train_pca, X_val_pca))
print("‚úì Configuration 5: PCA")

# --- Configuration 6: SMOTE + StandardScaler (Pipeline) ---
pipeline_smote_scaler = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
])
configurations.append(('SMOTE + StandardScaler', 'pipeline', pipeline_smote_scaler, None))
print("‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)")

# --- Configuration 7: SMOTE + GridSearchCV (Pipeline) ---
pipeline_smote_grid = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
])
configurations.append(('SMOTE + GridSearchCV', 'pipeline', pipeline_smote_grid, None))
print("‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)")

# =============================================================================
# UNDERSAMPLING CONFIGURATIONS
# =============================================================================
print("\n=== Adding Undersampling Configurations ===")

# --- Configuration 8: RandomUnderSampler ---
pipeline_rus = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', RandomUnderSampler(random_state=RANDOM_STATE)),
    ('model', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
])
configurations.append(('RandomUnderSampler', 'pipeline', pipeline_rus, None))
print("‚úì Configuration 8: RandomUnderSampler (Undersampling)")

# --- Configuration 9: TomekLinks ---
pipeline_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', TomekLinks()),
    ('model', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
])
configurations.append(('TomekLinks', 'pipeline', pipeline_tomek, None))
print("‚úì Configuration 9: TomekLinks (Undersampling - removes noisy samples)")

# --- Configuration 10: NearMiss ---
pipeline_nearmiss = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', NearMiss(version=1)),
    ('model', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
])
configurations.append(('NearMiss', 'pipeline', pipeline_nearmiss, None))
print("‚úì Configuration 10: NearMiss (Undersampling - selective)")

# --- Configuration 11: SMOTE + Tomek (Combined) ---
pipeline_smote_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('sampler', SMOTETomek(random_state=RANDOM_STATE)),
    ('model', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
])
configurations.append(('SMOTE + Tomek', 'pipeline', pipeline_smote_tomek, None))
print("‚úì Configuration 11: SMOTE + Tomek (Combined Over + Under)")

print(f"\nTotal configurations: {len(configurations)}")

# Safe ROC AUC helper
def safe_roc_auc(y_true, y_proba):
    try:
        if isinstance(y_proba, np.ndarray) and y_proba.shape[1] == 2:
            return metrics.roc_auc_score(y_true, y_proba[:, 1])
        else:
            return metrics.roc_auc_score(
                pd.get_dummies(y_true), y_proba, 
                multi_class='ovr', average='macro'
            )
    except Exception:
        return np.nan

# =============================================================================
# Run XGBoost with Configuration-Specific GridSearchCV
# =============================================================================
print("\n" + "="*80)
print("RUNNING XGBOOST WITH CONFIGURATION-SPECIFIC HYPERPARAMETER TUNING")
print("="*80)

for name, kind, X_tr_cfg, X_val_cfg in configurations:
    print(f"\n{'='*80}")
    print(f"Configuration: {name}")
    print(f"{'='*80}")
    
    # Get the specific parameter grid for this configuration
    param_grid = hyperparameter_grids[name]
    
    print(f"Hyperparameter grid for '{name}':")
    for key, values in param_grid.items():
        print(f"  {key}: {values}")
    print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}")
    
    try:
        if kind == 'pipeline':
            pipeline = X_tr_cfg
            grid_search = GridSearchCV(
                pipeline, param_grid, 
                cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_train)
            y_val_pred = best_model.predict(X_val)
            y_train_proba = best_model.predict_proba(X_train)
            y_val_proba = best_model.predict_proba(X_val)
        else:
            grid_search = GridSearchCV(
                XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'), 
                param_grid, cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_tr_cfg, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_tr_cfg)
            y_val_pred = best_model.predict(X_val_cfg)
            y_train_proba = best_model.predict_proba(X_tr_cfg)
            y_val_proba = best_model.predict_proba(X_val_cfg)

        # Calculate train-test gap for overfitting detection
        train_acc = metrics.accuracy_score(y_train, y_train_pred)
        test_acc = metrics.accuracy_score(y_val, y_val_pred)
        train_test_gap = train_acc - test_acc

        # Build metrics dictionary
        metrics_dict = {
            "Dataset": ["Training", "Test"],
            "Accuracy": [train_acc, test_acc],
            "F1 Score": [
                metrics.f1_score(y_train, y_train_pred, average='macro'),
                metrics.f1_score(y_val, y_val_pred, average='macro'),
            ],
            "Recall": [
                metrics.recall_score(y_train, y_train_pred, average='macro'),
                metrics.recall_score(y_val, y_val_pred, average='macro'),
            ],
            "Precision": [
                metrics.precision_score(y_train, y_train_pred, average='macro', zero_division=0),
                metrics.precision_score(y_val, y_val_pred, average='macro', zero_division=0),
            ],
            "AUC-ROC": [
                safe_roc_auc(y_train, y_train_proba),
                safe_roc_auc(y_val, y_val_proba),
            ]
        }

        df_metrics = pd.DataFrame(metrics_dict)
        pd.options.display.float_format = '{:.6f}'.format
        print("\nüìä XGBoost Model Performance Metrics")
        print(df_metrics.to_string(index=False))

        # Overfitting warning
        if train_test_gap > 0.10:
            print(f"\n‚ö†Ô∏è  WARNING: Overfitting detected! Train-Test gap: {train_test_gap:.4f}")
            print("   Consider: reducing max_depth, increasing min_child_weight, or increasing gamma/regularization")
        elif train_test_gap < 0.05:
            print(f"\n‚úì Excellent generalization. Train-Test gap: {train_test_gap:.4f}")
        else:
            print(f"\n‚Üí Acceptable gap: {train_test_gap:.4f}")

        # Store test metrics
        test_metrics = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
        try:
            store_results(
                'XGBoost',
                name,
                float(test_metrics['Accuracy']),
                float(test_metrics['F1 Score']),
                float(test_metrics['Recall']),
                float(test_metrics['Precision']),
                float(test_metrics['AUC-ROC'])
            )
        except:
            ML_Model.append('XGBoost')
            ML_Config.append(name)
            accuracy.append(round(float(test_metrics['Accuracy']), 6))
            f1.append(round(float(test_metrics['F1 Score']), 6))
            recall.append(round(float(test_metrics['Recall']), 6))
            precision.append(round(float(test_metrics['Precision']), 6))
            roc_auc.append(round(float(test_metrics['AUC-ROC']), 6))

        print("\nüéØ Best hyperparameters found:")
        best_params = grid_search.best_params_
        for param, value in best_params.items():
            print(f"  {param}: {value}")
        print(f"\nBest CV score: {grid_search.best_score_:.6f}")
        
        # Feature importance for non-pipeline configurations
        if kind == 'array' and hasattr(best_model, 'feature_importances_'):
            print("\nüöÄ Top 10 Most Important Features:")
            importances = best_model.feature_importances_
            indices = np.argsort(importances)[::-1][:10]
            feature_cols = X_tr_cfg.columns if hasattr(X_tr_cfg, 'columns') else [f'Feature {i}' for i in range(len(importances))]
            for i, idx in enumerate(indices, 1):
                print(f"  {i}. {feature_cols[idx]}: {importances[idx]:.4f}")
        
    except Exception as e:
        print(f"‚ùå Error in configuration '{name}': {str(e)}")
        import traceback
        traceback.print_exc()
        continue

print("\n" + "="*80)
print("‚úÖ XGBoost evaluation complete for all configurations.")
print("="*80)

# Display final results
try:
    display_and_save_results('xgboost_all_configs')
except:
    result = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Recall': recall,
        'Precision': precision,
        'ROC_AUC': roc_auc,
    })
    print("\nüìà Final Results:")
    print(result.to_string(index=False))
    
    # Sort by F1 Score
    sorted_result = result.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)
    print("\nüèÜ Sorted Results (by F1 Score):")
    print(sorted_result.to_string(index=False))

    # Group by sampling technique
    print("\nüìä Performance by Sampling Technique:")
    print("-" * 80)
    sampling_groups = {
        'No Sampling': ['Preprocessed Data', 'Normalized Data', 'SelectKBest', 'RFECV', 'PCA'],
        'Oversampling': ['SMOTE + StandardScaler', 'SMOTE + GridSearchCV'],
        'Undersampling': ['RandomUnderSampler', 'TomekLinks', 'NearMiss'],
        'Combined': ['SMOTE + Tomek']
    }

    for group_name, configs in sampling_groups.items():
        group_data = result[result['Configuration'].isin(configs)]
        if not group_data.empty:
            print(f"\n{group_name}:")
            print(f"  Avg Accuracy: {group_data['Accuracy'].mean():.6f}")
            print(f"  Avg F1 Score: {group_data['F1 Score'].mean():.6f}")
            print(f"  Avg ROC-AUC: {group_data['ROC_AUC'].mean():.6f}")
            print(f"  Best Config: {group_data.loc[group_data['F1 Score'].idxmax(), 'Configuration']}")
    
    # Compare with baseline
    if len(result) > 0:
        best_idx = result['F1 Score'].idxmax()
        print(f"\nüèÖ Best Overall Configuration:")
        print(f"  Configuration: {result.loc[best_idx, 'Configuration']}")
        print(f"  F1 Score: {result.loc[best_idx, 'F1 Score']:.6f}")
        print(f"  Accuracy: {result.loc[best_idx, 'Accuracy']:.6f}")
        print(f"  ROC-AUC: {result.loc[best_idx, 'ROC_AUC']:.6f}")



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 15

=== RFECV Feature Selection with XGBoost ===
Optimal number of features selected by RFECV: 15

=== PCA Dimensionality Reduction ===
Number of components that explain 90.0% variance: 13

=== XGBoost Model Performance with Hyperparameter Tuning ===

Running XGBoost with Original Data configuration...
Fitting 10 folds for each of 162 candidates, totalling 1620 fits

XGBoost Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.952607  0.755429 0.651515   0.983498 0.986044
Best hyperparameters found by GridSearchCV:
{'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.6}

Running XGBoost with Normalized Data configuration...
Fitting 10 folds for each of 162 candidates, totalling 1620 fits

XGBoost Model Performance Metrics
 Dataset  Accuracy  F1 Scor

---

# Voting Classifier

In [None]:
# =============================================================================
# Voting Classifier with Configuration-Specific Hyperparameter Grids
# =============================================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE, RFECV
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import metrics
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42

# Define preprocessor for categorical and numeric features
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")

# Create preprocessing transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Apply preprocessing to get fully numeric data first
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

# Get feature names after preprocessing
try:
    feature_names = (numeric_features + 
                    list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))
except:
    feature_names = [f'feature_{i}' for i in range(X_train_preprocessed.shape[1])]

X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=feature_names, index=X_train.index)
X_val_preprocessed = pd.DataFrame(X_val_preprocessed, columns=feature_names, index=X_val.index)

print(f"\nPreprocessed data shape: {X_train_preprocessed.shape}")
print(f"All features are now numeric: {X_train_preprocessed.select_dtypes(include=np.number).shape[1] == X_train_preprocessed.shape[1]}")

# Clear previous results
try:
    clear_results()
except:
    ML_Model = []
    ML_Config = []
    accuracy = []
    f1 = []
    recall = []
    precision = []
    roc_auc = []
    print("Initialized result storage lists.")

print("\n=== START: Voting Classifier Configuration Sweep with Custom Hyperparameters ===\n")

# =============================================================================
# Configuration-Specific Hyperparameter Grids for Voting Classifier
# =============================================================================

# Grid 1: Preprocessed Data - Explore voting types and weights
param_grid_1 = {
    'voting': ['soft', 'hard'],
    'weights': [(1,1,1,1), (2,1,1,1), (1,2,1,1), (1,1,2,1), (1,1,1,2)],
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [10, 20],
    'gb__n_estimators': [100, 200],
    'gb__learning_rate': [0.05, 0.1],
    'xgb__n_estimators': [100, 200],
    'xgb__max_depth': [3, 5]
}

# Grid 2: Normalized Data - Focus on individual estimator tuning
param_grid_2 = {
    'voting': ['soft'],
    'weights': [(1,1,1,1), (2,1,1,1), (1,1,2,1)],
    'lr__C': [0.1, 1.0, 10],
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [15, 25],
    'gb__n_estimators': [100, 200],
    'gb__learning_rate': [0.01, 0.1],
    'xgb__learning_rate': [0.05, 0.1]
}

# Grid 3: SelectKBest - More conservative parameters
param_grid_3 = {
    'voting': ['soft', 'hard'],
    'weights': [(1,1,1,1), (2,1,1,1)],
    'rf__n_estimators': [50, 100],
    'rf__max_depth': [5, 10],
    'rf__min_samples_split': [5, 10],
    'gb__n_estimators': [50, 100],
    'gb__learning_rate': [0.05, 0.1],
    'xgb__max_depth': [3, 5],
    'xgb__reg_alpha': [0, 0.1]
}

# Grid 4: RFECV - Balanced tuning
param_grid_4 = {
    'voting': ['soft'],
    'weights': [(1,1,1,1), (1,2,1,1), (1,1,1,2)],
    'lr__C': [1.0, 10],
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [10, 20],
    'gb__learning_rate': [0.05, 0.1],
    'xgb__n_estimators': [100, 200]
}

# Grid 5: PCA - Optimized for reduced dimensions
param_grid_5 = {
    'voting': ['soft', 'hard'],
    'weights': [(1,1,1,1), (2,1,1,1), (1,1,2,1)],
    'lr__C': [0.1, 1.0],
    'rf__n_estimators': [100, 200],
    'gb__n_estimators': [100, 200],
    'gb__learning_rate': [0.1, 0.2],
    'xgb__learning_rate': [0.05, 0.1]
}

# Grid 6 & 7: SMOTE pipelines - Handle imbalanced data
param_grid_smote = {
    'model__voting': ['soft'],
    'model__weights': [(1,1,1,1), (2,1,1,1)],
    'model__rf__n_estimators': [100, 200],
    'model__rf__max_depth': [10, 20],
    'model__gb__n_estimators': [100, 200],
    'model__xgb__max_depth': [3, 5]
}

# Grid 8-11: Undersampling and Combined pipelines
param_grid_sampling = {
    'model__voting': ['soft'],
    'model__weights': [(1,1,1,1), (2,1,1,1)],
    'model__rf__n_estimators': [100, 200],
    'model__gb__learning_rate': [0.05, 0.1],
    'model__xgb__n_estimators': [100, 200]
}

# Map grids to configurations
hyperparameter_grids = {
    'Preprocessed Data': param_grid_1,
    'Normalized Data': param_grid_2,
    'SelectKBest': param_grid_3,
    'RFECV': param_grid_4,
    'PCA': param_grid_5,
    'SMOTE + StandardScaler': param_grid_smote,
    'SMOTE + GridSearchCV': param_grid_smote,
    'RandomUnderSampler': param_grid_sampling,
    'TomekLinks': param_grid_sampling,
    'NearMiss': param_grid_sampling,
    'SMOTE + Tomek': param_grid_sampling,
}

# Initialize configurations list
configurations = []

# --- Configuration 1: Preprocessed Data ---
configurations.append(('Preprocessed Data', 'array', X_train_preprocessed, X_val_preprocessed))
print("‚úì Configuration 1: Preprocessed Data")

# --- Configuration 2: Normalized Data (MinMax on preprocessed) ---
scaler_minmax = MinMaxScaler()
X_train_normalized = pd.DataFrame(
    scaler_minmax.fit_transform(X_train_preprocessed), 
    columns=X_train_preprocessed.columns, 
    index=X_train_preprocessed.index
)
X_val_normalized = pd.DataFrame(
    scaler_minmax.transform(X_val_preprocessed), 
    columns=X_val_preprocessed.columns, 
    index=X_val_preprocessed.index
)
configurations.append(('Normalized Data', 'array', X_train_normalized, X_val_normalized))
print("‚úì Configuration 2: Normalized Data (MinMax)")

# --- Configuration 3: SelectKBest ---
print("\n=== SelectKBest Feature Selection ===")
scores = []
max_features = min(X_train_normalized.shape[1], 20)

# Create base estimators for voting
lr = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)
rf = RandomForestClassifier(random_state=RANDOM_STATE)
gb = GradientBoostingClassifier(random_state=RANDOM_STATE)
xgb = XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss')

for k in range(1, max_features + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_tr_k = kbest.fit_transform(X_train_normalized, y_train)
    
    voting_clf = VotingClassifier(
        estimators=[('lr', lr), ('rf', rf), ('gb', gb), ('xgb', xgb)],
        voting='soft'
    )
    
    score = cross_val_score(
        voting_clf, X_tr_k, y_train, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()
    scores.append(score)

optimal_k = int(np.argmax(scores) + 1)
print(f"Optimal number of features: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = pd.DataFrame(
    kbest.fit_transform(X_train_normalized, y_train), 
    columns=X_train_normalized.columns[kbest.get_support()]
)
X_val_kbest = pd.DataFrame(
    kbest.transform(X_val_normalized), 
    columns=X_train_kbest.columns
)
configurations.append(('SelectKBest', 'array', X_train_kbest, X_val_kbest))
print("‚úì Configuration 3: SelectKBest")

# --- Configuration 4: RFECV ---
print("\n=== RFECV Feature Selection ===")
voting_estimator = VotingClassifier(
    estimators=[('lr', lr), ('rf', rf), ('gb', gb), ('xgb', xgb)],
    voting='soft'
)
rfecv = RFECV(
    estimator=voting_estimator, 
    step=1, 
    cv=StratifiedKFold(5), 
    scoring='accuracy', 
    n_jobs=-1
)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=voting_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = pd.DataFrame(
    rfe.fit_transform(X_train_kbest, y_train), 
    columns=X_train_kbest.columns[rfe.get_support()]
)
X_val_rfe = pd.DataFrame(
    rfe.transform(X_val_kbest), 
    columns=X_train_rfe.columns
)
configurations.append(('RFECV', 'array', X_train_rfe, X_val_rfe))
print("‚úì Configuration 4: RFECV")

# --- Configuration 5: PCA ---
print("\n=== PCA Dimensionality Reduction ===")
pca_full = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
desired_variance = 0.90
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
n_components = max(2, n_components)
print(f'Number of components for {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_rfe), index=X_train_rfe.index)
X_val_pca = pd.DataFrame(pca.transform(X_val_rfe), index=X_val_rfe.index)
configurations.append(('PCA', 'array', X_train_pca, X_val_pca))
print("‚úì Configuration 5: PCA")

# --- Configuration 6: SMOTE + StandardScaler (Pipeline) ---
pipeline_smote_scaler = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', VotingClassifier(
        estimators=[
            ('lr', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
            ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
            ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE)),
            ('xgb', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
        ],
        voting='soft'
    ))
])
configurations.append(('SMOTE + StandardScaler', 'pipeline', pipeline_smote_scaler, None))
print("‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)")

# --- Configuration 7: SMOTE + GridSearchCV (Pipeline) ---
pipeline_smote_grid = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', VotingClassifier(
        estimators=[
            ('lr', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
            ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
            ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE)),
            ('xgb', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
        ],
        voting='soft'
    ))
])
configurations.append(('SMOTE + GridSearchCV', 'pipeline', pipeline_smote_grid, None))
print("‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)")

# =============================================================================
# UNDERSAMPLING CONFIGURATIONS
# =============================================================================
print("\n=== Adding Undersampling Configurations ===")

# --- Configuration 8: RandomUnderSampler ---
pipeline_rus = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', RandomUnderSampler(random_state=RANDOM_STATE)),
    ('model', VotingClassifier(
        estimators=[
            ('lr', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
            ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
            ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE)),
            ('xgb', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
        ],
        voting='soft'
    ))
])
configurations.append(('RandomUnderSampler', 'pipeline', pipeline_rus, None))
print("‚úì Configuration 8: RandomUnderSampler (Undersampling)")

# --- Configuration 9: TomekLinks ---
pipeline_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', TomekLinks()),
    ('model', VotingClassifier(
        estimators=[
            ('lr', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
            ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
            ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE)),
            ('xgb', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
        ],
        voting='soft'
    ))
])
configurations.append(('TomekLinks', 'pipeline', pipeline_tomek, None))
print("‚úì Configuration 9: TomekLinks (Undersampling - removes noisy samples)")

# --- Configuration 10: NearMiss ---
pipeline_nearmiss = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', NearMiss(version=1)),
    ('model', VotingClassifier(
        estimators=[
            ('lr', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
            ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
            ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE)),
            ('xgb', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
        ],
        voting='soft'
    ))
])
configurations.append(('NearMiss', 'pipeline', pipeline_nearmiss, None))
print("‚úì Configuration 10: NearMiss (Undersampling - selective)")

# --- Configuration 11: SMOTE + Tomek (Combined) ---
pipeline_smote_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('sampler', SMOTETomek(random_state=RANDOM_STATE)),
    ('model', VotingClassifier(
        estimators=[
            ('lr', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
            ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
            ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE)),
            ('xgb', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
        ],
        voting='soft'
    ))
])
configurations.append(('SMOTE + Tomek', 'pipeline', pipeline_smote_tomek, None))
print("‚úì Configuration 11: SMOTE + Tomek (Combined Over + Under)")

print(f"\nTotal configurations: {len(configurations)}")

# Safe ROC AUC helper
def safe_roc_auc(y_true, y_proba):
    try:
        if isinstance(y_proba, np.ndarray) and y_proba.shape[1] == 2:
            return metrics.roc_auc_score(y_true, y_proba[:, 1])
        else:
            return metrics.roc_auc_score(
                pd.get_dummies(y_true), y_proba, 
                multi_class='ovr', average='macro'
            )
    except Exception:
        return np.nan

# =============================================================================
# Run Voting Classifier with Configuration-Specific GridSearchCV
# =============================================================================
print("\n" + "="*80)
print("RUNNING VOTING CLASSIFIER WITH CONFIGURATION-SPECIFIC HYPERPARAMETER TUNING")
print("="*80)

for name, kind, X_tr_cfg, X_val_cfg in configurations:
    print(f"\n{'='*80}")
    print(f"Configuration: {name}")
    print(f"{'='*80}")
    
    # Get the specific parameter grid for this configuration
    param_grid = hyperparameter_grids[name]
    
    print(f"Hyperparameter grid for '{name}':")
    for key, values in param_grid.items():
        print(f"  {key}: {values}")
    print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}")
    
    try:
        if kind == 'pipeline':
            pipeline = X_tr_cfg
            grid_search = GridSearchCV(
                pipeline, param_grid, 
                cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_train)
            y_val_pred = best_model.predict(X_val)
            y_train_proba = best_model.predict_proba(X_train)
            y_val_proba = best_model.predict_proba(X_val)
        else:
            # Create voting classifier with base estimators
            voting_clf = VotingClassifier(
                estimators=[
                    ('lr', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
                    ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
                    ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE)),
                    ('xgb', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
                ],
                voting='soft'
            )
            
            grid_search = GridSearchCV(
                voting_clf, param_grid, 
                cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_tr_cfg, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_tr_cfg)
            y_val_pred = best_model.predict(X_val_cfg)
            y_train_proba = best_model.predict_proba(X_tr_cfg)
            y_val_proba = best_model.predict_proba(X_val_cfg)

        # Calculate train-test gap for overfitting detection
        train_acc = metrics.accuracy_score(y_train, y_train_pred)
        test_acc = metrics.accuracy_score(y_val, y_val_pred)
        train_test_gap = train_acc - test_acc

        # Build metrics dictionary
        metrics_dict = {
            "Dataset": ["Training", "Test"],
            "Accuracy": [train_acc, test_acc],
            "F1 Score": [
                metrics.f1_score(y_train, y_train_pred, average='macro'),
                metrics.f1_score(y_val, y_val_pred, average='macro'),
            ],
            "Recall": [
                metrics.recall_score(y_train, y_train_pred, average='macro'),
                metrics.recall_score(y_val, y_val_pred, average='macro'),
            ],
            "Precision": [
                metrics.precision_score(y_train, y_train_pred, average='macro', zero_division=0),
                metrics.precision_score(y_val, y_val_pred, average='macro', zero_division=0),
            ],
            "AUC-ROC": [
                safe_roc_auc(y_train, y_train_proba),
                safe_roc_auc(y_val, y_val_proba),
            ]
        }

        df_metrics = pd.DataFrame(metrics_dict)
        pd.options.display.float_format = '{:.6f}'.format
        print("\nüìä Voting Classifier Model Performance Metrics")
        print(df_metrics.to_string(index=False))

        # Overfitting warning
        if train_test_gap > 0.10:
            print(f"\n‚ö†Ô∏è  WARNING: Overfitting detected! Train-Test gap: {train_test_gap:.4f}")
            print("   Consider: adjusting individual estimator parameters or changing voting weights")
        elif train_test_gap < 0.05:
            print(f"\n‚úì Excellent generalization. Train-Test gap: {train_test_gap:.4f}")
        else:
            print(f"\n‚Üí Acceptable gap: {train_test_gap:.4f}")

        # Store test metrics
        test_metrics = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
        try:
            store_results(
                'Voting Classifier',
                name,
                float(test_metrics['Accuracy']),
                float(test_metrics['F1 Score']),
                float(test_metrics['Recall']),
                float(test_metrics['Precision']),
                float(test_metrics['AUC-ROC'])
            )
        except:
            ML_Model.append('Voting Classifier')
            ML_Config.append(name)
            accuracy.append(round(float(test_metrics['Accuracy']), 6))
            f1.append(round(float(test_metrics['F1 Score']), 6))
            recall.append(round(float(test_metrics['Recall']), 6))
            precision.append(round(float(test_metrics['Precision']), 6))
            roc_auc.append(round(float(test_metrics['AUC-ROC']), 6))

        print("\nüéØ Best hyperparameters found:")
        best_params = grid_search.best_params_
        for param, value in best_params.items():
            print(f"  {param}: {value}")
        print(f"\nBest CV score: {grid_search.best_score_:.6f}")
        
        # Display estimator information
        print(f"\nüó≥Ô∏è Voting Ensemble Details:")
        print(f"  Voting Type: {best_model.voting}")
        if hasattr(best_model, 'weights') and best_model.weights is not None:
            print(f"  Estimator Weights: {best_model.weights}")
        print(f"  Number of Base Estimators: {len(best_model.estimators_)}")
        print(f"  Base Estimators: {[name for name, _ in best_model.estimators]}")
        
    except Exception as e:
        print(f"‚ùå Error in configuration '{name}': {str(e)}")
        import traceback
        traceback.print_exc()
        continue

print("\n" + "="*80)
print("‚úÖ Voting Classifier evaluation complete for all configurations.")
print("="*80)

# Display final results
try:
    display_and_save_results('voting_classifier_all_configs')
except:
    result = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Recall': recall,
        'Precision': precision,
        'ROC_AUC': roc_auc,
    })
    print("\nüìà Final Results:")
    print(result.to_string(index=False))
    
    # Sort by F1 Score
    sorted_result = result.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)
    print("\nüèÜ Sorted Results (by F1 Score):")
    print(sorted_result.to_string(index=False))

    # Group by sampling technique
    print("\nüìä Performance by Sampling Technique:")
    print("-" * 80)
    sampling_groups = {
        'No Sampling': ['Preprocessed Data', 'Normalized Data', 'SelectKBest', 'RFECV', 'PCA'],
        'Oversampling': ['SMOTE + StandardScaler', 'SMOTE + GridSearchCV'],
        'Undersampling': ['RandomUnderSampler', 'TomekLinks', 'NearMiss'],
        'Combined': ['SMOTE + Tomek']
    }

    for group_name, configs in sampling_groups.items():
        group_data = result[result['Configuration'].isin(configs)]
        if not group_data.empty:
            print(f"\n{group_name}:")
            print(f"  Avg Accuracy: {group_data['Accuracy'].mean():.6f}")
            print(f"  Avg F1 Score: {group_data['F1 Score'].mean():.6f}")
            print(f"  Avg ROC-AUC: {group_data['ROC_AUC'].mean():.6f}")
            print(f"  Best Config: {group_data.loc[group_data['F1 Score'].idxmax(), 'Configuration']}")
    
    # Compare with baseline
    if len(result) > 0:
        best_idx = result['F1 Score'].idxmax()
        print(f"\nüèÖ Best Overall Configuration:")
        print(f"  Configuration: {result.loc[best_idx, 'Configuration']}")
        print(f"  F1 Score: {result.loc[best_idx, 'F1 Score']:.6f}")
        print(f"  Accuracy: {result.loc[best_idx, 'Accuracy']:.6f}")
        print(f"  ROC-AUC: {result.loc[best_idx, 'ROC_AUC']:.6f}")


---

# Stacking Classifier

In [None]:
# =============================================================================
# Stacking Classifier with Configuration-Specific Hyperparameter Grids
# =============================================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE, RFECV
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import metrics
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42

# Define preprocessor for categorical and numeric features
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")

# Create preprocessing transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Apply preprocessing to get fully numeric data first
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

# Get feature names after preprocessing
try:
    feature_names = (numeric_features + 
                    list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))
except:
    feature_names = [f'feature_{i}' for i in range(X_train_preprocessed.shape[1])]

X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=feature_names, index=X_train.index)
X_val_preprocessed = pd.DataFrame(X_val_preprocessed, columns=feature_names, index=X_val.index)

print(f"\nPreprocessed data shape: {X_train_preprocessed.shape}")
print(f"All features are now numeric: {X_train_preprocessed.select_dtypes(include=np.number).shape[1] == X_train_preprocessed.shape[1]}")

# Clear previous results
try:
    clear_results()
except:
    ML_Model = []
    ML_Config = []
    accuracy = []
    f1 = []
    recall = []
    precision = []
    roc_auc = []
    print("Initialized result storage lists.")

print("\n=== START: Stacking Classifier Configuration Sweep with Custom Hyperparameters ===\n")

# =============================================================================
# Configuration-Specific Hyperparameter Grids for Stacking Classifier
# =============================================================================

# Grid 1: Preprocessed Data - Explore meta-learners and passthrough
param_grid_1 = {
    'passthrough': [False, True],
    'cv': [5],
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [10, 20],
    'gb__n_estimators': [100, 200],
    'gb__learning_rate': [0.05, 0.1],
    'xgb__n_estimators': [100, 200],
    'xgb__max_depth': [3, 5],
    'final_estimator__C': [0.1, 1.0, 10]
}

# Grid 2: Normalized Data - Focus on meta-learner tuning
param_grid_2 = {
    'passthrough': [False, True],
    'cv': [5],
    'lr__C': [0.1, 1.0, 10],
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [15, 25],
    'gb__n_estimators': [100, 200],
    'gb__learning_rate': [0.01, 0.1],
    'xgb__learning_rate': [0.05, 0.1],
    'final_estimator__C': [1.0, 10]
}

# Grid 3: SelectKBest - More conservative parameters
param_grid_3 = {
    'passthrough': [False],
    'cv': [5],
    'rf__n_estimators': [50, 100],
    'rf__max_depth': [5, 10],
    'rf__min_samples_split': [5, 10],
    'gb__n_estimators': [50, 100],
    'gb__learning_rate': [0.05, 0.1],
    'xgb__max_depth': [3, 5],
    'xgb__reg_alpha': [0, 0.1],
    'final_estimator__C': [1.0, 10]
}

# Grid 4: RFECV - Balanced tuning with passthrough
param_grid_4 = {
    'passthrough': [False, True],
    'cv': [5],
    'lr__C': [1.0, 10],
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [10, 20],
    'gb__learning_rate': [0.05, 0.1],
    'xgb__n_estimators': [100, 200],
    'final_estimator__C': [0.1, 1.0]
}

# Grid 5: PCA - Optimized for reduced dimensions
param_grid_5 = {
    'passthrough': [False, True],
    'cv': [5],
    'lr__C': [0.1, 1.0],
    'rf__n_estimators': [100, 200],
    'gb__n_estimators': [100, 200],
    'gb__learning_rate': [0.1, 0.2],
    'xgb__learning_rate': [0.05, 0.1],
    'final_estimator__C': [1.0, 10]
}

# Grid 6 & 7: SMOTE pipelines - Handle imbalanced data
param_grid_smote = {
    'model__passthrough': [False],
    'model__cv': [5],
    'model__rf__n_estimators': [100, 200],
    'model__rf__max_depth': [10, 20],
    'model__gb__n_estimators': [100, 200],
    'model__xgb__max_depth': [3, 5],
    'model__final_estimator__C': [1.0, 10]
}

# Grid 8-11: Undersampling and Combined pipelines
param_grid_sampling = {
    'model__passthrough': [False],
    'model__cv': [5],
    'model__rf__n_estimators': [100, 200],
    'model__gb__learning_rate': [0.05, 0.1],
    'model__xgb__n_estimators': [100, 200],
    'model__final_estimator__C': [1.0, 10]
}

# Map grids to configurations
hyperparameter_grids = {
    'Preprocessed Data': param_grid_1,
    'Normalized Data': param_grid_2,
    'SelectKBest': param_grid_3,
    'RFECV': param_grid_4,
    'PCA': param_grid_5,
    'SMOTE + StandardScaler': param_grid_smote,
    'SMOTE + GridSearchCV': param_grid_smote,
    'RandomUnderSampler': param_grid_sampling,
    'TomekLinks': param_grid_sampling,
    'NearMiss': param_grid_sampling,
    'SMOTE + Tomek': param_grid_sampling,
}

# Initialize configurations list
configurations = []

# --- Configuration 1: Preprocessed Data ---
configurations.append(('Preprocessed Data', 'array', X_train_preprocessed, X_val_preprocessed))
print("‚úì Configuration 1: Preprocessed Data")

# --- Configuration 2: Normalized Data (MinMax on preprocessed) ---
scaler_minmax = MinMaxScaler()
X_train_normalized = pd.DataFrame(
    scaler_minmax.fit_transform(X_train_preprocessed), 
    columns=X_train_preprocessed.columns, 
    index=X_train_preprocessed.index
)
X_val_normalized = pd.DataFrame(
    scaler_minmax.transform(X_val_preprocessed), 
    columns=X_val_preprocessed.columns, 
    index=X_val_preprocessed.index
)
configurations.append(('Normalized Data', 'array', X_train_normalized, X_val_normalized))
print("‚úì Configuration 2: Normalized Data (MinMax)")

# --- Configuration 3: SelectKBest ---
print("\n=== SelectKBest Feature Selection ===")
scores = []
max_features = min(X_train_normalized.shape[1], 20)

# Create base estimators for stacking
lr = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)
rf = RandomForestClassifier(random_state=RANDOM_STATE)
gb = GradientBoostingClassifier(random_state=RANDOM_STATE)
xgb = XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss')

for k in range(1, max_features + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_tr_k = kbest.fit_transform(X_train_normalized, y_train)
    
    stacking_clf = StackingClassifier(
        estimators=[('lr', lr), ('rf', rf), ('gb', gb), ('xgb', xgb)],
        final_estimator=LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
        cv=5
    )
    
    score = cross_val_score(
        stacking_clf, X_tr_k, y_train, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()
    scores.append(score)

optimal_k = int(np.argmax(scores) + 1)
print(f"Optimal number of features: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = pd.DataFrame(
    kbest.fit_transform(X_train_normalized, y_train), 
    columns=X_train_normalized.columns[kbest.get_support()]
)
X_val_kbest = pd.DataFrame(
    kbest.transform(X_val_normalized), 
    columns=X_train_kbest.columns
)
configurations.append(('SelectKBest', 'array', X_train_kbest, X_val_kbest))
print("‚úì Configuration 3: SelectKBest")

# --- Configuration 4: RFECV ---
print("\n=== RFECV Feature Selection ===")
stacking_estimator = StackingClassifier(
    estimators=[('lr', lr), ('rf', rf), ('gb', gb), ('xgb', xgb)],
    final_estimator=LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
    cv=5
)
rfecv = RFECV(
    estimator=stacking_estimator, 
    step=1, 
    cv=StratifiedKFold(5), 
    scoring='accuracy', 
    n_jobs=-1
)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=stacking_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = pd.DataFrame(
    rfe.fit_transform(X_train_kbest, y_train), 
    columns=X_train_kbest.columns[rfe.get_support()]
)
X_val_rfe = pd.DataFrame(
    rfe.transform(X_val_kbest), 
    columns=X_train_rfe.columns
)
configurations.append(('RFECV', 'array', X_train_rfe, X_val_rfe))
print("‚úì Configuration 4: RFECV")

# --- Configuration 5: PCA ---
print("\n=== PCA Dimensionality Reduction ===")
pca_full = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
desired_variance = 0.90
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
n_components = max(2, n_components)
print(f'Number of components for {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_rfe), index=X_train_rfe.index)
X_val_pca = pd.DataFrame(pca.transform(X_val_rfe), index=X_val_rfe.index)
configurations.append(('PCA', 'array', X_train_pca, X_val_pca))
print("‚úì Configuration 5: PCA")

# --- Configuration 6: SMOTE + StandardScaler (Pipeline) ---
pipeline_smote_scaler = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', StackingClassifier(
        estimators=[
            ('lr', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
            ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
            ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE)),
            ('xgb', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
        ],
        final_estimator=LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
        cv=5
    ))
])
configurations.append(('SMOTE + StandardScaler', 'pipeline', pipeline_smote_scaler, None))
print("‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)")

# --- Configuration 7: SMOTE + GridSearchCV (Pipeline) ---
pipeline_smote_grid = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', StackingClassifier(
        estimators=[
            ('lr', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
            ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
            ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE)),
            ('xgb', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
        ],
        final_estimator=LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
        cv=5
    ))
])
configurations.append(('SMOTE + GridSearchCV', 'pipeline', pipeline_smote_grid, None))
print("‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)")

# =============================================================================
# UNDERSAMPLING CONFIGURATIONS
# =============================================================================
print("\n=== Adding Undersampling Configurations ===")

# --- Configuration 8: RandomUnderSampler ---
pipeline_rus = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', RandomUnderSampler(random_state=RANDOM_STATE)),
    ('model', StackingClassifier(
        estimators=[
            ('lr', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
            ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
            ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE)),
            ('xgb', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
        ],
        final_estimator=LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
        cv=5
    ))
])
configurations.append(('RandomUnderSampler', 'pipeline', pipeline_rus, None))
print("‚úì Configuration 8: RandomUnderSampler (Undersampling)")

# --- Configuration 9: TomekLinks ---
pipeline_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', TomekLinks()),
    ('model', StackingClassifier(
        estimators=[
            ('lr', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
            ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
            ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE)),
            ('xgb', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
        ],
        final_estimator=LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
        cv=5
    ))
])
configurations.append(('TomekLinks', 'pipeline', pipeline_tomek, None))
print("‚úì Configuration 9: TomekLinks (Undersampling - removes noisy samples)")

# --- Configuration 10: NearMiss ---
pipeline_nearmiss = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('undersampler', NearMiss(version=1)),
    ('model', StackingClassifier(
        estimators=[
            ('lr', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
            ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
            ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE)),
            ('xgb', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
        ],
        final_estimator=LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
        cv=5
    ))
])
configurations.append(('NearMiss', 'pipeline', pipeline_nearmiss, None))
print("‚úì Configuration 10: NearMiss (Undersampling - selective)")

# --- Configuration 11: SMOTE + Tomek (Combined) ---
pipeline_smote_tomek = ImbPipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])),
    ('sampler', SMOTETomek(random_state=RANDOM_STATE)),
    ('model', StackingClassifier(
        estimators=[
            ('lr', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
            ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
            ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE)),
            ('xgb', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
        ],
        final_estimator=LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
        cv=5
    ))
])
configurations.append(('SMOTE + Tomek', 'pipeline', pipeline_smote_tomek, None))
print("‚úì Configuration 11: SMOTE + Tomek (Combined Over + Under)")

print(f"\nTotal configurations: {len(configurations)}")

# Safe ROC AUC helper
def safe_roc_auc(y_true, y_proba):
    try:
        if isinstance(y_proba, np.ndarray) and y_proba.shape[1] == 2:
            return metrics.roc_auc_score(y_true, y_proba[:, 1])
        else:
            return metrics.roc_auc_score(
                pd.get_dummies(y_true), y_proba, 
                multi_class='ovr', average='macro'
            )
    except Exception:
        return np.nan

# =============================================================================
# Run Stacking Classifier with Configuration-Specific GridSearchCV
# =============================================================================
print("\n" + "="*80)
print("RUNNING STACKING CLASSIFIER WITH CONFIGURATION-SPECIFIC HYPERPARAMETER TUNING")
print("="*80)

for name, kind, X_tr_cfg, X_val_cfg in configurations:
    print(f"\n{'='*80}")
    print(f"Configuration: {name}")
    print(f"{'='*80}")
    
    # Get the specific parameter grid for this configuration
    param_grid = hyperparameter_grids[name]
    
    print(f"Hyperparameter grid for '{name}':")
    for key, values in param_grid.items():
        print(f"  {key}: {values}")
    print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}")
    
    try:
        if kind == 'pipeline':
            pipeline = X_tr_cfg
            grid_search = GridSearchCV(
                pipeline, param_grid, 
                cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_train)
            y_val_pred = best_model.predict(X_val)
            y_train_proba = best_model.predict_proba(X_train)
            y_val_proba = best_model.predict_proba(X_val)
        else:
            # Create stacking classifier with base estimators and meta-learner
            stacking_clf = StackingClassifier(
                estimators=[
                    ('lr', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
                    ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
                    ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE)),
                    ('xgb', XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss'))
                ],
                final_estimator=LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
                cv=5
            )
            
            grid_search = GridSearchCV(
                stacking_clf, param_grid, 
                cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
            )
            grid_search.fit(X_tr_cfg, y_train)
            best_model = grid_search.best_estimator_
            y_train_pred = best_model.predict(X_tr_cfg)
            y_val_pred = best_model.predict(X_val_cfg)
            y_train_proba = best_model.predict_proba(X_tr_cfg)
            y_val_proba = best_model.predict_proba(X_val_cfg)

        # Calculate train-test gap for overfitting detection
        train_acc = metrics.accuracy_score(y_train, y_train_pred)
        test_acc = metrics.accuracy_score(y_val, y_val_pred)
        train_test_gap = train_acc - test_acc

        # Build metrics dictionary
        metrics_dict = {
            "Dataset": ["Training", "Test"],
            "Accuracy": [train_acc, test_acc],
            "F1 Score": [
                metrics.f1_score(y_train, y_train_pred, average='macro'),
                metrics.f1_score(y_val, y_val_pred, average='macro'),
            ],
            "Recall": [
                metrics.recall_score(y_train, y_train_pred, average='macro'),
                metrics.recall_score(y_val, y_val_pred, average='macro'),
            ],
            "Precision": [
                metrics.precision_score(y_train, y_train_pred, average='macro', zero_division=0),
                metrics.precision_score(y_val, y_val_pred, average='macro', zero_division=0),
            ],
            "AUC-ROC": [
                safe_roc_auc(y_train, y_train_proba),
                safe_roc_auc(y_val, y_val_proba),
            ]
        }

        df_metrics = pd.DataFrame(metrics_dict)
        pd.options.display.float_format = '{:.6f}'.format
        print("\nüìä Stacking Classifier Model Performance Metrics")
        print(df_metrics.to_string(index=False))

        # Overfitting warning
        if train_test_gap > 0.10:
            print(f"\n‚ö†Ô∏è  WARNING: Overfitting detected! Train-Test gap: {train_test_gap:.4f}")
            print("   Consider: adjusting base estimators, meta-learner parameters, or CV folds")
        elif train_test_gap < 0.05:
            print(f"\n‚úì Excellent generalization. Train-Test gap: {train_test_gap:.4f}")
        else:
            print(f"\n‚Üí Acceptable gap: {train_test_gap:.4f}")

        # Store test metrics
        test_metrics = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
        try:
            store_results(
                'Stacking Classifier',
                name,
                float(test_metrics['Accuracy']),
                float(test_metrics['F1 Score']),
                float(test_metrics['Recall']),
                float(test_metrics['Precision']),
                float(test_metrics['AUC-ROC'])
            )
        except:
            ML_Model.append('Stacking Classifier')
            ML_Config.append(name)
            accuracy.append(round(float(test_metrics['Accuracy']), 6))
            f1.append(round(float(test_metrics['F1 Score']), 6))
            recall.append(round(float(test_metrics['Recall']), 6))
            precision.append(round(float(test_metrics['Precision']), 6))
            roc_auc.append(round(float(test_metrics['AUC-ROC']), 6))

        print("\nüéØ Best hyperparameters found:")
        best_params = grid_search.best_params_
        for param, value in best_params.items():
            print(f"  {param}: {value}")
        print(f"\nBest CV score: {grid_search.best_score_:.6f}")
        
        # Display stacking details
        print(f"\nüìö Stacking Ensemble Details:")
        print(f"  CV Folds: {best_model.cv}")
        print(f"  Passthrough: {best_model.passthrough}")
        print(f"  Number of Base Estimators: {len(best_model.estimators_)}")
        print(f"  Base Estimators: {[name for name, _ in best_model.estimators]}")
        print(f"  Meta-Learner: {type(best_model.final_estimator_).__name__}")
        
    except Exception as e:
        print(f"‚ùå Error in configuration '{name}': {str(e)}")
        import traceback
        traceback.print_exc()
        continue

print("\n" + "="*80)
print("‚úÖ Stacking Classifier evaluation complete for all configurations.")
print("="*80)

# Display final results
try:
    display_and_save_results('stacking_classifier_all_configs')
except:
    result = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Recall': recall,
        'Precision': precision,
        'ROC_AUC': roc_auc,
    })
    print("\nüìà Final Results:")
    print(result.to_string(index=False))
    
    # Sort by F1 Score
    sorted_result = result.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)
    print("\nüèÜ Sorted Results (by F1 Score):")
    print(sorted_result.to_string(index=False))

    # Group by sampling technique
    print("\nüìä Performance by Sampling Technique:")
    print("-" * 80)
    sampling_groups = {
        'No Sampling': ['Preprocessed Data', 'Normalized Data', 'SelectKBest', 'RFECV', 'PCA'],
        'Oversampling': ['SMOTE + StandardScaler', 'SMOTE + GridSearchCV'],
        'Undersampling': ['RandomUnderSampler', 'TomekLinks', 'NearMiss'],
        'Combined': ['SMOTE + Tomek']
    }

    for group_name, configs in sampling_groups.items():
        group_data = result[result['Configuration'].isin(configs)]
        if not group_data.empty:
            print(f"\n{group_name}:")
            print(f"  Avg Accuracy: {group_data['Accuracy'].mean():.6f}")
            print(f"  Avg F1 Score: {group_data['F1 Score'].mean():.6f}")
            print(f"  Avg ROC-AUC: {group_data['ROC_AUC'].mean():.6f}")
            print(f"  Best Config: {group_data.loc[group_data['F1 Score'].idxmax(), 'Configuration']}")
    
    # Compare with baseline
    if len(result) > 0:
        best_idx = result['F1 Score'].idxmax()
        print(f"\nüèÖ Best Overall Configuration:")
        print(f"  Configuration: {result.loc[best_idx, 'Configuration']}")
        print(f"  F1 Score: {result.loc[best_idx, 'F1 Score']:.6f}")
        print(f"  Accuracy: {result.loc[best_idx, 'Accuracy']:.6f}")
        print(f"  ROC-AUC: {result.loc[best_idx, 'ROC_AUC']:.6f}")


---

# Bagging

### Bagging classification with PCA 90

In [None]:
# =============================================================================
# Bagging Classifier with Configuration-Specific Hyperparameter Grids
# =============================================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE, RFECV
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import metrics
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42

# Define preprocessor for categorical and numeric features
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")

# Create preprocessing transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Apply preprocessing to get fully numeric data first
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

# Get feature names after preprocessing
try:
    feature_names = (numeric_features + 
                    list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))
except:
    feature_names = [f'feature_{i}' for i in range(X_train_preprocessed.shape[1])]

X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=feature_names, index=X_train.index)
X_val_preprocessed = pd.DataFrame(X_val_preprocessed, columns=feature_names, index=X_val.index)

print(f"\nPreprocessed data shape: {X_train_preprocessed.shape}")
print(f"All features are now numeric: {X_train_preprocessed.select_dtypes(include=np.number).shape[1] == X_train_preprocessed.shape[1]}")

# Clear previous results
try:
    clear_results()
except:
    ML_Model = []
    ML_Config = []
    accuracy = []
    f1 = []
    recall = []
    precision = []
    roc_auc = []
    print("Initialized result storage lists.")

print("\n=== START: Bagging Classifier Configuration Sweep with Custom Hyperparameters ===\n")

# =============================================================================
# Configuration-Specific Hyperparameter Grids for Bagging Classifier
# =============================================================================

# Grid 1: Preprocessed Data - Explore bootstrap and sampling
param_grid_1 = {
    'n_estimators': [50, 100, 200],
    'max_samples': [0.7, 0.8, 1.0],
    'max_features': [0.7, 0.8, 1.0],
    'bootstrap': [True, False],
    'bootstrap_features': [False],
    'estimator__max_depth': [5, 10, 15],
    'estimator__min_samples_split': [2, 5],
    'estimator



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 22

=== RFECV Feature Selection with Bagging ===
Optimal number of features selected by RFECV: 1

=== PCA Dimensionality Reduction ===
Number of components that explain 90.0% variance: 1

=== Bagging Model Performance with Hyperparameter Tuning ===

Running Bagging with Original Data configuration...
Fitting 10 folds for each of 81 candidates, totalling 810 fits

Bagging Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.981013  0.916868 0.857639   0.993197 0.999923
    Test  0.933649  0.599386 0.530303   0.977346 0.954901
Best hyperparameters found by GridSearchCV:
{'bootstrap': True, 'bootstrap_features': False, 'estimator': DecisionTreeClassifier(min_samples_split=10), 'max_features': 0.8, 'max_samples': 1.0, 'n_estimators': 100}

Running Bagging with Normalized Data configuration...
Fitting 10 folds for each of 81 candidates, totalling 810 fits

---

# Results

In [8]:
# =============================================================================
# COMPREHENSIVE RESULTS ANALYSIS - ALL MODELS
# =============================================================================
import os
import pandas as pd

# Check if results storage framework variables exist
try:
    # Try to access the variables
    test = ML_Model
    print("‚úì Results storage framework is active")
    print(f"‚úì Total entries in storage: {len(ML_Model)}")
except NameError:
    print("‚ö†Ô∏è  Results storage framework not initialized")
    print("Please run the model evaluation cells first, or load saved results from CSV")
    
    # Try to load from saved CSV files
    results_dir = 'Analysis/Main/results'
    all_csv_files = []
    
    if os.path.exists(results_dir):
        # Find all result CSV files
        for file in os.listdir(results_dir):
            if file.endswith('_results.csv') and not file.startswith('sorted'):
                all_csv_files.append(os.path.join(results_dir, file))
        
        if all_csv_files:
            print(f"\n‚úì Found {len(all_csv_files)} saved result files")
            # Load and combine all results
            dfs = []
            for csv_file in all_csv_files:
                df = pd.read_csv(csv_file)
                dfs.append(df)
                print(f"  - Loaded: {os.path.basename(csv_file)} ({len(df)} rows)")
            
            all_results = pd.concat(dfs, ignore_index=True)
            print(f"\n‚úì Combined results: {len(all_results)} total configurations")
        else:
            print("\n‚ùå No saved result files found in", results_dir)
            all_results = pd.DataFrame()
    else:
        print(f"\n‚ùå Results directory not found: {results_dir}")
        all_results = pd.DataFrame()

# If variables exist, compile from storage
if 'ML_Model' in globals() and len(ML_Model) > 0:
    all_results = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Recall': recall,
        'Precision': precision,
        'ROC_AUC': roc_auc,
    })
    
    # Remove duplicates
    all_results.drop_duplicates(subset=['ML Model', 'Configuration'], inplace=True)
    
    print("\n" + "="*120)
    print("üìà COMPREHENSIVE RESULTS - ALL MACHINE LEARNING MODELS")
    print("="*120)
    print(all_results.to_string(index=False))
    
    # Save comprehensive results
    results_dir = 'Analysis/Main/results'
    os.makedirs(results_dir, exist_ok=True)
    save_path = os.path.join(results_dir, 'all_models_comprehensive_results.csv')
    all_results.to_csv(save_path, index=False)
    print(f"\n‚úì Comprehensive results saved to {save_path}")
    
    # Sort by F1 Score and Accuracy
    sorted_results = all_results.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)
    
    print("\n" + "="*120)
    print("üèÜ SORTED COMPREHENSIVE RESULTS (by F1 Score & Accuracy)")
    print("="*120)
    print(sorted_results.to_string(index=False))
    
    # Save sorted results
    sorted_save_path = os.path.join(results_dir, 'all_models_sorted_results.csv')
    sorted_results.to_csv(sorted_save_path, index=False)
    print(f"\n‚úì Sorted results saved to {sorted_save_path}")

elif len(all_results) > 0:
    # Working with loaded CSV data
    sorted_results = all_results.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)
    
    print("\n" + "="*120)
    print("üìà LOADED RESULTS - ALL MACHINE LEARNING MODELS")
    print("="*120)
    print(all_results.to_string(index=False))
    
    print("\n" + "="*120)
    print("üèÜ SORTED RESULTS (by F1 Score & Accuracy)")
    print("="*120)
    print(sorted_results.to_string(index=False))
else:
    print("\n‚ùå No results available. Please run model evaluations first.")
    sorted_results = pd.DataFrame()


‚ö†Ô∏è  Results storage framework not initialized
Please run the model evaluation cells first, or load saved results from CSV

‚ùå Results directory not found: Analysis/Main/results

‚ùå No results available. Please run model evaluations first.


In [9]:
# =============================================================================
# ADVANCED ANALYSIS - ALL MODELS
# =============================================================================

if len(all_results) > 0:
    
    # =========================================================================
    # 1. GROUP BY SAMPLING TECHNIQUE
    # =========================================================================
    print("\n" + "="*120)
    print("üìä PERFORMANCE BY SAMPLING TECHNIQUE (ALL MODELS)")
    print("="*120)
    
    sampling_groups = {
        'No Sampling': ['Preprocessed Data', 'Normalized Data', 'SelectKBest', 'RFECV', 'PCA'],
        'Oversampling': ['SMOTE + StandardScaler', 'SMOTE + GridSearchCV'],
        'Undersampling': ['RandomUnderSampler', 'TomekLinks', 'NearMiss'],
        'Combined': ['SMOTE + Tomek']
    }
    
    for group_name, configs in sampling_groups.items():
        group_data = all_results[all_results['Configuration'].isin(configs)]
        if not group_data.empty:
            print(f"\n{'‚îÄ'*60}")
            print(f"üîπ {group_name}")
            print(f"{'‚îÄ'*60}")
            print(f"  Configurations: {len(group_data)}")
            print(f"  Avg Accuracy:    {group_data['Accuracy'].mean():.6f}")
            print(f"  Avg F1 Score:    {group_data['F1 Score'].mean():.6f}")
            print(f"  Avg Recall:      {group_data['Recall'].mean():.6f}")
            print(f"  Avg Precision:   {group_data['Precision'].mean():.6f}")
            print(f"  Avg ROC-AUC:     {group_data['ROC_AUC'].mean():.6f}")
            best_in_group = group_data.loc[group_data['F1 Score'].idxmax()]
            print(f"  Best Model:      {best_in_group['ML Model']}")
            print(f"  Best Config:     {best_in_group['Configuration']}")
            print(f"  Best F1 Score:   {best_in_group['F1 Score']:.6f}")
    
    # =========================================================================
    # 2. TOP CONFIGURATION PER MODEL
    # =========================================================================
    print("\n" + "="*120)
    print("ü•á TOP CONFIGURATION PER MODEL")
    print("="*120)
    
    top_per_model = sorted_results.groupby('ML Model', as_index=False).first()
    print(top_per_model.to_string(index=False))
    
    # Save top configurations
    results_dir = 'Analysis/Main/results'
    os.makedirs(results_dir, exist_ok=True)
    top_save_path = os.path.join(results_dir, 'top_configuration_per_model.csv')
    top_per_model.to_csv(top_save_path, index=False)
    print(f"\n‚úì Top configurations saved to {top_save_path}")
    
    # =========================================================================
    # 3. BEST OVERALL CONFIGURATION
    # =========================================================================
    best_idx = all_results['F1 Score'].idxmax()
    
    print("\n" + "="*120)
    print("üèÖ BEST OVERALL CONFIGURATION ACROSS ALL MODELS")
    print("="*120)
    print(f"  Model:           {all_results.loc[best_idx, 'ML Model']}")
    print(f"  Configuration:   {all_results.loc[best_idx, 'Configuration']}")
    print(f"  {'‚îÄ'*70}")
    print(f"  ‚úì Accuracy:      {all_results.loc[best_idx, 'Accuracy']:.6f}")
    print(f"  ‚úì F1 Score:      {all_results.loc[best_idx, 'F1 Score']:.6f}")
    print(f"  ‚úì Recall:        {all_results.loc[best_idx, 'Recall']:.6f}")
    print(f"  ‚úì Precision:     {all_results.loc[best_idx, 'Precision']:.6f}")
    print(f"  ‚úì ROC-AUC:       {all_results.loc[best_idx, 'ROC_AUC']:.6f}")
    print("="*120)
    
    # =========================================================================
    # 4. MODEL COMPARISON
    # =========================================================================
    print("\n" + "="*120)
    print("üîç MODEL-BY-MODEL COMPARISON (AVERAGE PERFORMANCE)")
    print("="*120)
    
    model_comparison = all_results.groupby('ML Model').agg({
        'Accuracy': 'mean',
        'F1 Score': 'mean',
        'Recall': 'mean',
        'Precision': 'mean',
        'ROC_AUC': 'mean'
    }).round(6).sort_values('F1 Score', ascending=False)
    
    print(model_comparison.to_string())
    
    # Save model comparison
    comparison_save_path = os.path.join(results_dir, 'model_comparison_averages.csv')
    model_comparison.to_csv(comparison_save_path)
    print(f"\n‚úì Model comparison saved to {comparison_save_path}")
    
    # =========================================================================
    # 5. KEY INSIGHTS & STATISTICS
    # =========================================================================
    print("\n" + "="*120)
    print("üìå KEY INSIGHTS & STATISTICS")
    print("="*120)
    
    print(f"\n  üìä Overall Statistics:")
    print(f"     ‚Ä¢ Total models evaluated: {all_results['ML Model'].nunique()}")
    print(f"     ‚Ä¢ Total configurations tested: {len(all_results)}")
    print(f"     ‚Ä¢ Avg tests per model: {len(all_results) / all_results['ML Model'].nunique():.1f}")
    
    print(f"\n  üéØ Performance Thresholds:")
    print(f"     ‚Ä¢ Configurations with F1 > 0.95: {len(all_results[all_results['F1 Score'] > 0.95])}")
    print(f"     ‚Ä¢ Configurations with F1 > 0.97: {len(all_results[all_results['F1 Score'] > 0.97])}")
    print(f"     ‚Ä¢ Configurations with Accuracy > 0.98: {len(all_results[all_results['Accuracy'] > 0.98])}")
    print(f"     ‚Ä¢ Configurations with ROC-AUC > 0.99: {len(all_results[all_results['ROC_AUC'] > 0.99])}")
    
    print(f"\n  üìà Performance Ranges:")
    print(f"     ‚Ä¢ F1 Score:   [{all_results['F1 Score'].min():.6f} - {all_results['F1 Score'].max():.6f}] (Œî {all_results['F1 Score'].max() - all_results['F1 Score'].min():.6f})")
    print(f"     ‚Ä¢ Accuracy:   [{all_results['Accuracy'].min():.6f} - {all_results['Accuracy'].max():.6f}] (Œî {all_results['Accuracy'].max() - all_results['Accuracy'].min():.6f})")
    print(f"     ‚Ä¢ ROC-AUC:    [{all_results['ROC_AUC'].min():.6f} - {all_results['ROC_AUC'].max():.6f}] (Œî {all_results['ROC_AUC'].max() - all_results['ROC_AUC'].min():.6f})")
    
    print(f"\n  üìâ Variability (Standard Deviation):")
    print(f"     ‚Ä¢ F1 Score:   {all_results['F1 Score'].std():.6f}")
    print(f"     ‚Ä¢ Accuracy:   {all_results['Accuracy'].std():.6f}")
    print(f"     ‚Ä¢ ROC-AUC:    {all_results['ROC_AUC'].std():.6f}")
    
    print(f"\n  üèÜ Best Performing Models:")
    top_3 = sorted_results.head(3)
    for i, (_, row) in enumerate(top_3.iterrows(), 1):
        print(f"     {i}. {row['ML Model']} ({row['Configuration']}): F1={row['F1 Score']:.6f}")
    
    print(f"\n  üí° Best Sampling Technique:")
    best_sampling = None
    best_sampling_f1 = 0
    for group_name, configs in sampling_groups.items():
        group_data = all_results[all_results['Configuration'].isin(configs)]
        if not group_data.empty:
            avg_f1 = group_data['F1 Score'].mean()
            if avg_f1 > best_sampling_f1:
                best_sampling_f1 = avg_f1
                best_sampling = group_name
    if best_sampling:
        print(f"     ‚Ä¢ {best_sampling} (Avg F1: {best_sampling_f1:.6f})")
    
    print("\n" + "="*120)
    print("‚úÖ COMPREHENSIVE MODEL EVALUATION COMPLETE!")
    print("="*120)

else:
    print("\n‚ö†Ô∏è  No results to display. Please run model evaluations first or check if CSV files exist.")



‚ö†Ô∏è  No results to display. Please run model evaluations first or check if CSV files exist.


---

# END