# 2. Model Exploration and Hyperparameter Tuning

## 📝 Overview
This notebook is the second step in the dementia prediction pipeline. Its purpose is to:
1.  **Load** the pre-cleaned and split data from `1_dataset_analysis.ipynb`.
2.  Define a **preprocessing pipeline** to handle scaling and encoding.
3.  Use **SMOTE** to address class imbalance in the training data.
4.  Train a variety of machine learning models using **GridSearchCV** to find the best hyperparameters for each.
5.  **Save** the trained models and their performance results for the final implementation phase.

## Libraries

In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFECV, RFE
from sklearn.decomposition import PCA
from sklearn.ensemble import  RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import label_binarize
from sklearn import metrics
import xgboost as xgb
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

## ML Model Results Storage Framework

In [7]:
 
# Module-level accumulators shared by store_results / clear_results /
# display_and_save_results. Each list holds one entry per evaluated
# (model, configuration) pair, so all seven must always have equal length.
# All of them are created here so the helpers below work on a fresh kernel
# and so re-running this cell resets every list together.
ML_Model = []
ML_Config = []
accuracy = []
f1 = []
recall = []
precision = []
roc_auc = []

# Function to call for storing the results
def store_results(model, config, acc, f1_score, rec, prec, roc):
    """
    Append one row of model performance results to the module-level lists.

    Parameters
    ----------
    model : label appended to ``ML_Model``.
    config : label appended to ``ML_Config``.
    acc, f1_score, rec, prec, roc : numeric scores; each is rounded to six
        decimals and appended to ``accuracy``, ``f1``, ``recall``,
        ``precision`` and ``roc_auc`` respectively.

    Raises
    ------
    TypeError
        If a score cannot be rounded. Nothing is appended in that case.
    NameError
        If one of the storage lists does not exist. Nothing is appended.
    """
    # Round everything first: a bad metric must fail before any list grows,
    # otherwise the lists end up with different lengths.
    rounded = [round(value, 6) for value in (acc, f1_score, rec, prec, roc)]

    # Resolve every target list before appending, for the same reason.
    targets = (ML_Model, ML_Config, accuracy, f1, recall, precision, roc_auc)
    row = (model, config, *rounded)

    for target, value in zip(targets, row):
        target.append(value)

# Function to display and save results
def display_and_save_results(filename_prefix='model_exploration', output_dir=None):
    """
    Build a results table from the stored metrics, print it, and save it as CSV.

    Two files are written: ``<filename_prefix>_results.csv`` (insertion order)
    and ``sorted_<filename_prefix>_results.csv`` (ranked by F1 Score, then
    Accuracy, both descending).

    Parameters
    ----------
    filename_prefix : str
        Prefix used for both CSV file names.
    output_dir : str or None
        Directory to write into. When None, the module-level ``results_dir``
        is used, which the notebook must define before calling this function.
        The directory is created if it does not exist.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The de-duplicated table and the sorted table.
    """
    # Local import keeps the function usable even if the cell that imports
    # `os` at notebook level has not been executed yet.
    import os

    # Creating the dataframe from the module-level result lists
    result = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Recall': recall,
        'Precision': precision,
        'ROC_AUC': roc_auc,
    })

    # Keep the first row for each (model, configuration) pair
    result = result.drop_duplicates(subset=["ML Model", "Configuration"])

    print("\n" + "="*100)
    print("MODEL PERFORMANCE RESULTS")
    print("="*100)
    print(result.to_string(index=False))

    # Resolve the target directory and make sure it exists before writing
    target_dir = results_dir if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    # Saving the result to a CSV file
    save_path = os.path.join(target_dir, f'{filename_prefix}_results.csv')
    result.to_csv(save_path, index=False)
    print(f"\nResults saved to {save_path}")

    # Sorting the dataframe on F1 Score and Accuracy
    sorted_result = result.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)

    print("\n" + "="*100)
    print("SORTED MODEL PERFORMANCE RESULTS (by F1 Score and Accuracy)")
    print("="*100)
    print(sorted_result.to_string(index=False))

    # Saving the sorted result to a CSV file
    sorted_save_path = os.path.join(target_dir, f'sorted_{filename_prefix}_results.csv')
    sorted_result.to_csv(sorted_save_path, index=False)
    print(f"\nSorted results saved to {sorted_save_path}")

    return result, sorted_result

# Function to clear results
def clear_results():
    """
    Empty every module-level result list, creating any that do not exist yet.

    Existing lists are cleared in place, so other references to the same list
    objects also see the reset. A missing name is bound to a new empty list
    instead of raising NameError.
    """
    namespace = globals()
    for list_name in ('ML_Model', 'ML_Config', 'accuracy', 'f1',
                      'recall', 'precision', 'roc_auc'):
        if list_name in namespace:
            namespace[list_name].clear()
        else:
            namespace[list_name] = []
    print("Results cleared!")

# Function to plot model comparison
def plot_model_comparison(result_df, plot_filename="model_performance_comparison.png"):
    """
    Draw one bar chart per metric comparing the models' mean scores, then save the figure.

    Scores are multiplied by 100 for display and averaged per 'ML Model'.
    The figure is written to ``plot_filename`` inside the module-level
    ``plots_dir`` and then shown.
    """
    metrics_cols = ['Accuracy', 'F1 Score', 'Recall', 'Precision', 'ROC_AUC']

    # Work on a copy so the caller's frame keeps its original 0-1 scores
    plot_df = result_df.copy()
    plot_df[metrics_cols] = plot_df[metrics_cols] * 100

    # 2 x 3 grid flattened so panels can be paired with metrics directly
    fig, grid = plt.subplots(2, 3, figsize=(20, 12))
    axes = grid.ravel()

    for ax, metric in zip(axes, metrics_cols):
        # Mean score per model across its configurations, best first
        ranked = plot_df.groupby('ML Model')[metric].mean().sort_values(ascending=False)

        sns.barplot(x=ranked.index, y=ranked.values, ax=ax, palette='Blues_r')

        ax.set_title(f'Average {metric}', fontweight='bold')
        ax.set_ylabel(f'{metric} (%)')
        ax.set_xlabel('')
        ax.tick_params(axis='x', rotation=45)
        ax.grid(axis='y', alpha=0.5)

        # Annotate each bar with its value, centred just above the top edge
        for patch in ax.patches:
            top = patch.get_height()
            centre = patch.get_x() + patch.get_width()/2.
            ax.text(centre, top, f'{top:.2f}', ha='center', va='bottom', fontsize=9)

    # Five metrics on a six-panel grid: blank out the unused last panel
    if len(metrics_cols) < 6:
        axes[5].set_visible(False)

    plt.suptitle('Model Performance Comparison', fontsize=20, fontweight='bold')
    plt.tight_layout(rect=[0, 0, 1, 0.96])

    # Write the figure to disk before showing it
    save_path = os.path.join(plots_dir, plot_filename)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()
    print(f"Comparison plot saved to: {save_path}")

# Confirmation banner printed once the helper functions above are defined
print(
    "Model results storage framework loaded successfully!",
    "This framework will save results, plots, and models to the 'ModelExploration' directory.",
    sep="\n",
)

Model results storage framework loaded successfully!
This framework will save results, plots, and models to the 'ModelExploration' directory.


# Define Preprocessing Pipeline

Before training the models, we need to create a preprocessing pipeline. This pipeline will handle:
1.  **Scaling**: Applying `StandardScaler` to all numerical features to standardize their range.
2.  **Encoding**: Applying `OneHotEncoder` to the categorical feature (`M/F`) to convert it into a numerical format.

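We use a `ColumnTransformer` to apply these different transformations to the correct columns. This ensures that the same steps are consistently applied during both training and validation.

The next cell first loads the training and validation splits saved by the previous notebook; the `ColumnTransformer` itself is built at the start of the SVM section.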

In [8]:
# Import necessary libraries
import joblib
import os

# Define the directory where the processed data was saved from the previous notebook
# Directory holding the splits written by the previous notebook
processed_data_dir = 'Analysis/processed_data'

# Load the four artefacts in a fixed order: features first, then targets.
# NOTE(review): joblib.load unpickles the files, so only load artefacts you produced yourself.
X_train, X_val, y_train, y_val = (
    joblib.load(os.path.join(processed_data_dir, f'{split_name}.joblib'))
    for split_name in ('X_train', 'X_val', 'y_train', 'y_val')
)

# Class proportions of each target, computed once for the report below
train_distribution = y_train.value_counts(normalize=True)
val_distribution = y_val.value_counts(normalize=True)

print("Data loaded successfully from 'processed_data' directory!")
print("-" * 50)
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"\nTraining target distribution:\n{train_distribution}")
print(f"\nValidation target distribution:\n{val_distribution}")

# Show the first rows of the training features as the cell's rich output
print("\nFirst 5 rows of X_train:")
X_train.head(5)

Data loaded successfully from 'processed_data' directory!
--------------------------------------------------
X_train shape: (647, 12)
X_val shape: (162, 12)

Training target distribution:
Group
Nondemented    0.774343
Demented       0.225657
Name: proportion, dtype: float64

Validation target distribution:
Group
Nondemented    0.771605
Demented       0.228395
Name: proportion, dtype: float64

First 5 rows of X_train:


Unnamed: 0,M/F,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay,Visit,MR Delay
764,M,84,14.0,2.0,22.0,0.5,1550,0.665,1.132,2.0,2.0,621.0
213,M,75,5.0,2.0,29.0,0.0,1534,0.771,1.144,2.0,1.0,0.0
382,F,69,4.0,3.0,29.0,0.0,1380,0.809,1.272,2.0,1.0,0.0
456,F,80,16.0,2.0,29.0,0.0,1323,0.738,1.326,2.0,2.0,730.0
393,F,50,12.0,2.0,30.0,0.0,1385,0.819,1.267,2.0,1.0,0.0


---

# SVM

### SVM with PCA 90

In [None]:


# =============================================================================
# SVM with Configuration-Specific Hyperparameter Grids
# =============================================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE, RFECV
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import metrics
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42

# Split the columns by dtype: numeric columns are scaled, object/category columns are one-hot encoded
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")

# Create preprocessing transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Fit on the training split only, then apply the fitted transformer to the validation split.
# NOTE(review): the transformer is fitted on the whole training set before the
# cross-validated searches further down, so each CV fold is scaled with
# statistics that include its own held-out part.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

# Get feature names after preprocessing (numeric names first, then encoder output names)
try:
    feature_names = (numeric_features +
                    list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))
except Exception:
    # Fall back to positional names if the encoder cannot report its output
    # names. `Exception` rather than a bare except, so Ctrl-C still interrupts.
    feature_names = [f'feature_{i}' for i in range(X_train_preprocessed.shape[1])]

# Wrap the arrays back into DataFrames that keep the original row index
X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=feature_names, index=X_train.index)
X_val_preprocessed = pd.DataFrame(X_val_preprocessed, columns=feature_names, index=X_val.index)

print(f"\nPreprocessed data shape: {X_train_preprocessed.shape}")
print(f"All features are now numeric: {X_train_preprocessed.select_dtypes(include=np.number).shape[1] == X_train_preprocessed.shape[1]}")

# Clear previous results. The fallback covers the case where the storage
# helper or its lists have not been defined in this kernel; only NameError is
# caught so that any other failure is not silently hidden.
try:
    clear_results()
except NameError:
    ML_Model = []
    ML_Config = []
    accuracy = []
    f1 = []
    recall = []
    precision = []
    roc_auc = []
    print("Initialized result storage lists.")

print("\n=== START: SVM Configuration Sweep with Custom Hyperparameters ===\n")

# =============================================================================
# Configuration-specific hyperparameter grids
# =============================================================================
# Every grid lists the same five SVC parameters. Where a grid pins 'degree' or
# 'coef0' to a single value, that keeps the grid size down for kernels that
# ignore the parameter.

# Grid 1 (Preprocessed Data): RBF and linear kernels
param_grid_1 = dict(
    C=[0.01, 0.1, 1, 10],
    gamma=['scale', 'auto', 0.001, 0.01],
    kernel=['rbf', 'linear'],
    degree=[2],  # only the 'poly' kernel reads degree; one value avoids extra fits
    coef0=[0.0],
)

# Grid 2 (Normalized Data): polynomial kernels alongside RBF
param_grid_2 = dict(
    C=[0.1, 1, 10, 100],
    gamma=['scale', 0.01, 0.1],
    kernel=['poly', 'rbf'],
    degree=[2, 3, 4],
    coef0=[0.0, 0.5, 1.0],
)

# Grid 3 (SelectKBest): smaller search over three kernels
param_grid_3 = dict(
    C=[0.1, 1, 10],
    gamma=['scale', 'auto', 0.01],
    kernel=['rbf', 'poly', 'linear'],
    degree=[2, 3],
    coef0=[0.0, 0.1],
)

# Grid 4 (RFECV): RBF and sigmoid with a different C range
param_grid_4 = dict(
    C=[0.5, 1, 5, 10],
    gamma=['scale', 0.001, 0.01],
    kernel=['rbf', 'sigmoid'],
    degree=[2, 3],
    coef0=[0.0, 0.5],
)

# Grid 5 (PCA): linear, sigmoid and RBF
param_grid_5 = dict(
    C=[0.1, 1, 10, 50],
    gamma=['scale', 'auto', 0.01],
    kernel=['linear', 'sigmoid', 'rbf'],
    degree=[2],
    coef0=[0.0, 0.5, 1.0],
)

# Grids 6 & 7 (SMOTE pipelines): keys carry the 'model__' prefix of the pipeline step
param_grid_smote = dict(
    model__C=[0.1, 1, 10, 100],
    model__gamma=['scale', 0.001, 0.01, 0.1],
    model__kernel=['rbf', 'poly', 'linear'],
    model__degree=[2, 3],
    model__coef0=[0.0, 0.5],
)

# Look-up table from configuration name to its grid
hyperparameter_grids = {
    'Preprocessed Data': param_grid_1,
    'Normalized Data': param_grid_2,
    'SelectKBest': param_grid_3,
    'RFECV': param_grid_4,
    'PCA': param_grid_5,
}
# Both SMOTE configurations share the same grid object
hyperparameter_grids.update(
    dict.fromkeys(('SMOTE + StandardScaler', 'SMOTE + GridSearchCV'), param_grid_smote)
)

# Each entry is (name, kind, training payload, validation payload).
# Configuration 1 uses the standard-scaled / one-hot encoded frames as they are.
configurations = [
    ('Preprocessed Data', 'array', X_train_preprocessed, X_val_preprocessed),
]
print("‚úì Configuration 1: Preprocessed Data")

# --- Configuration 2: Normalized Data (MinMax on preprocessed) ---
# The scaler learns its ranges from the training frame only and is then applied to both splits.
scaler_minmax = MinMaxScaler().fit(X_train_preprocessed)
X_train_normalized = pd.DataFrame(
    data=scaler_minmax.transform(X_train_preprocessed),
    index=X_train_preprocessed.index,
    columns=X_train_preprocessed.columns,
)
X_val_normalized = pd.DataFrame(
    data=scaler_minmax.transform(X_val_preprocessed),
    index=X_val_preprocessed.index,
    columns=X_val_preprocessed.columns,
)
configurations.append(('Normalized Data', 'array', X_train_normalized, X_val_normalized))
print("‚úì Configuration 2: Normalized Data (MinMax)")

# --- Configuration 3: SelectKBest ---
print("\n=== SelectKBest Feature Selection ===")

# Try every k from 1 up to the feature count (capped at 20) and score a
# linear SVM on the selected columns with 5-fold cross-validated accuracy.
max_features = min(20, X_train_normalized.shape[1])
scores = []
for k in range(1, max_features + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_tr_k = kbest.fit_transform(X_train_normalized, y_train)
    fold_scores = cross_val_score(
        SVC(kernel='linear', random_state=RANDOM_STATE),
        X_tr_k,
        y_train,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    score = fold_scores.mean()
    scores.append(score)

# argmax is zero-based while k starts at 1; ties resolve to the smallest k
optimal_k = int(np.argmax(scores)) + 1
print(f"Optimal number of features: {optimal_k}")

# Refit the selector with the chosen k and keep the surviving column names
kbest = SelectKBest(score_func=f_classif, k=optimal_k).fit(X_train_normalized, y_train)
kbest_columns = X_train_normalized.columns[kbest.get_support()]
X_train_kbest = pd.DataFrame(kbest.transform(X_train_normalized), columns=kbest_columns)
X_val_kbest = pd.DataFrame(kbest.transform(X_val_normalized), columns=X_train_kbest.columns)

configurations.append(('SelectKBest', 'array', X_train_kbest, X_val_kbest))
print("‚úì Configuration 3: SelectKBest")

# --- Configuration 4: RFECV ---
print("\n=== RFECV Feature Selection ===")

# A linear SVM ranks the SelectKBest columns; RFECV decides how many to keep
svm_estimator = SVC(kernel='linear', random_state=RANDOM_STATE)
rfecv = RFECV(
    estimator=svm_estimator,
    step=1,
    cv=StratifiedKFold(5),
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train_kbest, y_train)
print(f"Optimal number of features by RFECV: {rfecv.n_features_}")

# Plain RFE with that feature count produces the reduced train/validation frames
rfe = RFE(estimator=svm_estimator, n_features_to_select=rfecv.n_features_).fit(X_train_kbest, y_train)
rfe_columns = X_train_kbest.columns[rfe.get_support()]
X_train_rfe = pd.DataFrame(rfe.transform(X_train_kbest), columns=rfe_columns)
X_val_rfe = pd.DataFrame(rfe.transform(X_val_kbest), columns=X_train_rfe.columns)

configurations.append(('RFECV', 'array', X_train_rfe, X_val_rfe))
print("‚úì Configuration 4: RFECV")

# --- Configuration 5: PCA ---
print("\n=== PCA Dimensionality Reduction ===")

# Fit an unrestricted PCA first to read off the cumulative explained variance
pca_full = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
desired_variance = 0.90

# First component count whose cumulative variance reaches the target (argmax is zero-based)
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)

# Keep at least two components, but never more than the number of input
# features: RFECV may keep a single column, and PCA raises when asked for
# more components than there are features.
n_components = min(max(2, n_components), X_train_rfe.shape[1])
print(f'Number of components for {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_rfe), index=X_train_rfe.index)
X_val_pca = pd.DataFrame(pca.transform(X_val_rfe), index=X_val_rfe.index)
configurations.append(('PCA', 'array', X_train_pca, X_val_pca))
print("‚úì Configuration 5: PCA")

def _build_smote_svm_pipeline():
    """Return a fresh preprocessing -> SMOTE -> SVC pipeline that takes the raw feature frame."""
    column_steps = [
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ]
    return ImbPipeline(steps=[
        ('preprocessor', ColumnTransformer(transformers=column_steps)),
        ('smote', SMOTE(random_state=RANDOM_STATE)),
        ('model', SVC(probability=True, random_state=RANDOM_STATE)),
    ])


# --- Configuration 6: SMOTE + StandardScaler (Pipeline) ---
# Pipeline entries carry the estimator in the third slot and no validation payload.
pipeline_smote_scaler = _build_smote_svm_pipeline()
configurations.append(('SMOTE + StandardScaler', 'pipeline', pipeline_smote_scaler, None))
print("‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)")

# --- Configuration 7: SMOTE + GridSearchCV (Pipeline) ---
# Built from the same steps as configuration 6, as a separate pipeline object.
pipeline_smote_grid = _build_smote_svm_pipeline()
configurations.append(('SMOTE + GridSearchCV', 'pipeline', pipeline_smote_grid, None))
print("‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)")

# ROC AUC helper that never raises
def safe_roc_auc(y_true, y_proba):
    """
    Compute ROC AUC from class probabilities, returning NaN on any failure.

    A NumPy array with exactly two columns is scored using its second column.
    Any other input is scored one-vs-rest with macro averaging against the
    one-hot encoding of ``y_true``.
    """
    try:
        two_column_array = isinstance(y_proba, np.ndarray) and y_proba.shape[1] == 2
        if two_column_array:
            return metrics.roc_auc_score(y_true, y_proba[:, 1])

        one_hot_truth = pd.get_dummies(y_true)
        return metrics.roc_auc_score(
            one_hot_truth,
            y_proba,
            multi_class='ovr',
            average='macro',
        )
    except Exception:
        # Any scoring problem is reported as "no score" rather than an error
        return np.nan

# =============================================================================
# Run SVM with Configuration-Specific GridSearchCV
# =============================================================================
import traceback  # prints the full stack when a configuration fails

print("\n" + "="*80)
print("RUNNING SVM WITH CONFIGURATION-SPECIFIC HYPERPARAMETER TUNING")
print("="*80)

# Loop-invariant display setting, applied once (note: this changes the global pandas option)
pd.options.display.float_format = '{:.6f}'.format

for name, kind, X_tr_cfg, X_val_cfg in configurations:
    print(f"\n{'='*80}")
    print(f"Configuration: {name}")
    print(f"{'='*80}")

    # Get the specific parameter grid for this configuration
    param_grid = hyperparameter_grids[name]

    print(f"Hyperparameter grid for '{name}':")
    for key, values in param_grid.items():
        print(f"  {key}: {values}")
    print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}")

    try:
        if kind == 'pipeline':
            # Pipeline entries hold the estimator itself and consume the raw frames
            estimator, X_fit, X_eval = X_tr_cfg, X_train, X_val
        else:
            # Array entries hold already-transformed train/validation data
            estimator = SVC(probability=True, random_state=RANDOM_STATE)
            X_fit, X_eval = X_tr_cfg, X_val_cfg

        grid_search = GridSearchCV(
            estimator, param_grid,
            cv=5, n_jobs=-1, verbose=1, scoring='f1_macro'
        )
        grid_search.fit(X_fit, y_train)
        best_model = grid_search.best_estimator_
        y_train_pred = best_model.predict(X_fit)
        y_val_pred = best_model.predict(X_eval)
        y_train_proba = best_model.predict_proba(X_fit)
        y_val_proba = best_model.predict_proba(X_eval)

        # Gap between training and validation accuracy, used as an overfitting signal.
        # The validation split is labelled "Test" in the printed table.
        train_acc = metrics.accuracy_score(y_train, y_train_pred)
        test_acc = metrics.accuracy_score(y_val, y_val_pred)
        train_test_gap = train_acc - test_acc

        # Build metrics dictionary (row 0 = training split, row 1 = validation split)
        metrics_dict = {
            "Dataset": ["Training", "Test"],
            "Accuracy": [train_acc, test_acc],
            "F1 Score": [
                metrics.f1_score(y_train, y_train_pred, average='macro'),
                metrics.f1_score(y_val, y_val_pred, average='macro'),
            ],
            "Recall": [
                metrics.recall_score(y_train, y_train_pred, average='macro'),
                metrics.recall_score(y_val, y_val_pred, average='macro'),
            ],
            "Precision": [
                metrics.precision_score(y_train, y_train_pred, average='macro', zero_division=0),
                metrics.precision_score(y_val, y_val_pred, average='macro', zero_division=0),
            ],
            "AUC-ROC": [
                safe_roc_auc(y_train, y_train_proba),
                safe_roc_auc(y_val, y_val_proba),
            ]
        }

        df_metrics = pd.DataFrame(metrics_dict)
        print("\nüìä Support Vector Machine Model Performance Metrics")
        print(df_metrics.to_string(index=False))

        # Overfitting warning: > 0.10 flags overfitting, < 0.05 counts as good
        if train_test_gap > 0.10:
            print(f"\n‚ö†Ô∏è  WARNING: Overfitting detected! Train-Test gap: {train_test_gap:.4f}")
        elif train_test_gap < 0.05:
            print(f"\n‚úì Good generalization. Train-Test gap: {train_test_gap:.4f}")
        else:
            print(f"\n‚Üí Acceptable gap: {train_test_gap:.4f}")

        # Store the validation ("Test") row of the metrics table
        test_metrics = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
        try:
            store_results(
                'SVM',
                name,
                float(test_metrics['Accuracy']),
                float(test_metrics['F1 Score']),
                float(test_metrics['Recall']),
                float(test_metrics['Precision']),
                float(test_metrics['AUC-ROC'])
            )
        except NameError:
            # Fallback for a kernel where the storage helper is not defined.
            # Only NameError is caught: retrying after any other failure could
            # append the same row a second time.
            ML_Model.append('SVM')
            ML_Config.append(name)
            accuracy.append(round(float(test_metrics['Accuracy']), 6))
            f1.append(round(float(test_metrics['F1 Score']), 6))
            recall.append(round(float(test_metrics['Recall']), 6))
            precision.append(round(float(test_metrics['Precision']), 6))
            roc_auc.append(round(float(test_metrics['AUC-ROC']), 6))

        print("\nüéØ Best hyperparameters found:")
        best_params = grid_search.best_params_
        for param, value in best_params.items():
            print(f"  {param}: {value}")
        print(f"\nBest CV score: {grid_search.best_score_:.6f}")

    except Exception as e:
        # Report the failure and carry on with the remaining configurations
        print(f"‚ùå Error in configuration '{name}': {str(e)}")
        traceback.print_exc()

print("\n" + "="*80)
print("‚úÖ SVM evaluation complete for all configurations.")
print("="*80)

# Display final results. If the save helper fails (for example because its
# output directory is not configured), say why and fall back to printing the
# in-memory results instead of failing silently.
try:
    display_and_save_results('svm_all_configs')
except Exception as exc:
    print(f"\nCould not display/save results via display_and_save_results ({exc!r}); showing in-memory results instead.")
    result = pd.DataFrame({
        'ML Model': ML_Model,
        'Configuration': ML_Config,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Recall': recall,
        'Precision': precision,
        'ROC_AUC': roc_auc,
    })
    print("\nüìà Final Results:")
    print(result.to_string(index=False))

    # Sort by F1 Score, then Accuracy, best first
    sorted_result = result.sort_values(by=['F1 Score', 'Accuracy'], ascending=False).reset_index(drop=True)
    print("\nüèÜ Sorted Results (by F1 Score):")
    print(sorted_result.to_string(index=False))


Numeric features: ['Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF', 'Delay', 'Visit', 'MR Delay']
Categorical features: ['M/F']

Preprocessed data shape: (647, 13)
All features are now numeric: True
Results cleared!

=== START: SVM Configuration Sweep with Custom Hyperparameters ===

‚úì Configuration 1: Preprocessed Data
‚úì Configuration 2: Normalized Data (MinMax)

=== SelectKBest Feature Selection ===
Optimal number of features: 10
‚úì Configuration 3: SelectKBest

=== RFECV Feature Selection ===
Optimal number of features by RFECV: 3
‚úì Configuration 4: RFECV

=== PCA Dimensionality Reduction ===
Number of components for 90.0% variance: 3
‚úì Configuration 5: PCA
‚úì Configuration 6: SMOTE + StandardScaler (Pipeline)
‚úì Configuration 7: SMOTE + GridSearchCV (Pipeline)

RUNNING SVM WITH CONFIGURATION-SPECIFIC HYPERPARAMETER TUNING

Configuration: Preprocessed Data
Hyperparameter grid for 'Preprocessed Data':
  C: [0.01, 0.1, 1, 10]
  gamma: ['scale', 'auto', 0.001, 0.0

### SVM with PCA 95

In [None]:



from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif, RFECV
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import metrics
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
# Each entry is (name, training data, validation data); the pipeline entry carries None for both
configurations = []

# Define numeric and categorical features from our loaded X_train
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Create a standard preprocessor for scaling and encoding.
# sparse_output=False matches the encoder used in the PCA-90 cell and
# guarantees that PCA below always receives a dense array.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# --- Configuration 1: The approach from main.ipynb (SMOTE + StandardScaler) ---
# This is a pipeline, not a pre-transformed dataset. We will handle it specially in the loop.
configurations.append(('SMOTE + StandardScaler', None, None))
print("Configuration 1: 'SMOTE + StandardScaler' pipeline defined.")

# --- Configuration 2: Scaled Data (No SMOTE) ---
# Fitted on the training split only; the validation split reuses the fitted transformer
X_train_scaled = preprocessor.fit_transform(X_train)
X_val_scaled = preprocessor.transform(X_val)
configurations.append(('StandardScaler Only', X_train_scaled, X_val_scaled))
print("Configuration 2: 'StandardScaler Only' data prepared.")

# --- Configuration 3: PCA with 95% Variance (on Scaled Data) ---
print("\nFinding optimal components for PCA (95% variance)...")
pca_explorer = PCA().fit(X_train_scaled)
cumulative_variance = np.cumsum(pca_explorer.explained_variance_ratio_)
# argmax is zero-based, hence the +1; cast to a plain int before handing it to PCA
n_components_95 = int(np.argmax(cumulative_variance >= 0.95)) + 1
print(f"Number of components for 95% variance: {n_components_95}")

pca = PCA(n_components=n_components_95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)
configurations.append(('PCA (95% Var)', X_train_pca, X_val_pca))
print("Configuration 3: 'PCA (95% Var)' data prepared.")


# =============================================================================
# 3. RUN SVM WITH GRIDSEARCHCV ON ALL CONFIGURATIONS
# =============================================================================
banner = "="*80
print("\n" + banner)
print("RUNNING SVM WITH HYPERPARAMETER TUNING ON ALL CONFIGURATIONS")
print(banner)

# Search space for a bare SVC
param_grid_svm = dict(
    C=[0.1, 1, 10],
    gamma=['scale', 'auto'],
    kernel=['rbf', 'poly'],
)

# Same search space for the SMOTE pipeline, whose SVC step is named 'model':
# every key gets the 'model__' prefix and its own copy of the value list.
param_grid_svm_pipeline = {
    f'model__{key}': list(values) for key, values in param_grid_svm.items()
}

# Start this section's sweep from empty result lists
clear_results()

# Encode the labels once, outside the loop. The encoder is fitted on the
# training labels, i.e. the labels the classifiers learn from, so the integer
# codes follow the same sorted order as the classifier's `classes_` and code 1
# lines up with column 1 of predict_proba.
le = LabelEncoder().fit(y_train)
y_val_encoded = le.transform(y_val)

# Loop through each data configuration
for name, X_train_cfg, X_val_cfg in configurations:
    print(f"\n--- Running SVM with '{name}' configuration ---")

    if name == 'SMOTE + StandardScaler':
        # Special handling for the SMOTE pipeline: it consumes the original frames
        pipeline = ImbPipeline(steps=[
            ('preprocessor', preprocessor),
            ('smote', SMOTE(random_state=42)),
            ('model', SVC(probability=True, random_state=42))
        ])
        estimator, search_grid = pipeline, param_grid_svm_pipeline
        X_fit, X_eval = X_train, X_val
    else:
        # All other configurations carry already-transformed data
        estimator, search_grid = SVC(probability=True, random_state=42), param_grid_svm
        X_fit, X_eval = X_train_cfg, X_val_cfg

    grid_search = GridSearchCV(estimator, search_grid, cv=5, n_jobs=-1, verbose=1, scoring='f1_weighted')
    grid_search.fit(X_fit, y_train)
    best_model = grid_search.best_estimator_

    # Predictions and the probability of the second class on the validation split
    y_pred = best_model.predict(X_eval)
    y_proba = best_model.predict_proba(X_eval)[:, 1]

    # --- Metrics Calculation (consistent for all loops) ---
    y_pred_encoded = le.transform(y_pred)

    acc = metrics.accuracy_score(y_val_encoded, y_pred_encoded)
    f1_val = metrics.f1_score(y_val_encoded, y_pred_encoded, average='weighted')
    rec = metrics.recall_score(y_val_encoded, y_pred_encoded, average='weighted')
    prec = metrics.precision_score(y_val_encoded, y_pred_encoded, average='weighted', zero_division=0)
    roc = metrics.roc_auc_score(y_val_encoded, y_proba)

    store_results('SVM', name, acc, f1_val, rec, prec, roc)

    print(f"Best cross-validation F1-Score for '{name}': {grid_search.best_score_:.4f}")
    print(f"Validation F1-Score: {f1_val:.4f}")
    print(f"Best params: {grid_search.best_params_}")

print("\n--- SVM evaluation complete for all configurations. ---")


Configuration 1: 'SMOTE + StandardScaler' pipeline defined.
Configuration 2: 'StandardScaler Only' data prepared.

Finding optimal components for PCA (95% variance)...
Number of components for 95% variance: 8
Configuration 3: 'PCA (95% Var)' data prepared.

RUNNING SVM WITH HYPERPARAMETER TUNING ON ALL CONFIGURATIONS
Results cleared!

--- Running SVM with 'SMOTE + StandardScaler' configuration ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best cross-validation F1-Score for 'SMOTE + StandardScaler': 0.9623
Validation F1-Score: 0.9816
Best params: {'model__C': 1, 'model__gamma': 'auto', 'model__kernel': 'rbf'}

--- Running SVM with 'StandardScaler Only' configuration ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best cross-validation F1-Score for 'StandardScaler Only': 0.9714
Validation F1-Score: 0.9753
Best params: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}

--- Running SVM with 'PCA (95% Var)' configuration ---
Fitting 5 folds for each of 12 candi

### SVM with PCA 99

In [None]:
# # Store different configurations
# configurations = []
# configurations.append(('Original Data', X_train, X_test, y_train))

# # Step 2: Normalize the data
# scaler = MinMaxScaler()
# X_train_normalized = scaler.fit_transform(X_train)
# X_test_normalized = scaler.transform(X_test)
# configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# # Step 3.1: SelectKBest
# print("\n=== SelectKBest Feature Selection ===")
# scores = []
# for k in range(1, X_train.shape[1] + 1):
#     kbest = SelectKBest(score_func=f_classif, k=k)
#     X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
#     score = cross_val_score(SVC(kernel='linear'), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
#     scores.append(score)

# optimal_k = scores.index(max(scores)) + 1
# print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

# kbest = SelectKBest(score_func=f_classif, k=optimal_k)
# X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
# X_test_kbest = kbest.transform(X_test_normalized)
# selected_features_kbest = X.columns[kbest.get_support()]
# configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# # Step 3.2: RFECV
# print("\n=== RFECV Feature Selection with SVM ===")
# svm_estimator = SVC(kernel='linear')

# rfecv = RFECV(estimator=svm_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
# rfecv.fit(X_train_kbest, y_train)

# print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

# rfe = RFE(estimator=svm_estimator, n_features_to_select=rfecv.n_features_)
# X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
# X_test_rfe = rfe.transform(X_test_kbest)
# selected_features_rfe = selected_features_kbest[rfe.get_support()]
# configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# # Step 3.3: PCA
# print("\n=== PCA Dimensionality Reduction ===")
# pca = PCA().fit(X_train_rfe)
# cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# desired_variance = 0.99
# n_components = np.argmax(cumulative_variance >= desired_variance) + 1
# print(f'Number of components that explain {desired_variance*100}% variance: {n_components}')

# pca = PCA(n_components=n_components)
# X_train_pca = pca.fit_transform(X_train_rfe)
# X_test_pca = pca.transform(X_test_rfe)
# configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# # Step 4: SVM + GridSearchCV
# print("\n=== SVM Model Performance with Hyperparameter Tuning ===")

# param_grid = {
#     'C': [0.01, 0.1, 1, 10, 100],
#     'gamma': ['scale', 'auto', 0.001, 0.01, 0.1],
#     'kernel': ['rbf', 'poly', 'sigmoid'],
#     'degree': [2, 3, 4],
#     'coef0': [0.0, 0.1, 0.5, 1.0]
# }

# for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
#     print(f"\nRunning SVM with {name} configuration...")
#     svc = GridSearchCV(SVC(probability=True), param_grid, cv=10, n_jobs=-1, verbose=2)
#     svc.fit(X_train_cfg, y_train_cfg)

#     y_train_svc = svc.predict(X_train_cfg)
#     y_test_svc = svc.predict(X_test_cfg)
#     y_train_svc_proba = svc.predict_proba(X_train_cfg)
#     y_test_svc_proba = svc.predict_proba(X_test_cfg)

#     metrics_dict = {
#         "Dataset": ["Training", "Test"],
#         "Accuracy": [
#             metrics.accuracy_score(y_train_cfg, y_train_svc),
#             metrics.accuracy_score(y_test, y_test_svc),
#         ],
#         "F1 Score": [
#             metrics.f1_score(y_train_cfg, y_train_svc, average='macro'),
#             metrics.f1_score(y_test, y_test_svc, average='macro'),
#         ],
#         "Recall": [
#             metrics.recall_score(y_train_cfg, y_train_svc, average='macro'),
#             metrics.recall_score(y_test, y_test_svc, average='macro'),
#         ],
#         "Precision": [
#             metrics.precision_score(y_train_cfg, y_train_svc, average='macro'),
#             metrics.precision_score(y_test, y_test_svc, average='macro'),
#         ],
#         "AUC-ROC": [
#             metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_svc_proba, multi_class='ovr', average='macro'),
#             metrics.roc_auc_score(pd.get_dummies(y_test), y_test_svc_proba, multi_class='ovr', average='macro'),
#         ]
#     }

#     df_metrics = pd.DataFrame(metrics_dict)
#     print("\nSupport Vector Machine Model Performance Metrics")
#     print(df_metrics.to_string(index=False))

#     auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_svc_proba, multi_class='ovr', average='macro')
#     storeResults(
#         'Support Vector Machine 99',
#         name,
#         metrics.accuracy_score(y_test, y_test_svc),
#         metrics.f1_score(y_test, y_test_svc, average='macro'),
#         metrics.recall_score(y_test, y_test_svc, average='macro'),
#         metrics.precision_score(y_test, y_test_svc, average='macro'),
#         auc_score
#     )

#     print("Best hyperparameters found by GridSearchCV:")
#     print(svc.best_params_)



# =============================================================================
# SVM Model with Multiple Preprocessing Configurations (including PCA 99%)
# =============================================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd

# Shared preprocessor: numeric columns are standardised, object/category
# columns are one-hot encoded (categories unseen at fit time are ignored).
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Every entry is a (name, training matrix, validation matrix) triple.
# Configuration 1 (SMOTE + StandardScaler) carries no matrices: it is a
# pipeline fed with the raw frames, so the evaluation loop special-cases it by name.
configurations = [('SMOTE + StandardScaler', None, None)]
print("Configuration 1: 'SMOTE + StandardScaler' pipeline defined.")

# Configuration 2: preprocessed data without SMOTE (fit on train, applied to validation).
X_train_scaled = preprocessor.fit_transform(X_train)
X_val_scaled = preprocessor.transform(X_val)
configurations += [('StandardScaler Only', X_train_scaled, X_val_scaled)]
print("Configuration 2: 'StandardScaler Only' data prepared.")

# Configuration 3: PCA keeping 99% of the variance of the preprocessed data.
print("\nApplying PCA to retain 99% of variance...")
pca_99 = PCA(n_components=0.99)
X_train_pca_99 = pca_99.fit_transform(X_train_scaled)
X_val_pca_99 = pca_99.transform(X_val_scaled)
configurations += [('PCA (99% Var)', X_train_pca_99, X_val_pca_99)]
print(f"PCA (99%) applied. Number of components selected: {pca_99.n_components_}")


# =============================================================================
# 4. RUN SVM WITH GRIDSEARCHCV ON ALL CONFIGURATIONS
# =============================================================================
banner = "=" * 80
print("\n" + banner)
print("RUNNING SVM WITH HYPERPARAMETER TUNING ON ALL CONFIGURATIONS")
print(banner)

# Search space for a bare SVC fitted on pre-transformed matrices.
param_grid_svm = dict(
    C=[0.1, 1, 10],
    gamma=['scale', 'auto'],
    kernel=['rbf', 'poly'],
)
# The same space, addressed through the pipeline step named 'model'.
param_grid_svm_pipeline = {
    f"model__{option}": list(choices) for option, choices in param_grid_svm.items()
}

# Start this run from an empty results store.
clear_results()

# Loop through each data configuration
# Tune and evaluate one SVM per configuration, then record its validation metrics.
for name, X_train_cfg, X_val_cfg in configurations:
    print(f"\n--- Running SVM with '{name}' configuration ---")

    # Choose the estimator, its grid and the matrices it is fitted/evaluated on.
    if name == 'SMOTE + StandardScaler':
        # The pipeline receives the raw frames and preprocesses/oversamples internally.
        pipeline = ImbPipeline(steps=[
            ('preprocessor', preprocessor),
            ('smote', SMOTE(random_state=42)),
            ('model', SVC(probability=True, random_state=42)),
        ])
        estimator, search_space = pipeline, param_grid_svm_pipeline
        fit_features, val_features = X_train, X_val
    else:
        # Every other configuration ships matrices that are already transformed.
        estimator, search_space = SVC(probability=True, random_state=42), param_grid_svm
        fit_features, val_features = X_train_cfg, X_val_cfg

    grid_search = GridSearchCV(estimator, search_space, cv=5, n_jobs=-1, verbose=1, scoring='f1_weighted')
    grid_search.fit(fit_features, y_train)
    best_model = grid_search.best_estimator_
    y_train_pred = best_model.predict(fit_features)
    y_val_pred = best_model.predict(val_features)
    y_train_proba = best_model.predict_proba(fit_features)
    y_val_proba = best_model.predict_proba(val_features)

    # (truth, predicted labels, predicted probabilities) for the training split
    # and for the validation split; the latter is reported under the "Test" label.
    split_outputs = (
        (y_train, y_train_pred, y_train_proba),
        (y_val, y_val_pred, y_val_proba),
    )
    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [metrics.accuracy_score(truth, pred) for truth, pred, _ in split_outputs],
        "F1 Score": [metrics.f1_score(truth, pred, average='macro') for truth, pred, _ in split_outputs],
        "Recall": [metrics.recall_score(truth, pred, average='macro') for truth, pred, _ in split_outputs],
        "Precision": [
            metrics.precision_score(truth, pred, average='macro', zero_division=0)
            for truth, pred, _ in split_outputs
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(truth), proba, multi_class='ovr', average='macro')
            for truth, _, proba in split_outputs
        ],
    }

    # Show both rows, but persist only the "Test" (validation) row.
    df_metrics = pd.DataFrame(metrics_dict)
    print("\nSupport Vector Machine Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    test_metrics = df_metrics[df_metrics['Dataset'] == 'Test']
    stored_columns = ('Accuracy', 'F1 Score', 'Recall', 'Precision', 'AUC-ROC')
    store_results('SVM', name, *(test_metrics[column].iloc[0] for column in stored_columns))

    print("\nBest hyperparameters found by GridSearchCV:")
    print(grid_search.best_params_)

print("\n--- SVM evaluation complete for all configurations. ---")


Configuration 1: 'SMOTE + StandardScaler' pipeline defined.
Configuration 2: 'StandardScaler Only' data prepared.

Applying PCA to retain 99% of variance...
PCA (99%) applied. Number of components selected: 10

RUNNING SVM WITH HYPERPARAMETER TUNING ON ALL CONFIGURATIONS
Results cleared!

--- Running SVM with 'SMOTE + StandardScaler' configuration ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits

Support Vector Machine Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.967543  0.953674 0.954775   0.952585 0.993192
    Test  0.981481  0.973976 0.978486   0.969652 0.997622

Best hyperparameters found by GridSearchCV:
{'model__C': 1, 'model__gamma': 'auto', 'model__kernel': 'rbf'}

--- Running SVM with 'StandardScaler Only' configuration ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits

Support Vector Machine Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.973725  0

---

# Random Forest

In [3]:
# ML Model Results Storage Framework — improved & backwards-compatible
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple, List, Dict, Optional

# --- Directories (matches your current code) ---
results_dir = 'ModelExploration/results'
plots_dir = 'ModelExploration/plots'
models_dir = 'ModelExploration/models'
# Create all output folders up front; folders that already exist are left untouched.
for _output_dir in (results_dir, plots_dir, models_dir):
    os.makedirs(_output_dir, exist_ok=True)

# --- Internal storage (single source-of-truth) ---
# One dict per store_results() call, holding the unrounded metric values.
_RESULTS: List[Dict[str, object]] = []

# Legacy parallel lists, kept for cells that still read them directly.
# They are only ever mutated in place (append / clear), never rebound, so a
# reference taken to any of them stays in sync with the store.
ML_Model: List[str] = []
ML_Config: List[str] = []
accuracy: List[float] = []
f1: List[float] = []
recall: List[float] = []
precision: List[float] = []
roc_auc: List[float] = []

# --- Core store function (raw floats: 0..1) ---
def store_results(model: str, config: str, acc: float, f1_score: float, rec: float, prec: float, roc: float) -> None:
    """
    Record one result row.

    Parameters
    ----------
    model, config : identifiers of the model and of the data configuration;
        both are converted with str().
    acc, f1_score, rec, prec, roc : metric values; each is converted with float().

    The unrounded values are appended to _RESULTS; the legacy lists receive the
    same values rounded to 6 decimals.
    """
    entry = {
        "ML Model": str(model),
        "Configuration": str(config),
        "Accuracy": float(acc),
        "F1 Score": float(f1_score),
        "Recall": float(rec),
        "Precision": float(prec),
        "ROC_AUC": float(roc)
    }
    _RESULTS.append(entry)
    # keep legacy lists in sync
    ML_Model.append(entry["ML Model"])
    ML_Config.append(entry["Configuration"])
    accuracy.append(round(entry["Accuracy"], 6))
    f1.append(round(entry["F1 Score"], 6))
    recall.append(round(entry["Recall"], 6))
    precision.append(round(entry["Precision"], 6))
    roc_auc.append(round(entry["ROC_AUC"], 6))

# CamelCase alias used in your code
def storeResults(model: str, config: str, acc: float, f1_score: float, rec: float, prec: float, roc: float) -> None:
    """CamelCase alias of store_results; forwards all arguments unchanged."""
    store_results(model, config, acc, f1_score, rec, prec, roc)

# --- Clear stored results ---
def clear_results() -> None:
    """
    Empty the internal store and every legacy list, then print a confirmation.

    The containers are cleared in place rather than replaced by new lists:
    rebinding the module names would leave any previously obtained reference
    (an alias, a name imported elsewhere) pointing at the old, still-populated list.
    """
    for container in (_RESULTS, ML_Model, ML_Config, accuracy, f1, recall, precision, roc_auc):
        container.clear()
    print("Results cleared!")

def clearResults() -> None:
    """CamelCase alias of clear_results."""
    clear_results()

# --- Helpers to produce DataFrames ---
def _results_raw_df() -> pd.DataFrame:
    """
    Build a DataFrame of the stored results with unformatted metric values.

    With nothing stored, an empty frame carrying the expected column names is
    returned. Otherwise rows sharing the same (ML Model, Configuration) pair are
    collapsed to the most recently stored one and the index is renumbered.
    """
    if _RESULTS:
        stored = pd.DataFrame(_RESULTS)
        # A later run of the same model/configuration replaces the earlier row.
        latest_only = stored.drop_duplicates(subset=["ML Model", "Configuration"], keep='last')
        return latest_only.reset_index(drop=True)
    return pd.DataFrame(
        columns=["ML Model", "Configuration", "Accuracy", "F1 Score", "Recall", "Precision", "ROC_AUC"]
    )

def _results_display_df() -> pd.DataFrame:
    """
    Return a copy of the raw results with each metric column converted to float
    and passed through a 6-decimal format (values stay numeric, not strings).
    """
    display_frame = _results_raw_df().copy()
    metric_names = ("Accuracy", "F1 Score", "Recall", "Precision", "ROC_AUC")
    for metric_name in metric_names:
        if metric_name not in display_frame.columns:
            continue
        numeric_values = display_frame[metric_name].astype(float)
        display_frame[metric_name] = numeric_values.map(lambda value: float(format(value, ".6f")))
    return display_frame

# --- Display & Save (CSV) ---
def display_and_save_results(filename_prefix: str = 'model_exploration') -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Print the stored results and write them to two CSV files in results_dir.

    Files written:
      - {filename_prefix}_results.csv         (rows in stored order)
      - sorted_{filename_prefix}_results.csv  (sorted by F1 Score, then Accuracy, descending)

    Returns (display_df, sorted_display_df). When nothing is stored, a notice is
    printed, no file is written, and the same empty frame is returned twice.
    """
    unsorted_view = _results_display_df()
    if unsorted_view.empty:
        print("No results to display. Use store_results(...) to add entries first.")
        return unsorted_view, unsorted_view

    # Stored-order table first.
    unsorted_path = os.path.join(results_dir, f'{filename_prefix}_results.csv')
    unsorted_view.to_csv(unsorted_path, index=False)
    print(f"\nSaved results to: {unsorted_path}")

    # Sort on the unformatted values, then apply the 6-decimal formatting.
    sorted_view = (
        _results_raw_df()
        .sort_values(by=["F1 Score", "Accuracy"], ascending=False)
        .reset_index(drop=True)
        .copy()
    )
    for metric_name in ("Accuracy", "F1 Score", "Recall", "Precision", "ROC_AUC"):
        if metric_name not in sorted_view.columns:
            continue
        sorted_view[metric_name] = sorted_view[metric_name].astype(float).map(lambda value: float(f"{value:.6f}"))

    sorted_path = os.path.join(results_dir, f'sorted_{filename_prefix}_results.csv')
    sorted_view.to_csv(sorted_path, index=False)
    print(f"Saved sorted results to: {sorted_path}\n")

    # Console report: both tables under a ruled heading.
    rule = "=" * 100
    report_sections = (
        ("MODEL PERFORMANCE RESULTS", unsorted_view),
        ("SORTED MODEL PERFORMANCE RESULTS (by F1 Score and Accuracy)", sorted_view),
    )
    for heading, table in report_sections:
        print("\n" + rule)
        print(heading)
        print(rule)
        print(table.to_string(index=False))

    return unsorted_view, sorted_view

# CamelCase alias
def displayAndSaveResults(filename_prefix: str = 'model_exploration') -> Tuple[pd.DataFrame, pd.DataFrame]:
    """CamelCase alias of display_and_save_results; returns its (display_df, sorted_df) pair."""
    frames = display_and_save_results(filename_prefix)
    return frames

# --- Plotting function ---
def plot_model_comparison(result_df: Optional[pd.DataFrame] = None, plot_filename: Optional[str] = "model_performance_comparison.png") -> None:
    """
    Plot the per-model mean of each metric and save the figure under plots_dir.

    Parameters
    ----------
    result_df : pandas.DataFrame, optional
        Table with an 'ML Model' column and the five metric columns holding
        0..1 values. When None, 'sorted_model_exploration_results.csv' in
        results_dir is read if it exists; otherwise display_and_save_results()
        supplies the table.
    plot_filename : str, optional
        File name of the saved figure inside plots_dir. None selects the default
        name, so the call form shown in the framework's printed help
        (plot_filename=None) works instead of raising in os.path.join.

    Returns
    -------
    None. Prints a notice and returns early when there is nothing to plot.
    """
    metrics_cols = ['Accuracy', 'F1 Score', 'Recall', 'Precision', 'ROC_AUC']
    if plot_filename is None:
        plot_filename = "model_performance_comparison.png"

    if result_df is None:
        candidate = os.path.join(results_dir, 'sorted_model_exploration_results.csv')
        if os.path.exists(candidate):
            result_df = pd.read_csv(candidate)
        else:
            _, result_df = display_and_save_results()
    if result_df.empty:
        print("No results available to plot.")
        return

    df_plot = result_df.copy()
    # Metric columns may arrive as strings; make them floats, then scale
    # 0..1 -> 0..100 for the percentage axis.
    for col in metrics_cols:
        if df_plot[col].dtype == object:
            df_plot[col] = df_plot[col].astype(float)
    df_plot[metrics_cols] = df_plot[metrics_cols] * 100.0

    # One bar per model: the mean over that model's configurations.
    grouped = df_plot.groupby('ML Model')[metrics_cols].mean().sort_values(by='Accuracy', ascending=False)

    sns.set(style="whitegrid")
    # metrics_cols always holds five entries, so subplots returns an array of axes.
    fig, axes = plt.subplots(1, len(metrics_cols), figsize=(5 * len(metrics_cols), 6), sharey=True)

    for ax, metric in zip(axes, metrics_cols):
        vals = grouped[metric]
        sns.barplot(x=vals.index, y=vals.values, ax=ax, palette='Blues_r')
        ax.set_title(metric)
        ax.set_ylabel(metric + " (%)")
        ax.set_xlabel('')
        ax.tick_params(axis='x', rotation=45)
        # Write each bar's value just above it.
        for p in ax.patches:
            h = p.get_height()
            ax.text(p.get_x() + p.get_width() / 2., h, f'{h:.2f}%', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    out_path = os.path.join(plots_dir, plot_filename)
    plt.savefig(out_path, dpi=300, bbox_inches='tight')
    plt.show()
    print(f"Saved model comparison plot to: {out_path}")

# CamelCase alias
def plotModelComparison(result_df: Optional[pd.DataFrame] = None, plot_filename: str = "model_performance_comparison.png") -> None:
    """CamelCase alias of plot_model_comparison; forwards both arguments unchanged."""
    plot_model_comparison(result_df, plot_filename)

# --- End framework ---
# Confirmation banner listing the helpers this cell defines, printed one line at a time.
_framework_banner = (
    "Model results storage framework loaded successfully!",
    "Available functions (aliases included):",
    "- store_results(model, config, accuracy, f1, recall, precision, auc_roc)",
    "- storeResults(...)  # alias",
    "- display_and_save_results(filename_prefix='model_exploration') -> (display_df, sorted_df)",
    "- displayAndSaveResults(...)  # alias",
    "- clear_results() / clearResults()",
    "- plot_model_comparison(result_df=None, plot_filename=None) / plotModelComparison(...)",
)
for _banner_line in _framework_banner:
    print(_banner_line)

Model results storage framework loaded successfully!
Available functions (aliases included):
- store_results(model, config, accuracy, f1, recall, precision, auc_roc)
- storeResults(...)  # alias
- display_and_save_results(filename_prefix='model_exploration') -> (display_df, sorted_df)
- displayAndSaveResults(...)  # alias
- clear_results() / clearResults()
- plot_model_comparison(result_df=None, plot_filename=None) / plotModelComparison(...)


In [None]:
# RandomForest configuration sweep: Original, Normalized, SelectKBest, RFECV, PCA, SMOTE pipelines
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE, RFECV
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn import metrics
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Cell-local imports needed to load the splits and build the preprocessor.
import os
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Seed passed to the estimators, SMOTE and PCA created in this cell.
RANDOM_STATE = 42

# Training and validation splits, read in the order X_train, X_val, y_train, y_val.
# NOTE(review): joblib.load unpickles its input; only point this at files
# written by a trusted step of this project.
processed_data_dir = 'Analysis/processed_data'
X_train, X_val, y_train, y_val = (
    joblib.load(os.path.join(processed_data_dir, split_file))
    for split_file in ('X_train.joblib', 'X_val.joblib', 'y_train.joblib', 'y_val.joblib')
)

# Reusable preprocessor: standardise numeric columns, one-hot encode
# object/category columns (categories unseen at fit time are ignored).
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Clear previous stored results.
# Look the reset function up explicitly (snake_case first, then the CamelCase
# alias) instead of swallowing every exception: a missing framework is reported,
# while an error raised inside the reset function itself is allowed to surface.
_clear_fn = globals().get('clear_results') or globals().get('clearResults')
if callable(_clear_fn):
    _clear_fn()
else:
    print("Warning: clear_results/clearResults is not defined; skipping results reset. "
          "Run the results storage framework cell first.")

print("\n=== START: RandomForest configuration sweep ===")

# -------------------------
# 1) Prepare configurations
# -------------------------
# configurations holds 4-tuples (name, kind, training payload, validation payload):
#   kind == 'array'    -> both payloads are feature tables ready for fitting
#   kind == 'pipeline' -> the training payload is a pipeline object that expects the raw DataFrame X
# The first entry is the untouched data.
configurations = [('Original Data', 'array', X_train, X_val)]

# Normalized Data: min-max scale the numeric columns only (fit on train, applied to validation).
scaler = MinMaxScaler()
X_train_numeric = X_train[numeric_features]
X_val_numeric = X_val[numeric_features]
train_scaled_part = pd.DataFrame(
    scaler.fit_transform(X_train_numeric),
    columns=X_train_numeric.columns,
    index=X_train_numeric.index,
)
val_scaled_part = pd.DataFrame(
    scaler.transform(X_val_numeric),
    columns=X_val_numeric.columns,
    index=X_val_numeric.index,
)

# Scaled numeric columns come first, followed by the remaining columns unchanged.
X_train_normalized = pd.concat([train_scaled_part, X_train.drop(columns=numeric_features)], axis=1)
X_val_normalized = pd.concat([val_scaled_part, X_val.drop(columns=numeric_features)], axis=1)

configurations.append(('Normalized Data', 'array', X_train_normalized, X_val_normalized))

# SelectKBest (on numeric data only) -> find optimal k with RF cross-val
# Each stage below (SelectKBest -> RFECV/RFE -> PCA) consumes the frames produced
# by the previous stage, and each stage appends its own 'array' configuration.
print("\n=== SelectKBest Feature Selection ===")
X_train_normalized_numeric = X_train_normalized[numeric_features]
X_val_normalized_numeric = X_val_normalized[numeric_features]

n_features = X_train_normalized_numeric.shape[1]
scores = []
# Try every k from 1 to the number of numeric features and record the mean
# 5-fold accuracy of a default RandomForest on the k selected columns.
# NOTE(review): the selector is fitted on the full training set before
# cross_val_score runs, so the fold scores are not independent of the selection step.
for k in range(1, n_features + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_tr_k = kbest.fit_transform(X_train_normalized_numeric, y_train)
    score = cross_val_score(RandomForestClassifier(random_state=RANDOM_STATE), X_tr_k, y_train, cv=5, scoring='accuracy', n_jobs=-1).mean()
    scores.append(score)

# scores[i] belongs to k = i + 1; argmax returns the first maximum, i.e. the smallest best k.
optimal_k = int(np.argmax(scores) + 1)
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

# Refit the selector with the chosen k and keep the selected column names.
# The frames are built without index=, so they do not carry X_train's / X_val's index.
kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = pd.DataFrame(kbest.fit_transform(X_train_normalized_numeric, y_train), columns=X_train_normalized_numeric.columns[kbest.get_support()])
X_val_kbest = pd.DataFrame(kbest.transform(X_val_normalized_numeric), columns=X_train_kbest.columns)
configurations.append(('SelectKBest', 'array', X_train_kbest, X_val_kbest))

# RFECV on the SelectKBest-transformed data (numeric only)
print("\n=== RFECV Feature Selection with Random Forest ===")
rf_estimator = RandomForestClassifier(random_state=RANDOM_STATE)
# RFECV is used only to decide how many features to keep (rfecv.n_features_).
rfecv = RFECV(estimator=rf_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy', n_jobs=-1)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

# A separate RFE, asked for that many features, performs the actual column selection.
rfe = RFE(estimator=rf_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = pd.DataFrame(rfe.fit_transform(X_train_kbest, y_train), columns=X_train_kbest.columns[rfe.get_support()])
X_val_rfe = pd.DataFrame(rfe.transform(X_val_kbest), columns=X_train_rfe.columns)
configurations.append(('RFECV', 'array', X_train_rfe, X_val_rfe))

# PCA on the RFECV-reduced data (numeric only, default 95% cumulative variance)
print("\n=== PCA Dimensionality Reduction ===")
desired_variance = 0.95   # change to 0.90 or 0.99 if needed
# Fit a full PCA first to read the explained-variance curve.
pca_full = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
# First position where the cumulative ratio reaches the target, converted to a component count.
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

# Refit with exactly that many components; the outputs reuse the RFE frames' index
# and get default integer column labels.
pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_rfe), index=X_train_rfe.index)
X_val_pca = pd.DataFrame(pca.transform(X_val_rfe), index=X_val_rfe.index)
configurations.append(('PCA', 'array', X_train_pca, X_val_pca))

def _new_smote_rf_pipeline():
    """Build preprocessor -> SMOTE -> RandomForest, the step layout both SMOTE configurations use."""
    return ImbPipeline(steps=[
        ('preprocessor', preprocessor),            # uses notebook preprocessor
        ('smote', SMOTE(random_state=RANDOM_STATE)),
        ('model', RandomForestClassifier(random_state=RANDOM_STATE)),
    ])

# Two 'pipeline' configurations with identical steps, registered under different
# names; each gets its own pipeline object and both are grid-searched by the loop.
pipeline_smote_scaler = _new_smote_rf_pipeline()
configurations.append(('SMOTE + StandardScaler', 'pipeline', pipeline_smote_scaler, None))

pipeline_smote_grid = _new_smote_rf_pipeline()
configurations.append(('SMOTE + GridSearchCV', 'pipeline', pipeline_smote_grid, None))

# -------------------------
# Hyperparameter grids
# -------------------------
# Search space for a bare RandomForestClassifier.
param_grid_array = dict(
    n_estimators=[100, 200, 300, 400],
    max_depth=[10, 20, 50, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt'],
    bootstrap=[True],
    criterion=['gini', 'entropy'],
)
# The same space addressed through the pipeline step named 'model'.
param_grid_pipeline = {"model__" + option: choices for option, choices in param_grid_array.items()}

# Number of cross-validation folds used by the grid searches.
cv_folds = 10

# safe roc auc helper
def safe_roc_auc(y_true, y_proba):
    """Compute ROC AUC from class probabilities, returning NaN on any failure.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_proba : array-like
        Per-class scores. A NumPy array with exactly two columns is scored as
        a binary problem using its second column; anything else is scored as
        macro-averaged one-vs-rest against a one-hot encoding of ``y_true``.

    Returns
    -------
    float
        The AUC value, or ``np.nan`` if any step raises an exception
        (for example a 1-D ``y_proba`` array, which has no second axis).
    """
    try:
        two_column_array = isinstance(y_proba, np.ndarray) and y_proba.shape[1] == 2
        if two_column_array:
            auc = metrics.roc_auc_score(y_true, y_proba[:, 1])
        else:
            one_hot_truth = pd.get_dummies(y_true)
            auc = metrics.roc_auc_score(one_hot_truth, y_proba, multi_class='ovr', average='macro')
    except Exception:
        # Deliberate best-effort: the sweep prefers a NaN cell over a crash.
        auc = np.nan
    return auc

# -------------------------
# Run GridSearchCV per configuration
# -------------------------
# Resolve the result-storage helper once, before any model is trained: the
# notebook may expose it under a camelCase or a snake_case name. Only a missing
# name triggers the fallback, so a genuine failure inside the helper is no
# longer masked (or answered by storing the same row a second time), and a
# missing helper is reported before a long grid search rather than after it.
try:
    _store_rf_result = storeResults
except NameError:
    _store_rf_result = store_results

# Print floats with six decimals. The option is global to pandas, so it is set
# once here instead of on every loop iteration.
pd.options.display.float_format = '{:.6f}'.format

for name, kind, X_tr_cfg, X_val_cfg in configurations:
    print(f"\nRunning Random Forest with {name} configuration...")
    if kind == 'pipeline':
        # 'pipeline' entries hold the estimator in the third slot; it is fit on
        # the original raw training data (pipeline handles preprocess & SMOTE).
        estimator, search_space = X_tr_cfg, param_grid_pipeline
        X_fit, X_eval = X_train, X_val
    else:
        # 'array' entries hold already-transformed train/validation matrices.
        estimator = RandomForestClassifier(random_state=RANDOM_STATE)
        search_space = param_grid_array
        X_fit, X_eval = X_tr_cfg, X_val_cfg

    grid = GridSearchCV(estimator, search_space, cv=cv_folds, n_jobs=-1, verbose=2)
    grid.fit(X_fit, y_train)
    best = grid.best_estimator_
    y_train_pred = best.predict(X_fit)
    y_val_pred = best.predict(X_eval)
    y_train_proba = best.predict_proba(X_fit)
    y_val_proba = best.predict_proba(X_eval)

    # One row per split. The row labelled "Test" is computed on X_val / y_val;
    # the label is kept because the lookup below selects the row by it.
    evaluated_splits = (
        ("Training", y_train, y_train_pred, y_train_proba),
        ("Test", y_val, y_val_pred, y_val_proba),
    )
    df_metrics = pd.DataFrame([
        {
            "Dataset": split_label,
            "Accuracy": metrics.accuracy_score(y_true, y_pred),
            "F1 Score": metrics.f1_score(y_true, y_pred, average='macro'),
            "Recall": metrics.recall_score(y_true, y_pred, average='macro'),
            "Precision": metrics.precision_score(y_true, y_pred, average='macro', zero_division=0),
            "AUC-ROC": safe_roc_auc(y_true, y_proba),
        }
        for split_label, y_true, y_pred, y_proba in evaluated_splits
    ])
    print("\nRandom Forest Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    # Store the held-out row through the storage helper resolved above.
    test_row = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
    _store_rf_result(
        'Random Forest',
        name,
        float(test_row['Accuracy']),
        float(test_row['F1 Score']),
        float(test_row['Recall']),
        float(test_row['Precision']),
        float(test_row['AUC-ROC'])
    )

    # best_estimator_ was read above, so the search has been refit and
    # best_params_ is available; no guard is needed.
    print("\nBest hyperparameters found by GridSearchCV:")
    print(grid.best_params_)

print("\n=== RandomForest sweep complete ===")

# Save & display aggregated results then plot.
# Each helper may be defined under a camelCase or a snake_case name. Only a
# missing name (NameError) triggers the fallback: an error raised inside the
# helper itself now surfaces instead of being swallowed and followed by a
# second attempt through the alias.
try:
    _save_sweep_results = displayAndSaveResults
except NameError:
    _save_sweep_results = display_and_save_results
_save_sweep_results('random_forest_sweep')

try:
    _plot_sweep_comparison = plotModelComparison
except NameError:
    _plot_sweep_comparison = plot_model_comparison
_plot_sweep_comparison()

Results cleared!

=== START: RandomForest configuration sweep ===

=== SelectKBest Feature Selection ===


ValueError: could not convert string to float: 'M'

### Random Forest with PCA 90

In [None]:
# Written by Hasib
# Random Forest grid search over five feature configurations: original data,
# min-max normalized, SelectKBest, RFECV and PCA (90% cumulative variance).
# NOTE(review): the output recorded under this cell is
# "NameError: name 'X_test' is not defined" — confirm that an earlier cell
# defines X_test / y_test before relying on this cell.

# Store different configurations as (name, X_train, X_test, y_train) tuples
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data (scaler is fit on the training split only)
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest — try every k and keep the one with the best 5-fold CV accuracy
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(RandomForestClassifier(random_state=42), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

# scores[i] belongs to k = i + 1; index() returns the first (smallest) best k.
optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
# Column names come from X_train, the frame the selector was fit on, so the
# cell does not depend on a separate global `X` that it never defines.
selected_features_kbest = X_train.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV picks the feature count, RFE then selects that many features
print("\n=== RFECV Feature Selection with Random Forest ===")
rf_estimator = RandomForestClassifier(random_state=42)

rfecv = RFECV(estimator=rf_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=rf_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA — smallest number of components reaching the variance target
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.90
# Cast to a plain int, as the configuration-sweep cell does.
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: Random Forest + GridSearchCV
print("\n=== Random Forest Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [10, 20, 50, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt'],
    'bootstrap': [True],
    'criterion': ['gini', 'entropy']
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Random Forest with {name} configuration...")
    rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=10, n_jobs=-1, verbose=2)
    rf.fit(X_train_cfg, y_train_cfg)

    y_train_rf = rf.predict(X_train_cfg)
    y_test_rf = rf.predict(X_test_cfg)
    y_train_rf_proba = rf.predict_proba(X_train_cfg)
    y_test_rf_proba = rf.predict_proba(X_test_cfg)

    # One metrics row per split; every metric is computed exactly once.
    evaluated_splits = (
        ("Training", y_train_cfg, y_train_rf, y_train_rf_proba),
        ("Test", y_test, y_test_rf, y_test_rf_proba),
    )
    df_metrics = pd.DataFrame([
        {
            "Dataset": split_label,
            "Accuracy": metrics.accuracy_score(y_true, y_pred),
            "F1 Score": metrics.f1_score(y_true, y_pred, average='macro'),
            "Recall": metrics.recall_score(y_true, y_pred, average='macro'),
            "Precision": metrics.precision_score(y_true, y_pred, average='macro'),
            "AUC-ROC": metrics.roc_auc_score(pd.get_dummies(y_true), y_proba, multi_class='ovr', average='macro'),
        }
        for split_label, y_true, y_pred, y_proba in evaluated_splits
    ])
    print("\nRandom Forest Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    # Store the test row that was just printed instead of recomputing it.
    test_row = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
    storeResults(
        'Random Forest 90',
        name,
        float(test_row['Accuracy']),
        float(test_row['F1 Score']),
        float(test_row['Recall']),
        float(test_row['Precision']),
        float(test_row['AUC-ROC'])
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(rf.best_params_)


NameError: name 'X_test' is not defined

### Random Forest with PCA 95

In [None]:
# Written by Hasib
# Random Forest grid search over five feature configurations: original data,
# min-max normalized, SelectKBest, RFECV and PCA (95% cumulative variance).

# Store different configurations as (name, X_train, X_test, y_train) tuples
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data (scaler is fit on the training split only)
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest — try every k and keep the one with the best 5-fold CV accuracy
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(RandomForestClassifier(random_state=42), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

# scores[i] belongs to k = i + 1; index() returns the first (smallest) best k.
optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
# Column names come from X_train, the frame the selector was fit on, so the
# cell does not depend on a separate global `X` that it never defines.
selected_features_kbest = X_train.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV picks the feature count, RFE then selects that many features
print("\n=== RFECV Feature Selection with Random Forest ===")
rf_estimator = RandomForestClassifier(random_state=42)

rfecv = RFECV(estimator=rf_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=rf_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA — smallest number of components reaching the variance target
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.95
# Cast to a plain int, as the configuration-sweep cell does.
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: Random Forest + GridSearchCV
print("\n=== Random Forest Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [10, 20, 50, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt'],
    'bootstrap': [True],
    'criterion': ['gini', 'entropy']
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Random Forest with {name} configuration...")
    rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=10, n_jobs=-1, verbose=2)
    rf.fit(X_train_cfg, y_train_cfg)

    y_train_rf = rf.predict(X_train_cfg)
    y_test_rf = rf.predict(X_test_cfg)
    y_train_rf_proba = rf.predict_proba(X_train_cfg)
    y_test_rf_proba = rf.predict_proba(X_test_cfg)

    # One metrics row per split; every metric is computed exactly once.
    evaluated_splits = (
        ("Training", y_train_cfg, y_train_rf, y_train_rf_proba),
        ("Test", y_test, y_test_rf, y_test_rf_proba),
    )
    df_metrics = pd.DataFrame([
        {
            "Dataset": split_label,
            "Accuracy": metrics.accuracy_score(y_true, y_pred),
            "F1 Score": metrics.f1_score(y_true, y_pred, average='macro'),
            "Recall": metrics.recall_score(y_true, y_pred, average='macro'),
            "Precision": metrics.precision_score(y_true, y_pred, average='macro'),
            "AUC-ROC": metrics.roc_auc_score(pd.get_dummies(y_true), y_proba, multi_class='ovr', average='macro'),
        }
        for split_label, y_true, y_pred, y_proba in evaluated_splits
    ])
    print("\nRandom Forest Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    # Store the test row that was just printed instead of recomputing it.
    test_row = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
    storeResults(
        'Random Forest 95',
        name,
        float(test_row['Accuracy']),
        float(test_row['F1 Score']),
        float(test_row['Recall']),
        float(test_row['Precision']),
        float(test_row['AUC-ROC'])
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(rf.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 17

=== RFECV Feature Selection with Random Forest ===
Optimal number of features selected by RFECV: 17

=== PCA Dimensionality Reduction ===
Number of components that explain 95.0% variance: 16

=== Random Forest Model Performance with Hyperparameter Tuning ===

Running Random Forest with Original Data configuration...
Fitting 10 folds for each of 128 candidates, totalling 1280 fits

Random Forest Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.933649  0.599386 0.530303   0.977346 0.979634
Best hyperparameters found by GridSearchCV:
{'bootstrap': True, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}

Running Random Forest with Normalized Data configuration...
Fitting 10 folds for each of 128 candidates, tot

### Random Forest with PCA 99

In [11]:
# Written by Ovi, 2025-07-07, Random Forest classification with preprocessing and result logging
# Random Forest grid search over five feature configurations: original data,
# min-max normalized, SelectKBest, RFECV and PCA (99% cumulative variance).

# Store different configurations as (name, X_train, X_test, y_train) tuples
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data (scaler is fit on the training split only)
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest — try every k and keep the one with the best 5-fold CV accuracy
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(RandomForestClassifier(random_state=42), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

# scores[i] belongs to k = i + 1; index() returns the first (smallest) best k.
optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
# Column names come from X_train, the frame the selector was fit on, so the
# cell does not depend on a separate global `X` that it never defines.
selected_features_kbest = X_train.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV picks the feature count, RFE then selects that many features
print("\n=== RFECV Feature Selection with Random Forest ===")
rf_estimator = RandomForestClassifier(random_state=42)

rfecv = RFECV(estimator=rf_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=rf_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA — smallest number of components reaching the variance target
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.99
# Cast to a plain int, as the configuration-sweep cell does.
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: Random Forest + GridSearchCV
print("\n=== Random Forest Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [10, 20, 50, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt'],
    'bootstrap': [True],
    'criterion': ['gini', 'entropy']
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Random Forest with {name} configuration...")
    rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=10, n_jobs=-1, verbose=2)
    rf.fit(X_train_cfg, y_train_cfg)

    y_train_rf = rf.predict(X_train_cfg)
    y_test_rf = rf.predict(X_test_cfg)
    y_train_rf_proba = rf.predict_proba(X_train_cfg)
    y_test_rf_proba = rf.predict_proba(X_test_cfg)

    # One metrics row per split; every metric is computed exactly once.
    evaluated_splits = (
        ("Training", y_train_cfg, y_train_rf, y_train_rf_proba),
        ("Test", y_test, y_test_rf, y_test_rf_proba),
    )
    df_metrics = pd.DataFrame([
        {
            "Dataset": split_label,
            "Accuracy": metrics.accuracy_score(y_true, y_pred),
            "F1 Score": metrics.f1_score(y_true, y_pred, average='macro'),
            "Recall": metrics.recall_score(y_true, y_pred, average='macro'),
            "Precision": metrics.precision_score(y_true, y_pred, average='macro'),
            "AUC-ROC": metrics.roc_auc_score(pd.get_dummies(y_true), y_proba, multi_class='ovr', average='macro'),
        }
        for split_label, y_true, y_pred, y_proba in evaluated_splits
    ])
    print("\nRandom Forest Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    # Store the test row that was just printed instead of recomputing it.
    test_row = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
    storeResults(
        'Random Forest 99',
        name,
        float(test_row['Accuracy']),
        float(test_row['F1 Score']),
        float(test_row['Recall']),
        float(test_row['Precision']),
        float(test_row['AUC-ROC'])
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(rf.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 17

=== RFECV Feature Selection with Random Forest ===
Optimal number of features selected by RFECV: 17

=== PCA Dimensionality Reduction ===
Number of components that explain 99.0% variance: 17

=== Random Forest Model Performance with Hyperparameter Tuning ===

Running Random Forest with Original Data configuration...
Fitting 10 folds for each of 128 candidates, totalling 1280 fits

Random Forest Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.933649  0.599386 0.530303   0.977346 0.979634
Best hyperparameters found by GridSearchCV:
{'bootstrap': True, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}

Running Random Forest with Normalized Data configuration...
Fitting 10 folds for each of 128 candidates, tot

---

# Gradient Boosting

### Gradient Boosting with PCA 90

In [14]:
# Written by Ovi, 2025-07-07, Gradient Boosting classification with preprocessing and result logging
# Gradient Boosting grid search over five feature configurations: original data,
# min-max normalized, SelectKBest, RFECV and PCA (90% cumulative variance).
# Every GradientBoostingClassifier is seeded (42, the seed the Random Forest
# cells use): with subsample=0.8 and max_features='sqrt' the fit is stochastic,
# so an unseeded rerun of the same configuration gives different scores.

# Store different configurations as (name, X_train, X_test, y_train) tuples
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data (scaler is fit on the training split only)
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest — try every k and keep the one with the best 5-fold CV accuracy
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(GradientBoostingClassifier(random_state=42), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

# scores[i] belongs to k = i + 1; index() returns the first (smallest) best k.
optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
# Column names come from X_train, the frame the selector was fit on, so the
# cell does not depend on a separate global `X` that it never defines.
selected_features_kbest = X_train.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV picks the feature count, RFE then selects that many features
print("\n=== RFECV Feature Selection with Gradient Boosting ===")
gbc_estimator = GradientBoostingClassifier(random_state=42)

rfecv = RFECV(estimator=gbc_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=gbc_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA — smallest number of components reaching the variance target
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.90
# Cast to a plain int, as the configuration-sweep cell does.
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: Gradient Boosting + GridSearchCV
print("\n=== Gradient Boosting Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7, 9, 15, 21],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'subsample': [0.8],
    'max_features': ['sqrt']
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Gradient Boosting with {name} configuration...")
    gbc = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid, cv=10, n_jobs=-1, verbose=2)
    gbc.fit(X_train_cfg, y_train_cfg)

    y_train_gbc = gbc.predict(X_train_cfg)
    y_test_gbc = gbc.predict(X_test_cfg)
    y_train_gbc_proba = gbc.predict_proba(X_train_cfg)
    y_test_gbc_proba = gbc.predict_proba(X_test_cfg)

    # One metrics row per split; every metric is computed exactly once.
    evaluated_splits = (
        ("Training", y_train_cfg, y_train_gbc, y_train_gbc_proba),
        ("Test", y_test, y_test_gbc, y_test_gbc_proba),
    )
    df_metrics = pd.DataFrame([
        {
            "Dataset": split_label,
            "Accuracy": metrics.accuracy_score(y_true, y_pred),
            "F1 Score": metrics.f1_score(y_true, y_pred, average='macro'),
            "Recall": metrics.recall_score(y_true, y_pred, average='macro'),
            "Precision": metrics.precision_score(y_true, y_pred, average='macro'),
            "AUC-ROC": metrics.roc_auc_score(pd.get_dummies(y_true), y_proba, multi_class='ovr', average='macro'),
        }
        for split_label, y_true, y_pred, y_proba in evaluated_splits
    ])
    print("\nGradient Boosting Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    # Store the test row that was just printed instead of recomputing it.
    test_row = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
    storeResults(
        'Gradient Boosting 90',
        name,
        float(test_row['Accuracy']),
        float(test_row['F1 Score']),
        float(test_row['Recall']),
        float(test_row['Precision']),
        float(test_row['AUC-ROC'])
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(gbc.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 14

=== RFECV Feature Selection with Gradient Boosting ===
Optimal number of features selected by RFECV: 14

=== PCA Dimensionality Reduction ===
Number of components that explain 90.0% variance: 12

=== Gradient Boosting Model Performance with Hyperparameter Tuning ===

Running Gradient Boosting with Original Data configuration...
Fitting 10 folds for each of 144 candidates, totalling 1440 fits

Gradient Boosting Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.966825  0.848374 0.765152   0.988275 0.990586
Best hyperparameters found by GridSearchCV:
{'learning_rate': 0.1, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500, 'subsample': 0.8}

Running Gradient Boosting with Normalized Data configuration...
Fitting 10 folds for each of 

### Gradient Boosting with PCA 95

In [15]:
# Written by Ovi, 2025-07-07, Gradient Boosting classification with preprocessing and result logging
# Gradient Boosting grid search over five feature configurations: original data,
# min-max normalized, SelectKBest, RFECV and PCA (95% cumulative variance).
# Every GradientBoostingClassifier is seeded (42, the seed the Random Forest
# cells use): with subsample=0.8 and max_features='sqrt' the fit is stochastic,
# so an unseeded rerun of the same configuration gives different scores.

# Store different configurations as (name, X_train, X_test, y_train) tuples
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data (scaler is fit on the training split only)
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest — try every k and keep the one with the best 5-fold CV accuracy
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(GradientBoostingClassifier(random_state=42), X_train_kbest, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

# scores[i] belongs to k = i + 1; index() returns the first (smallest) best k.
optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
# Column names come from X_train, the frame the selector was fit on, so the
# cell does not depend on a separate global `X` that it never defines.
selected_features_kbest = X_train.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV picks the feature count, RFE then selects that many features
print("\n=== RFECV Feature Selection with Gradient Boosting ===")
gbc_estimator = GradientBoostingClassifier(random_state=42)

rfecv = RFECV(estimator=gbc_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=gbc_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA — smallest number of components reaching the variance target
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.95
# Cast to a plain int, as the configuration-sweep cell does.
n_components = int(np.argmax(cumulative_variance >= desired_variance) + 1)
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: Gradient Boosting + GridSearchCV
print("\n=== Gradient Boosting Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7, 9, 15, 21],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'subsample': [0.8],
    'max_features': ['sqrt']
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Gradient Boosting with {name} configuration...")
    gbc = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid, cv=10, n_jobs=-1, verbose=2)
    gbc.fit(X_train_cfg, y_train_cfg)

    y_train_gbc = gbc.predict(X_train_cfg)
    y_test_gbc = gbc.predict(X_test_cfg)
    y_train_gbc_proba = gbc.predict_proba(X_train_cfg)
    y_test_gbc_proba = gbc.predict_proba(X_test_cfg)

    # One metrics row per split; every metric is computed exactly once.
    evaluated_splits = (
        ("Training", y_train_cfg, y_train_gbc, y_train_gbc_proba),
        ("Test", y_test, y_test_gbc, y_test_gbc_proba),
    )
    df_metrics = pd.DataFrame([
        {
            "Dataset": split_label,
            "Accuracy": metrics.accuracy_score(y_true, y_pred),
            "F1 Score": metrics.f1_score(y_true, y_pred, average='macro'),
            "Recall": metrics.recall_score(y_true, y_pred, average='macro'),
            "Precision": metrics.precision_score(y_true, y_pred, average='macro'),
            "AUC-ROC": metrics.roc_auc_score(pd.get_dummies(y_true), y_proba, multi_class='ovr', average='macro'),
        }
        for split_label, y_true, y_pred, y_proba in evaluated_splits
    ])
    print("\nGradient Boosting Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    # Store the test row that was just printed instead of recomputing it.
    test_row = df_metrics[df_metrics['Dataset'] == 'Test'].iloc[0]
    storeResults(
        'Gradient Boosting 95',
        name,
        float(test_row['Accuracy']),
        float(test_row['F1 Score']),
        float(test_row['Recall']),
        float(test_row['Precision']),
        float(test_row['AUC-ROC'])
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(gbc.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 16

=== RFECV Feature Selection with Gradient Boosting ===
Optimal number of features selected by RFECV: 16

=== PCA Dimensionality Reduction ===
Number of components that explain 95.0% variance: 15

=== Gradient Boosting Model Performance with Hyperparameter Tuning ===

Running Gradient Boosting with Original Data configuration...
Fitting 10 folds for each of 144 candidates, totalling 1440 fits

Gradient Boosting Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000   1.00000 1.000000    1.00000 1.000000
    Test  0.947867   0.74708 0.661143    0.91675 0.986373
Best hyperparameters found by GridSearchCV:
{'learning_rate': 0.1, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200, 'subsample': 0.8}

Running Gradient Boosting with Normalized Data configuration...
Fitting 10 folds for each of 

### Gradient Boosting with PCA (99% explained variance)

In [16]:
# Written by Ovi, 2025-07-07, Gradient Boosting classification with preprocessing and result logging
#
# Builds five feature configurations from the training split (raw, MinMax-normalized,
# SelectKBest, RFECV/RFE, PCA at 99% explained variance), tunes a
# GradientBoostingClassifier on each with GridSearchCV, prints train/test metrics and
# logs the test metrics through storeResults under the model name 'Gradient Boosting 99'.
#
# NOTE(review): the scaler and SelectKBest are fitted on the whole training split before
# cross_val_score / GridSearchCV run, so the CV folds used to pick k and the
# hyperparameters have already been seen by those transforms; CV scores are optimistic.
# The test split is only ever transformed, never fitted on.

# The grid uses subsample=0.8 and max_features='sqrt', which makes gradient boosting
# stochastic. Seeding every estimator keeps the selected features and the reported
# metrics identical between re-runs of this cell.
GBC_RANDOM_STATE = 42

# Store different configurations as (name, train features, test features, train labels)
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data (scaler statistics come from the training split only)
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
# Try every k from 1 to the full feature count and score it with 5-fold CV accuracy.
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        GradientBoostingClassifier(random_state=GBC_RANDOM_STATE),
        X_train_kbest,
        y_train,
        cv=5,
        scoring='accuracy',
    ).mean()
    scores.append(score)

# list.index returns the first maximum, i.e. the smallest k that reaches the best score.
optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
# X comes from an earlier cell; assumes its columns are in the same order as X_train — TODO confirm
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
# RFECV picks the feature count; RFE is then refitted with that count to produce the matrices.
print("\n=== RFECV Feature Selection with Gradient Boosting ===")
gbc_estimator = GradientBoostingClassifier(random_state=GBC_RANDOM_STATE)

rfecv = RFECV(estimator=gbc_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=gbc_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
# Fit a full PCA first to read the cumulative explained variance, then keep the smallest
# number of components whose cumulative ratio reaches the target.
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.99
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

# random_state only matters if scikit-learn picks a randomized SVD solver; harmless otherwise.
pca = PCA(n_components=n_components, random_state=GBC_RANDOM_STATE)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: Gradient Boosting + GridSearchCV
print("\n=== Gradient Boosting Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7, 9, 15, 21],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'subsample': [0.8],
    'max_features': ['sqrt']
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Gradient Boosting with {name} configuration...")
    gbc = GridSearchCV(
        GradientBoostingClassifier(random_state=GBC_RANDOM_STATE),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2,
    )
    gbc.fit(X_train_cfg, y_train_cfg)

    y_train_gbc = gbc.predict(X_train_cfg)
    y_test_gbc = gbc.predict(X_test_cfg)
    y_train_gbc_proba = gbc.predict_proba(X_train_cfg)
    y_test_gbc_proba = gbc.predict_proba(X_test_cfg)

    # Score each split exactly once; the "Test" row feeds both the printed table and storeResults.
    metric_rows = []
    for split_name, y_true, y_pred, y_proba in (
        ("Training", y_train_cfg, y_train_gbc, y_train_gbc_proba),
        ("Test", y_test, y_test_gbc, y_test_gbc_proba),
    ):
        metric_rows.append({
            "Dataset": split_name,
            "Accuracy": metrics.accuracy_score(y_true, y_pred),
            "F1 Score": metrics.f1_score(y_true, y_pred, average='macro'),
            "Recall": metrics.recall_score(y_true, y_pred, average='macro'),
            "Precision": metrics.precision_score(y_true, y_pred, average='macro'),
            # Labels are one-hot encoded to pair with the per-class probability columns;
            # assumes every class occurs in the split so the columns line up — TODO confirm.
            "AUC-ROC": metrics.roc_auc_score(
                pd.get_dummies(y_true), y_proba, multi_class='ovr', average='macro'
            ),
        })

    metrics_dict = {column: [row[column] for row in metric_rows] for column in metric_rows[0]}
    df_metrics = pd.DataFrame(metrics_dict)
    print("\nGradient Boosting Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    test_row = metric_rows[-1]
    auc_score = test_row["AUC-ROC"]
    storeResults(
        'Gradient Boosting 99',
        name,
        test_row["Accuracy"],
        test_row["F1 Score"],
        test_row["Recall"],
        test_row["Precision"],
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(gbc.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 14

=== RFECV Feature Selection with Gradient Boosting ===
Optimal number of features selected by RFECV: 14

=== PCA Dimensionality Reduction ===
Number of components that explain 99.0% variance: 14

=== Gradient Boosting Model Performance with Hyperparameter Tuning ===

Running Gradient Boosting with Original Data configuration...
Fitting 10 folds for each of 144 candidates, totalling 1440 fits

Gradient Boosting Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.962085  0.820578 0.734848   0.986667 0.990144
Best hyperparameters found by GridSearchCV:
{'learning_rate': 0.05, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500, 'subsample': 0.8}

Running Gradient Boosting with Normalized Data configuration...
Fitting 10 folds for each of

---

# AdaBoost

### AdaBoost with PCA (90% explained variance)

In [17]:
# Written by Ovi, 2025-07-07, AdaBoost classification with preprocessing and result logging
#
# Builds five feature configurations from the training split (raw, MinMax-normalized,
# SelectKBest, RFECV/RFE, PCA at 90% explained variance), tunes an AdaBoostClassifier
# on each with GridSearchCV, prints train/test metrics and logs the test metrics through
# storeResults under the model name 'AdaBoost 90'.
#
# NOTE(review): the scaler and SelectKBest are fitted on the whole training split before
# cross_val_score / GridSearchCV run, so the CV folds used to pick k and the
# hyperparameters have already been seen by those transforms; CV scores are optimistic.
# The test split is only ever transformed, never fitted on.

# Seed every estimator so that re-running the cell reproduces the same selected
# features, best hyperparameters and reported metrics.
ADA_RANDOM_STATE = 42

# Store different configurations as (name, train features, test features, train labels)
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data (scaler statistics come from the training split only)
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
# Try every k from 1 to the full feature count and score it with 5-fold CV accuracy.
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        AdaBoostClassifier(random_state=ADA_RANDOM_STATE),
        X_train_kbest,
        y_train,
        cv=5,
        scoring='accuracy',
    ).mean()
    scores.append(score)

# list.index returns the first maximum, i.e. the smallest k that reaches the best score.
optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
# X comes from an earlier cell; assumes its columns are in the same order as X_train — TODO confirm
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
# RFECV picks the feature count; RFE is then refitted with that count to produce the matrices.
print("\n=== RFECV Feature Selection with AdaBoost ===")
ab_estimator = AdaBoostClassifier(random_state=ADA_RANDOM_STATE)

rfecv = RFECV(estimator=ab_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=ab_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
# Fit a full PCA first to read the cumulative explained variance, then keep the smallest
# number of components whose cumulative ratio reaches the target.
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.90
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

# random_state only matters if scikit-learn picks a randomized SVD solver; harmless otherwise.
pca = PCA(n_components=n_components, random_state=ADA_RANDOM_STATE)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: AdaBoost + GridSearchCV
print("\n=== AdaBoost Model Performance with Hyperparameter Tuning ===")

# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases and presumably
# rejected by newer ones — confirm against the installed version before upgrading.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1, 1],
    'algorithm': ['SAMME', 'SAMME.R'],
    'estimator': [DecisionTreeClassifier(max_depth=d) for d in [1, 3, 5]]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning AdaBoost with {name} configuration...")
    ab = GridSearchCV(
        AdaBoostClassifier(random_state=ADA_RANDOM_STATE),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2
    )
    ab.fit(X_train_cfg, y_train_cfg)

    y_train_ab = ab.predict(X_train_cfg)
    y_test_ab = ab.predict(X_test_cfg)
    y_train_ab_proba = ab.predict_proba(X_train_cfg)
    y_test_ab_proba = ab.predict_proba(X_test_cfg)

    # Score each split exactly once; the "Test" row feeds both the printed table and storeResults.
    metric_rows = []
    for split_name, y_true, y_pred, y_proba in (
        ("Training", y_train_cfg, y_train_ab, y_train_ab_proba),
        ("Test", y_test, y_test_ab, y_test_ab_proba),
    ):
        metric_rows.append({
            "Dataset": split_name,
            "Accuracy": metrics.accuracy_score(y_true, y_pred),
            "F1 Score": metrics.f1_score(y_true, y_pred, average='macro'),
            "Recall": metrics.recall_score(y_true, y_pred, average='macro'),
            "Precision": metrics.precision_score(y_true, y_pred, average='macro'),
            # Labels are one-hot encoded to pair with the per-class probability columns;
            # assumes every class occurs in the split so the columns line up — TODO confirm.
            "AUC-ROC": metrics.roc_auc_score(
                pd.get_dummies(y_true), y_proba, multi_class='ovr', average='macro'
            ),
        })

    metrics_dict = {column: [row[column] for row in metric_rows] for column in metric_rows[0]}
    df_metrics = pd.DataFrame(metrics_dict)
    print("\nAdaBoost Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    test_row = metric_rows[-1]
    auc_score = test_row["AUC-ROC"]
    storeResults(
        'AdaBoost 90',
        name,
        test_row["Accuracy"],
        test_row["F1 Score"],
        test_row["Recall"],
        test_row["Precision"],
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(ab.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 25

=== RFECV Feature Selection with AdaBoost ===
Optimal number of features selected by RFECV: 11

=== PCA Dimensionality Reduction ===
Number of components that explain 90.0% variance: 10

=== AdaBoost Model Performance with Hyperparameter Tuning ===

Running AdaBoost with Original Data configuration...
Fitting 10 folds for each of 72 candidates, totalling 720 fits

AdaBoost Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.966825  0.860668 0.805082   0.942229 0.992096
Best hyperparameters found by GridSearchCV:
{'algorithm': 'SAMME', 'estimator': DecisionTreeClassifier(max_depth=3), 'learning_rate': 1, 'n_estimators': 200}

Running AdaBoost with Normalized Data configuration...
Fitting 10 folds for each of 72 candidates, totalling 720 fits

AdaBoost Model Performance Metrics
 Dataset  Ac

### AdaBoost with PCA (95% explained variance)

In [18]:
# Written by Ovi, 2025-07-07, AdaBoost classification with preprocessing and result logging
#
# Builds five feature configurations from the training split (raw, MinMax-normalized,
# SelectKBest, RFECV/RFE, PCA at 95% explained variance), tunes an AdaBoostClassifier
# on each with GridSearchCV, prints train/test metrics and logs the test metrics through
# storeResults under the model name 'AdaBoost 95'.
#
# NOTE(review): the scaler and SelectKBest are fitted on the whole training split before
# cross_val_score / GridSearchCV run, so the CV folds used to pick k and the
# hyperparameters have already been seen by those transforms; CV scores are optimistic.
# The test split is only ever transformed, never fitted on.

# Seed every estimator so that re-running the cell reproduces the same selected
# features, best hyperparameters and reported metrics.
ADA_RANDOM_STATE = 42

# Store different configurations as (name, train features, test features, train labels)
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data (scaler statistics come from the training split only)
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
# Try every k from 1 to the full feature count and score it with 5-fold CV accuracy.
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        AdaBoostClassifier(random_state=ADA_RANDOM_STATE),
        X_train_kbest,
        y_train,
        cv=5,
        scoring='accuracy',
    ).mean()
    scores.append(score)

# list.index returns the first maximum, i.e. the smallest k that reaches the best score.
optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
# X comes from an earlier cell; assumes its columns are in the same order as X_train — TODO confirm
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
# RFECV picks the feature count; RFE is then refitted with that count to produce the matrices.
print("\n=== RFECV Feature Selection with AdaBoost ===")
ab_estimator = AdaBoostClassifier(random_state=ADA_RANDOM_STATE)

rfecv = RFECV(estimator=ab_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=ab_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
# Fit a full PCA first to read the cumulative explained variance, then keep the smallest
# number of components whose cumulative ratio reaches the target.
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.95
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

# random_state only matters if scikit-learn picks a randomized SVD solver; harmless otherwise.
pca = PCA(n_components=n_components, random_state=ADA_RANDOM_STATE)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: AdaBoost + GridSearchCV
print("\n=== AdaBoost Model Performance with Hyperparameter Tuning ===")

# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases and presumably
# rejected by newer ones — confirm against the installed version before upgrading.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1, 1],
    'algorithm': ['SAMME', 'SAMME.R'],
    'estimator': [DecisionTreeClassifier(max_depth=d) for d in [1, 3, 5]]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning AdaBoost with {name} configuration...")
    ab = GridSearchCV(
        AdaBoostClassifier(random_state=ADA_RANDOM_STATE),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2
    )
    ab.fit(X_train_cfg, y_train_cfg)

    y_train_ab = ab.predict(X_train_cfg)
    y_test_ab = ab.predict(X_test_cfg)
    y_train_ab_proba = ab.predict_proba(X_train_cfg)
    y_test_ab_proba = ab.predict_proba(X_test_cfg)

    # Score each split exactly once; the "Test" row feeds both the printed table and storeResults.
    metric_rows = []
    for split_name, y_true, y_pred, y_proba in (
        ("Training", y_train_cfg, y_train_ab, y_train_ab_proba),
        ("Test", y_test, y_test_ab, y_test_ab_proba),
    ):
        metric_rows.append({
            "Dataset": split_name,
            "Accuracy": metrics.accuracy_score(y_true, y_pred),
            "F1 Score": metrics.f1_score(y_true, y_pred, average='macro'),
            "Recall": metrics.recall_score(y_true, y_pred, average='macro'),
            "Precision": metrics.precision_score(y_true, y_pred, average='macro'),
            # Labels are one-hot encoded to pair with the per-class probability columns;
            # assumes every class occurs in the split so the columns line up — TODO confirm.
            "AUC-ROC": metrics.roc_auc_score(
                pd.get_dummies(y_true), y_proba, multi_class='ovr', average='macro'
            ),
        })

    metrics_dict = {column: [row[column] for row in metric_rows] for column in metric_rows[0]}
    df_metrics = pd.DataFrame(metrics_dict)
    print("\nAdaBoost Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    test_row = metric_rows[-1]
    auc_score = test_row["AUC-ROC"]
    storeResults(
        'AdaBoost 95',
        name,
        test_row["Accuracy"],
        test_row["F1 Score"],
        test_row["Recall"],
        test_row["Precision"],
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(ab.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 25

=== RFECV Feature Selection with AdaBoost ===
Optimal number of features selected by RFECV: 11

=== PCA Dimensionality Reduction ===
Number of components that explain 95.0% variance: 10

=== AdaBoost Model Performance with Hyperparameter Tuning ===

Running AdaBoost with Original Data configuration...
Fitting 10 folds for each of 72 candidates, totalling 720 fits

AdaBoost Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.957346  0.790443 0.693182   0.985075 0.984545
Best hyperparameters found by GridSearchCV:
{'algorithm': 'SAMME', 'estimator': DecisionTreeClassifier(max_depth=3), 'learning_rate': 1, 'n_estimators': 100}

Running AdaBoost with Normalized Data configuration...
Fitting 10 folds for each of 72 candidates, totalling 720 fits

AdaBoost Model Performance Metrics
 Dataset  Ac

### AdaBoost with PCA (99% explained variance)

In [19]:
# Written by Ovi, 2025-07-07, AdaBoost classification with preprocessing and result logging
#
# Builds five feature configurations from the training split (raw, MinMax-normalized,
# SelectKBest, RFECV/RFE, PCA at 99% explained variance), tunes an AdaBoostClassifier
# on each with GridSearchCV, prints train/test metrics and logs the test metrics through
# storeResults under the model name 'AdaBoost 99'.
#
# NOTE(review): the scaler and SelectKBest are fitted on the whole training split before
# cross_val_score / GridSearchCV run, so the CV folds used to pick k and the
# hyperparameters have already been seen by those transforms; CV scores are optimistic.
# The test split is only ever transformed, never fitted on.

# Seed every estimator so that re-running the cell reproduces the same selected
# features, best hyperparameters and reported metrics.
ADA_RANDOM_STATE = 42

# Store different configurations as (name, train features, test features, train labels)
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data (scaler statistics come from the training split only)
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
# Try every k from 1 to the full feature count and score it with 5-fold CV accuracy.
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        AdaBoostClassifier(random_state=ADA_RANDOM_STATE),
        X_train_kbest,
        y_train,
        cv=5,
        scoring='accuracy',
    ).mean()
    scores.append(score)

# list.index returns the first maximum, i.e. the smallest k that reaches the best score.
optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
# X comes from an earlier cell; assumes its columns are in the same order as X_train — TODO confirm
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
# RFECV picks the feature count; RFE is then refitted with that count to produce the matrices.
print("\n=== RFECV Feature Selection with AdaBoost ===")
ab_estimator = AdaBoostClassifier(random_state=ADA_RANDOM_STATE)

rfecv = RFECV(estimator=ab_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=ab_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
# Fit a full PCA first to read the cumulative explained variance, then keep the smallest
# number of components whose cumulative ratio reaches the target.
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.99
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

# random_state only matters if scikit-learn picks a randomized SVD solver; harmless otherwise.
pca = PCA(n_components=n_components, random_state=ADA_RANDOM_STATE)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: AdaBoost + GridSearchCV
print("\n=== AdaBoost Model Performance with Hyperparameter Tuning ===")

# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases and presumably
# rejected by newer ones — confirm against the installed version before upgrading.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1, 1],
    'algorithm': ['SAMME', 'SAMME.R'],
    'estimator': [DecisionTreeClassifier(max_depth=d) for d in [1, 3, 5]]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning AdaBoost with {name} configuration...")
    ab = GridSearchCV(
        AdaBoostClassifier(random_state=ADA_RANDOM_STATE),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2
    )
    ab.fit(X_train_cfg, y_train_cfg)

    y_train_ab = ab.predict(X_train_cfg)
    y_test_ab = ab.predict(X_test_cfg)
    y_train_ab_proba = ab.predict_proba(X_train_cfg)
    y_test_ab_proba = ab.predict_proba(X_test_cfg)

    # Score each split exactly once; the "Test" row feeds both the printed table and storeResults.
    metric_rows = []
    for split_name, y_true, y_pred, y_proba in (
        ("Training", y_train_cfg, y_train_ab, y_train_ab_proba),
        ("Test", y_test, y_test_ab, y_test_ab_proba),
    ):
        metric_rows.append({
            "Dataset": split_name,
            "Accuracy": metrics.accuracy_score(y_true, y_pred),
            "F1 Score": metrics.f1_score(y_true, y_pred, average='macro'),
            "Recall": metrics.recall_score(y_true, y_pred, average='macro'),
            "Precision": metrics.precision_score(y_true, y_pred, average='macro'),
            # Labels are one-hot encoded to pair with the per-class probability columns;
            # assumes every class occurs in the split so the columns line up — TODO confirm.
            "AUC-ROC": metrics.roc_auc_score(
                pd.get_dummies(y_true), y_proba, multi_class='ovr', average='macro'
            ),
        })

    metrics_dict = {column: [row[column] for row in metric_rows] for column in metric_rows[0]}
    df_metrics = pd.DataFrame(metrics_dict)
    print("\nAdaBoost Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    test_row = metric_rows[-1]
    auc_score = test_row["AUC-ROC"]
    storeResults(
        'AdaBoost 99',
        name,
        test_row["Accuracy"],
        test_row["F1 Score"],
        test_row["Recall"],
        test_row["Precision"],
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(ab.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 25

=== RFECV Feature Selection with AdaBoost ===
Optimal number of features selected by RFECV: 11

=== PCA Dimensionality Reduction ===
Number of components that explain 99.0% variance: 11

=== AdaBoost Model Performance with Hyperparameter Tuning ===

Running AdaBoost with Original Data configuration...
Fitting 10 folds for each of 72 candidates, totalling 720 fits

AdaBoost Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.957346  0.804293 0.721749   0.938981 0.990977
Best hyperparameters found by GridSearchCV:
{'algorithm': 'SAMME', 'estimator': DecisionTreeClassifier(max_depth=3), 'learning_rate': 1, 'n_estimators': 200}

Running AdaBoost with Normalized Data configuration...
Fitting 10 folds for each of 72 candidates, totalling 720 fits

AdaBoost Model Performance Metrics
 Dataset  Ac

---

# XGBoost

### XGBoost with PCA (90% explained variance)

In [22]:
# Written by Ovi, 2025-07-07, XGBoost classification with preprocessing and result logging
#
# Derives five feature configurations from the training split (raw, MinMax-normalized,
# SelectKBest, RFECV/RFE, PCA at 90% explained variance), grid-searches an XGBClassifier
# on each, prints train/test metrics and records the test metrics via storeResults
# under the model name 'XGBoost 90'.

# Each entry: (configuration name, train features, test features, train labels)
configurations = [('Original Data', X_train, X_test, y_train)]

# Step 2: Normalize the data -- statistics are learned from the training split only
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
# Sweep k over 1..n_features and record the mean 5-fold accuracy for each k.
print("\n=== SelectKBest Feature Selection ===")
n_total_features = X_train.shape[1]
# cross_val_score clones the estimator for every fold, so one template instance is enough.
cv_template = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
scores = []
for k in range(1, n_total_features + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    fold_accuracies = cross_val_score(cv_template, X_train_kbest, y_train, cv=5, scoring='accuracy')
    score = fold_accuracies.mean()
    scores.append(score)

# First position of the best score -> smallest k achieving it (positions are 0-based).
best_score = max(scores)
optimal_k = scores.index(best_score) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
# X is defined in an earlier cell; presumably its columns match X_train's order — verify
kbest_mask = kbest.get_support()
selected_features_kbest = X.columns[kbest_mask]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
# RFECV decides how many features to keep; a plain RFE with that count does the selection.
print("\n=== RFECV Feature Selection with XGBoost ===")
xgb_estimator = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

rfecv = RFECV(
    estimator=xgb_estimator,
    step=1,
    cv=StratifiedKFold(5),
    scoring='accuracy',
)
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=xgb_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
rfe_mask = rfe.get_support()
selected_features_rfe = selected_features_kbest[rfe_mask]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
# A full PCA gives the cumulative explained-variance curve; the first position at or
# above the target (plus one, since positions are 0-based) is the component count.
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = pca.explained_variance_ratio_.cumsum()
desired_variance = 0.90
reaches_target = cumulative_variance >= desired_variance
n_components = np.argmax(reaches_target) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: XGBoost + GridSearchCV
print("\n=== XGBoost Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}

# NOTE(review): the loop variable `xgb` rebinds the alias that the import cell gives to the
# xgboost module (`import xgboost as xgb`). The name is kept because later cells may read
# it; consider renaming it (e.g. `xgb_search`) across the notebook.
for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning XGBoost with {name} configuration...")
    xgb = GridSearchCV(
        XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2,
    )
    xgb.fit(X_train_cfg, y_train_cfg)

    y_train_xgb = xgb.predict(X_train_cfg)
    y_test_xgb = xgb.predict(X_test_cfg)
    y_train_xgb_proba = xgb.predict_proba(X_train_cfg)
    y_test_xgb_proba = xgb.predict_proba(X_test_cfg)

    # One pass per split; metric functions are deterministic, so the test row can be
    # reused for both the printed table and the storeResults call.
    split_inputs = (
        ("Training", y_train_cfg, y_train_xgb, y_train_xgb_proba),
        ("Test", y_test, y_test_xgb, y_test_xgb_proba),
    )
    metrics_dict = {
        "Dataset": [],
        "Accuracy": [],
        "F1 Score": [],
        "Recall": [],
        "Precision": [],
        "AUC-ROC": [],
    }
    for split_name, y_true, y_pred, y_proba in split_inputs:
        metrics_dict["Dataset"].append(split_name)
        metrics_dict["Accuracy"].append(metrics.accuracy_score(y_true, y_pred))
        metrics_dict["F1 Score"].append(metrics.f1_score(y_true, y_pred, average='macro'))
        metrics_dict["Recall"].append(metrics.recall_score(y_true, y_pred, average='macro'))
        metrics_dict["Precision"].append(metrics.precision_score(y_true, y_pred, average='macro'))
        # Labels are one-hot encoded to pair with the per-class probability columns.
        metrics_dict["AUC-ROC"].append(
            metrics.roc_auc_score(pd.get_dummies(y_true), y_proba, multi_class='ovr', average='macro')
        )

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nXGBoost Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    # Index -1 is the "Test" entry appended last in the loop above.
    auc_score = metrics_dict["AUC-ROC"][-1]
    storeResults(
        'XGBoost 90',
        name,
        metrics_dict["Accuracy"][-1],
        metrics_dict["F1 Score"][-1],
        metrics_dict["Recall"][-1],
        metrics_dict["Precision"][-1],
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(xgb.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 15

=== RFECV Feature Selection with XGBoost ===
Optimal number of features selected by RFECV: 15

=== PCA Dimensionality Reduction ===
Number of components that explain 90.0% variance: 13

=== XGBoost Model Performance with Hyperparameter Tuning ===

Running XGBoost with Original Data configuration...
Fitting 10 folds for each of 162 candidates, totalling 1620 fits

XGBoost Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.952607  0.755429 0.651515   0.983498 0.986044
Best hyperparameters found by GridSearchCV:
{'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.6}

Running XGBoost with Normalized Data configuration...
Fitting 10 folds for each of 162 candidates, totalling 1620 fits

XGBoost Model Performance Metrics
 Dataset  Accuracy  F1 Scor

### XGBoost with PCA (95% explained variance)

In [23]:
# Written by Ovi, 2025-07-07, XGBoost classification with preprocessing and result logging
#
# Builds five views of the train/test split (original, min-max normalized,
# SelectKBest, RFECV, PCA at 95% explained variance), tunes an XGBoost
# classifier on each with GridSearchCV, prints train/test metrics and logs the
# test metrics through storeResults() under the model name 'XGBoost 95'.

# Store different configurations as (name, X_train, X_test, y_train) tuples
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
# The scaler is fitted on the training split only and then applied to the test split.
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
# Try every k from 1 to the number of features and keep the k with the best
# mean 5-fold CV accuracy.
# NOTE(review): the selector is fitted on the whole training split before
# cross-validation, so the CV score used to choose k is somewhat optimistic.
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
        X_train_kbest, y_train, cv=5, scoring='accuracy'
    ).mean()
    scores.append(score)

# scores[i] belongs to k = i + 1; index() returns the smallest k that reaches the maximum
optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
# RFECV picks the number of features; RFE is then refitted with that number on
# the SelectKBest output to produce the reduced train/test matrices.
print("\n=== RFECV Feature Selection with XGBoost ===")
xgb_estimator = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

rfecv = RFECV(estimator=xgb_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=xgb_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
# Fit a full PCA first to read the cumulative explained variance, then keep the
# smallest number of components whose cumulative variance reaches the target.
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.95
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: XGBoost + GridSearchCV
print("\n=== XGBoost Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning XGBoost with {name} configuration...")
    # Named xgb_search (not xgb) so this cell does not overwrite the `xgb`
    # alias of the xgboost module imported at the top of the notebook.
    xgb_search = GridSearchCV(
        XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2
    )
    xgb_search.fit(X_train_cfg, y_train_cfg)

    # Predictions come from the best candidate, which GridSearchCV refits by default.
    y_train_xgb = xgb_search.predict(X_train_cfg)
    y_test_xgb = xgb_search.predict(X_test_cfg)
    y_train_xgb_proba = xgb_search.predict_proba(X_train_cfg)
    y_test_xgb_proba = xgb_search.predict_proba(X_test_cfg)

    # Test metrics are computed once and reused for the table and for storeResults().
    test_accuracy = metrics.accuracy_score(y_test, y_test_xgb)
    test_f1 = metrics.f1_score(y_test, y_test_xgb, average='macro')
    test_recall = metrics.recall_score(y_test, y_test_xgb, average='macro')
    test_precision = metrics.precision_score(y_test, y_test_xgb, average='macro')
    # One-hot encode the labels so they line up with the per-class probability columns.
    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_xgb_proba, multi_class='ovr', average='macro')

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_xgb),
            test_accuracy,
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_xgb, average='macro'),
            test_f1,
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_xgb, average='macro'),
            test_recall,
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_xgb, average='macro'),
            test_precision,
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_xgb_proba, multi_class='ovr', average='macro'),
            auc_score,
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nXGBoost Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    storeResults(
        'XGBoost 95',
        name,
        test_accuracy,
        test_f1,
        test_recall,
        test_precision,
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(xgb_search.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 15

=== RFECV Feature Selection with XGBoost ===
Optimal number of features selected by RFECV: 15

=== PCA Dimensionality Reduction ===
Number of components that explain 95.0% variance: 14

=== XGBoost Model Performance with Hyperparameter Tuning ===

Running XGBoost with Original Data configuration...
Fitting 10 folds for each of 162 candidates, totalling 1620 fits

XGBoost Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.952607  0.755429 0.651515   0.983498 0.986044
Best hyperparameters found by GridSearchCV:
{'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.6}

Running XGBoost with Normalized Data configuration...
Fitting 10 folds for each of 162 candidates, totalling 1620 fits

XGBoost Model Performance Metrics
 Dataset  Accuracy  F1 Scor

### XGBoost with PCA 99

In [24]:
# Written by Ovi, 2025-07-07, XGBoost classification with preprocessing and result logging
#
# Builds five views of the train/test split (original, min-max normalized,
# SelectKBest, RFECV, PCA at 99% explained variance), tunes an XGBoost
# classifier on each with GridSearchCV, prints train/test metrics and logs the
# test metrics through storeResults() under the model name 'XGBoost 99'.

# Store different configurations as (name, X_train, X_test, y_train) tuples
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
# The scaler is fitted on the training split only and then applied to the test split.
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
# Try every k from 1 to the number of features and keep the k with the best
# mean 5-fold CV accuracy.
# NOTE(review): the selector is fitted on the whole training split before
# cross-validation, so the CV score used to choose k is somewhat optimistic.
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
        X_train_kbest, y_train, cv=5, scoring='accuracy'
    ).mean()
    scores.append(score)

# scores[i] belongs to k = i + 1; index() returns the smallest k that reaches the maximum
optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
# RFECV picks the number of features; RFE is then refitted with that number on
# the SelectKBest output to produce the reduced train/test matrices.
print("\n=== RFECV Feature Selection with XGBoost ===")
xgb_estimator = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

rfecv = RFECV(estimator=xgb_estimator, step=1, cv=StratifiedKFold(5), scoring='accuracy')
rfecv.fit(X_train_kbest, y_train)

print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

rfe = RFE(estimator=xgb_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
# Fit a full PCA first to read the cumulative explained variance, then keep the
# smallest number of components whose cumulative variance reaches the target.
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.99
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: XGBoost + GridSearchCV
print("\n=== XGBoost Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning XGBoost with {name} configuration...")
    # Named xgb_search (not xgb) so this cell does not overwrite the `xgb`
    # alias of the xgboost module imported at the top of the notebook.
    xgb_search = GridSearchCV(
        XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2
    )
    xgb_search.fit(X_train_cfg, y_train_cfg)

    # Predictions come from the best candidate, which GridSearchCV refits by default.
    y_train_xgb = xgb_search.predict(X_train_cfg)
    y_test_xgb = xgb_search.predict(X_test_cfg)
    y_train_xgb_proba = xgb_search.predict_proba(X_train_cfg)
    y_test_xgb_proba = xgb_search.predict_proba(X_test_cfg)

    # Test metrics are computed once and reused for the table and for storeResults().
    test_accuracy = metrics.accuracy_score(y_test, y_test_xgb)
    test_f1 = metrics.f1_score(y_test, y_test_xgb, average='macro')
    test_recall = metrics.recall_score(y_test, y_test_xgb, average='macro')
    test_precision = metrics.precision_score(y_test, y_test_xgb, average='macro')
    # One-hot encode the labels so they line up with the per-class probability columns.
    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_xgb_proba, multi_class='ovr', average='macro')

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_xgb),
            test_accuracy,
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_xgb, average='macro'),
            test_f1,
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_xgb, average='macro'),
            test_recall,
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_xgb, average='macro'),
            test_precision,
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_xgb_proba, multi_class='ovr', average='macro'),
            auc_score,
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nXGBoost Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    storeResults(
        'XGBoost 99',
        name,
        test_accuracy,
        test_f1,
        test_recall,
        test_precision,
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(xgb_search.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 15

=== RFECV Feature Selection with XGBoost ===
Optimal number of features selected by RFECV: 15

=== PCA Dimensionality Reduction ===
Number of components that explain 99.0% variance: 15

=== XGBoost Model Performance with Hyperparameter Tuning ===

Running XGBoost with Original Data configuration...
Fitting 10 folds for each of 162 candidates, totalling 1620 fits

XGBoost Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  1.000000  1.000000 1.000000   1.000000 1.000000
    Test  0.952607  0.755429 0.651515   0.983498 0.986044
Best hyperparameters found by GridSearchCV:
{'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.6}

Running XGBoost with Normalized Data configuration...
Fitting 10 folds for each of 162 candidates, totalling 1620 fits

XGBoost Model Performance Metrics
 Dataset  Accuracy  F1 Scor

---

# Bagging

### Bagging classification with PCA 90

In [26]:
# Written by Ovi, 2025-07-07, Bagging classification with preprocessing and result logging
#
# Builds five views of the train/test split (original, min-max normalized,
# SelectKBest, RFECV, PCA at 90% explained variance), tunes a BaggingClassifier
# on each with GridSearchCV, prints train/test metrics and logs the test
# metrics through storeResults() under the model name 'Bagging 90'.
#
# Every BaggingClassifier is seeded (random_state=42, matching tree_estimator
# below). Without a seed the bootstrap draws differ on every run, so optimal_k
# and everything derived from it change between runs of this same cell.

# Store different configurations as (name, X_train, X_test, y_train) tuples
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
# The scaler is fitted on the training split only and then applied to the test split.
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
# Try every k from 1 to the number of features and keep the k with the best
# mean 5-fold CV accuracy.
# NOTE(review): the selector is fitted on the whole training split before
# cross-validation, so the CV score used to choose k is somewhat optimistic.
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        BaggingClassifier(estimator=DecisionTreeClassifier(), random_state=42),
        X_train_kbest, y_train, cv=5, scoring='accuracy'
    ).mean()
    scores.append(score)

# scores[i] belongs to k = i + 1; index() returns the smallest k that reaches the maximum
optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with Bagging ===")
# Use single DecisionTreeClassifier for RFECV to enable feature_importances_
tree_estimator = DecisionTreeClassifier(random_state=42)

rfecv = RFECV(
    estimator=tree_estimator,
    step=1,
    cv=StratifiedKFold(5),
    scoring='accuracy',
    n_jobs=-1
)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

# Refit RFE with the feature count RFECV chose to get the reduced train/test matrices.
rfe = RFE(estimator=tree_estimator, n_features_to_select=rfecv.n_features_)

X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
# Fit a full PCA first to read the cumulative explained variance, then keep the
# smallest number of components whose cumulative variance reaches the target.
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.90
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: BaggingClassifier + GridSearchCV
print("\n=== Bagging Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [100, 150, 200],
    'max_samples': [0.6, 0.8, 1.0],
    'max_features': [0.6, 0.8, 1.0],
    'bootstrap': [True],
    'bootstrap_features': [False],
    'estimator': [
        DecisionTreeClassifier(max_depth=3, min_samples_split=2),
        DecisionTreeClassifier(max_depth=5, min_samples_split=5),
        DecisionTreeClassifier(max_depth=None, min_samples_split=10)
    ]
}


for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Bagging with {name} configuration...")
    # Seeding the ensemble makes the resampling (and the seeds handed to the
    # base trees) repeatable, so the selected hyperparameters are reproducible.
    bag = GridSearchCV(
        BaggingClassifier(random_state=42),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2
    )
    bag.fit(X_train_cfg, y_train_cfg)

    # Predictions come from the best candidate, which GridSearchCV refits by default.
    y_train_bag = bag.predict(X_train_cfg)
    y_test_bag = bag.predict(X_test_cfg)
    y_train_bag_proba = bag.predict_proba(X_train_cfg)
    y_test_bag_proba = bag.predict_proba(X_test_cfg)

    # Test metrics are computed once and reused for the table and for storeResults().
    test_accuracy = metrics.accuracy_score(y_test, y_test_bag)
    test_f1 = metrics.f1_score(y_test, y_test_bag, average='macro')
    test_recall = metrics.recall_score(y_test, y_test_bag, average='macro')
    test_precision = metrics.precision_score(y_test, y_test_bag, average='macro')
    # One-hot encode the labels so they line up with the per-class probability columns.
    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_bag_proba, multi_class='ovr', average='macro')

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_bag),
            test_accuracy,
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_bag, average='macro'),
            test_f1,
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_bag, average='macro'),
            test_recall,
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_bag, average='macro'),
            test_precision,
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_bag_proba, multi_class='ovr', average='macro'),
            auc_score,
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nBagging Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    storeResults(
        'Bagging 90',
        name,
        test_accuracy,
        test_f1,
        test_recall,
        test_precision,
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(bag.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 22

=== RFECV Feature Selection with Bagging ===
Optimal number of features selected by RFECV: 1

=== PCA Dimensionality Reduction ===
Number of components that explain 90.0% variance: 1

=== Bagging Model Performance with Hyperparameter Tuning ===

Running Bagging with Original Data configuration...
Fitting 10 folds for each of 81 candidates, totalling 810 fits

Bagging Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.981013  0.916868 0.857639   0.993197 0.999923
    Test  0.933649  0.599386 0.530303   0.977346 0.954901
Best hyperparameters found by GridSearchCV:
{'bootstrap': True, 'bootstrap_features': False, 'estimator': DecisionTreeClassifier(min_samples_split=10), 'max_features': 0.8, 'max_samples': 1.0, 'n_estimators': 100}

Running Bagging with Normalized Data configuration...
Fitting 10 folds for each of 81 candidates, totalling 810 fits

### Bagging classification with PCA 95

In [27]:
# Written by Ovi, 2025-07-07, Bagging classification with preprocessing and result logging
#
# Builds five views of the train/test split (original, min-max normalized,
# SelectKBest, RFECV, PCA at 95% explained variance), tunes a BaggingClassifier
# on each with GridSearchCV, prints train/test metrics and logs the test
# metrics through storeResults() under the model name 'Bagging 95'.
#
# Every BaggingClassifier is seeded (random_state=42, matching tree_estimator
# below). Without a seed the bootstrap draws differ on every run, so optimal_k
# and everything derived from it change between runs of this same cell.

# Store different configurations as (name, X_train, X_test, y_train) tuples
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
# The scaler is fitted on the training split only and then applied to the test split.
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
# Try every k from 1 to the number of features and keep the k with the best
# mean 5-fold CV accuracy.
# NOTE(review): the selector is fitted on the whole training split before
# cross-validation, so the CV score used to choose k is somewhat optimistic.
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        BaggingClassifier(estimator=DecisionTreeClassifier(), random_state=42),
        X_train_kbest, y_train, cv=5, scoring='accuracy'
    ).mean()
    scores.append(score)

# scores[i] belongs to k = i + 1; index() returns the smallest k that reaches the maximum
optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with Bagging ===")
# Use single DecisionTreeClassifier for RFECV to enable feature_importances_
tree_estimator = DecisionTreeClassifier(random_state=42)

rfecv = RFECV(
    estimator=tree_estimator,
    step=1,
    cv=StratifiedKFold(5),
    scoring='accuracy',
    n_jobs=-1
)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

# Refit RFE with the feature count RFECV chose to get the reduced train/test matrices.
rfe = RFE(estimator=tree_estimator, n_features_to_select=rfecv.n_features_)

X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
# Fit a full PCA first to read the cumulative explained variance, then keep the
# smallest number of components whose cumulative variance reaches the target.
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.95
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: BaggingClassifier + GridSearchCV
print("\n=== Bagging Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [100, 150, 200],
    'max_samples': [0.6, 0.8, 1.0],
    'max_features': [0.6, 0.8, 1.0],
    'bootstrap': [True],
    'bootstrap_features': [False],
    'estimator': [
        DecisionTreeClassifier(max_depth=3, min_samples_split=2),
        DecisionTreeClassifier(max_depth=5, min_samples_split=5),
        DecisionTreeClassifier(max_depth=None, min_samples_split=10)
    ]
}


for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Bagging with {name} configuration...")
    # Seeding the ensemble makes the resampling (and the seeds handed to the
    # base trees) repeatable, so the selected hyperparameters are reproducible.
    bag = GridSearchCV(
        BaggingClassifier(random_state=42),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2
    )
    bag.fit(X_train_cfg, y_train_cfg)

    # Predictions come from the best candidate, which GridSearchCV refits by default.
    y_train_bag = bag.predict(X_train_cfg)
    y_test_bag = bag.predict(X_test_cfg)
    y_train_bag_proba = bag.predict_proba(X_train_cfg)
    y_test_bag_proba = bag.predict_proba(X_test_cfg)

    # Test metrics are computed once and reused for the table and for storeResults().
    test_accuracy = metrics.accuracy_score(y_test, y_test_bag)
    test_f1 = metrics.f1_score(y_test, y_test_bag, average='macro')
    test_recall = metrics.recall_score(y_test, y_test_bag, average='macro')
    test_precision = metrics.precision_score(y_test, y_test_bag, average='macro')
    # One-hot encode the labels so they line up with the per-class probability columns.
    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_bag_proba, multi_class='ovr', average='macro')

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_bag),
            test_accuracy,
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_bag, average='macro'),
            test_f1,
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_bag, average='macro'),
            test_recall,
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_bag, average='macro'),
            test_precision,
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_bag_proba, multi_class='ovr', average='macro'),
            auc_score,
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nBagging Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    storeResults(
        'Bagging 95',
        name,
        test_accuracy,
        test_f1,
        test_recall,
        test_precision,
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(bag.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 10

=== RFECV Feature Selection with Bagging ===
Optimal number of features selected by RFECV: 2

=== PCA Dimensionality Reduction ===
Number of components that explain 95.0% variance: 2

=== Bagging Model Performance with Hyperparameter Tuning ===

Running Bagging with Original Data configuration...
Fitting 10 folds for each of 81 candidates, totalling 810 fits

Bagging Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.958861  0.794992 0.704861   0.985604 0.998462
    Test  0.933649  0.599386 0.530303   0.977346 0.967648
Best hyperparameters found by GridSearchCV:
{'bootstrap': True, 'bootstrap_features': False, 'estimator': DecisionTreeClassifier(max_depth=5, min_samples_split=5), 'max_features': 0.6, 'max_samples': 1.0, 'n_estimators': 150}

Running Bagging with Normalized Data configuration...
Fitting 10 folds for each of 81 candidates, totall

### Bagging classification with PCA 99

In [28]:
# Written by Ovi, 2025-07-07, Bagging classification with preprocessing and result logging
#
# Builds five views of the train/test split (original, min-max normalized,
# SelectKBest, RFECV, PCA at 99% explained variance), tunes a BaggingClassifier
# on each with GridSearchCV, prints train/test metrics and logs the test
# metrics through storeResults() under the model name 'Bagging 99'.
#
# Every BaggingClassifier is seeded (random_state=42, matching tree_estimator
# below). Without a seed the bootstrap draws differ on every run, so optimal_k
# and everything derived from it change between runs of this same cell.

# Store different configurations as (name, X_train, X_test, y_train) tuples
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data
# The scaler is fitted on the training split only and then applied to the test split.
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Step 3.1: SelectKBest
# Try every k from 1 to the number of features and keep the k with the best
# mean 5-fold CV accuracy.
# NOTE(review): the selector is fitted on the whole training split before
# cross-validation, so the CV score used to choose k is somewhat optimistic.
print("\n=== SelectKBest Feature Selection ===")
scores = []
for k in range(1, X_train.shape[1] + 1):
    kbest = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
    score = cross_val_score(
        BaggingClassifier(estimator=DecisionTreeClassifier(), random_state=42),
        X_train_kbest, y_train, cv=5, scoring='accuracy'
    ).mean()
    scores.append(score)

# scores[i] belongs to k = i + 1; index() returns the smallest k that reaches the maximum
optimal_k = scores.index(max(scores)) + 1
print(f"Optimal number of features to select using SelectKBest: {optimal_k}")

kbest = SelectKBest(score_func=f_classif, k=optimal_k)
X_train_kbest = kbest.fit_transform(X_train_normalized, y_train)
X_test_kbest = kbest.transform(X_test_normalized)
selected_features_kbest = X.columns[kbest.get_support()]
configurations.append(('SelectKBest', X_train_kbest, X_test_kbest, y_train))

# Step 3.2: RFECV
print("\n=== RFECV Feature Selection with Bagging ===")
# Use single DecisionTreeClassifier for RFECV to enable feature_importances_
tree_estimator = DecisionTreeClassifier(random_state=42)

rfecv = RFECV(
    estimator=tree_estimator,
    step=1,
    cv=StratifiedKFold(5),
    scoring='accuracy',
    n_jobs=-1
)
rfecv.fit(X_train_kbest, y_train)
print(f"Optimal number of features selected by RFECV: {rfecv.n_features_}")

# Refit RFE with the feature count RFECV chose to get the reduced train/test matrices.
rfe = RFE(estimator=tree_estimator, n_features_to_select=rfecv.n_features_)
X_train_rfe = rfe.fit_transform(X_train_kbest, y_train)
X_test_rfe = rfe.transform(X_test_kbest)
selected_features_rfe = selected_features_kbest[rfe.get_support()]
configurations.append(('RFECV', X_train_rfe, X_test_rfe, y_train))

# Step 3.3: PCA
# Fit a full PCA first to read the cumulative explained variance, then keep the
# smallest number of components whose cumulative variance reaches the target.
print("\n=== PCA Dimensionality Reduction ===")
pca = PCA().fit(X_train_rfe)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
desired_variance = 0.99
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(f'Number of components that explain {desired_variance * 100}% variance: {n_components}')

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_rfe)
X_test_pca = pca.transform(X_test_rfe)
configurations.append(('PCA', X_train_pca, X_test_pca, y_train))

# Step 4: BaggingClassifier + GridSearchCV
print("\n=== Bagging Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': [100, 150, 200],
    'max_samples': [0.6, 0.8, 1.0],
    'max_features': [0.6, 0.8, 1.0],
    'bootstrap': [True],
    'bootstrap_features': [False],
    'estimator': [
        DecisionTreeClassifier(max_depth=3, min_samples_split=2),
        DecisionTreeClassifier(max_depth=5, min_samples_split=5),
        DecisionTreeClassifier(max_depth=None, min_samples_split=10)
    ]
}


for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Bagging with {name} configuration...")
    # Seeding the ensemble makes the resampling (and the seeds handed to the
    # base trees) repeatable, so the selected hyperparameters are reproducible.
    bag = GridSearchCV(
        BaggingClassifier(random_state=42),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2
    )
    bag.fit(X_train_cfg, y_train_cfg)

    # Predictions come from the best candidate, which GridSearchCV refits by default.
    y_train_bag = bag.predict(X_train_cfg)
    y_test_bag = bag.predict(X_test_cfg)
    y_train_bag_proba = bag.predict_proba(X_train_cfg)
    y_test_bag_proba = bag.predict_proba(X_test_cfg)

    # Test metrics are computed once and reused for the table and for storeResults().
    test_accuracy = metrics.accuracy_score(y_test, y_test_bag)
    test_f1 = metrics.f1_score(y_test, y_test_bag, average='macro')
    test_recall = metrics.recall_score(y_test, y_test_bag, average='macro')
    test_precision = metrics.precision_score(y_test, y_test_bag, average='macro')
    # One-hot encode the labels so they line up with the per-class probability columns.
    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_bag_proba, multi_class='ovr', average='macro')

    metrics_dict = {
        "Dataset": ["Training", "Test"],
        "Accuracy": [
            metrics.accuracy_score(y_train_cfg, y_train_bag),
            test_accuracy,
        ],
        "F1 Score": [
            metrics.f1_score(y_train_cfg, y_train_bag, average='macro'),
            test_f1,
        ],
        "Recall": [
            metrics.recall_score(y_train_cfg, y_train_bag, average='macro'),
            test_recall,
        ],
        "Precision": [
            metrics.precision_score(y_train_cfg, y_train_bag, average='macro'),
            test_precision,
        ],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_bag_proba, multi_class='ovr', average='macro'),
            auc_score,
        ]
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nBagging Model Performance Metrics")
    print(df_metrics.to_string(index=False))

    storeResults(
        'Bagging 99',
        name,
        test_accuracy,
        test_f1,
        test_recall,
        test_precision,
        auc_score
    )

    print("Best hyperparameters found by GridSearchCV:")
    print(bag.best_params_)



=== SelectKBest Feature Selection ===
Optimal number of features to select using SelectKBest: 13

=== RFECV Feature Selection with Bagging ===
Optimal number of features selected by RFECV: 1

=== PCA Dimensionality Reduction ===
Number of components that explain 99.0% variance: 1

=== Bagging Model Performance with Hyperparameter Tuning ===

Running Bagging with Original Data configuration...
Fitting 10 folds for each of 81 candidates, totalling 810 fits

Bagging Model Performance Metrics
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.957278  0.783941 0.694444   0.985075 0.998444
    Test  0.928910  0.558176 0.488636   0.975845 0.927392
Best hyperparameters found by GridSearchCV:
{'bootstrap': True, 'bootstrap_features': False, 'estimator': DecisionTreeClassifier(max_depth=5, min_samples_split=5), 'max_features': 1.0, 'max_samples': 0.6, 'n_estimators': 100}

Running Bagging with Normalized Data configuration...
Fitting 10 folds for each of 81 candidates, totall

---

# Results

In [29]:
# Build, rank and save the summary table from the lists filled by storeResults().
#
# Outputs (all written under results/):
#   model_results.csv         - one row per (ML Model, Configuration)
#   sorted_model_results.csv  - same rows ranked by F1 Score, then Accuracy
#   top_configurations.csv    - best-ranked row for each ML Model
import os  # local import: only needed here to make sure the output folder exists


def _percent_to_float(column):
    """Turn a Series of strings such as '92.171%' into floats for numeric sorting."""
    return column.str.rstrip('%').astype(float)


# to_csv does not create missing folders, so create results/ up front.
os.makedirs('results', exist_ok=True)

# Creating the dataframe (metrics are rendered as percentage strings, 3 decimals)
result = pd.DataFrame({
    'ML Model': ML_Model,
    'Configuration': ML_Config,
    'Accuracy': [f"{acc * 100:.3f}%" for acc in accuracy],
    'F1 Score': [f"{f1 * 100:.3f}%" for f1 in f1_score],
    'Recall': [f"{rec * 100:.3f}%" for rec in recall],
    'Precision': [f"{prec * 100:.3f}%" for prec in precision],
    'ROC_AUC': [f"{roc * 100:.3f}%" for roc in auc_roc],
})

# Remove duplicates based on model and configuration (keeps the first occurrence,
# e.g. when a training cell was executed more than once)
result = result.drop_duplicates(subset=["ML Model", "Configuration"])

# Display the result
print("\n" + "=" * 100)
print("MODEL PERFORMANCE RESULTS")
print("=" * 100)
print(result.to_string(index=False))

# Save the result to a CSV file
result.to_csv('results/model_results.csv', index=False)
print("\nResults saved to model_results.csv")

# Sort by F1 Score, then Accuracy (both descending).
# The columns hold strings, so they are converted to floats for the comparison;
# a plain string sort would rank "100.000%" below "99.052%".
sorted_result = result.sort_values(
    by=['F1 Score', 'Accuracy'],
    ascending=False,
    key=_percent_to_float,
).reset_index(drop=True)

# Display the sorted result
print("\n" + "=" * 100)
print("SORTED MODEL PERFORMANCE RESULTS (by Accuracy and F1 Score)")
print("=" * 100)
print(sorted_result.to_string(index=False))

# Save the sorted result
sorted_result.to_csv('results/sorted_model_results.csv', index=False)
print("\nSorted results saved to sorted_model_results.csv")

# Extract top configuration per ML model: rows are already ranked, so the first
# row of each group is that model's best configuration.
top_per_model = sorted_result.groupby('ML Model', as_index=False).first()

# Display and save the top configuration table
print("\n" + "=" * 100)
print("TOP CONFIGURATION PER MODEL")
print("=" * 100)
print(top_per_model.to_string(index=False))

top_per_model.to_csv('results/top_configurations.csv', index=False)
print("\nTop configuration per model saved to top_configurations.csv")



MODEL PERFORMANCE RESULTS
                 ML Model   Configuration Accuracy F1 Score  Recall Precision ROC_AUC
Support Vector Machine 90   Original Data  98.104%  92.171% 86.742%   99.320% 99.754%
Support Vector Machine 90 Normalized Data  98.104%  92.171% 86.742%   99.320% 99.692%
Support Vector Machine 90     SelectKBest  98.104%  92.171% 86.742%   99.320% 99.664%
Support Vector Machine 90           RFECV  98.104%  92.171% 86.742%   99.320% 99.646%
Support Vector Machine 90             PCA  99.052%  96.494% 93.939%   99.656% 99.982%
Support Vector Machine 95   Original Data  98.104%  92.171% 86.742%   99.320% 99.745%
Support Vector Machine 95 Normalized Data  98.104%  92.171% 86.742%   99.320% 99.710%
Support Vector Machine 95     SelectKBest  98.104%  92.171% 86.742%   99.320% 99.646%
Support Vector Machine 95           RFECV  98.104%  92.171% 86.742%   99.320% 99.646%
Support Vector Machine 95             PCA  98.578%  94.186% 89.773%   99.487% 99.644%
Support Vector Machine 99  

In [30]:
import pandas as pd

# Rank the per-model top configurations and save them as a separate CSV.

# Read input CSV
df = pd.read_csv('results/top_configurations.csv')

# Sort by F1 Score, then Accuracy, in descending order.
# The metric columns are written as strings such as "96.494%", so they are
# converted to numbers for the comparison; a plain string sort would rank
# "100.000%" below "99.052%". astype(str) keeps this working if the columns
# are ever stored as plain numbers.
df_sorted = df.sort_values(
    by=['F1 Score', 'Accuracy'],
    ascending=False,
    key=lambda column: pd.to_numeric(column.astype(str).str.rstrip('%')),
)

# Save the sorted DataFrame to a new CSV
df_sorted.to_csv('results/sorted_top_configurations.csv', index=False)

---

# END