# Installing dependencies

In [1]:
!pip install boruta category_encoders xgboost catboost
!pip uninstall -y scikit-learn imbalanced-learn

!pip install scikit-learn==1.4.2 imbalanced-learn==0.12.0


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler, SMOTENC
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.under_sampling import CondensedNearestNeighbour, TomekLinks, RandomUnderSampler
from boruta import BorutaPy
from keras.models import Model, Sequential
from keras.layers import Input, Dense
from keras.optimizers import Adam

import warnings
warnings.filterwarnings('ignore')

from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, NearMiss
from imblearn.under_sampling import TomekLinks

from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, NearMiss, TomekLinks
from imblearn.combine import SMOTETomek, SMOTEENN

Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2
Found existing installation: imbalanced-learn 0.13.0
Uninstalling imbalanced-learn-0.13.0:
  Successfully uninstalled imbalanced-learn-0.13.0
Collecting scikit-learn==1.4.2
  Downloading scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting imbalanced-learn==0.12.0
  Downloading imbalanced_learn-0.12.0-py3-none-any.whl.metadata (8.2 kB)
Downloading scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading imbalanced_learn-0.12.0-py3-none-any.whl (257 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.7/257.7 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, imbal

2025-10-11 19:08:19.004487: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760209699.275792      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760209699.346753      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Data preprocessing



### Data Preprocessing & Feature Engineering

- Loaded the **Sleep Health & Lifestyle** dataset and filled missing values with `"None"`.
- Split **Blood Pressure** into numeric `Systolic BP` and `Diastolic BP`, then dropped `Person ID` and the original column.
- Grouped rare occupations (`Manager`, `Sales Representative`, `Scientist`, `Software Engineer`) into **"Other"**.
- Converted **BMI Category** labels into numeric averages:
  - `Normal` / `Normal Weight` → `22`
  - `Overweight → 27`
  - `Obese → 30`
- Created new interaction and ratio features:
  - `Stress_sleep_interaction = Stress Level / Quality of Sleep`
  - `BMI_Activity = BMI Category * Physical Activity Level`
  - `Sleep_Heart_ratio = Sleep Duration / Heart Rate`
  - `Sleep_Steps_ratio = Sleep Duration / Daily Steps`
  - `Sleep_Stress_ratio = Sleep Duration / Stress Level`
- One-hot encoded **Occupation** and label-encoded **Gender** and **Sleep Disorder**.
- Defined:
  - `X` = all feature columns
  - `y` = target (`Sleep Disorder`)
- Performed an **80/20 stratified train-test split** using `random_state=42` for reproducibility.


In [2]:
# ==============================
# Combined Feature Engineering Pipeline
# ==============================
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split




# ---- Load ------------------------------------------------------------------
df = pd.read_csv("/kaggle/input/sleep-health-and-lifestyle-dataset/Sleep_health_and_lifestyle_dataset.csv")

# Every missing cell becomes the literal string "None".
df = df.fillna("None")

# ---- Blood pressure: "systolic/diastolic" text -> two integer columns -------
bp_split = df['Blood Pressure'].str.split('/', expand=True).astype(int)
df['Systolic BP'], df['Diastolic BP'] = bp_split[0], bp_split[1]
df = df.drop(columns=['Person ID', 'Blood Pressure'])

# ---- Occupation: fold the listed occupations into a single "Other" bucket ---
grouped_occupations = ['Manager', 'Sales Representative', 'Scientist', 'Software Engineer']
df['Occupation'] = df['Occupation'].replace(grouped_occupations, 'Other')

# ---- BMI category: replace each label with one representative numeric value --
bmi_values = {'Normal': 22, 'Normal Weight': 22, 'Overweight': 27, 'Obese': 30}
df['BMI Category'] = df['BMI Category'].replace(bmi_values)

# ---- Interaction / ratio features (column order is kept as-is) --------------
sleep_duration = df['Sleep Duration']
df['Stress_sleep_interaction'] = df['Stress Level'].div(df['Quality of Sleep'])
df['BMI_Activity'] = df['BMI Category'].mul(df['Physical Activity Level'])
df['Sleep_Heart_ratio'] = sleep_duration.div(df['Heart Rate'])
df['Sleep_Steps_ratio'] = sleep_duration.div(df['Daily Steps'])
df['Sleep_Stress_ratio'] = sleep_duration.div(df['Stress Level'])

# ---- Encoding ---------------------------------------------------------------
# Occupation is one-hot encoded (all levels kept).
df = pd.get_dummies(df, columns=['Occupation'], drop_first=False)

# Gender and the target are integer-encoded with one shared LabelEncoder; after
# the loop the encoder stays fitted on the last column ('Sleep Disorder').
label_encoder = LabelEncoder()
columns = ['Gender', 'Sleep Disorder']
for col in columns:
    df[col] = label_encoder.fit_transform(df[col])

# ---- Features / target and the stratified 80/20 split -----------------------
target_column = 'Sleep Disorder'
y = df[target_column]
X = df.drop(columns=[target_column])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)



# Preparing Pipeline 

**Prereqs:** `X_train, X_test, y_train, y_test`

---

### 1) Boruta → AE → SMOTE+Tomek
- Scale: `MinMaxScaler()`
- Boruta: `RandomForestClassifier(n_estimators=100, random_state=42)` + `BorutaPy(n_estimators='auto', verbose=0, random_state=42)`
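- Autoencoder: Dense `32 → 16(bottleneck) → 32 → n_features`, `Adam(learning_rate=0.001)`, `loss='mse'`, `epochs=10`, `batch_size=32`, `verbose=0`; the 16-dimensional encoding is stored in `X_train_encoded` / `X_test_encoded` but is not passed to the balancing step, which resamples the Boruta-selected features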
- Balance: `SMOTETomek(random_state=42)`

---

### 2) PowerTransformer → SMOTE
- Transform: `PowerTransformer(method='yeo-johnson')`
- Balance: `SMOTE(random_state=42, k_neighbors=5)`

---

### 3) RobustScaler → MI(k=10) → SMOTE+Tomek
- Scale: `RobustScaler()`
- Select: `SelectKBest(mutual_info_classif, k=10)`
- Balance: `SMOTETomek(random_state=42)`

---

### 4) MI(k=5) → LDA
- Select: `SelectKBest(mutual_info_classif, k=5)` on the `RobustScaler()`-scaled features
- Reduce: `LinearDiscriminantAnalysis(n_components=2)`

---

### 5) MI only
- Select: `SelectKBest(mutual_info_classif, k=10)` on the `RobustScaler()`-scaled features

**Note:** Resampling applies to **train** only; test is copied unchanged.



In [3]:
# ==============================================================
# IMPORTS (no logic changes, just what's needed)
# ==============================================================
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, RobustScaler, PowerTransformer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE

from boruta import BorutaPy

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam


# ==============================================================
# CONFIG 1 — Boruta + SMOTE + Tomek
# (keeps your variables: X_train_boruta_smotetomek, y_train_boruta_smotetomek, etc.)
# ==============================================================

# Min-max scale the features before Boruta. The scaler is fitted on the training
# split only and then applied unchanged to the test split.
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized  = scaler.transform(X_test)

# Boruta feature selection driven by a RandomForest. The selector is fitted on
# the scaled training data and the same column subset is taken from the test data.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=0, random_state=42)
X_train_boruta = boruta_selector.fit_transform(X_train_normalized, y_train)
X_test_boruta  = boruta_selector.transform(X_test_normalized)

# Dense autoencoder over the Boruta-selected features:
# n_features -> 32 -> 16 (bottleneck) -> 32 -> n_features.
# NOTE(review): no TensorFlow/Keras seed is set in this cell, so the learned
# weights presumably differ from run to run — confirm whether that matters.
n_features = X_train_boruta.shape[1]
input_layer = Input(shape=(n_features,))
encoded     = Dense(32, activation='relu')(input_layer)
bottleneck  = Dense(16, activation='relu')(encoded)
decoded     = Dense(32, activation='relu')(bottleneck)
output_layer= Dense(n_features, activation='sigmoid')(decoded)

# Trained to reconstruct its own input (input and target are the same array).
autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
autoencoder.fit(X_train_boruta, X_train_boruta, epochs=10, batch_size=32, verbose=0)

# Encoder half only: maps the inputs to the 16-unit bottleneck activations.
# NOTE(review): X_train_encoded / X_test_encoded are not used again in this
# cell — the resampling below consumes X_train_boruta. Confirm whether the
# encoded features were meant to feed SMOTETomek or whether this step is unused.
encoder         = Model(inputs=input_layer, outputs=bottleneck)
X_train_encoded = encoder.predict(X_train_boruta)
X_test_encoded  = encoder.predict(X_test_boruta)

# SMOTE + Tomek resampling of the Boruta-selected training features.
# Only the training split is resampled; the test split is copied unchanged.
smotetomek_boruta = SMOTETomek(random_state=42)
X_train_boruta_smotetomek, y_train_boruta_smotetomek = smotetomek_boruta.fit_resample(X_train_boruta, y_train)
X_test_boruta_smotetomek = X_test_boruta.copy()
y_test_boruta_smotetomek = y_test.copy()


# ==============================================================
# CONFIG 2 — PowerTransformer + SMOTE
# (keeps your variables: X_train_power_smote, y_train_power_smote, etc.)
# ==============================================================

# Yeo-Johnson power transform; its parameters are learned from the training
# split and then re-used for the test split.
scaler = PowerTransformer(method='yeo-johnson')
X_train_power, X_test_power = scaler.fit_transform(X_train), scaler.transform(X_test)

# SMOTE oversampling of the transformed training data only.
smote_power = SMOTE(k_neighbors=5, random_state=42)
power_resampled = smote_power.fit_resample(X_train_power, y_train)
X_train_power_smote = power_resampled[0]
y_train_power_smote = power_resampled[1]

# The test split is never resampled; it is only copied under the config's names.
X_test_power_smote, y_test_power_smote = X_test_power.copy(), y_test.copy()


# ==============================================================
# CONFIG 3 — MI + SMOTE + Tomek
# (keeps your variables: X_train_mi_smotetomek, y_train_mi_smotetomek, etc.)
# ==============================================================

from functools import partial

# mutual_info_classif is randomised internally. With its default
# random_state=None the k-best ranking can change between runs, and even between
# two fits inside this cell. Pinning the seed makes every MI-based selection
# below reproducible.
seeded_mutual_info = partial(mutual_info_classif, random_state=42)

# Robust scaling, fitted on the training split only. The scaled arrays are the
# input of every MI-based selection in CONFIG 3, 4 and 5.
scaler = RobustScaler()
X_train_robust = scaler.fit_transform(X_train)
X_test_robust  = scaler.transform(X_test)

# Mutual Information selection: keep the 10 highest-scoring features.
mi = SelectKBest(score_func=seeded_mutual_info, k=10)
X_train_mi = mi.fit_transform(X_train_robust, y_train)
X_test_mi  = mi.transform(X_test_robust)

# SMOTE + Tomek over the MI features. Only the training split is resampled;
# the test split is copied unchanged.
smotetomek_mi = SMOTETomek(random_state=42)
X_train_mi_smotetomek, y_train_mi_smotetomek = smotetomek_mi.fit_resample(X_train_mi, y_train)
X_test_mi_smotetomek = X_test_mi.copy()
y_test_mi_smotetomek = y_test.copy()


# ==============================================================
# CONFIG 4 — MI (k=5) + LDA
# (defines: X_train_lda, X_test_lda, y_train_lda, y_test_lda)
# ==============================================================
# The k=5 selection gets its own names so that it does not overwrite the k=10
# `mi`, `X_train_mi` and `X_test_mi` that CONFIG 5 exposes.
mi_lda = SelectKBest(score_func=seeded_mutual_info, k=5)
X_train_mi_lda = mi_lda.fit_transform(X_train_robust, y_train)
X_test_mi_lda  = mi_lda.transform(X_test_robust)

# Supervised projection of the 5 selected features onto 2 discriminant axes.
lda = LinearDiscriminantAnalysis(n_components=2)
X_train_lda = lda.fit_transform(X_train_mi_lda, y_train)
X_test_lda  = lda.transform(X_test_mi_lda)
y_train_lda = y_train.copy()
y_test_lda  = y_test.copy()


# ==============================================================
# CONFIG 5 — MI (no resampling)
# (defines: X_train_mi, X_test_mi, y_train_mi, y_test_mi)
# ==============================================================
# `mi`, `X_train_mi` and `X_test_mi` are the k=10 selection fitted for CONFIG 3.
# Nothing above overwrites them any more, so no re-fit is needed here.
y_train_mi = y_train.copy()
y_test_mi  = y_test.copy()


# ==============================================================
print("All configurations prepared!\n")
# ==============================================================


2025-10-11 19:08:45.922100: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
All configurations prepared!



# Base & Ensemble Training and evaluation

### 0) Setup
- Imports ML libs; suppresses warnings.
- Assumes prebuilt configs: **Boruta+SMOTETomek**, **Power+SMOTE**, **MI+SMOTETomek**, **MI**, **LDA**.

---

### 1) Base Models (`get_base_classifiers`)
- **MLPClassifier** `(random_state=42, max_iter=1000)`
- **SVC** `(random_state=42, probability=True)` → enables ROC-AUC
- **AdaBoostClassifier** `(random_state=42, algorithm='SAMME')`
- **RandomForestClassifier** `(random_state=42, n_estimators=100)`
- **ExtraTreesClassifier** `(random_state=42, n_estimators=100)`
- **GradientBoostingClassifier** `(random_state=42)`
- **XGBClassifier** `(random_state=42, eval_metric='logloss', use_label_encoder=False)`
- **LGBMClassifier** `(random_state=42, verbose=-1)`
- **DecisionTreeClassifier** `(random_state=42)`

---

### 2) Ensembles (`get_ensemble_classifiers`)
**Base estimators used across ensembles**
- `XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False, n_jobs=-1)`
- `LGBMClassifier(random_state=42, verbose=-1, n_jobs=-1)`
- `ExtraTreesClassifier(random_state=42, n_estimators=100, n_jobs=-1)`
- `GradientBoostingClassifier(random_state=42, n_estimators=100)`
- `SVC(kernel='rbf', C=1, gamma='scale', probability=True, random_state=42)`

**StackingClassifier**
- `estimators=base_estimators`
- `final_estimator=LogisticRegression(random_state=42, max_iter=2000, solver='lbfgs', n_jobs=-1)`
- `cv=5`, `n_jobs=-1`, `passthrough=True`

**VotingClassifier (soft)**
- `estimators=base_estimators`, `voting='soft'`, `n_jobs=-1`

**BaggingClassifier**
- `estimator=DecisionTreeClassifier(max_depth=5, min_samples_split=4, random_state=42)`
- `n_estimators=50`, `max_samples=0.8`, `max_features=0.8`, `bootstrap=True`, `n_jobs=-1`, `random_state=42`

---

### 3) Metrics (`calculate_metrics`)
- **Accuracy**, **Precision/Recall/F1 (weighted, `zero_division=0`)**
- **ROC-AUC**: binary → `proba[:,1]`; multiclass → `ovr`, `average='weighted'`; if missing → `'N/A'`.

---

### 4) Training/Eval (`train_and_evaluate`)
- `fit → predict → (try) predict_proba`
- Returns dict with metrics + `Model`, `Configuration`, `Model_Type=('Base'|'Ensemble')`.
- On error: zeros and `'Error'`.

---

### 5) Experiment Grid
- Loops **all base** then **all ensemble** models over each configuration.
- Collects `all_results` → `results_df`.

---

### 6) Reporting & Export
- Sort by **Accuracy**; print **Top-10**, **Top-5 Base**, **Top-3 Ensemble**.
- **Best per configuration** and **best config per model**.
- **Overall best** combo (by Accuracy).
- **Base vs Ensemble** average Accuracy + % difference.
- **Pivots**: Accuracy and F1 (Model × Configuration).
- **Summary stats**: counts + Accuracy mean/median/std/min/max.
- Saves:
  - `complete_model_comparison_results.csv`
  - `complete_model_comparison_sorted.csv`


In [4]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import (AdaBoostClassifier, RandomForestClassifier, 
                               ExtraTreesClassifier, GradientBoostingClassifier,
                               StackingClassifier, VotingClassifier, BaggingClassifier)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                              f1_score, roc_auc_score)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, RobustScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')



# ==============================================================================
# STEP 2: DEFINE BASE CLASSIFIERS
# ==============================================================================

def get_base_classifiers():
    """Create a fresh, unfitted instance of every stand-alone classifier.

    Returns:
        dict: display name -> estimator, in the order the models are meant to
        be trained. Every estimator is built with ``random_state=42``.
    """
    seed = 42
    classifiers = {}
    classifiers['MLPClassifier'] = MLPClassifier(max_iter=1000, random_state=seed)
    # probability=True makes predict_proba available on the SVC.
    classifiers['SVC'] = SVC(probability=True, random_state=seed)
    classifiers['AdaBoostClassifier'] = AdaBoostClassifier(algorithm='SAMME', random_state=seed)
    classifiers['RandomForestClassifier'] = RandomForestClassifier(n_estimators=100, random_state=seed)
    classifiers['ExtraTreesClassifier'] = ExtraTreesClassifier(n_estimators=100, random_state=seed)
    classifiers['GradientBoostingClassifier'] = GradientBoostingClassifier(random_state=seed)
    classifiers['XGBClassifier'] = XGBClassifier(
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=seed,
    )
    classifiers['LGBMClassifier'] = LGBMClassifier(verbose=-1, random_state=seed)
    classifiers['DecisionTreeClassifier'] = DecisionTreeClassifier(random_state=seed)
    return classifiers

# ==============================================================================
# STEP 3: DEFINE ENSEMBLE CLASSIFIERS
# ==============================================================================

def get_ensemble_classifiers():
    """Create fresh, unfitted stacking, soft-voting and bagging ensembles.

    The stacking and voting ensembles are given the same list of five named
    member estimators (XGBoost, LightGBM, ExtraTrees, GradientBoosting, RBF SVC).
    The bagging ensemble is built on a depth-limited decision tree.

    Returns:
        dict: display name -> ensemble estimator, in the order
        'StackingClassifier', 'VotingClassifier', 'BaggingEnsemble'.
    """
    seed = 42

    # Member estimators shared by the stacking and the voting ensemble.
    members = [
        ('xgb', XGBClassifier(random_state=seed, eval_metric='logloss',
                              use_label_encoder=False, n_jobs=-1)),
        ('lgbm', LGBMClassifier(random_state=seed, verbose=-1, n_jobs=-1)),
        ('et', ExtraTreesClassifier(random_state=seed, n_estimators=100, n_jobs=-1)),
        ('gb', GradientBoostingClassifier(random_state=seed, n_estimators=100)),
        ('svc', SVC(probability=True, kernel='rbf', C=1, gamma='scale', random_state=seed)),
    ]

    # Meta-model that combines the members' outputs in the stacking ensemble.
    meta_learner = LogisticRegression(
        random_state=seed, max_iter=2000, solver='lbfgs', n_jobs=-1
    )

    # Depth-limited tree used as the bagged estimator.
    shallow_tree = DecisionTreeClassifier(
        max_depth=5, min_samples_split=4, random_state=seed
    )

    ensembles = {}

    # passthrough=True: the meta-model also receives the original features.
    ensembles['StackingClassifier'] = StackingClassifier(
        estimators=members,
        final_estimator=meta_learner,
        cv=5,
        n_jobs=-1,
        passthrough=True,
    )

    # Soft voting combines the members' predicted class probabilities.
    ensembles['VotingClassifier'] = VotingClassifier(
        estimators=members, voting='soft', n_jobs=-1
    )

    # Each of the 50 trees is fitted on 80% of the samples (bootstrap) and 80% of the features.
    ensembles['BaggingEnsemble'] = BaggingClassifier(
        estimator=shallow_tree,
        n_estimators=50,
        max_samples=0.8,
        max_features=0.8,
        bootstrap=True,
        n_jobs=-1,
        random_state=seed,
    )

    return ensembles


# ==============================================================================
# STEP 4: HELPER FUNCTIONS
# ==============================================================================

def calculate_metrics(y_true, y_pred, y_pred_proba=None):
    """Compute the classification metrics reported for one model run.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        y_pred_proba: Optional predicted class probabilities, indexable as
            ``[:, 1]`` (one column per class). When ``None`` the ROC-AUC entry
            is ``'N/A'``.

    Returns:
        dict with the keys 'Accuracy', 'Precision', 'Recall', 'F1-Score'
        (precision/recall/F1 are weighted averages with ``zero_division=0``)
        and 'ROC-AUC'. ROC-AUC is a float when it can be computed, otherwise
        the string ``'N/A'``.
    """
    scores = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'Recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
        'F1-Score': f1_score(y_true, y_pred, average='weighted', zero_division=0),
        'ROC-AUC': 'N/A',
    }

    if y_pred_proba is None:
        return scores

    try:
        if len(np.unique(y_true)) == 2:
            # Two classes in y_true: score with the probability of the second column.
            scores['ROC-AUC'] = roc_auc_score(y_true, y_pred_proba[:, 1])
        else:
            scores['ROC-AUC'] = roc_auc_score(
                y_true, y_pred_proba, multi_class='ovr', average='weighted'
            )
    except Exception:
        # ROC-AUC is best-effort: any scoring failure is reported as 'N/A'.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long run can still be interrupted.
        scores['ROC-AUC'] = 'N/A'

    return scores

def train_and_evaluate(model, X_train, y_train, X_test, y_test, model_name, config_name):
    """Fit ``model`` on the training data and score it on the test data.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; ``predict_proba`` is
            used when it works and skipped otherwise.
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        model_name: Display name stored under 'Model'. It also decides
            'Model_Type' ('Ensemble' for the three ensemble names, else 'Base').
        config_name: Display name stored under 'Configuration'.

    Returns:
        dict: the metrics from ``calculate_metrics`` plus 'Model',
        'Configuration' and 'Model_Type'. If fitting or scoring raises, the
        error is printed and a placeholder is returned with the four scores set
        to 0.0 and 'ROC-AUC' set to 'Error'.
    """
    ensemble_names = ('StackingClassifier', 'VotingClassifier', 'BaggingEnsemble')
    model_type = 'Ensemble' if model_name in ensemble_names else 'Base'

    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        try:
            y_pred_proba = model.predict_proba(X_test)
        except Exception:
            # Probabilities are optional. Catching Exception (not a bare
            # except) keeps KeyboardInterrupt/SystemExit from being swallowed.
            y_pred_proba = None

        result = calculate_metrics(y_test, y_pred, y_pred_proba)
        result['Model'] = model_name
        result['Configuration'] = config_name
        result['Model_Type'] = model_type
        return result

    except Exception as e:
        print(f"  ❌ Error: {str(e)}")
        return {
            'Model': model_name,
            'Configuration': config_name,
            'Model_Type': model_type,
            'Accuracy': 0.0,
            'Precision': 0.0,
            'Recall': 0.0,
            'F1-Score': 0.0,
            'ROC-AUC': 'Error'
        }

# ==============================================================================
# STEP 5: PREPARE CONFIGURATIONS LIST
# ==============================================================================

# Each entry: (display name, X_train, y_train, X_test, y_test).
# The list order is the order in which the configurations are trained and reported.
configurations = []

configurations.append((
    'Boruta + SMOTE+Tomek',
    X_train_boruta_smotetomek, y_train_boruta_smotetomek,
    X_test_boruta_smotetomek, y_test_boruta_smotetomek,
))

configurations.append((
    'PowerTransformer + SMOTE',
    X_train_power_smote, y_train_power_smote,
    X_test_power_smote, y_test_power_smote,
))

configurations.append((
    'MI + SMOTE+Tomek',
    X_train_mi_smotetomek, y_train_mi_smotetomek,
    X_test_mi_smotetomek, y_test_mi_smotetomek,
))

configurations.append((
    'MI',
    X_train_mi, y_train_mi,
    X_test_mi, y_test_mi,
))

configurations.append((
    'LDA',
    X_train_lda, y_train_lda,
    X_test_lda, y_test_lda,
))



# ==============================================================================
# STEP 6: TRAIN ALL BASE MODELS ON ALL CONFIGURATIONS
# ==============================================================================

# One result dict per (model, configuration) run; consumed by the reporting steps.
all_results = []

print("="*80)
print("PHASE 1: TRAINING BASE MODELS")
print("="*80)
print()

for config_name, X_tr, y_tr, X_te, y_te in configurations:
    print("="*80)
    print(f"CONFIGURATION: {config_name}")
    print("="*80)
    print(f"Training shape: {X_tr.shape}, Test shape: {X_te.shape}")
    print()

    # Fresh, unfitted estimators for every configuration.
    base_classifiers = get_base_classifiers()

    for model_name, model in base_classifiers.items():
        print(f"  Training {model_name}...", end=' ')

        # Named `run_result` rather than `metrics`: a notebook-global `metrics`
        # would overwrite the `sklearn.metrics` module imported in the first cell.
        run_result = train_and_evaluate(
            model, X_tr, y_tr, X_te, y_te, model_name, config_name
        )
        all_results.append(run_result)

        # train_and_evaluate marks a failed run with ROC-AUC == 'Error';
        # do not print a success tick for it.
        status = '✗' if run_result.get('ROC-AUC') == 'Error' else '✓'
        print(f"{status} Acc: {run_result['Accuracy']:.4f} | F1: {run_result['F1-Score']:.4f}")

    print()

# ==============================================================================
# STEP 7: TRAIN ALL ENSEMBLE MODELS ON ALL CONFIGURATIONS
# ==============================================================================

print("\n" + "="*80)
print("PHASE 2: TRAINING ENSEMBLE MODELS")
print("="*80)
print()

for config_name, X_tr, y_tr, X_te, y_te in configurations:
    print("="*80)
    print(f"CONFIGURATION: {config_name}")
    print("="*80)
    print(f"Training shape: {X_tr.shape}, Test shape: {X_te.shape}")
    print()

    # Fresh, unfitted ensembles for every configuration.
    ensemble_classifiers = get_ensemble_classifiers()

    for model_name, model in ensemble_classifiers.items():
        print(f"  Training {model_name}...", end=' ')

        run_result = train_and_evaluate(
            model, X_tr, y_tr, X_te, y_te, model_name, config_name
        )
        all_results.append(run_result)

        status = '✗' if run_result.get('ROC-AUC') == 'Error' else '✓'
        print(f"{status} Acc: {run_result['Accuracy']:.4f} | F1: {run_result['F1-Score']:.4f}")

    print()

# ==============================================================================
# STEP 8: CREATE RESULTS DATAFRAME
# ==============================================================================

# One row per (model, configuration) run, in training order.
results_df = pd.DataFrame(all_results)

rule = "=" * 80

# ==============================================================================
# STEP 9: DISPLAY ALL RESULTS SORTED BY ACCURACY
# ==============================================================================

print("\n" + rule, "🏆 ALL RESULTS SORTED BY ACCURACY (HIGHEST TO LOWEST)", rule, "", sep="\n")

# Best accuracy first; the index is replaced by a 1-based rank.
results_sorted = results_df.sort_values(by='Accuracy', ascending=False, ignore_index=True)
results_sorted.index = pd.RangeIndex(1, len(results_sorted) + 1)

# Show complete tables and four decimals (these display options stay set
# for the rest of the session).
for option_name in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(option_name, None)
pd.set_option('display.float_format', '{:.4f}'.format)

print(results_sorted.to_string())

# ==============================================================================
# STEP 10: TOP PERFORMERS
# ==============================================================================

print("\n" + rule, "🥇 TOP 10 MODEL-CONFIGURATION COMBINATIONS", rule, "", sep="\n")
print(results_sorted.head(10).to_string())

print("\n" + rule, "🥈 TOP 5 BASE MODELS", rule, "", sep="\n")
base_models = results_sorted.loc[results_sorted['Model_Type'].eq('Base')].head(5)
print(base_models.to_string())

print("\n" + rule, "🥉 TOP 3 ENSEMBLE MODELS", rule, "", sep="\n")
ensemble_models = results_sorted.loc[results_sorted['Model_Type'].eq('Ensemble')].head(3)
print(ensemble_models.to_string())

# ==============================================================================
# STEP 11: BEST MODEL PER CONFIGURATION
# ==============================================================================

rule = "=" * 80

print("\n" + rule, "📊 BEST MODEL PER CONFIGURATION", rule, sep="\n")

# For every configuration, report the run with the highest accuracy
# (idxmax returns the first such row when several runs tie).
for config in configurations:
    config_name = config[0]
    config_results = results_df.loc[results_df['Configuration'] == config_name]
    best_model = config_results.loc[config_results['Accuracy'].idxmax()]
    print(
        f"\n{config_name}:",
        f"  Model:      {best_model['Model']} ({best_model['Model_Type']})",
        f"  Accuracy:   {best_model['Accuracy']:.4f}",
        f"  Precision:  {best_model['Precision']:.4f}",
        f"  Recall:     {best_model['Recall']:.4f}",
        f"  F1-Score:   {best_model['F1-Score']:.4f}",
        f"  ROC-AUC:    {best_model['ROC-AUC']}",
        sep="\n",
    )

# ==============================================================================
# STEP 12: BEST CONFIGURATION PER MODEL
# ==============================================================================

print("\n" + rule, "🎯 BEST CONFIGURATION PER MODEL", rule, sep="\n")

# For every model (alphabetical order), report the configuration on which it
# reached its highest accuracy.
all_models = results_df['Model'].unique()
for model_name in sorted(all_models):
    model_results = results_df.loc[results_df['Model'] == model_name]
    best_config = model_results.loc[model_results['Accuracy'].idxmax()]
    model_type = best_config['Model_Type']
    print(
        f"\n{model_name} ({model_type}):",
        f"  Best Config:  {best_config['Configuration']}",
        f"  Accuracy:     {best_config['Accuracy']:.4f}",
        f"  F1-Score:     {best_config['F1-Score']:.4f}",
        f"  ROC-AUC:      {best_config['ROC-AUC']}",
        sep="\n",
    )

# ==============================================================================
# STEP 13: OVERALL BEST MODEL
# ==============================================================================

print("\n" + rule, "🌟 OVERALL BEST MODEL-CONFIGURATION COMBINATION", rule, sep="\n")

# Single run with the highest accuracy across all models and configurations.
best_overall = results_df.loc[results_df['Accuracy'].idxmax()]
print(
    f"\nModel:         {best_overall['Model']}",
    f"Type:          {best_overall['Model_Type']}",
    f"Configuration: {best_overall['Configuration']}",
    f"Accuracy:      {best_overall['Accuracy']:.4f}",
    f"Precision:     {best_overall['Precision']:.4f}",
    f"Recall:        {best_overall['Recall']:.4f}",
    f"F1-Score:      {best_overall['F1-Score']:.4f}",
    f"ROC-AUC:       {best_overall['ROC-AUC']}",
    sep="\n",
)

# ==============================================================================
# STEP 14: COMPARISON - BASE VS ENSEMBLE
# ==============================================================================

print("\n" + "="*80)
print("⚖️ BASE MODELS VS ENSEMBLE MODELS COMPARISON")
print("="*80)

# Mean accuracy over all runs of each model type (failed runs are stored with
# accuracy 0.0 by train_and_evaluate and therefore pull the mean down).
base_avg = results_df.loc[results_df['Model_Type'] == 'Base', 'Accuracy'].mean()
ensemble_avg = results_df.loc[results_df['Model_Type'] == 'Ensemble', 'Accuracy'].mean()

print(f"\nBase Models Average Accuracy:     {base_avg:.4f}")
print(f"Ensemble Models Average Accuracy: {ensemble_avg:.4f}")
print(f"Difference:                       {ensemble_avg - base_avg:+.4f}")

# Relative improvement of the better group over the other one. A tie, or a
# group without any results (NaN mean), is reported explicitly instead of
# being counted as "Base models perform 0.00% better".
if ensemble_avg > base_avg:
    print(f"\n✓ Ensemble models perform {((ensemble_avg/base_avg - 1) * 100):.2f}% better on average")
elif base_avg > ensemble_avg:
    print(f"\n✗ Base models perform {((base_avg/ensemble_avg - 1) * 100):.2f}% better on average")
else:
    print("\n= No difference in average accuracy (or one group has no results)")

# ==============================================================================
# STEP 15: PIVOT TABLES
# ==============================================================================

rule = "=" * 80

# Both pivots share the same layout: one row per model, one column per
# configuration, taking the first value found for each cell.
pivot_layout = dict(index='Model', columns='Configuration', aggfunc='first')

print("\n" + rule, "📈 PIVOT TABLE: ACCURACY BY MODEL AND CONFIGURATION", rule, sep="\n")
pivot_accuracy = results_df.pivot_table(values='Accuracy', **pivot_layout)
print(pivot_accuracy.to_string())

print("\n" + rule, "📈 PIVOT TABLE: F1-SCORE BY MODEL AND CONFIGURATION", rule, sep="\n")
pivot_f1 = results_df.pivot_table(values='F1-Score', **pivot_layout)
print(pivot_f1.to_string())

# ==============================================================================
# STEP 16: SUMMARY STATISTICS
# ==============================================================================

rule = "=" * 80

print("\n" + rule, "📊 SUMMARY STATISTICS", rule, sep="\n")

# Run counts per model type.
base_run_count = int(results_df['Model_Type'].eq('Base').sum())
ensemble_run_count = int(results_df['Model_Type'].eq('Ensemble').sum())

print(
    f"\nTotal Experiments:        {len(results_df)}",
    f"Base Models:              {base_run_count}",
    f"Ensemble Models:          {ensemble_run_count}",
    f"Number of Configurations: {len(configurations)}",
    f"Number of Model Types:    {results_df['Model'].nunique()}",
    sep="\n",
)

# Distribution of accuracy over every run.
accuracy_column = results_df['Accuracy']
print(
    f"\nOverall Accuracy Statistics:",
    f"  Mean:    {accuracy_column.mean():.4f}",
    f"  Median:  {accuracy_column.median():.4f}",
    f"  Std Dev: {accuracy_column.std():.4f}",
    f"  Min:     {accuracy_column.min():.4f}",
    f"  Max:     {accuracy_column.max():.4f}",
    sep="\n",
)

# ==============================================================================
# STEP 17: EXPORT RESULTS
# ==============================================================================

# Both files are written without the index column (so the sorted file carries
# no explicit rank column).
for frame, csv_name in (
    (results_df, 'complete_model_comparison_results.csv'),
    (results_sorted, 'complete_model_comparison_sorted.csv'),
):
    frame.to_csv(csv_name, index=False)

print(
    "\n" + rule,
    "💾 RESULTS EXPORTED",
    rule,
    "  ✓ complete_model_comparison_results.csv",
    "  ✓ complete_model_comparison_sorted.csv",
    rule,
    sep="\n",
)

PHASE 1: TRAINING BASE MODELS

CONFIGURATION: Boruta + SMOTE+Tomek
Training shape: (519, 10), Test shape: (75, 10)

  Training MLPClassifier... ✓ Acc: 0.9467 | F1: 0.9484
  Training SVC... ✓ Acc: 0.9333 | F1: 0.9341
  Training AdaBoostClassifier... ✓ Acc: 0.9067 | F1: 0.9062
  Training RandomForestClassifier... ✓ Acc: 0.9600 | F1: 0.9599
  Training ExtraTreesClassifier... ✓ Acc: 0.9333 | F1: 0.9333
  Training GradientBoostingClassifier... ✓ Acc: 0.9600 | F1: 0.9599
  Training XGBClassifier... ✓ Acc: 0.9333 | F1: 0.9333
  Training LGBMClassifier... ✓ Acc: 0.9333 | F1: 0.9338
  Training DecisionTreeClassifier... ✓ Acc: 0.9200 | F1: 0.9213

CONFIGURATION: PowerTransformer + SMOTE
Training shape: (525, 24), Test shape: (75, 24)

  Training MLPClassifier... ✓ Acc: 0.9200 | F1: 0.9213
  Training SVC... ✓ Acc: 0.9467 | F1: 0.9474
  Training AdaBoostClassifier... ✓ Acc: 0.9600 | F1: 0.9595
  Training RandomForestClassifier... ✓ Acc: 0.9600 | F1: 0.9606
  Training ExtraTreesClassifier... ✓ Acc: