

# Importing Libraries




In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from imblearn.under_sampling import CondensedNearestNeighbour, TomekLinks, RandomUnderSampler
from boruta import BorutaPy
from sklearn.feature_selection import SelectKBest
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam

import warnings
warnings.filterwarnings('ignore')

# Dataset Load & Preprocessing

In [13]:
# Read the raw table and replace every missing value with the literal string "None".
df = pd.read_csv("/content/Sleep_health_and_lifestyle_dataset.csv").fillna("None")

# 'Blood Pressure' holds "systolic/diastolic" strings; split it into two integer columns.
bp_parts = df['Blood Pressure'].str.split('/', expand=True).astype(int)
df[['Systolic BP', 'Diastolic BP']] = bp_parts

# The identifier and the original combined pressure string are not used as features.
df = df.drop(columns=['Person ID', 'Blood Pressure'])

# One-hot encode the two multi-valued categoricals (all levels kept).
df = pd.get_dummies(df, columns=['Occupation', 'BMI Category'], drop_first=False)

# Integer-encode 'Gender' with a LabelEncoder.
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])

# Target is 'Sleep Disorder'; every remaining column is a feature.
y = df['Sleep Disorder']
X = df.drop(columns='Sleep Disorder')

# 75/25 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


# ML Model Result Storage

In [14]:
# Parallel result holders: index i across all lists describes one evaluated run.
ML_Model = []
ML_Config = []
accuracy = []
f1_score = []
recall = []
precision = []
auc_roc = []  # AUC-ROC holder


def storeResults(model, config, a, b, c, d, e):
    """
    Append one run's scores to the module-level result lists.

    Parameters:
    model: Name of the ML model
    config: Configuration name (preprocessing steps applied)
    a: Accuracy score
    b: F1 score
    c: Recall score
    d: Precision score
    e: AUC-ROC score

    Every score is rounded to 6 decimal places before being stored.
    Returns None; the lists are mutated in place.
    """
    ML_Model.append(model)
    ML_Config.append(config)
    # Pair each holder with its score so the rounding rule is written once.
    score_targets = (
        (accuracy, a),
        (f1_score, b),
        (recall, c),
        (precision, d),
        (auc_roc, e),
    )
    for holder, score in score_targets:
        holder.append(round(score, 6))

# Random Forest with K-Fold CV, Oversampling, Undersampling, and Random Sampling

In [21]:
# Every configuration is a tuple (name, X_train, X_test, y_train). The test
# labels are always the module-level `y_test`: the resampling steps below are
# applied to the training split only.
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data (scaler is fitted on the training split only)
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# RandomForest classifier used as the estimator inside Boruta
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# Applying Boruta Feature Selection (fitted on the normalized training data)
boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=0, random_state=42)
X_train_boruta = boruta_selector.fit_transform(X_train_normalized, y_train)
X_test_boruta = boruta_selector.transform(X_test_normalized)
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))

selected_features = boruta_selector.support_
optimal_features = sum(selected_features)
print(f"Optimal number of features to select using Boruta: {optimal_features}")

# Applying Autoencoder on the Boruta-selected features.
# FIX: the decoder is chained through the bottleneck
# (input -> 32 -> 16 -> 32 -> n_features). Previously both decoder layers were
# attached to `encoded`, so the 16-unit bottleneck was not part of the trained
# autoencoder and `encoder` emitted features from an untrained layer.
# NOTE(review): no Keras seed is set in this cell, so the 'Autoencoder' row may
# vary between runs - confirm whether that is acceptable.
n_features = X_train_boruta.shape[1]
input_layer = Input(shape=(n_features,))
encoded = Dense(32, activation='relu')(input_layer)
bottleneck = Dense(16, activation='relu')(encoded)
decoded = Dense(32, activation='relu')(bottleneck)
decoded = Dense(n_features, activation='sigmoid')(decoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
autoencoder.fit(X_train_boruta, X_train_boruta, epochs=10, batch_size=32, verbose=0)
# The encoder shares its layers with the trained autoencoder and stops at the bottleneck.
encoder = Model(input_layer, bottleneck)
X_train_encoded = encoder.predict(X_train_boruta)
X_test_encoded = encoder.predict(X_test_boruta)
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))

# Applying oversampling: SMOTE & ADASYN
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)
X_train_resample_smote, y_train_resample_smote = smote.fit_resample(X_train_normalized, y_train)
X_train_resample_adasyn, y_train_resample_adasyn = adasyn.fit_resample(X_train_normalized, y_train)
configurations.append(('SMOTE', X_train_resample_smote, X_test_normalized, y_train_resample_smote))
configurations.append(('ADASYN', X_train_resample_adasyn, X_test_normalized, y_train_resample_adasyn))

# Applying undersampling: CondensedNearestNeighbour ("CNN") & Tomek Links
cnn = CondensedNearestNeighbour(random_state=42)
tomek = TomekLinks()
X_train_resample_cnn, y_train_resample_cnn = cnn.fit_resample(X_train_normalized, y_train)
X_train_resample_tomek, y_train_resample_tomek = tomek.fit_resample(X_train_normalized, y_train)
configurations.append(('CondensedNN', X_train_resample_cnn, X_test_normalized, y_train_resample_cnn))
configurations.append(('Tomek Links', X_train_resample_tomek, X_test_normalized, y_train_resample_tomek))

# Applying random sampling: RandomOverSampler & RandomUnderSampler
ros = RandomOverSampler(random_state=42)
rus = RandomUnderSampler(random_state=42)
X_train_resample_ros, y_train_resample_ros = ros.fit_resample(X_train_normalized, y_train)
X_train_resample_rus, y_train_resample_rus = rus.fit_resample(X_train_normalized, y_train)
configurations.append(('Random Oversampling', X_train_resample_ros, X_test_normalized, y_train_resample_ros))
configurations.append(('Random Undersampling', X_train_resample_rus, X_test_normalized, y_train_resample_rus))

# Step 4: Random Forest + GridSearchCV
print("\n=== Random Forest Model Performance with Hyperparameter Tuning ===")

# Each hyperparameter lists a single value, so the grid holds exactly one
# candidate and GridSearchCV runs one 10-fold cross-validation per configuration.
param_grid = {
    'n_estimators': [120],
    'max_depth': [20],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': ['sqrt'],
    'bootstrap': [True],
    'criterion': ['entropy']
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Random Forest with {name} configuration...")
    rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=10, n_jobs=-1, verbose=2)
    rf.fit(X_train_cfg, y_train_cfg)

    y_train_rf = rf.predict(X_train_cfg)
    y_test_rf = rf.predict(X_test_cfg)
    y_train_rf_proba = rf.predict_proba(X_train_cfg)
    y_test_rf_proba = rf.predict_proba(X_test_cfg)

    # (label, true labels, hard predictions, class probabilities) per split.
    # Every metric is computed once here; the test values are reused for storeResults.
    splits = (
        ("Training", y_train_cfg, y_train_rf, y_train_rf_proba),
        ("Test", y_test, y_test_rf, y_test_rf_proba),
    )
    metrics_dict = {
        "Dataset": [label for label, _, _, _ in splits],
        "Accuracy": [metrics.accuracy_score(t, p) for _, t, p, _ in splits],
        "F1 Score": [metrics.f1_score(t, p, average='macro') for _, t, p, _ in splits],
        "Recall": [metrics.recall_score(t, p, average='macro') for _, t, p, _ in splits],
        "Precision": [metrics.precision_score(t, p, average='macro') for _, t, p, _ in splits],
        # True labels are one-hot encoded so they line up with the probability columns.
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(t), q, multi_class='ovr', average='macro')
            for _, t, _, q in splits
        ],
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nRandom Forest Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    # Index 1 of every metric list is the "Test" split.
    auc_score = metrics_dict["AUC-ROC"][1]
    storeResults(
        'Random Forest',
        name,
        metrics_dict["Accuracy"][1],
        metrics_dict["F1 Score"][1],
        metrics_dict["Recall"][1],
        metrics_dict["Precision"][1],
        auc_score
    )
    print("Best hyperparameters found by GridSearchCV:")
    print(rf.best_params_)


Optimal number of features to select using Boruta: 10
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step

=== Random Forest Model Performance with Hyperparameter Tuning ===

Running Random Forest with Original Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

Random Forest Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.932143  0.923808 0.920645   0.927169 0.990293
    Test  0.893617  0.843243 0.844207   0.851058 0.931928
Best hyperparameters found by GridSearchCV:
{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 120}

Running Random Forest with Normalized Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

Random Forest Model Performance Metrics
Conf

# Decision Tree with K-Fold CV, Oversampling, Undersampling, and Random Sampling

In [16]:
# Every configuration is a tuple (name, X_train, X_test, y_train). The test
# labels are always the module-level `y_test`: the resampling steps below are
# applied to the training split only.
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data (scaler is fitted on the training split only)
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Boruta is driven by a RandomForest here; the Decision Tree itself is only
# fitted in the grid search further down.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# Applying Boruta Feature Selection (fitted on the normalized training data)
boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=0, random_state=42)
X_train_boruta = boruta_selector.fit_transform(X_train_normalized, y_train)
X_test_boruta = boruta_selector.transform(X_test_normalized)
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))

selected_features = boruta_selector.support_
optimal_features = sum(selected_features)
print(f"Optimal number of features to select using Boruta: {optimal_features}")

# Applying oversampling: SMOTE & ADASYN
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)
X_train_resample_smote, y_train_resample_smote = smote.fit_resample(X_train_normalized, y_train)
X_train_resample_adasyn, y_train_resample_adasyn = adasyn.fit_resample(X_train_normalized, y_train)
configurations.append(('SMOTE', X_train_resample_smote, X_test_normalized, y_train_resample_smote))
configurations.append(('ADASYN', X_train_resample_adasyn, X_test_normalized, y_train_resample_adasyn))

# Applying undersampling: CondensedNearestNeighbour ("CNN") & Tomek Links
cnn = CondensedNearestNeighbour(random_state=42)
tomek = TomekLinks()
X_train_resample_cnn, y_train_resample_cnn = cnn.fit_resample(X_train_normalized, y_train)
X_train_resample_tomek, y_train_resample_tomek = tomek.fit_resample(X_train_normalized, y_train)
configurations.append(('CondensedNN', X_train_resample_cnn, X_test_normalized, y_train_resample_cnn))
configurations.append(('Tomek Links', X_train_resample_tomek, X_test_normalized, y_train_resample_tomek))

# Applying random sampling: RandomOverSampler & RandomUnderSampler
ros = RandomOverSampler(random_state=42)
rus = RandomUnderSampler(random_state=42)
X_train_resample_ros, y_train_resample_ros = ros.fit_resample(X_train_normalized, y_train)
X_train_resample_rus, y_train_resample_rus = rus.fit_resample(X_train_normalized, y_train)
configurations.append(('Random Oversampling', X_train_resample_ros, X_test_normalized, y_train_resample_ros))
configurations.append(('Random Undersampling', X_train_resample_rus, X_test_normalized, y_train_resample_rus))

# Step 4: Decision Tree + GridSearchCV
print("\n=== Decision Tree Model Performance with Hyperparameter Tuning ===")

# Each hyperparameter lists a single value, so the grid holds exactly one
# candidate and GridSearchCV runs one 10-fold cross-validation per configuration.
param_grid = {
    'max_depth': [20],
    'min_samples_split': [2],
    'min_samples_leaf': [3],
    'max_features': ['sqrt'],
    'criterion': ['entropy']
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Decision Tree with {name} configuration...")
    dt = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=10, n_jobs=-1, verbose=2)
    dt.fit(X_train_cfg, y_train_cfg)

    y_train_dt = dt.predict(X_train_cfg)
    y_test_dt = dt.predict(X_test_cfg)
    y_train_dt_proba = dt.predict_proba(X_train_cfg)
    y_test_dt_proba = dt.predict_proba(X_test_cfg)

    # (label, true labels, hard predictions, class probabilities) per split.
    # Every metric is computed once here; the test values are reused for storeResults.
    splits = (
        ("Training", y_train_cfg, y_train_dt, y_train_dt_proba),
        ("Test", y_test, y_test_dt, y_test_dt_proba),
    )
    metrics_dict = {
        "Dataset": [label for label, _, _, _ in splits],
        "Accuracy": [metrics.accuracy_score(t, p) for _, t, p, _ in splits],
        "F1 Score": [metrics.f1_score(t, p, average='macro') for _, t, p, _ in splits],
        "Recall": [metrics.recall_score(t, p, average='macro') for _, t, p, _ in splits],
        "Precision": [metrics.precision_score(t, p, average='macro') for _, t, p, _ in splits],
        # True labels are one-hot encoded so they line up with the probability columns.
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(t), q, multi_class='ovr', average='macro')
            for _, t, _, q in splits
        ],
    }

    df_metrics = pd.DataFrame(metrics_dict)
    # FIX: heading previously read "Deicion Tree".
    print("\nDecision Tree Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    # Index 1 of every metric list is the "Test" split.
    auc_score = metrics_dict["AUC-ROC"][1]
    storeResults(
        'Decision Tree',
        name,
        metrics_dict["Accuracy"][1],
        metrics_dict["F1 Score"][1],
        metrics_dict["Recall"][1],
        metrics_dict["Precision"][1],
        auc_score
    )
    print("Best hyperparameters found by GridSearchCV:")
    print(dt.best_params_)


Optimal number of features to select using Boruta: 10

=== Decision Tree Model Performance with Hyperparameter Tuning ===

Running Decision Tree with Original Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

Deicion Tree Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.917857  0.902743 0.900755   0.905341 0.984165
    Test  0.893617  0.852012 0.848663   0.872845 0.890642
Best hyperparameters found by GridSearchCV:
{'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 2}

Running Decision Tree with Normalized Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

Deicion Tree Model Performance Metrics
Configuration Name:  Normalized Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.917857  0.902743 0.900755   0.905341 0.984165
    Test  0.893617  0.852012 0.848663

# Gradient Boosting with K-Fold CV, Oversampling, Undersampling, and Random Sampling

In [17]:
# Every configuration is a tuple (name, X_train, X_test, y_train). The test
# labels are always the module-level `y_test`: the resampling steps below are
# applied to the training split only.
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

# Step 2: Normalize the data (scaler is fitted on the training split only)
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# GradientBoosting classifier used as the estimator inside Boruta
gb = GradientBoostingClassifier(random_state=42)

# Applying Boruta Feature Selection (fitted on the normalized training data)
boruta_selector = BorutaPy(gb, n_estimators='auto', verbose=0, random_state=42)
X_train_boruta = boruta_selector.fit_transform(X_train_normalized, y_train)
X_test_boruta = boruta_selector.transform(X_test_normalized)
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))

selected_features = boruta_selector.support_
optimal_features = sum(selected_features)
print(f"Optimal number of features to select using Boruta: {optimal_features}")

# Applying oversampling: SMOTE & ADASYN
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)
X_train_resample_smote, y_train_resample_smote = smote.fit_resample(X_train_normalized, y_train)
X_train_resample_adasyn, y_train_resample_adasyn = adasyn.fit_resample(X_train_normalized, y_train)
configurations.append(('SMOTE', X_train_resample_smote, X_test_normalized, y_train_resample_smote))
configurations.append(('ADASYN', X_train_resample_adasyn, X_test_normalized, y_train_resample_adasyn))

# Applying undersampling: CondensedNearestNeighbour ("CNN") & Tomek Links
cnn = CondensedNearestNeighbour(random_state=42)
tomek = TomekLinks()
X_train_resample_cnn, y_train_resample_cnn = cnn.fit_resample(X_train_normalized, y_train)
X_train_resample_tomek, y_train_resample_tomek = tomek.fit_resample(X_train_normalized, y_train)
configurations.append(('CondensedNN', X_train_resample_cnn, X_test_normalized, y_train_resample_cnn))
configurations.append(('Tomek Links', X_train_resample_tomek, X_test_normalized, y_train_resample_tomek))

# Applying random sampling: RandomOverSampler & RandomUnderSampler
ros = RandomOverSampler(random_state=42)
rus = RandomUnderSampler(random_state=42)
X_train_resample_ros, y_train_resample_ros = ros.fit_resample(X_train_normalized, y_train)
X_train_resample_rus, y_train_resample_rus = rus.fit_resample(X_train_normalized, y_train)
configurations.append(('Random Oversampling', X_train_resample_ros, X_test_normalized, y_train_resample_ros))
configurations.append(('Random Undersampling', X_train_resample_rus, X_test_normalized, y_train_resample_rus))

# Step 4: Gradient Boosting + GridSearchCV
print("\n=== Gradient Boosting Model Performance with Hyperparameter Tuning ===")

# Each hyperparameter lists a single value, so the grid holds exactly one
# candidate and GridSearchCV runs one 10-fold cross-validation per configuration.
param_grid = {
    'learning_rate': [0.01],
    'n_estimators': [150],
    'max_depth': [20],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': ['sqrt'],
    'subsample': [0.8]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Gradient Boosting with {name} configuration...")
    gbc = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid, cv=10, n_jobs=-1, verbose=2)
    gbc.fit(X_train_cfg, y_train_cfg)

    y_train_gb = gbc.predict(X_train_cfg)
    y_test_gb = gbc.predict(X_test_cfg)
    y_train_gb_proba = gbc.predict_proba(X_train_cfg)
    y_test_gb_proba = gbc.predict_proba(X_test_cfg)

    # (label, true labels, hard predictions, class probabilities) per split.
    # Every metric is computed once here; the test values are reused for storeResults.
    splits = (
        ("Training", y_train_cfg, y_train_gb, y_train_gb_proba),
        ("Test", y_test, y_test_gb, y_test_gb_proba),
    )
    metrics_dict = {
        "Dataset": [label for label, _, _, _ in splits],
        "Accuracy": [metrics.accuracy_score(t, p) for _, t, p, _ in splits],
        "F1 Score": [metrics.f1_score(t, p, average='macro') for _, t, p, _ in splits],
        "Recall": [metrics.recall_score(t, p, average='macro') for _, t, p, _ in splits],
        "Precision": [metrics.precision_score(t, p, average='macro') for _, t, p, _ in splits],
        # True labels are one-hot encoded so they line up with the probability columns.
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(t), q, multi_class='ovr', average='macro')
            for _, t, _, q in splits
        ],
    }

    df_metrics = pd.DataFrame(metrics_dict)
    # FIX: heading previously read "Gradien Boosting".
    print("\nGradient Boosting Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    # Index 1 of every metric list is the "Test" split.
    auc_score = metrics_dict["AUC-ROC"][1]
    storeResults(
        'Gradient Boosting',
        name,
        metrics_dict["Accuracy"][1],
        metrics_dict["F1 Score"][1],
        metrics_dict["Recall"][1],
        metrics_dict["Precision"][1],
        auc_score
    )
    print("Best hyperparameters found by GridSearchCV:")
    print(gbc.best_params_)


Optimal number of features to select using Boruta: 6

=== Gradient Boosting Model Performance with Hyperparameter Tuning ===

Running Gradient Boosting with Original Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

Gradien Boosting Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.932143  0.923808 0.920645   0.927169 0.990717
    Test  0.893617  0.843243 0.844207   0.851058 0.940176
Best hyperparameters found by GridSearchCV:
{'learning_rate': 0.01, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150, 'subsample': 0.8}

Running Gradient Boosting with Normalized Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

Gradien Boosting Model Performance Metrics
Configuration Name:  Normalized Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.932143  0.923808 0.920645 

# Extra Trees with K-Fold CV, Oversampling, Undersampling, and Random Sampling

In [18]:
# Configurations are (name, X_train, X_test, y_train) tuples; evaluation always
# scores against the module-level `y_test`.
configurations = [('Original Data', X_train, X_test, y_train)]

# Min-max scaling, fitted on the training split and applied to both splits.
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))

# Boruta feature selection driven by an ExtraTrees estimator.
et = ExtraTreesClassifier(random_state=42)
boruta_selector = BorutaPy(et, n_estimators='auto', verbose=0, random_state=42)
X_train_boruta = boruta_selector.fit_transform(X_train_normalized, y_train)
X_test_boruta = boruta_selector.transform(X_test_normalized)
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))

selected_features = boruta_selector.support_
optimal_features = sum(selected_features)
print(f"Optimal number of features to select using Boruta: {optimal_features}")

# Oversampling of the normalized training data: SMOTE and ADASYN.
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)
X_train_resample_smote, y_train_resample_smote = smote.fit_resample(X_train_normalized, y_train)
X_train_resample_adasyn, y_train_resample_adasyn = adasyn.fit_resample(X_train_normalized, y_train)
configurations.extend([
    ('SMOTE', X_train_resample_smote, X_test_normalized, y_train_resample_smote),
    ('ADASYN', X_train_resample_adasyn, X_test_normalized, y_train_resample_adasyn),
])

# Undersampling: CondensedNearestNeighbour and Tomek Links.
cnn = CondensedNearestNeighbour(random_state=42)
tomek = TomekLinks()
X_train_resample_cnn, y_train_resample_cnn = cnn.fit_resample(X_train_normalized, y_train)
X_train_resample_tomek, y_train_resample_tomek = tomek.fit_resample(X_train_normalized, y_train)
configurations.extend([
    ('CondensedNN', X_train_resample_cnn, X_test_normalized, y_train_resample_cnn),
    ('Tomek Links', X_train_resample_tomek, X_test_normalized, y_train_resample_tomek),
])

# Random over- and undersampling.
ros = RandomOverSampler(random_state=42)
rus = RandomUnderSampler(random_state=42)
X_train_resample_ros, y_train_resample_ros = ros.fit_resample(X_train_normalized, y_train)
X_train_resample_rus, y_train_resample_rus = rus.fit_resample(X_train_normalized, y_train)
configurations.extend([
    ('Random Oversampling', X_train_resample_ros, X_test_normalized, y_train_resample_ros),
    ('Random Undersampling', X_train_resample_rus, X_test_normalized, y_train_resample_rus),
])

# Extra Trees + GridSearchCV over a single-candidate grid.
print("\n=== Extra Trees Model Performance with Hyperparameter Tuning ===")

param_grid = dict(
    n_estimators=[100],
    max_depth=[20],
    min_samples_split=[2],
    min_samples_leaf=[1],
    max_features=['sqrt'],
    bootstrap=[True],
    criterion=['entropy'],
)

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Extra Trees with {name} configuration...")
    etc = GridSearchCV(
        ExtraTreesClassifier(random_state=42),
        param_grid,
        cv=10,
        n_jobs=-1,
        verbose=2,
    )
    etc.fit(X_train_cfg, y_train_cfg)

    y_train_et = etc.predict(X_train_cfg)
    y_test_et = etc.predict(X_test_cfg)
    y_train_et_proba = etc.predict_proba(X_train_cfg)
    y_test_et_proba = etc.predict_proba(X_test_cfg)

    # One entry per split: (label, true labels, predictions, class probabilities).
    splits = (
        ("Training", y_train_cfg, y_train_et, y_train_et_proba),
        ("Test", y_test, y_test_et, y_test_et_proba),
    )
    metrics_dict = {
        "Dataset": [label for label, _, _, _ in splits],
        "Accuracy": [metrics.accuracy_score(t, p) for _, t, p, _ in splits],
        "F1 Score": [metrics.f1_score(t, p, average='macro') for _, t, p, _ in splits],
        "Recall": [metrics.recall_score(t, p, average='macro') for _, t, p, _ in splits],
        "Precision": [metrics.precision_score(t, p, average='macro') for _, t, p, _ in splits],
        "AUC-ROC": [
            metrics.roc_auc_score(pd.get_dummies(t), q, multi_class='ovr', average='macro')
            for _, t, _, q in splits
        ],
    }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nExtraTrees Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    # Position 1 in each metric list belongs to the "Test" split.
    test_accuracy, test_f1, test_recall, test_precision, auc_score = (
        metrics_dict[key][1]
        for key in ("Accuracy", "F1 Score", "Recall", "Precision", "AUC-ROC")
    )
    storeResults('Extra Trees', name, test_accuracy, test_f1, test_recall, test_precision, auc_score)
    print("Best hyperparameters found by GridSearchCV:")
    print(etc.best_params_)


Optimal number of features to select using Boruta: 14

=== Extra Trees Model Performance with Hyperparameter Tuning ===

Running Extra Trees with Original Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

ExtraTrees Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.932143  0.923843 0.921041   0.927580 0.990362
    Test  0.893617  0.843243 0.844207   0.851058 0.931017
Best hyperparameters found by GridSearchCV:
{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

Running Extra Trees with Normalized Data configuration...
Fitting 10 folds for each of 1 candidates, totalling 10 fits

ExtraTrees Model Performance Metrics
Configuration Name:  Normalized Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.932143  0.923843 0.921041   0.927580 0.990362
    Tes

# Result


In [19]:
# Metric columns: stored as raw scores, displayed multiplied by 100 with a '%' suffix.
metric_columns = ['Accuracy', 'F1 Score', 'Recall', 'Precision', 'ROC_AUC']


def _as_percent_table(numeric_frame):
    """Return a copy of `numeric_frame` whose metric columns are 'NN.NNN%' strings.

    Parameters:
    numeric_frame: DataFrame holding the raw numeric scores in `metric_columns`.

    Returns:
    A new DataFrame with the same rows, index and column order; the input is
    not modified.
    """
    formatted = numeric_frame.copy()
    for column in metric_columns:
        formatted[column] = [f"{value * 100:.3f}%" for value in numeric_frame[column]]
    return formatted


# Creating the dataframe from the result holders. Scores stay numeric here so
# that the ranking below is a numeric sort, not a string sort.
# Duplicates (same model and configuration, e.g. from re-running a cell) are
# removed, keeping the first occurrence.
result_numeric = pd.DataFrame({
    'ML Model': ML_Model,
    'Configuration': ML_Config,
    'Accuracy': accuracy,
    'F1 Score': f1_score,
    'Recall': recall,
    'Precision': precision,
    'ROC_AUC': auc_roc,
}).drop_duplicates(subset=["ML Model", "Configuration"])

# Display-ready table with percentage strings
result = _as_percent_table(result_numeric)

# Display the result
print("\n" + "=" * 100)
print("MODEL PERFORMANCE RESULTS")
print("=" * 100)
print(result.to_string(index=False))

# Save the result to a CSV file
# result.to_csv('final_results/model_results.csv', index=False)
# print("\nResults saved to model_results.csv")

# Sort by Accuracy and F1 Score.
# FIX: the sort used to run on the formatted strings, which orders them
# lexicographically (e.g. "9.500%" and "89.362%" both sort above "100.000%").
# Sorting the numeric scores first gives the intended ranking.
sorted_numeric = result_numeric.sort_values(by=['Accuracy', 'F1 Score'], ascending=False).reset_index(drop=True)
sorted_result = _as_percent_table(sorted_numeric)

# Display the sorted result
print("\n" + "=" * 100)
print("SORTED MODEL PERFORMANCE RESULTS (by Accuracy and F1 Score)")
print("=" * 100)
print(sorted_result.to_string(index=False))

# Save the sorted result
# sorted_result.to_csv('final_results/sorted_model_results.csv', index=False)
# print("\nSorted results saved to sorted_model_results.csv")

# Extract top configuration per ML model: rows are already ranked, so the first
# row of each model group is that model's best configuration.
top_per_model = sorted_result.groupby('ML Model', as_index=False).first()

# Display and save the top configuration table
print("\n" + "=" * 100)
print("TOP CONFIGURATION PER MODEL")
print("=" * 100)
print(top_per_model.to_string(index=False))

# top_per_model.to_csv('final_results/top_configurations.csv', index=False)
# print("\nTop configuration per model saved to top_configurations.csv")


MODEL PERFORMANCE RESULTS
         ML Model        Configuration Accuracy F1 Score  Recall Precision ROC_AUC
    Random Forest        Original Data  89.362%  84.324% 84.421%   85.106% 93.193%
    Random Forest      Normalized Data  89.362%  84.324% 84.421%   85.106% 93.177%
    Random Forest               Boruta  89.362%  84.324% 84.421%   85.106% 93.271%
    Random Forest                SMOTE  89.362%  84.324% 84.421%   85.106% 93.143%
    Random Forest               ADASYN  89.362%  84.324% 84.421%   85.106% 92.046%
    Random Forest          CondensedNN  86.170%  80.120% 79.893%   81.177% 88.345%
    Random Forest          Tomek Links  89.362%  84.324% 84.421%   85.106% 93.216%
    Random Forest  Random Oversampling  80.851%  76.802% 79.572%   76.008% 92.216%
    Random Forest Random Undersampling  86.170%  81.481% 82.603%   81.995% 91.479%
    Decision Tree        Original Data  89.362%  85.201% 84.866%   87.284% 89.064%
    Decision Tree      Normalized Data  89.362%  85.201% 84.