<a href="https://colab.research.google.com/github/Miftahul-adib/sleep-disorder/blob/main/Improving_Sleep_Disorder_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# Importing Libraries




In [1]:
!pip install boruta category_encoders xgboost catboost lazypredict

Collecting boruta
  Downloading Boruta-0.4.3-py3-none-any.whl.metadata (8.8 kB)
Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting lazypredict
  Downloading lazypredict-0.2.16-py2.py3-none-any.whl.metadata (13 kB)
Collecting pytest-runner (from lazypredict)
  Downloading pytest_runner-6.0.1-py3-none-any.whl.metadata (7.3 kB)
Collecting mlflow>=2.0.0 (from lazypredict)
  Downloading mlflow-3.3.2-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.3.2 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_skinny-3.3.2-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.3.2 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_tracing-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting docker<8,>=4.0.0 (from mlflow>=2.0.0->lazypredict)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecti

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lazypredict.Supervised import LazyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler, SMOTENC
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.under_sampling import CondensedNearestNeighbour, TomekLinks, RandomUnderSampler
from boruta import BorutaPy
from keras.models import Model, Sequential
from keras.layers import Input, Dense
from keras.optimizers import Adam

import warnings
warnings.filterwarnings('ignore')

# Dataset Load & Preprocessing

In [3]:
df = pd.read_csv("/content/Sleep_health_and_lifestyle_dataset.csv")
df.fillna("None", inplace=True)
df[['Systolic BP', 'Diastolic BP']] = df['Blood Pressure'].str.split('/', expand=True).astype(int)
df.drop(['Person ID', 'Blood Pressure'], axis=1, inplace=True)

df['Occupation'] = df['Occupation'].replace(['Manager', 'Sales Representative', 'Scientist', 'Software Engineer'], 'Other')
df['BMI Category'] = df['BMI Category'].replace({'Normal':22, 'Normal Weight':22, 'Overweight':27, 'Obese':30})

df['Stress_sleep_interaction'] = df['Stress Level'] / df['Quality of Sleep']
df['BMI_Activity'] = df['BMI Category'] * df['Physical Activity Level']
df['Sleep_Heart_ratio'] = df['Sleep Duration'] / df['Heart Rate']
df['Sleep_Steps_ratio'] = df['Sleep Duration'] / df['Daily Steps']
df['Sleep_Stress_ratio'] = df['Sleep Duration'] / df['Stress Level']

df = pd.get_dummies(df, columns=['Occupation'], drop_first=False)

label_encoder = LabelEncoder()
columns = ['Gender', 'Sleep Disorder']
for col in columns:
  df[col] = label_encoder.fit_transform(df[col])

num_col = ['Age', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level', 'Stress_sleep_interaction',
          'Sleep_Heart_ratio', 'Sleep_Steps_ratio', 'Sleep_Stress_ratio', 'Heart Rate', 'Daily Steps',
           'Systolic BP', 'Diastolic BP']

Q1 = df[num_col].quantile(0.25)
Q3 = df[num_col].quantile(0.75)
IQR = Q3 - Q1

df = df[~((df[num_col] < (Q1 - 1.5 * IQR)) | (df[num_col] > (Q3 + 1.5 * IQR))).any(axis=1)]
classes, count = np.unique(df['Sleep Disorder'], return_counts=True)

X = df.drop('Sleep Disorder', axis=1)
y = df['Sleep Disorder']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)


# Apply RobustSclaer, MI, LDA, Boruta, Autoencoder, and SMOTETomek

In [4]:
# Step 2: Normalize the data
scaler = RobustScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Applying Mutual information
mi = SelectKBest(score_func=mutual_info_classif, k=5)
X_train_mi = mi.fit_transform(X_train_normalized, y_train)
X_test_mi = mi.transform(X_test_normalized)

# Applying LDA
lda = LinearDiscriminantAnalysis(n_components=2)
X_train_lda = lda.fit_transform(X_train_mi, y_train)
X_test_lda = lda.transform(X_test_mi)

# RandomForest classifier
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# Applying Boruta Feature Selection
boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=0, random_state=42)

X_train_boruta = boruta_selector.fit_transform(X_train_normalized, y_train)
X_test_boruta = boruta_selector.transform(X_test_normalized)

# applying Autoencoder
n_features = X_train_boruta.shape[1]
input_layer = Input(shape=(n_features,))
encoded = Dense(32, activation='relu')(input_layer)
bottleneck = Dense(16, activation='relu')(encoded)
decoded = Dense(32, activation='relu')(encoded)
decoded = Dense(n_features, activation='sigmoid')(encoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
autoencoder.fit(X_train_boruta, X_train_boruta, epochs=10, batch_size=32, verbose=0)
encoder = Model(input_layer, bottleneck)
X_train_encoded = encoder.predict(X_train_boruta)
X_test_encoded = encoder.predict(X_test_boruta)

smotetomek = SMOTETomek(sampling_strategy='auto',
                   smote=SMOTE(k_neighbors=3, random_state=42),
                   tomek=TomekLinks(sampling_strategy='auto', n_jobs=-1),
                   n_jobs=-1,
                   random_state=42)

X_train_resample, y_train_resample = smotetomek.fit_resample(X_train_normalized, y_train)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step


# Trial with LazyClassifier

In [20]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
# models, preds = clf.fit(X_train, X_test, y_train, y_test)
# models, preds = clf.fit(X_train_normalized, X_test_normalized, y_train, y_test)
# models, preds = clf.fit(X_train_mi, X_test_mi, y_train, y_test)
# models, preds = clf.fit(X_train_lda, X_test_lda, y_train, y_test)
# models, preds = clf.fit(X_train_boruta, X_test_boruta, y_train, y_test)
# models, preds = clf.fit(X_train_encoded, X_test_encoded, y_train, y_test)
# models, preds = clf.fit(X_train_resample, X_test_normalized, y_train_resample, y_test)
# print(models)

# ML Model Result Storage

In [5]:
ML_Model = []
ML_Config = []
accuracy = []
f1_score = []
recall = []
precision = []
auc_roc = []  # Adding a holder for AUC-ROC

# Function to call for storing the results
def storeResults(model, config, a, b, c, d, e):
    """
    Store model performance results

    Parameters:
    model: Name of the ML model
    config: Configuration name (preprocessing steps applied)
    a: Accuracy score
    b: F1 score
    c: Recall score
    d: Precision score
    e: AUC-ROC score
    """
    ML_Model.append(model)
    ML_Config.append(config)
    accuracy.append(round(a, 6))
    f1_score.append(round(b, 6))
    recall.append(round(c, 6))
    precision.append(round(d, 6))
    auc_roc.append(round(e, 6))

# Logistic Regression

In [21]:
from scipy.stats import uniform
configurations = []

configurations.append(('Original Data', X_train, X_test, y_train))
configurations.append(('SMOTETomek', X_train_resample, X_test_normalized, y_train_resample))
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))

params = {
    'penalty': ['l1', 'l2'],
    'C': [0.88, 0.89, 0.90, 0.91], #uniform(0.01, 10),
    'solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'],  # Optimization solvers
    'max_iter': [489, 490, 491] #range(100, 1000, 12) #[100, 200, 500, 1000]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Logistic Regression with {name} configuration...")
    logr = GridSearchCV(LogisticRegression(), params, cv=5,
                             n_jobs=-1, scoring=['accuracy', 'f1_macro'], refit='accuracy', verbose=2)
    logr.fit(X_train_cfg, y_train_cfg)

    y_train_lr = logr.predict(X_train_cfg)
    y_test_lr = logr.predict(X_test_cfg)
    y_train_lr_proba = logr.predict_proba(X_train_cfg)
    y_test_lr_proba = logr.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_lr),
              metrics.accuracy_score(y_test, y_test_lr),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_lr, average='macro'),
              metrics.f1_score(y_test, y_test_lr, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_lr, average='macro'),
              metrics.recall_score(y_test, y_test_lr, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_lr, average='macro'),
              metrics.precision_score(y_test, y_test_lr, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_lr_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_lr_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nLogistic Regression Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_lr_proba, multi_class='ovr', average='macro')
    storeResults(
          'Logistic Regression',
          name,
          metrics.accuracy_score(y_test, y_test_lr),
          metrics.f1_score(y_test, y_test_lr, average='macro'),
          metrics.recall_score(y_test, y_test_lr, average='macro'),
          metrics.precision_score(y_test, y_test_lr, average='macro'),
          auc_score
      )
    print("Best hyperparameters found by GridSearchCV:")
    print(logr.best_params_)



Running Logistic Regression with Original Data configuration...
Fitting 5 folds for each of 96 candidates, totalling 480 fits

Logistic Regression Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score  Recall  Precision  AUC-ROC
Training      0.91      0.88    0.86       0.91     0.93
    Test      0.95      0.93    0.95       0.92     0.97
Best hyperparameters found by GridSearchCV:
{'C': 0.88, 'max_iter': 489, 'penalty': 'l2', 'solver': 'liblinear'}

Running Logistic Regression with SMOTETomek configuration...
Fitting 5 folds for each of 96 candidates, totalling 480 fits

Logistic Regression Model Performance Metrics
Configuration Name:  SMOTETomek
 Dataset  Accuracy  F1 Score  Recall  Precision  AUC-ROC
Training      0.90      0.90    0.90       0.90     0.96
    Test      0.95      0.93    0.95       0.92     0.97
Best hyperparameters found by GridSearchCV:
{'C': 0.88, 'max_iter': 489, 'penalty': 'l2', 'solver': 'liblinear'}

Running Logistic Re

# KNN

In [31]:
configurations = []

configurations.append(('Original Data', X_train, X_test, y_train))
configurations.append(('SMOTETomek', X_train_resample, X_test_normalized, y_train_resample))
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))

params = {
    'n_neighbors': np.random.randint(1, 50, 3),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'p': np.random.randint(1, 5, 1)
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning KNN with {name} configuration...")
    knn = RandomizedSearchCV(KNeighborsClassifier(), params, n_iter=50, cv=10,
                             n_jobs=-1, scoring=['accuracy', 'f1_macro'], refit='accuracy', verbose=2)
    knn.fit(X_train_cfg, y_train_cfg)

    y_train_knn = knn.predict(X_train_cfg)
    y_test_knn = knn.predict(X_test_cfg)
    y_train_knn_proba = knn.predict_proba(X_train_cfg)
    y_test_knn_proba = knn.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_knn),
              metrics.accuracy_score(y_test, y_test_knn),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_knn, average='macro'),
              metrics.f1_score(y_test, y_test_knn, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_knn, average='macro'),
              metrics.recall_score(y_test, y_test_knn, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_knn, average='macro'),
              metrics.precision_score(y_test, y_test_knn, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_knn_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_knn_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nKNearestNeighbors Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_knn_proba, multi_class='ovr', average='macro')
    storeResults(
          'K-Nearest Neighbors',
          name,
          metrics.accuracy_score(y_test, y_test_knn),
          metrics.f1_score(y_test, y_test_knn, average='macro'),
          metrics.recall_score(y_test, y_test_knn, average='macro'),
          metrics.precision_score(y_test, y_test_knn, average='macro'),
          auc_score
      )
    print("Best hyperparameters found by GridSearchCV:")
    print(knn.best_params_)



Running KNN with Original Data configuration...
Fitting 10 folds for each of 18 candidates, totalling 180 fits

KNearestNeighbors Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score  Recall  Precision  AUC-ROC
Training      0.93      0.90    0.89       0.92     0.99
    Test      0.95      0.90    0.95       0.88     0.97
Best hyperparameters found by GridSearchCV:
{'weights': 'distance', 'p': np.int64(1), 'n_neighbors': np.int64(23), 'metric': 'euclidean'}

Running KNN with SMOTETomek configuration...
Fitting 10 folds for each of 18 candidates, totalling 180 fits

KNearestNeighbors Model Performance Metrics
Configuration Name:  SMOTETomek
 Dataset  Accuracy  F1 Score  Recall  Precision  AUC-ROC
Training      0.96      0.96    0.96       0.96     1.00
    Test      0.95      0.90    0.95       0.88     0.97
Best hyperparameters found by GridSearchCV:
{'weights': 'distance', 'p': np.int64(1), 'n_neighbors': np.int64(23), 'metric': 'euclidean'}

Run

# Random Forest

In [22]:
configurations = []

configurations.append(('Original Data', X_train, X_test, y_train))
configurations.append(('SMOTETomek', X_train_resample, X_test_normalized, y_train_resample))
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))

selected_features = boruta_selector.support_
optimal_features = sum(selected_features)
print(f"Optimal number of features to select using Boruta: {optimal_features}")

configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))

# Step 4: Random Forest + GridSearchCV
print("\n=== Random Forest Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': range(150, 500, 10), #[300, 350, 400, 450, 500],
    'max_depth': range(2, 50, 10), #[11, 17, 20],
    'min_samples_split': range(1, 10, 1), #[1, 2],
    'min_samples_leaf': range(1, 10, 1), #[2, 3],
    'max_features': ['sqrt'],
    'bootstrap': [False],
    'class_weight': ['balanced'],
    'max_leaf_nodes': range(2, 50, 10), #[30, 50],
    'min_impurity_decrease': np.linspace(0.0001, 0.1, 10), #[0.01, 0.005],
    'ccp_alpha': np.linspace(0.0001, 0.1, 10), #[0.001, 0.003, 0.008],
    'random_state': range(2, 50, 10), #[49, 51],
    'criterion': ['gini', 'entropy', 'log_loss']
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Random Forest with {name} configuration...")
    rf = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=10, n_jobs=-1, n_iter=50, scoring=['accuracy', 'f1_macro'], refit='accuracy', verbose=2)
    rf.fit(X_train_cfg, y_train_cfg)

    y_train_rf = rf.predict(X_train_cfg)
    y_test_rf = rf.predict(X_test_cfg)
    y_train_rf_proba = rf.predict_proba(X_train_cfg)
    y_test_rf_proba = rf.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_rf),
              metrics.accuracy_score(y_test, y_test_rf),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_rf, average='macro'),
              metrics.f1_score(y_test, y_test_rf, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_rf, average='macro'),
              metrics.recall_score(y_test, y_test_rf, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_rf, average='macro'),
              metrics.precision_score(y_test, y_test_rf, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_rf_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_rf_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nRandom Forest Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_rf_proba, multi_class='ovr', average='macro')
    storeResults(
          'Random Forest',
          name,
          metrics.accuracy_score(y_test, y_test_rf),
          metrics.f1_score(y_test, y_test_rf, average='macro'),
          metrics.recall_score(y_test, y_test_rf, average='macro'),
          metrics.precision_score(y_test, y_test_rf, average='macro'),
          auc_score
      )
    print("Best hyperparameters found by GridSearchCV:")
    print(rf.best_params_)


Optimal number of features to select using Boruta: 10

=== Random Forest Model Performance with Hyperparameter Tuning ===

Running Random Forest with Original Data configuration...
Fitting 10 folds for each of 50 candidates, totalling 500 fits

Random Forest Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score  Recall  Precision  AUC-ROC
Training      0.92      0.90    0.89       0.91     0.95
    Test      0.95      0.90    0.95       0.88     0.98
Best hyperparameters found by GridSearchCV:
{'random_state': 12, 'n_estimators': 170, 'min_samples_split': 4, 'min_samples_leaf': 1, 'min_impurity_decrease': np.float64(0.0112), 'max_leaf_nodes': 32, 'max_features': 'sqrt', 'max_depth': 12, 'criterion': 'gini', 'class_weight': 'balanced', 'ccp_alpha': np.float64(0.0112), 'bootstrap': False}

Running Random Forest with SMOTETomek configuration...
Fitting 10 folds for each of 50 candidates, totalling 500 fits

Random Forest Model Performance Metrics
Config

# XGBoost




In [25]:
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))
configurations.append(('SMOTETomek', X_train_resample, X_test_normalized, y_train_resample))

configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))

configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))

selected_features = boruta_selector.support_
optimal_features = sum(selected_features)
print(f"Optimal number of features to select using Boruta: {optimal_features}")

# Step 4: XGBoost + GridSearchCV
print("\n=== XGBoost Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'booster': ['gbtree',],
    'learning_rate': np.linspace(0.0001, 0.1, 10), #[0.1, 0.2],
    'n_estimators': range(50, 500, 10), #[200, 220],
    'estimator__n_estimators': range(50, 500, 10),
    'max_depth': range(2, 50, 10), #[5,12],
    'min_child_weight': range(1, 10, 1), #[1, 3, 7],
    'gamma': np.linspace(0, 0.1, 3), #[0, 0.1],
    'subsample': np.linspace(0.1, 1, 1), #[0.8],
    'colsample_bytree': [0.3, 0.8],
    'colsample_bylevel': [1.0],
    'colsample_bynode': [0.6, 0.8],
    'max_delta_step': [0, 5],

    'reg_alpha': np.linspace(0.1, 1, 1), #[0.1, 0.5, 0.7],
    'reg_lambda': np.linspace(0.1, 1, 1), #[0.1, 0.5, 0.6],
    'scale_pos_weight': [1, 2, 5],

    'sample_type': ["weighted"],
    'normalize_type': ["tree", "forest"],
    'rate_drop': [0, 0.1],
    'skip_drop': [0, 0.1],

    'random_state': [2, 42, 49, 51]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning XGBoost with {name} configuration...")
    xgb = RandomizedSearchCV(XGBClassifier(), param_grid, n_iter=50, cv=10,
                             n_jobs=-1, scoring=['accuracy', 'f1_macro'], refit='accuracy', verbose=2)
    xgb.fit(X_train_cfg, y_train_cfg)

    y_train_xg = xgb.predict(X_train_cfg)
    y_test_xg = xgb.predict(X_test_cfg)
    y_train_xg_proba = xgb.predict_proba(X_train_cfg)
    y_test_xg_proba = xgb.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_xg),
              metrics.accuracy_score(y_test, y_test_xg),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_xg, average='macro'),
              metrics.f1_score(y_test, y_test_xg, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_xg, average='macro'),
              metrics.recall_score(y_test, y_test_xg, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_xg, average='macro'),
              metrics.precision_score(y_test, y_test_xg, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_xg_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_xg_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nXGBoost Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_xg_proba, multi_class='ovr', average='macro')
    storeResults(
        'XGBoost Model',
        name,
        metrics.accuracy_score(y_test, y_test_xg),
        metrics.f1_score(y_test, y_test_xg, average='macro'),
        metrics.recall_score(y_test, y_test_xg, average='macro'),
        metrics.precision_score(y_test, y_test_xg, average='macro'),
        auc_score
    )
    print("Best hyperparameters found by GridSearchCV:")
    print(xgb.best_params_)


Optimal number of features to select using Boruta: 10

=== XGBoost Model Performance with Hyperparameter Tuning ===

Running XGBoost with Original Data configuration...
Fitting 10 folds for each of 50 candidates, totalling 500 fits

XGBoost Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score  Recall  Precision  AUC-ROC
Training      0.92      0.90    0.89       0.91     0.95
    Test      0.95      0.93    0.95       0.92     0.98
Best hyperparameters found by GridSearchCV:
{'subsample': np.float64(0.1), 'skip_drop': 0.1, 'scale_pos_weight': 5, 'sample_type': 'weighted', 'reg_lambda': np.float64(0.1), 'reg_alpha': np.float64(0.1), 'rate_drop': 0.1, 'random_state': 2, 'normalize_type': 'forest', 'n_estimators': 490, 'min_child_weight': 1, 'max_depth': 42, 'max_delta_step': 5, 'learning_rate': np.float64(0.06670000000000001), 'gamma': np.float64(0.1), 'estimator__n_estimators': 340, 'colsample_bytree': 0.8, 'colsample_bynode': 0.6, 'colsample_bylevel

# Gradient Boosting

In [26]:
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))

configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))

selected_features = boruta_selector.support_
optimal_features = sum(selected_features)
print(f"Optimal number of features to select using Boruta: {optimal_features}")

configurations.append(('SMOTETomek', X_train_resample, X_test_normalized, y_train_resample))

# Step 4: Gradient Boosting + GridSearchCV
print("\n=== Gradient Boosting Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'loss': ['log_loss'],
    'learning_rate': np.linspace(0.0001, 0.1, 10), #[0.001],
    'n_estimators': range(40, 400, 10), #[47, 50, 400],
    'subsample': np.linspace(0.1, 0.9, 3), #[0.8653],
    'max_depth': range(2, 100, 5), #[28],
    'init': [None],
    'max_leaf_nodes': [None],
    'min_samples_split': range(2, 50, 3), #[10],
    'min_samples_leaf': range(2, 50, 3), #[15],
    'min_weight_fraction_leaf': [0.0],
    'min_impurity_decrease': [0.0],
    'validation_fraction': [0.1],
    'n_iter_no_change': [None],
    'tol': np.linspace(0.0001, 0.1, 10), #[0.0001],
    'ccp_alpha': np.linspace(0.0001, 0.1, 10), #[0.01, 0.0001],
    'max_features': ['sqrt'],
    'verbose': [0],
    'warm_start': [False],
    'criterion': ['friedman_mse'],
    'random_state': [0]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Gradient Boosting with {name} configuration...")
    gbc = RandomizedSearchCV(GradientBoostingClassifier(), param_grid, cv=10, n_iter=50, n_jobs=-1, scoring=['accuracy'], refit='accuracy', verbose=2)
    gbc.fit(X_train_cfg, y_train_cfg)

    y_train_gb = gbc.predict(X_train_cfg)
    y_test_gb = gbc.predict(X_test_cfg)
    y_train_gb_proba = gbc.predict_proba(X_train_cfg)
    y_test_gb_proba = gbc.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_gb),
              metrics.accuracy_score(y_test, y_test_gb),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_gb, average='macro'),
              metrics.f1_score(y_test, y_test_gb, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_gb, average='macro'),
              metrics.recall_score(y_test, y_test_gb, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_gb, average='macro'),
              metrics.precision_score(y_test, y_test_gb, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_gb_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_gb_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nGradien Boosting Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_gb_proba, multi_class='ovr', average='macro')
    storeResults(
          'Gradient Boosting',
          name,
          metrics.accuracy_score(y_test, y_test_gb),
          metrics.f1_score(y_test, y_test_gb, average='macro'),
          metrics.recall_score(y_test, y_test_gb, average='macro'),
          metrics.precision_score(y_test, y_test_gb, average='macro'),
          auc_score
      )
    print("Best hyperparameters found by GridSearchCV:")
    print(gbc.best_params_)


Optimal number of features to select using Boruta: 10

=== Gradient Boosting Model Performance with Hyperparameter Tuning ===

Running Gradient Boosting with Original Data configuration...
Fitting 10 folds for each of 50 candidates, totalling 500 fits

Gradien Boosting Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score  Recall  Precision  AUC-ROC
Training      0.92      0.90    0.89       0.91     0.98
    Test      0.97      0.94    0.97       0.92     0.97
Best hyperparameters found by GridSearchCV:
{'warm_start': False, 'verbose': 0, 'validation_fraction': 0.1, 'tol': np.float64(0.07780000000000001), 'subsample': np.float64(0.9), 'random_state': 0, 'n_iter_no_change': None, 'n_estimators': 310, 'min_weight_fraction_leaf': 0.0, 'min_samples_split': 41, 'min_samples_leaf': 29, 'min_impurity_decrease': 0.0, 'max_leaf_nodes': None, 'max_features': 'sqrt', 'max_depth': 27, 'loss': 'log_loss', 'learning_rate': np.float64(0.033400000000000006), 'init'

# Extra Trees

In [28]:
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))

configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))

selected_features = boruta_selector.support_
optimal_features = sum(selected_features)
print(f"Optimal number of features to select using Boruta: {optimal_features}")

configurations.append(('SMOTETomek', X_train_resample, X_test_normalized, y_train_resample))

# Step 4: Extra Trees + GridSearchCV
print("\n=== Extra Trees Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': range(100, 500, 13), #[50, 180],
    'max_depth': range(5, 40, 3), #[10, 20],
    'max_leaf_nodes': range(10, 100, 8), #[50, 100],
    'min_samples_split': np.arange(1, 10, 1), #[2, 4],
    'min_samples_leaf': np.arange(1, 10, 1), #[1, 2],
    'min_weight_fraction_leaf': [0.0],
    'min_impurity_decrease': [0.0],
    'ccp_alpha': np.linspace(0.0001, 0.1, 11), #[0.001, 0.01],
    'max_features': ['sqrt', 'log2'],
    'class_weight': [None],
    'bootstrap': [True, False],
    'oob_score': [True, False],
    'criterion': ['gini', 'log_loss'],
    'random_state': range(2, 123, 5), #[51, 123]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Extra Trees with {name} configuration...")
    etc = RandomizedSearchCV(ExtraTreesClassifier(), param_grid, cv=10, n_iter=50, n_jobs=-1, scoring=["accuracy", "f1_macro"], refit='accuracy', verbose=2)
    etc.fit(X_train_cfg, y_train_cfg)

    y_train_et = etc.predict(X_train_cfg)
    y_test_et = etc.predict(X_test_cfg)
    y_train_et_proba = etc.predict_proba(X_train_cfg)
    y_test_et_proba = etc.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_et),
              metrics.accuracy_score(y_test, y_test_et),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_et, average='macro'),
              metrics.f1_score(y_test, y_test_et, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_et, average='macro'),
              metrics.recall_score(y_test, y_test_et, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_et, average='macro'),
              metrics.precision_score(y_test, y_test_et, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_et_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_et_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nExtraTrees Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_et_proba, multi_class='ovr', average='macro')
    storeResults(
          'Extra Trees',
          name,
          metrics.accuracy_score(y_test, y_test_et),
          metrics.f1_score(y_test, y_test_et, average='macro'),
          metrics.recall_score(y_test, y_test_et, average='macro'),
          metrics.precision_score(y_test, y_test_et, average='macro'),
          auc_score
      )
    print("Best hyperparameters found by GridSearchCV:")
    print(etc.best_params_)


Optimal number of features to select using Boruta: 10

=== Extra Trees Model Performance with Hyperparameter Tuning ===

Running Extra Trees with Original Data configuration...
Fitting 10 folds for each of 50 candidates, totalling 500 fits

ExtraTrees Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score  Recall  Precision  AUC-ROC
Training      0.93      0.90    0.89       0.92     0.99
    Test      0.95      0.90    0.95       0.88     0.97
Best hyperparameters found by GridSearchCV:
{'random_state': 17, 'oob_score': False, 'n_estimators': 191, 'min_weight_fraction_leaf': 0.0, 'min_samples_split': np.int64(2), 'min_samples_leaf': np.int64(1), 'min_impurity_decrease': 0.0, 'max_leaf_nodes': 98, 'max_features': 'log2', 'max_depth': 32, 'criterion': 'gini', 'class_weight': None, 'ccp_alpha': np.float64(0.0001), 'bootstrap': False}

Running Extra Trees with Normalized Data configuration...
Fitting 10 folds for each of 50 candidates, totalling 500 fits

# ADABoost

In [24]:
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))

configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))

configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))

selected_features = boruta_selector.support_
optimal_features = sum(selected_features)
print(f"Optimal number of features to select using Boruta: {optimal_features}")

configurations.append(('SMOTETomek', X_train_resample, X_test_normalized, y_train_resample))

# Step 4: Extra Trees + GridSearchCV
print("\n=== AdaBoost Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': range(300, 450, 13), #[50, 150],
    'learning_rate': np.linspace(0.01, 0.05, 3), #[0.005, 0.5, 0.03, 0.003],
    'estimator__max_depth': range(2, 10, 3), #[5, 20],
    'estimator__min_samples_split': range(1, 5, 1), #[8],
    'random_state': range(20, 60) #[42, 1234]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning AdaBoost with {name} configuration...")
    adb = RandomizedSearchCV(AdaBoostClassifier(estimator=DecisionTreeClassifier()), param_grid, cv=10, n_iter=50, n_jobs=-1,
                             scoring=['accuracy', 'f1_macro'], refit='accuracy', verbose=2)
    adb.fit(X_train_cfg, y_train_cfg)

    y_train_ad = adb.predict(X_train_cfg)
    y_test_ad = adb.predict(X_test_cfg)
    y_train_ad_proba = adb.predict_proba(X_train_cfg)
    y_test_ad_proba = adb.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_ad),
              metrics.accuracy_score(y_test, y_test_ad),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_ad, average='macro'),
              metrics.f1_score(y_test, y_test_ad, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_ad, average='macro'),
              metrics.recall_score(y_test, y_test_ad, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_ad, average='macro'),
              metrics.precision_score(y_test, y_test_ad, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_ad_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_ad_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nAdaBoost Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_ad_proba, multi_class='ovr', average='macro')
    storeResults(
          'AdaBoost',
          name,
          metrics.accuracy_score(y_test, y_test_ad),
          metrics.f1_score(y_test, y_test_ad, average='macro'),
          metrics.recall_score(y_test, y_test_ad, average='macro'),
          metrics.precision_score(y_test, y_test_ad, average='macro'),
          auc_score
      )
    print("Best hyperparameters found by GridSearchCV:")
    print(adb.best_params_)

Optimal number of features to select using Boruta: 10

=== AdaBoost Model Performance with Hyperparameter Tuning ===

Running AdaBoost with Original Data configuration...
Fitting 10 folds for each of 50 candidates, totalling 500 fits

AdaBoost Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score  Recall  Precision  AUC-ROC
Training      0.92      0.90    0.89       0.91     0.97
    Test      0.95      0.90    0.95       0.88     0.98
Best hyperparameters found by GridSearchCV:
{'random_state': 28, 'n_estimators': 443, 'learning_rate': np.float64(0.03), 'estimator__min_samples_split': 4, 'estimator__max_depth': 2}

Running AdaBoost with Normalized Data configuration...
Fitting 10 folds for each of 50 candidates, totalling 500 fits

AdaBoost Model Performance Metrics
Configuration Name:  Normalized Data
 Dataset  Accuracy  F1 Score  Recall  Precision  AUC-ROC
Training      0.92      0.90    0.89       0.91     0.97
    Test      0.97      0.94    0.9

# MLP

In [33]:
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))
configurations.append(('Normalized Data', X_train_normalized, X_test_normalized, y_train))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))
configurations.append(('SMOTETomek', X_train_resample, X_test_normalized, y_train_resample))

mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='sgd',
    alpha=0.01,
    batch_size='auto',
    learning_rate='constant',
    max_iter=1000,
    random_state=42)

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning MLP Classifier with {name} configuration...")
    mlp.fit(X_train_cfg, y_train_cfg)

    y_train_mlp = mlp.predict(X_train_cfg)
    y_test_mlp = mlp.predict(X_test_cfg)
    y_train_mlp_proba = mlp.predict_proba(X_train_cfg)
    y_test_mlp_proba = mlp.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_mlp),
              metrics.accuracy_score(y_test, y_test_mlp),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_mlp, average='macro'),
              metrics.f1_score(y_test, y_test_mlp, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_mlp, average='macro'),
              metrics.recall_score(y_test, y_test_mlp, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_mlp, average='macro'),
              metrics.precision_score(y_test, y_test_mlp, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_mlp_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_mlp_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\MLP Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_mlp_proba, multi_class='ovr', average='macro')
    storeResults(
          'MLP Classifier',
          name,
          metrics.accuracy_score(y_test, y_test_mlp),
          metrics.f1_score(y_test, y_test_mlp, average='macro'),
          metrics.recall_score(y_test, y_test_mlp, average='macro'),
          metrics.precision_score(y_test, y_test_mlp, average='macro'),
          auc_score
      )
    # print("Best hyperparameters found by GridSearchCV:")
    # print(mlp.best_params_)


Running MLP Classifier with Original Data configuration...
\MLP Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score  Recall  Precision  AUC-ROC
Training      0.64      0.26    0.33       0.21     0.50
    Test      0.64      0.26    0.33       0.21     0.50

Running MLP Classifier with Normalized Data configuration...
\MLP Model Performance Metrics
Configuration Name:  Normalized Data
 Dataset  Accuracy  F1 Score  Recall  Precision  AUC-ROC
Training      0.92      0.89    0.88       0.92     0.94
    Test      0.95      0.93    0.95       0.92     0.98

Running MLP Classifier with MI configuration...
\MLP Model Performance Metrics
Configuration Name:  MI
 Dataset  Accuracy  F1 Score  Recall  Precision  AUC-ROC
Training      0.89      0.86    0.84       0.89     0.92
    Test      0.95      0.93    0.95       0.92     0.96

Running MLP Classifier with LDA configuration...
\MLP Model Performance Metrics
Configuration Name:  LDA
 Dataset  Accuracy  F

# Result


In [32]:
# Creating the dataframe
result = pd.DataFrame({
    'ML Model': ML_Model,
    'Configuration': ML_Config,
    'Accuracy': [f"{acc * 100:.3f}%" for acc in accuracy],
    'F1 Score': [f"{f1 * 100:.3f}%" for f1 in f1_score],
    'Recall': [f"{rec * 100:.3f}%" for rec in recall],
    'Precision': [f"{prec * 100:.3f}%" for prec in precision],
    'ROC_AUC': [f"{roc * 100:.3f}%" for roc in auc_roc],
})

# Remove duplicates based on model and configuration
result.drop_duplicates(subset=["ML Model", "Configuration"], inplace=True)

# Display the result
print("\n" + "=" * 100)
print("MODEL PERFORMANCE RESULTS")
print("=" * 100)
print(result.to_string(index=False))

# Save the result to a CSV file
# result.to_csv('final_results/model_results.csv', index=False)
# print("\nResults saved to model_results.csv")

# Sort by Accuracy and F1 Score
sorted_result = result.sort_values(by=['Accuracy', 'F1 Score'], ascending=False).reset_index(drop=True)

# Display the sorted result
print("\n" + "=" * 100)
print("SORTED MODEL PERFORMANCE RESULTS (by Accuracy and F1 Score)")
print("=" * 100)
print(sorted_result.to_string(index=False))

# Save the sorted result
# sorted_result.to_csv('final_results/sorted_model_results.csv', index=False)
# print("\nSorted results saved to sorted_model_results.csv")

# Extract top configuration per ML model
top_per_model = sorted_result.groupby('ML Model', as_index=False).first()

# Display and save the top configuration table
print("\n" + "=" * 100)
print("TOP CONFIGURATION PER MODEL")
print("=" * 100)
print(top_per_model.to_string(index=False))

# top_per_model.to_csv('final_results/top_configurations.csv', index=False)
# print("\nTop configuration per model saved to top_configurations.csv")


MODEL PERFORMANCE RESULTS
           ML Model   Configuration Accuracy F1 Score  Recall Precision ROC_AUC
Logistic Regression   Original Data  94.828%  92.880% 95.178%   91.579% 94.328%
Logistic Regression      SMOTETomek  89.655%  85.101% 92.475%   81.726% 95.248%
Logistic Regression Normalized Data  94.828%  92.880% 95.178%   91.579% 97.787%
Logistic Regression              MI  93.103%  91.316% 93.217%   90.769% 95.526%
Logistic Regression             LDA  86.207%  84.321% 85.374%   87.907% 95.322%
Logistic Regression          Boruta  94.828%  90.349% 95.178%   87.988% 98.035%
Logistic Regression     Autoencoder  94.828%  92.880% 95.178%   91.579% 97.786%
      Random Forest   Original Data  94.828%  90.349% 95.178%   87.988% 97.929%
      Random Forest      SMOTETomek  94.828%  90.349% 95.178%   87.988% 97.140%
      Random Forest Normalized Data  96.552%  94.385% 97.138%   92.432% 96.515%
      Random Forest              MI  93.103%  87.456% 94.277%   84.788% 96.294%
      Random 