

# Importing Libraries




In [1]:
!pip install boruta category_encoders xgboost



In [2]:
!pip install boruta category_encoders xgboost catboost



!pip uninstall -y scikit-learn imbalanced-learn
# Step 1: Uninstall old versions (run twice to ensure cleanup)
!pip uninstall -y scikit-learn imbalanced-learn

# Step 2: Reinstall compatible latest versions
!pip install --upgrade --no-cache-dir scikit-learn==1.4.2 imbalanced-learn==0.12.3
!pip install scikit-learn==1.4.2 imbalanced-learn==0.12.0


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler, SMOTENC
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.under_sampling import CondensedNearestNeighbour, TomekLinks, RandomUnderSampler
from boruta import BorutaPy
from keras.models import Model, Sequential
from keras.layers import Input, Dense
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

import warnings
warnings.filterwarnings('ignore')

Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2
Found existing installation: imbalanced-learn 0.13.0
Uninstalling imbalanced-learn-0.13.0:
  Successfully uninstalled imbalanced-learn-0.13.0
[0mCollecting scikit-learn==1.4.2
  Downloading scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting imbalanced-learn==0.12.3
  Downloading imbalanced_learn-0.12.3-py3-none-any.whl.metadata (8.3 kB)
Downloading scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m204.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading imbalanced_learn-0.12.3-py3-none-any.whl (258 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.3/258.3 kB[0m [31m334.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, imba

2025-10-16 04:56:32.291560: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760590592.536875      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760590592.601085      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Dataset Load & Preprocessing

In [3]:
df = pd.read_csv("/kaggle/input/sleep-health-and-lifestyle-dataset/Sleep_health_and_lifestyle_dataset.csv")
df.fillna("None", inplace=True)

df_train, df_test = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['Sleep Disorder']
)

for d in (df_train, df_test):
    # Dividing Blood Pressure into Systolic and Diastolic BP
    d[['Systolic BP', 'Diastolic BP']] = d['Blood Pressure'].str.split('/', expand=True).astype(int)
    d.drop(['Person ID', 'Blood Pressure'], axis=1, inplace=True)

    # Labeling less number of careers as other
    d['Occupation'] = d['Occupation'].replace(['Manager', 'Sales Representative', 'Scientist', 'Software Engineer'], 'Other')

    # Adding the average BMI for the range
    d['BMI Category'] = d['BMI Category'].replace({'Normal':22, 'Normal Weight':22, 'Overweight':27, 'Obese':30})

    # Creating Interaction features
    eps = 1e-6
    d['Stress_sleep_interaction'] = d['Stress Level'] / (d['Quality of Sleep'] + eps)
    d['BMI_Activity'] = d['BMI Category'] * d['Physical Activity Level']
    d['Sleep_Heart_ratio'] = d['Sleep Duration'] / (d['Heart Rate'] + eps)
    d['Sleep_Steps_ratio'] = d['Sleep Duration'] / (d['Daily Steps'] + eps)
    d['Sleep_Stress_ratio'] = d['Sleep Duration'] / (d['Stress Level'] + eps)
    d['Pulse_Pressure'] = d['Systolic BP'] - d['Diastolic BP']
    d['log_steps'] = np.log1p(d['Daily Steps'])
    d['sqrt_sleep'] = np.sqrt(d['Sleep Duration'])
    d.replace([np.inf, -np.inf], np.nan, inplace=True)
    d.dropna(inplace=True)

# One-hot encode Occupation on train, then align test to train columns
df_train = pd.get_dummies(df_train, columns=['Occupation'], drop_first=False)
df_test  = pd.get_dummies(df_test,  columns=['Occupation'], drop_first=False)

# Ensure test has same dummy columns as train (add missing columns with 0)
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

# Label encode Gender using encoder fitted on train (no leakage)
le_gender = LabelEncoder()
df_train['Gender'] = le_gender.fit_transform(df_train['Gender'])
# transform test using the same encoder; if unseen label appears this will raise — same behavior as original approach
df_test['Gender'] = le_gender.transform(df_test['Gender'])

# Encode target (Sleep Disorder) using encoder fitted on train only
le_target = LabelEncoder()
y_train = le_target.fit_transform(df_train['Sleep Disorder'])
y_test  = le_target.transform(df_test['Sleep Disorder'])

# Prepare X_train and X_test (drop target column exactly like original)
X_train = df_train.drop('Sleep Disorder', axis=1)
X_test  = df_test.drop('Sleep Disorder', axis=1)


# Apply RobustSclaer, MI, LDA, Boruta, Autoencoder, and SMOTETomek

In [4]:
#### Pipeline 1 - Robustscaler -> MI - LDA  ####
#Normalize the data
scaler = RobustScaler()
X_train_robust = scaler.fit_transform(X_train)
X_test_robust = scaler.transform(X_test)

smotetomek = SMOTETomek(sampling_strategy='auto',
                   smote=SMOTE(k_neighbors=3, random_state=42),
                   tomek=TomekLinks(sampling_strategy='auto', n_jobs=-1),
                   n_jobs=-1,
                   random_state=42)

X_train_robust_resample, y_train_robust_resample = smotetomek.fit_resample(X_train_robust, y_train)

# Applying Mutual information
mi = SelectKBest(score_func=mutual_info_classif, k=5)
X_train_mi = mi.fit_transform(X_train_robust, y_train)
X_test_mi = mi.transform(X_test_robust)

# Applying LDA
lda = LinearDiscriminantAnalysis(n_components=2)
X_train_lda = lda.fit_transform(X_train_mi, y_train)
X_test_lda = lda.transform(X_test_mi)

# Resample MI and LDA data
X_train_mi_res, y_train_mi_res = smotetomek.fit_resample(X_train_mi, y_train)
X_train_lda_res, y_train_lda_res = smotetomek.fit_resample(X_train_lda, y_train)

#### Pipeline 2 - MinMaxscaler -> Boruta - Autoencoder  ####
#Normalize the data
scaler = MinMaxScaler()
X_train_minmax = scaler.fit_transform(X_train)
X_test_minmax = scaler.transform(X_test)

X_train_minmax_resample, y_train_minmax_resample = smotetomek.fit_resample(X_train_minmax, y_train)

# RandomForest classifier
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# Applying Boruta Feature Selection
boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=0, random_state=42)

X_train_boruta = boruta_selector.fit_transform(X_train_minmax, y_train)
X_test_boruta = boruta_selector.transform(X_test_minmax)

# applying Autoencoder
n_features = X_train_boruta.shape[1]
input_layer = Input(shape=(n_features,))
encoded     = Dense(32, activation='relu')(input_layer)
bottleneck  = Dense(16, activation='relu')(encoded)
decoded     = Dense(32, activation='relu')(bottleneck)
output_layer= Dense(n_features, activation='sigmoid')(decoded)

autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
autoencoder.fit(X_train_boruta, X_train_boruta, epochs=10, batch_size=32, verbose=0)

# Encoder-only transform (you created these; keeping intact)
encoder = Model(inputs=input_layer, outputs=bottleneck)
X_train_encoded = encoder.predict(X_train_boruta)
X_test_encoded  = encoder.predict(X_test_boruta)

# Resample MI and LDA data
X_train_boruta_res, y_train_boruta_res = smotetomek.fit_resample(X_train_boruta, y_train)
X_train_encoded_res, y_train_encoded_res = smotetomek.fit_resample(X_train_encoded, y_train)

I0000 00:00:1760590613.253126      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1760590613.253833      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5
I0000 00:00:1760590615.978304     112 service.cc:148] XLA service 0x794a30011fb0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1760590615.979181     112 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1760590615.979202     112 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1760590616.223387     112 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1760590617.140709     112 device_compiler.h:188] Compiled clust

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


# ML Model Result and parameter Storage

In [5]:
import csv

# Initialize lists to store model performance results
ML_Model = []
ML_Config = []
accuracy = []
f1_score = []
recall = []
precision = []
auc_roc = []

def storeResults(model, config, a, b, c, d, e, csv_file='model_performance.csv'):
    """
    Store model performance results in a CSV file.

    Parameters:
    model: Name of the ML model
    config: Configuration name (preprocessing steps applied)
    a: Accuracy score
    b: F1 score
    c: Recall score
    d: Precision score
    e: AUC-ROC score
    csv_file: The CSV file where the performance results will be stored
    """
    # Open the CSV file in append mode
    with open(csv_file, mode='a', newline='') as file:
        writer = csv.writer(file)

        # Write the header if the file is empty (only the first time)
        if file.tell() == 0:
            writer.writerow(['Model', 'Configuration', 'Accuracy', 'F1 Score', 'Recall', 'Precision', 'AUC-ROC'])

        # Write the model performance results to the CSV file
        writer.writerow([model, config, round(a, 6), round(b, 6), round(c, 6), round(d, 6), round(e, 6)])

    # Optionally, store in memory as well
    ML_Model.append(model)
    ML_Config.append(config)
    accuracy.append(round(a, 6))
    f1_score.append(round(b, 6))
    recall.append(round(c, 6))
    precision.append(round(d, 6))
    auc_roc.append(round(e, 6))

# Example usage:
# storeResults('SVM', 'Boruta + SMOTE+Tomek', 0.92, 0.91, 0.93, 0.90, 0.94)


best_params_dict = {}

import csv

def storeBestParams(config_name, best_params, classifier_name, csv_file='best_params.csv'):
    """
    Store the best parameters for each classifier and configuration in a CSV file.

    Parameters:
    config_name: Name of the configuration (e.g., preprocessing applied)
    best_params: Best hyperparameters found by the model
    classifier_name: Name of the classifier (e.g., 'KNN', 'SVM', etc.)
    csv_file: The CSV file where the best parameters will be stored
    """
        # Check if the classifier already exists in the dictionary, if not, initialize it
    if classifier_name not in best_params_dict:
        best_params_dict[classifier_name] = {}

    # Store the best parameters for the given configuration in the dictionary
    best_params_dict[classifier_name][config_name] = best_params
    # Open the CSV file in append mode
    with open(csv_file, mode='a', newline='') as file:
        writer = csv.writer(file)

        # Write the header if the file is empty (only the first time)
        if file.tell() == 0:
            writer.writerow(['Classifier', 'Configuration', 'Best Parameters'])

        # Write the best parameters for the given classifier and configuration
        writer.writerow([classifier_name, config_name, str(best_params)])



# Example usage:
# storeBestParams('Boruta + SMOTE+Tomek', {'C': 1, 'gamma': 'scale'}, 'SVM')




# Logistic Regression

In [6]:
##### import warnings
warnings.filterwarnings('ignore')
configurations = []

configurations.append(('Original Data', X_train, X_test, y_train))
configurations.append(('Normalized Data with RobustScaler', X_train_robust, X_test_robust, y_train))
configurations.append(('SMOTETomek + RobustScaler', X_train_robust_resample, X_test_robust, y_train_robust_resample))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('MI + SMOTETomek', X_train_mi_res, X_test_mi, y_train_mi_res))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))
configurations.append(('LDA + SMOTETomek', X_train_lda_res, X_test_lda, y_train_lda_res))
configurations.append(('Normalized Data with MinMaxScaler', X_train_minmax, X_test_minmax, y_train))
configurations.append(('SMOTETomek + MiMaxScaler', X_train_minmax_resample, X_test_minmax, y_train_minmax_resample))
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Boruta + SMOTETomek', X_train_boruta_res, X_test_boruta, y_train_boruta_res))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))
configurations.append(('Autoencoder + SMOTETomek', X_train_encoded_res, X_test_encoded, y_train_encoded_res))

params = [
    {
        'solver': ['saga'],
        'penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'C': np.linspace(0.01, 100, 10)
    },
    {
        'solver': ['lbfgs'],
        'penalty': ['l2', 'none'],
        'C': np.linspace(0.01, 100, 10)
    },
    {
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'C': np.linspace(0.01, 100, 10)
    }
]


for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Logistic Regression with {name} configuration...")
    cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
    logr = RandomizedSearchCV(LogisticRegression(max_iter=5000), params, cv=cv, n_iter=50,
                             n_jobs=-1, scoring=['accuracy', 'f1_macro'], refit='accuracy', verbose=0)
    # logr= GridSearchCV(LogisticRegression(), params, cv=5, n_jobs=-1, scoring='accuracy', verbose=2)
    logr.fit(X_train_cfg, y_train_cfg)

    y_train_lr = logr.predict(X_train_cfg)
    y_test_lr = logr.predict(X_test_cfg)
    y_train_lr_proba = logr.predict_proba(X_train_cfg)
    y_test_lr_proba = logr.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_lr),
              metrics.accuracy_score(y_test, y_test_lr),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_lr, average='macro'),
              metrics.f1_score(y_test, y_test_lr, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_lr, average='macro'),
              metrics.recall_score(y_test, y_test_lr, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_lr, average='macro'),
              metrics.precision_score(y_test, y_test_lr, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_lr_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_lr_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nLogistic Regression Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_lr_proba, multi_class='ovr', average='macro')
    storeResults(
          'Logistic Regression',
          name,
          metrics.accuracy_score(y_test, y_test_lr),
          metrics.f1_score(y_test, y_test_lr, average='macro'),
          metrics.recall_score(y_test, y_test_lr, average='macro'),
          metrics.precision_score(y_test, y_test_lr, average='macro'),
          auc_score
      )
    storeBestParams(name, logr.best_params_, "logistic regression")
    print("Best hyperparameters found by GridSearchCV:")
    print(logr.best_params_)



Running Logistic Regression with Original Data configuration...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Logistic Regression Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.913043  0.895726 0.891459   0.900717 0.956114
    Test  0.973333  0.956583 0.955556   0.962963 0.998183
Best hyperparameters found by GridSearchCV:
{'solver': 'liblinear', 'penalty': 'l1', 'C': 100.0}

Running Logistic Regression with Normalized Data with RobustScaler configuration...





Logistic Regression Model Performance Metrics
Configuration Name:  Normalized Data with RobustScaler
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.916388  0.901213 0.896836   0.905995 0.960341
    Test  0.946667  0.921421 0.927146   0.916340 0.997812
Best hyperparameters found by GridSearchCV:
{'solver': 'liblinear', 'penalty': 'l1', 'C': 88.89}

Running Logistic Regression with SMOTETomek + RobustScaler configuration...





Logistic Regression Model Performance Metrics
Configuration Name:  SMOTETomek + RobustScaler
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.902672  0.902682 0.902759   0.908230 0.970730
    Test  0.960000  0.943788 0.947980   0.947368 0.992627
Best hyperparameters found by GridSearchCV:
{'solver': 'lbfgs', 'penalty': 'l2', 'C': 22.23}

Running Logistic Regression with MI configuration...

Logistic Regression Model Performance Metrics
Configuration Name:  MI
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.886288  0.858664 0.858863   0.859107 0.918226
    Test  0.946667  0.922244 0.928535   0.916667 0.973541
Best hyperparameters found by GridSearchCV:
{'solver': 'saga', 'penalty': 'l1', 'C': 33.339999999999996}

Running Logistic Regression with MI + SMOTETomek configuration...

Logistic Regression Model Performance Metrics
Configuration Name:  MI + SMOTETomek
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.846154  0.8471




Logistic Regression Model Performance Metrics
Configuration Name:  Normalized Data with MinMaxScaler
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.913043  0.895726 0.891459   0.900717 0.953633
    Test  0.973333  0.956583 0.955556   0.962963 0.999294
Best hyperparameters found by GridSearchCV:
{'solver': 'liblinear', 'penalty': 'l1', 'C': 55.559999999999995}

Running Logistic Regression with SMOTETomek + MiMaxScaler configuration...





Logistic Regression Model Performance Metrics
Configuration Name:  SMOTETomek + MiMaxScaler
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.908397  0.908545 0.908462   0.912970 0.967040
    Test  0.946667  0.937442 0.955051   0.925146 0.981516
Best hyperparameters found by GridSearchCV:
{'solver': 'liblinear', 'penalty': 'l2', 'C': 22.23}

Running Logistic Regression with Boruta configuration...

Logistic Regression Model Performance Metrics
Configuration Name:  Boruta
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.899666  0.874035 0.869954   0.878348 0.933605
    Test  0.933333  0.900930 0.904924   0.904184 0.972546
Best hyperparameters found by GridSearchCV:
{'solver': 'liblinear', 'penalty': 'l2', 'C': 100.0}

Running Logistic Regression with Boruta + SMOTETomek configuration...





Logistic Regression Model Performance Metrics
Configuration Name:  Boruta + SMOTETomek
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.868069  0.867967 0.868175   0.870618 0.955176
    Test  0.933333  0.910146 0.919571   0.906015 0.959702
Best hyperparameters found by GridSearchCV:
{'solver': 'saga', 'penalty': 'l1', 'C': 11.12}

Running Logistic Regression with Autoencoder configuration...

Logistic Regression Model Performance Metrics
Configuration Name:  Autoencoder
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.886288  0.864413 0.858863   0.871068 0.930268
    Test  0.946667  0.935980 0.941793   0.930810 0.973504
Best hyperparameters found by GridSearchCV:
{'solver': 'liblinear', 'penalty': 'l2', 'C': 88.89}

Running Logistic Regression with Autoencoder + SMOTETomek configuration...

Logistic Regression Model Performance Metrics
Configuration Name:  Autoencoder + SMOTETomek
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Train

# KNN

In [7]:
configurations = []

configurations.append(('Original Data', X_train, X_test, y_train))
configurations.append(('Normalized Data with RobustScaler', X_train_robust, X_test_robust, y_train))
configurations.append(('SMOTETomek + RobustScaler', X_train_robust_resample, X_test_robust, y_train_robust_resample))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('MI + SMOTETomek', X_train_mi_res, X_test_mi, y_train_mi_res))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))
configurations.append(('LDA + SMOTETomek', X_train_lda_res, X_test_lda, y_train_lda_res))
configurations.append(('Normalized Data with MinMaxScaler', X_train_minmax, X_test_minmax, y_train))
configurations.append(('SMOTETomek + MiMaxScaler', X_train_minmax_resample, X_test_minmax, y_train_minmax_resample))
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Boruta + SMOTETomek', X_train_boruta_res, X_test_boruta, y_train_boruta_res))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))
configurations.append(('Autoencoder + SMOTETomek', X_train_encoded_res, X_test_encoded, y_train_encoded_res))

params = {
    'n_neighbors': np.random.randint(2, 50, 3),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
    'p': np.random.randint(1, 5, 1)
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning KNN with {name} configuration...")
    cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
    knn = RandomizedSearchCV(KNeighborsClassifier(), params, cv=cv, n_iter=50,
                             n_jobs=-1, scoring=['accuracy', 'f1_macro'], refit='accuracy', verbose=0)
    # knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train_cfg, y_train_cfg)

    y_train_knn = knn.predict(X_train_cfg)
    y_test_knn = knn.predict(X_test_cfg)
    y_train_knn_proba = knn.predict_proba(X_train_cfg)
    y_test_knn_proba = knn.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_knn),
              metrics.accuracy_score(y_test, y_test_knn),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_knn, average='macro'),
              metrics.f1_score(y_test, y_test_knn, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_knn, average='macro'),
              metrics.recall_score(y_test, y_test_knn, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_knn, average='macro'),
              metrics.precision_score(y_test, y_test_knn, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_knn_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_knn_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nKNearestNeighbors Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_knn_proba, multi_class='ovr', average='macro')
    storeResults(
          'K-Nearest Neighbors',
          name,
          metrics.accuracy_score(y_test, y_test_knn),
          metrics.f1_score(y_test, y_test_knn, average='macro'),
          metrics.recall_score(y_test, y_test_knn, average='macro'),
          metrics.precision_score(y_test, y_test_knn, average='macro'),
          auc_score
      )
    storeBestParams(name, knn.best_params_, "KNN")
    print("Best hyperparameters found by GridSearchCV:")
    print(knn.best_params_)



Running KNN with Original Data configuration...

KNearestNeighbors Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.923077  0.908611 0.904117   0.913518 0.987279
    Test  0.920000  0.895256 0.911995   0.881944 0.951633
Best hyperparameters found by GridSearchCV:
{'weights': 'distance', 'p': 2, 'n_neighbors': 17, 'metric': 'euclidean'}

Running KNN with Normalized Data with RobustScaler configuration...

KNearestNeighbors Model Performance Metrics
Configuration Name:  Normalized Data with RobustScaler
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.923077  0.908611 0.904117   0.913518 0.987279
    Test  0.946667  0.921421 0.927146   0.916340 0.956252
Best hyperparameters found by GridSearchCV:
{'weights': 'distance', 'p': 2, 'n_neighbors': 17, 'metric': 'manhattan'}

Running KNN with SMOTETomek + RobustScaler configuration...

KNearestNeighbors Model Performance Metrics
Configuratio

# Random Forest

In [8]:
configurations = []

configurations.append(('Original Data', X_train, X_test, y_train))
configurations.append(('Normalized Data with RobustScaler', X_train_robust, X_test_robust, y_train))
configurations.append(('SMOTETomek + RobustScaler', X_train_robust_resample, X_test_robust, y_train_robust_resample))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('MI + SMOTETomek', X_train_mi_res, X_test_mi, y_train_mi_res))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))
configurations.append(('LDA + SMOTETomek', X_train_lda_res, X_test_lda, y_train_lda_res))
configurations.append(('Normalized Data with MinMaxScaler', X_train_minmax, X_test_minmax, y_train))
configurations.append(('SMOTETomek + MiMaxScaler', X_train_minmax_resample, X_test_minmax, y_train_minmax_resample))
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Boruta + SMOTETomek', X_train_boruta_res, X_test_boruta, y_train_boruta_res))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))
configurations.append(('Autoencoder + SMOTETomek', X_train_encoded_res, X_test_encoded, y_train_encoded_res))

selected_features = boruta_selector.support_
optimal_features = sum(selected_features)
print(f"Optimal number of features to select using Boruta: {optimal_features}")

# Step 4: Random Forest + GridSearchCV
print("\n=== Random Forest Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': range(100, 160, 3),
    'max_depth': range(2, 20, 3),
    'min_samples_split': range(3, 7, 1),
    'min_samples_leaf': range(3, 7, 1),
    'max_features': ['sqrt'],
    'bootstrap': [False],
    'class_weight': ['balanced'],
    'max_leaf_nodes': range(20, 40, 5),
    'min_impurity_decrease': np.linspace(0.001, 0.05, 3),
    'ccp_alpha': np.linspace(0.001, 0.07, 3),
    'criterion': ['gini', 'entropy', 'log_loss']
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Random Forest with {name} configuration...")
    cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
    rf = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=cv, n_jobs=-1,
                            n_iter=50, scoring=['accuracy', 'f1_macro'], refit='accuracy', verbose=0)
    # rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train_cfg, y_train_cfg)

    y_train_rf = rf.predict(X_train_cfg)
    y_test_rf = rf.predict(X_test_cfg)
    y_train_rf_proba = rf.predict_proba(X_train_cfg)
    y_test_rf_proba = rf.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_rf),
              metrics.accuracy_score(y_test, y_test_rf),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_rf, average='macro'),
              metrics.f1_score(y_test, y_test_rf, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_rf, average='macro'),
              metrics.recall_score(y_test, y_test_rf, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_rf, average='macro'),
              metrics.precision_score(y_test, y_test_rf, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_rf_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_rf_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nRandom Forest Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_rf_proba, multi_class='ovr', average='macro')
    storeResults(
          'Random Forest',
          name,
          metrics.accuracy_score(y_test, y_test_rf),
          metrics.f1_score(y_test, y_test_rf, average='macro'),
          metrics.recall_score(y_test, y_test_rf, average='macro'),
          metrics.precision_score(y_test, y_test_rf, average='macro'),
          auc_score
      )
    storeBestParams(name, rf.best_params_,  "Random forest")
    print("Best hyperparameters found by GridSearchCV:")
    print(rf.best_params_)


Optimal number of features to select using Boruta: 11

=== Random Forest Model Performance with Hyperparameter Tuning ===

Running Random Forest with Original Data configuration...

Random Forest Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.903010  0.881326 0.878802   0.885183 0.939791
    Test  0.946667  0.919978 0.925758   0.921727 0.995633
Best hyperparameters found by GridSearchCV:
{'n_estimators': 145, 'min_samples_split': 3, 'min_samples_leaf': 6, 'min_impurity_decrease': 0.05, 'max_leaf_nodes': 30, 'max_features': 'sqrt', 'max_depth': 11, 'criterion': 'log_loss', 'class_weight': 'balanced', 'ccp_alpha': 0.035500000000000004, 'bootstrap': False}

Running Random Forest with Normalized Data with RobustScaler configuration...

Random Forest Model Performance Metrics
Configuration Name:  Normalized Data with RobustScaler
 Dataset  Accuracy  F1 Score  Recall  Precision  AUC-ROC
Training  0.903010  0.

# XGBoost




In [9]:
configurations = []

configurations.append(('Original Data', X_train, X_test, y_train))
configurations.append(('Normalized Data with RobustScaler', X_train_robust, X_test_robust, y_train))
configurations.append(('SMOTETomek + RobustScaler', X_train_robust_resample, X_test_robust, y_train_robust_resample))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('MI + SMOTETomek', X_train_mi_res, X_test_mi, y_train_mi_res))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))
configurations.append(('LDA + SMOTETomek', X_train_lda_res, X_test_lda, y_train_lda_res))
configurations.append(('Normalized Data with MinMaxScaler', X_train_minmax, X_test_minmax, y_train))
configurations.append(('SMOTETomek + MiMaxScaler', X_train_minmax_resample, X_test_minmax, y_train_minmax_resample))
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Boruta + SMOTETomek', X_train_boruta_res, X_test_boruta, y_train_boruta_res))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))
configurations.append(('Autoencoder + SMOTETomek', X_train_encoded_res, X_test_encoded, y_train_encoded_res))

selected_features = boruta_selector.support_
optimal_features = sum(selected_features)
print(f"Optimal number of features to select using Boruta: {optimal_features}")

# Step 4: XGBoost + GridSearchCV
print("\n=== XGBoost Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'booster': ['gbtree'],  # Specify booster type
    'learning_rate': np.linspace(0.0001, 0.1, 10),  # Learning rate tuning
    'n_estimators': range(50, 500, 10),  # Number of estimators (trees)
    'max_depth': range(2, 50, 10),  # Depth of trees
    'min_child_weight': range(1, 10, 1),  # Minimum weight of children
    'gamma': np.linspace(0, 0.1, 3),  # Gamma (for pruning)
    'subsample': np.linspace(0.1, 1, 10),  # Subsample ratio
    'colsample_bytree': [0.3, 0.8],  # Subsample ratio for tree
    'colsample_bylevel': [1.0],  # Subsample ratio for level
    'colsample_bynode': [0.6, 0.8],  # Subsample ratio for node
    'max_delta_step': [0, 5],  # Maximum delta step for optimization
    
    'reg_alpha': np.linspace(0.1, 1, 1),  # L1 regularization
    'reg_lambda': np.linspace(0.1, 1, 1) # L2 regularization
   # 'scale_pos_weight': [1, 2, 5],  # Handling class imbalance

    # Remove booster-specific parameters if using gbtree
    # 'sample_type': ["weighted"], 
    # 'normalize_type': ["tree", "forest"],
    # 'rate_drop': [0, 0.1],
    # 'skip_drop': [0, 0.1]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning XGBoost with {name} configuration...")
    cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
    xgb = RandomizedSearchCV(XGBClassifier(), param_grid, n_iter=50, cv=cv,
                             n_jobs=-1, scoring=['accuracy', 'f1_macro'], refit='accuracy', verbose=0)
    # xgb = XGBClassifier()
    xgb.fit(X_train_cfg, y_train_cfg)

    y_train_xg = xgb.predict(X_train_cfg)
    y_test_xg = xgb.predict(X_test_cfg)
    y_train_xg_proba = xgb.predict_proba(X_train_cfg)
    y_test_xg_proba = xgb.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_xg),
              metrics.accuracy_score(y_test, y_test_xg),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_xg, average='macro'),
              metrics.f1_score(y_test, y_test_xg, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_xg, average='macro'),
              metrics.recall_score(y_test, y_test_xg, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_xg, average='macro'),
              metrics.precision_score(y_test, y_test_xg, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_xg_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_xg_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nXGBoost Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_xg_proba, multi_class='ovr', average='macro')
    storeResults(
        'XGBoost Model',
        name,
        metrics.accuracy_score(y_test, y_test_xg),
        metrics.f1_score(y_test, y_test_xg, average='macro'),
        metrics.recall_score(y_test, y_test_xg, average='macro'),
        metrics.precision_score(y_test, y_test_xg, average='macro'),
        auc_score
    )
    storeBestParams(name, xgb.best_params_, "xgboost")
    print("Best hyperparameters found by GridSearchCV:")
    print(xgb.best_params_)


Optimal number of features to select using Boruta: 11

=== XGBoost Model Performance with Hyperparameter Tuning ===

Running XGBoost with Original Data configuration...

XGBoost Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.916388  0.901116 0.896836   0.906622 0.984194
    Test  0.973333  0.956944 0.956944   0.956944 0.993471
Best hyperparameters found by GridSearchCV:
{'subsample': 0.8, 'reg_lambda': 0.1, 'reg_alpha': 0.1, 'n_estimators': 260, 'min_child_weight': 1, 'max_depth': 32, 'max_delta_step': 5, 'learning_rate': 0.033400000000000006, 'gamma': 0.1, 'colsample_bytree': 0.3, 'colsample_bynode': 0.8, 'colsample_bylevel': 1.0, 'booster': 'gbtree'}

Running XGBoost with Normalized Data with RobustScaler configuration...

XGBoost Model Performance Metrics
Configuration Name:  Normalized Data with RobustScaler
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.916388  0.901116 0.896

# xgboost 2

In [10]:
configurations = []

configurations.append(('Original Data', X_train, X_test, y_train))
configurations.append(('Normalized Data with RobustScaler', X_train_robust, X_test_robust, y_train))
configurations.append(('SMOTETomek + RobustScaler', X_train_robust_resample, X_test_robust, y_train_robust_resample))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('MI + SMOTETomek', X_train_mi_res, X_test_mi, y_train_mi_res))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))
configurations.append(('LDA + SMOTETomek', X_train_lda_res, X_test_lda, y_train_lda_res))
configurations.append(('Normalized Data with MinMaxScaler', X_train_minmax, X_test_minmax, y_train))
configurations.append(('SMOTETomek + MiMaxScaler', X_train_minmax_resample, X_test_minmax, y_train_minmax_resample))
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Boruta + SMOTETomek', X_train_boruta_res, X_test_boruta, y_train_boruta_res))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))
configurations.append(('Autoencoder + SMOTETomek', X_train_encoded_res, X_test_encoded, y_train_encoded_res))

selected_features = boruta_selector.support_
optimal_features = sum(selected_features)
print(f"Optimal number of features to select using Boruta: {optimal_features}")

# Step 4: XGBoost + GridSearchCV
print("\n=== XGBoost2 Model Performance with Hyperparameter Tuning ===")

param_grid = [
    {
        'booster': ['gbtree'],
        'learning_rate': np.linspace(0.0001, 0.1, 10),
        'n_estimators': range(50, 500, 50),  # Reduced step size for faster search
        'max_depth': range(2, 50, 10),
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'gamma': [0, 0.1, 0.5, 1],
        'min_child_weight': [1, 3, 5]
    },
    {
        'booster': ['dart'],
        'learning_rate': np.linspace(0.0001, 0.1, 10),
        'n_estimators': range(50, 500, 50),
        'max_depth': range(2, 50, 10),
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'gamma': [0, 0.1, 0.5, 1],
        'min_child_weight': [1, 3, 5],
        'rate_drop': [0.1, 0.3, 0.5],
        'skip_drop': [0.1, 0.3, 0.5]
    },
    {
        'booster': ['gblinear'],
        'learning_rate': np.linspace(0.0001, 0.1, 10),
        'n_estimators': range(50, 500, 50),
        'reg_alpha': [0, 0.01, 0.1, 1],
        'reg_lambda': [0, 0.01, 0.1, 1]
        # No max_depth for gblinear (it's a linear model)
    }
]

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning XGBoost2 with {name} configuration...")
    cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
    xgb = RandomizedSearchCV(XGBClassifier(), param_grid, n_iter=50, cv=cv,
                             n_jobs=-1, scoring=['accuracy', 'f1_macro'], refit='accuracy', verbose=0)
    # xgb = XGBClassifier()
    xgb.fit(X_train_cfg, y_train_cfg)

    y_train_xg = xgb.predict(X_train_cfg)
    y_test_xg = xgb.predict(X_test_cfg)
    y_train_xg_proba = xgb.predict_proba(X_train_cfg)
    y_test_xg_proba = xgb.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_xg),
              metrics.accuracy_score(y_test, y_test_xg),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_xg, average='macro'),
              metrics.f1_score(y_test, y_test_xg, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_xg, average='macro'),
              metrics.recall_score(y_test, y_test_xg, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_xg, average='macro'),
              metrics.precision_score(y_test, y_test_xg, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_xg_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_xg_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nXGBoost2 Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_xg_proba, multi_class='ovr', average='macro')
    storeResults(
        'XGBoost2 Model',
        name,
        metrics.accuracy_score(y_test, y_test_xg),
        metrics.f1_score(y_test, y_test_xg, average='macro'),
        metrics.recall_score(y_test, y_test_xg, average='macro'),
        metrics.precision_score(y_test, y_test_xg, average='macro'),
        auc_score
    )
    storeBestParams(name, xgb.best_params_, "XGboost2")
    print("Best hyperparameters found by GridSearchCV:")
    print(xgb.best_params_)


Optimal number of features to select using Boruta: 11

=== XGBoost2 Model Performance with Hyperparameter Tuning ===

Running XGBoost2 with Original Data configuration...

XGBoost2 Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.916388  0.901116 0.896836   0.906622 0.978982
    Test  0.960000  0.935214 0.934722   0.936975 0.996400
Best hyperparameters found by GridSearchCV:
{'subsample': 0.6, 'n_estimators': 350, 'min_child_weight': 3, 'max_depth': 22, 'learning_rate': 0.033400000000000006, 'gamma': 0, 'colsample_bytree': 1.0, 'booster': 'gbtree'}

Running XGBoost2 with Normalized Data with RobustScaler configuration...

XGBoost2 Model Performance Metrics
Configuration Name:  Normalized Data with RobustScaler
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.916388  0.901116 0.896836   0.906622 0.980616
    Test  0.973333  0.956944 0.956944   0.956944 0.996383
Best hyperparameters fou

# Gradient Boosting

In [11]:
configurations = []

configurations.append(('Original Data', X_train, X_test, y_train))
configurations.append(('Normalized Data with RobustScaler', X_train_robust, X_test_robust, y_train))
configurations.append(('SMOTETomek + RobustScaler', X_train_robust_resample, X_test_robust, y_train_robust_resample))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('MI + SMOTETomek', X_train_mi_res, X_test_mi, y_train_mi_res))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))
configurations.append(('LDA + SMOTETomek', X_train_lda_res, X_test_lda, y_train_lda_res))
configurations.append(('Normalized Data with MinMaxScaler', X_train_minmax, X_test_minmax, y_train))
configurations.append(('SMOTETomek + MiMaxScaler', X_train_minmax_resample, X_test_minmax, y_train_minmax_resample))
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Boruta + SMOTETomek', X_train_boruta_res, X_test_boruta, y_train_boruta_res))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))
configurations.append(('Autoencoder + SMOTETomek', X_train_encoded_res, X_test_encoded, y_train_encoded_res))

selected_features = boruta_selector.support_
optimal_features = sum(selected_features)
print(f"Optimal number of features to select using Boruta: {optimal_features}")

# configurations.append(('SMOTETomek', X_train_resample, X_test_normalized, y_train_resample))

# Step 4: Gradient Boosting + GridSearchCV
print("\n=== Gradient Boosting Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'loss': ['log_loss'],
    'learning_rate': np.linspace(0.0001, 0.1, 10),
    'n_estimators': range(40, 400, 10),
    'subsample': np.linspace(0.1, 0.9, 3),
    'max_depth': range(20, 60, 5),
    'init': [None],
    'max_leaf_nodes': [None],
    'min_samples_split': range(2, 20, 3),
    'min_samples_leaf': range(2, 10, 3),
    'min_weight_fraction_leaf': [0.0],
    'min_impurity_decrease': [0.0],
    'validation_fraction': [0.1],
    'n_iter_no_change': [None],
    'tol': np.linspace(0.001, 0.05, 5),
    'ccp_alpha': np.linspace(0.005, 0.05, 5),
    'max_features': ['sqrt'],
    'verbose': [0],
    'warm_start': [False],
    'criterion': ['friedman_mse'],
    # 'random_state': [0]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Gradient Boosting with {name} configuration...")
    cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
    gbc = RandomizedSearchCV(GradientBoostingClassifier(), param_grid, cv=cv,
                             n_iter=50, n_jobs=-1, scoring=['accuracy', 'f1_macro'], refit='accuracy', verbose=0)
    # gbc = GradientBoostingClassifier(random_state=42)
    gbc.fit(X_train_cfg, y_train_cfg)

    y_train_gb = gbc.predict(X_train_cfg)
    y_test_gb = gbc.predict(X_test_cfg)
    y_train_gb_proba = gbc.predict_proba(X_train_cfg)
    y_test_gb_proba = gbc.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_gb),
              metrics.accuracy_score(y_test, y_test_gb),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_gb, average='macro'),
              metrics.f1_score(y_test, y_test_gb, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_gb, average='macro'),
              metrics.recall_score(y_test, y_test_gb, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_gb, average='macro'),
              metrics.precision_score(y_test, y_test_gb, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_gb_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_gb_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nGradien Boosting Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_gb_proba, multi_class='ovr', average='macro')
    storeResults(
          'Gradient Boosting',
          name,
          metrics.accuracy_score(y_test, y_test_gb),
          metrics.f1_score(y_test, y_test_gb, average='macro'),
          metrics.recall_score(y_test, y_test_gb, average='macro'),
          metrics.precision_score(y_test, y_test_gb, average='macro'),
          auc_score
      )
    storeBestParams(name, gbc.best_params_, "Gradient Boosting")
    print("Best hyperparameters found by GridSearchCV:")
    print(gbc.best_params_)


Optimal number of features to select using Boruta: 11

=== Gradient Boosting Model Performance with Hyperparameter Tuning ===

Running Gradient Boosting with Original Data configuration...

Gradien Boosting Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.892977  0.864521 0.859201   0.873098 0.937592
    Test  0.973333  0.963280 0.958333   0.971759 0.998588
Best hyperparameters found by GridSearchCV:
{'warm_start': False, 'verbose': 0, 'validation_fraction': 0.1, 'tol': 0.037750000000000006, 'subsample': 0.1, 'n_iter_no_change': None, 'n_estimators': 370, 'min_weight_fraction_leaf': 0.0, 'min_samples_split': 5, 'min_samples_leaf': 8, 'min_impurity_decrease': 0.0, 'max_leaf_nodes': None, 'max_features': 'sqrt', 'max_depth': 35, 'loss': 'log_loss', 'learning_rate': 0.0889, 'init': None, 'criterion': 'friedman_mse', 'ccp_alpha': 0.01625}

Running Gradient Boosting with Normalized Data with RobustScaler confi

# Extra Trees

In [12]:
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))
configurations.append(('Normalized Data with RobustScaler', X_train_robust, X_test_robust, y_train))
configurations.append(('SMOTETomek + RobustScaler', X_train_robust_resample, X_test_robust, y_train_robust_resample))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('MI + SMOTETomek', X_train_mi_res, X_test_mi, y_train_mi_res))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))
configurations.append(('LDA + SMOTETomek', X_train_lda_res, X_test_lda, y_train_lda_res))
configurations.append(('Normalized Data with MinMaxScaler', X_train_minmax, X_test_minmax, y_train))
configurations.append(('SMOTETomek + MiMaxScaler', X_train_minmax_resample, X_test_minmax, y_train_minmax_resample))
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Boruta + SMOTETomek', X_train_boruta_res, X_test_boruta, y_train_boruta_res))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))
configurations.append(('Autoencoder + SMOTETomek', X_train_encoded_res, X_test_encoded, y_train_encoded_res))

selected_features = boruta_selector.support_
optimal_features = sum(selected_features)
print(f"Optimal number of features to select using Boruta: {optimal_features}")

# configurations.append(('SMOTETomek', X_train_resample, X_test_normalized, y_train_resample))

# Step 4: Extra Trees + GridSearchCV
print("\n=== Extra Trees Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': range(300, 500, 13),
    'max_depth': range(30, 50, 3),
    'max_leaf_nodes': range(30, 60, 8),
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [2, 3, 4],
    'min_weight_fraction_leaf': [0.0],
    'min_impurity_decrease': [0.0],
    'ccp_alpha': np.linspace(0.001, 0.05, 11),
    'max_features': ['sqrt', 'log2'],
    'class_weight': [None],
    'bootstrap': [True, False],
    'oob_score': [True, False],
    'criterion': ['gini', 'log_loss'],
    # 'random_state': range(2, 10, 1),
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Extra Trees with {name} configuration...")
    cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
    etc = RandomizedSearchCV(ExtraTreesClassifier(), param_grid, cv=cv, n_iter=50,
                             n_jobs=-1, scoring=["accuracy", "f1_macro"], refit='accuracy', verbose=0)
    # etc = ExtraTreesClassifier(random_state=42)
    etc.fit(X_train_cfg, y_train_cfg)

    y_train_et = etc.predict(X_train_cfg)
    y_test_et = etc.predict(X_test_cfg)
    y_train_et_proba = etc.predict_proba(X_train_cfg)
    y_test_et_proba = etc.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_et),
              metrics.accuracy_score(y_test, y_test_et),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_et, average='macro'),
              metrics.f1_score(y_test, y_test_et, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_et, average='macro'),
              metrics.recall_score(y_test, y_test_et, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_et, average='macro'),
              metrics.precision_score(y_test, y_test_et, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_et_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_et_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nExtraTrees Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_et_proba, multi_class='ovr', average='macro')
    storeResults(
          'Extra Trees',
          name,
          metrics.accuracy_score(y_test, y_test_et),
          metrics.f1_score(y_test, y_test_et, average='macro'),
          metrics.recall_score(y_test, y_test_et, average='macro'),
          metrics.precision_score(y_test, y_test_et, average='macro'),
          auc_score
      )
    storeBestParams(name, etc.best_params_, 'Extra Trees')
    print("Best hyperparameters found by GridSearchCV:")
    print(etc.best_params_)


Optimal number of features to select using Boruta: 11

=== Extra Trees Model Performance with Hyperparameter Tuning ===

Running Extra Trees with Original Data configuration...

ExtraTrees Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.916388  0.901213 0.896836   0.905995 0.982110
    Test  0.960000  0.935214 0.934722   0.936975 0.997123
Best hyperparameters found by GridSearchCV:
{'oob_score': False, 'n_estimators': 378, 'min_weight_fraction_leaf': 0.0, 'min_samples_split': 4, 'min_samples_leaf': 2, 'min_impurity_decrease': 0.0, 'max_leaf_nodes': 54, 'max_features': 'sqrt', 'max_depth': 33, 'criterion': 'log_loss', 'class_weight': None, 'ccp_alpha': 0.001, 'bootstrap': False}

Running Extra Trees with Normalized Data with RobustScaler configuration...

ExtraTrees Model Performance Metrics
Configuration Name:  Normalized Data with RobustScaler
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Tr

# ADABoost

In [13]:
configurations = []

configurations.append(('Original Data', X_train, X_test, y_train))
configurations.append(('Normalized Data with RobustScaler', X_train_robust, X_test_robust, y_train))
configurations.append(('SMOTETomek + RobustScaler', X_train_robust_resample, X_test_robust, y_train_robust_resample))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('MI + SMOTETomek', X_train_mi_res, X_test_mi, y_train_mi_res))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))
configurations.append(('LDA + SMOTETomek', X_train_lda_res, X_test_lda, y_train_lda_res))
configurations.append(('Normalized Data with MinMaxScaler', X_train_minmax, X_test_minmax, y_train))
configurations.append(('SMOTETomek + MiMaxScaler', X_train_minmax_resample, X_test_minmax, y_train_minmax_resample))
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Boruta + SMOTETomek', X_train_boruta_res, X_test_boruta, y_train_boruta_res))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))
configurations.append(('Autoencoder + SMOTETomek', X_train_encoded_res, X_test_encoded, y_train_encoded_res))

selected_features = boruta_selector.support_
optimal_features = sum(selected_features)
print(f"Optimal number of features to select using Boruta: {optimal_features}")

# Step 4: AdaBoost + GridSearchCV
print("\n=== AdaBoost Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'n_estimators': range(100, 500, 13), #[50, 150],
    'algorithm': ['SAMME'], 
    'learning_rate': np.linspace(0.01, 0.05, 3), #[0.005, 0.5, 0.03, 0.003],
    'estimator__max_depth': range(2, 10, 3), #[5, 20],
    'estimator__min_samples_split': range(1, 5, 1), #[8],
    'random_state': range(20, 60) #[42, 1234]
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning AdaBoost with {name} configuration...")
    cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
    adb = RandomizedSearchCV(AdaBoostClassifier(estimator=DecisionTreeClassifier()), param_grid, cv=cv, n_iter=50, n_jobs=-1,
                             scoring=['accuracy', 'f1_macro'], refit='accuracy', verbose=0)
    # adb = AdaBoostClassifier(random_state=42)
    adb.fit(X_train_cfg, y_train_cfg)

    y_train_ad = adb.predict(X_train_cfg)
    y_test_ad = adb.predict(X_test_cfg)
    y_train_ad_proba = adb.predict_proba(X_train_cfg)
    y_test_ad_proba = adb.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_ad),
              metrics.accuracy_score(y_test, y_test_ad),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_ad, average='macro'),
              metrics.f1_score(y_test, y_test_ad, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_ad, average='macro'),
              metrics.recall_score(y_test, y_test_ad, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_ad, average='macro'),
              metrics.precision_score(y_test, y_test_ad, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_ad_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_ad_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nAdaBoost Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_ad_proba, multi_class='ovr', average='macro')
    storeResults(
          'AdaBoost',
          name,
          metrics.accuracy_score(y_test, y_test_ad),
          metrics.f1_score(y_test, y_test_ad, average='macro'),
          metrics.recall_score(y_test, y_test_ad, average='macro'),
          metrics.precision_score(y_test, y_test_ad, average='macro'),
          auc_score
      )
    storeBestParams(name, adb.best_params_, 'AdaBoost')
    print("Best hyperparameters found by GridSearchCV:")
    print(adb.best_params_)

Optimal number of features to select using Boruta: 11

=== AdaBoost Model Performance with Hyperparameter Tuning ===

Running AdaBoost with Original Data configuration...

AdaBoost Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.916388  0.901116 0.896836   0.906622 0.985429
    Test  0.933333  0.892473 0.893056   0.893056 0.977374
Best hyperparameters found by GridSearchCV:
{'random_state': 33, 'n_estimators': 191, 'learning_rate': 0.03, 'estimator__min_samples_split': 3, 'estimator__max_depth': 5, 'algorithm': 'SAMME'}

Running AdaBoost with Normalized Data with RobustScaler configuration...

AdaBoost Model Performance Metrics
Configuration Name:  Normalized Data with RobustScaler
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.916388  0.901213 0.896836   0.905995 0.982728
    Test  0.946667  0.913889 0.915278   0.917367 0.986676
Best hyperparameters found by GridSearchCV:
{'random

# MLP

In [14]:
configurations = []

configurations.append(('Original Data', X_train, X_test, y_train))
configurations.append(('Normalized Data with RobustScaler', X_train_robust, X_test_robust, y_train))
configurations.append(('SMOTETomek + RobustScaler', X_train_robust_resample, X_test_robust, y_train_robust_resample))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('MI + SMOTETomek', X_train_mi_res, X_test_mi, y_train_mi_res))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))
configurations.append(('LDA + SMOTETomek', X_train_lda_res, X_test_lda, y_train_lda_res))
configurations.append(('Normalized Data with MinMaxScaler', X_train_minmax, X_test_minmax, y_train))
configurations.append(('SMOTETomek + MiMaxScaler', X_train_minmax_resample, X_test_minmax, y_train_minmax_resample))
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Boruta + SMOTETomek', X_train_boruta_res, X_test_boruta, y_train_boruta_res))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))
configurations.append(('Autoencoder + SMOTETomek', X_train_encoded_res, X_test_encoded, y_train_encoded_res))

mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='sgd',
    alpha=0.01,
    batch_size='auto',
    learning_rate='constant',
    max_iter=1000,
    random_state=42,
verbose=False)

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning MLP Classifier with {name} configuration...")
    mlp.fit(X_train_cfg, y_train_cfg)

    y_train_mlp = mlp.predict(X_train_cfg)
    y_test_mlp = mlp.predict(X_test_cfg)
    y_train_mlp_proba = mlp.predict_proba(X_train_cfg)
    y_test_mlp_proba = mlp.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_mlp),
              metrics.accuracy_score(y_test, y_test_mlp),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_mlp, average='macro'),
              metrics.f1_score(y_test, y_test_mlp, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_mlp, average='macro'),
              metrics.recall_score(y_test, y_test_mlp, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_mlp, average='macro'),
              metrics.precision_score(y_test, y_test_mlp, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_mlp_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_mlp_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\MLP Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_mlp_proba, multi_class='ovr', average='macro')
    storeResults(
          'MLP Classifier',
          name,
          metrics.accuracy_score(y_test, y_test_mlp),
          metrics.f1_score(y_test, y_test_mlp, average='macro'),
          metrics.recall_score(y_test, y_test_mlp, average='macro'),
          metrics.precision_score(y_test, y_test_mlp, average='macro'),
          auc_score
      )
    # print("Best hyperparameters found by GridSearchCV:")
    # print(mlp.best_params_)


Running MLP Classifier with Original Data configuration...
\MLP Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.585284  0.246132 0.333333   0.195095 0.500000
    Test  0.613333  0.324786 0.375000   0.534247 0.537142

Running MLP Classifier with Normalized Data with RobustScaler configuration...
\MLP Model Performance Metrics
Configuration Name:  Normalized Data with RobustScaler
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.909699  0.890223 0.886083   0.895565 0.942245
    Test  0.960000  0.934392 0.933333   0.947368 0.995289

Running MLP Classifier with SMOTETomek + RobustScaler configuration...
\MLP Model Performance Metrics
Configuration Name:  SMOTETomek + RobustScaler
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.896947  0.896847 0.897011   0.902984 0.968769
    Test  0.946667  0.922095 0.925758   0.933333 0.987494

Running MLP Classifier with MI conf

# LightGBM

In [15]:
import lightgbm as lgb
import warnings
import os

# Suppress all warnings
warnings.filterwarnings("ignore")

# Set LightGBM verbosity to suppress training logs
os.environ['LIGHTGBM_VERBOSITY'] = '-1'

configurations = []


configurations.append(('Original Data', X_train, X_test, y_train))
configurations.append(('Normalized Data with RobustScaler', X_train_robust, X_test_robust, y_train))
configurations.append(('SMOTETomek + RobustScaler', X_train_robust_resample, X_test_robust, y_train_robust_resample))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('MI + SMOTETomek', X_train_mi_res, X_test_mi, y_train_mi_res))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))
configurations.append(('LDA + SMOTETomek', X_train_lda_res, X_test_lda, y_train_lda_res))
configurations.append(('Normalized Data with MinMaxScaler', X_train_minmax, X_test_minmax, y_train))
configurations.append(('SMOTETomek + MiMaxScaler', X_train_minmax_resample, X_test_minmax, y_train_minmax_resample))
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Boruta + SMOTETomek', X_train_boruta_res, X_test_boruta, y_train_boruta_res))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))
configurations.append(('Autoencoder + SMOTETomek', X_train_encoded_res, X_test_encoded, y_train_encoded_res))

selected_features = boruta_selector.support_
optimal_features = sum(selected_features)
print(f"Optimal number of features to select using Boruta: {optimal_features}")

# Step 4: LightGBM + GridSearchCV
print("\n=== LightGBM Model Performance with Hyperparameter Tuning ===")

param_grid = {
    'num_leaves': range(20, 150, 5),               # Number of leaves in the tree
    'max_depth': range(3, 15, 2),                 # Maximum depth of the tree
    'learning_rate': np.linspace(0.01, 0.2, 2),         # Learning rate
    'n_estimators': range(50, 500, 5),            # Number of boosting iterations
    'min_child_samples': range(2, 100, 3),       # Minimum data in a leaf
    'subsample': np.linspace(0.05, 0.5, 5),              # Fraction of data to be used for training
    'colsample_bytree': np.linspace(0.05, 0.5, 5),       # Fraction of features to be used for training
    'reg_alpha': np.linspace(0.1, 1, 1),
    'reg_lambda': np.linspace(0.1, 1, 1),
}

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning LightGBM with {name} configuration...")
    cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
    lgbm = RandomizedSearchCV(LGBMClassifier(verbose=-1), param_grid, cv=cv, n_iter=50, n_jobs=-1,
                             scoring=['accuracy', 'f1_macro'], refit='accuracy', verbose=0)
    lgbm.fit(X_train_cfg, y_train_cfg)

    y_train_lg = lgbm.predict(X_train_cfg)
    y_test_lg = lgbm.predict(X_test_cfg)
    y_train_lg_proba = lgbm.predict_proba(X_train_cfg)
    y_test_lg_proba = lgbm.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_lg),
              metrics.accuracy_score(y_test, y_test_lg),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_lg, average='macro'),
              metrics.f1_score(y_test, y_test_lg, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_lg, average='macro'),
              metrics.recall_score(y_test, y_test_lg, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_lg, average='macro'),
              metrics.precision_score(y_test, y_test_lg, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_lg_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_lg_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\nLightGBM Model Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_lg_proba, multi_class='ovr', average='macro')
    storeResults(
          'LightGBM',
          name,
          metrics.accuracy_score(y_test, y_test_lg),
          metrics.f1_score(y_test, y_test_lg, average='macro'),
          metrics.recall_score(y_test, y_test_lg, average='macro'),
          metrics.precision_score(y_test, y_test_lg, average='macro'),
          auc_score
      )
    storeBestParams(name, lgbm.best_params_, 'LightGBM')
    print("Best hyperparameters found by GridSearchCV:")
    print(lgbm.best_params_)

Optimal number of features to select using Boruta: 11

=== LightGBM Model Performance with Hyperparameter Tuning ===

Running LightGBM with Original Data configuration...

LightGBM Model Performance Metrics
Configuration Name:  Original Data
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.916388  0.901213 0.896836   0.905995 0.964426
    Test  0.973333  0.956944 0.956944   0.956944 0.997830
Best hyperparameters found by GridSearchCV:
{'subsample': 0.3875, 'reg_lambda': 0.1, 'reg_alpha': 0.1, 'num_leaves': 20, 'n_estimators': 490, 'min_child_samples': 2, 'max_depth': 7, 'learning_rate': 0.01, 'colsample_bytree': 0.05}

Running LightGBM with Normalized Data with RobustScaler configuration...

LightGBM Model Performance Metrics
Configuration Name:  Normalized Data with RobustScaler
 Dataset  Accuracy  F1 Score   Recall  Precision  AUC-ROC
Training  0.916388  0.901116 0.896836   0.906622 0.974745
    Test  0.960000  0.935214 0.934722   0.936975 0.996383
Best hyperpara

# Stacking classifier

In [16]:

'''
configurations = []
configurations.append(('Original Data', X_train, X_test, y_train))
configurations.append(('Normalized Data with RobustScaler', X_train_robust, X_test_robust, y_train))
configurations.append(('SMOTETomek + RobustScaler', X_train_robust_resample, X_test_robust, y_train_robust_resample))
configurations.append(('MI', X_train_mi, X_test_mi, y_train))
configurations.append(('MI + SMOTETomek', X_train_mi_res, X_test_mi, y_train_mi_res))
configurations.append(('LDA', X_train_lda, X_test_lda, y_train))
configurations.append(('LDA + SMOTETomek', X_train_lda_res, X_test_lda, y_train_lda_res))
configurations.append(('Normalized Data with MinMaxScaler', X_train_minmax, X_test_minmax, y_train))
configurations.append(('SMOTETomek + MiMaxScaler', X_train_minmax_resample, X_test_minmax, y_train_minmax_resample))
configurations.append(('Boruta', X_train_boruta, X_test_boruta, y_train))
configurations.append(('Boruta + SMOTETomek', X_train_boruta_res, X_test_boruta, y_train_boruta_res))
configurations.append(('Autoencoder', X_train_encoded, X_test_encoded, y_train))
configurations.append(('Autoencoder + SMOTETomek', X_train_encoded_res, X_test_encoded, y_train_encoded_res))

xgb_params = {'subsample': np.float64(0.1), 'skip_drop': 0, 'scale_pos_weight': 2, 'sample_type': 'weighted',
              'reg_lambda': np.float64(0.1), 'reg_alpha': np.float64(0.1), 'rate_drop': 0.1,
              'normalize_type': 'forest', 'n_estimators': 410, 'min_child_weight': 1, 'max_depth': 2,
              'max_delta_step': 5, 'learning_rate': np.float64(0.07780000000000001), 'gamma': np.float64(0.05),
              'estimator__n_estimators': 150, 'colsample_bytree': 0.3, 'colsample_bynode': 0.8,
              'colsample_bylevel': 1.0, 'booster': 'gbtree'}

rf_params = {'n_estimators': 142, 'min_samples_split': 4, 'min_samples_leaf': 5,
             'min_impurity_decrease': np.float64(0.001), 'max_leaf_nodes': 25, 'max_features': 'sqrt',
             'max_depth': 14, 'criterion': 'entropy', 'class_weight': 'balanced', 'ccp_alpha': np.float64(0.001),
             'bootstrap': False}

gbc_params = {'warm_start': False, 'verbose': 0, 'validation_fraction': 0.1, 'tol': np.float64(0.05),
              'subsample': np.float64(0.1), 'n_iter_no_change': None, 'n_estimators': 380,
              'min_weight_fraction_leaf': 0.0, 'min_samples_split': 14, 'min_samples_leaf': 5,
              'min_impurity_decrease': 0.0, 'max_leaf_nodes': None, 'max_features': 'sqrt', 'max_depth': 50, 'loss': 'log_loss',
              'learning_rate': np.float64(0.1), 'init': None, 'criterion': 'friedman_mse', 'ccp_alpha': np.float64(0.005)}

knn_params = {'weights': 'distance', 'p': np.int64(1), 'n_neighbors': np.int64(11), 'metric': 'euclidean'}

logr_params = {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 970, 'C': np.float64(66.67)}

etc_params = {'oob_score': True, 'n_estimators': 482, 'min_weight_fraction_leaf': 0.0, 'min_samples_split': 3,
              'min_samples_leaf': 2, 'min_impurity_decrease': 0.0, 'max_leaf_nodes': 38, 'max_features': 'log2',
              'max_depth': 48, 'criterion': 'log_loss', 'class_weight': None, 'ccp_alpha': np.float64(0.001),
              'bootstrap': True}
base_dt = DecisionTreeClassifier(max_depth=5, min_samples_split=4)
# adb_params = {'random_state': 54, 'n_estimators': 165, 'learning_rate': np.float64(0.01)}

estimators = [
    ('xgb', XGBClassifier(**xgb_params)),
    ('rf', RandomForestClassifier(**rf_params)),
    ('gbc', GradientBoostingClassifier(**gbc_params)),
    ('knn', KNeighborsClassifier(**knn_params)),
    ('etc', ExtraTreesClassifier(**etc_params)),
    ('adb', AdaBoostClassifier(estimator=base_dt, random_state=54, n_estimators=165, learning_rate=0.01)),
    ('logr', LogisticRegression(**logr_params))
]

log_reg_params = {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 816, 'C': np.float64(0.01)}
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(**log_reg_params),
    cv = 5,
    stack_method = "predict_proba",
    n_jobs=-1
    )

for name, X_train_cfg, X_test_cfg, y_train_cfg in configurations:
    print(f"\nRunning Stacking Classifier with {name} configuration...")
    stacking_clf.fit(X_train_cfg, y_train_cfg)

    y_train_stc = stacking_clf.predict(X_train_cfg)
    y_test_stc = stacking_clf.predict(X_test_cfg)
    y_train_stc_proba = stacking_clf.predict_proba(X_train_cfg)
    y_test_stc_proba = stacking_clf.predict_proba(X_test_cfg)

    metrics_dict = {
          "Dataset": ["Training", "Test"],
          "Accuracy": [
              metrics.accuracy_score(y_train_cfg, y_train_stc),
              metrics.accuracy_score(y_test, y_test_stc),
          ],
          "F1 Score": [
              metrics.f1_score(y_train_cfg, y_train_stc, average='macro'),
              metrics.f1_score(y_test, y_test_stc, average='macro'),
          ],
          "Recall": [
              metrics.recall_score(y_train_cfg, y_train_stc, average='macro'),
              metrics.recall_score(y_test, y_test_stc, average='macro'),
          ],
          "Precision": [
              metrics.precision_score(y_train_cfg, y_train_stc, average='macro'),
              metrics.precision_score(y_test, y_test_stc, average='macro'),
          ],
          "AUC-ROC": [
              metrics.roc_auc_score(pd.get_dummies(y_train_cfg), y_train_stc_proba, multi_class='ovr', average='macro'),
              metrics.roc_auc_score(pd.get_dummies(y_test), y_test_stc_proba, multi_class='ovr', average='macro'),
          ]
      }

    df_metrics = pd.DataFrame(metrics_dict)
    print("\Stacking Classifier Performance Metrics")
    print("Configuration Name: ", name)
    print(df_metrics.to_string(index=False))

    auc_score = metrics.roc_auc_score(pd.get_dummies(y_test), y_test_stc_proba, multi_class='ovr', average='macro')
    storeResults(
          'Stacking Classifier',
          name,
          metrics.accuracy_score(y_test, y_test_stc),
          metrics.f1_score(y_test, y_test_stc, average='macro'),
          metrics.recall_score(y_test, y_test_stc, average='macro'),
          metrics.precision_score(y_test, y_test_stc, average='macro'),
          auc_score
      )
'''

'\nconfigurations = []\nconfigurations.append((\'Original Data\', X_train, X_test, y_train))\nconfigurations.append((\'Normalized Data with RobustScaler\', X_train_robust, X_test_robust, y_train))\nconfigurations.append((\'SMOTETomek + RobustScaler\', X_train_robust_resample, X_test_robust, y_train_robust_resample))\nconfigurations.append((\'MI\', X_train_mi, X_test_mi, y_train))\nconfigurations.append((\'MI + SMOTETomek\', X_train_mi_res, X_test_mi, y_train_mi_res))\nconfigurations.append((\'LDA\', X_train_lda, X_test_lda, y_train))\nconfigurations.append((\'LDA + SMOTETomek\', X_train_lda_res, X_test_lda, y_train_lda_res))\nconfigurations.append((\'Normalized Data with MinMaxScaler\', X_train_minmax, X_test_minmax, y_train))\nconfigurations.append((\'SMOTETomek + MiMaxScaler\', X_train_minmax_resample, X_test_minmax, y_train_minmax_resample))\nconfigurations.append((\'Boruta\', X_train_boruta, X_test_boruta, y_train))\nconfigurations.append((\'Boruta + SMOTETomek\', X_train_boruta_re

# Result


In [17]:
# Creating the dataframe
result = pd.DataFrame({
    'ML Model': ML_Model,
    'Configuration': ML_Config,
    'Accuracy': [f"{acc * 100:.3f}%" for acc in accuracy],
    'F1 Score': [f"{f1 * 100:.3f}%" for f1 in f1_score],
    'Recall': [f"{rec * 100:.3f}%" for rec in recall],
    'Precision': [f"{prec * 100:.3f}%" for prec in precision],
    'ROC_AUC': [f"{roc * 100:.3f}%" for roc in auc_roc],
})

# Remove duplicates based on model and configuration
result.drop_duplicates(subset=["ML Model", "Configuration"], inplace=True)

# Display the result
print("\n" + "=" * 100)
print("MODEL PERFORMANCE RESULTS")
print("=" * 100)
print(result.to_string(index=False))

# Save the result to a CSV file
# result.to_csv('final_results/model_results.csv', index=False)
# print("\nResults saved to model_results.csv")

# Sort by Accuracy and F1 Score
sorted_result = result.sort_values(by=['Accuracy', 'F1 Score'], ascending=False).reset_index(drop=True)

# Display the sorted result
print("\n" + "=" * 100)
print("SORTED MODEL PERFORMANCE RESULTS (by Accuracy and F1 Score)")
print("=" * 100)
print(sorted_result.to_string(index=False))

# Save the sorted result
# sorted_result.to_csv('final_results/sorted_model_results.csv', index=False)
# print("\nSorted results saved to sorted_model_results.csv")

# Extract top configuration per ML model
top_per_model = sorted_result.groupby('ML Model', as_index=False).first()

# Display and save the top configuration table
print("\n" + "=" * 100)
print("TOP CONFIGURATION PER MODEL")
print("=" * 100)
print(top_per_model.to_string(index=False))

# top_per_model.to_csv('final_results/top_configurations.csv', index=False)
# print("\nTop configuration per model saved to top_configurations.csv")


MODEL PERFORMANCE RESULTS
           ML Model                     Configuration Accuracy F1 Score  Recall Precision ROC_AUC
Logistic Regression                     Original Data  97.333%  95.658% 95.556%   96.296% 99.818%
Logistic Regression Normalized Data with RobustScaler  94.667%  92.142% 92.715%   91.634% 99.781%
Logistic Regression         SMOTETomek + RobustScaler  96.000%  94.379% 94.798%   94.737% 99.263%
Logistic Regression                                MI  94.667%  92.224% 92.853%   91.667% 97.354%
Logistic Regression                   MI + SMOTETomek  82.667%  80.626% 84.571%   79.625% 95.948%
Logistic Regression                               LDA  92.000%  89.715% 91.338%   89.006% 95.821%
Logistic Regression                  LDA + SMOTETomek  81.333%  79.893% 83.813%   79.833% 95.357%
Logistic Regression Normalized Data with MinMaxScaler  97.333%  95.658% 95.556%   96.296% 99.929%
Logistic Regression          SMOTETomek + MiMaxScaler  94.667%  93.744% 95.505%   92.515% 9