### Imports and Setup

In [1]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from utils.models import model_mapping
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import os
import warnings
# Keep the notebook output readable: FutureWarning messages are suppressed.
warnings.filterwarnings("ignore", category=FutureWarning)

# --- Input / output locations -------------------------------------------------
# One CSV per entry of `dataset_names` is read from `data_dir`; the summary
# JSON is written under `results_dir`, which is created here if missing.
dataset_names = ['rarefied', 'clr']
data_dir = "./data"
results_dir = "./results/summaries"
os.makedirs(results_dir, exist_ok=True)

# --- Number of repeated train/test splits -------------------------------------
# Use `runs = 10` for quick iteration while coding; 100 for final experiments.
runs = 100

# --- Reproducible per-run seeds -----------------------------------------------
# The global NumPy RNG is seeded once, then one integer in [0, 10000) is drawn
# per run. Draws are with replacement, so repeated seeds are possible.
np.random.seed(42)
random_seeds = np.random.randint(low=0, high=10000, size=runs)

# Filled by the training cell: one entry per dataset name.
final_results = {}

### Looping Over Datasets

In [2]:
# Metric keys read from each model's `predict` result. The order defines the
# columns of every `model_results` array and the ranking priority used when
# choosing the best model (AUC first, the remaining metrics as tiebreakers).
metric_names = ["AUC", "Accuracy", "Precision", "Recall", "F1"]

for dataset_name in dataset_names:
    print(f"\n{'='*60}\n Running Training Pipeline on: {dataset_name}\n{'='*60}")

    # loading data: the first CSV column is the index; both diagnosis columns
    # are dropped from the features and "Diagnosis_labeled" is the target
    df = pd.read_csv(f"{data_dir}/{dataset_name}.csv", index_col=0)
    X = df.drop(columns=["Diagnosis", "Diagnosis_labeled"])
    y = df["Diagnosis_labeled"]

    # encoding labels as integers
    le = LabelEncoder()
    y = le.fit_transform(y)

    # initializing models from mapping (one instance per model, reused for
    # every run of this dataset)
    # NOTE(review): assumes `train` refits from scratch on each call, so no
    # state leaks between runs -- confirm against utils.models.
    models = {name: model_class() for name, model_class in model_mapping.items()}
    # per-model array of shape (runs, number of metrics) holding each run's scores
    model_results = {name: np.zeros((runs, len(metric_names))) for name in models}

    # Training Loop: one stratified 80/20 split per seed
    for i, seed in enumerate(tqdm(random_seeds, desc=f"    Progress ({dataset_name})")):
        # showing progress on the first run and every tenth run; tqdm.write
        # is used instead of print so the message does not break the bar
        if (i + 1) % 10 == 0 or i == 0:
            tqdm.write(f"    Run {i + 1} (seed={seed})")

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=seed, stratify=y
        )

        # preprocessing: both transformers are fitted on the training split
        # only and then applied to the test split
        vt = VarianceThreshold(threshold=0)
        X_train = vt.fit_transform(X_train)
        X_test = vt.transform(X_test)

        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # training & evaluating each model; `predict` is given the test labels
        # and returns a dict that is indexed by the keys in `metric_names`
        for name, current_model in models.items():
            current_model.train(X_train_scaled, y_train)
            metrics_dict = current_model.predict(X_test_scaled, y_test)
            model_results[name][i, :] = [metrics_dict[key] for key in metric_names]

    # computing average metrics over all runs (one vector per model)
    average_results = {
        name: np.mean(results, axis=0) for name, results in model_results.items()
    }

    # best model selection with priority on AUC while using other metrics as
    # tiebreakers: tuples compare element by element in `metric_names` order
    best_model = max(average_results, key=lambda name: tuple(average_results[name]))
    best_metrics = average_results[best_model]

    print(f"\nBest Model for {dataset_name}: {best_model}")
    print("".join(
        f"{key}: {value:.5f}\n" for key, value in zip(metric_names, best_metrics)
    ))

    # storing results in dictionary for JSON; values are converted to built-in
    # floats so serialization does not depend on the NumPy dtype
    final_results[dataset_name] = {
        "best_model": best_model,
        "metrics": {
            key.lower(): round(float(value), 5)
            for key, value in zip(metric_names, best_metrics)
        }
    }

# saving structured info for frontend
with open(f"{results_dir}/best_models.json", "w", encoding="utf-8") as f:
    json.dump(final_results, f, indent=2)

print(f"\nResults saved to: {results_dir}/best_models.json")


 Running Training Pipeline on: rarefied


    Progress (rarefied):   0%|      | 0/100 [00:00<?, ?it/s]

    Run 1 (seed=7270)


    Progress (rarefied):   9%| | 9/100 [01:40<19:28, 12.84s/

    Run 10 (seed=8322)


    Progress (rarefied):  19%|▏| 19/100 [03:18<12:50,  9.51s

    Run 20 (seed=3385)


    Progress (rarefied):  29%|▎| 29/100 [04:49<10:41,  9.03s

    Run 30 (seed=189)


    Progress (rarefied):  39%|▍| 39/100 [06:17<09:02,  8.89s

    Run 40 (seed=8838)


    Progress (rarefied):  49%|▍| 49/100 [07:50<08:05,  9.51s

    Run 50 (seed=7099)


    Progress (rarefied):  59%|▌| 59/100 [09:22<06:21,  9.30s

    Run 60 (seed=3843)


    Progress (rarefied):  69%|▋| 69/100 [10:59<04:46,  9.23s

    Run 70 (seed=1016)


    Progress (rarefied):  79%|▊| 79/100 [12:35<03:21,  9.59s

    Run 80 (seed=4859)


    Progress (rarefied):  89%|▉| 89/100 [14:10<01:48,  9.85s

    Run 90 (seed=5463)


    Progress (rarefied):  99%|▉| 99/100 [15:46<00:09,  9.87s

    Run 100 (seed=6184)


    Progress (rarefied): 100%|█| 100/100 [15:57<00:00,  9.58



Best Model for rarefied: Light Gradient Boosting Model
AUC: 0.78454
Accuracy: 0.70647
Precision: 0.71678
Recall: 0.70647
F1: 0.70160


 Running Training Pipeline on: clr


    Progress (clr):   0%|           | 0/100 [00:00<?, ?it/s]

    Run 1 (seed=7270)


    Progress (clr):   9%| | 9/100 [07:01<1:08:09, 44.94s/it]

    Run 10 (seed=8322)


    Progress (clr):  19%|▏| 19/100 [14:28<1:00:04, 44.49s/it

    Run 20 (seed=3385)


    Progress (clr):  29%|▌ | 29/100 [21:57<52:40, 44.51s/it]

    Run 30 (seed=189)


    Progress (clr):  39%|▊ | 39/100 [29:30<47:22, 46.60s/it]

    Run 40 (seed=8838)


    Progress (clr):  49%|▉ | 49/100 [36:58<37:07, 43.68s/it]

    Run 50 (seed=7099)


    Progress (clr):  59%|█▏| 59/100 [44:10<30:12, 44.20s/it]

    Run 60 (seed=3843)


    Progress (clr):  69%|█▍| 69/100 [51:30<22:56, 44.39s/it]

    Run 70 (seed=1016)


    Progress (clr):  79%|█▌| 79/100 [58:41<14:38, 41.84s/it]

    Run 80 (seed=4859)


    Progress (clr):  89%|▉| 89/100 [1:05:54<07:44, 42.22s/it

    Run 90 (seed=5463)


    Progress (clr):  99%|▉| 99/100 [1:13:01<00:41, 41.68s/it

    Run 100 (seed=6184)


    Progress (clr): 100%|█| 100/100 [1:13:46<00:00, 44.26s/i


Best Model for clr: Logistic Regression - Elastic Net
AUC: 0.77202
Accuracy: 0.71078
Precision: 0.71475
Recall: 0.71078
F1: 0.70895


Results saved to: ./results/summaries/best_models.json



