In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import torch
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.metrics import roc_auc_score
import torch
import torch.nn.functional as F
from torch.cuda.amp import autocast, GradScaler

BATCH_SIZE = 1024
remake_datasets = False

In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import Dataset, DataLoader
import torch
from collections import defaultdict

# ========== Step 1: Load and Normalize ==========

def load_and_merge_data(folder_path):
    print("Loading and cleaning datasets")
    cols = list(pd.read_csv("/kaggle/input/unsw-nb15/NUSW-NB15_features.csv", encoding='cp1252')["Name"])
    files = [f"UNSW-NB15_{i}.csv" for i in range(1, 5)]
    df = pd.concat([pd.read_csv(os.path.join(folder_path, f), header = None) for f in files], ignore_index=True)
    df.columns = cols   
    df['attack_cat'] = df['attack_cat'].astype(str).str.strip().str.upper()
    df.replace({'attack_cat': ['Nan', '', ' ', 'NAN', 'NaN', 'nan']}, value='BENIGN', inplace=True)
    return df

# ========== Step 2: Preprocess and Stratified Split ==========

def preprocess_and_split(df, label_col='attack_cat'):
    print("Processing and splitting")
    df = df.dropna(subset=[label_col])
    df[label_col] = df[label_col].astype(str)

    # Strip whitespace from all string-like fields
    df = df.map(lambda x: x.strip() if isinstance(x, str) else x)

    # Replace blank strings with NaN
    df.replace(r'^\s*$', np.nan, regex=True, inplace=True)

    # Encode categorical features
    label_encoders = {}
    for col in ['state', 'service', 'proto']:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

    # Encode multiclass target
    target_encoder = LabelEncoder()
    df[label_col] = target_encoder.fit_transform(df[label_col].astype(str))
    label_encoders[label_col] = target_encoder

    # Drop metadata columns
    drop_cols = ['id', 'srcip', 'sport', 'dstip', 'dsport', 'Label']
    df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True, errors='ignore')

    # Drop rows with any remaining NaNs
    df = df.fillna(0)

    # Split
    
    train_val, test = train_test_split(df, test_size=0.2, stratify=df[label_col], random_state=42)
    train, val = train_test_split(train_val, test_size=0.2, stratify=train_val[label_col], random_state=42)


        # After the split
    print("🔍 Classes in full data:", sorted(df[label_col].unique()))
    print("🧪 Classes in train:", sorted(train[label_col].unique()))
    print("🧪 Classes in val:", sorted(val[label_col].unique()))
    print("🧪 Classes in test:", sorted(test[label_col].unique()))


    return train.reset_index(drop=True), val.reset_index(drop=True), test.reset_index(drop=True), label_encoders


# ========== Step 3: Custom Dataset ==========

class UNSWDataset(Dataset):
    def __init__(self, df, scaler, label_col='attack_cat'):
        self.labels = torch.tensor(df[label_col].values, dtype=torch.long)
        features = df.drop(columns=[label_col], errors='ignore')
        self.features = torch.tensor(scaler.transform(features), dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]


# ========== Step 4: Class-Balanced Sampler ==========
class BalancedSampler:
    def __init__(self, label_col='attack_cat', max_samples=10_000, random_state=42):
        self.label_col = label_col
        self.max_samples = max_samples
        self.random_state = random_state

    def sample(self, df):
        sampled_frames = []
        for label in sorted(df[self.label_col].unique()):
            class_df = df[df[self.label_col] == label]
            sample_size = min(len(class_df), self.max_samples)
            sampled = class_df.sample(n=sample_size, random_state=self.random_state)
            sampled_frames.append(sampled)
        starter_df = pd.concat(sampled_frames).reset_index(drop=True)
        return starter_df

class StratifiedBatchSampler(torch.utils.data.Sampler):
    def __init__(self, labels, batch_size = BATCH_SIZE):
        self.labels = np.array(labels)
        self.batch_size = batch_size
        self.label_to_indices = defaultdict(list)

        for idx, label in enumerate(self.labels):
            self.label_to_indices[label].append(idx)

        self.class_probs = {
            label: len(indices) / len(labels)
            for label, indices in self.label_to_indices.items()
        }

    def __iter__(self):
        all_indices = []
        num_classes = len(self.label_to_indices)

        # Approximate how many of each class to sample per batch
        per_batch = {
            label: max(1, int(self.class_probs[label] * self.batch_size))
            for label in self.label_to_indices
        }

        num_batches = len(self)
        for _ in range(num_batches):
            batch = []
            for label, indices in self.label_to_indices.items():
                selected = np.random.choice(indices, per_batch[label], replace=True)
                batch.extend(selected)
            np.random.shuffle(batch)
            yield batch[:self.batch_size]  # trim excess if needed

    def __len__(self):
        return len(self.labels) // self.batch_size

# ========== Step 5: Example Usage ==========

def build_dataloaders(folder_path, batch_size=BATCH_SIZE):
    df = load_and_merge_data(folder_path)
    train_df, val_df, test_df, label_encoder = preprocess_and_split(df)

    print("Building Starter Dataset")
    starter_dataset = UNSWDataset(starter_df)
    print("Building Trianing Dataset")
    train_dataset = UNSWDataset(train_df)
    print("Building Validation Dataset")
    val_dataset = UNSWDataset(val_df)
    print("Building Testing Dataset")
    test_dataset = UNSWDataset(test_df)

    train_sampler = StratifiedBatchSampler(train_df['attack_cat'].values, batch_size)

    input_dim = train_df.drop(columns=['attack_cat'], errors='ignore').shape[1]
    num_classes = train_df['attack_cat'].nunique()

    print("Done")
    return {
        "train": DataLoader(train_dataset, batch_sampler=train_sampler, pin_memory=True, num_workers=4),
        "val": DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=4),
        "test": DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=4),
        "label_encoder": label_encoder,
        "input_dim": input_dim,
        "num_classes": num_classes
    }

In [4]:
remake_datasets = True
file_paths = [
    '/kaggle/working/train_dataset.pt',
    '/kaggle/working/val_dataset.pt',
    '/kaggle/working/test_dataset.pt',
    '/kaggle/working/starter_dataset.pt'
]
dataset_names = ['train_dataset', 'val_dataset', 'test_dataset', 'starter_dataset']

all_exist = all(os.path.exists(p) for p in file_paths)

if not all_exist or remake_datasets:
    if not all(var in globals() for var in ['df', 'train_df', 'val_df', 'test_df', 'starter_df']):
        df = load_and_merge_data("/kaggle/input/unsw-nb15")
        train_df, val_df, test_df, label_encoder = preprocess_and_split(df)
        sampler = BalancedSampler(label_col='attack_cat', max_samples=10_000)
        starter_df = sampler.sample(train_df).reset_index(drop=True)



    scaler = StandardScaler()
    scaler.fit(train_df.drop(columns=['attack_cat'], errors='ignore'))

    print("Building Training Dataset")
    train_dataset = UNSWDataset(train_df, scaler=scaler)
    torch.save(train_dataset, "/kaggle/working/train_dataset.pt")

    print("Building Starter Dataset")
    starter_dataset = UNSWDataset(starter_df, scaler=scaler)
    torch.save(starter_dataset, "/kaggle/working/starter_dataset.pt")

    print("Building Validation Dataset")
    val_dataset = UNSWDataset(val_df, scaler=scaler)
    torch.save(val_dataset, "/kaggle/working/val_dataset.pt")

    print("Building Testing Dataset")
    test_dataset = UNSWDataset(test_df, scaler=scaler)
    torch.save(test_dataset, "/kaggle/working/test_dataset.pt")

else:
    for name in dataset_names:
        globals().pop(name, None)

    train_dataset = torch.load("/kaggle/working/train_dataset.pt")
    val_dataset = torch.load("/kaggle/working/val_dataset.pt")
    starter_dataset = torch.load("/kaggle/working/starter_dataset.pt")
    test_dataset = torch.load("/kaggle/working/test_dataset.pt")

# You need labels for the sampler;
train_sampler = StratifiedBatchSampler(train_df['attack_cat'].values, BATCH_SIZE)

input_dim = train_df.drop(columns=['attack_cat'], errors='ignore').shape[1]
num_classes = train_df['attack_cat'].nunique()

loaders = {
    "train": DataLoader(train_dataset, batch_sampler=train_sampler, pin_memory=True, num_workers=4),
    "val": DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True, num_workers=4),
    "test": DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True, num_workers=4),
    "start": DataLoader(starter_dataset, batch_size=max(BATCH_SIZE, len(starter_dataset)), shuffle=True, pin_memory=True, num_workers=4),
    "label_encoder": label_encoder,
    "input_dim": input_dim,
    "num_classes": num_classes
}

Loading and cleaning datasets


  df = pd.concat([pd.read_csv(os.path.join(folder_path, f), header = None) for f in files], ignore_index=True)
  df = pd.concat([pd.read_csv(os.path.join(folder_path, f), header = None) for f in files], ignore_index=True)


Processing and splitting
🔍 Classes in full data: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
🧪 Classes in train: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
🧪 Classes in val: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
🧪 Classes in test: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Building Training Dataset
Building Starter Dataset
Building Validation Dataset
Building Testing Dataset


In [5]:
train_df['attack_cat'].value_counts()

attack_cat
3     1420008
7      137908
5       28496
6       15517
4       10466
8        8951
0        1714
1        1149
9         967
2         342
10        111
Name: count, dtype: int64

In [6]:
starter_df['attack_cat'].value_counts()

attack_cat
3     10000
4     10000
5     10000
6     10000
7     10000
8      8951
0      1714
1      1149
9       967
2       342
10      111
Name: count, dtype: int64

In [7]:
train_loader = loaders["train"]
start_loader = loaders["start"]
val_loader = loaders["val"]
test_loader = loaders["test"]
label_encoder = loaders["label_encoder"]

# Calculate number of input features (after dropping label columns)
input_dim = loaders["input_dim"]

# Number of unique output classes
num_classes = loaders["num_classes"]
NUM_CLASSES = num_classes
print(f"Input dimension: {input_dim}")
print(f"Number of output classes: {num_classes}")

Input dimension: 43
Number of output classes: 11


In [8]:
print("Label classes:", list(label_encoder['attack_cat'].classes_))

Label classes: ['ANALYSIS', 'BACKDOOR', 'BACKDOORS', 'BENIGN', 'DOS', 'EXPLOITS', 'FUZZERS', 'GENERIC', 'RECONNAISSANCE', 'SHELLCODE', 'WORMS']


In [9]:
class SimpleNN(torch.nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.model = torch.nn.Sequential(
            torch.nn.Linear(input_dim, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, num_classes),
            # torch.nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)
# Training function
scaler = torch.amp.GradScaler('cuda')

def train_model(loader):
    model.train()
    running_loss = 0.0
    loop = tqdm(loader, desc="Training", leave=True)
    for inputs, labels in loop:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        with torch.autocast('cuda'):
            outputs = model(inputs)
            loss = criterion(outputs, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item() * inputs.size(0)
        # print("OUTPUT SHAPE:", outputs.shape)
        # print("LABEL SHAPE:", labels.shape)
    return running_loss / len(loader.dataset)


# Evaluation function
def evaluate_model(loader):
    model.eval()
    running_loss = 0.0
    all_outputs = []
    all_labels = []

    with torch.no_grad():
        loop = tqdm(loader, desc="Evaluating", leave=True)
        for inputs, labels in loop:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)  # raw logits
            loss = criterion(outputs, labels)
            running_loss += loss.item() * inputs.size(0)

            probs = F.softmax(outputs, dim=1)  # convert to probabilities
            all_outputs.append(probs.cpu())
            all_labels.append(labels.cpu())

    all_outputs = torch.cat(all_outputs)
    all_labels = torch.cat(all_labels)
    return running_loss / len(loader.dataset), all_outputs, all_labels



In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
criterion = torch.nn.CrossEntropyLoss()
model = SimpleNN(input_dim=input_dim, num_classes=NUM_CLASSES).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
CURRICULUM_LEARNING_END_EPOCH = 10 # 0 omits use of the starter datasets

cpu


In [11]:
from sklearn.metrics import roc_auc_score, f1_score, classification_report
from sklearn.metrics import accuracy_score


def runit():
    num_epochs = 60
    train_losses = []
    val_losses = []
    val_aucs = []
    test_aucs = []
    
    best_val_auc = 0
    epochs_since_improvement = 0
    best_model_path = 'best_model.pth'

    for epoch in range(1, num_epochs + 1):
        print(f'epoch: ', epoch)
        print("Training")
        if epoch < CURRICULUM_LEARNING_END_EPOCH:
            train_loss = train_model(start_loader)
        else:
            train_loss = train_model(train_loader)
        val_loss, val_outputs, val_labels = evaluate_model(val_loader)
        test_loss, test_outputs, test_labels = evaluate_model(test_loader)

        # Convert to NumPy
        val_outputs_np = val_outputs.detach().cpu().numpy()
        val_labels_np = val_labels.detach().cpu().numpy()
        test_outputs_np = test_outputs.detach().cpu().numpy()
        test_labels_np = test_labels.detach().cpu().numpy()

        # Compute multiclass AUC
        val_auc = roc_auc_score(val_labels_np, val_outputs_np, multi_class='ovr', average='macro')
        test_auc = roc_auc_score(test_labels_np, test_outputs_np, multi_class='ovr', average='macro')

        # Save stats
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_aucs.append(val_auc)
        test_aucs.append(test_auc)

        # Save model every epoch
        torch.save(model.state_dict(), f'model_epoch_{epoch:02d}.pth')

        # Check for improvement
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            torch.save(model.state_dict(), best_model_path)
            epochs_since_improvement = 0
            print(f'✅ New best model saved at epoch {epoch} with val_auc = {val_auc:.4f}')
        else:
            epochs_since_improvement += 1
            print(f'⚠️  No improvement for {epochs_since_improvement} epoch(s)')
            if epochs_since_improvement >= 5:
                print(f'🔁 Reverting to best model from epoch {epoch - epochs_since_improvement}')
                model.load_state_dict(torch.load(best_model_path))
                for g in optimizer.param_groups:
                    g['lr'] = g['lr'] * 0.5
                print(f'💡 Learning rate halved to {g["lr"]:.6f}')
                epochs_since_improvement = 0

        # Predict class labels (argmax over logits or probabilities)
        val_preds = val_outputs_np.argmax(axis=1)
        test_preds = test_outputs_np.argmax(axis=1)

        val_acc_sample = accuracy_score(val_labels_np, val_preds)
        test_acc_sample = accuracy_score(test_labels_np, test_preds)
        
        # Category-weighted accuracy (macro average)
        unique_classes, test_class_counts = np.unique(test_labels_np, return_counts=True)
        weights = test_class_counts / test_class_counts.sum()
        
        per_class_acc = []
        for c in unique_classes:
            class_mask = (test_labels_np == c)
            class_acc = (test_preds[class_mask] == test_labels_np[class_mask]).mean()
            per_class_acc.append(class_acc)
        
        test_acc_macro = np.mean(per_class_acc)
        test_acc_weighted = np.sum(np.array(per_class_acc) * weights)
        
        # Macro F1 score
        val_f1 = f1_score(val_labels_np, val_preds, average='macro')
        test_f1 = f1_score(test_labels_np, test_preds, average='macro')

        print(f'Epoch: {epoch:02d}, Loss: {train_loss:.4f}')
        print(f'Val AUC: {val_auc:.4f} | Val F1: {val_f1:.4f}')
        print(f'Test AUC: {test_auc:.4f} | Test F1: {test_f1:.4f}')
        print(f"Sample-Weighted Accuracy: {test_acc_sample:.4f}")
        print(f"Category-Weighted Accuracy (Macro): {test_acc_macro:.4f}")
        print(f"Category-Weighted Accuracy (Weighted by Class Size): {test_acc_weighted:.4f}")

        # Optional: per-class breakdown for test set
        print("Per-Class Test Metrics:")
        print(classification_report(test_labels_np, test_preds, digits=4))

        print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"Memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


In [None]:
runit()

epoch:  1
Training


Training:   0%|          | 0/1 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 1 with val_auc = 0.5348
Epoch: 01, Loss: 2.3661
Val AUC: 0.5348 | Val F1: 0.0070
Test AUC: 0.5344 | Test F1: 0.0069
Sample-Weighted Accuracy: 0.0156
Category-Weighted Accuracy (Macro): 0.0625
Category-Weighted Accuracy (Weighted by Class Size): 0.0156
Per-Class Test Metrics:


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       535
           1     0.0000    0.0000    0.0000       359
           2     0.0000    0.0000    0.0000       107
           3     0.9914    0.0026    0.0052    443753
           4     0.0000    0.0000    0.0000      3271
           5     0.0063    0.0715    0.0116      8905
           6     0.0093    0.5306    0.0183      4849
           7     0.0276    0.0822    0.0413     43096
           8     0.0000    0.0000    0.0000      2798
           9     0.0000    0.0000    0.0000       302
          10     0.0000    0.0000    0.0000        35

    accuracy                         0.0156    508010
   macro avg     0.0941    0.0625    0.0069    508010
weighted avg     0.8686    0.0156    0.0084    508010

Memory allocated: 0.00 GB
Memory reserved: 0.00 GB
epoch:  2
Training


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training:   0%|          | 0/1 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 2 with val_auc = 0.5803
Epoch: 02, Loss: 2.3471
Val AUC: 0.5803 | Val F1: 0.0078
Test AUC: 0.5803 | Test F1: 0.0079
Sample-Weighted Accuracy: 0.0177
Category-Weighted Accuracy (Macro): 0.0727
Category-Weighted Accuracy (Weighted by Class Size): 0.0177
Per-Class Test Metrics:


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       535
           1     0.0000    0.0000    0.0000       359
           2     0.0000    0.0000    0.0000       107
           3     0.9937    0.0028    0.0056    443753
           4     0.0000    0.0000    0.0000      3271
           5     0.0068    0.0810    0.0125      8905
           6     0.0112    0.6228    0.0221      4849
           7     0.0307    0.0932    0.0462     43096
           8     0.0000    0.0000    0.0000      2798
           9     0.0000    0.0000    0.0000       302
          10     0.0000    0.0000    0.0000        35

    accuracy                         0.0177    508010
   macro avg     0.0948    0.0727    0.0079    508010
weighted avg     0.8708    0.0177    0.0093    508010

Memory allocated: 0.00 GB
Memory reserved: 0.00 GB
epoch:  3
Training


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training:   0%|          | 0/1 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 3 with val_auc = 0.6089
Epoch: 03, Loss: 2.3289
Val AUC: 0.6089 | Val F1: 0.0083
Test AUC: 0.6096 | Test F1: 0.0084
Sample-Weighted Accuracy: 0.0191
Category-Weighted Accuracy (Macro): 0.0770
Category-Weighted Accuracy (Weighted by Class Size): 0.0191
Per-Class Test Metrics:


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       535
           1     0.0000    0.0000    0.0000       359
           2     0.0000    0.0000    0.0000       107
           3     0.9955    0.0030    0.0060    443753
           4     0.0000    0.0000    0.0000      3271
           5     0.0072    0.0912    0.0133      8905
           6     0.0123    0.6498    0.0242      4849
           7     0.0323    0.1028    0.0492     43096
           8     0.0000    0.0000    0.0000      2798
           9     0.0000    0.0000    0.0000       302
          10     0.0000    0.0000    0.0000        35

    accuracy                         0.0191    508010
   macro avg     0.0952    0.0770    0.0084    508010
weighted avg     0.8726    0.0191    0.0099    508010

Memory allocated: 0.00 GB
Memory reserved: 0.00 GB
epoch:  4
Training


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training:   0%|          | 0/1 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 4 with val_auc = 0.6266
Epoch: 04, Loss: 2.3112
Val AUC: 0.6266 | Val F1: 0.0090
Test AUC: 0.6278 | Test F1: 0.0090
Sample-Weighted Accuracy: 0.0206
Category-Weighted Accuracy (Macro): 0.0805
Category-Weighted Accuracy (Weighted by Class Size): 0.0206
Per-Class Test Metrics:


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       535
           1     0.0000    0.0000    0.0000       359
           2     0.0000    0.0000    0.0000       107
           3     0.9973    0.0033    0.0066    443753
           4     0.0000    0.0000    0.0000      3271
           5     0.0082    0.1109    0.0152      8905
           6     0.0134    0.6595    0.0264      4849
           7     0.0325    0.1112    0.0503     43096
           8     0.0000    0.0000    0.0000      2798
           9     0.0000    0.0000    0.0000       302
          10     0.0000    0.0000    0.0000        35

    accuracy                         0.0206    508010
   macro avg     0.0956    0.0805    0.0090    508010
weighted avg     0.8742    0.0206    0.0106    508010

Memory allocated: 0.00 GB
Memory reserved: 0.00 GB
epoch:  5
Training


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training:   0%|          | 0/1 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 5 with val_auc = 0.6385
Epoch: 05, Loss: 2.2939
Val AUC: 0.6385 | Val F1: 0.0098
Test AUC: 0.6404 | Test F1: 0.0099
Sample-Weighted Accuracy: 0.0232
Category-Weighted Accuracy (Macro): 0.0839
Category-Weighted Accuracy (Weighted by Class Size): 0.0232
Per-Class Test Metrics:


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       535
           1     0.0000    0.0000    0.0000       359
           2     0.0000    0.0000    0.0000       107
           3     0.9968    0.0050    0.0099    443753
           4     0.0000    0.0000    0.0000      3271
           5     0.0091    0.1306    0.0171      8905
           6     0.0140    0.6676    0.0275      4849
           7     0.0351    0.1199    0.0543     43096
           8     0.0000    0.0000    0.0000      2798
           9     0.0000    0.0000    0.0000       302
          10     0.0000    0.0000    0.0000        35

    accuracy                         0.0232    508010
   macro avg     0.0959    0.0839    0.0099    508010
weighted avg     0.8740    0.0232    0.0138    508010

Memory allocated: 0.00 GB
Memory reserved: 0.00 GB
epoch:  6
Training


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training:   0%|          | 0/1 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 6 with val_auc = 0.6469
Epoch: 06, Loss: 2.2767
Val AUC: 0.6469 | Val F1: 0.0117
Test AUC: 0.6484 | Test F1: 0.0117
Sample-Weighted Accuracy: 0.0291
Category-Weighted Accuracy (Macro): 0.0878
Category-Weighted Accuracy (Weighted by Class Size): 0.0291
Per-Class Test Metrics:


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       535
           1     0.0000    0.0000    0.0000       359
           2     0.0000    0.0000    0.0000       107
           3     0.9984    0.0101    0.0200    443753
           4     0.0000    0.0000    0.0000      3271
           5     0.0101    0.1503    0.0189      8905
           6     0.0142    0.6733    0.0279      4849
           7     0.0404    0.1319    0.0618     43096
           8     0.0000    0.0000    0.0000      2798
           9     0.0000    0.0000    0.0000       302
          10     0.0000    0.0000    0.0000        35

    accuracy                         0.0291    508010
   macro avg     0.0967    0.0878    0.0117    508010
weighted avg     0.8759    0.0291    0.0233    508010

Memory allocated: 0.00 GB
Memory reserved: 0.00 GB
epoch:  7
Training


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training:   0%|          | 0/1 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 7 with val_auc = 0.6521
Epoch: 07, Loss: 2.2596
Val AUC: 0.6521 | Val F1: 0.0151
Test AUC: 0.6527 | Test F1: 0.0151
Sample-Weighted Accuracy: 0.0400
Category-Weighted Accuracy (Macro): 0.0932
Category-Weighted Accuracy (Weighted by Class Size): 0.0400
Per-Class Test Metrics:


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       535
           1     0.0000    0.0000    0.0000       359
           2     0.0000    0.0000    0.0000       107
           3     0.9992    0.0201    0.0393    443753
           4     0.0028    0.0012    0.0017      3271
           5     0.0112    0.1762    0.0211      8905
           6     0.0144    0.6748    0.0282      4849
           7     0.0503    0.1525    0.0756     43096
           8     0.0000    0.0000    0.0000      2798
           9     0.0000    0.0000    0.0000       302
          10     0.0000    0.0000    0.0000        35

    accuracy                         0.0400    508010
   macro avg     0.0980    0.0932    0.0151    508010
weighted avg     0.8774    0.0400    0.0414    508010

Memory allocated: 0.00 GB
Memory reserved: 0.00 GB
epoch:  8
Training


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training:   0%|          | 0/1 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 8 with val_auc = 0.6550
Epoch: 08, Loss: 2.2423
Val AUC: 0.6550 | Val F1: 0.0208
Test AUC: 0.6548 | Test F1: 0.0208
Sample-Weighted Accuracy: 0.0568
Category-Weighted Accuracy (Macro): 0.1012
Category-Weighted Accuracy (Weighted by Class Size): 0.0568
Per-Class Test Metrics:


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       535
           1     0.0000    0.0000    0.0000       359
           2     0.0000    0.0000    0.0000       107
           3     0.9996    0.0337    0.0652    443753
           4     0.0031    0.0018    0.0023      3271
           5     0.0118    0.1996    0.0223      8905
           6     0.0146    0.6721    0.0287      4849
           7     0.0753    0.2061    0.1103     43096
           8     0.0000    0.0000    0.0000      2798
           9     0.0000    0.0000    0.0000       302
          10     0.0000    0.0000    0.0000        35

    accuracy                         0.0568    508010
   macro avg     0.1004    0.1012    0.0208    508010
weighted avg     0.8799    0.0568    0.0670    508010

Memory allocated: 0.00 GB
Memory reserved: 0.00 GB
epoch:  9
Training


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training:   0%|          | 0/1 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 9 with val_auc = 0.6558
Epoch: 09, Loss: 2.2249
Val AUC: 0.6558 | Val F1: 0.0277
Test AUC: 0.6555 | Test F1: 0.0280
Sample-Weighted Accuracy: 0.0794
Category-Weighted Accuracy (Macro): 0.1103
Category-Weighted Accuracy (Weighted by Class Size): 0.0794
Per-Class Test Metrics:


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       535
           1     0.0000    0.0000    0.0000       359
           2     0.0000    0.0000    0.0000       107
           3     0.9998    0.0528    0.1003    443753
           4     0.0044    0.0034    0.0038      3271
           5     0.0124    0.2145    0.0235      8905
           6     0.0150    0.6707    0.0294      4849
           7     0.1044    0.2721    0.1509     43096
           8     0.0000    0.0000    0.0000      2798
           9     0.0000    0.0000    0.0000       302
          10     0.0000    0.0000    0.0000        35

    accuracy                         0.0794    508010
   macro avg     0.1033    0.1103    0.0280    508010
weighted avg     0.8826    0.0794    0.1011    508010

Memory allocated: 0.00 GB
Memory reserved: 0.00 GB
epoch:  10
Training


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 10 with val_auc = 0.9935
Epoch: 10, Loss: 0.1332
Val AUC: 0.9935 | Val F1: 0.3481
Test AUC: 0.9935 | Test F1: 0.3419
Sample-Weighted Accuracy: 0.9726
Category-Weighted Accuracy (Macro): 0.3717
Category-Weighted Accuracy (Weighted by Class Size): 0.9726
Per-Class Test Metrics:


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.6471    0.0206    0.0399       535
           1     0.0000    0.0000    0.0000       359
           2     0.1283    0.4953    0.2038       107
           3     0.9900    0.9967    0.9933    443753
           4     0.3420    0.0361    0.0653      3271
           5     0.5469    0.8314    0.6598      8905
           6     0.4649    0.1969    0.2767      4849
           7     0.9907    0.9750    0.9828     43096
           8     0.5276    0.4514    0.4865      2798
           9     0.0000    0.0000    0.0000       302
          10     0.0385    0.0857    0.0531        35

    accuracy                         0.9726    508010
   macro avg     0.4251    0.3717    0.3419    508010
weighted avg     0.9686    0.9726    0.9684    508010

Memory allocated: 0.00 GB
Memory reserved: 0.00 GB
epoch:  11
Training


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 11 with val_auc = 0.9948
Epoch: 11, Loss: 0.0712
Val AUC: 0.9948 | Val F1: 0.3675
Test AUC: 0.9951 | Test F1: 0.3627
Sample-Weighted Accuracy: 0.9745
Category-Weighted Accuracy (Macro): 0.4129
Category-Weighted Accuracy (Weighted by Class Size): 0.9745
Per-Class Test Metrics:


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.4848    0.0299    0.0563       535
           1     0.0000    0.0000    0.0000       359
           2     0.1344    0.7196    0.2265       107
           3     0.9908    0.9972    0.9940    443753
           4     0.4217    0.0107    0.0209      3271
           5     0.5793    0.8310    0.6827      8905
           6     0.4710    0.2483    0.3252      4849
           7     0.9972    0.9745    0.9857     43096
           8     0.5572    0.6351    0.5936      2798
           9     0.7500    0.0099    0.0196       302
          10     0.0857    0.0857    0.0857        35

    accuracy                         0.9745    508010
   macro avg     0.4975    0.4129    0.3627    508010
weighted avg     0.9715    0.9745    0.9705    508010

Memory allocated: 0.00 GB
Memory reserved: 0.00 GB
epoch:  12
Training


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 12 with val_auc = 0.9951
Epoch: 12, Loss: 0.0653
Val AUC: 0.9951 | Val F1: 0.4028
Test AUC: 0.9955 | Test F1: 0.3992
Sample-Weighted Accuracy: 0.9754
Category-Weighted Accuracy (Macro): 0.4505
Category-Weighted Accuracy (Weighted by Class Size): 0.9754
Per-Class Test Metrics:


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.4091    0.0168    0.0323       535
           1     0.0000    0.0000    0.0000       359
           2     0.1448    0.6916    0.2395       107
           3     0.9934    0.9957    0.9946    443753
           4     0.2206    0.0046    0.0090      3271
           5     0.5642    0.8750    0.6861      8905
           6     0.5186    0.3654    0.4287      4849
           7     0.9958    0.9770    0.9863     43096
           8     0.6261    0.6397    0.6328      2798
           9     0.5410    0.2185    0.3113       302
          10     0.0448    0.1714    0.0710        35

    accuracy                         0.9754    508010
   macro avg     0.4599    0.4505    0.3992    508010
weighted avg     0.9728    0.9754    0.9724    508010

Memory allocated: 0.00 GB
Memory reserved: 0.00 GB
epoch:  13
Training


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 13 with val_auc = 0.9953
Epoch: 13, Loss: 0.0625
Val AUC: 0.9953 | Val F1: 0.4142
Test AUC: 0.9957 | Test F1: 0.4169
Sample-Weighted Accuracy: 0.9761
Category-Weighted Accuracy (Macro): 0.4957
Category-Weighted Accuracy (Weighted by Class Size): 0.9761
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.4500    0.0168    0.0324       535
           1     0.0000    0.0000    0.0000       359
           2     0.1497    0.8224    0.2532       107
           3     0.9924    0.9968    0.9946    443753
           4     0.3875    0.0095    0.0185      3271
           5     0.5745    0.8714    0.6925      8905
           6     0.5807    0.2999    0.3955      4849
           7     0.9914    0.9793    0.9853     43096
           8     0.7224    0.6687    0.6945      2798
           9     0.4348    0.3311    0.3759       302
          10     0.0851    0.4571    0.1435        35

    accuracy                         0.97

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 14 with val_auc = 0.9955
Epoch: 14, Loss: 0.0606
Val AUC: 0.9955 | Val F1: 0.4267
Test AUC: 0.9958 | Test F1: 0.4274
Sample-Weighted Accuracy: 0.9765
Category-Weighted Accuracy (Macro): 0.5040
Category-Weighted Accuracy (Weighted by Class Size): 0.9765
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.6667    0.0224    0.0434       535
           1     0.1667    0.0056    0.0108       359
           2     0.1435    0.8785    0.2467       107
           3     0.9927    0.9967    0.9947    443753
           4     0.3113    0.0144    0.0275      3271
           5     0.5749    0.8714    0.6928      8905
           6     0.5552    0.3700    0.4441      4849
           7     0.9960    0.9783    0.9871     43096
           8     0.8130    0.6415    0.7171      2798
           9     0.4895    0.3079    0.3780       302
          10     0.0964    0.4571    0.1592        35

    accuracy                         0.97

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 15 with val_auc = 0.9956
Epoch: 15, Loss: 0.0588
Val AUC: 0.9956 | Val F1: 0.4301
Test AUC: 0.9959 | Test F1: 0.4386
Sample-Weighted Accuracy: 0.9764
Category-Weighted Accuracy (Macro): 0.5124
Category-Weighted Accuracy (Weighted by Class Size): 0.9764
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.5455    0.0224    0.0431       535
           1     0.1429    0.0084    0.0158       359
           2     0.1527    0.8879    0.2606       107
           3     0.9915    0.9976    0.9946    443753
           4     0.3497    0.0804    0.1307      3271
           5     0.5838    0.8465    0.6911      8905
           6     0.5857    0.3423    0.4321      4849
           7     0.9962    0.9780    0.9870     43096
           8     0.8868    0.5350    0.6674      2798
           9     0.5152    0.3377    0.4080       302
          10     0.1160    0.6000    0.1944        35

    accuracy                         0.97

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 16 with val_auc = 0.9959
Epoch: 16, Loss: 0.0580
Val AUC: 0.9959 | Val F1: 0.4419
Test AUC: 0.9961 | Test F1: 0.4491
Sample-Weighted Accuracy: 0.9771
Category-Weighted Accuracy (Macro): 0.5439
Category-Weighted Accuracy (Weighted by Class Size): 0.9771
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.5938    0.0355    0.0670       535
           1     0.4000    0.0056    0.0110       359
           2     0.1498    0.8318    0.2539       107
           3     0.9931    0.9968    0.9949    443753
           4     0.3512    0.0260    0.0484      3271
           5     0.5923    0.8356    0.6932      8905
           6     0.5693    0.3879    0.4614      4849
           7     0.9963    0.9797    0.9879     43096
           8     0.7315    0.7713    0.7509      2798
           9     0.3762    0.5132    0.4342       302
          10     0.1479    0.6000    0.2373        35

    accuracy                         0.97

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 17 with val_auc = 0.9961
Epoch: 17, Loss: 0.0568
Val AUC: 0.9961 | Val F1: 0.4345
Test AUC: 0.9963 | Test F1: 0.4434
Sample-Weighted Accuracy: 0.9773
Category-Weighted Accuracy (Macro): 0.4937
Category-Weighted Accuracy (Weighted by Class Size): 0.9773
Per-Class Test Metrics:


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.4706    0.0150    0.0290       535
           1     0.0000    0.0000    0.0000       359
           2     0.1399    0.6916    0.2327       107
           3     0.9921    0.9976    0.9948    443753
           4     0.3953    0.0104    0.0203      3271
           5     0.5792    0.8901    0.7017      8905
           6     0.6097    0.3341    0.4317      4849
           7     0.9963    0.9791    0.9876     43096
           8     0.8423    0.6512    0.7345      2798
           9     0.5137    0.4338    0.4704       302
          10     0.2027    0.4286    0.2752        35

    accuracy                         0.9773    508010
   macro avg     0.5220    0.4937    0.4434    508010
weighted avg     0.9751    0.9773    0.9737    508010

Memory allocated: 0.00 GB
Memory reserved: 0.00 GB
epoch:  18
Training


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 18 with val_auc = 0.9962
Epoch: 18, Loss: 0.0567
Val AUC: 0.9962 | Val F1: 0.4434
Test AUC: 0.9964 | Test F1: 0.4558
Sample-Weighted Accuracy: 0.9777
Category-Weighted Accuracy (Macro): 0.5464
Category-Weighted Accuracy (Weighted by Class Size): 0.9777
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.6129    0.0355    0.0671       535
           1     0.2857    0.0056    0.0109       359
           2     0.1483    0.8785    0.2537       107
           3     0.9935    0.9969    0.9952    443753
           4     0.4333    0.0079    0.0156      3271
           5     0.5843    0.8720    0.6997      8905
           6     0.6009    0.4030    0.4824      4849
           7     0.9946    0.9810    0.9878     43096
           8     0.7975    0.7166    0.7549      2798
           9     0.4936    0.5132    0.5032       302
          10     0.1522    0.6000    0.2428        35

    accuracy                         0.97

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 19 with val_auc = 0.9962
Epoch: 19, Loss: 0.0561
Val AUC: 0.9962 | Val F1: 0.4483
Test AUC: 0.9964 | Test F1: 0.4606
Sample-Weighted Accuracy: 0.9772
Category-Weighted Accuracy (Macro): 0.5503
Category-Weighted Accuracy (Weighted by Class Size): 0.9772
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.5556    0.0280    0.0534       535
           1     0.3000    0.0084    0.0163       359
           2     0.1461    0.8972    0.2513       107
           3     0.9918    0.9978    0.9948    443753
           4     0.3718    0.1122    0.1724      3271
           5     0.6098    0.7925    0.6892      8905
           6     0.5984    0.3448    0.4375      4849
           7     0.9924    0.9826    0.9874     43096
           8     0.8275    0.6876    0.7511      2798
           9     0.4727    0.5166    0.4937       302
          10     0.1304    0.6857    0.2192        35

    accuracy                         0.97

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 20 with val_auc = 0.9963
Epoch: 20, Loss: 0.0556
Val AUC: 0.9963 | Val F1: 0.4700
Test AUC: 0.9964 | Test F1: 0.4869
Sample-Weighted Accuracy: 0.9775
Category-Weighted Accuracy (Macro): 0.5733
Category-Weighted Accuracy (Weighted by Class Size): 0.9775
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.6111    0.0206    0.0398       535
           1     0.2857    0.0056    0.0109       359
           2     0.1491    0.8972    0.2557       107
           3     0.9950    0.9953    0.9952    443753
           4     0.3569    0.2131    0.2668      3271
           5     0.6207    0.7823    0.6922      8905
           6     0.5602    0.5189    0.5388      4849
           7     0.9936    0.9816    0.9876     43096
           8     0.7554    0.7680    0.7617      2798
           9     0.6052    0.4669    0.5271       302
          10     0.1783    0.6571    0.2805        35

    accuracy                         0.97

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 21 with val_auc = 0.9963
Epoch: 21, Loss: 0.0552
Val AUC: 0.9963 | Val F1: 0.4373
Test AUC: 0.9964 | Test F1: 0.4485
Sample-Weighted Accuracy: 0.9775
Category-Weighted Accuracy (Macro): 0.5159
Category-Weighted Accuracy (Weighted by Class Size): 0.9775
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.5238    0.0206    0.0396       535
           1     0.0000    0.0000    0.0000       359
           2     0.1488    0.8411    0.2528       107
           3     0.9910    0.9982    0.9946    443753
           4     0.3388    0.0125    0.0242      3271
           5     0.5942    0.8688    0.7058      8905
           6     0.6497    0.3232    0.4316      4849
           7     0.9937    0.9820    0.9878     43096
           8     0.8736    0.5979    0.7100      2798
           9     0.5328    0.4305    0.4762       302
          10     0.2100    0.6000    0.3111        35

    accuracy                         0.97

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 22 with val_auc = 0.9964
Epoch: 22, Loss: 0.0551
Val AUC: 0.9964 | Val F1: 0.4799
Test AUC: 0.9965 | Test F1: 0.4894
Sample-Weighted Accuracy: 0.9775
Category-Weighted Accuracy (Macro): 0.5808
Category-Weighted Accuracy (Weighted by Class Size): 0.9775
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.7647    0.0243    0.0471       535
           1     0.2800    0.0195    0.0365       359
           2     0.1469    0.8785    0.2517       107
           3     0.9941    0.9967    0.9954    443753
           4     0.3559    0.3360    0.3457      3271
           5     0.6407    0.7302    0.6825      8905
           6     0.6269    0.3811    0.4740      4849
           7     0.9935    0.9819    0.9877     43096
           8     0.7247    0.7931    0.7573      2798
           9     0.4231    0.6192    0.5027       302
          10     0.2000    0.6286    0.3034        35

    accuracy                         0.97

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

⚠️  No improvement for 1 epoch(s)
Epoch: 23, Loss: 0.0553
Val AUC: 0.9962 | Val F1: 0.4411
Test AUC: 0.9964 | Test F1: 0.4500
Sample-Weighted Accuracy: 0.9769
Category-Weighted Accuracy (Macro): 0.5194
Category-Weighted Accuracy (Weighted by Class Size): 0.9769
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.5312    0.0318    0.0600       535
           1     0.2857    0.0056    0.0109       359
           2     0.1474    0.8318    0.2504       107
           3     0.9914    0.9975    0.9945    443753
           4     0.3435    0.0483    0.0847      3271
           5     0.5955    0.8338    0.6948      8905
           6     0.5895    0.4205    0.4909      4849
           7     0.9941    0.9818    0.9879     43096
           8     0.9137    0.5071    0.6523      2798
           9     0.5864    0.4272    0.4943       302
          10     0.1401    0.6286    0.2292        35

    accuracy                         0.9769    508010
   macro a

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 24 with val_auc = 0.9964
Epoch: 24, Loss: 0.0545
Val AUC: 0.9964 | Val F1: 0.4749
Test AUC: 0.9966 | Test F1: 0.4907
Sample-Weighted Accuracy: 0.9775
Category-Weighted Accuracy (Macro): 0.5842
Category-Weighted Accuracy (Weighted by Class Size): 0.9775
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.6842    0.0243    0.0469       535
           1     0.5000    0.0084    0.0164       359
           2     0.1471    0.8785    0.2520       107
           3     0.9954    0.9952    0.9953    443753
           4     0.3413    0.2430    0.2839      3271
           5     0.6274    0.7646    0.6892      8905
           6     0.5427    0.5618    0.5521      4849
           7     0.9939    0.9814    0.9876     43096
           8     0.8348    0.7298    0.7788      2798
           9     0.5015    0.5530    0.5260       302
          10     0.1678    0.6857    0.2697        35

    accuracy                         0.97

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 25 with val_auc = 0.9964
Epoch: 25, Loss: 0.0546
Val AUC: 0.9964 | Val F1: 0.4585
Test AUC: 0.9966 | Test F1: 0.4651
Sample-Weighted Accuracy: 0.9778
Category-Weighted Accuracy (Macro): 0.5660
Category-Weighted Accuracy (Weighted by Class Size): 0.9778
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.5833    0.0262    0.0501       535
           1     0.2667    0.0111    0.0214       359
           2     0.1433    0.8411    0.2449       107
           3     0.9943    0.9963    0.9953    443753
           4     0.3846    0.0199    0.0378      3271
           5     0.5923    0.8568    0.7004      8905
           6     0.5845    0.4337    0.4979      4849
           7     0.9924    0.9822    0.9873     43096
           8     0.7519    0.7873    0.7692      2798
           9     0.4707    0.5861    0.5221       302
          10     0.1832    0.6857    0.2892        35

    accuracy                         0.97

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

⚠️  No improvement for 1 epoch(s)
Epoch: 26, Loss: 0.0540
Val AUC: 0.9964 | Val F1: 0.4583
Test AUC: 0.9966 | Test F1: 0.4657
Sample-Weighted Accuracy: 0.9780
Category-Weighted Accuracy (Macro): 0.5656
Category-Weighted Accuracy (Weighted by Class Size): 0.9780
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.6333    0.0355    0.0673       535
           1     0.3500    0.0195    0.0369       359
           2     0.1424    0.8598    0.2444       107
           3     0.9937    0.9969    0.9953    443753
           4     0.3484    0.0400    0.0718      3271
           5     0.5991    0.8298    0.6958      8905
           6     0.6160    0.4232    0.5017      4849
           7     0.9945    0.9823    0.9884     43096
           8     0.7450    0.7927    0.7681      2798
           9     0.4912    0.5563    0.5217       302
          10     0.1387    0.6857    0.2308        35

    accuracy                         0.9780    508010
   macro a

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 27 with val_auc = 0.9965
Epoch: 27, Loss: 0.0539
Val AUC: 0.9965 | Val F1: 0.4528
Test AUC: 0.9967 | Test F1: 0.4647
Sample-Weighted Accuracy: 0.9779
Category-Weighted Accuracy (Macro): 0.5648
Category-Weighted Accuracy (Weighted by Class Size): 0.9779
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.6154    0.0299    0.0570       535
           1     0.3333    0.0195    0.0368       359
           2     0.1487    0.8131    0.2514       107
           3     0.9943    0.9965    0.9954    443753
           4     0.3663    0.0226    0.0426      3271
           5     0.5873    0.8551    0.6964      8905
           6     0.6017    0.4215    0.4958      4849
           7     0.9946    0.9814    0.9879     43096
           8     0.7444    0.7931    0.7680      2798
           9     0.4253    0.6225    0.5054       302
          10     0.1742    0.6571    0.2754        35

    accuracy                         0.97

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

⚠️  No improvement for 1 epoch(s)
Epoch: 28, Loss: 0.0539
Val AUC: 0.9964 | Val F1: 0.4673
Test AUC: 0.9966 | Test F1: 0.4736
Sample-Weighted Accuracy: 0.9776
Category-Weighted Accuracy (Macro): 0.5652
Category-Weighted Accuracy (Weighted by Class Size): 0.9776
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.6000    0.0280    0.0536       535
           1     0.2381    0.0139    0.0263       359
           2     0.1492    0.8785    0.2551       107
           3     0.9949    0.9955    0.9952    443753
           4     0.3309    0.0694    0.1147      3271
           5     0.5932    0.8457    0.6973      8905
           6     0.5538    0.5356    0.5446      4849
           7     0.9947    0.9813    0.9880     43096
           8     0.8497    0.7034    0.7697      2798
           9     0.5731    0.4801    0.5225       302
          10     0.1472    0.6857    0.2424        35

    accuracy                         0.9776    508010
   macro a

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 29 with val_auc = 0.9965
Epoch: 29, Loss: 0.0535
Val AUC: 0.9965 | Val F1: 0.4638
Test AUC: 0.9967 | Test F1: 0.4842
Sample-Weighted Accuracy: 0.9783
Category-Weighted Accuracy (Macro): 0.5648
Category-Weighted Accuracy (Weighted by Class Size): 0.9783
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.5625    0.0168    0.0327       535
           1     0.3913    0.0251    0.0471       359
           2     0.1445    0.8224    0.2458       107
           3     0.9933    0.9973    0.9953    443753
           4     0.3925    0.0859    0.1410      3271
           5     0.6053    0.8467    0.7059      8905
           6     0.6314    0.3805    0.4748      4849
           7     0.9940    0.9825    0.9882     43096
           8     0.8156    0.7459    0.7792      2798
           9     0.4311    0.6523    0.5191       302
          10     0.2840    0.6571    0.3966        35

    accuracy                         0.97

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

⚠️  No improvement for 1 epoch(s)
Epoch: 30, Loss: 0.0538
Val AUC: 0.9965 | Val F1: 0.4629
Test AUC: 0.9967 | Test F1: 0.4677
Sample-Weighted Accuracy: 0.9781
Category-Weighted Accuracy (Macro): 0.5677
Category-Weighted Accuracy (Weighted by Class Size): 0.9781
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.6000    0.0336    0.0637       535
           1     0.2895    0.0306    0.0554       359
           2     0.1495    0.8692    0.2551       107
           3     0.9934    0.9974    0.9954    443753
           4     0.3460    0.0666    0.1118      3271
           5     0.5989    0.8430    0.7003      8905
           6     0.6428    0.3588    0.4606      4849
           7     0.9943    0.9825    0.9884     43096
           8     0.7805    0.7648    0.7726      2798
           9     0.4648    0.6126    0.5286       302
          10     0.1257    0.6857    0.2124        35

    accuracy                         0.9781    508010
   macro a

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 31 with val_auc = 0.9966
Epoch: 31, Loss: 0.0535
Val AUC: 0.9966 | Val F1: 0.4632
Test AUC: 0.9967 | Test F1: 0.4664
Sample-Weighted Accuracy: 0.9782
Category-Weighted Accuracy (Macro): 0.5692
Category-Weighted Accuracy (Weighted by Class Size): 0.9782
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.6364    0.0262    0.0503       535
           1     0.3333    0.0223    0.0418       359
           2     0.1485    0.8785    0.2541       107
           3     0.9937    0.9968    0.9952    443753
           4     0.3743    0.0400    0.0724      3271
           5     0.5983    0.8475    0.7014      8905
           6     0.6144    0.4574    0.5244      4849
           7     0.9950    0.9821    0.9885     43096
           8     0.8111    0.7366    0.7721      2798
           9     0.5000    0.5596    0.5281       302
          10     0.1179    0.7143    0.2024        35

    accuracy                         0.97

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

⚠️  No improvement for 1 epoch(s)
Epoch: 32, Loss: 0.0531
Val AUC: 0.9965 | Val F1: 0.4725
Test AUC: 0.9966 | Test F1: 0.4806
Sample-Weighted Accuracy: 0.9778
Category-Weighted Accuracy (Macro): 0.5902
Category-Weighted Accuracy (Weighted by Class Size): 0.9778
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.7059    0.0224    0.0435       535
           1     0.2537    0.0474    0.0798       359
           2     0.1509    0.8972    0.2584       107
           3     0.9960    0.9949    0.9954    443753
           4     0.3578    0.0642    0.1089      3271
           5     0.5847    0.8766    0.7015      8905
           6     0.5784    0.5508    0.5643      4849
           7     0.9943    0.9816    0.9879     43096
           8     0.8314    0.6873    0.7525      2798
           9     0.4459    0.6556    0.5308       302
          10     0.1613    0.7143    0.2632        35

    accuracy                         0.9778    508010
   macro a

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

⚠️  No improvement for 2 epoch(s)
Epoch: 33, Loss: 0.0532
Val AUC: 0.9965 | Val F1: 0.4547
Test AUC: 0.9966 | Test F1: 0.4673
Sample-Weighted Accuracy: 0.9784
Category-Weighted Accuracy (Macro): 0.5370
Category-Weighted Accuracy (Weighted by Class Size): 0.9784
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.5556    0.0187    0.0362       535
           1     0.8333    0.0139    0.0274       359
           2     0.1424    0.7570    0.2396       107
           3     0.9942    0.9967    0.9954    443753
           4     0.3714    0.0238    0.0448      3271
           5     0.5897    0.8788    0.7058      8905
           6     0.6218    0.4415    0.5164      4849
           7     0.9930    0.9831    0.9880     43096
           8     0.8054    0.7513    0.7774      2798
           9     0.4765    0.4702    0.4733       302
          10     0.2381    0.5714    0.3361        35

    accuracy                         0.9784    508010
   macro a

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

⚠️  No improvement for 3 epoch(s)
Epoch: 34, Loss: 0.0527
Val AUC: 0.9965 | Val F1: 0.4632
Test AUC: 0.9967 | Test F1: 0.4719
Sample-Weighted Accuracy: 0.9783
Category-Weighted Accuracy (Macro): 0.5514
Category-Weighted Accuracy (Weighted by Class Size): 0.9783
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.7500    0.0224    0.0436       535
           1     0.5556    0.0139    0.0272       359
           2     0.1477    0.8224    0.2504       107
           3     0.9932    0.9974    0.9953    443753
           4     0.3855    0.0211    0.0400      3271
           5     0.5981    0.8546    0.7037      8905
           6     0.6039    0.4176    0.4938      4849
           7     0.9951    0.9820    0.9885     43096
           8     0.8239    0.7405    0.7800      2798
           9     0.4784    0.6225    0.5410       302
          10     0.2299    0.5714    0.3279        35

    accuracy                         0.9783    508010
   macro a

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

⚠️  No improvement for 4 epoch(s)
Epoch: 35, Loss: 0.0527
Val AUC: 0.9965 | Val F1: 0.4592
Test AUC: 0.9967 | Test F1: 0.4678
Sample-Weighted Accuracy: 0.9780
Category-Weighted Accuracy (Macro): 0.5732
Category-Weighted Accuracy (Weighted by Class Size): 0.9780
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.8000    0.0224    0.0436       535
           1     0.4400    0.0306    0.0573       359
           2     0.1481    0.8692    0.2531       107
           3     0.9930    0.9972    0.9951    443753
           4     0.3993    0.0703    0.1196      3271
           5     0.6010    0.8282    0.6965      8905
           6     0.6361    0.4211    0.5068      4849
           7     0.9938    0.9822    0.9880     43096
           8     0.8370    0.7230    0.7758      2798
           9     0.5071    0.5894    0.5452       302
          10     0.0922    0.7714    0.1646        35

    accuracy                         0.9780    508010
   macro a

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

⚠️  No improvement for 5 epoch(s)
🔁 Reverting to best model from epoch 31
💡 Learning rate halved to 0.000500


  model.load_state_dict(torch.load(best_model_path))


Epoch: 36, Loss: 0.0528
Val AUC: 0.9965 | Val F1: 0.4781
Test AUC: 0.9967 | Test F1: 0.4914
Sample-Weighted Accuracy: 0.9782
Category-Weighted Accuracy (Macro): 0.5571
Category-Weighted Accuracy (Weighted by Class Size): 0.9782
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.6522    0.0280    0.0538       535
           1     0.1951    0.0446    0.0726       359
           2     0.1482    0.9252    0.2555       107
           3     0.9940    0.9966    0.9953    443753
           4     0.3787    0.0673    0.1142      3271
           5     0.5843    0.8878    0.7048      8905
           6     0.6421    0.4230    0.5100      4849
           7     0.9949    0.9819    0.9883     43096
           8     0.8944    0.6601    0.7596      2798
           9     0.5324    0.5993    0.5639       302
          10     0.3103    0.5143    0.3871        35

    accuracy                         0.9782    508010
   macro avg     0.5751    0.5571    0.4914 

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

✅ New best model saved at epoch 37 with val_auc = 0.9966
Epoch: 37, Loss: 0.0527
Val AUC: 0.9966 | Val F1: 0.4733
Test AUC: 0.9968 | Test F1: 0.4768
Sample-Weighted Accuracy: 0.9784
Category-Weighted Accuracy (Macro): 0.5510
Category-Weighted Accuracy (Weighted by Class Size): 0.9784
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.4000    0.0075    0.0147       535
           1     0.3684    0.0195    0.0370       359
           2     0.1503    0.8879    0.2571       107
           3     0.9931    0.9975    0.9953    443753
           4     0.3487    0.0556    0.0960      3271
           5     0.6009    0.8569    0.7064      8905
           6     0.6425    0.3863    0.4825      4849
           7     0.9937    0.9828    0.9882     43096
           8     0.8516    0.7320    0.7872      2798
           9     0.4769    0.6490    0.5498       302
          10     0.2500    0.4857    0.3301        35

    accuracy                         0.97

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

⚠️  No improvement for 1 epoch(s)
Epoch: 38, Loss: 0.0524
Val AUC: 0.9965 | Val F1: 0.4737
Test AUC: 0.9968 | Test F1: 0.4780
Sample-Weighted Accuracy: 0.9782
Category-Weighted Accuracy (Macro): 0.5820
Category-Weighted Accuracy (Weighted by Class Size): 0.9782
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.7647    0.0243    0.0471       535
           1     0.2727    0.0167    0.0315       359
           2     0.1489    0.8879    0.2550       107
           3     0.9942    0.9965    0.9954    443753
           4     0.3599    0.0939    0.1489      3271
           5     0.6089    0.8221    0.6996      8905
           6     0.5927    0.4723    0.5257      4849
           7     0.9941    0.9828    0.9884     43096
           8     0.8117    0.7673    0.7889      2798
           9     0.4805    0.6523    0.5534       302
          10     0.1341    0.6857    0.2243        35

    accuracy                         0.9782    508010
   macro a

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/397 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/497 [00:00<?, ?it/s]

⚠️  No improvement for 2 epoch(s)
Epoch: 39, Loss: 0.0526
Val AUC: 0.9966 | Val F1: 0.4622
Test AUC: 0.9967 | Test F1: 0.4717
Sample-Weighted Accuracy: 0.9784
Category-Weighted Accuracy (Macro): 0.5663
Category-Weighted Accuracy (Weighted by Class Size): 0.9784
Per-Class Test Metrics:
              precision    recall  f1-score   support

           0     0.6250    0.0187    0.0363       535
           1     0.4000    0.0056    0.0110       359
           2     0.1499    0.9065    0.2573       107
           3     0.9936    0.9970    0.9953    443753
           4     0.4058    0.0388    0.0709      3271
           5     0.5979    0.8560    0.7041      8905
           6     0.6093    0.4341    0.5070      4849
           7     0.9949    0.9823    0.9886     43096
           8     0.8273    0.7552    0.7896      2798
           9     0.5461    0.5497    0.5479       302
          10     0.1765    0.6857    0.2807        35

    accuracy                         0.9784    508010
   macro a

Training:   0%|          | 0/1587 [00:00<?, ?it/s]



In [None]:
# %timeit train_model(train_loader)

✅ New best model saved at epoch 37 with val_auc = 0.9966
Epoch: 37, Loss: 0.0527
Val AUC: 0.9966 | Val F1: 0.4733
Test AUC: 0.9968 | Test F1: 0.4768
Sample-Weighted Accuracy: 0.9784
Category-Weighted Accuracy (Macro): 0.5510
Category-Weighted Accuracy (Weighted by Class Size): 0.9784

Per-Class Test Metrics:


| class | precision | recall | f1-score | support |  
| --- | --- | --- | --- | --- |
| 0 | 0.4000     | 0.0075   |   0.0147       |  535 |   
| 1 | 0.3684     | 0.0195    |  0.0370       |  359 |   
| 2 | 0.1503     | 0.8879     | 0.2571       |  107 |   
| 3 | 0.9931    |  0.9975     | 0.9953     | 443753 |   
| 4 | 0.3487    |  0.0556    |  0.0960      |  3271 |   
| 5 | 0.6009    |  0.8569     | 0.7064       | 8905 |   
| 6 | 0.6425    |  0.3863     | 0.4825      |  4849 |   
| 7 | 0.9937    |  0.9828     | 0.9882      | 43096 |   
| 8 | 0.8516 |     0.7320     | 0.7872     |   2798 |   
| 9 | 0.4769 |     0.6490     | 0.5498  |       302 |   
| 10 | 0.2500 | 0.4857     | 0.3301  |        35 |   
|      accuracy |           |          |   0.9784 |   508010 |
|    macro avg |    0.5524  |  0.5510  |  0.4768  |  508010 |
| weighted avg |    0.9764  |  0.9784  |  0.9757  |  508010 |

Memory allocated: 0.00 GB
Memory reserved: 0.00 GB

## Summary

The current model achieves:

- Val AUC: 0.9965 | Test AUC: 0.9967
- Macro F1: 0.4647 | Macro Recall: 0.5648
- Accuracy: 97.8% (Sample and Weighted)
- All 11 classes are preserved and evaluated

Compared to the literature, these results:
- Exceed or match state-of-the-art in test AUC and macro F1
- Are competitive while explicitly addressing class imbalance
- Introduce a novel batch sampling strategy to promote inclusion of rare classes
- Do not omit or collapse categories, even for underrepresented classes

This positions our work as a meaningful contribution to both modeling and evaluation practice in multiclass intrusion detection.


## ✅ Next Steps

- **Adaptive batch sizing**: Vary batch size across epochs to gradually transition from class-balanced learning to real-world distribution exposure.
- **Category weighting**: Use class-weighted loss functions (e.g., `weight=` in `CrossEntropyLoss`) to prioritize underrepresented classes.
- **Implement side-by-side experiments on cloud**: Evaluate the effect of different curriculum learning schedules, architectures, and loss functions on standardized metrics.
- **Streamline and modularize code**: Refactor training logic, batch samplers, dataset creation, and evaluation into reusable modules.
- **Save processed data into a Kaggle dataset**: Export the stratified, labeled, and scaled datasets for reproducibility and public benchmarks.
- **Need to update adaptive LR based on Loss**: 

---

## 🔍 Kill Chain-Based Grouping Recommendation

The current class labels can be grouped based on the **Cyber Kill Chain** to reflect real-world attacker stages:

| Kill Chain Phase       | Categories                                 |
|------------------------|--------------------------------------------|
| Reconnaissance         | `RECONNAISSANCE`, `ANALYSIS`               |
| Delivery & Probing     | `FUZZERS`, `GENERIC`                       |
| Exploitation           | `EXPLOITS`, `SHELLCODE`                    |
| Installation           | `BACKDOOR`, `BACKDOORS`, `WORMS`           |
| Impact                 | `DOS`                                      |
| Benign / No Threat     | `BENIGN`                                   |

- This grouping reflects attacker **intent and behavior** rather than statistical frequency.
- Group-based accuracy and confusion evaluation could highlight which *phases* of attacks are better understood by the model.
- These groups can also serve as a **superclass for hierarchical evaluation or modeling**.

---

## 🧠 Other Possible Improvements

### Curriculum & Sampling

- **Community of Experts system**:
  - A *Rare Attack Expert* model distinguishes rare attacks (positive) from benign + common attacks (negative).
  - Its output can act as a binary flag, an input feature, or a gating signal for a full classifier.
  - This model can also benefit from curriculum learning with enforced rare-class exposure.

- **Curriculum scheduling**:
  - Gradually relax enforced class balance.
  - Transition based on class-level confidence, entropy, or plateaued performance.

- **Category-aware sampling**:
  - Ensure each batch includes one or more examples from every class (if available).
  - Smaller batches early on promote meaningful exposure to rare classes.

- **Entropy-based routing**:
  - Use prediction uncertainty to route samples to fallback classifiers or expert branches.

- **Balanced sampling without replacement**:
  - Cycle through rare categories across epochs in a structured manner.

---

### Loss Functions

- **Cross Entropy Loss (baseline)**  
- **Class-weighted Cross Entropy**  
- **Focal Loss**: Emphasizes difficult or minority examples  
- **LDAM Loss**: Enforces larger margins for rare classes  
- **Label Smoothing**: Reduces overconfidence in dominant classes  
- **Contrastive Loss**: Useful for unsupervised or hybrid pretraining

---

### Optimizers & Scheduling

- **Adam / AdamW / RAdam**
- **SGD + Momentum**: More sensitive to batch quality, may benefit curriculum
- **Lookahead Optimizer**: Adds stability
- **Learning Rate Schedulers**:
  - `OneCycleLR`, `CosineAnnealing`, or plateau-based adjustments

---

### Modeling & Architectures

- **Mixture-of-Experts (MoE)**: Gated network architecture where subnetworks specialize in different classes or superclasses.
- **Rare vs Common Hybrid Classifier**: Binary classifier distinguishes “rare” vs “not-rare” then passes to full classifier.
- **Contrastive Pretraining + Supervised Fine-Tuning**

---

### Evaluation & Monitoring

- **Macro vs Weighted Precision/Recall/F1**
- **Per-Class AUC and Calibration**
- **GPU Utilization & Profiling** (`torch.profiler`, Kaggle’s resource monitor)
- **Memory Tracking**: `torch.cuda.memory_allocated()`, etc.

---

### Class Similarity & Semantically-Aware Evaluation

- **Problem with top-k metrics**:
  - (DOG or DUCK) ≠ (CHICKEN or TURKEY); proximity matters
- **Alternative strategies**:
  - Confusion matrix clustering
  - Manual taxonomy (e.g., propagation vs stealth vs denial)
  - Group-aware metrics (e.g., phase-level accuracy)
- **Future Work**:
  - Hierarchical prediction
  - Cost-sensitive loss tied to group similarity
  - Logit-space analysis for class neighborhoods
