<a href="https://colab.research.google.com/github/Justin-Hwang/EEG-AD-FTD-Detection/blob/main/Initial_Model_Training_Sanity_Check.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# MyDrive 루트에 어떤 폴더들이 있는지 확인
!ls /content/drive/MyDrive

'0405 Research report.gdoc'
'2022 한화 글로벌 인턴십 최종 발표 자료.pptx'
'2023 Spring Semester'
'2024 Fall Lab Research'
'2024 Fall Semester'
'2024 Hanwha Global Internship'
'2024 Resume'
'2024 Spring Semester'
'2024 생각정리.gdoc'
'2024 신년 계획.gsheet'
 2025_Lab_Research
'2025 신년 계획.gsheet'
'Act Utilitarianism.gdoc'
 A_LAB3_wireless_communication.gdoc
'A_Lab7&8_Carrer_Selep_Hwang.gdoc'
'Anheuser Busch final round interview.gdoc'
'Anheuser-Busch ML Engineer Qualification.gdoc'
 Assignment2.zip
'Assignment 4.gdoc'
'Assignment 7 [Output Screenshots].gdoc'
 Assignments
'Assignments 2.gdoc'
'Berkeley Personal History.gdoc'
'Big Data Final Project'
'Big Data.zip'
'B_Lab 2_Justin Hwang .gdoc'
'Bondit Internship'
'Bonus Point.gdoc'
'Career development'
'Coding Test'
'Colab Notebooks'
'Columbia University Office Hour.gdoc'
'Columbia University Video Interview.gdoc'
'Columbia Univ Interview Preparation Answer sheet.gdoc'
'Copy of cover-letter-guide-and-samples.gdoc'
'Cop

In [None]:
# 그중에 2025 Lab Research 폴더 안을 확인
!ls "/content/drive/MyDrive/2025_Lab_Research"

'Colab Files'			  eeg_holdout.db
'Data Preparation.gdoc'		  eeg_holdout_fixed_1.db
 eeg_dataset.py			  eeg_optuna_trial_1.db
 EEGformer_model_training.ipynb   eeg_optuna_trial_2.db
 eegformer_optuna_cv_3.db	  eeg_optuna_trial_3.db
 eegformer_optuna_cv_4.db	 'EEG Transformer Architecture.gdoc'
 eegformer_optuna_cv_5.db	 'Lab Info'
 eeg_holdout-1.db		 'Lab Research Paper Review'
 eeg_holdout-2.db		 'Meeting Note.gdoc'
 eeg_holdout-3.db		  model-data
 eeg_holdout-4.db		  model-data.zip
 eeg_holdout-5.db		  models_depracated.py
 eeg_holdout-6.db		  models.py
 eeg_holdout-7.db		  Practice_Note0.ipynb
 eeg_holdout-8.db		  __pycache__
 eeg_holdout-9.db		  Untitled


In [None]:
import sys
sys.path.append('/content/drive/MyDrive/2025_Lab_Research')

In [None]:
import torch
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Running on", DEVICE)  # → “cuda” 가 뜨면 GPU 정상

Running on cuda


In [None]:
import wandb
wandb.login()  # 첫 실행 시 API 키 입력

[34m[1mwandb[0m: Currently logged in as: [33mjh8032[0m ([33mjh8032-new-york-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
!pip install optuna
!pip install wandb
!pip install mne



### 1. Model Overfit Test

In [None]:
import os
import json

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from eeg_dataset import EEGDataset
from models import EEGformer

def main():
    # ─── 설정 ──────────────────────────────────────────
    DATA_DIR    = '/content/drive/MyDrive/2025_Lab_Research/model-data'
    LABEL_FILE  = "labels.json"
    BATCH_SIZE  = 32
    LR          = 1e-3
    NUM_ITERS   = 100
    KERNEL_SIZE = 10      # ODCM Kernel Size
    NUM_FILTERS = 120     # ODCM Filter (C)
    NUM_HEADS   = 4       # Transformer Heads
    NUM_BLOCKS  = 2       # Transformer Blocks
    NUM_SEGMENTS= 15      # TTM time segments (M)
    NUM_CLASSES = 3       # Class

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # ─── Meta Data Load─────────────────────────────────
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    train_meta = [d for d in all_meta if d["type"] == "train"]

    # ─── Dataset & DataLoader ───────────────────────────
    dataset    = EEGDataset(DATA_DIR, train_meta)
    dataloader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=4,
        pin_memory=True
    )

    # ─── Extract One Batch ─────────────────────────────────
    X_small, y_small = next(iter(dataloader))
    X_small, y_small = X_small.to(device), y_small.to(device)
    B, S, L = X_small.shape  # Batch, Channels, Time-length

    print(f"Overfit Test Batch Shape: X_small={X_small.shape}, y_small={y_small.shape} on {device}")

    # ─── Model, Loss, Optimizer ────────────────────────
    model = EEGformer(
        in_channels  = S,
        input_length = L,
        kernel_size  = KERNEL_SIZE,
        num_filters  = NUM_FILTERS,
        num_heads    = NUM_HEADS,
        num_blocks   = NUM_BLOCKS,
        num_segments = NUM_SEGMENTS,
        num_classes  = NUM_CLASSES,
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)

    # ─── Overfit Training Loop ─────────────────────────────────
    model.train()
    for i in range(1, NUM_ITERS+1):
        optimizer.zero_grad()
        logits = model(X_small)            # [BATCH_SIZE, NUM_CLASSES]
        loss   = criterion(logits, y_small)
        loss.backward()
        optimizer.step()

        if i == 1 or i % 10 == 0:
            # 배치 정확도 계산
            with torch.no_grad():
                preds = logits.argmax(dim=1)
                acc   = (preds == y_small).float().mean().item() * 100
            print(f"Iter {i:03d} | loss = {loss.item():.6f} | acc = {acc:5.2f}%")

    print("Finished overfit test.")

if __name__ == "__main__":
    main()

Attempting to create new mne-python configuration file:
/root/.mne/mne-python.json
Now using CUDA device 0
Enabling CUDA with 39.14 GiB available memory
Overfit Test Batch Shape: X_small=torch.Size([32, 19, 1425]), y_small=torch.Size([32]) on cuda
Iter 001 | loss = 1.102822 | acc = 28.12%
Iter 010 | loss = 1.007585 | acc = 53.12%
Iter 020 | loss = 0.973565 | acc = 53.12%
Iter 030 | loss = 0.077861 | acc = 100.00%
Iter 040 | loss = 0.000408 | acc = 100.00%
Iter 050 | loss = 0.000021 | acc = 100.00%
Iter 060 | loss = 0.000009 | acc = 100.00%
Iter 070 | loss = 0.000005 | acc = 100.00%
Iter 080 | loss = 0.000003 | acc = 100.00%
Iter 090 | loss = 0.000002 | acc = 100.00%
Iter 100 | loss = 0.000002 | acc = 100.00%
Finished overfit test.


### Search the best Hyperparameter using Hold-out set

### Test with dropout rate = 0.1
- Transformer Model: Revised the model to add a dropout parameter in self-attention head and transformer block
- Change the Learning Rate Scheduler from LRstep to ReduceLROnPlateau for improving the validation loss decrease
- Step Size = 5, gamma = 0.5 -> If validation loss does not decrease in 5 epochs, the LR will be decreased by half

#### Implement Grid Search to find the search space

In [None]:
import os
import json
import time
import gc
import multiprocessing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
import optuna
import wandb

from eeg_dataset import EEGDataset
from models import EEGformer

# ─── Grid Search 후보값 ────────────────────────────────────────
LR_CHOICES          = [3e-4, 4e-4, 5e-4]
WD_CHOICES          = [5e-5, 5e-4, 1e-3]
NUM_FILTERS         = 120
NUM_BLOCK_CHOICES   = [2]
NUM_HEAD_CHOICES    = [3, 4]
SEGMENT_CHOICES     = [5]

# ─── Training configuration ───────────────────────────────────
MAX_EPOCHS  = 100
PATIENCE    = 20
BATCH_SIZE  = 32
NUM_WORKERS = max(1, min(4, os.cpu_count() - 1))

# ─── Data paths & device ──────────────────────────────────────
DATA_DIR   = '/content/drive/MyDrive/2025_Lab_Research/model-data'
LABEL_FILE = "labels.json"
DEVICE     = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ─── Scheduler Hyperparameter ─────────────────────────────────
FIXED_STEP_SIZE = 5
FIXED_GAMMA     = 0.5


def objective_holdout(trial):
    # ─── 1) Grid용 sampling ───────────────────────────────────
    lr           = trial.suggest_categorical("lr", LR_CHOICES)
    weight_decay = trial.suggest_categorical("weight_decay", WD_CHOICES)
    num_blocks   = trial.suggest_categorical("num_blocks", NUM_BLOCK_CHOICES)
    num_heads    = trial.suggest_categorical("num_heads", NUM_HEAD_CHOICES)
    num_segments = trial.suggest_categorical("num_segments", SEGMENT_CHOICES)

    # ─── 2) 데이터 로드 ────────────────────────────────────────
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    train_meta = [d for d in all_meta if d["type"] == "train"]
    full_ds    = EEGDataset(DATA_DIR, train_meta)
    labels     = [d["label"] for d in train_meta]
    n_samples  = len(full_ds)
    input_length = full_ds[0][0].shape[-1]

    # ─── 3) Hold-out split ─────────────────────────────────────
    train_idx, val_idx = train_test_split(
        list(range(n_samples)),
        test_size=0.2,
        stratify=labels,
        random_state=42
    )
    train_loader = DataLoader(
        Subset(full_ds, train_idx),
        batch_size=BATCH_SIZE, shuffle=True,  num_workers=NUM_WORKERS
    )
    val_loader = DataLoader(
        Subset(full_ds, val_idx),
        batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS
    )

    # ─── 4) W&B init ───────────────────────────────────────────
    wandb.init(
        project="eeg-holdout-grid-search",
        config={
            "lr": lr,
            "weight_decay": weight_decay,
            "num_blocks": num_blocks,
            "num_heads": num_heads,
            "num_segments": num_segments
        }
    )

    print(f"\n===== Trial {trial.number} =====")
    print(
        f" lr={lr:.2e}, wd={weight_decay:.2e}, "
        f"blocks={num_blocks}, heads={num_heads}, segs={num_segments}"
    )

    # ─── 5) Model / optimizer / loss ──────────────────────────
    model = EEGformer(
        in_channels  = 19,
        input_length = input_length,
        kernel_size  = 10,
        num_filters  = NUM_FILTERS,
        num_heads    = num_heads,
        num_blocks   = num_blocks,
        num_segments = num_segments,
        num_classes  = 3
    ).to(DEVICE)

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=weight_decay
    )
    criterion = nn.CrossEntropyLoss()

    # ─── 6) Scheduler ──────────────────────────────────────────
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=FIXED_GAMMA,
        patience=FIXED_STEP_SIZE,
        min_lr=1e-6
    )

    # ─── 7) Training loop w/ Early Stopping & Pruning ─────────
    best_val_loss     = float("inf")
    epochs_no_improve = 0

    best_train_loss = best_train_acc = best_val_acc = None

    for epoch in range(1, MAX_EPOCHS + 1):
        t0 = time.time()

        # — train —
        model.train()
        tloss = tcorrect = ttotal = 0
        for X, y in train_loader:
            X, y = X.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            logits = model(X)
            loss   = criterion(logits, y)
            loss.backward()
            optimizer.step()

            tloss    += loss.item()
            tcorrect += (logits.argmax(1) == y).sum().item()
            ttotal   += y.size(0)
        train_loss = tloss / len(train_loader)
        train_acc  = tcorrect / ttotal

        # — validate —
        model.eval()
        vloss = vcorrect = vtotal = 0
        with torch.no_grad():
            for X, y in val_loader:
                X, y    = X.to(DEVICE), y.to(DEVICE)
                logits  = model(X)
                loss    = criterion(logits, y)
                vloss   += loss.item()
                vcorrect+= (logits.argmax(1) == y).sum().item()
                vtotal  += y.size(0)
        val_loss = vloss / len(val_loader)
        val_acc  = vcorrect / vtotal
        elapsed  = time.time() - t0

        # — report & pruning check —
        trial.report(val_loss, epoch)
        if trial.should_prune():
            wandb.finish()
            print(f"▸ Trial {trial.number} pruned at epoch {epoch}")
            raise optuna.TrialPruned()

        # — print & log —
        print(
            f"Epoch {epoch:03d} | "
            f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
            f"val_loss={val_loss:.4f} acc={val_acc:.4f} | "
            f"time={elapsed:.1f}s"
        )
        wandb.log({
            "epoch":               epoch,
            "train_loss":          train_loss,
            "train_accuracy":      train_acc,
            "validation_loss":     val_loss,
            "validation_accuracy": val_acc,
        }, step=epoch)

        scheduler.step(val_loss)

        # — early stopping logic & save best metrics —
        if val_loss < best_val_loss:
            best_val_loss     = val_loss
            epochs_no_improve = 0
            best_train_loss   = train_loss
            best_train_acc    = train_acc
            best_val_acc      = val_acc
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print(f"★ Early stopping at epoch {epoch}")
                break

    # store best-epoch metrics
    trial.set_user_attr("best_train_loss", best_train_loss)
    trial.set_user_attr("best_train_acc",  best_train_acc)
    trial.set_user_attr("best_val_acc",    best_val_acc)

    wandb.finish()
    gc.collect()
    return best_val_loss


if __name__ == "__main__":
    multiprocessing.freeze_support()

    # ─── GridSampler용 파라미터 그리드 ─────────────────────────
    param_grid = {
        "lr":            LR_CHOICES,
        "weight_decay":  WD_CHOICES,
        "num_blocks":    NUM_BLOCK_CHOICES,
        "num_heads":     NUM_HEAD_CHOICES,
        "num_segments":  SEGMENT_CHOICES,
    }

    sampler = optuna.samplers.GridSampler(param_grid)
    study = optuna.create_study(
        direction="minimize",
        sampler=sampler,
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
        study_name="eeg_holdout_grid_search",
        storage="sqlite:////content/drive/MyDrive/2025_Lab_Research/eeg_grid_search.db",
        load_if_exists=True
    )
    study.optimize(objective_holdout)  # grid 크기만큼 자동 실행

    # ─── 결과 출력 ─────────────────────────────────────────────
    best = study.best_trial
    print("\n===== Best Trial =====")
    print(f"best_val_loss       = {best.value:.6f}")
    print(f"best_train_loss     = {best.user_attrs['best_train_loss']:.6f}")
    print(f"best_train_accuracy = {best.user_attrs['best_train_acc']:.4f}")
    print(f"best_val_accuracy   = {best.user_attrs['best_val_acc']:.4f}")
    print("best params:")
    for k, v in best.params.items():
        print(f"  {k}: {v}")


Now using CUDA device 0
Enabling CUDA with 39.14 GiB available memory


[I 2025-05-02 00:26:22,491] A new study created in RDB with name: eeg_holdout_grid_search



===== Trial 0 =====
 lr=3.00e-04, wd=5.00e-04, blocks=2, heads=3, segs=5
Epoch 001 | train_loss=1.0704 acc=0.4237 | val_loss=1.0744 acc=0.4317 | time=30.6s
Epoch 002 | train_loss=1.0677 acc=0.4311 | val_loss=1.0756 acc=0.4317 | time=29.9s
Epoch 003 | train_loss=1.0672 acc=0.4311 | val_loss=1.0760 acc=0.4317 | time=29.9s
Epoch 004 | train_loss=1.0685 acc=0.4276 | val_loss=1.0817 acc=0.4317 | time=30.0s
Epoch 005 | train_loss=1.0665 acc=0.4260 | val_loss=1.0804 acc=0.4317 | time=29.9s
Epoch 006 | train_loss=1.0696 acc=0.4276 | val_loss=1.0754 acc=0.4317 | time=29.8s
Epoch 007 | train_loss=1.0673 acc=0.4311 | val_loss=1.0764 acc=0.4317 | time=30.0s
Epoch 008 | train_loss=1.0669 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=29.9s
Epoch 009 | train_loss=1.0666 acc=0.4311 | val_loss=1.0755 acc=0.4317 | time=30.0s
Epoch 010 | train_loss=1.0665 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=30.0s
Epoch 011 | train_loss=1.0671 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=30.0s
Epoch 012 | t

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train_accuracy,▁██▅▃▅███████████████████████████████
train_loss,█▄▃▅▃▇▄▃▃▃▃▃▃▂▁▂▃▂▂▂▂▃▁▃▂▃▂▂▃▂▂▂▂▂▁▂▂
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,▁▂▃█▇▂▃▁▂▂▁▁▂▂▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▂▂▁▂▂▂▂▁

0,1
epoch,37.0
train_accuracy,0.43107
train_loss,1.06584
validation_accuracy,0.43168
validation_loss,1.07483


[I 2025-05-02 00:44:55,381] Trial 0 finished with value: 1.0743041322344826 and parameters: {'lr': 0.0003, 'weight_decay': 0.0005, 'num_blocks': 2, 'num_heads': 3, 'num_segments': 5}. Best is trial 0 with value: 1.0743041322344826.



===== Trial 1 =====
 lr=4.00e-04, wd=5.00e-05, blocks=2, heads=3, segs=5
Epoch 001 | train_loss=1.0702 acc=0.4217 | val_loss=1.0841 acc=0.4317 | time=29.9s
Epoch 002 | train_loss=1.0691 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=29.9s
Epoch 003 | train_loss=1.0697 acc=0.4283 | val_loss=1.0795 acc=0.4317 | time=30.1s
Epoch 004 | train_loss=1.0682 acc=0.4280 | val_loss=1.0785 acc=0.4317 | time=29.8s
Epoch 005 | train_loss=1.0686 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=29.8s
Epoch 006 | train_loss=1.0686 acc=0.4311 | val_loss=1.0761 acc=0.4317 | time=29.9s
Epoch 007 | train_loss=1.0678 acc=0.4311 | val_loss=1.0756 acc=0.4317 | time=29.8s
Epoch 008 | train_loss=1.0672 acc=0.4311 | val_loss=1.0767 acc=0.4317 | time=30.1s
Epoch 009 | train_loss=1.0677 acc=0.4311 | val_loss=1.0755 acc=0.4317 | time=30.0s
Epoch 010 | train_loss=1.0662 acc=0.4311 | val_loss=1.0764 acc=0.4317 | time=29.9s
Epoch 011 | train_loss=1.0663 acc=0.4311 | val_loss=1.0721 acc=0.4317 | time=30.1s
Epoch 012 | t

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_accuracy,▁▁▁▁▁▁▁▁▁▁▁▂▃▃▃▃▄▄▄▄▄▄▅▅▅▆▆▇▇▇▇▇███████
train_loss,████████████▇▇▇▇▆▆▆▆▅▆▅▅▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▅▆▆▆▆▆▆▇▆▆▆▆▇▇▇▇█▇▇▇██▆▇▇▇██
validation_loss,▃▂▃▃▂▃▃▃▂▃▂▂▂▁▁▁▁▁▁▁▁▁▁▃▃▃▄▄▄▄▅▅▆▇▇████

0,1
epoch,39.0
train_accuracy,0.96621
train_loss,0.12327
validation_accuracy,0.63665
validation_loss,1.5999


[I 2025-05-02 01:04:25,881] Trial 1 finished with value: 0.9266147471609569 and parameters: {'lr': 0.0004, 'weight_decay': 5e-05, 'num_blocks': 2, 'num_heads': 3, 'num_segments': 5}. Best is trial 1 with value: 0.9266147471609569.



===== Trial 2 =====
 lr=4.00e-04, wd=1.00e-03, blocks=2, heads=3, segs=5
Epoch 001 | train_loss=1.0724 acc=0.4140 | val_loss=1.0747 acc=0.4317 | time=29.9s
Epoch 002 | train_loss=1.0685 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=30.2s
Epoch 003 | train_loss=1.0678 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=29.7s
Epoch 004 | train_loss=1.0686 acc=0.4198 | val_loss=1.0745 acc=0.4317 | time=29.8s
Epoch 005 | train_loss=1.0677 acc=0.4311 | val_loss=1.0755 acc=0.4317 | time=30.1s
Epoch 006 | train_loss=1.0686 acc=0.4311 | val_loss=1.0759 acc=0.4317 | time=29.8s
Epoch 007 | train_loss=1.0688 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=29.8s
Epoch 008 | train_loss=1.0671 acc=0.4311 | val_loss=1.0782 acc=0.4317 | time=30.1s
Epoch 009 | train_loss=1.0679 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=29.8s
Epoch 010 | train_loss=1.0670 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=30.0s
Epoch 011 | train_loss=1.0670 acc=0.4311 | val_loss=1.0741 acc=0.4317 | time=30.0s
Epoch 012 | t

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▃▃▃▃▃▄▄▄▄▅▅▆▆▆▇██████████
train_loss,█████████████▇▇▇▇▇▆▆▆▅▅▅▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▄▄▄▄▄▅▅▆▆▇▇█▇▇▇█▇▇▇▇█▇███▇█
validation_loss,▃▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▂▁▁▁▁▃▄▂▄▅▅▆▆▇▇▇▇█

0,1
epoch,51.0
train_accuracy,0.99184
train_loss,0.03457
validation_accuracy,0.68012
validation_loss,1.80403


[I 2025-05-02 01:29:56,139] Trial 2 finished with value: 0.7996216373784202 and parameters: {'lr': 0.0004, 'weight_decay': 0.001, 'num_blocks': 2, 'num_heads': 3, 'num_segments': 5}. Best is trial 2 with value: 0.7996216373784202.



===== Trial 3 =====
 lr=4.00e-04, wd=5.00e-04, blocks=2, heads=4, segs=5
Epoch 001 | train_loss=1.0716 acc=0.4202 | val_loss=1.0754 acc=0.4317 | time=36.8s
Epoch 002 | train_loss=1.0671 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=36.0s
Epoch 003 | train_loss=1.0690 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=36.0s
Epoch 004 | train_loss=1.0683 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=36.0s
Epoch 005 | train_loss=1.0684 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=36.1s
Epoch 006 | train_loss=1.0671 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=36.3s
Epoch 007 | train_loss=1.0675 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=36.1s
Epoch 008 | train_loss=1.0655 acc=0.4311 | val_loss=1.0681 acc=0.4317 | time=36.0s
Epoch 009 | train_loss=1.0513 acc=0.4738 | val_loss=1.0173 acc=0.5575 | time=36.2s
Epoch 010 | train_loss=0.9770 acc=0.5650 | val_loss=0.9725 acc=0.5606 | time=36.1s
Epoch 011 | train_loss=0.9497 acc=0.5821 | val_loss=0.9709 acc=0.5730 | time=36.1s
Epoch 012 | t

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██
train_accuracy,▁▁▁▁▁▁▁▁▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇█▇███████
train_loss,█████████▇▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▅▅▆▄▆▆▆▇▆▇▇█▇▇▇█▇█▇█▇██▆▇▇▇██▇▇▇
validation_loss,▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▁▂▃▄▄▅▆▅▆▇▆▇▇██

0,1
epoch,41.0
train_accuracy,0.99612
train_loss,0.02005
validation_accuracy,0.62733
validation_loss,2.44821


[I 2025-05-02 01:54:41,284] Trial 3 finished with value: 0.9026221718106952 and parameters: {'lr': 0.0004, 'weight_decay': 0.0005, 'num_blocks': 2, 'num_heads': 4, 'num_segments': 5}. Best is trial 2 with value: 0.7996216373784202.



===== Trial 4 =====
 lr=5.00e-04, wd=1.00e-03, blocks=2, heads=3, segs=5
Epoch 001 | train_loss=1.0722 acc=0.4229 | val_loss=1.0775 acc=0.4317 | time=29.9s
Epoch 002 | train_loss=1.0677 acc=0.4315 | val_loss=1.0770 acc=0.4317 | time=30.1s
Epoch 003 | train_loss=1.0693 acc=0.4202 | val_loss=1.0753 acc=0.4317 | time=29.9s
Epoch 004 | train_loss=1.0677 acc=0.4311 | val_loss=1.0756 acc=0.4317 | time=30.0s
Epoch 005 | train_loss=1.0665 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=30.4s
Epoch 006 | train_loss=1.0654 acc=0.4311 | val_loss=1.0494 acc=0.5559 | time=29.9s
Epoch 007 | train_loss=0.9910 acc=0.5398 | val_loss=1.0136 acc=0.5109 | time=30.2s
Epoch 008 | train_loss=0.9423 acc=0.5825 | val_loss=0.9692 acc=0.5854 | time=30.0s
Epoch 009 | train_loss=0.9173 acc=0.6004 | val_loss=0.9431 acc=0.5854 | time=30.0s
Epoch 010 | train_loss=0.8819 acc=0.6167 | val_loss=1.0466 acc=0.5668 | time=30.0s
Epoch 011 | train_loss=0.8719 acc=0.6245 | val_loss=0.9881 acc=0.5932 | time=30.0s
Epoch 012 | t

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train_accuracy,▁▁▁▁▁▁▂▃▃▃▃▄▄▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇████████
train_loss,██████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁
validation_accuracy,▁▁▁▁▁▄▃▅▅▅▆▃▆▆▆▆▆▅▆▇▇▆▇▇▇▇▇▇▇███▇▇████
validation_loss,▂▂▂▂▂▂▂▁▁▂▂▂▁▁▁▁▁▁▁▃▁▂▁▁▂▃▃▄▄▄▅▆▇▇▇▇▇█

0,1
epoch,38.0
train_accuracy,0.99379
train_loss,0.02996
validation_accuracy,0.66925
validation_loss,2.11681


[I 2025-05-02 02:13:48,905] Trial 4 finished with value: 0.8873245375497001 and parameters: {'lr': 0.0005, 'weight_decay': 0.001, 'num_blocks': 2, 'num_heads': 3, 'num_segments': 5}. Best is trial 2 with value: 0.7996216373784202.



===== Trial 5 =====
 lr=3.00e-04, wd=5.00e-04, blocks=2, heads=4, segs=5
Epoch 001 | train_loss=1.0696 acc=0.4225 | val_loss=1.0820 acc=0.4317 | time=36.6s
Epoch 002 | train_loss=1.0675 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=36.3s
Epoch 003 | train_loss=1.0674 acc=0.4311 | val_loss=1.0766 acc=0.4317 | time=36.6s
Epoch 004 | train_loss=1.0678 acc=0.4311 | val_loss=1.0741 acc=0.4317 | time=36.7s
Epoch 005 | train_loss=1.0662 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=36.4s
Epoch 006 | train_loss=1.0671 acc=0.4311 | val_loss=1.0759 acc=0.4317 | time=36.5s
Epoch 007 | train_loss=1.0170 acc=0.5181 | val_loss=0.9864 acc=0.5683 | time=36.5s
Epoch 008 | train_loss=0.9484 acc=0.5794 | val_loss=0.9693 acc=0.5714 | time=36.3s
Epoch 009 | train_loss=0.9235 acc=0.5996 | val_loss=0.9490 acc=0.5745 | time=36.3s
Epoch 010 | train_loss=0.9049 acc=0.6097 | val_loss=0.9441 acc=0.5497 | time=36.5s
Epoch 011 | train_loss=0.8867 acc=0.6148 | val_loss=0.9505 acc=0.5963 | time=36.6s
Epoch 012 | t

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train_accuracy,▁▁▁▁▁▁▂▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇█▇▇████
train_loss,███████▇▇▇▆▆▆▆▆▅▅▅▅▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▇▇▇▆██▇█▇▇▇▇▇▇█▇▇▆█▇█▇███▇▇█▇
validation_loss,▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▂▁▂▂▂▃▃▂▄▅▅▅▆▆▇▇██

0,1
epoch,35.0
train_accuracy,0.90058
train_loss,0.27484
validation_accuracy,0.58385
validation_loss,1.71205


[I 2025-05-02 02:35:08,760] Trial 5 finished with value: 0.9205371141433716 and parameters: {'lr': 0.0003, 'weight_decay': 0.0005, 'num_blocks': 2, 'num_heads': 4, 'num_segments': 5}. Best is trial 2 with value: 0.7996216373784202.



===== Trial 6 =====
 lr=3.00e-04, wd=1.00e-03, blocks=2, heads=3, segs=5
Epoch 001 | train_loss=1.0712 acc=0.4128 | val_loss=1.0749 acc=0.4317 | time=30.2s
Epoch 002 | train_loss=1.0695 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=30.3s
Epoch 003 | train_loss=1.0688 acc=0.4311 | val_loss=1.0760 acc=0.4317 | time=30.3s
Epoch 004 | train_loss=1.0677 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=30.2s
Epoch 005 | train_loss=1.0688 acc=0.4311 | val_loss=1.0754 acc=0.4317 | time=30.4s
Epoch 006 | train_loss=1.0671 acc=0.4311 | val_loss=1.0759 acc=0.4317 | time=30.3s
Epoch 007 | train_loss=1.0674 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=30.3s


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁██████
train_loss,█▅▄▂▄▁▂
validation_accuracy,▁▁▁▁▁▁▁
validation_loss,▃▁█▁▅▇▁

0,1
epoch,7.0
train_accuracy,0.43107
train_loss,1.06738
validation_accuracy,0.43168
validation_loss,1.07445


[I 2025-05-02 02:39:13,111] Trial 6 pruned. 


▸ Trial 6 pruned at epoch 8



===== Trial 7 =====
 lr=5.00e-04, wd=5.00e-04, blocks=2, heads=4, segs=5
Epoch 001 | train_loss=1.0752 acc=0.4171 | val_loss=1.0795 acc=0.4317 | time=36.7s
Epoch 002 | train_loss=1.0691 acc=0.4311 | val_loss=1.0824 acc=0.4317 | time=36.6s
Epoch 003 | train_loss=1.0690 acc=0.4311 | val_loss=1.0770 acc=0.4317 | time=36.4s
Epoch 004 | train_loss=1.0668 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=36.4s
Epoch 005 | train_loss=1.0676 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=36.6s
Epoch 006 | train_loss=1.0675 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=36.4s


0,1
epoch,▁▂▄▅▇█
train_accuracy,▁█████
train_loss,█▃▃▁▂▂
validation_accuracy,▁▁▁▁▁▁
validation_loss,▅█▃▁▁▁

0,1
epoch,6.0
train_accuracy,0.43107
train_loss,1.06751
validation_accuracy,0.43168
validation_loss,1.0747


[I 2025-05-02 02:43:30,529] Trial 7 pruned. 


▸ Trial 7 pruned at epoch 7



===== Trial 8 =====
 lr=5.00e-04, wd=1.00e-03, blocks=2, heads=4, segs=5
Epoch 001 | train_loss=1.0718 acc=0.4179 | val_loss=1.0790 acc=0.4317 | time=36.3s
Epoch 002 | train_loss=1.0678 acc=0.4311 | val_loss=1.0780 acc=0.4317 | time=36.6s
Epoch 003 | train_loss=1.0694 acc=0.4311 | val_loss=1.0781 acc=0.4317 | time=36.7s
Epoch 004 | train_loss=1.0675 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=36.6s
Epoch 005 | train_loss=1.0671 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=36.5s
Epoch 006 | train_loss=1.0676 acc=0.4311 | val_loss=1.0758 acc=0.4317 | time=36.5s
Epoch 007 | train_loss=1.0662 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=36.5s


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁██████
train_loss,█▃▅▃▂▃▁
validation_accuracy,▁▁▁▁▁▁▁
validation_loss,█▇▇▁▁▃▂

0,1
epoch,7.0
train_accuracy,0.43107
train_loss,1.06618
validation_accuracy,0.43168
validation_loss,1.07492


[I 2025-05-02 02:48:24,622] Trial 8 pruned. 


▸ Trial 8 pruned at epoch 8



===== Trial 9 =====
 lr=4.00e-04, wd=5.00e-05, blocks=2, heads=4, segs=5


[34m[1mwandb[0m: 503 encountered (
[34m[1mwandb[0m: <html><head>
[34m[1mwandb[0m: <meta http-equiv="content-type" content="text/html;charset=utf-8">
[34m[1mwandb[0m: <title>503 Server Error</title>
[34m[1mwandb[0m: </head>
[34m[1mwandb[0m: <body text=#000000 bgcolor=#ffffff>
[34m[1mwandb[0m: <h1>Error: Server Error</h1>
[34m[1mwandb[0m: <h2>The service you requested is not available at this time.<p>Service error -27.</h2>
[34m[1mwandb[0m: <h2></h2>
[34m[1mwandb[0m: </body></html>), retrying request


Epoch 001 | train_loss=1.0693 acc=0.4311 | val_loss=1.0758 acc=0.4317 | time=36.3s
Epoch 002 | train_loss=1.0679 acc=0.4311 | val_loss=1.0822 acc=0.4317 | time=36.2s
Epoch 003 | train_loss=1.0711 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=36.4s
Epoch 004 | train_loss=1.0682 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=36.6s
Epoch 005 | train_loss=1.0675 acc=0.4311 | val_loss=1.0759 acc=0.4317 | time=36.6s
Epoch 006 | train_loss=1.0678 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=36.3s
Epoch 007 | train_loss=1.0672 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=36.4s


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▁▁▁▁▁▁
train_loss,▅▂█▃▂▂▁
validation_accuracy,▁▁▁▁▁▁▁
validation_loss,▂█▁▁▂▁▁

0,1
epoch,7.0
train_accuracy,0.43107
train_loss,1.06717
validation_accuracy,0.43168
validation_loss,1.07449


[I 2025-05-02 02:53:20,047] Trial 9 pruned. 


▸ Trial 9 pruned at epoch 8



===== Trial 10 =====
 lr=4.00e-04, wd=5.00e-04, blocks=2, heads=3, segs=5
Epoch 001 | train_loss=1.0735 acc=0.4194 | val_loss=1.0757 acc=0.4317 | time=30.4s
Epoch 002 | train_loss=1.0677 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=30.2s
Epoch 003 | train_loss=1.0689 acc=0.4311 | val_loss=1.0757 acc=0.4317 | time=30.3s
Epoch 004 | train_loss=1.0685 acc=0.4311 | val_loss=1.0826 acc=0.4317 | time=30.3s
Epoch 005 | train_loss=1.0688 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=30.3s
Epoch 006 | train_loss=1.0725 acc=0.4109 | val_loss=1.0796 acc=0.4317 | time=30.4s
Epoch 007 | train_loss=1.0679 acc=0.4311 | val_loss=1.0773 acc=0.4317 | time=30.0s


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▄████▁█
train_loss,█▁▂▂▂▇▁
validation_accuracy,▁▁▁▁▁▁▁
validation_loss,▂▁▂█▁▅▃

0,1
epoch,7.0
train_accuracy,0.43107
train_loss,1.06794
validation_accuracy,0.43168
validation_loss,1.07732


[I 2025-05-02 02:57:24,256] Trial 10 pruned. 


▸ Trial 10 pruned at epoch 8



===== Trial 11 =====
 lr=5.00e-04, wd=5.00e-04, blocks=2, heads=3, segs=5
Epoch 001 | train_loss=1.0699 acc=0.4186 | val_loss=1.0754 acc=0.4317 | time=30.3s
Epoch 002 | train_loss=1.0688 acc=0.4311 | val_loss=1.0759 acc=0.4317 | time=30.1s
Epoch 003 | train_loss=1.0677 acc=0.4311 | val_loss=1.0769 acc=0.4317 | time=30.5s
Epoch 004 | train_loss=1.0689 acc=0.4311 | val_loss=1.0763 acc=0.4317 | time=30.3s


0,1
epoch,▁▃▆█
train_accuracy,▁███
train_loss,█▅▁▅
validation_accuracy,▁▁▁▁
validation_loss,▁▃█▅

0,1
epoch,4.0
train_accuracy,0.43107
train_loss,1.06888
validation_accuracy,0.43168
validation_loss,1.07629


[I 2025-05-02 02:59:58,069] Trial 11 pruned. 


▸ Trial 11 pruned at epoch 5



===== Trial 12 =====
 lr=4.00e-04, wd=1.00e-03, blocks=2, heads=4, segs=5
Epoch 001 | train_loss=1.0708 acc=0.4311 | val_loss=1.0768 acc=0.4317 | time=36.1s
Epoch 002 | train_loss=1.0689 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=36.4s
Epoch 003 | train_loss=1.0685 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=36.3s
Epoch 004 | train_loss=1.0682 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=36.5s
Epoch 005 | train_loss=1.0688 acc=0.4311 | val_loss=1.0791 acc=0.4317 | time=36.5s
Epoch 006 | train_loss=1.0678 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=36.5s
Epoch 007 | train_loss=1.0667 acc=0.4311 | val_loss=1.0776 acc=0.4317 | time=36.2s


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▁▁▁▁▁▁
train_loss,█▅▄▄▅▃▁
validation_accuracy,▁▁▁▁▁▁▁
validation_loss,▅▁▁▂█▁▆

0,1
epoch,7.0
train_accuracy,0.43107
train_loss,1.06665
validation_accuracy,0.43168
validation_loss,1.07759


[I 2025-05-02 03:04:51,071] Trial 12 pruned. 


▸ Trial 12 pruned at epoch 8



===== Trial 13 =====
 lr=3.00e-04, wd=5.00e-05, blocks=2, heads=4, segs=5
Epoch 001 | train_loss=1.0712 acc=0.4245 | val_loss=1.0749 acc=0.4317 | time=36.5s
Epoch 002 | train_loss=1.0698 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=36.5s
Epoch 003 | train_loss=1.0676 acc=0.4311 | val_loss=1.0759 acc=0.4317 | time=36.3s
Epoch 004 | train_loss=1.0681 acc=0.4311 | val_loss=1.0800 acc=0.4317 | time=36.4s
Epoch 005 | train_loss=1.0666 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=36.2s
Epoch 006 | train_loss=1.0677 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=36.5s
Epoch 007 | train_loss=1.0661 acc=0.4311 | val_loss=1.0770 acc=0.4317 | time=36.5s


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁██████
train_loss,█▆▃▄▂▃▁
validation_accuracy,▁▁▁▁▁▁▁
validation_loss,▂▂▃█▁▂▄

0,1
epoch,7.0
train_accuracy,0.43107
train_loss,1.06615
validation_accuracy,0.43168
validation_loss,1.07702


[I 2025-05-02 03:09:44,770] Trial 13 pruned. 


▸ Trial 13 pruned at epoch 8



===== Trial 14 =====
 lr=3.00e-04, wd=5.00e-05, blocks=2, heads=3, segs=5
Epoch 001 | train_loss=1.0696 acc=0.4241 | val_loss=1.0744 acc=0.4317 | time=30.3s
Epoch 002 | train_loss=1.0703 acc=0.4229 | val_loss=1.0748 acc=0.4317 | time=30.1s
Epoch 003 | train_loss=1.0695 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=30.6s
Epoch 004 | train_loss=1.0667 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=30.2s
Epoch 005 | train_loss=1.0673 acc=0.4311 | val_loss=1.0756 acc=0.4317 | time=30.0s
Epoch 006 | train_loss=1.0670 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=30.4s
Epoch 007 | train_loss=1.0673 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=30.2s


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▂▁█████
train_loss,▇█▆▁▂▂▂
validation_accuracy,▁▁▁▁▁▁▁
validation_loss,▂▄▁▅█▃▁

0,1
epoch,7.0
train_accuracy,0.43107
train_loss,1.06731
validation_accuracy,0.43168
validation_loss,1.07425


[I 2025-05-02 03:13:48,887] Trial 14 pruned. 


▸ Trial 14 pruned at epoch 8



===== Trial 15 =====
 lr=3.00e-04, wd=1.00e-03, blocks=2, heads=4, segs=5
Epoch 001 | train_loss=1.0707 acc=0.4311 | val_loss=1.0788 acc=0.4317 | time=36.5s
Epoch 002 | train_loss=1.0691 acc=0.4311 | val_loss=1.0760 acc=0.4317 | time=36.4s
Epoch 003 | train_loss=1.0689 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=36.3s
Epoch 004 | train_loss=1.0679 acc=0.4311 | val_loss=1.0764 acc=0.4317 | time=36.4s
Epoch 005 | train_loss=1.0677 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=36.3s
Epoch 006 | train_loss=1.0659 acc=0.4311 | val_loss=1.0780 acc=0.4317 | time=36.2s
Epoch 007 | train_loss=1.0667 acc=0.4311 | val_loss=1.0769 acc=0.4317 | time=36.6s


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▁▁▁▁▁▁
train_loss,█▆▅▄▄▁▂
validation_accuracy,▁▁▁▁▁▁▁
validation_loss,█▄▁▄▁▇▅

0,1
epoch,7.0
train_accuracy,0.43107
train_loss,1.06671
validation_accuracy,0.43168
validation_loss,1.07692


[I 2025-05-02 03:18:42,869] Trial 15 pruned. 


▸ Trial 15 pruned at epoch 8



===== Trial 16 =====
 lr=5.00e-04, wd=5.00e-05, blocks=2, heads=4, segs=5
Epoch 001 | train_loss=1.0706 acc=0.4280 | val_loss=1.0789 acc=0.4317 | time=36.6s
Epoch 002 | train_loss=1.0695 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=36.6s
Epoch 003 | train_loss=1.0683 acc=0.4311 | val_loss=1.0771 acc=0.4317 | time=36.5s
Epoch 004 | train_loss=1.0680 acc=0.4311 | val_loss=1.0781 acc=0.4317 | time=36.1s
Epoch 005 | train_loss=1.0688 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=36.4s
Epoch 006 | train_loss=1.0672 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=36.2s
Epoch 007 | train_loss=1.0682 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=36.5s


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁██████
train_loss,█▆▃▃▄▁▃
validation_accuracy,▁▁▁▁▁▁▁
validation_loss,█▂▅▇▁▁▁

0,1
epoch,7.0
train_accuracy,0.43107
train_loss,1.06821
validation_accuracy,0.43168
validation_loss,1.07442


[I 2025-05-02 03:23:36,811] Trial 16 pruned. 


▸ Trial 16 pruned at epoch 8



===== Trial 17 =====
 lr=5.00e-04, wd=5.00e-05, blocks=2, heads=3, segs=5
Epoch 001 | train_loss=1.0704 acc=0.4311 | val_loss=1.0816 acc=0.4317 | time=30.2s
Epoch 002 | train_loss=1.0702 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=30.4s
Epoch 003 | train_loss=1.0670 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=30.1s
Epoch 004 | train_loss=1.0681 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=30.1s
Epoch 005 | train_loss=1.0677 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=30.1s
Epoch 006 | train_loss=1.0678 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=30.2s
Epoch 007 | train_loss=1.0667 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=30.1s


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▁▁▁▁▁▁
train_loss,█▇▁▃▃▃▁
validation_accuracy,▁▁▁▁▁▁▁
validation_loss,█▁▁▂▁▁▂

0,1
epoch,7.0
train_accuracy,0.43107
train_loss,1.06674
validation_accuracy,0.43168
validation_loss,1.07487


[I 2025-05-02 03:27:41,000] Trial 17 pruned. 


▸ Trial 17 pruned at epoch 8

===== Best Trial =====
best_val_loss       = 0.799622
best_train_loss     = 0.557883
best_train_accuracy = 0.7751
best_val_accuracy   = 0.6801
best params:
  lr: 0.0004
  weight_decay: 0.001
  num_blocks: 2
  num_heads: 3
  num_segments: 5


In [None]:
===== Best Trial =====
best_val_loss       = 0.799622
best_train_loss     = 0.557883
best_train_accuracy = 0.7751
best_val_accuracy   = 0.6801
best params:
  lr: 0.0004
  weight_decay: 0.001
  num_blocks: 2
  num_heads: 3
  num_segments: 5

### Reduce the Model depth and width
- Decrease the model complexity due to overfitting
- Decrease the model blocks from 2 to 1
- Test it with different heads = [1,2,3]

In [None]:
import os
import json
import time
import gc
import multiprocessing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
import optuna
import wandb

from eeg_dataset import EEGDataset
from models import EEGformer

# ─── Grid Search 후보값 ────────────────────────────────────────
LR_CHOICES          = [3e-4, 4e-4, 5e-4]
WD_CHOICES          = [5e-5, 5e-4, 1e-3]
NUM_FILTERS         = 120
NUM_BLOCK_CHOICES   = [1]
NUM_HEAD_CHOICES    = [2, 3, 4]
SEGMENT_CHOICES     = [5]

# ─── Training configuration ───────────────────────────────────
MAX_EPOCHS  = 100
PATIENCE    = 20
BATCH_SIZE  = 32
NUM_WORKERS = max(1, min(4, os.cpu_count() - 1))

# ─── Data paths & device ──────────────────────────────────────
DATA_DIR   = '/content/drive/MyDrive/2025_Lab_Research/model-data'
LABEL_FILE = "labels.json"
DEVICE     = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ─── Scheduler 하이퍼파라미터 ─────────────────────────────────
FIXED_STEP_SIZE = 5
FIXED_GAMMA     = 0.5


def objective_holdout(trial):
    # ─── 1) Grid용 sampling ───────────────────────────────────
    lr           = trial.suggest_categorical("lr", LR_CHOICES)
    weight_decay = trial.suggest_categorical("weight_decay", WD_CHOICES)
    num_blocks   = trial.suggest_categorical("num_blocks", NUM_BLOCK_CHOICES)
    num_heads    = trial.suggest_categorical("num_heads", NUM_HEAD_CHOICES)
    num_segments = trial.suggest_categorical("num_segments", SEGMENT_CHOICES)

    # ─── 2) 데이터 로드 ────────────────────────────────────────
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    train_meta = [d for d in all_meta if d["type"] == "train"]
    full_ds    = EEGDataset(DATA_DIR, train_meta)
    labels     = [d["label"] for d in train_meta]
    n_samples  = len(full_ds)
    input_length = full_ds[0][0].shape[-1]

    # ─── 3) Hold-out split ─────────────────────────────────────
    train_idx, val_idx = train_test_split(
        list(range(n_samples)),
        test_size=0.2,
        stratify=labels,
        random_state=42
    )
    train_loader = DataLoader(
        Subset(full_ds, train_idx),
        batch_size=BATCH_SIZE, shuffle=True,  num_workers=NUM_WORKERS
    )
    val_loader = DataLoader(
        Subset(full_ds, val_idx),
        batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS
    )

    # ─── 4) W&B init ───────────────────────────────────────────
    wandb.init(
        project="eeg-holdout-grid-search",
        config={
            "lr": lr,
            "weight_decay": weight_decay,
            "num_blocks": num_blocks,
            "num_heads": num_heads,
            "num_segments": num_segments
        }
    )

    print(f"\n===== Trial {trial.number} =====")
    print(
        f" lr={lr:.2e}, wd={weight_decay:.2e}, "
        f"blocks={num_blocks}, heads={num_heads}, segs={num_segments}"
    )

    # ─── 5) Model / optimizer / loss ──────────────────────────
    model = EEGformer(
        in_channels  = 19,
        input_length = input_length,
        kernel_size  = 10,
        num_filters  = NUM_FILTERS,
        num_heads    = num_heads,
        num_blocks   = num_blocks,
        num_segments = num_segments,
        num_classes  = 3
    ).to(DEVICE)

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=weight_decay
    )
    criterion = nn.CrossEntropyLoss()

    # ─── 6) Scheduler ──────────────────────────────────────────
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=FIXED_GAMMA,
        patience=FIXED_STEP_SIZE,
        min_lr=1e-6
    )

    # ─── 7) Training loop w/ Early Stopping & Pruning ─────────
    best_val_loss     = float("inf")
    epochs_no_improve = 0

    best_train_loss = best_train_acc = best_val_acc = None

    for epoch in range(1, MAX_EPOCHS + 1):
        t0 = time.time()

        # — train —
        model.train()
        tloss = tcorrect = ttotal = 0
        for X, y in train_loader:
            X, y = X.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            logits = model(X)
            loss   = criterion(logits, y)
            loss.backward()
            optimizer.step()

            tloss    += loss.item()
            tcorrect += (logits.argmax(1) == y).sum().item()
            ttotal   += y.size(0)
        train_loss = tloss / len(train_loader)
        train_acc  = tcorrect / ttotal

        # — validate —
        model.eval()
        vloss = vcorrect = vtotal = 0
        with torch.no_grad():
            for X, y in val_loader:
                X, y    = X.to(DEVICE), y.to(DEVICE)
                logits  = model(X)
                loss    = criterion(logits, y)
                vloss   += loss.item()
                vcorrect+= (logits.argmax(1) == y).sum().item()
                vtotal  += y.size(0)
        val_loss = vloss / len(val_loader)
        val_acc  = vcorrect / vtotal
        elapsed  = time.time() - t0

        # — report & pruning check —
        trial.report(val_loss, epoch)
        if trial.should_prune():
            wandb.finish()
            print(f"▸ Trial {trial.number} pruned at epoch {epoch}")
            raise optuna.TrialPruned()

        # — print & log —
        print(
            f"Epoch {epoch:03d} | "
            f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
            f"val_loss={val_loss:.4f} acc={val_acc:.4f} | "
            f"time={elapsed:.1f}s"
        )
        wandb.log({
            "epoch":               epoch,
            "train_loss":          train_loss,
            "train_accuracy":      train_acc,
            "validation_loss":     val_loss,
            "validation_accuracy": val_acc,
        }, step=epoch)

        scheduler.step(val_loss)

        # — early stopping logic & save best metrics —
        if val_loss < best_val_loss:
            best_val_loss     = val_loss
            epochs_no_improve = 0
            best_train_loss   = train_loss
            best_train_acc    = train_acc
            best_val_acc      = val_acc
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print(f"★ Early stopping at epoch {epoch}")
                break

    # store best-epoch metrics
    trial.set_user_attr("best_train_loss", best_train_loss)
    trial.set_user_attr("best_train_acc",  best_train_acc)
    trial.set_user_attr("best_val_acc",    best_val_acc)

    wandb.finish()
    gc.collect()
    return best_val_loss


if __name__ == "__main__":
    multiprocessing.freeze_support()

    # ─── GridSampler용 파라미터 그리드 ─────────────────────────
    param_grid = {
        "lr":            LR_CHOICES,
        "weight_decay":  WD_CHOICES,
        "num_blocks":    NUM_BLOCK_CHOICES,
        "num_heads":     NUM_HEAD_CHOICES,
        "num_segments":  SEGMENT_CHOICES,
    }

    sampler = optuna.samplers.GridSampler(param_grid)
    study = optuna.create_study(
        direction="minimize",
        sampler=sampler,
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
        study_name="eeg_holdout_grid_search",
        storage="sqlite:////content/drive/MyDrive/2025_Lab_Research/eeg_grid_search.db",
        load_if_exists=True
    )
    study.optimize(objective_holdout)  # grid 크기만큼 자동 실행

    # ─── 결과 출력 ─────────────────────────────────────────────
    best = study.best_trial
    print("\n===== Best Trial =====")
    print(f"best_val_loss       = {best.value:.6f}")
    print(f"best_train_loss     = {best.user_attrs['best_train_loss']:.6f}")
    print(f"best_train_accuracy = {best.user_attrs['best_train_acc']:.4f}")
    print(f"best_val_accuracy   = {best.user_attrs['best_val_acc']:.4f}")
    print("best params:")
    for k, v in best.params.items():
        print(f"  {k}: {v}")


In [None]:
import os
import json
import time
import gc
import multiprocessing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
import optuna
import wandb

from eeg_dataset import EEGDataset
from models import EEGformer

# ─── Hyperparameter ranges and fixed settings ─────────────────────────
LR_MIN, LR_MAX     = 1e-4, 1e-2
WD_MIN, WD_MAX     = 1e-6, 1e-3

NUM_FILTERS        = 120
NUM_BLOCK_CHOICES  = [2, 3]
NUM_HEAD_CHOICES   = [2, 4]
SEGMENT_CHOICES    = [5]

# ─── Training configuration ─────────────────────────
MAX_EPOCHS  = 100   # 100 epochs 고정
PATIENCE    = 20    # Early stopping patience
BATCH_SIZE  = 32
NUM_WORKERS = max(1, min(4, os.cpu_count() - 1))

# ─── Data paths & device ─────────────────────────
DATA_DIR   = '/content/drive/MyDrive/2025_Lab_Research/model-data'
LABEL_FILE = "labels.json"
DEVICE     = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ─── 고정 스케줄러 하이퍼파라미터 ─────────────────────────
FIXED_STEP_SIZE = 5
FIXED_GAMMA     = 0.5


def objective_holdout(trial):
    # ─ 1) Sample hyperparameters ────────────────────────────────
    lr           = trial.suggest_float("lr", LR_MIN, LR_MAX, log=True)
    weight_decay = trial.suggest_float("weight_decay", WD_MIN, WD_MAX, log=True)
    num_blocks   = trial.suggest_categorical("num_blocks", NUM_BLOCK_CHOICES)
    num_heads    = trial.suggest_categorical("num_heads", NUM_HEAD_CHOICES)
    num_segments = trial.suggest_categorical("num_segments", SEGMENT_CHOICES)

    # ─ 2) Load data ────────────────────────────────────────────────
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    train_meta = [d for d in all_meta if d["type"] == "train"]
    full_ds    = EEGDataset(DATA_DIR, train_meta)
    labels     = [d["label"] for d in train_meta]
    n_samples  = len(full_ds)
    input_length = full_ds[0][0].shape[-1]

    # ─ 3) Hold-out split ──────────────────────────────────────────
    train_idx, val_idx = train_test_split(
        list(range(n_samples)),
        test_size=0.2,
        stratify=labels,
        random_state=42
    )
    train_loader = DataLoader(
        Subset(full_ds, train_idx),
        batch_size=BATCH_SIZE, shuffle=True,  num_workers=NUM_WORKERS
    )
    val_loader = DataLoader(
        Subset(full_ds, val_idx),
        batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS
    )

    # ─ 4) W&B init ────────────────────────────────────────────────
    wandb.init(project="eeg-holdout-tuning-9", config=trial.params)

    print(f"\n===== Trial {trial.number} =====")
    print(
        f" lr={lr:.2e}, wd={weight_decay:.2e}, "
        f"blocks={num_blocks}, heads={num_heads}, segs={num_segments}")

    # ─ 5) Model / optimizer / loss ───────────────────────────────
    model = EEGformer(
        in_channels  = 19,
        input_length = input_length,
        kernel_size  = 10,
        num_filters  = NUM_FILTERS,
        num_heads    = num_heads,
        num_blocks   = num_blocks,
        num_segments = num_segments,
        num_classes  = 3
    ).to(DEVICE)

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=weight_decay  # L2
    )
    criterion = nn.CrossEntropyLoss()

    # ─ 6) Scheduler ───────────────────────────────────────────────
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=FIXED_GAMMA,
        patience=FIXED_STEP_SIZE,
        min_lr=1e-6
    )

    # ─ 7) Training loop w/ Early Stopping & Pruning ──────────────
    best_val_loss     = float("inf")
    epochs_no_improve = 0

    best_train_loss = best_train_acc = best_val_acc = None

    for epoch in range(1, MAX_EPOCHS + 1):
        t0 = time.time()

        # — train —
        model.train()
        tloss = tcorrect = ttotal = 0
        for X, y in train_loader:
            X, y = X.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            logits = model(X)
            loss   = criterion(logits, y)
            loss.backward()
            optimizer.step()

            tloss    += loss.item()
            tcorrect += (logits.argmax(1) == y).sum().item()
            ttotal   += y.size(0)
        train_loss = tloss / len(train_loader)
        train_acc  = tcorrect / ttotal

        # — validate —
        model.eval()
        vloss = vcorrect = vtotal = 0
        with torch.no_grad():
            for X, y in val_loader:
                X, y    = X.to(DEVICE), y.to(DEVICE)
                logits  = model(X)
                loss    = criterion(logits, y)
                vloss   += loss.item()
                vcorrect+= (logits.argmax(1) == y).sum().item()
                vtotal  += y.size(0)
        val_loss = vloss / len(val_loader)
        val_acc  = vcorrect / vtotal
        elapsed  = time.time() - t0

        # — report & pruning check —
        trial.report(val_loss, epoch)
        if trial.should_prune():
            wandb.finish()
            print(f"▸ Trial {trial.number} pruned at epoch {epoch}")
            raise optuna.TrialPruned()

        # — print & log —
        print(
            f"Epoch {epoch:03d} | "
            f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
            f"val_loss={val_loss:.4f} acc={val_acc:.4f} | "
            f"time={elapsed:.1f}s | "
        )
        wandb.log({
            "epoch":               epoch,
            "train_loss":          train_loss,
            "train_accuracy":      train_acc,
            "validation_loss":     val_loss,
            "validation_accuracy": val_acc,
        }, step=epoch)

        scheduler.step(val_loss)

        # — early stopping logic & save best metrics —
        if val_loss < best_val_loss:
            best_val_loss     = val_loss
            epochs_no_improve = 0
            best_train_loss   = train_loss
            best_train_acc    = train_acc
            best_val_acc      = val_acc
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print(f"★ Early stopping at epoch {epoch}")
                break

    # store best-epoch metrics
    trial.set_user_attr("best_train_loss", best_train_loss)
    trial.set_user_attr("best_train_acc",  best_train_acc)
    trial.set_user_attr("best_val_acc",    best_val_acc)

    wandb.finish()
    gc.collect()
    return best_val_loss


if __name__ == "__main__":
    multiprocessing.freeze_support()
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
        study_name="eeg_holdout_trial-9",
        storage="sqlite:////content/drive/MyDrive/2025_Lab_Research/eeg_holdout-9.db",
        load_if_exists=True
    )
    study.optimize(objective_holdout, n_trials=20)

    # ─── 결과 출력 ────────────────────────────────────────────────
    best = study.best_trial
    print("\n===== Best Trial =====")
    print(f"best_val_loss       = {best.value:.6f}")
    print(f"best_train_loss     = {best.user_attrs['best_train_loss']:.6f}")
    print(f"best_train_accuracy = {best.user_attrs['best_train_acc']:.4f}")
    print(f"best_val_accuracy   = {best.user_attrs['best_val_acc']:.4f}")
    print("best params:")
    for k, v in best.params.items():
        print(f"  {k}: {v}")


Now using CUDA device 0
Enabling CUDA with 39.14 GiB available memory


[I 2025-05-01 22:22:27,459] Using an existing study with name 'eeg_holdout_trial-9' instead of creating a new one.



===== Trial 1 =====
 lr=2.35e-04, wd=7.48e-05, blocks=2, heads=4, segs=5
Epoch 001 | train_loss=1.0738 acc=0.4214 | val_loss=1.0746 acc=0.4317 | time=36.7s | 
Epoch 002 | train_loss=1.0684 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=36.0s | 
Epoch 003 | train_loss=1.0684 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=36.3s | 
Epoch 004 | train_loss=1.0671 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=36.3s | 
Epoch 005 | train_loss=1.0675 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=36.1s | 
Epoch 006 | train_loss=1.0661 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=36.1s | 
Epoch 007 | train_loss=1.0669 acc=0.4311 | val_loss=1.0764 acc=0.4317 | time=36.0s | 
Epoch 008 | train_loss=1.0677 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=36.1s | 
Epoch 009 | train_loss=1.0667 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=36.2s | 
Epoch 010 | train_loss=1.0671 acc=0.4311 | val_loss=1.0763 acc=0.4317 | time=36.3s | 
Epoch 011 | train_loss=1.0677 acc=0.4311 | val_loss=1.0750 acc=0.4

0,1
epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇███
train_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇███████
train_loss,█████████████▇▇▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▂▂▂▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▆▆▆▆▆▆▆▆▆▇▆█████████▇█
validation_loss,██████████████▆▅▄▄▃▃▂▂▃▂▂▂▂▁▂▂▄▁▂▁▁▂▂▃▃▃

0,1
epoch,77.0
train_accuracy,0.7965
train_loss,0.51931
validation_accuracy,0.65839
validation_loss,0.93526


[I 2025-05-01 23:08:57,626] Trial 1 finished with value: 0.8704692948432196 and parameters: {'lr': 0.00023450279490485625, 'weight_decay': 7.480682204115078e-05, 'num_blocks': 2, 'num_heads': 4, 'num_segments': 5}. Best is trial 1 with value: 0.8704692948432196.



===== Trial 2 =====
 lr=6.33e-04, wd=6.11e-06, blocks=2, heads=2, segs=5
Epoch 001 | train_loss=1.0701 acc=0.4291 | val_loss=1.0755 acc=0.4317 | time=24.5s | 
Epoch 002 | train_loss=1.0675 acc=0.4311 | val_loss=1.0763 acc=0.4317 | time=24.5s | 
Epoch 003 | train_loss=1.0682 acc=0.4311 | val_loss=1.0789 acc=0.4317 | time=24.4s | 
Epoch 004 | train_loss=1.0670 acc=0.4311 | val_loss=1.0736 acc=0.4317 | time=24.4s | 
Epoch 005 | train_loss=1.0680 acc=0.4171 | val_loss=1.0712 acc=0.4317 | time=24.4s | 
Epoch 006 | train_loss=1.0287 acc=0.4854 | val_loss=1.0088 acc=0.5652 | time=24.5s | 
Epoch 007 | train_loss=0.9581 acc=0.5821 | val_loss=0.9513 acc=0.5776 | time=24.5s | 
Epoch 008 | train_loss=0.9300 acc=0.5868 | val_loss=1.0152 acc=0.5730 | time=24.6s | 
Epoch 009 | train_loss=0.8953 acc=0.6062 | val_loss=0.9252 acc=0.5963 | time=24.5s | 
Epoch 010 | train_loss=0.8730 acc=0.6151 | val_loss=0.9047 acc=0.6025 | time=24.6s | 
Epoch 011 | train_loss=0.8523 acc=0.6299 | val_loss=0.9220 acc=0.5

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train_accuracy,▁▁▁▁▁▂▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train_loss,██████▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▂▂▂▂▁▁▁
validation_accuracy,▁▁▁▁▁▆▇▆▇█▇▇▇▇▇▆▇▅▇▇▇█▇███▇███
validation_loss,▄▄▄▄▄▃▂▃▁▁▁▁▂▂▂▂▂▃▂▃▃▁▄▃▄▅▅▅▆█

0,1
epoch,30.0
train_accuracy,0.86951
train_loss,0.35468
validation_accuracy,0.61491
validation_loss,1.35241


[I 2025-05-01 23:21:21,543] Trial 2 finished with value: 0.9047478267124721 and parameters: {'lr': 0.0006332497165117276, 'weight_decay': 6.1060665269356535e-06, 'num_blocks': 2, 'num_heads': 2, 'num_segments': 5}. Best is trial 1 with value: 0.8704692948432196.



===== Trial 3 =====
 lr=5.07e-03, wd=1.51e-05, blocks=3, heads=4, segs=5


[W 2025-05-01 23:21:28,840] Trial 3 failed with parameters: {'lr': 0.005073462485729068, 'weight_decay': 1.5078657828589909e-05, 'num_blocks': 3, 'num_heads': 4, 'num_segments': 5} because of the following error: OutOfMemoryError('CUDA out of memory. Tried to allocate 2.48 GiB. GPU 0 has a total capacity of 39.56 GiB of which 2.06 GiB is free. Process 734759 has 37.49 GiB memory in use. Of the allocated memory 35.18 GiB is allocated by PyTorch, and 1.80 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)').
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-8-173f8f6107e4>", line 126

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.48 GiB. GPU 0 has a total capacity of 39.56 GiB of which 2.06 GiB is free. Process 734759 has 37.49 GiB memory in use. Of the allocated memory 35.18 GiB is allocated by PyTorch, and 1.80 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

### Test with dropout = 0.3
- Transformer Model: Revised the model to add a dropout parameter in self-attention head and transformer block
- Change the Learning Rate Scheduler from LRstep to ReduceLROnPlateau for improving the validation loss decrease

In [None]:
import os
import json
import time
import gc
import multiprocessing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
import optuna
import wandb

from eeg_dataset import EEGDataset
from models import EEGformer

# ─── Hyperparameter ranges and fixed settings ─────────────────────────
LR_MIN, LR_MAX     = 1e-5, 5e-3
WD_MIN, WD_MAX     = 1e-6, 1e-3
L1_MIN, L1_MAX     = 1e-7, 1e-3

NUM_FILTERS        = 120
NUM_BLOCK_CHOICES  = [1, 2, 3]
NUM_HEAD_CHOICES   = [2, 3, 4]
SEGMENT_CHOICES    = [5, 10, 15]

# ─── Training configuration ─────────────────────────
MAX_EPOCHS  = 100   # 100 epochs 고정
PATIENCE    = 20    # Early stopping patience
BATCH_SIZE  = 32
NUM_WORKERS = max(1, min(4, os.cpu_count() - 1))

# ─── Data paths & device ─────────────────────────
DATA_DIR   = '/content/drive/MyDrive/2025_Lab_Research/model-data'
LABEL_FILE = "labels.json"
DEVICE     = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def objective_holdout(trial):
    # ─ 1) Sample hyperparameters ────────────────────────────────
    lr           = trial.suggest_float("lr", LR_MIN, LR_MAX, log=True)
    weight_decay = trial.suggest_float("weight_decay", WD_MIN, WD_MAX, log=True)
    num_blocks   = trial.suggest_categorical("num_blocks", NUM_BLOCK_CHOICES)
    num_heads    = trial.suggest_categorical("num_heads", NUM_HEAD_CHOICES)
    num_segments = trial.suggest_categorical("num_segments", SEGMENT_CHOICES)

    use_l1       = trial.suggest_categorical("use_l1", [False, True])
    l1_lambda    = trial.suggest_float("l1_lambda", L1_MIN, L1_MAX, log=True) if use_l1 else 0.0

    step_size    = trial.suggest_int("step_size", 10, 20, step=5)
    gamma        = trial.suggest_float("gamma", 0.1, 0.9)

    print(f"\n===== Trial {trial.number} =====")
    print(
        f" lr={lr:.2e}, wd={weight_decay:.2e}, "
        f"L1={'on' if use_l1 else 'off'}{f'({l1_lambda:.2e})' if use_l1 else ''}, "
        f"blocks={num_blocks}, heads={num_heads}, segs={num_segments}, "
        f"step_size={step_size}, gamma={gamma:.2f}"
    )

    # ─ 2) Load data ────────────────────────────────────────────────
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    train_meta = [d for d in all_meta if d["type"] == "train"]
    full_ds    = EEGDataset(DATA_DIR, train_meta)
    labels     = [d["label"] for d in train_meta]
    n_samples  = len(full_ds)
    input_length = full_ds[0][0].shape[-1]

    # ─ 3) Hold-out split ──────────────────────────────────────────
    train_idx, val_idx = train_test_split(
        list(range(n_samples)),
        test_size=0.2,
        stratify=labels,
        random_state=42
    )
    train_loader = DataLoader(
        Subset(full_ds, train_idx),
        batch_size=BATCH_SIZE, shuffle=True,  num_workers=NUM_WORKERS
    )
    val_loader = DataLoader(
        Subset(full_ds, val_idx),
        batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS
    )

    # ─ 4) W&B init ────────────────────────────────────────────────
    wandb.init(project="eeg-holdout-tuning-4", config=trial.params)

    # ─ 5) Model / optimizer / loss ───────────────────────────────
    model = EEGformer(
        in_channels  = 19,
        input_length = input_length,
        kernel_size  = 10,
        num_filters  = NUM_FILTERS,
        num_heads    = num_heads,
        num_blocks   = num_blocks,
        num_segments = num_segments,
        num_classes  = 3
    ).to(DEVICE)

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=weight_decay  # L2
    )
    criterion = nn.CrossEntropyLoss()

    # ─ 6) Scheduler ───────────────────────────────────────────────
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                           factor=gamma, patience=step_size,
                                                           min_lr=1e-6)

    # ─ 7) Training loop w/ Early Stopping & Pruning ──────────────
    best_val_loss     = float("inf")
    epochs_no_improve = 0

    # placeholders for best-epoch metrics
    best_train_loss = best_train_acc = best_val_acc = None

    for epoch in range(1, MAX_EPOCHS + 1):
        t0 = time.time()

        # — train —
        model.train()
        tloss = tcorrect = ttotal = 0
        for X, y in train_loader:
            X, y = X.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            logits = model(X)
            loss   = criterion(logits, y)
            if l1_lambda > 0:
                l1_norm = sum(p.abs().sum() for p in model.parameters())
                loss    = loss + l1_lambda * l1_norm
            loss.backward()
            optimizer.step()
            tloss    += loss.item()
            tcorrect += (logits.argmax(1) == y).sum().item()
            ttotal   += y.size(0)
        train_loss = tloss / len(train_loader)
        train_acc  = tcorrect / ttotal

        # — validate —
        model.eval()
        vloss = vcorrect = vtotal = 0
        with torch.no_grad():
            for X, y in val_loader:
                X, y = X.to(DEVICE), y.to(DEVICE)
                logits = model(X)
                loss   = criterion(logits, y)
                vloss    += loss.item()
                vcorrect += (logits.argmax(1) == y).sum().item()
                vtotal   += y.size(0)
        val_loss = vloss / len(val_loader)
        val_acc  = vcorrect / vtotal
        elapsed  = time.time() - t0

        # — report & pruning check —
        trial.report(val_loss, epoch)
        if trial.should_prune():
            wandb.finish()
            print(f"▸ Trial {trial.number} pruned at epoch {epoch}")
            raise optuna.TrialPruned()

        # — print & log —
        print(
            f"Epoch {epoch:03d} | "
            f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
            f"val_loss={val_loss:.4f} acc={val_acc:.4f} | "
            f"time={elapsed:.1f}s"
        )
        wandb.log({
            "epoch":               epoch,
            "train_loss":          train_loss,
            "train_accuracy":      train_acc,
            "validation_loss":     val_loss,
            "validation_accuracy": val_acc,
        }, step=epoch)

        scheduler.step(val_loss)

        # — early stopping logic & save best metrics —
        if val_loss < best_val_loss:
            best_val_loss     = val_loss
            epochs_no_improve = 0
            best_train_loss   = train_loss
            best_train_acc    = train_acc
            best_val_acc      = val_acc
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print(f"★ Early stopping at epoch {epoch}")
                break

    # store best-epoch metrics
    trial.set_user_attr("best_train_loss", best_train_loss)
    trial.set_user_attr("best_train_acc",  best_train_acc)
    trial.set_user_attr("best_val_acc",    best_val_acc)

    wandb.finish()
    gc.collect()
    return best_val_loss


if __name__ == "__main__":
    multiprocessing.freeze_support()
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
        study_name="eeg_holdout_trial-4",
        storage="sqlite:////content/drive/MyDrive/2025_Lab_Research/eeg_holdout-4.db",
        load_if_exists=True
    )
    study.optimize(objective_holdout, n_trials=20)

    # ─── 결과 출력 ────────────────────────────────────────────────
    best = study.best_trial
    print("\n===== Best Trial =====")
    print(f"best_val_loss       = {best.value:.6f}")
    print(f"best_train_loss     = {best.user_attrs['best_train_loss']:.6f}")
    print(f"best_train_accuracy = {best.user_attrs['best_train_acc']:.4f}")
    print(f"best_val_accuracy   = {best.user_attrs['best_val_acc']:.4f}")
    print("best params:")
    for k, v in best.params.items():
        print(f"  {k}: {v}")


[I 2025-05-01 10:20:21,793] A new study created in RDB with name: eeg_holdout_trial-4



===== Trial 0 =====
 lr=6.47e-05, wd=3.44e-05, L1=on(3.29e-05), blocks=1, heads=3, segs=5, step_size=15, gamma=0.72


Epoch 001 | train_loss=17.0697 acc=0.4206 | val_loss=1.0745 acc=0.4317 | time=223.0s
Epoch 002 | train_loss=16.8072 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=18.7s
Epoch 003 | train_loss=16.5682 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=18.7s
Epoch 004 | train_loss=16.3461 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=18.7s
Epoch 005 | train_loss=16.1411 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=18.6s
Epoch 006 | train_loss=15.9511 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=18.8s
Epoch 007 | train_loss=15.7722 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=18.6s
Epoch 008 | train_loss=15.6012 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=18.8s
Epoch 009 | train_loss=15.4385 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=18.5s
Epoch 010 | train_loss=15.2845 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=18.6s
Epoch 011 | train_loss=15.1332 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=18.5s
Epoch 012 | train_loss=14.9882 acc=0.4311 | val_loss=1.0749 acc=0.4317 | ti

0,1
epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇██
train_accuracy,▁██████████████████████
train_loss,█▇▇▆▆▆▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,▂▃▁▂▇▁▂▆▇▃▃▅▅▇▆▇▂▄▅▄▄▅█

0,1
epoch,23.0
train_accuracy,0.43107
train_loss,13.85227
validation_accuracy,0.43168
validation_loss,1.07512


[I 2025-05-01 10:30:58,770] Trial 0 finished with value: 1.074448063260033 and parameters: {'lr': 6.469169596116346e-05, 'weight_decay': 3.4401973392369906e-05, 'num_blocks': 1, 'num_heads': 3, 'num_segments': 5, 'use_l1': True, 'l1_lambda': 3.2893090509651604e-05, 'step_size': 15, 'gamma': 0.7165871970832125}. Best is trial 0 with value: 1.074448063260033.



===== Trial 1 =====
 lr=1.04e-03, wd=4.53e-05, L1=on(5.52e-07), blocks=2, heads=3, segs=5, step_size=10, gamma=0.54


Epoch 001 | train_loss=1.3416 acc=0.4217 | val_loss=1.0852 acc=0.4317 | time=30.2s
Epoch 002 | train_loss=1.3065 acc=0.4311 | val_loss=1.0753 acc=0.4317 | time=30.2s
Epoch 003 | train_loss=1.2809 acc=0.4311 | val_loss=1.0791 acc=0.4317 | time=30.2s
Epoch 004 | train_loss=1.2550 acc=0.4311 | val_loss=1.0755 acc=0.4317 | time=30.1s
Epoch 005 | train_loss=1.2329 acc=0.4311 | val_loss=1.0755 acc=0.4317 | time=30.0s
Epoch 006 | train_loss=1.2133 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=30.3s
Epoch 007 | train_loss=1.1943 acc=0.4311 | val_loss=1.0780 acc=0.4317 | time=30.1s
Epoch 008 | train_loss=1.1818 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=30.0s
Epoch 009 | train_loss=1.1666 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=30.3s
Epoch 010 | train_loss=1.1535 acc=0.4311 | val_loss=1.0756 acc=0.4317 | time=30.1s
Epoch 011 | train_loss=1.1423 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=30.1s
Epoch 012 | train_loss=1.1314 acc=0.4311 | val_loss=1.0753 acc=0.4317 | time=30.3s
Epoc

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train_accuracy,▁██████████████████████████████████
train_loss,█▇▆▆▅▅▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,█▂▄▂▂▁▃▁▂▂▁▂▁▂▁▁▁▁▁▂▁▁▂▂▁▁▁▁▂▁▁▁▁▁▁

0,1
epoch,35.0
train_accuracy,0.43107
train_loss,1.06946
validation_accuracy,0.43168
validation_loss,1.07497


[I 2025-05-01 10:48:37,548] Trial 1 finished with value: 1.0742701008206321 and parameters: {'lr': 0.0010362332645279116, 'weight_decay': 4.5293156757788826e-05, 'num_blocks': 2, 'num_heads': 3, 'num_segments': 5, 'use_l1': True, 'l1_lambda': 5.515543108510157e-07, 'step_size': 10, 'gamma': 0.5430885137659128}. Best is trial 1 with value: 1.0742701008206321.



===== Trial 2 =====
 lr=2.66e-04, wd=1.38e-06, L1=on(9.28e-06), blocks=1, heads=2, segs=5, step_size=20, gamma=0.33


Epoch 001 | train_loss=5.4962 acc=0.4283 | val_loss=1.0746 acc=0.4317 | time=17.3s
Epoch 002 | train_loss=5.2780 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=16.9s
Epoch 003 | train_loss=5.0948 acc=0.4311 | val_loss=1.0756 acc=0.4317 | time=16.9s
Epoch 004 | train_loss=4.9363 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=17.3s
Epoch 005 | train_loss=4.7950 acc=0.4311 | val_loss=1.0754 acc=0.4317 | time=17.0s
Epoch 006 | train_loss=4.6692 acc=0.4311 | val_loss=1.0753 acc=0.4317 | time=16.7s
Epoch 007 | train_loss=4.5498 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=17.0s
Epoch 008 | train_loss=4.4367 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=16.6s
Epoch 009 | train_loss=4.3264 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=16.7s
Epoch 010 | train_loss=4.2226 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=16.9s
Epoch 011 | train_loss=4.1222 acc=0.4311 | val_loss=1.0758 acc=0.4317 | time=16.9s
Epoch 012 | train_loss=4.0256 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=16.9s
Epoc

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train_accuracy,▁█████████████████████████████████████
train_loss,█▇▇▇▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,▃▂▆▁▆▅▂▂▂▂▇▄▄▃▁▇▄▁▇▆▄▃▂▄█▃▄▄▃▄▄▄▄▃▃▄▄▄

0,1
epoch,38.0
train_accuracy,0.43107
train_loss,2.71504
validation_accuracy,0.43168
validation_loss,1.07492


[I 2025-05-01 10:59:22,037] Trial 2 finished with value: 1.0742079076312838 and parameters: {'lr': 0.0002662198643972252, 'weight_decay': 1.375317596527714e-06, 'num_blocks': 1, 'num_heads': 2, 'num_segments': 5, 'use_l1': True, 'l1_lambda': 9.280814582588644e-06, 'step_size': 20, 'gamma': 0.3311066554634084}. Best is trial 2 with value: 1.0742079076312838.



===== Trial 3 =====
 lr=1.45e-03, wd=3.51e-04, L1=on(2.15e-04), blocks=2, heads=3, segs=5, step_size=10, gamma=0.12


Epoch 001 | train_loss=94.0929 acc=0.4237 | val_loss=1.0755 acc=0.4317 | time=30.2s
Epoch 002 | train_loss=76.8012 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=30.1s
Epoch 003 | train_loss=64.9220 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=30.0s
Epoch 004 | train_loss=54.7576 acc=0.4311 | val_loss=1.0753 acc=0.4317 | time=30.3s
Epoch 005 | train_loss=45.8319 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=30.2s
Epoch 006 | train_loss=38.0489 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=30.1s
Epoch 007 | train_loss=31.3293 acc=0.4311 | val_loss=1.0758 acc=0.4317 | time=30.2s
Epoch 008 | train_loss=25.5848 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=30.1s
Epoch 009 | train_loss=20.7351 acc=0.4311 | val_loss=1.0754 acc=0.4317 | time=30.1s
Epoch 010 | train_loss=16.7056 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=30.2s
Epoch 011 | train_loss=13.3832 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=30.2s
Epoch 012 | train_loss=10.6744 acc=0.4311 | val_loss=1.0763 acc=0.4317 | tim

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇███
train_accuracy,▁██████████████████████████████
train_loss,█▇▆▅▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,▅▂▂▅▂▂▆▂▅▁▁█▂▃▄▁▇▃▂▆▂▂▃▃▂▃▂▃▃▃▃

0,1
epoch,31.0
train_accuracy,0.43107
train_loss,1.30755
validation_accuracy,0.43168
validation_loss,1.0747


[I 2025-05-01 11:15:00,197] Trial 3 finished with value: 1.0742230727559043 and parameters: {'lr': 0.0014549118742377173, 'weight_decay': 0.0003509188413769776, 'num_blocks': 2, 'num_heads': 3, 'num_segments': 5, 'use_l1': True, 'l1_lambda': 0.0002150115435194463, 'step_size': 10, 'gamma': 0.11897525372056572}. Best is trial 2 with value: 1.0742079076312838.



===== Trial 4 =====
 lr=4.22e-03, wd=1.19e-04, L1=off, blocks=2, heads=3, segs=10, step_size=20, gamma=0.27


Epoch 001 | train_loss=1.0722 acc=0.4206 | val_loss=1.0742 acc=0.4317 | time=30.1s
Epoch 002 | train_loss=1.0689 acc=0.4311 | val_loss=1.0837 acc=0.4317 | time=30.1s
Epoch 003 | train_loss=1.0686 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=30.2s
Epoch 004 | train_loss=1.0686 acc=0.4280 | val_loss=1.0819 acc=0.4317 | time=30.2s
Epoch 005 | train_loss=1.0694 acc=0.4311 | val_loss=1.0755 acc=0.4317 | time=30.3s
Epoch 006 | train_loss=1.0671 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=30.1s
Epoch 007 | train_loss=1.0697 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=30.1s
Epoch 008 | train_loss=1.0669 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=30.3s
Epoch 009 | train_loss=1.0653 acc=0.4311 | val_loss=1.0800 acc=0.4317 | time=30.2s
Epoch 010 | train_loss=1.0682 acc=0.4311 | val_loss=1.0765 acc=0.4317 | time=30.3s
Epoch 011 | train_loss=1.0680 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=30.4s
Epoch 012 | train_loss=1.0668 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=30.2s
Epoc

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_accuracy,▁██▆███████████████████████████████████
train_loss,█▅▄▄▅▃▅▃▁▄▄▂▃▃▄▂▃▃▂▂▃▂▂▂▂▂▁▂▂▂▃▂▂▂▃▁▂▁▂
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,▁█▁▇▂▁▁▁▅▃▁▂▂▂▁▂▁▁▁▃▂▂▂▁▁▂▁▂▂▂▂▁▂▁▁▁▂▁▂

0,1
epoch,39.0
train_accuracy,0.43107
train_loss,1.06625
validation_accuracy,0.43168
validation_loss,1.07492


[I 2025-05-01 11:34:46,786] Trial 4 finished with value: 1.0742088499523343 and parameters: {'lr': 0.00421821567844694, 'weight_decay': 0.00011946286871298436, 'num_blocks': 2, 'num_heads': 3, 'num_segments': 10, 'use_l1': False, 'step_size': 20, 'gamma': 0.26986870444488426}. Best is trial 2 with value: 1.0742079076312838.



===== Trial 5 =====
 lr=1.41e-05, wd=4.77e-06, L1=on(2.59e-05), blocks=2, heads=4, segs=10, step_size=20, gamma=0.67


Epoch 001 | train_loss=14.3605 acc=0.2975 | val_loss=1.0913 acc=0.4317 | time=36.8s
Epoch 002 | train_loss=14.2887 acc=0.4311 | val_loss=1.0802 acc=0.4317 | time=36.7s
Epoch 003 | train_loss=14.2266 acc=0.4311 | val_loss=1.0761 acc=0.4317 | time=37.0s
Epoch 004 | train_loss=14.1707 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=36.8s


[W 2025-05-01 11:37:44,572] Trial 5 failed with parameters: {'lr': 1.4148595050869106e-05, 'weight_decay': 4.7749477404437315e-06, 'num_blocks': 2, 'num_heads': 4, 'num_segments': 10, 'use_l1': True, 'l1_lambda': 2.5920124189920035e-05, 'step_size': 20, 'gamma': 0.6651222578443667} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-9-e3aa22c865ab>", line 136, in objective_holdout
    tloss    += loss.item()
                ^^^^^^^^^^^
KeyboardInterrupt
[W 2025-05-01 11:37:44,574] Trial 5 failed with value None.


KeyboardInterrupt: 

### Test with dropout = 0.2 Model
- Cause Model Overfitting
- Validation Loss fastly decreases and increases after that
- Validation Accuracy increases, but the Validation Loss does not improve after some amount of epochs
- This is due to the overfitting and over confidence that model predicts the correct label as answer but cannot sure about it's answer

In [None]:
import os
import json
import time
import gc
import multiprocessing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
import optuna
import wandb

from eeg_dataset import EEGDataset
from models import EEGformer

# ─── Hyperparameter ranges and fixed settings ─────────────────────────
LR_MIN, LR_MAX     = 1e-5, 5e-3
WD_MIN, WD_MAX     = 1e-6, 1e-3
L1_MIN, L1_MAX     = 1e-7, 1e-3

NUM_FILTERS        = 120
NUM_BLOCK_CHOICES  = [1, 2, 3]
NUM_HEAD_CHOICES   = [2, 3, 4]
SEGMENT_CHOICES    = [5, 10, 15]

# ─── Training configuration ─────────────────────────
MAX_EPOCHS  = 100   # 100 epochs 고정
PATIENCE    = 20    # Early stopping patience
BATCH_SIZE  = 32
NUM_WORKERS = max(1, min(4, os.cpu_count() - 1))

# ─── Data paths & device ─────────────────────────
DATA_DIR   = '/content/drive/MyDrive/2025_Lab_Research/model-data'
LABEL_FILE = "labels.json"
DEVICE     = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def objective_holdout(trial):
    # ─ 1) Sample hyperparameters ────────────────────────────────
    lr           = trial.suggest_float("lr", LR_MIN, LR_MAX, log=True)
    weight_decay = trial.suggest_float("weight_decay", WD_MIN, WD_MAX, log=True)
    num_blocks   = trial.suggest_categorical("num_blocks", NUM_BLOCK_CHOICES)
    num_heads    = trial.suggest_categorical("num_heads", NUM_HEAD_CHOICES)
    num_segments = trial.suggest_categorical("num_segments", SEGMENT_CHOICES)

    use_l1       = trial.suggest_categorical("use_l1", [False, True])
    l1_lambda    = trial.suggest_float("l1_lambda", L1_MIN, L1_MAX, log=True) if use_l1 else 0.0

    step_size    = trial.suggest_int("step_size", 10, 30, step=10)
    gamma        = trial.suggest_float("gamma", 0.1, 0.9)

    print(f"\n===== Trial {trial.number} =====")
    print(
        f" lr={lr:.2e}, wd={weight_decay:.2e}, "
        f"L1={'on' if use_l1 else 'off'}{f'({l1_lambda:.2e})' if use_l1 else ''}, "
        f"blocks={num_blocks}, heads={num_heads}, segs={num_segments}, "
        f"step_size={step_size}, gamma={gamma:.2f}"
    )

    # ─ 2) Load data ────────────────────────────────────────────────
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    train_meta = [d for d in all_meta if d["type"] == "train"]
    full_ds    = EEGDataset(DATA_DIR, train_meta)
    labels     = [d["label"] for d in train_meta]
    n_samples  = len(full_ds)
    input_length = full_ds[0][0].shape[-1]

    # ─ 3) Hold-out split ──────────────────────────────────────────
    train_idx, val_idx = train_test_split(
        list(range(n_samples)),
        test_size=0.2,
        stratify=labels,
        random_state=42
    )
    train_loader = DataLoader(
        Subset(full_ds, train_idx),
        batch_size=BATCH_SIZE, shuffle=True,  num_workers=NUM_WORKERS
    )
    val_loader = DataLoader(
        Subset(full_ds, val_idx),
        batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS
    )

    # ─ 4) W&B init ────────────────────────────────────────────────
    wandb.init(project="eeg-holdout-tuning-3", config=trial.params)

    # ─ 5) Model / optimizer / loss ───────────────────────────────
    model = EEGformer(
        in_channels  = 19,
        input_length = input_length,
        kernel_size  = 10,
        num_filters  = NUM_FILTERS,
        num_heads    = num_heads,
        num_blocks   = num_blocks,
        num_segments = num_segments,
        num_classes  = 3
    ).to(DEVICE)

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=weight_decay  # L2
    )
    criterion = nn.CrossEntropyLoss()

    # ─ 6) Scheduler ───────────────────────────────────────────────
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=step_size,
        gamma=gamma
    )

    # ─ 7) Training loop w/ Early Stopping & Pruning ──────────────
    best_val_loss     = float("inf")
    epochs_no_improve = 0

    # placeholders for best-epoch metrics
    best_train_loss = best_train_acc = best_val_acc = None

    for epoch in range(1, MAX_EPOCHS + 1):
        t0 = time.time()

        # — train —
        model.train()
        tloss = tcorrect = ttotal = 0
        for X, y in train_loader:
            X, y = X.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            logits = model(X)
            loss   = criterion(logits, y)
            if l1_lambda > 0:
                l1_norm = sum(p.abs().sum() for p in model.parameters())
                loss    = loss + l1_lambda * l1_norm
            loss.backward()
            optimizer.step()
            tloss    += loss.item()
            tcorrect += (logits.argmax(1) == y).sum().item()
            ttotal   += y.size(0)
        train_loss = tloss / len(train_loader)
        train_acc  = tcorrect / ttotal

        # — validate —
        model.eval()
        vloss = vcorrect = vtotal = 0
        with torch.no_grad():
            for X, y in val_loader:
                X, y = X.to(DEVICE), y.to(DEVICE)
                logits = model(X)
                loss   = criterion(logits, y)
                vloss    += loss.item()
                vcorrect += (logits.argmax(1) == y).sum().item()
                vtotal   += y.size(0)
        val_loss = vloss / len(val_loader)
        val_acc  = vcorrect / vtotal
        elapsed  = time.time() - t0

        # — report & pruning check —
        trial.report(val_loss, epoch)
        if trial.should_prune():
            wandb.finish()
            print(f"▸ Trial {trial.number} pruned at epoch {epoch}")
            raise optuna.TrialPruned()

        # — print & log —
        print(
            f"Epoch {epoch:03d} | "
            f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
            f"val_loss={val_loss:.4f} acc={val_acc:.4f} | "
            f"time={elapsed:.1f}s"
        )
        wandb.log({
            "epoch":               epoch,
            "train_loss":          train_loss,
            "train_accuracy":      train_acc,
            "validation_loss":     val_loss,
            "validation_accuracy": val_acc,
        }, step=epoch)

        scheduler.step()

        # — early stopping logic & save best metrics —
        if val_loss < best_val_loss:
            best_val_loss     = val_loss
            epochs_no_improve = 0
            best_train_loss   = train_loss
            best_train_acc    = train_acc
            best_val_acc      = val_acc
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print(f"★ Early stopping at epoch {epoch}")
                break

    # store best-epoch metrics
    trial.set_user_attr("best_train_loss", best_train_loss)
    trial.set_user_attr("best_train_acc",  best_train_acc)
    trial.set_user_attr("best_val_acc",    best_val_acc)

    wandb.finish()
    gc.collect()
    return best_val_loss


if __name__ == "__main__":
    multiprocessing.freeze_support()
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
        study_name="eeg_holdout_trial-3",
        storage="sqlite:////content/drive/MyDrive/2025_Lab_Research/eeg_holdout-3.db",
        load_if_exists=True
    )
    study.optimize(objective_holdout, n_trials=30)

    # ─── 결과 출력 ────────────────────────────────────────────────
    best = study.best_trial
    print("\n===== Best Trial =====")
    print(f"best_val_loss       = {best.value:.6f}")
    print(f"best_train_loss     = {best.user_attrs['best_train_loss']:.6f}")
    print(f"best_train_accuracy = {best.user_attrs['best_train_acc']:.4f}")
    print(f"best_val_accuracy   = {best.user_attrs['best_val_acc']:.4f}")
    print("best params:")
    for k, v in best.params.items():
        print(f"  {k}: {v}")


[I 2025-05-01 05:26:38,791] A new study created in RDB with name: eeg_holdout_trial-3



===== Trial 0 =====
 lr=3.77e-03, wd=2.27e-06, L1=off, blocks=3, heads=3, segs=15, step_size=20, gamma=0.39


0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,██▇▇▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,█▃▃▂▄▂▁▃▁▂▂▁▁▁▂▂▁▂▁▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▂▂▁▁▂

0,1
epoch,50.0
train_accuracy,0.43107
train_loss,3.89059
validation_accuracy,0.43168
validation_loss,1.0748


Epoch 001 | train_loss=1.0715 acc=0.4291 | val_loss=1.0786 acc=0.4317 | time=38.8s
Epoch 002 | train_loss=1.0694 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=38.6s
Epoch 003 | train_loss=1.0699 acc=0.4260 | val_loss=1.0767 acc=0.4317 | time=38.7s
Epoch 004 | train_loss=1.0687 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=38.5s
Epoch 005 | train_loss=1.0678 acc=0.4221 | val_loss=1.0767 acc=0.4317 | time=38.7s
Epoch 006 | train_loss=1.0675 acc=0.4311 | val_loss=1.0764 acc=0.4317 | time=38.6s
Epoch 007 | train_loss=1.0685 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=38.7s
Epoch 008 | train_loss=1.0674 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=38.7s
Epoch 009 | train_loss=1.0675 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=38.8s
Epoch 010 | train_loss=1.0671 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=38.8s
Epoch 011 | train_loss=1.0670 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=38.8s
Epoch 012 | train_loss=1.0665 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=38.7s
Epoc

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
train_accuracy,▆█▄█▁█████████████████████████████
train_loss,█▆▆▅▄▃▄▃▃▃▃▂▂▃▂▂▄▄▂▃▂▂▃▁▁▂▂▁▂▂▂▁▂▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,█▂▅▁▅▅▃▂▂▁▁▂▃▁▂▁▂▂▂▃▂▂▂▁▂▂▂▂▂▂▂▂▂▂

0,1
epoch,34.0
train_accuracy,0.43107
train_loss,1.06561
validation_accuracy,0.43168
validation_loss,1.0747


[I 2025-05-01 05:48:37,850] Trial 0 finished with value: 1.0742357401620775 and parameters: {'lr': 0.003765783429817261, 'weight_decay': 2.2676625492339326e-06, 'num_blocks': 3, 'num_heads': 3, 'num_segments': 15, 'use_l1': False, 'step_size': 20, 'gamma': 0.391554470062229}. Best is trial 0 with value: 1.0742357401620775.



===== Trial 1 =====
 lr=1.64e-03, wd=1.76e-05, L1=off, blocks=1, heads=2, segs=5, step_size=30, gamma=0.29


Epoch 001 | train_loss=1.0704 acc=0.4264 | val_loss=1.0764 acc=0.4317 | time=17.5s
Epoch 002 | train_loss=1.0709 acc=0.4237 | val_loss=1.0755 acc=0.4317 | time=17.7s
Epoch 003 | train_loss=1.0689 acc=0.4311 | val_loss=1.0763 acc=0.4317 | time=17.4s
Epoch 004 | train_loss=1.0687 acc=0.4311 | val_loss=1.0766 acc=0.5497 | time=17.7s
Epoch 005 | train_loss=1.0689 acc=0.4237 | val_loss=1.0878 acc=0.4317 | time=17.5s
Epoch 006 | train_loss=1.0701 acc=0.4311 | val_loss=1.0755 acc=0.4317 | time=17.6s
Epoch 007 | train_loss=1.0669 acc=0.4311 | val_loss=1.0740 acc=0.4317 | time=17.8s
Epoch 008 | train_loss=1.0350 acc=0.4761 | val_loss=0.9835 acc=0.5668 | time=17.3s
Epoch 009 | train_loss=0.9477 acc=0.5852 | val_loss=0.9568 acc=0.5435 | time=17.4s
Epoch 010 | train_loss=0.9278 acc=0.5899 | val_loss=0.9399 acc=0.5637 | time=17.4s
Epoch 011 | train_loss=0.8934 acc=0.6058 | val_loss=0.9082 acc=0.5590 | time=17.4s
Epoch 012 | train_loss=0.8634 acc=0.6237 | val_loss=0.9219 acc=0.5823 | time=17.7s
Epoc

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██
train_accuracy,▁▁▁▁▁▁▁▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇██████████
train_loss,████████▇▇▇▇▆▆▆▆▅▅▅▄▅▄▄▄▃▃▃▃▂▂▂▁▁▁▁▁▁▁▁▁
validation_accuracy,▁▁▁▄▁▁▁▅▄▅▄▅▆▇▇▇▇▇▇▆█▇█▇▇▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇
validation_loss,▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▃▄▄▅▅▅▆▆▇█

0,1
epoch,41.0
train_accuracy,0.99883
train_loss,0.0059
validation_accuracy,0.65994
validation_loss,4.16215


[I 2025-05-01 06:00:37,715] Trial 1 finished with value: 0.8029196219784873 and parameters: {'lr': 0.0016414239569236648, 'weight_decay': 1.7560316464941144e-05, 'num_blocks': 1, 'num_heads': 2, 'num_segments': 5, 'use_l1': False, 'step_size': 30, 'gamma': 0.2883742978308369}. Best is trial 1 with value: 0.8029196219784873.



===== Trial 2 =====
 lr=4.43e-03, wd=2.92e-04, L1=off, blocks=1, heads=3, segs=15, step_size=30, gamma=0.58


Epoch 001 | train_loss=1.0763 acc=0.4163 | val_loss=1.0788 acc=0.4317 | time=18.1s
Epoch 002 | train_loss=1.0697 acc=0.4276 | val_loss=1.0811 acc=0.4317 | time=18.0s
Epoch 003 | train_loss=1.0676 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=18.1s
Epoch 004 | train_loss=1.0698 acc=0.4311 | val_loss=1.0761 acc=0.4317 | time=18.0s
Epoch 005 | train_loss=1.0690 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=18.2s
Epoch 006 | train_loss=1.0680 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=18.0s
Epoch 007 | train_loss=1.0684 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=18.0s
Epoch 008 | train_loss=1.0667 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=17.9s
Epoch 009 | train_loss=1.0677 acc=0.4311 | val_loss=1.0753 acc=0.4317 | time=18.0s
Epoch 010 | train_loss=1.0670 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=18.2s
Epoch 011 | train_loss=1.0672 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=17.8s
Epoch 012 | train_loss=1.0671 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=18.2s
Epoc

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train_accuracy,▁▆█████████████████████████████████
train_loss,█▄▂▄▃▂▃▂▂▂▂▂▂▂▁▂▂▁▂▂▂▂▂▂▁▁▁▂▂▁▁▁▁▂▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,▆█▁▃▁▁▁▁▂▂▂▁▁▂▁▂▆▁▁▁▂▁▁▂▃▃▂▁▁▂▂▁▁▂▂

0,1
epoch,35.0
train_accuracy,0.43107
train_loss,1.06629
validation_accuracy,0.43168
validation_loss,1.07477


[I 2025-05-01 06:11:11,814] Trial 2 finished with value: 1.074278161639259 and parameters: {'lr': 0.004432919051350885, 'weight_decay': 0.0002924091059973708, 'num_blocks': 1, 'num_heads': 3, 'num_segments': 15, 'use_l1': False, 'step_size': 30, 'gamma': 0.5790497300853228}. Best is trial 1 with value: 0.8029196219784873.



===== Trial 3 =====
 lr=5.27e-05, wd=2.07e-06, L1=off, blocks=2, heads=2, segs=15, step_size=30, gamma=0.52


Epoch 001 | train_loss=1.0902 acc=0.3755 | val_loss=1.0777 acc=0.4317 | time=23.6s
Epoch 002 | train_loss=1.0688 acc=0.4311 | val_loss=1.0808 acc=0.4317 | time=23.6s
Epoch 003 | train_loss=1.0675 acc=0.4311 | val_loss=1.0771 acc=0.4317 | time=23.5s
Epoch 004 | train_loss=1.0682 acc=0.4311 | val_loss=1.0773 acc=0.4317 | time=23.6s
Epoch 005 | train_loss=1.0665 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=23.5s
Epoch 006 | train_loss=1.0671 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=23.5s
Epoch 007 | train_loss=1.0664 acc=0.4311 | val_loss=1.0759 acc=0.4317 | time=23.8s
Epoch 008 | train_loss=1.0671 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=23.7s
Epoch 009 | train_loss=1.0674 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=23.6s
Epoch 010 | train_loss=1.0669 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=23.6s
Epoch 011 | train_loss=1.0669 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=23.7s
Epoch 012 | train_loss=1.0665 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=23.5s
Epoc

0,1
epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇███
train_accuracy,▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇█████
train_loss,████████████████▇▆▆▅▅▅▅▄▄▄▄▄▄▃▃▃▂▃▂▂▂▂▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▆▇▇▇▇▆▇▇▇▇▇▇▇▇▇▇██▇██
validation_loss,███████████████▄▃▂▂▃▂▂▃▁▁▂▂▁▂▁▁▁▄▃▂▂▂▂▃▂

0,1
epoch,71.0
train_accuracy,0.77437
train_loss,0.52679
validation_accuracy,0.65062
validation_loss,0.90594


[I 2025-05-01 06:39:10,259] Trial 3 finished with value: 0.8621873628525507 and parameters: {'lr': 5.2710727169582986e-05, 'weight_decay': 2.0673073320955644e-06, 'num_blocks': 2, 'num_heads': 2, 'num_segments': 15, 'use_l1': False, 'step_size': 30, 'gamma': 0.5198228506140427}. Best is trial 1 with value: 0.8029196219784873.



===== Trial 4 =====
 lr=1.19e-04, wd=2.28e-06, L1=off, blocks=2, heads=3, segs=5, step_size=10, gamma=0.50


Epoch 001 | train_loss=1.0758 acc=0.4132 | val_loss=1.0746 acc=0.4317 | time=27.9s
Epoch 002 | train_loss=1.0662 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=27.9s
Epoch 003 | train_loss=1.0669 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=28.0s
Epoch 004 | train_loss=1.0659 acc=0.4311 | val_loss=1.0763 acc=0.4317 | time=28.1s
Epoch 005 | train_loss=1.0668 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=27.9s
Epoch 006 | train_loss=1.0671 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=27.9s
Epoch 007 | train_loss=1.0669 acc=0.4311 | val_loss=1.0761 acc=0.4317 | time=27.9s
Epoch 008 | train_loss=1.0679 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=28.2s
Epoch 009 | train_loss=1.0675 acc=0.4311 | val_loss=1.0754 acc=0.4317 | time=27.8s
Epoch 010 | train_loss=1.0674 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=27.8s
Epoch 011 | train_loss=1.0669 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=27.9s
Epoch 012 | train_loss=1.0660 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=28.0s
Epoc

0,1
epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██
train_accuracy,▁███████████████████████████
train_loss,█▁▂▁▂▂▂▃▂▂▂▁▂▂▂▁▂▁▁▁▁▁▂▁▁▁▂▂
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,▂▃▃█▂▁▇▁▅▂▃▂▃▂▂▃▄▁▆▆▃▃▃▃▄▂▃▂

0,1
epoch,28.0
train_accuracy,0.43107
train_loss,1.06686
validation_accuracy,0.43168
validation_loss,1.07468


[I 2025-05-01 06:52:14,978] Trial 4 finished with value: 1.0743985800516038 and parameters: {'lr': 0.0001190383086698816, 'weight_decay': 2.2762893026346425e-06, 'num_blocks': 2, 'num_heads': 3, 'num_segments': 5, 'use_l1': False, 'step_size': 10, 'gamma': 0.5023523291402322}. Best is trial 1 with value: 0.8029196219784873.



===== Trial 5 =====
 lr=4.94e-03, wd=8.78e-05, L1=off, blocks=3, heads=2, segs=10, step_size=20, gamma=0.72


Epoch 001 | train_loss=1.0747 acc=0.4171 | val_loss=1.0757 acc=0.4317 | time=31.5s
Epoch 002 | train_loss=1.0735 acc=0.4311 | val_loss=1.0768 acc=0.4317 | time=31.5s
Epoch 003 | train_loss=1.0687 acc=0.4311 | val_loss=1.0760 acc=0.4317 | time=31.3s
Epoch 004 | train_loss=1.0692 acc=0.4311 | val_loss=1.0754 acc=0.4317 | time=31.5s
Epoch 005 | train_loss=1.0676 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=31.4s
Epoch 006 | train_loss=1.0673 acc=0.4311 | val_loss=1.0759 acc=0.4317 | time=31.3s
Epoch 007 | train_loss=1.0678 acc=0.4311 | val_loss=1.0763 acc=0.4317 | time=31.4s
Epoch 008 | train_loss=1.0667 acc=0.4311 | val_loss=1.0758 acc=0.4317 | time=31.4s
Epoch 009 | train_loss=1.0671 acc=0.4311 | val_loss=1.0755 acc=0.4317 | time=31.5s
Epoch 010 | train_loss=1.0680 acc=0.4311 | val_loss=1.0795 acc=0.4317 | time=31.5s
Epoch 011 | train_loss=1.0667 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=31.5s
Epoch 012 | train_loss=1.0659 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=31.5s
Epoc

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
train_accuracy,▁█████████████████████████████████
train_loss,█▇▃▄▂▂▃▂▂▃▂▁▂▂▂▂▂▂▂▁▁▁▁▁▁▂▂▂▁▂▁▂▁▂
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,▃▄▃▃▁▃▄▃▃█▂▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂

0,1
epoch,34.0
train_accuracy,0.43107
train_loss,1.06661
validation_accuracy,0.43168
validation_loss,1.07486


[I 2025-05-01 07:10:07,066] Trial 5 finished with value: 1.0742452769052415 and parameters: {'lr': 0.004942789966732406, 'weight_decay': 8.779834965044777e-05, 'num_blocks': 3, 'num_heads': 2, 'num_segments': 10, 'use_l1': False, 'step_size': 20, 'gamma': 0.7236839001591417}. Best is trial 1 with value: 0.8029196219784873.



===== Trial 6 =====
 lr=4.34e-04, wd=9.31e-05, L1=off, blocks=3, heads=4, segs=5, step_size=20, gamma=0.19


Epoch 001 | train_loss=1.0753 acc=0.4151 | val_loss=1.0768 acc=0.4317 | time=46.6s
Epoch 002 | train_loss=1.0677 acc=0.4311 | val_loss=1.0755 acc=0.4317 | time=46.6s
Epoch 003 | train_loss=1.0663 acc=0.4283 | val_loss=1.0762 acc=0.4317 | time=46.6s
Epoch 004 | train_loss=1.0706 acc=0.4179 | val_loss=1.0748 acc=0.4317 | time=46.6s
Epoch 005 | train_loss=1.0692 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=46.7s
Epoch 006 | train_loss=1.0677 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=46.6s
Epoch 007 | train_loss=1.0705 acc=0.4311 | val_loss=1.0755 acc=0.4317 | time=46.5s
Epoch 008 | train_loss=1.0700 acc=0.4311 | val_loss=1.0762 acc=0.4317 | time=46.5s
Epoch 009 | train_loss=1.0665 acc=0.4311 | val_loss=1.0809 acc=0.4317 | time=46.5s
Epoch 010 | train_loss=1.0669 acc=0.4311 | val_loss=1.0809 acc=0.4317 | time=46.6s
Epoch 011 | train_loss=1.0675 acc=0.4311 | val_loss=1.0730 acc=0.4317 | time=46.7s
Epoch 012 | train_loss=1.0677 acc=0.4303 | val_loss=1.0747 acc=0.4317 | time=46.5s
Epoc

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train_accuracy,▁▁▁▁▁▁▁▁▁▁▁▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇███████
train_loss,███████████▇▇▇▇▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▃▃▄▄▅▅▅▆▆▆▆▆▆▆▇▆▇▇▇▇▇█▇▇▇█████
validation_loss,▅▅▅▅▅▅▅▅▅▅▄▄▃▂▂▃▂▂▂▂▂▂▂▁▂▄▄▄▄▃▅▄▅▅▆▆▆▇▇█

0,1
epoch,51.0
train_accuracy,0.91495
train_loss,0.21164
validation_accuracy,0.68012
validation_loss,1.28684


[I 2025-05-01 07:49:46,021] Trial 6 finished with value: 0.8401936023008256 and parameters: {'lr': 0.0004336672000136861, 'weight_decay': 9.309453956390096e-05, 'num_blocks': 3, 'num_heads': 4, 'num_segments': 5, 'use_l1': False, 'step_size': 20, 'gamma': 0.19028122261932925}. Best is trial 1 with value: 0.8029196219784873.



===== Trial 7 =====
 lr=5.62e-05, wd=4.70e-04, L1=on(1.20e-06), blocks=1, heads=2, segs=5, step_size=10, gamma=0.63


Epoch 001 | train_loss=1.6787 acc=0.3701 | val_loss=1.0764 acc=0.4317 | time=17.6s
Epoch 002 | train_loss=1.6524 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=17.6s
Epoch 003 | train_loss=1.6470 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=17.6s
Epoch 004 | train_loss=1.6431 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=17.4s
Epoch 005 | train_loss=1.6398 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=17.3s
Epoch 006 | train_loss=1.6350 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=17.5s
Epoch 007 | train_loss=1.6320 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=17.6s
Epoch 008 | train_loss=1.6269 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=17.7s
Epoch 009 | train_loss=1.6233 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=17.6s
Epoch 010 | train_loss=1.6193 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=17.2s
Epoch 011 | train_loss=1.6160 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=17.7s
Epoch 012 | train_loss=1.6133 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=17.5s
Epoc

0,1
epoch,▁▂▂▃▃▄▅▅▆▆▇▇█
train_accuracy,▁████████████
train_loss,█▅▅▄▄▃▃▃▂▂▂▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,█▃▂▁▃▁▂▃▃▁▃▄▄

0,1
epoch,13.0
train_accuracy,0.43107
train_loss,1.61093
validation_accuracy,0.43168
validation_loss,1.07528


[I 2025-05-01 07:53:53,935] Trial 7 pruned. 


▸ Trial 7 pruned at epoch 14

===== Trial 8 =====
 lr=1.14e-04, wd=1.26e-05, L1=on(3.97e-05), blocks=3, heads=3, segs=10, step_size=20, gamma=0.73


Epoch 001 | train_loss=21.8731 acc=0.4264 | val_loss=1.0743 acc=0.4317 | time=38.6s
Epoch 002 | train_loss=21.0796 acc=0.4311 | val_loss=1.0754 acc=0.4317 | time=38.5s
Epoch 003 | train_loss=20.3695 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=38.6s
Epoch 004 | train_loss=19.7420 acc=0.4311 | val_loss=1.0757 acc=0.4317 | time=38.7s
Epoch 005 | train_loss=19.1872 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=38.7s
Epoch 006 | train_loss=18.7003 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=38.7s
Epoch 007 | train_loss=18.2626 acc=0.4311 | val_loss=1.0757 acc=0.4317 | time=38.7s
Epoch 008 | train_loss=17.8673 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=38.8s
Epoch 009 | train_loss=17.5146 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=38.9s
Epoch 010 | train_loss=17.2010 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=38.7s
Epoch 011 | train_loss=16.9267 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=38.5s
Epoch 012 | train_loss=16.6762 acc=0.4311 | val_loss=1.0745 acc=0.4317 | tim

0,1
epoch,▁▂▂▃▃▄▅▅▆▆▇▇█
train_accuracy,▁████████████
train_loss,█▇▆▅▅▄▃▃▂▂▂▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,▁▆▁█▃▃█▆▅▂▂▂▂

0,1
epoch,13.0
train_accuracy,0.43107
train_loss,16.44486
validation_accuracy,0.43168
validation_loss,1.07455


[I 2025-05-01 08:02:57,386] Trial 8 pruned. 


▸ Trial 8 pruned at epoch 14

===== Trial 9 =====
 lr=3.74e-05, wd=2.44e-05, L1=off, blocks=3, heads=2, segs=10, step_size=20, gamma=0.17


Epoch 001 | train_loss=1.0724 acc=0.4210 | val_loss=1.0745 acc=0.4317 | time=31.6s
Epoch 002 | train_loss=1.0672 acc=0.4311 | val_loss=1.0760 acc=0.4317 | time=31.3s
Epoch 003 | train_loss=1.0669 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=31.5s
Epoch 004 | train_loss=1.0658 acc=0.4311 | val_loss=1.0768 acc=0.4317 | time=31.5s
Epoch 005 | train_loss=1.0661 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=31.3s
Epoch 006 | train_loss=1.0667 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=31.4s
Epoch 007 | train_loss=1.0669 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=31.4s
Epoch 008 | train_loss=1.0672 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=31.4s
Epoch 009 | train_loss=1.0676 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=31.4s
Epoch 010 | train_loss=1.0668 acc=0.4311 | val_loss=1.0757 acc=0.4317 | time=31.6s
Epoch 011 | train_loss=1.0673 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=31.3s
Epoch 012 | train_loss=1.0659 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=31.3s
Epoc

0,1
epoch,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
train_accuracy,▁████████████████████████
train_loss,█▃▂▁▂▂▃▃▃▂▃▁▂▂▂▂▂▁▂▂▁▁▂▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,▂▆▂█▁▄▃▂▃▅▃▂▃▂▁▃▃▄▂▄▃▃▄▃▃

0,1
epoch,25.0
train_accuracy,0.43107
train_loss,1.06545
validation_accuracy,0.43168
validation_loss,1.07488


[I 2025-05-01 08:16:05,851] Trial 9 finished with value: 1.074171318894341 and parameters: {'lr': 3.742882361019872e-05, 'weight_decay': 2.439192182545922e-05, 'num_blocks': 3, 'num_heads': 2, 'num_segments': 10, 'use_l1': False, 'step_size': 20, 'gamma': 0.16647853546069247}. Best is trial 1 with value: 0.8029196219784873.



===== Trial 10 =====
 lr=7.67e-04, wd=8.45e-06, L1=on(6.07e-04), blocks=1, heads=4, segs=5, step_size=30, gamma=0.89


Epoch 001 | train_loss=273.9746 acc=0.4194 | val_loss=1.0753 acc=0.4317 | time=20.2s
Epoch 002 | train_loss=242.1309 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=20.4s
Epoch 003 | train_loss=221.1341 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=20.4s
Epoch 004 | train_loss=202.2783 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=20.5s
Epoch 005 | train_loss=184.9904 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=20.1s
Epoch 006 | train_loss=169.1731 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=20.3s
Epoch 007 | train_loss=154.4483 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=20.3s
Epoch 008 | train_loss=140.6719 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=20.4s
Epoch 009 | train_loss=127.8163 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=20.5s
Epoch 010 | train_loss=115.8559 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=20.3s
Epoch 011 | train_loss=104.7579 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=20.4s
Epoch 012 | train_loss=94.4891 acc=0.4311 | val_loss=1.0748 acc=0

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
train_accuracy,▁█████████████████████████████████
train_loss,█▇▇▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,█▂▄▃▃▅▁▂▄▄▂▅▂▁▂▂▃▁▂▄▁▂▃▃▄▃▄▂▆▂▃▄▄▄

0,1
epoch,34.0
train_accuracy,0.43107
train_loss,6.09721
validation_accuracy,0.43168
validation_loss,1.07471


[I 2025-05-01 08:27:42,032] Trial 10 finished with value: 1.0742883057821364 and parameters: {'lr': 0.0007669393343338705, 'weight_decay': 8.450671040376852e-06, 'num_blocks': 1, 'num_heads': 4, 'num_segments': 5, 'use_l1': True, 'l1_lambda': 0.0006073678327728755, 'step_size': 30, 'gamma': 0.892573883329919}. Best is trial 1 with value: 0.8029196219784873.



===== Trial 11 =====
 lr=5.94e-04, wd=1.14e-04, L1=off, blocks=1, heads=4, segs=5, step_size=30, gamma=0.18


Epoch 001 | train_loss=1.0729 acc=0.4128 | val_loss=1.0742 acc=0.4317 | time=20.3s
Epoch 002 | train_loss=1.0698 acc=0.4198 | val_loss=1.0841 acc=0.4317 | time=20.4s
Epoch 003 | train_loss=1.0677 acc=0.4252 | val_loss=1.0843 acc=0.4317 | time=20.3s
Epoch 004 | train_loss=1.0685 acc=0.4214 | val_loss=1.0786 acc=0.4317 | time=20.3s
Epoch 005 | train_loss=1.0679 acc=0.4311 | val_loss=1.0769 acc=0.4317 | time=20.4s
Epoch 006 | train_loss=1.0677 acc=0.4311 | val_loss=1.0709 acc=0.4317 | time=20.3s
Epoch 007 | train_loss=0.9980 acc=0.5254 | val_loss=0.9644 acc=0.5590 | time=20.4s
Epoch 008 | train_loss=0.9389 acc=0.5852 | val_loss=0.9560 acc=0.5512 | time=20.3s
Epoch 009 | train_loss=0.9268 acc=0.5860 | val_loss=0.9426 acc=0.5637 | time=20.4s
Epoch 010 | train_loss=0.8981 acc=0.6016 | val_loss=0.9455 acc=0.6056 | time=20.2s
Epoch 011 | train_loss=0.8621 acc=0.6303 | val_loss=0.8939 acc=0.6227 | time=20.3s
Epoch 012 | train_loss=0.8332 acc=0.6466 | val_loss=0.9771 acc=0.6289 | time=20.3s
Epoc

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train_accuracy,▁▁▁▁▁▁▂▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇▇██████████
train_loss,███████▇▇▇▇▆▆▆▆▅▅▅▄▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▄▄▄▅▆▆▅▇▇████▇██▇▇█▇▇█▇█▇▇▇▇▇▇▇▇
validation_loss,▃▃▃▃▃▃▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▃▃▂▄▅▂▅▅▅▆▆▆▇▇▇█

0,1
epoch,38.0
train_accuracy,0.99728
train_loss,0.01344
validation_accuracy,0.66615
validation_loss,2.37659


[I 2025-05-01 08:40:37,807] Trial 11 finished with value: 0.6841289514587039 and parameters: {'lr': 0.0005937996982252374, 'weight_decay': 0.00011437784562484397, 'num_blocks': 1, 'num_heads': 4, 'num_segments': 5, 'use_l1': False, 'step_size': 30, 'gamma': 0.18108826448896542}. Best is trial 11 with value: 0.6841289514587039.



===== Trial 12 =====
 lr=1.30e-03, wd=6.93e-05, L1=off, blocks=1, heads=4, segs=5, step_size=30, gamma=0.30


Epoch 001 | train_loss=1.0727 acc=0.4272 | val_loss=1.0828 acc=0.4317 | time=20.5s
Epoch 002 | train_loss=1.0710 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=20.1s
Epoch 003 | train_loss=1.0686 acc=0.4311 | val_loss=1.0757 acc=0.4317 | time=20.2s
Epoch 004 | train_loss=1.0690 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=20.3s


0,1
epoch,▁▃▆█
train_accuracy,▁███
train_loss,█▅▁▂
validation_accuracy,▁▁▁▁
validation_loss,█▁▂▁

0,1
epoch,4.0
train_accuracy,0.43107
train_loss,1.06896
validation_accuracy,0.43168
validation_loss,1.07512


[I 2025-05-01 08:42:25,655] Trial 12 pruned. 


▸ Trial 12 pruned at epoch 5

===== Trial 13 =====
 lr=1.14e-03, wd=1.96e-04, L1=off, blocks=1, heads=4, segs=5, step_size=30, gamma=0.33


Epoch 001 | train_loss=1.0736 acc=0.4171 | val_loss=1.0809 acc=0.4317 | time=20.4s
Epoch 002 | train_loss=1.0690 acc=0.4311 | val_loss=1.0779 acc=0.4317 | time=20.3s
Epoch 003 | train_loss=1.0674 acc=0.4311 | val_loss=1.0723 acc=0.4317 | time=20.2s
Epoch 004 | train_loss=1.0326 acc=0.4862 | val_loss=1.0437 acc=0.5342 | time=20.3s
Epoch 005 | train_loss=0.9423 acc=0.5856 | val_loss=0.9551 acc=0.5683 | time=20.4s
Epoch 006 | train_loss=0.8944 acc=0.6132 | val_loss=0.9267 acc=0.5823 | time=20.5s
Epoch 007 | train_loss=0.8811 acc=0.6140 | val_loss=0.9465 acc=0.5776 | time=20.5s
Epoch 008 | train_loss=0.8429 acc=0.6346 | val_loss=0.9277 acc=0.5854 | time=20.4s
Epoch 009 | train_loss=0.7974 acc=0.6532 | val_loss=0.9905 acc=0.5994 | time=20.6s
Epoch 010 | train_loss=0.7792 acc=0.6621 | val_loss=0.9284 acc=0.5994 | time=20.7s
Epoch 011 | train_loss=0.7715 acc=0.6649 | val_loss=1.0118 acc=0.5854 | time=20.6s
Epoch 012 | train_loss=0.7126 acc=0.6850 | val_loss=0.9501 acc=0.6025 | time=20.4s
Epoc

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train_accuracy,▁▁▁▂▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇██████
train_loss,████▇▇▇▆▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▃▃▃▂▂▃▂▂▂▂▁▁▁▁▁
validation_accuracy,▁▁▁▄▆▆▆▆▇▇▆▇▇▇▆█▇▇▇█▇▇█▇▇▇▇▇▇▇▇▇▇▇▆▇▇▆
validation_loss,▂▂▂▂▁▁▁▁▁▁▂▁▂▁▁▁▂▁▁▁▁▁▂▃▃▃▅▄▃▃▃▄▅▆▆███

0,1
epoch,38.0
train_accuracy,0.97825
train_loss,0.0651
validation_accuracy,0.58696
validation_loss,3.00367


[I 2025-05-01 08:55:33,519] Trial 13 finished with value: 0.8515792347135998 and parameters: {'lr': 0.0011374342496391653, 'weight_decay': 0.00019628505332231725, 'num_blocks': 1, 'num_heads': 4, 'num_segments': 5, 'use_l1': False, 'step_size': 30, 'gamma': 0.3314326940086906}. Best is trial 11 with value: 0.6841289514587039.



===== Trial 14 =====
 lr=3.26e-04, wd=8.69e-04, L1=on(1.72e-07), blocks=1, heads=4, segs=5, step_size=30, gamma=0.12


Epoch 001 | train_loss=1.1564 acc=0.4132 | val_loss=1.0746 acc=0.4317 | time=20.5s
Epoch 002 | train_loss=1.1496 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=20.5s
Epoch 003 | train_loss=1.1464 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=20.5s
Epoch 004 | train_loss=1.1445 acc=0.4311 | val_loss=1.0760 acc=0.4317 | time=20.4s
Epoch 005 | train_loss=1.1424 acc=0.4311 | val_loss=1.0753 acc=0.4317 | time=20.3s
Epoch 006 | train_loss=1.1399 acc=0.4311 | val_loss=1.0719 acc=0.4317 | time=20.6s
Epoch 007 | train_loss=1.0876 acc=0.5344 | val_loss=0.9845 acc=0.5714 | time=20.3s
Epoch 008 | train_loss=1.0611 acc=0.5530 | val_loss=0.9821 acc=0.5528 | time=20.4s
Epoch 009 | train_loss=1.0161 acc=0.5891 | val_loss=1.0412 acc=0.4969 | time=20.3s
Epoch 010 | train_loss=1.0031 acc=0.5872 | val_loss=0.9763 acc=0.5792 | time=20.3s
Epoch 011 | train_loss=0.9849 acc=0.6008 | val_loss=0.9596 acc=0.5512 | time=20.4s
Epoch 012 | train_loss=0.9720 acc=0.6089 | val_loss=0.9630 acc=0.5559 | time=20.5s
Epoc

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
train_accuracy,▁▁▁▁▁▁▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇███
train_loss,██████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▁▁
validation_accuracy,▁▁▁▁▁▁▇▇▄█▇▇▇▇▇▇██▇▇▇▇▆▇▆▇▆▆▇▅▆▆▆
validation_loss,▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▅▄▄▆▇█

0,1
epoch,33.0
train_accuracy,0.91068
train_loss,0.32389
validation_accuracy,0.53882
validation_loss,2.06606


[I 2025-05-01 09:06:54,383] Trial 14 finished with value: 0.952350948538099 and parameters: {'lr': 0.0003264831943129236, 'weight_decay': 0.0008686858820868705, 'num_blocks': 1, 'num_heads': 4, 'num_segments': 5, 'use_l1': True, 'l1_lambda': 1.7178879551866324e-07, 'step_size': 30, 'gamma': 0.12018926331640767}. Best is trial 11 with value: 0.6841289514587039.



===== Trial 15 =====
 lr=1.44e-05, wd=2.85e-05, L1=off, blocks=1, heads=2, segs=5, step_size=30, gamma=0.27


Epoch 001 | train_loss=1.0812 acc=0.3697 | val_loss=1.0771 acc=0.4317 | time=17.5s
Epoch 002 | train_loss=1.0689 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=17.2s
Epoch 003 | train_loss=1.0673 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=17.3s
Epoch 004 | train_loss=1.0661 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=17.5s
Epoch 005 | train_loss=1.0664 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=17.5s
Epoch 006 | train_loss=1.0659 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=17.3s
Epoch 007 | train_loss=1.0667 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=17.3s
Epoch 008 | train_loss=1.0668 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=17.4s
Epoch 009 | train_loss=1.0670 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=17.4s
Epoch 010 | train_loss=1.0665 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=17.5s
Epoch 011 | train_loss=1.0663 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=17.3s
Epoch 012 | train_loss=1.0662 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=17.2s
Epoc

0,1
epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇██
train_accuracy,▁█████████████████████
train_loss,█▂▂▁▁▁▁▂▂▁▁▁▁▁▂▁▂▁▂▁▂▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,█▁▂▂▂▃▃▃▃▃▂▃▃▃▄▂▃▃▃▂▂▂

0,1
epoch,22.0
train_accuracy,0.43107
train_loss,1.06581
validation_accuracy,0.43168
validation_loss,1.07471


[I 2025-05-01 09:13:24,118] Trial 15 finished with value: 1.0742352292651223 and parameters: {'lr': 1.4403036813086675e-05, 'weight_decay': 2.8450636938411763e-05, 'num_blocks': 1, 'num_heads': 2, 'num_segments': 5, 'use_l1': False, 'step_size': 30, 'gamma': 0.26892398845800014}. Best is trial 11 with value: 0.6841289514587039.



===== Trial 16 =====
 lr=1.89e-03, wd=5.98e-06, L1=off, blocks=2, heads=2, segs=5, step_size=30, gamma=0.40


Epoch 001 | train_loss=1.0704 acc=0.4233 | val_loss=1.0786 acc=0.4317 | time=23.3s
Epoch 002 | train_loss=1.0698 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=23.2s
Epoch 003 | train_loss=1.0700 acc=0.4311 | val_loss=1.0761 acc=0.4317 | time=23.2s
Epoch 004 | train_loss=1.0687 acc=0.4311 | val_loss=1.0757 acc=0.4317 | time=23.4s
Epoch 005 | train_loss=1.0677 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=23.5s
Epoch 006 | train_loss=1.0676 acc=0.4311 | val_loss=1.0761 acc=0.4317 | time=23.5s
Epoch 007 | train_loss=1.0701 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=23.2s
Epoch 008 | train_loss=1.0669 acc=0.4311 | val_loss=1.0758 acc=0.4317 | time=23.2s
Epoch 009 | train_loss=1.0675 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=23.3s
Epoch 010 | train_loss=1.0665 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=23.1s
Epoch 011 | train_loss=1.0676 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=23.2s
Epoch 012 | train_loss=1.0668 acc=0.4311 | val_loss=1.0760 acc=0.4317 | time=23.4s
Epoc

0,1
epoch,▁▂▂▃▃▄▅▅▆▆▇▇█
train_accuracy,▁████████████
train_loss,█▇▇▅▃▃▇▂▃▁▃▂▂
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,█▂▄▃▁▄▁▄▂▂▂▄▃

0,1
epoch,13.0
train_accuracy,0.43107
train_loss,1.0671
validation_accuracy,0.43168
validation_loss,1.07559


[I 2025-05-01 09:18:56,906] Trial 16 pruned. 


▸ Trial 16 pruned at epoch 14

===== Trial 17 =====
 lr=6.61e-04, wd=5.31e-05, L1=off, blocks=1, heads=4, segs=5, step_size=20, gamma=0.21


Epoch 001 | train_loss=1.0709 acc=0.4245 | val_loss=1.0811 acc=0.4317 | time=20.4s
Epoch 002 | train_loss=1.0691 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=20.3s
Epoch 003 | train_loss=1.0703 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=20.5s
Epoch 004 | train_loss=1.0680 acc=0.4311 | val_loss=1.0754 acc=0.4317 | time=20.3s
Epoch 005 | train_loss=1.0683 acc=0.4311 | val_loss=1.0834 acc=0.4317 | time=20.3s
Epoch 006 | train_loss=1.0688 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=20.4s
Epoch 007 | train_loss=1.0681 acc=0.4311 | val_loss=1.0772 acc=0.4317 | time=20.3s
Epoch 008 | train_loss=1.0676 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=20.4s
Epoch 009 | train_loss=1.0668 acc=0.4311 | val_loss=1.0753 acc=0.4317 | time=20.3s
Epoch 010 | train_loss=1.0675 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=20.3s
Epoch 011 | train_loss=1.0658 acc=0.4311 | val_loss=1.0711 acc=0.4317 | time=20.4s
Epoch 012 | train_loss=1.0330 acc=0.4924 | val_loss=0.9821 acc=0.5295 | time=20.2s
Epoc

0,1
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇██
train_accuracy,▁▁▁▁▁▁▁▁▁▂▄▄▄▄▄▅▅▆▆▆▆▆▆▆▆▇▇▇█▇██████████
train_loss,█████████▇▇▆▆▆▆▅▅▄▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▁▁▁▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▄▅▅▅▆▆▆▆▆▆▆▆▆▇▆▇▇▇▇▇▇██████████
validation_loss,██████▅▅▅▄▄▃▃▃▃▃▃▂▃▂▂▃▃▃▄▁▂▄▄▄▅▄▄▄▅▅▅▅▆▆

0,1
epoch,58.0
train_accuracy,0.86796
train_loss,0.33339
validation_accuracy,0.67391
validation_loss,0.99908


[I 2025-05-01 09:38:43,220] Trial 17 finished with value: 0.8123467592965989 and parameters: {'lr': 0.0006613322502150471, 'weight_decay': 5.306026913460125e-05, 'num_blocks': 1, 'num_heads': 4, 'num_segments': 5, 'use_l1': False, 'step_size': 20, 'gamma': 0.20561411467734086}. Best is trial 11 with value: 0.6841289514587039.



===== Trial 18 =====
 lr=1.90e-04, wd=1.72e-04, L1=on(1.48e-05), blocks=1, heads=2, segs=10, step_size=10, gamma=0.39


Epoch 001 | train_loss=8.1850 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=17.3s
Epoch 002 | train_loss=7.9208 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=17.3s
Epoch 003 | train_loss=7.6896 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=17.5s
Epoch 004 | train_loss=7.4814 acc=0.4311 | val_loss=1.0813 acc=0.4317 | time=17.5s
Epoch 005 | train_loss=7.2928 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=17.6s
Epoch 006 | train_loss=7.1234 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=17.5s
Epoch 007 | train_loss=6.9673 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=17.3s
Epoch 008 | train_loss=6.8240 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=17.6s
Epoch 009 | train_loss=6.6866 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=17.5s


In [None]:
import os
import json
import time
import gc
import multiprocessing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
import optuna
import wandb

from eeg_dataset import EEGDataset
from models import EEGformer

# ─── Hyperparameter ranges and fixed settings ─────────────────────────
LR_MIN, LR_MAX     = 1e-4, 1e-3
WD_MIN, WD_MAX     = 1e-5, 1e-3
NUM_FILTERS        = 120
NUM_BLOCK_CHOICES  = [1, 2, 3]
NUM_HEAD_CHOICES   = [2, 3, 4]
SEGMENT_CHOICES    = [5, 10, 15]

# ─── Training configuration ─────────────────────────
MAX_EPOCHS  = 200
PATIENCE    = 20
BATCH_SIZE  = 32
NUM_WORKERS = max(1, min(4, os.cpu_count() - 1))

# ─── Data paths & device ─────────────────────────
DATA_DIR   = '/content/drive/MyDrive/2025_Lab_Research/model-data'
LABEL_FILE = "labels.json"
DEVICE     = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def objective_holdout(trial):
    # 1) Sample hyperparameters
    lr            = trial.suggest_float("lr", LR_MIN, LR_MAX, log=True)
    weight_decay  = trial.suggest_float("weight_decay", WD_MIN, WD_MAX, log=True)
    num_blocks    = trial.suggest_categorical("num_blocks", NUM_BLOCK_CHOICES)
    num_heads     = trial.suggest_categorical("num_heads", NUM_HEAD_CHOICES)
    num_segments  = trial.suggest_categorical("num_segments", SEGMENT_CHOICES)
    # New: sample scheduler hyperparameters
    step_size     = trial.suggest_int("step_size", 10, 30, step=10)
    gamma         = trial.suggest_float("gamma", 0.1, 0.9)

    print(f"\n===== Trial {trial.number} =====")
    print(
        f" lr={lr:.2e}, wd={weight_decay:.2e}, blocks={num_blocks}, "
        f"heads={num_heads}, segs={num_segments}, "
        f"step_size={step_size}, gamma={gamma:.2f}"
    )

    # 2) Load metadata and build dataset
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    train_meta  = [d for d in all_meta if d["type"] == "train"]
    full_ds     = EEGDataset(DATA_DIR, train_meta)
    labels      = [d["label"] for d in train_meta]
    n_samples   = len(full_ds)
    input_length = full_ds[0][0].shape[-1]

    # 3) Single hold-out split (80/20), stratified
    train_idx, val_idx = train_test_split(
        list(range(n_samples)),
        test_size=0.2,
        stratify=labels,
        random_state=42
    )
    train_loader = DataLoader(
        Subset(full_ds, train_idx),
        batch_size=BATCH_SIZE, shuffle=True,  num_workers=NUM_WORKERS
    )
    val_loader = DataLoader(
        Subset(full_ds, val_idx),
        batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS
    )

    # 4) W&B init
    wandb.init(project="eeg-holdout-tuning-1", config=trial.params)

    # 5) Build model, optimizer, loss
    model = EEGformer(
        in_channels  = 19,
        input_length = input_length,
        kernel_size  = 10,
        num_filters  = NUM_FILTERS,
        num_heads    = num_heads,
        num_blocks   = num_blocks,
        num_segments = num_segments,
        num_classes  = 3
    ).to(DEVICE)

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=weight_decay
    )
    criterion = nn.CrossEntropyLoss()

    # 6) Scheduler with sampled step_size & gamma
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=step_size,
        gamma=gamma
    )

    # 7) Training loop with early stopping & pruning
    best_val_loss     = float("inf")
    epochs_no_improve = 0
    # store best-epoch metrics
    best_train_loss = best_train_acc = best_val_acc = None

    for epoch in range(1, MAX_EPOCHS + 1):
        t0 = time.time()

        # — train —
        model.train()
        tloss = tcorrect = ttotal = 0
        for X, y in train_loader:
            X, y = X.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            logits = model(X)
            loss   = criterion(logits, y)
            loss.backward()
            optimizer.step()
            tloss    += loss.item()
            tcorrect += (logits.argmax(1) == y).sum().item()
            ttotal   += y.size(0)
        train_loss = tloss / len(train_loader)
        train_acc  = tcorrect / ttotal

        # — validate —
        model.eval()
        vloss = vcorrect = vtotal = 0
        with torch.no_grad():
            for X, y in val_loader:
                X, y = X.to(DEVICE), y.to(DEVICE)
                logits = model(X)
                loss   = criterion(logits, y)
                vloss    += loss.item()
                vcorrect += (logits.argmax(1) == y).sum().item()
                vtotal   += y.size(0)
        val_loss = vloss / len(val_loader)
        val_acc  = vcorrect / vtotal
        elapsed  = time.time() - t0

        # report & prune
        trial.report(val_loss, epoch)
        if trial.should_prune():
            print(f"▸ Trial {trial.number} pruned at epoch {epoch}")
            raise optuna.TrialPruned()

        print(
            f"Epoch {epoch:03d} | "
            f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
            f"val_loss={val_loss:.4f} acc={val_acc:.4f} | "
            f"time={elapsed:.1f}s"
        )

        # — log all four metrics to wandb for graphing
        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss,
            "train_accuracy": train_acc,
            "validation_loss": val_loss,
            "validation_accuracy": val_acc,
        }, step=epoch)

        # step the scheduler
        scheduler.step()

        # early stopping & record best
        if val_loss < best_val_loss:
            best_val_loss     = val_loss
            epochs_no_improve = 0
            best_train_loss   = train_loss
            best_train_acc    = train_acc
            best_val_acc      = val_acc
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print(f"★ Early stopping at epoch {epoch}")
                break

    # store best-epoch metrics
    trial.set_user_attr("best_train_loss", best_train_loss)
    trial.set_user_attr("best_train_acc",  best_train_acc)
    trial.set_user_attr("best_val_acc",    best_val_acc)

    wandb.finish()
    gc.collect()
    return best_val_loss


if __name__ == "__main__":
    multiprocessing.freeze_support()
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
        study_name="eeg_holdout_trial-1",
        storage="sqlite:////content/drive/MyDrive/2025_Lab_Research/eeg_holdout-1.db",
        load_if_exists=True
    )
    study.optimize(objective_holdout, n_trials=30)

    # print best trial & metrics
    best = study.best_trial
    print("\n===== Best Trial =====")
    print(f"best_val_loss   = {best.value:.6f}")
    print(f"best_val_acc    = {best.user_attrs['best_val_acc']:.4f}")
    print(f"best_train_loss = {best.user_attrs['best_train_loss']:.4f}")
    print(f"best_train_acc  = {best.user_attrs['best_train_acc']:.4f}")
    print("best params:")
    for k, v in best.params.items():
        print(f"  {k}: {v}")


[I 2025-04-30 22:40:14,173] Using an existing study with name 'eeg_holdout_trial-1' instead of creating a new one.



===== Trial 1 =====
 lr=2.50e-04, wd=1.45e-04, blocks=1, heads=4, segs=10, step_size=20, gamma=0.70


0,1
epoch,▁█
train_accuracy,▁█
train_loss,▁█
validation_accuracy,▁█
validation_loss,█▁

0,1
epoch,2.0
train_accuracy,0.42951
train_loss,1.07145
validation_accuracy,0.43168
validation_loss,1.07453


Epoch 001 | train_loss=1.0706 acc=0.4260 | val_loss=1.0761 acc=0.4317 | time=20.7s
Epoch 002 | train_loss=1.0684 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=20.4s
Epoch 003 | train_loss=1.0676 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=20.3s
Epoch 004 | train_loss=1.0671 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=20.5s
Epoch 005 | train_loss=1.0679 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=20.4s
Epoch 006 | train_loss=1.0684 acc=0.4311 | val_loss=1.0762 acc=0.4317 | time=20.5s
Epoch 007 | train_loss=1.0676 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=20.4s
Epoch 008 | train_loss=1.0669 acc=0.4311 | val_loss=1.0765 acc=0.4317 | time=20.5s
Epoch 009 | train_loss=1.0672 acc=0.4311 | val_loss=1.0836 acc=0.4317 | time=20.4s
Epoch 010 | train_loss=1.0709 acc=0.4249 | val_loss=1.0759 acc=0.4317 | time=20.4s
Epoch 011 | train_loss=1.0673 acc=0.4311 | val_loss=1.0768 acc=0.4317 | time=20.6s
Epoch 012 | train_loss=1.0681 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=20.3s
Epoc

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▂▃▃▃▃▄▄▄▅▅▅▅▆▆▇▇▇▇▇█████████
train_loss,█████████████▇▇▇▇▆▆▆▅▅▅▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇█▇█▇▇█▇█
validation_loss,▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▂▁▂▃▂▂▃▅▃▄▅▇▆▇█▇

0,1
epoch,49.0
train_accuracy,0.97165
train_loss,0.09872
validation_accuracy,0.67857
validation_loss,2.05648


[I 2025-04-30 22:57:00,480] Trial 1 finished with value: 0.8923814424446651 and parameters: {'lr': 0.000249828961706864, 'weight_decay': 0.000144630365763184, 'num_blocks': 1, 'num_heads': 4, 'num_segments': 10, 'step_size': 20, 'gamma': 0.6986434256347877}. Best is trial 1 with value: 0.8923814424446651.



===== Trial 2 =====
 lr=4.82e-04, wd=7.72e-05, blocks=1, heads=2, segs=10, step_size=20, gamma=0.70


Epoch 001 | train_loss=1.0710 acc=0.4256 | val_loss=1.0749 acc=0.4317 | time=17.7s
Epoch 002 | train_loss=1.0690 acc=0.4311 | val_loss=1.0761 acc=0.4317 | time=17.2s
Epoch 003 | train_loss=1.0680 acc=0.4311 | val_loss=1.0782 acc=0.4317 | time=17.6s
Epoch 004 | train_loss=1.0697 acc=0.4311 | val_loss=1.0756 acc=0.4317 | time=17.6s
Epoch 005 | train_loss=1.0682 acc=0.4311 | val_loss=1.0781 acc=0.4317 | time=17.7s
Epoch 006 | train_loss=1.0679 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=17.6s
Epoch 007 | train_loss=1.0316 acc=0.4792 | val_loss=0.9652 acc=0.5575 | time=17.4s
Epoch 008 | train_loss=0.9506 acc=0.5833 | val_loss=0.9921 acc=0.5140 | time=17.4s
Epoch 009 | train_loss=0.9262 acc=0.5922 | val_loss=0.9432 acc=0.5528 | time=17.6s
Epoch 010 | train_loss=0.9066 acc=0.6035 | val_loss=0.9319 acc=0.5714 | time=17.2s
Epoch 011 | train_loss=0.8757 acc=0.6167 | val_loss=0.9201 acc=0.5745 | time=17.8s
Epoch 012 | train_loss=0.8435 acc=0.6311 | val_loss=0.9452 acc=0.5870 | time=17.4s
Epoc

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
train_accuracy,▁▁▁▁▁▁▂▃▃▃▃▄▄▄▄▄▄▅▅▆▆▆▇▇▇▇▇▇█▇████
train_loss,███████▇▇▇▇▆▆▆▆▅▆▅▅▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▅▄▅▆▆▆▇█▆▆▇████▇█▇██▇▇█▇▇▇██
validation_loss,▂▂▂▂▂▂▁▁▁▁▁▁▁▁▂▂▁▂▂▂▂▂▃▃▃▅▄▅▅▅▆▆▆█

0,1
epoch,34.0
train_accuracy,0.97359
train_loss,0.07808
validation_accuracy,0.63509
validation_loss,2.59377


[I 2025-04-30 23:06:59,494] Trial 2 finished with value: 0.8809579866273063 and parameters: {'lr': 0.00048240685668179294, 'weight_decay': 7.722789242670862e-05, 'num_blocks': 1, 'num_heads': 2, 'num_segments': 10, 'step_size': 20, 'gamma': 0.703019033811063}. Best is trial 2 with value: 0.8809579866273063.



===== Trial 3 =====
 lr=5.76e-04, wd=3.83e-05, blocks=2, heads=4, segs=10, step_size=20, gamma=0.11


Epoch 001 | train_loss=1.0753 acc=0.4151 | val_loss=1.0790 acc=0.4317 | time=33.8s
Epoch 002 | train_loss=1.0717 acc=0.4252 | val_loss=1.0744 acc=0.4317 | time=33.4s
Epoch 003 | train_loss=1.0686 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=33.6s
Epoch 004 | train_loss=1.0700 acc=0.4311 | val_loss=1.0785 acc=0.4317 | time=33.7s
Epoch 005 | train_loss=1.0672 acc=0.4210 | val_loss=1.0869 acc=0.4317 | time=33.4s
Epoch 006 | train_loss=1.0672 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=33.5s
Epoch 007 | train_loss=1.0667 acc=0.4311 | val_loss=1.0808 acc=0.4317 | time=33.6s
Epoch 008 | train_loss=1.0678 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=33.6s
Epoch 009 | train_loss=1.0676 acc=0.4311 | val_loss=1.0753 acc=0.4317 | time=33.5s
Epoch 010 | train_loss=1.0617 acc=0.4392 | val_loss=1.0224 acc=0.5730 | time=33.4s
Epoch 011 | train_loss=0.9791 acc=0.5616 | val_loss=0.9815 acc=0.5217 | time=33.6s
Epoch 012 | train_loss=0.9443 acc=0.5852 | val_loss=1.0092 acc=0.5683 | time=33.5s
Epoc

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇██
train_accuracy,▁▁▁▁▁▁▁▁▁▁▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇███
train_loss,██████████▇▆▆▆▆▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▁▆▄▆▆▆▇▇▇▇▇████████▇▇▇▇█████████
validation_loss,█████████▆▄▅▄▄▃▃▂▅▂▁▁▁▁▁▂▂▂▂▂▂▂▃▄▃▇▅▅▆▆▆

0,1
epoch,43.0
train_accuracy,0.81631
train_loss,0.48586
validation_accuracy,0.61491
validation_loss,1.03933


[I 2025-04-30 23:31:03,649] Trial 3 finished with value: 0.87745715039117 and parameters: {'lr': 0.0005763140134110753, 'weight_decay': 3.829160944186297e-05, 'num_blocks': 2, 'num_heads': 4, 'num_segments': 10, 'step_size': 20, 'gamma': 0.10570697111390635}. Best is trial 3 with value: 0.87745715039117.



===== Trial 4 =====
 lr=9.67e-04, wd=1.00e-04, blocks=3, heads=2, segs=5, step_size=30, gamma=0.46


Epoch 001 | train_loss=1.0718 acc=0.4148 | val_loss=1.0756 acc=0.4317 | time=31.3s
Epoch 002 | train_loss=1.0684 acc=0.4311 | val_loss=1.0816 acc=0.4317 | time=31.3s
Epoch 003 | train_loss=1.0679 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=31.3s
Epoch 004 | train_loss=1.0679 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=31.4s
Epoch 005 | train_loss=1.0676 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=31.2s
Epoch 006 | train_loss=1.0670 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=31.3s
Epoch 007 | train_loss=1.0677 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=31.2s
Epoch 008 | train_loss=1.0672 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=31.5s
Epoch 009 | train_loss=1.0669 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=31.3s
Epoch 010 | train_loss=1.0675 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=31.4s
Epoch 011 | train_loss=1.0667 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=31.2s
Epoch 012 | train_loss=1.0674 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=31.3s
Epoc

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇██
train_accuracy,▁▁▁▁▁▁▂▃▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇██████
train_loss,████████▇▇▆▆▆▅▅▅▄▄▄▄▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁
validation_accuracy,▁▁▁▁▁▃▃▄▄▄▄▄▅▅▅▆▅▆▅▆▆▆▆▆▆▆▆▆▇▆▇▇▇▇▇█████
validation_loss,███████▆▆▅▅▅▄▄▄▃▃▃▃▄▃▂▂▂▁▁▂▂▂▁▁▂▂▁▂▂▂▃▂▃

0,1
epoch,79.0
train_accuracy,0.93748
train_loss,0.16023
validation_accuracy,0.79658
validation_loss,0.77196


[I 2025-05-01 00:12:52,337] Trial 4 finished with value: 0.5753395855426788 and parameters: {'lr': 0.0009667592898435953, 'weight_decay': 9.995420002148595e-05, 'num_blocks': 3, 'num_heads': 2, 'num_segments': 5, 'step_size': 30, 'gamma': 0.4573463771689297}. Best is trial 4 with value: 0.5753395855426788.



===== Trial 5 =====
 lr=6.22e-04, wd=4.54e-05, blocks=2, heads=2, segs=5, step_size=20, gamma=0.71


Epoch 001 | train_loss=1.0704 acc=0.4190 | val_loss=1.0747 acc=0.4317 | time=23.6s
Epoch 002 | train_loss=1.0710 acc=0.4082 | val_loss=1.0748 acc=0.4317 | time=23.5s
Epoch 003 | train_loss=1.0687 acc=0.4311 | val_loss=1.0779 acc=0.4317 | time=23.3s
Epoch 004 | train_loss=1.0686 acc=0.4311 | val_loss=1.0753 acc=0.4317 | time=23.4s
Epoch 005 | train_loss=1.0671 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=23.4s
Epoch 006 | train_loss=1.0664 acc=0.4311 | val_loss=1.0767 acc=0.4317 | time=23.5s
Epoch 007 | train_loss=1.0655 acc=0.4311 | val_loss=1.0824 acc=0.4317 | time=23.6s
Epoch 008 | train_loss=1.0107 acc=0.5219 | val_loss=0.9660 acc=0.5776 | time=23.6s
Epoch 009 | train_loss=0.9627 acc=0.5670 | val_loss=0.9528 acc=0.5714 | time=23.7s
Epoch 010 | train_loss=0.9302 acc=0.5891 | val_loss=0.9506 acc=0.5606 | time=23.4s
Epoch 011 | train_loss=0.9221 acc=0.5953 | val_loss=1.0528 acc=0.5435 | time=23.5s
Epoch 012 | train_loss=0.9045 acc=0.6050 | val_loss=0.9833 acc=0.5839 | time=23.7s
Epoc

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_accuracy,▁▁▁▁▁▁▁▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇██████
train_loss,████████▇▇▇▇▇▆▆▆▆▆▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▆▆▅▅▆▆▇▆▇▇▆▇▇▇▆▇▆▇▆▇▇▆▇▇▇▇▇▇▇▇█▇▇
validation_loss,▃▃▃▃▃▃▃▂▂▂▃▂▂▂▂▁▁▂▁▁▂▁▂▂▂▂▄▃▂▃▄▃▅▅▇▅▅███

0,1
epoch,40.0
train_accuracy,0.92699
train_loss,0.20714
validation_accuracy,0.60093
validation_loss,1.59606


[I 2025-05-01 00:28:41,345] Trial 5 finished with value: 0.8657736324128651 and parameters: {'lr': 0.0006224513223071905, 'weight_decay': 4.540866158772019e-05, 'num_blocks': 2, 'num_heads': 2, 'num_segments': 5, 'step_size': 20, 'gamma': 0.7083541661273333}. Best is trial 4 with value: 0.5753395855426788.



===== Trial 6 =====
 lr=5.21e-04, wd=1.63e-04, blocks=2, heads=4, segs=15, step_size=10, gamma=0.76


Epoch 001 | train_loss=1.0724 acc=0.4287 | val_loss=1.0797 acc=0.4317 | time=33.5s
Epoch 002 | train_loss=1.0711 acc=0.4311 | val_loss=1.0772 acc=0.4317 | time=33.8s
Epoch 003 | train_loss=1.0683 acc=0.4311 | val_loss=1.0756 acc=0.4317 | time=33.9s
Epoch 004 | train_loss=1.0680 acc=0.4311 | val_loss=1.0775 acc=0.4317 | time=33.9s
Epoch 005 | train_loss=1.0689 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=33.9s
Epoch 006 | train_loss=1.0672 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=34.0s
Epoch 007 | train_loss=1.0683 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=33.9s
Epoch 008 | train_loss=1.0674 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=34.0s
Epoch 009 | train_loss=1.0668 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=34.0s


[I 2025-05-01 00:34:21,823] Trial 6 pruned. 


▸ Trial 6 pruned at epoch 10

===== Trial 7 =====
 lr=2.61e-04, wd=4.44e-04, blocks=2, heads=4, segs=5, step_size=30, gamma=0.88


0,1
epoch,▁▂▃▄▅▅▆▇█
train_accuracy,▁████████
train_loss,█▆▃▂▄▁▃▂▁
validation_accuracy,▁▁▁▁▁▁▁▁▁
validation_loss,█▅▃▅▁▁▂▂▁

0,1
epoch,9.0
train_accuracy,0.43107
train_loss,1.06678
validation_accuracy,0.43168
validation_loss,1.07437


Epoch 001 | train_loss=1.0728 acc=0.4264 | val_loss=1.0748 acc=0.4317 | time=33.8s
Epoch 002 | train_loss=1.0692 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=33.7s
Epoch 003 | train_loss=1.0681 acc=0.4311 | val_loss=1.0754 acc=0.4317 | time=33.5s
Epoch 004 | train_loss=1.0677 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=33.5s
Epoch 005 | train_loss=1.0690 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=33.6s
Epoch 006 | train_loss=1.0676 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=33.4s
Epoch 007 | train_loss=1.0681 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=33.7s
Epoch 008 | train_loss=1.0682 acc=0.4311 | val_loss=1.0804 acc=0.4317 | time=33.5s
Epoch 009 | train_loss=1.0682 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=33.5s


[I 2025-05-01 00:39:59,635] Trial 7 pruned. 


▸ Trial 7 pruned at epoch 10

===== Trial 8 =====
 lr=1.94e-04, wd=2.01e-05, blocks=2, heads=4, segs=5, step_size=30, gamma=0.86


0,1
epoch,▁▂▃▄▅▅▆▇█
train_accuracy,▁████████
train_loss,█▃▂▁▃▁▂▂▂
validation_accuracy,▁▁▁▁▁▁▁▁▁
validation_loss,▂▂▂▁▁▁▂█▂

0,1
epoch,9.0
train_accuracy,0.43107
train_loss,1.06825
validation_accuracy,0.43168
validation_loss,1.07467


Epoch 001 | train_loss=1.0716 acc=0.4311 | val_loss=1.0761 acc=0.4317 | time=33.7s
Epoch 002 | train_loss=1.0663 acc=0.4311 | val_loss=1.0757 acc=0.4317 | time=33.7s
Epoch 003 | train_loss=1.0671 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=33.7s
Epoch 004 | train_loss=1.0677 acc=0.4311 | val_loss=1.0764 acc=0.4317 | time=33.5s
Epoch 005 | train_loss=1.0671 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=33.6s
Epoch 006 | train_loss=1.0665 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=33.8s


[I 2025-05-01 00:43:56,779] Trial 8 pruned. 


▸ Trial 8 pruned at epoch 7

===== Trial 9 =====
 lr=2.78e-04, wd=4.85e-04, blocks=2, heads=4, segs=5, step_size=30, gamma=0.56


0,1
epoch,▁▂▄▅▇█
train_accuracy,▁▁▁▁▁▁
train_loss,█▁▂▃▂▁
validation_accuracy,▁▁▁▁▁▁
validation_loss,▇▅▂█▁▂

0,1
epoch,6.0
train_accuracy,0.43107
train_loss,1.06649
validation_accuracy,0.43168
validation_loss,1.07459


Epoch 001 | train_loss=1.0774 acc=0.4074 | val_loss=1.0744 acc=0.4317 | time=33.5s
Epoch 002 | train_loss=1.0695 acc=0.4311 | val_loss=1.0780 acc=0.4317 | time=33.7s
Epoch 003 | train_loss=1.0689 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=33.6s
Epoch 004 | train_loss=1.0679 acc=0.4311 | val_loss=1.0753 acc=0.4317 | time=33.6s
Epoch 005 | train_loss=1.0663 acc=0.4311 | val_loss=1.0799 acc=0.4317 | time=33.8s
Epoch 006 | train_loss=1.0675 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=33.7s


[I 2025-05-01 00:47:53,980] Trial 9 pruned. 


▸ Trial 9 pruned at epoch 7

===== Trial 10 =====
 lr=7.45e-04, wd=2.97e-04, blocks=1, heads=3, segs=15, step_size=20, gamma=0.52


0,1
epoch,▁▂▄▅▇█
train_accuracy,▁█████
train_loss,█▃▃▂▁▂
validation_accuracy,▁▁▁▁▁▁
validation_loss,▁▆▁▂█▂

0,1
epoch,6.0
train_accuracy,0.43107
train_loss,1.06746
validation_accuracy,0.43168
validation_loss,1.07479


Epoch 001 | train_loss=1.0703 acc=0.4276 | val_loss=1.0747 acc=0.4317 | time=18.7s
Epoch 002 | train_loss=1.0708 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=18.8s
Epoch 003 | train_loss=1.0677 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=19.1s
Epoch 004 | train_loss=1.0702 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=19.1s
Epoch 005 | train_loss=1.0688 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=18.8s
Epoch 006 | train_loss=1.0679 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=18.7s
Epoch 007 | train_loss=1.0674 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=18.6s
Epoch 008 | train_loss=1.0665 acc=0.4311 | val_loss=1.0754 acc=0.4317 | time=18.9s
Epoch 009 | train_loss=1.0204 acc=0.5021 | val_loss=1.0276 acc=0.4953 | time=18.6s
Epoch 010 | train_loss=0.9694 acc=0.5763 | val_loss=0.9753 acc=0.5730 | time=18.9s
Epoch 011 | train_loss=0.9385 acc=0.5891 | val_loss=0.9889 acc=0.5264 | time=18.7s
Epoch 012 | train_loss=0.9213 acc=0.5973 | val_loss=0.9832 acc=0.5807 | time=19.0s
Epoc

[I 2025-05-01 00:54:12,505] Trial 10 pruned. 


▸ Trial 10 pruned at epoch 20

===== Trial 11 =====
 lr=9.76e-04, wd=1.63e-05, blocks=3, heads=2, segs=5, step_size=10, gamma=0.27


0,1
epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██
train_accuracy,▁▁▁▁▁▁▁▁▃▅▆▆▆▆▇▇▇██
train_loss,████████▇▆▅▅▄▄▃▃▂▁▁
validation_accuracy,▁▁▁▁▁▁▁▁▃▆▅▇▇▇▇▇▇▅█
validation_loss,████████▆▄▅▄▃▃▂▁▂▁▃

0,1
epoch,19.0
train_accuracy,0.67689
train_loss,0.74629
validation_accuracy,0.61957
validation_loss,0.9572


Epoch 001 | train_loss=1.0702 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=31.8s
Epoch 002 | train_loss=1.0687 acc=0.4311 | val_loss=1.0799 acc=0.4317 | time=31.7s
Epoch 003 | train_loss=1.0665 acc=0.4311 | val_loss=1.0915 acc=0.4317 | time=31.8s
Epoch 004 | train_loss=1.0686 acc=0.4311 | val_loss=1.0833 acc=0.4317 | time=31.4s
Epoch 005 | train_loss=1.0680 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=31.7s
Epoch 006 | train_loss=1.0673 acc=0.4311 | val_loss=1.0753 acc=0.4317 | time=31.5s
Epoch 007 | train_loss=1.0663 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=31.7s
Epoch 008 | train_loss=1.0669 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=31.5s
Epoch 009 | train_loss=1.0684 acc=0.4311 | val_loss=1.0776 acc=0.4317 | time=31.7s


[I 2025-05-01 00:59:30,768] Trial 11 pruned. 


▸ Trial 11 pruned at epoch 10

===== Trial 12 =====
 lr=1.25e-04, wd=5.46e-05, blocks=3, heads=2, segs=5, step_size=30, gamma=0.37


0,1
epoch,▁▂▃▄▅▅▆▇█
train_accuracy,▁▁▁▁▁▁▁▁▁
train_loss,█▅▁▅▄▃▁▂▅
validation_accuracy,▁▁▁▁▁▁▁▁▁
validation_loss,▁▃█▅▁▁▁▁▂

0,1
epoch,9.0
train_accuracy,0.43107
train_loss,1.06844
validation_accuracy,0.43168
validation_loss,1.07762


Epoch 001 | train_loss=1.0692 acc=0.4311 | val_loss=1.0785 acc=0.4317 | time=31.6s
Epoch 002 | train_loss=1.0679 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=31.5s
Epoch 003 | train_loss=1.0666 acc=0.4311 | val_loss=1.0758 acc=0.4317 | time=31.5s
Epoch 004 | train_loss=1.0667 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=31.5s
Epoch 005 | train_loss=1.0667 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=31.6s
Epoch 006 | train_loss=1.0659 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=31.2s


[I 2025-05-01 01:03:13,327] Trial 12 pruned. 


▸ Trial 12 pruned at epoch 7

===== Trial 13 =====
 lr=9.76e-04, wd=3.13e-05, blocks=3, heads=2, segs=5, step_size=10, gamma=0.38


0,1
epoch,▁▂▄▅▇█
train_accuracy,▁▁▁▁▁▁
train_loss,█▅▂▃▃▁
validation_accuracy,▁▁▁▁▁▁
validation_loss,█▂▃▂▂▁

0,1
epoch,6.0
train_accuracy,0.43107
train_loss,1.06588
validation_accuracy,0.43168
validation_loss,1.07442


Epoch 001 | train_loss=1.0753 acc=0.4132 | val_loss=1.0776 acc=0.4317 | time=31.5s
Epoch 002 | train_loss=1.0751 acc=0.4206 | val_loss=1.0819 acc=0.4317 | time=31.6s
Epoch 003 | train_loss=1.0677 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=31.6s
Epoch 004 | train_loss=1.0678 acc=0.4311 | val_loss=1.0771 acc=0.4317 | time=31.8s
Epoch 005 | train_loss=1.0669 acc=0.4311 | val_loss=1.0790 acc=0.4317 | time=31.7s
Epoch 006 | train_loss=1.0685 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=31.9s
Epoch 007 | train_loss=1.0673 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=31.8s
Epoch 008 | train_loss=1.0679 acc=0.4311 | val_loss=1.0766 acc=0.4317 | time=31.8s
Epoch 009 | train_loss=1.0678 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=31.7s


[I 2025-05-01 01:08:32,378] Trial 13 pruned. 


▸ Trial 13 pruned at epoch 10

===== Trial 14 =====
 lr=4.05e-04, wd=9.54e-04, blocks=3, heads=2, segs=5, step_size=20, gamma=0.58


0,1
epoch,▁▂▃▄▅▅▆▇█
train_accuracy,▁▄███████
train_loss,██▂▂▁▂▁▂▂
validation_accuracy,▁▁▁▁▁▁▁▁▁
validation_loss,▄█▁▄▅▁▁▃▁

0,1
epoch,9.0
train_accuracy,0.43107
train_loss,1.06784
validation_accuracy,0.43168
validation_loss,1.07433


Epoch 001 | train_loss=1.0738 acc=0.4206 | val_loss=1.0748 acc=0.4317 | time=31.9s
Epoch 002 | train_loss=1.0714 acc=0.4252 | val_loss=1.0793 acc=0.3416 | time=31.9s
Epoch 003 | train_loss=1.0702 acc=0.4245 | val_loss=1.0745 acc=0.4317 | time=32.1s
Epoch 004 | train_loss=1.0684 acc=0.4311 | val_loss=1.0753 acc=0.4317 | time=31.9s
Epoch 005 | train_loss=1.0672 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=31.7s
Epoch 006 | train_loss=1.0671 acc=0.4311 | val_loss=1.0761 acc=0.4317 | time=31.7s
Epoch 007 | train_loss=1.0670 acc=0.4311 | val_loss=1.0753 acc=0.4317 | time=32.0s
Epoch 008 | train_loss=1.0661 acc=0.4311 | val_loss=1.0699 acc=0.4317 | time=31.8s
Epoch 009 | train_loss=1.0564 acc=0.4392 | val_loss=1.0699 acc=0.4317 | time=32.1s
Epoch 010 | train_loss=1.0097 acc=0.5320 | val_loss=0.9582 acc=0.5792 | time=31.7s
Epoch 011 | train_loss=0.9440 acc=0.5806 | val_loss=0.9452 acc=0.5590 | time=32.0s
Epoch 012 | train_loss=0.9116 acc=0.6031 | val_loss=0.9299 acc=0.5621 | time=31.9s
Epoc

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_accuracy,▁▁▁▁▁▁▁▁▁▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇█████
train_loss,█████████▇▇▆▆▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▃▂▃▂▂▂▁▁▁▁
validation_accuracy,▃▁▃▃▃▃▃▃▃▆▅▅▆▇▆▇▆▇▇▇▇▇▇▇█▇▆▆▇████▇█▇▇▇█
validation_loss,▇▇▇▇▇▇▇▆▆▄▃▃▃▂▂▂▂▁▁▁▂▂▂▁▅▂▂▂▃▂▂▂▃▃▄▅▅▇█

0,1
epoch,39.0
train_accuracy,0.85359
train_loss,0.36178
validation_accuracy,0.68478
validation_loss,1.13606


[I 2025-05-01 01:29:17,070] Trial 14 finished with value: 0.8427037994066874 and parameters: {'lr': 0.00040519319762359493, 'weight_decay': 0.0009536838748084607, 'num_blocks': 3, 'num_heads': 2, 'num_segments': 5, 'step_size': 20, 'gamma': 0.5833961123553751}. Best is trial 4 with value: 0.5753395855426788.



===== Trial 15 =====
 lr=3.91e-04, wd=8.56e-04, blocks=3, heads=3, segs=5, step_size=30, gamma=0.56


Epoch 001 | train_loss=1.0739 acc=0.4272 | val_loss=1.0789 acc=0.4317 | time=38.8s
Epoch 002 | train_loss=1.0688 acc=0.4311 | val_loss=1.0767 acc=0.4317 | time=38.6s
Epoch 003 | train_loss=1.0725 acc=0.4311 | val_loss=1.0779 acc=0.4317 | time=38.5s
Epoch 004 | train_loss=1.0700 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=38.7s
Epoch 005 | train_loss=1.0685 acc=0.4311 | val_loss=1.0762 acc=0.4317 | time=38.6s
Epoch 006 | train_loss=1.0682 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=38.7s
Epoch 007 | train_loss=1.0671 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=38.7s


[I 2025-05-01 01:34:27,671] Trial 15 pruned. 


▸ Trial 15 pruned at epoch 8

===== Trial 16 =====
 lr=1.04e-04, wd=2.13e-04, blocks=3, heads=2, segs=15, step_size=20, gamma=0.41


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁██████
train_loss,█▃▇▄▂▂▁
validation_accuracy,▁▁▁▁▁▁▁
validation_loss,█▅▆▂▄▁▁

0,1
epoch,7.0
train_accuracy,0.43107
train_loss,1.06706
validation_accuracy,0.43168
validation_loss,1.07436


Epoch 001 | train_loss=1.0740 acc=0.4074 | val_loss=1.0766 acc=0.4317 | time=31.9s
Epoch 002 | train_loss=1.0661 acc=0.4311 | val_loss=1.0775 acc=0.4317 | time=31.8s
Epoch 003 | train_loss=1.0670 acc=0.4311 | val_loss=1.0769 acc=0.4317 | time=31.9s
Epoch 004 | train_loss=1.0682 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=32.0s


[I 2025-05-01 01:37:09,146] Trial 16 pruned. 


▸ Trial 16 pruned at epoch 5

===== Trial 17 =====
 lr=1.74e-04, wd=9.97e-04, blocks=3, heads=2, segs=5, step_size=20, gamma=0.23


0,1
epoch,▁▃▆█
train_accuracy,▁███
train_loss,█▁▂▃
validation_accuracy,▁▁▁▁
validation_loss,▆█▆▁

0,1
epoch,4.0
train_accuracy,0.43107
train_loss,1.06823
validation_accuracy,0.43168
validation_loss,1.0747


Epoch 001 | train_loss=1.0707 acc=0.4264 | val_loss=1.0764 acc=0.4317 | time=31.3s
Epoch 002 | train_loss=1.0700 acc=0.4229 | val_loss=1.0746 acc=0.4317 | time=31.6s
Epoch 003 | train_loss=1.0674 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=31.5s
Epoch 004 | train_loss=1.0668 acc=0.4311 | val_loss=1.0764 acc=0.4317 | time=31.8s


[I 2025-05-01 01:39:48,532] Trial 17 pruned. 


▸ Trial 17 pruned at epoch 5

===== Trial 18 =====
 lr=3.84e-04, wd=1.05e-05, blocks=3, heads=2, segs=5, step_size=30, gamma=0.62


0,1
epoch,▁▃▆█
train_accuracy,▄▁██
train_loss,█▇▂▁
validation_accuracy,▁▁▁▁
validation_loss,█▁▂█

0,1
epoch,4.0
train_accuracy,0.43107
train_loss,1.06682
validation_accuracy,0.43168
validation_loss,1.07641


Epoch 001 | train_loss=1.0707 acc=0.4311 | val_loss=1.0762 acc=0.4317 | time=31.7s
Epoch 002 | train_loss=1.0686 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=31.5s
Epoch 003 | train_loss=1.0676 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=31.7s
Epoch 004 | train_loss=1.0672 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=31.7s
Epoch 005 | train_loss=1.0675 acc=0.4311 | val_loss=1.0756 acc=0.4317 | time=31.6s
Epoch 006 | train_loss=1.0676 acc=0.4311 | val_loss=1.0771 acc=0.4317 | time=31.6s
Epoch 007 | train_loss=1.0673 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=31.7s


[I 2025-05-01 01:44:03,264] Trial 18 pruned. 


▸ Trial 18 pruned at epoch 8

===== Trial 19 =====
 lr=7.33e-04, wd=1.01e-04, blocks=3, heads=3, segs=15, step_size=10, gamma=0.45


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▁▁▁▁▁▁
train_loss,█▄▂▁▂▂▁
validation_accuracy,▁▁▁▁▁▁▁
validation_loss,▆▃▁▂▄█▂

0,1
epoch,7.0
train_accuracy,0.43107
train_loss,1.06726
validation_accuracy,0.43168
validation_loss,1.07447


Epoch 001 | train_loss=1.0732 acc=0.4109 | val_loss=1.0793 acc=0.4317 | time=39.0s
Epoch 002 | train_loss=1.0681 acc=0.4311 | val_loss=1.0760 acc=0.4317 | time=39.1s
Epoch 003 | train_loss=1.0689 acc=0.4311 | val_loss=1.0766 acc=0.4317 | time=39.2s
Epoch 004 | train_loss=1.0674 acc=0.4311 | val_loss=1.0821 acc=0.4317 | time=39.0s


[I 2025-05-01 01:47:20,282] Trial 19 pruned. 


▸ Trial 19 pruned at epoch 5

===== Trial 20 =====
 lr=3.97e-04, wd=8.81e-05, blocks=3, heads=2, segs=10, step_size=20, gamma=0.28


0,1
epoch,▁▃▆█
train_accuracy,▁███
train_loss,█▂▃▁
validation_accuracy,▁▁▁▁
validation_loss,▅▁▂█

0,1
epoch,4.0
train_accuracy,0.43107
train_loss,1.06744
validation_accuracy,0.43168
validation_loss,1.08212


Epoch 001 | train_loss=1.0735 acc=0.4229 | val_loss=1.0769 acc=0.4317 | time=31.7s
Epoch 002 | train_loss=1.0683 acc=0.4311 | val_loss=1.0758 acc=0.4317 | time=31.8s
Epoch 003 | train_loss=1.0679 acc=0.4311 | val_loss=1.0816 acc=0.4317 | time=31.7s
Epoch 004 | train_loss=1.0675 acc=0.4311 | val_loss=1.0754 acc=0.4317 | time=31.8s


[I 2025-05-01 01:50:01,376] Trial 20 pruned. 


▸ Trial 20 pruned at epoch 5

===== Trial 21 =====
 lr=7.13e-04, wd=3.39e-04, blocks=3, heads=2, segs=5, step_size=30, gamma=0.47


0,1
epoch,▁▃▆█
train_accuracy,▁███
train_loss,█▂▁▁
validation_accuracy,▁▁▁▁
validation_loss,▃▁█▁

0,1
epoch,4.0
train_accuracy,0.43107
train_loss,1.06754
validation_accuracy,0.43168
validation_loss,1.07541


Epoch 001 | train_loss=1.0734 acc=0.4190 | val_loss=1.0750 acc=0.4317 | time=31.7s
Epoch 002 | train_loss=1.0691 acc=0.4311 | val_loss=1.0753 acc=0.4317 | time=31.7s
Epoch 003 | train_loss=1.0680 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=31.5s
Epoch 004 | train_loss=1.0683 acc=0.4287 | val_loss=1.0755 acc=0.4317 | time=31.8s


[I 2025-05-01 01:52:41,351] Trial 21 pruned. 


▸ Trial 21 pruned at epoch 5

===== Trial 22 =====
 lr=6.20e-04, wd=4.70e-05, blocks=2, heads=2, segs=5, step_size=20, gamma=0.64


0,1
epoch,▁▃▆█
train_accuracy,▁██▇
train_loss,█▂▁▁
validation_accuracy,▁▁▁▁
validation_loss,▁▄▃█

0,1
epoch,4.0
train_accuracy,0.42874
train_loss,1.06832
validation_accuracy,0.43168
validation_loss,1.07554


Epoch 001 | train_loss=1.0698 acc=0.4237 | val_loss=1.0852 acc=0.4317 | time=23.6s
Epoch 002 | train_loss=1.0688 acc=0.4311 | val_loss=1.0757 acc=0.4317 | time=23.6s
Epoch 003 | train_loss=1.0702 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=23.4s
Epoch 004 | train_loss=1.0674 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=23.5s
Epoch 005 | train_loss=1.0672 acc=0.4311 | val_loss=1.0759 acc=0.4317 | time=23.7s
Epoch 006 | train_loss=1.0697 acc=0.4311 | val_loss=1.0758 acc=0.4317 | time=23.6s
Epoch 007 | train_loss=1.0692 acc=0.4311 | val_loss=1.0787 acc=0.4317 | time=23.7s


[I 2025-05-01 01:55:51,922] Trial 22 pruned. 


▸ Trial 22 pruned at epoch 8

===== Trial 23 =====
 lr=8.15e-04, wd=6.16e-05, blocks=2, heads=2, segs=5, step_size=20, gamma=0.80


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁██████
train_loss,▇▅█▂▁▇▆
validation_accuracy,▁▁▁▁▁▁▁
validation_loss,█▂▁▁▂▂▄

0,1
epoch,7.0
train_accuracy,0.43107
train_loss,1.06923
validation_accuracy,0.43168
validation_loss,1.07867


Epoch 001 | train_loss=1.0716 acc=0.4287 | val_loss=1.0828 acc=0.4317 | time=23.3s
Epoch 002 | train_loss=1.0709 acc=0.4167 | val_loss=1.0771 acc=0.4317 | time=23.5s
Epoch 003 | train_loss=1.0686 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=23.5s
Epoch 004 | train_loss=1.0692 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=23.8s


[I 2025-05-01 01:57:51,459] Trial 23 pruned. 


▸ Trial 23 pruned at epoch 5

===== Trial 24 =====
 lr=4.88e-04, wd=2.76e-05, blocks=1, heads=2, segs=5, step_size=20, gamma=0.64


0,1
epoch,▁▃▆█
train_accuracy,▇▁██
train_loss,█▆▁▂
validation_accuracy,▁▁▁▁
validation_loss,█▃▁▁

0,1
epoch,4.0
train_accuracy,0.43107
train_loss,1.06917
validation_accuracy,0.43168
validation_loss,1.07481


Epoch 001 | train_loss=1.0721 acc=0.4171 | val_loss=1.0746 acc=0.4317 | time=18.4s
Epoch 002 | train_loss=1.0701 acc=0.4210 | val_loss=1.0754 acc=0.4317 | time=18.5s
Epoch 003 | train_loss=1.0691 acc=0.4311 | val_loss=1.0766 acc=0.4317 | time=19.0s
Epoch 004 | train_loss=1.0680 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=18.6s
Epoch 005 | train_loss=1.0676 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=18.4s
Epoch 006 | train_loss=1.0673 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=18.4s
Epoch 007 | train_loss=1.0675 acc=0.4311 | val_loss=1.0761 acc=0.4317 | time=18.6s


[I 2025-05-01 02:00:21,852] Trial 24 pruned. 


▸ Trial 24 pruned at epoch 8

===== Trial 25 =====
 lr=3.46e-04, wd=1.34e-04, blocks=3, heads=2, segs=5, step_size=20, gamma=0.51


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▃█████
train_loss,█▅▄▂▁▁▁
validation_accuracy,▁▁▁▁▁▁▁
validation_loss,▁▄█▂▁▁▆

0,1
epoch,7.0
train_accuracy,0.43107
train_loss,1.0675
validation_accuracy,0.43168
validation_loss,1.0761


Epoch 001 | train_loss=1.0702 acc=0.4311 | val_loss=1.0800 acc=0.4317 | time=31.6s
Epoch 002 | train_loss=1.0688 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=31.4s
Epoch 003 | train_loss=1.0693 acc=0.4311 | val_loss=1.0763 acc=0.4317 | time=31.8s
Epoch 004 | train_loss=1.0672 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=31.6s
Epoch 005 | train_loss=1.0680 acc=0.4311 | val_loss=1.0760 acc=0.4317 | time=31.2s
Epoch 006 | train_loss=1.0670 acc=0.4311 | val_loss=1.0759 acc=0.4317 | time=31.2s
Epoch 007 | train_loss=1.0673 acc=0.4311 | val_loss=1.0755 acc=0.4317 | time=31.3s


[I 2025-05-01 02:04:35,212] Trial 25 pruned. 


▸ Trial 25 pruned at epoch 8

===== Trial 26 =====
 lr=6.24e-04, wd=2.29e-04, blocks=2, heads=3, segs=5, step_size=10, gamma=0.71


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁▁▁▁▁▁▁
train_loss,█▅▆▁▃▁▂
validation_accuracy,▁▁▁▁▁▁▁
validation_loss,█▂▃▁▃▃▂

0,1
epoch,7.0
train_accuracy,0.43107
train_loss,1.06726
validation_accuracy,0.43168
validation_loss,1.07549


Epoch 001 | train_loss=1.0736 acc=0.4066 | val_loss=1.0785 acc=0.4317 | time=28.1s
Epoch 002 | train_loss=1.0687 acc=0.4311 | val_loss=1.0756 acc=0.4317 | time=27.8s
Epoch 003 | train_loss=1.0693 acc=0.4237 | val_loss=1.0743 acc=0.4317 | time=28.0s
Epoch 004 | train_loss=1.0697 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=27.8s
Epoch 005 | train_loss=1.0668 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=28.0s
Epoch 006 | train_loss=1.0673 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=27.8s
Epoch 007 | train_loss=1.0693 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=27.9s


[I 2025-05-01 02:08:20,628] Trial 26 pruned. 


▸ Trial 26 pruned at epoch 8

===== Trial 27 =====
 lr=8.31e-04, wd=7.73e-05, blocks=3, heads=2, segs=5, step_size=30, gamma=0.61


0,1
epoch,▁▂▃▅▆▇█
train_accuracy,▁█▆████
train_loss,█▃▄▄▁▁▄
validation_accuracy,▁▁▁▁▁▁▁
validation_loss,█▃▁▂▁▁▂

0,1
epoch,7.0
train_accuracy,0.43107
train_loss,1.06928
validation_accuracy,0.43168
validation_loss,1.07458


Epoch 001 | train_loss=1.0716 acc=0.4295 | val_loss=1.0743 acc=0.4317 | time=31.3s
Epoch 002 | train_loss=1.0693 acc=0.4311 | val_loss=1.0799 acc=0.4317 | time=31.2s
Epoch 003 | train_loss=1.0708 acc=0.4311 | val_loss=1.0762 acc=0.4317 | time=31.4s
Epoch 004 | train_loss=1.0686 acc=0.4311 | val_loss=1.0754 acc=0.4317 | time=31.1s
Epoch 005 | train_loss=1.0607 acc=0.4322 | val_loss=1.0776 acc=0.4317 | time=31.3s
Epoch 006 | train_loss=1.0708 acc=0.4159 | val_loss=1.0837 acc=0.4317 | time=31.5s
Epoch 007 | train_loss=1.0683 acc=0.4311 | val_loss=1.0716 acc=0.4317 | time=31.3s
Epoch 008 | train_loss=1.0399 acc=0.4707 | val_loss=0.9963 acc=0.5388 | time=31.3s
Epoch 009 | train_loss=0.9786 acc=0.5619 | val_loss=0.9826 acc=0.5668 | time=31.4s
Epoch 010 | train_loss=0.9644 acc=0.5662 | val_loss=0.9586 acc=0.5683 | time=31.4s
Epoch 011 | train_loss=0.9340 acc=0.5845 | val_loss=0.9447 acc=0.5575 | time=31.6s
Epoch 012 | train_loss=0.9251 acc=0.6000 | val_loss=0.9461 acc=0.5730 | time=31.6s
Epoc

[I 2025-05-01 02:17:17,623] Trial 27 pruned. 


▸ Trial 27 pruned at epoch 17

===== Trial 28 =====
 lr=4.38e-04, wd=1.11e-04, blocks=3, heads=2, segs=5, step_size=20, gamma=0.80


0,1
epoch,▁▁▂▂▃▃▄▄▅▅▆▆▇▇██
train_accuracy,▁▂▂▂▂▁▂▃▆▆▇▇▇███
train_loss,███████▇▅▄▃▃▂▁▁▁
validation_accuracy,▁▁▁▁▁▁▁▆▇▇▆▇▇▆██
validation_loss,██████▇▄▃▂▂▂▁▃▁▁

0,1
epoch,16.0
train_accuracy,0.61515
train_loss,0.88463
validation_accuracy,0.59627
validation_loss,0.92711


Epoch 001 | train_loss=1.0738 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=31.6s
Epoch 002 | train_loss=1.0718 acc=0.4311 | val_loss=1.0759 acc=0.4317 | time=31.4s
Epoch 003 | train_loss=1.0693 acc=0.4311 | val_loss=1.0761 acc=0.4317 | time=31.5s
Epoch 004 | train_loss=1.0668 acc=0.4311 | val_loss=1.0761 acc=0.4317 | time=31.5s


[I 2025-05-01 02:19:57,158] Trial 28 pruned. 


▸ Trial 28 pruned at epoch 5

===== Trial 29 =====
 lr=5.77e-04, wd=6.79e-04, blocks=1, heads=2, segs=15, step_size=20, gamma=0.56


0,1
epoch,▁▃▆█
train_accuracy,▁▁▁▁
train_loss,█▆▄▁
validation_accuracy,▁▁▁▁
validation_loss,▁▇██

0,1
epoch,4.0
train_accuracy,0.43107
train_loss,1.06681
validation_accuracy,0.43168
validation_loss,1.07611


Epoch 001 | train_loss=1.0696 acc=0.4245 | val_loss=1.0744 acc=0.4317 | time=18.2s
Epoch 002 | train_loss=1.0708 acc=0.4311 | val_loss=1.0754 acc=0.4317 | time=18.3s
Epoch 003 | train_loss=1.0679 acc=0.4311 | val_loss=1.0759 acc=0.4317 | time=18.2s
Epoch 004 | train_loss=1.0682 acc=0.4311 | val_loss=1.0768 acc=0.4317 | time=18.4s
Epoch 005 | train_loss=1.0685 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=18.4s
Epoch 006 | train_loss=1.0664 acc=0.4311 | val_loss=1.0784 acc=0.4317 | time=18.2s
Epoch 007 | train_loss=1.0689 acc=0.4311 | val_loss=1.0796 acc=0.4317 | time=18.8s
Epoch 008 | train_loss=1.0329 acc=0.4913 | val_loss=1.0455 acc=0.5342 | time=18.5s
Epoch 009 | train_loss=0.9497 acc=0.5794 | val_loss=0.9591 acc=0.5668 | time=18.8s
Epoch 010 | train_loss=0.9223 acc=0.5969 | val_loss=0.9393 acc=0.5652 | time=18.6s
Epoch 011 | train_loss=0.9109 acc=0.6031 | val_loss=0.9785 acc=0.5745 | time=18.7s
Epoch 012 | train_loss=0.8828 acc=0.6124 | val_loss=0.9780 acc=0.5621 | time=18.9s
Epoc

[I 2025-05-01 02:26:09,950] Trial 29 pruned. 


▸ Trial 29 pruned at epoch 20

===== Trial 30 =====
 lr=3.14e-04, wd=1.66e-04, blocks=2, heads=3, segs=10, step_size=20, gamma=0.66


0,1
epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██
train_accuracy,▁▁▁▁▁▁▁▃▅▆▆▆▆▇▇▆▇▇█
train_loss,███████▇▆▅▅▄▄▃▃▃▂▂▁
validation_accuracy,▁▁▁▁▁▁▁▅▆▆▆▆▇█▇▆█▇█
validation_loss,███████▇▃▂▄▄▁▄▁▂▂▂▂

0,1
epoch,19.0
train_accuracy,0.69126
train_loss,0.72615
validation_accuracy,0.62112
validation_loss,0.94252


Epoch 001 | train_loss=1.0702 acc=0.4249 | val_loss=1.0793 acc=0.3416 | time=28.4s
Epoch 002 | train_loss=1.0696 acc=0.4237 | val_loss=1.0742 acc=0.4317 | time=28.6s
Epoch 003 | train_loss=1.0681 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=28.5s
Epoch 004 | train_loss=1.0690 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=28.7s
Epoch 005 | train_loss=1.0667 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=28.3s
Epoch 006 | train_loss=1.0673 acc=0.4311 | val_loss=1.0757 acc=0.4317 | time=28.2s
Epoch 007 | train_loss=1.0668 acc=0.4311 | val_loss=1.0766 acc=0.4317 | time=28.3s
Epoch 008 | train_loss=1.0668 acc=0.4311 | val_loss=1.0720 acc=0.4317 | time=28.5s
Epoch 009 | train_loss=1.0378 acc=0.4819 | val_loss=0.9859 acc=0.5543 | time=28.4s
Epoch 010 | train_loss=0.9629 acc=0.5682 | val_loss=0.9659 acc=0.5714 | time=28.3s
Epoch 011 | train_loss=0.9239 acc=0.5988 | val_loss=0.9472 acc=0.5854 | time=28.4s
Epoch 012 | train_loss=0.8993 acc=0.6027 | val_loss=0.9550 acc=0.5870 | time=28.5s
Epoc

[I 2025-05-01 02:35:40,692] Trial 30 pruned. 


▸ Trial 30 pruned at epoch 20

===== Best Trial =====
best_val_loss   = 0.575340
best_val_acc    = 0.7671
best_train_loss = 0.3076
best_train_acc  = 0.8816
best params:
  lr: 0.0009667592898435953
  weight_decay: 9.995420002148595e-05
  num_blocks: 3
  num_heads: 2
  num_segments: 5
  step_size: 30
  gamma: 0.4573463771689297


In [None]:
print("hello")

hello


In [None]:
import os
import json
import time
import gc
import multiprocessing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
import optuna
import wandb

from eeg_dataset import EEGDataset
from models import EEGformer

# ─── Hyperparameter ranges and fixed settings ─────────────────────────
LR_MIN, LR_MAX     = 1e-4, 1e-3
WD_MIN, WD_MAX     = 1e-5, 1e-3
NUM_FILTERS        = 120
NUM_BLOCK_CHOICES  = [1, 2, 3]
NUM_HEAD_CHOICES   = [2, 3, 4]
SEGMENT_CHOICES    = [5, 10, 15]

# ─── Training configuration ─────────────────────────
MAX_EPOCHS  = 200
PATIENCE    = 20
BATCH_SIZE  = 32
NUM_WORKERS = max(1, min(4, os.cpu_count() - 1))

# ─── Data paths & device ─────────────────────────
DATA_DIR   = '/content/drive/MyDrive/2025_Lab_Research/model-data'
LABEL_FILE = "labels.json"
DEVICE     = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def objective_holdout(trial):
    # 1) Sample hyperparameters
    lr           = trial.suggest_float("lr", LR_MIN, LR_MAX, log=True)
    weight_decay = trial.suggest_float("weight_decay", WD_MIN, WD_MAX, log=True)
    num_blocks   = trial.suggest_categorical("num_blocks", NUM_BLOCK_CHOICES)
    num_heads    = trial.suggest_categorical("num_heads", NUM_HEAD_CHOICES)
    num_segments = trial.suggest_categorical("num_segments", SEGMENT_CHOICES)

    print(f"\n===== Trial {trial.number} =====")
    print(f" lr={lr:.2e}, wd={weight_decay:.2e}, blocks={num_blocks}, "
          f"heads={num_heads}, segs={num_segments}")

    # 2) 데이터 로드
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    train_meta = [d for d in all_meta if d["type"] == "train"]
    full_ds    = EEGDataset(DATA_DIR, train_meta)
    labels     = [d["label"] for d in train_meta]
    n_samples  = len(full_ds)
    input_length = full_ds[0][0].shape[-1]

    # 3) 한 번의 Hold-out split (80/20)
    train_idx, val_idx = train_test_split(
        list(range(n_samples)),
        test_size=0.2,
        stratify=labels,
        random_state=42
    )
    train_loader = DataLoader(
        Subset(full_ds, train_idx),
        batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS
    )
    val_loader = DataLoader(
        Subset(full_ds, val_idx),
        batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS
    )

    # 4) W&B init (optional)
    wandb.init(project="eeg-holdout-tuning", config=trial.params)

    # 5) Model / Optimizer / Loss
    model = EEGformer(
        in_channels  = 19,
        input_length = input_length,
        kernel_size  = 10,
        num_filters  = NUM_FILTERS,
        num_heads    = num_heads,
        num_blocks   = num_blocks,
        num_segments = num_segments,
        num_classes  = 3
    ).to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    # 6) Train with early-stopping & pruning
    best_val_loss     = float("inf")
    epochs_no_improve = 0

    # **여기에 저장용 변수를 선언**
    best_train_loss = None
    best_train_acc  = None
    best_val_acc    = None

    for epoch in range(1, MAX_EPOCHS + 1):
        t0 = time.time()
        # — train —
        model.train()
        tloss = tcorrect = ttotal = 0
        for X, y in train_loader:
            X, y = X.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            logits = model(X)
            loss   = criterion(logits, y)
            loss.backward(); optimizer.step()
            tloss     += loss.item()
            tcorrect  += (logits.argmax(1) == y).sum().item()
            ttotal    += y.size(0)
        train_loss = tloss / len(train_loader)
        train_acc  = tcorrect / ttotal

        # — val —
        model.eval()
        vloss = vcorrect = vtotal = 0
        with torch.no_grad():
            for X, y in val_loader:
                X, y = X.to(DEVICE), y.to(DEVICE)
                logits = model(X)
                loss   = criterion(logits, y)
                vloss    += loss.item()
                vcorrect += (logits.argmax(1) == y).sum().item()
                vtotal   += y.size(0)
        val_loss = vloss / len(val_loader)
        val_acc  = vcorrect / vtotal
        elapsed  = time.time() - t0

        # pruning
        trial.report(val_loss, epoch)
        if trial.should_prune():
            print(f"▸ Trial {trial.number} pruned at epoch {epoch}")
            raise optuna.TrialPruned()

        print(f"Epoch {epoch:03d} | "
              f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
              f"val_loss={val_loss:.4f} acc={val_acc:.4f} | "
              f"time={elapsed:.1f}s")

        # early stopping & **최적시 train/val 지표 저장**
        if val_loss < best_val_loss:
            best_val_loss     = val_loss
            epochs_no_improve = 0
            best_train_loss   = train_loss
            best_train_acc    = train_acc
            best_val_acc      = val_acc
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print(f"★ Early stopping at epoch {epoch}")
                break

    # **최적 epoch 시 지표들을 user_attrs 에 저장**
    trial.set_user_attr("best_train_loss", best_train_loss)
    trial.set_user_attr("best_train_acc",  best_train_acc)
    trial.set_user_attr("best_val_acc",    best_val_acc)

    wandb.finish()
    gc.collect()
    return best_val_loss


if __name__ == "__main__":
    multiprocessing.freeze_support()
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
        study_name="eeg_holdout_trial",
        storage="sqlite:////content/drive/MyDrive/2025_Lab_Research/eeg_holdout.db",
        load_if_exists=True
    )
    study.optimize(objective_holdout, n_trials=30)

    # — 최종적으로 best trial 의 값과 user_attrs 를 함께 출력 —
    best = study.best_trial
    print("\n===== Best Trial =====")
    print(f"best_val_loss   = {best.value:.6f}")
    print(f"best_val_acc    = {best.user_attrs['best_val_acc']:.4f}")
    print(f"best_train_loss = {best.user_attrs['best_train_loss']:.4f}")
    print(f"best_train_acc  = {best.user_attrs['best_train_acc']:.4f}")
    print("best params:")
    for k, v in best.params.items():
        print(f"  {k}: {v}")


Attempting to create new mne-python configuration file:
/root/.mne/mne-python.json
Now using CUDA device 0
Enabling CUDA with 39.14 GiB available memory


[I 2025-04-30 19:01:36,475] A new study created in RDB with name: eeg_holdout_trial



===== Trial 0 =====
 lr=1.10e-04, wd=1.46e-04, blocks=1, heads=2, segs=10


Epoch 001 | train_loss=1.0711 acc=0.4311 | val_loss=1.0785 acc=0.4317 | time=330.3s
Epoch 002 | train_loss=1.0664 acc=0.4311 | val_loss=1.0804 acc=0.4317 | time=17.3s
Epoch 003 | train_loss=1.0679 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=17.3s
Epoch 004 | train_loss=1.0668 acc=0.4311 | val_loss=1.0756 acc=0.4317 | time=17.3s
Epoch 005 | train_loss=1.0685 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=17.1s
Epoch 006 | train_loss=1.0673 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=17.5s
Epoch 007 | train_loss=1.0673 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=17.5s
Epoch 008 | train_loss=1.0668 acc=0.4311 | val_loss=1.0770 acc=0.4317 | time=17.4s
Epoch 009 | train_loss=1.0669 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=17.4s
Epoch 010 | train_loss=1.0681 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=17.2s
Epoch 011 | train_loss=1.0662 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=17.2s
Epoch 012 | train_loss=1.0667 acc=0.4311 | val_loss=1.0754 acc=0.4317 | time=17.1s
Epo

[I 2025-04-30 19:22:37,535] Trial 0 finished with value: 0.9102347521554857 and parameters: {'lr': 0.00011016247004786675, 'weight_decay': 0.00014633689947303344, 'num_blocks': 1, 'num_heads': 2, 'num_segments': 10}. Best is trial 0 with value: 0.9102347521554857.



===== Trial 1 =====
 lr=1.05e-04, wd=6.82e-05, blocks=3, heads=2, segs=10


Epoch 001 | train_loss=1.0713 acc=0.4287 | val_loss=1.0756 acc=0.4317 | time=31.5s
Epoch 002 | train_loss=1.0664 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=31.7s
Epoch 003 | train_loss=1.0672 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=31.6s
Epoch 004 | train_loss=1.0668 acc=0.4311 | val_loss=1.0753 acc=0.4317 | time=31.5s
Epoch 005 | train_loss=1.0674 acc=0.4311 | val_loss=1.0760 acc=0.4317 | time=31.5s
Epoch 006 | train_loss=1.0669 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=31.6s
Epoch 007 | train_loss=1.0678 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=31.4s
Epoch 008 | train_loss=1.0674 acc=0.4311 | val_loss=1.0759 acc=0.4317 | time=31.5s
Epoch 009 | train_loss=1.0678 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=31.4s
Epoch 010 | train_loss=1.0668 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=31.7s
Epoch 011 | train_loss=1.0672 acc=0.4311 | val_loss=1.0750 acc=0.4317 | time=31.4s
Epoch 012 | train_loss=1.0668 acc=0.4311 | val_loss=1.0753 acc=0.4317 | time=31.5s
Epoc

[I 2025-04-30 19:34:44,533] Trial 1 finished with value: 1.0745091551826114 and parameters: {'lr': 0.00010539653046890808, 'weight_decay': 6.824872535774929e-05, 'num_blocks': 3, 'num_heads': 2, 'num_segments': 10}. Best is trial 0 with value: 0.9102347521554857.



===== Trial 2 =====
 lr=4.00e-04, wd=1.87e-04, blocks=2, heads=4, segs=15


Epoch 001 | train_loss=1.0720 acc=0.4276 | val_loss=1.0744 acc=0.4317 | time=33.6s
Epoch 002 | train_loss=1.0686 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=33.7s
Epoch 003 | train_loss=1.0692 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=33.8s
Epoch 004 | train_loss=1.0685 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=33.9s
Epoch 005 | train_loss=1.0695 acc=0.4311 | val_loss=1.0754 acc=0.4317 | time=33.6s
Epoch 006 | train_loss=1.0677 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=33.7s
Epoch 007 | train_loss=1.0697 acc=0.4311 | val_loss=1.0771 acc=0.4317 | time=33.7s
Epoch 008 | train_loss=1.0683 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=33.7s
Epoch 009 | train_loss=1.0697 acc=0.4311 | val_loss=1.0793 acc=0.4317 | time=33.5s
Epoch 010 | train_loss=1.0679 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=33.9s
Epoch 011 | train_loss=1.0676 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=33.7s
Epoch 012 | train_loss=1.0467 acc=0.4664 | val_loss=0.9727 acc=0.5652 | time=33.8s
Epoc

[I 2025-04-30 20:01:49,060] Trial 2 finished with value: 0.7152097299695015 and parameters: {'lr': 0.0003996888097196801, 'weight_decay': 0.00018677630423799053, 'num_blocks': 2, 'num_heads': 4, 'num_segments': 15}. Best is trial 2 with value: 0.7152097299695015.



===== Trial 3 =====
 lr=6.44e-04, wd=7.44e-04, blocks=3, heads=4, segs=10


Epoch 001 | train_loss=1.0720 acc=0.4245 | val_loss=1.0756 acc=0.4317 | time=46.9s
Epoch 002 | train_loss=1.0688 acc=0.4245 | val_loss=1.0752 acc=0.4317 | time=46.7s
Epoch 003 | train_loss=1.0689 acc=0.4202 | val_loss=1.0744 acc=0.4317 | time=46.7s
Epoch 004 | train_loss=1.0678 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=46.8s
Epoch 005 | train_loss=1.0678 acc=0.4311 | val_loss=1.0757 acc=0.4317 | time=46.6s
Epoch 006 | train_loss=1.0683 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=46.7s
Epoch 007 | train_loss=1.0681 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=46.8s
Epoch 008 | train_loss=1.0670 acc=0.4311 | val_loss=1.0762 acc=0.4317 | time=46.7s
Epoch 009 | train_loss=1.0675 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=46.7s
Epoch 010 | train_loss=1.0665 acc=0.4311 | val_loss=1.0755 acc=0.4317 | time=46.8s
Epoch 011 | train_loss=1.0671 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=46.7s
Epoch 012 | train_loss=1.0666 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=46.8s
Epoc

[I 2025-04-30 20:57:26,827] Trial 3 finished with value: 0.8160391095138732 and parameters: {'lr': 0.0006437813768189071, 'weight_decay': 0.0007437632759498076, 'num_blocks': 3, 'num_heads': 4, 'num_segments': 10}. Best is trial 2 with value: 0.7152097299695015.



===== Trial 4 =====
 lr=1.44e-04, wd=6.77e-04, blocks=3, heads=4, segs=5


Epoch 001 | train_loss=1.0684 acc=0.4318 | val_loss=1.0804 acc=0.4317 | time=46.6s
Epoch 002 | train_loss=1.0682 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=46.6s
Epoch 003 | train_loss=1.0670 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=46.6s
Epoch 004 | train_loss=1.0673 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=46.6s
Epoch 005 | train_loss=1.0672 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=46.6s
Epoch 006 | train_loss=1.0682 acc=0.4311 | val_loss=1.0754 acc=0.4317 | time=46.5s
Epoch 007 | train_loss=1.0665 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=46.6s
Epoch 008 | train_loss=1.0670 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=46.5s
Epoch 009 | train_loss=1.0677 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=46.7s
Epoch 010 | train_loss=1.0674 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=46.6s
Epoch 011 | train_loss=1.0663 acc=0.4311 | val_loss=1.0767 acc=0.4317 | time=46.5s
Epoch 012 | train_loss=1.0675 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=46.5s
Epoc

[I 2025-04-30 21:16:11,024] Trial 4 finished with value: 1.0742442863328117 and parameters: {'lr': 0.00014358978517331993, 'weight_decay': 0.0006773170584205079, 'num_blocks': 3, 'num_heads': 4, 'num_segments': 5}. Best is trial 2 with value: 0.7152097299695015.



===== Trial 5 =====
 lr=1.14e-04, wd=5.87e-04, blocks=3, heads=4, segs=10


Epoch 001 | train_loss=1.0696 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=46.8s
Epoch 002 | train_loss=1.0675 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=46.6s
Epoch 003 | train_loss=1.0678 acc=0.4311 | val_loss=1.0747 acc=0.4317 | time=46.8s
Epoch 004 | train_loss=1.0673 acc=0.4311 | val_loss=1.0746 acc=0.4317 | time=46.7s
Epoch 005 | train_loss=1.0675 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=46.7s
Epoch 006 | train_loss=1.0677 acc=0.4311 | val_loss=1.0745 acc=0.4317 | time=46.7s
Epoch 007 | train_loss=1.0677 acc=0.4311 | val_loss=1.0756 acc=0.4317 | time=46.8s
Epoch 008 | train_loss=1.0674 acc=0.4311 | val_loss=1.0751 acc=0.4317 | time=46.6s
Epoch 009 | train_loss=1.0672 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=46.8s
Epoch 010 | train_loss=1.0672 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=46.7s
Epoch 011 | train_loss=1.0669 acc=0.4311 | val_loss=1.0763 acc=0.4317 | time=46.7s
Epoch 012 | train_loss=1.0667 acc=0.4311 | val_loss=1.0761 acc=0.4317 | time=46.6s
Epoc

[I 2025-04-30 21:34:58,325] Trial 5 pruned. 


▸ Trial 5 pruned at epoch 24

===== Trial 6 =====
 lr=9.64e-04, wd=3.60e-05, blocks=1, heads=4, segs=10


Epoch 001 | train_loss=1.0708 acc=0.4217 | val_loss=1.0762 acc=0.4317 | time=20.5s
Epoch 002 | train_loss=1.0692 acc=0.4241 | val_loss=1.0773 acc=0.4317 | time=20.4s
Epoch 003 | train_loss=1.0696 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=20.5s
Epoch 004 | train_loss=1.0670 acc=0.4311 | val_loss=1.0760 acc=0.4317 | time=20.4s
Epoch 005 | train_loss=1.0269 acc=0.4843 | val_loss=0.9679 acc=0.5590 | time=20.4s
Epoch 006 | train_loss=0.9568 acc=0.5814 | val_loss=0.9761 acc=0.5388 | time=20.5s
Epoch 007 | train_loss=0.9153 acc=0.6016 | val_loss=0.9511 acc=0.5947 | time=20.4s
Epoch 008 | train_loss=0.9045 acc=0.6047 | val_loss=0.9230 acc=0.5870 | time=20.6s
Epoch 009 | train_loss=0.8755 acc=0.6210 | val_loss=0.9090 acc=0.5932 | time=20.6s
Epoch 010 | train_loss=0.8425 acc=0.6315 | val_loss=0.9071 acc=0.6165 | time=20.6s
Epoch 011 | train_loss=0.8397 acc=0.6400 | val_loss=0.9020 acc=0.6134 | time=20.6s
Epoch 012 | train_loss=0.8089 acc=0.6571 | val_loss=0.8799 acc=0.6196 | time=20.4s
Epoc

[I 2025-04-30 21:48:50,815] Trial 6 finished with value: 0.8566069319134667 and parameters: {'lr': 0.0009643783862149072, 'weight_decay': 3.604873795102363e-05, 'num_blocks': 1, 'num_heads': 4, 'num_segments': 10}. Best is trial 2 with value: 0.7152097299695015.



===== Trial 7 =====
 lr=2.04e-04, wd=1.79e-04, blocks=3, heads=4, segs=5


Epoch 001 | train_loss=1.0691 acc=0.4311 | val_loss=1.0758 acc=0.4317 | time=46.5s
Epoch 002 | train_loss=1.0674 acc=0.4311 | val_loss=1.0756 acc=0.4317 | time=46.5s
Epoch 003 | train_loss=1.0681 acc=0.4311 | val_loss=1.0742 acc=0.4317 | time=46.6s
Epoch 004 | train_loss=1.0661 acc=0.4311 | val_loss=1.0774 acc=0.4317 | time=46.5s
Epoch 005 | train_loss=1.0684 acc=0.4311 | val_loss=1.0743 acc=0.4317 | time=46.5s
Epoch 006 | train_loss=1.0674 acc=0.4311 | val_loss=1.0748 acc=0.4317 | time=46.6s
Epoch 007 | train_loss=1.0674 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=46.6s
Epoch 008 | train_loss=1.0677 acc=0.4311 | val_loss=1.0749 acc=0.4317 | time=46.5s
Epoch 009 | train_loss=1.0664 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=46.5s
Epoch 010 | train_loss=1.0664 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=46.5s
Epoch 011 | train_loss=1.0660 acc=0.4311 | val_loss=1.0754 acc=0.4317 | time=46.6s
Epoch 012 | train_loss=1.0665 acc=0.4311 | val_loss=1.0756 acc=0.4317 | time=46.5s
Epoc

[I 2025-04-30 22:34:44,697] Trial 7 finished with value: 0.7561915225925899 and parameters: {'lr': 0.00020419417762750276, 'weight_decay': 0.00017949537675946488, 'num_blocks': 3, 'num_heads': 4, 'num_segments': 5}. Best is trial 2 with value: 0.7152097299695015.



===== Trial 8 =====
 lr=2.63e-04, wd=6.35e-04, blocks=2, heads=3, segs=5


Epoch 001 | train_loss=1.0713 acc=0.4311 | val_loss=1.0744 acc=0.4317 | time=27.8s
Epoch 002 | train_loss=1.0657 acc=0.4311 | val_loss=1.0765 acc=0.4317 | time=28.2s
Epoch 003 | train_loss=1.0693 acc=0.4311 | val_loss=1.0759 acc=0.4317 | time=27.8s
Epoch 004 | train_loss=1.0677 acc=0.4311 | val_loss=1.0752 acc=0.4317 | time=27.7s
Epoch 005 | train_loss=1.0678 acc=0.4311 | val_loss=1.0753 acc=0.4317 | time=27.8s


[W 2025-04-30 22:37:25,936] Trial 8 failed with parameters: {'lr': 0.0002627823846413076, 'weight_decay': 0.0006345325565238551, 'num_blocks': 2, 'num_heads': 3, 'num_segments': 5} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-8-f3a55d92be6f>", line 111, in objective_holdout
    tloss     += loss.item()
                 ^^^^^^^^^^^
KeyboardInterrupt
[W 2025-04-30 22:37:25,938] Trial 8 failed with value None.


KeyboardInterrupt: 

### Search the best Hyperparameter using 5-Fold Cross-Validation

In [None]:
import os
import json
import time
import gc
import multiprocessing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
import optuna
import wandb

from eeg_dataset import EEGDataset
from models import EEGformer

# ─── Hyperparameter ranges and fixed settings ─────────────────────────
LR_MIN, LR_MAX     = 1e-4, 1e-3
WD_MIN, WD_MAX     = 1e-5, 1e-3
NUM_FILTERS        = 120
NUM_BLOCK_CHOICES  = [1, 2, 3]
NUM_HEAD_CHOICES   = [2, 3, 4]
SEGMENT_CHOICES    = [5, 10, 15]

# ─── Training configuration ─────────────────────────
MAX_EPOCHS  = 200
PATIENCE    = 20
BATCH_SIZE  = 32
NUM_WORKERS = max(1, min(4, os.cpu_count() - 1))

# ─── Data paths & device ─────────────────────────
DATA_DIR   = '/content/drive/MyDrive/2025_Lab_Research/model-data'
LABEL_FILE = "labels.json"
DEVICE     = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def objective_holdout(trial):
    # 1) Sample hyperparameters
    lr            = trial.suggest_float("lr", LR_MIN, LR_MAX, log=True)
    weight_decay  = trial.suggest_float("weight_decay", WD_MIN, WD_MAX, log=True)
    num_blocks    = trial.suggest_categorical("num_blocks", NUM_BLOCK_CHOICES)
    num_heads     = trial.suggest_categorical("num_heads", NUM_HEAD_CHOICES)
    num_segments  = trial.suggest_categorical("num_segments", SEGMENT_CHOICES)
    # New: sample scheduler hyperparameters
    step_size     = trial.suggest_int("step_size", 10, 100, step=10)
    gamma         = trial.suggest_float("gamma", 0.1, 0.9)

    print(f"\n===== Trial {trial.number} =====")
    print(
        f" lr={lr:.2e}, wd={weight_decay:.2e}, blocks={num_blocks}, "
        f"heads={num_heads}, segs={num_segments}, "
        f"step_size={step_size}, gamma={gamma:.2f}"
    )

    # 2) Load metadata and build dataset
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    train_meta  = [d for d in all_meta if d["type"] == "train"]
    full_ds     = EEGDataset(DATA_DIR, train_meta)
    labels      = [d["label"] for d in train_meta]
    n_samples   = len(full_ds)
    input_length = full_ds[0][0].shape[-1]

    # 3) Single hold-out split (80/20), stratified
    train_idx, val_idx = train_test_split(
        list(range(n_samples)),
        test_size=0.2,
        stratify=labels,
        random_state=42
    )
    train_loader = DataLoader(
        Subset(full_ds, train_idx),
        batch_size=BATCH_SIZE, shuffle=True,  num_workers=NUM_WORKERS
    )
    val_loader = DataLoader(
        Subset(full_ds, val_idx),
        batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS
    )

    # 4) W&B init
    wandb.init(project="eeg-holdout-tuning", config=trial.params)

    # 5) Build model, optimizer, loss
    model = EEGformer(
        in_channels  = 19,
        input_length = input_length,
        kernel_size  = 10,
        num_filters  = NUM_FILTERS,
        num_heads    = num_heads,
        num_blocks   = num_blocks,
        num_segments = num_segments,
        num_classes  = 3
    ).to(DEVICE)

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=weight_decay
    )
    criterion = nn.CrossEntropyLoss()

    # 6) Scheduler with sampled step_size & gamma
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=step_size,
        gamma=gamma
    )

    # 7) Training loop with early stopping & pruning
    best_val_loss     = float("inf")
    epochs_no_improve = 0
    # store best‐epoch metrics
    best_train_loss = best_train_acc = best_val_acc = None

    for epoch in range(1, MAX_EPOCHS + 1):
        t0 = time.time()

        # — train —
        model.train()
        tloss = tcorrect = ttotal = 0
        for X, y in train_loader:
            X, y = X.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            logits = model(X)
            loss   = criterion(logits, y)
            loss.backward()
            optimizer.step()
            tloss    += loss.item()
            tcorrect += (logits.argmax(1) == y).sum().item()
            ttotal   += y.size(0)
        train_loss = tloss / len(train_loader)
        train_acc  = tcorrect / ttotal

        # — validate —
        model.eval()
        vloss = vcorrect = vtotal = 0
        with torch.no_grad():
            for X, y in val_loader:
                X, y = X.to(DEVICE), y.to(DEVICE)
                logits = model(X)
                loss   = criterion(logits, y)
                vloss    += loss.item()
                vcorrect += (logits.argmax(1) == y).sum().item()
                vtotal   += y.size(0)
        val_loss = vloss / len(val_loader)
        val_acc  = vcorrect / vtotal
        elapsed  = time.time() - t0

        # report & prune
        trial.report(val_loss, epoch)
        if trial.should_prune():
            print(f"▸ Trial {trial.number} pruned at epoch {epoch}")
            raise optuna.TrialPruned()

        print(
            f"Epoch {epoch:03d} | "
            f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
            f"val_loss={val_loss:.4f} acc={val_acc:.4f} | "
            f"time={elapsed:.1f}s"
        )

        # step the scheduler
        scheduler.step()

        # early stopping & record best
        if val_loss < best_val_loss:
            best_val_loss     = val_loss
            epochs_no_improve = 0
            best_train_loss   = train_loss
            best_train_acc    = train_acc
            best_val_acc      = val_acc
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print(f"★ Early stopping at epoch {epoch}")
                break

    # store best‐epoch metrics
    trial.set_user_attr("best_train_loss", best_train_loss)
    trial.set_user_attr("best_train_acc",  best_train_acc)
    trial.set_user_attr("best_val_acc",    best_val_acc)

    wandb.finish()
    gc.collect()
    return best_val_loss


if __name__ == "__main__":
    multiprocessing.freeze_support()
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
        study_name="eeg_holdout_trial",
        storage="sqlite:////content/drive/MyDrive/2025_Lab_Research/eeg_holdout.db",
        load_if_exists=True
    )
    study.optimize(objective_holdout, n_trials=30)

    # print best trial & metrics
    best = study.best_trial
    print("\n===== Best Trial =====")
    print(f"best_val_loss   = {best.value:.6f}")
    print(f"best_val_acc    = {best.user_attrs['best_val_acc']:.4f}")
    print(f"best_train_loss = {best.user_attrs['best_train_loss']:.4f}")
    print(f"best_train_acc  = {best.user_attrs['best_train_acc']:.4f}")
    print("best params:")
    for k, v in best.params.items():
        print(f"  {k}: {v}")


In [None]:
import os
import json
import time
import gc
import multiprocessing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import StratifiedKFold

import optuna
import wandb

from eeg_dataset import EEGDataset
from models import EEGformer

# ─── Hyperparameter ranges and fixed settings ─────────────────────────
LR_MIN, LR_MAX     = 1e-4, 1e-3             # learning rate search bounds
WD_MIN, WD_MAX     = 1e-5, 1e-3             # weight decay search bounds
NUM_FILTERS        = 120                    # fixed number of convolutional filters
NUM_BLOCK_CHOICES = [2, 3]
NUM_HEAD_CHOICES   = [2, 3, 4]                 # choices for attention heads
SEGMENT_CHOICES    = [5, 10, 15]                    # choices for segments (fixed here)

# ─── Training configuration ─────────────────────────
N_FOLDS     = 5                             # number of CV folds
MAX_EPOCHS  = 100                           # maximal epochs per fold
PATIENCE    = 20                            # early stopping patience
BATCH_SIZE  = 32                            # batch size
NUM_WORKERS = max(1, min(4, os.cpu_count() - 1))  # data loader workers

# ─── Data paths & device ─────────────────────────
DATA_DIR   = '/content/drive/MyDrive/2025_Lab_Research/model-data'
LABEL_FILE = "labels.json"
DEVICE     = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def objective(trial):
    """
    Optuna objective function: samples hyperparams, runs K-fold CV,
    reports validation loss for pruning, and returns average val_loss.
    """
    # 1) Sample hyperparameters from defined ranges
    lr           = trial.suggest_float("lr", LR_MIN, LR_MAX, log=True)
    weight_decay = trial.suggest_float("weight_decay", WD_MIN, WD_MAX, log=True)
    num_filters  = NUM_FILTERS
    num_blocks   = trial.suggest_categorical("num_blocks", NUM_BLOCK_CHOICES)
    num_heads    = trial.suggest_categorical("num_heads", NUM_HEAD_CHOICES)
    num_segments = trial.suggest_categorical("num_segments", SEGMENT_CHOICES)

    print(f"\n===== Trial {trial.number} start =====")
    print(f"  lr={lr:.2e}, weight_decay={weight_decay:.2e}, "
          f"num_blocks={num_blocks}, num_heads={num_heads}, num_segments={num_segments}")

    # 2) Load metadata and build the full dataset to infer input_length
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    train_meta = [d for d in all_meta if d["type"] == "train"]
    full_ds    = EEGDataset(DATA_DIR, train_meta)
    labels     = [d["label"] for d in train_meta]
    n_samples  = len(full_ds)

    # Determine original sequence length before any convolution
    input_length = full_ds[0][0].shape[-1]

    # 3) Initialize Weights & Biases for logging (one run per trial)
    wandb.init(project="eeg-cv-tuning-trial-3", config=trial.params)

    # 4) Set up Stratified K-Fold cross-validation
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    # Dictionary to collect per-fold metrics
    fold_metrics = {k: [] for k in ["train_loss", "train_acc", "val_loss", "val_acc", "best_epoch"]}

    # Loop over each fold
    for fold, (train_idx, val_idx) in enumerate(skf.split(range(n_samples), labels)):
        print(f"\n--- Fold {fold} ---")

        # Create DataLoaders for this fold
        train_loader = DataLoader(
            Subset(full_ds, train_idx),
            batch_size=BATCH_SIZE,
            shuffle=True,
            num_workers=NUM_WORKERS
        )
        val_loader = DataLoader(
            Subset(full_ds, val_idx),
            batch_size=BATCH_SIZE,
            shuffle=False,
            num_workers=NUM_WORKERS
        )

        # 4a) Instantiate model, optimizer, and loss
        model = EEGformer(
            in_channels  = 19,
            input_length = input_length,
            kernel_size  = 10,
            num_filters  = num_filters,
            num_heads    = num_heads,
            num_blocks   = num_blocks,
            num_segments = num_segments,
            num_classes  = 3
        ).to(DEVICE)

        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        criterion = nn.CrossEntropyLoss()

        # Variables for early stopping
        best_val_loss     = float("inf")
        epochs_no_improve = 0
        best_epoch        = 0
        best_train_l = best_train_a = best_val_a = None

        # 4b) Training loop per epoch
        for epoch in range(1, MAX_EPOCHS + 1):
            t0 = time.time()

            # — Training phase —
            model.train()
            train_loss_sum = train_correct = train_total = 0
            for X, y in train_loader:
                X, y = X.to(DEVICE), y.to(DEVICE)
                optimizer.zero_grad()
                logits = model(X)
                loss   = criterion(logits, y)
                loss.backward()
                optimizer.step()

                train_loss_sum += loss.item()
                train_correct  += (logits.argmax(1) == y).sum().item()
                train_total    += y.size(0)

            train_loss = train_loss_sum / len(train_loader)
            train_acc  = train_correct / train_total

            # — Validation phase —
            model.eval()
            val_loss_sum = val_correct = val_total = 0
            with torch.no_grad():
                for X, y in val_loader:
                    X, y = X.to(DEVICE), y.to(DEVICE)
                    logits = model(X)
                    loss   = criterion(logits, y)
                    val_loss_sum += loss.item()
                    val_correct  += (logits.argmax(1) == y).sum().item()
                    val_total    += y.size(0)

            val_loss = val_loss_sum / len(val_loader)
            val_acc  = val_correct / val_total
            epoch_time = time.time() - t0

            # Report intermediate objective value to Optuna for pruning
            step = fold * MAX_EPOCHS + epoch
            trial.report(val_loss, step=step)
            if trial.should_prune():
                print(f"▸ Trial pruned at fold {fold}, epoch {epoch}")
                raise optuna.TrialPruned()

            # Log metrics and epoch time
            print(
                f"[Fold {fold}] Epoch {epoch:03d} "
                f"train_loss={train_loss:.4f} train_acc={train_acc:.4f} | "
                f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} | "
                f"time={epoch_time:.1f}s"
            )

            # Early stopping check
            if val_loss < best_val_loss:
                best_val_loss     = val_loss
                best_epoch        = epoch
                epochs_no_improve = 0
                best_train_l      = train_loss
                best_train_a      = train_acc
                best_val_a        = val_acc
            else:
                epochs_no_improve += 1
                if epochs_no_improve >= PATIENCE:
                    print(f"[Fold {fold}] early stopping at epoch {epoch}")
                    raise optuna.TrialPruned()

        # Record this fold's best metrics
        for key, value in zip(
            ["train_loss","train_acc","val_loss","val_acc","best_epoch"],
            [best_train_l, best_train_a, best_val_loss, best_val_a, best_epoch]
        ):
            fold_metrics[key].append(value)
            trial.set_user_attr(f"fold{fold}_{key}", value)

        # Free GPU memory before next fold
        del model, optimizer, train_loader, val_loader
        torch.cuda.empty_cache()
        gc.collect()

    # 5) Compute average metrics across folds and finish the trial
    avg = lambda key: sum(fold_metrics[key]) / N_FOLDS
    for key in fold_metrics:
        trial.set_user_attr(f"avg_{key}", avg(key))

    wandb.finish()
    return avg("val_loss")


if __name__ == "__main__":
    # Allow safe multiprocessing on Windows
    multiprocessing.freeze_support()

    # Create or load an Optuna study
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=1),
        study_name="eegformer_optuna_cv_trial_3",
        storage="sqlite:////content/drive/MyDrive/2025_Lab_Research/eeg_optuna_trial_3.db",
        load_if_exists=True
    )
    # Run hyperparameter optimization
    study.optimize(objective, n_trials=10)

    # Print out best trial results
    best = study.best_trial
    print("\n===== Best Trial =====")
    print(f"avg_val_loss   = {best.value:.6f}")
    print(f"avg_train_acc  = {best.user_attrs['avg_train_acc']:.4f}")
    print(f"avg_val_acc    = {best.user_attrs['avg_val_acc']:.4f}")
    print("best hyperparameters:")
    for k, v in best.params.items():
        print(f"  {k}: {v}")

    # ─── Retrain on full training data using best hyperparameters ─────────────────────────
    print("\nRetraining on full TRAIN set with best hyperparams…")
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    full_meta   = [d for d in all_meta if d["type"] == "train"]
    full_ds     = EEGDataset(DATA_DIR, full_meta)
    full_loader = DataLoader(full_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)

    # Determine full input length
    input_length = full_ds[0][0].shape[-1]

    # Re-instantiate model with best params
    model = EEGformer(
        in_channels  = 19,
        input_length = input_length,
        kernel_size  = 10,
        num_filters  = NUM_FILTERS,
        num_heads    = best.params["num_heads"],
        num_blocks   = best.params["num_blocks"],
        num_segments = best.params["num_segments"],
        num_classes  = 3
    ).to(DEVICE)
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=best.params["lr"],
        weight_decay=best.params["weight_decay"]
    )
    criterion = nn.CrossEntropyLoss()

    # Final full-dataset training loop
    for epoch in range(1, MAX_EPOCHS + 1):
        t0 = time.time()
        model.train()
        loss_sum = correct = total = 0
        for X, y in full_loader:
            X, y = X.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            logits = model(X)
            loss   = criterion(logits, y)
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            correct  += (logits.argmax(1) == y).sum().item()
            total    += y.size(0)

        avg_loss = loss_sum / len(full_loader)
        acc      = correct / total
        epoch_time = time.time() - t0
        print(f"[Full Train] Epoch {epoch:03d} | loss={avg_loss:.4f} | acc={acc:.4f} | time={epoch_time:.1f}s")

    # Save the final checkpoint
    ckpt_dir = '/content/drive/MyDrive/2025_Lab_Research/checkpoints'
    os.makedirs(ckpt_dir, exist_ok=True)
    ckpt_path = os.path.join(ckpt_dir, 'eegformer_best_3.pth')
    torch.save(model.state_dict(), ckpt_path)
    print(f"💾 Saved best model to {ckpt_path}")


[I 2025-04-30 09:49:03,595] A new study created in RDB with name: eegformer_optuna_cv_trial_3



===== Trial 0 start =====
  lr=3.15e-05, weight_decay=1.45e-06, num_blocks=3, num_heads=3, num_segments=25



--- Fold 0 ---
[Fold 0] Epoch 001 train_loss=1.0734 train_acc=0.4311 | val_loss=1.0814 val_acc=0.4317 | time=39.0s
[Fold 0] Epoch 002 train_loss=1.0672 train_acc=0.4311 | val_loss=1.0830 val_acc=0.4317 | time=38.8s
[Fold 0] Epoch 003 train_loss=1.0674 train_acc=0.4311 | val_loss=1.0846 val_acc=0.4317 | time=38.9s
[Fold 0] Epoch 004 train_loss=1.0662 train_acc=0.4311 | val_loss=1.0832 val_acc=0.4317 | time=38.8s
[Fold 0] Epoch 005 train_loss=1.0661 train_acc=0.4311 | val_loss=1.0829 val_acc=0.4317 | time=38.8s
[Fold 0] Epoch 006 train_loss=1.0667 train_acc=0.4311 | val_loss=1.0842 val_acc=0.4317 | time=38.9s
[Fold 0] Epoch 007 train_loss=1.0666 train_acc=0.4311 | val_loss=1.0855 val_acc=0.4317 | time=38.8s
[Fold 0] Epoch 008 train_loss=1.0665 train_acc=0.4311 | val_loss=1.0838 val_acc=0.4317 | time=38.7s
[Fold 0] Epoch 009 train_loss=1.0668 train_acc=0.4311 | val_loss=1.0835 val_acc=0.4317 | time=38.7s
[Fold 0] Epoch 010 train_loss=1.0666 train_acc=0.4311 | val_loss=1.0829 val_acc=0.43

[I 2025-04-30 10:02:44,729] Trial 0 pruned. 


[Fold 0] Epoch 021 train_loss=1.0671 train_acc=0.4311 | val_loss=1.0846 val_acc=0.4317 | time=38.9s
[Fold 0] early stopping at epoch 21

===== Trial 1 start =====
  lr=1.40e-05, weight_decay=5.99e-05, num_blocks=3, num_heads=3, num_segments=25



--- Fold 0 ---
[Fold 0] Epoch 001 train_loss=1.0761 train_acc=0.4221 | val_loss=1.0810 val_acc=0.4317 | time=38.7s
[Fold 0] Epoch 002 train_loss=1.0676 train_acc=0.4311 | val_loss=1.0829 val_acc=0.4317 | time=38.9s
[Fold 0] Epoch 003 train_loss=1.0675 train_acc=0.4311 | val_loss=1.0843 val_acc=0.4317 | time=39.0s
[Fold 0] Epoch 004 train_loss=1.0679 train_acc=0.4311 | val_loss=1.0841 val_acc=0.4317 | time=38.9s
[Fold 0] Epoch 005 train_loss=1.0666 train_acc=0.4311 | val_loss=1.0831 val_acc=0.4317 | time=39.0s
[Fold 0] Epoch 006 train_loss=1.0669 train_acc=0.4311 | val_loss=1.0836 val_acc=0.4317 | time=38.9s
[Fold 0] Epoch 007 train_loss=1.0669 train_acc=0.4311 | val_loss=1.0833 val_acc=0.4317 | time=39.0s
[Fold 0] Epoch 008 train_loss=1.0661 train_acc=0.4311 | val_loss=1.0834 val_acc=0.4317 | time=38.9s
[Fold 0] Epoch 009 train_loss=1.0663 train_acc=0.4311 | val_loss=1.0846 val_acc=0.4317 | time=38.7s
[Fold 0] Epoch 010 train_loss=1.0664 train_acc=0.4311 | val_loss=1.0842 val_acc=0.43

[I 2025-04-30 10:16:25,412] Trial 1 pruned. 


[Fold 0] Epoch 021 train_loss=1.0659 train_acc=0.4311 | val_loss=1.0836 val_acc=0.4317 | time=39.0s
[Fold 0] early stopping at epoch 21

===== Trial 2 start =====
  lr=8.77e-04, weight_decay=6.29e-05, num_blocks=3, num_heads=3, num_segments=5



--- Fold 0 ---
[Fold 0] Epoch 001 train_loss=1.0719 train_acc=0.4276 | val_loss=1.0924 val_acc=0.4317 | time=38.4s
[Fold 0] Epoch 002 train_loss=1.0687 train_acc=0.4311 | val_loss=1.0810 val_acc=0.4317 | time=38.4s
[Fold 0] Epoch 003 train_loss=1.0702 train_acc=0.4217 | val_loss=1.0813 val_acc=0.4317 | time=38.1s
[Fold 0] Epoch 004 train_loss=1.0696 train_acc=0.4311 | val_loss=1.0813 val_acc=0.4317 | time=38.2s
[Fold 0] Epoch 005 train_loss=1.0676 train_acc=0.4311 | val_loss=1.0811 val_acc=0.4317 | time=38.3s
[Fold 0] Epoch 006 train_loss=1.0687 train_acc=0.4311 | val_loss=1.0813 val_acc=0.4317 | time=38.2s
[Fold 0] Epoch 007 train_loss=1.0682 train_acc=0.4311 | val_loss=1.0850 val_acc=0.4317 | time=38.1s
[Fold 0] Epoch 008 train_loss=1.0674 train_acc=0.4311 | val_loss=1.0890 val_acc=0.4317 | time=38.1s
[Fold 0] Epoch 009 train_loss=1.0664 train_acc=0.4311 | val_loss=1.0878 val_acc=0.4317 | time=38.3s
[Fold 0] Epoch 010 train_loss=1.0664 train_acc=0.4311 | val_loss=1.0813 val_acc=0.43

[I 2025-04-30 10:55:59,622] Trial 2 pruned. 


[Fold 0] Epoch 061 train_loss=0.2811 train_acc=0.8936 | val_loss=0.9505 val_acc=0.7438 | time=39.2s
[Fold 0] early stopping at epoch 61

===== Trial 3 start =====
  lr=1.01e-04, weight_decay=3.52e-05, num_blocks=3, num_heads=3, num_segments=5



--- Fold 0 ---
[Fold 0] Epoch 001 train_loss=1.0695 train_acc=0.4311 | val_loss=1.0828 val_acc=0.4317 | time=38.2s
[Fold 0] Epoch 002 train_loss=1.0666 train_acc=0.4311 | val_loss=1.0826 val_acc=0.4317 | time=38.2s
[Fold 0] Epoch 003 train_loss=1.0666 train_acc=0.4311 | val_loss=1.0843 val_acc=0.4317 | time=38.4s
[Fold 0] Epoch 004 train_loss=1.0671 train_acc=0.4311 | val_loss=1.0826 val_acc=0.4317 | time=38.3s
[Fold 0] Epoch 005 train_loss=1.0659 train_acc=0.4311 | val_loss=1.0842 val_acc=0.4317 | time=38.3s
[Fold 0] Epoch 006 train_loss=1.0669 train_acc=0.4311 | val_loss=1.0821 val_acc=0.4317 | time=38.3s
[Fold 0] Epoch 007 train_loss=1.0672 train_acc=0.4311 | val_loss=1.0827 val_acc=0.4317 | time=38.1s
[Fold 0] Epoch 008 train_loss=1.0666 train_acc=0.4311 | val_loss=1.0836 val_acc=0.4317 | time=38.1s
[Fold 0] Epoch 009 train_loss=1.0667 train_acc=0.4311 | val_loss=1.0830 val_acc=0.4317 | time=38.1s
[Fold 0] Epoch 010 train_loss=1.0670 train_acc=0.4311 | val_loss=1.0837 val_acc=0.43

[I 2025-04-30 11:28:33,788] Trial 3 pruned. 


[Fold 0] Epoch 051 train_loss=0.4530 train_acc=0.8315 | val_loss=1.1364 val_acc=0.5683 | time=38.2s
[Fold 0] early stopping at epoch 51

===== Trial 4 start =====
  lr=8.06e-05, weight_decay=3.80e-05, num_blocks=3, num_heads=3, num_segments=25



--- Fold 0 ---
[Fold 0] Epoch 001 train_loss=1.0725 train_acc=0.4318 | val_loss=1.0820 val_acc=0.4317 | time=38.9s
[Fold 0] Epoch 002 train_loss=1.0667 train_acc=0.4311 | val_loss=1.0840 val_acc=0.4317 | time=38.8s
[Fold 0] Epoch 003 train_loss=1.0677 train_acc=0.4311 | val_loss=1.0885 val_acc=0.4317 | time=38.6s
[Fold 0] Epoch 004 train_loss=1.0669 train_acc=0.4311 | val_loss=1.0848 val_acc=0.4317 | time=38.7s
[Fold 0] Epoch 005 train_loss=1.0665 train_acc=0.4311 | val_loss=1.0840 val_acc=0.4317 | time=38.8s
[Fold 0] Epoch 006 train_loss=1.0665 train_acc=0.4311 | val_loss=1.0825 val_acc=0.4317 | time=38.7s
[Fold 0] Epoch 007 train_loss=1.0674 train_acc=0.4311 | val_loss=1.0838 val_acc=0.4317 | time=38.9s
[Fold 0] Epoch 008 train_loss=1.0668 train_acc=0.4311 | val_loss=1.0832 val_acc=0.4317 | time=38.8s
[Fold 0] Epoch 009 train_loss=1.0676 train_acc=0.4311 | val_loss=1.0815 val_acc=0.4317 | time=39.0s
[Fold 0] Epoch 010 train_loss=1.0676 train_acc=0.4311 | val_loss=1.0828 val_acc=0.43

[I 2025-04-30 11:47:27,443] Trial 4 pruned. 


[Fold 0] Epoch 029 train_loss=0.9629 train_acc=0.5196 | val_loss=1.1921 val_acc=0.3773 | time=38.9s
[Fold 0] early stopping at epoch 29

===== Trial 5 start =====
  lr=1.57e-04, weight_decay=4.96e-05, num_blocks=3, num_heads=3, num_segments=5



--- Fold 0 ---
[Fold 0] Epoch 001 train_loss=1.0707 train_acc=0.4303 | val_loss=1.0817 val_acc=0.4317 | time=38.2s
[Fold 0] Epoch 002 train_loss=1.0683 train_acc=0.4311 | val_loss=1.0859 val_acc=0.4317 | time=38.2s
[Fold 0] Epoch 003 train_loss=1.0677 train_acc=0.4311 | val_loss=1.0842 val_acc=0.4317 | time=38.2s
[Fold 0] Epoch 004 train_loss=1.0682 train_acc=0.4311 | val_loss=1.0855 val_acc=0.4317 | time=38.1s
[Fold 0] Epoch 005 train_loss=1.0678 train_acc=0.4311 | val_loss=1.0862 val_acc=0.4317 | time=38.3s
[Fold 0] Epoch 006 train_loss=1.0673 train_acc=0.4311 | val_loss=1.0831 val_acc=0.4317 | time=38.2s
[Fold 0] Epoch 007 train_loss=1.0668 train_acc=0.4311 | val_loss=1.0840 val_acc=0.4317 | time=38.4s
[Fold 0] Epoch 008 train_loss=1.0663 train_acc=0.4311 | val_loss=1.0825 val_acc=0.4317 | time=38.3s
[Fold 0] Epoch 009 train_loss=1.0666 train_acc=0.4311 | val_loss=1.0834 val_acc=0.4317 | time=38.2s
[Fold 0] Epoch 010 train_loss=1.0674 train_acc=0.4311 | val_loss=1.0850 val_acc=0.43

[I 2025-04-30 12:27:04,585] Trial 5 pruned. 


[Fold 0] Epoch 062 train_loss=0.0862 train_acc=0.9682 | val_loss=1.3088 val_acc=0.7050 | time=39.1s
[Fold 0] early stopping at epoch 62

===== Trial 6 start =====
  lr=5.74e-06, weight_decay=7.79e-06, num_blocks=3, num_heads=3, num_segments=5



--- Fold 0 ---
[Fold 0] Epoch 001 train_loss=1.0975 train_acc=0.3724 | val_loss=1.0935 val_acc=0.4317 | time=38.3s
[Fold 0] Epoch 002 train_loss=1.0897 train_acc=0.4311 | val_loss=1.0886 val_acc=0.4317 | time=38.4s
[Fold 0] Epoch 003 train_loss=1.0832 train_acc=0.4311 | val_loss=1.0854 val_acc=0.4317 | time=38.3s
[Fold 0] Epoch 004 train_loss=1.0792 train_acc=0.4311 | val_loss=1.0833 val_acc=0.4317 | time=38.5s
[Fold 0] Epoch 005 train_loss=1.0766 train_acc=0.4311 | val_loss=1.0821 val_acc=0.4317 | time=38.3s
[Fold 0] Epoch 006 train_loss=1.0742 train_acc=0.4311 | val_loss=1.0815 val_acc=0.4317 | time=38.4s
[Fold 0] Epoch 007 train_loss=1.0725 train_acc=0.4311 | val_loss=1.0811 val_acc=0.4317 | time=38.2s
[Fold 0] Epoch 008 train_loss=1.0714 train_acc=0.4311 | val_loss=1.0808 val_acc=0.4317 | time=38.2s
[Fold 0] Epoch 009 train_loss=1.0703 train_acc=0.4311 | val_loss=1.0806 val_acc=0.4317 | time=38.3s
[Fold 0] Epoch 010 train_loss=1.0686 train_acc=0.4311 | val_loss=1.0806 val_acc=0.43

[I 2025-04-30 12:45:43,104] Trial 6 pruned. 


[Fold 0] Epoch 029 train_loss=1.0657 train_acc=0.4311 | val_loss=1.0832 val_acc=0.4317 | time=38.2s
[Fold 0] early stopping at epoch 29

===== Trial 7 start =====
  lr=1.05e-06, weight_decay=1.58e-05, num_blocks=3, num_heads=3, num_segments=25



--- Fold 0 ---
[Fold 0] Epoch 001 train_loss=1.0914 train_acc=0.4140 | val_loss=1.0914 val_acc=0.4317 | time=39.0s
[Fold 0] Epoch 002 train_loss=1.0860 train_acc=0.4311 | val_loss=1.0881 val_acc=0.4317 | time=39.0s
[Fold 0] Epoch 003 train_loss=1.0823 train_acc=0.4311 | val_loss=1.0858 val_acc=0.4317 | time=39.0s
[Fold 0] Epoch 004 train_loss=1.0789 train_acc=0.4311 | val_loss=1.0841 val_acc=0.4317 | time=38.9s
[Fold 0] Epoch 005 train_loss=1.0765 train_acc=0.4311 | val_loss=1.0830 val_acc=0.4317 | time=38.7s
[Fold 0] Epoch 006 train_loss=1.0744 train_acc=0.4311 | val_loss=1.0823 val_acc=0.4317 | time=38.9s
[Fold 0] Epoch 007 train_loss=1.0730 train_acc=0.4311 | val_loss=1.0817 val_acc=0.4317 | time=39.0s
[Fold 0] Epoch 008 train_loss=1.0715 train_acc=0.4311 | val_loss=1.0814 val_acc=0.4317 | time=39.1s
[Fold 0] Epoch 009 train_loss=1.0713 train_acc=0.4311 | val_loss=1.0811 val_acc=0.4317 | time=39.0s
[Fold 0] Epoch 010 train_loss=1.0703 train_acc=0.4311 | val_loss=1.0810 val_acc=0.43

[I 2025-04-30 13:06:37,715] Trial 7 pruned. 


[Fold 0] Epoch 032 train_loss=1.0661 train_acc=0.4311 | val_loss=1.0830 val_acc=0.4317 | time=38.9s
[Fold 0] early stopping at epoch 32

===== Trial 8 start =====
  lr=7.76e-06, weight_decay=4.45e-05, num_blocks=3, num_heads=3, num_segments=25



--- Fold 0 ---
[Fold 0] Epoch 001 train_loss=1.0812 train_acc=0.4311 | val_loss=1.0811 val_acc=0.4317 | time=38.9s
[Fold 0] Epoch 002 train_loss=1.0698 train_acc=0.4311 | val_loss=1.0812 val_acc=0.4317 | time=38.7s
[Fold 0] Epoch 003 train_loss=1.0663 train_acc=0.4311 | val_loss=1.0824 val_acc=0.4317 | time=38.9s
[Fold 0] Epoch 004 train_loss=1.0661 train_acc=0.4311 | val_loss=1.0832 val_acc=0.4317 | time=38.8s
[Fold 0] Epoch 005 train_loss=1.0664 train_acc=0.4311 | val_loss=1.0837 val_acc=0.4317 | time=39.0s
[Fold 0] Epoch 006 train_loss=1.0666 train_acc=0.4311 | val_loss=1.0838 val_acc=0.4317 | time=39.1s
[Fold 0] Epoch 007 train_loss=1.0659 train_acc=0.4311 | val_loss=1.0833 val_acc=0.4317 | time=38.9s
[Fold 0] Epoch 008 train_loss=1.0669 train_acc=0.4311 | val_loss=1.0837 val_acc=0.4317 | time=38.7s
[Fold 0] Epoch 009 train_loss=1.0658 train_acc=0.4311 | val_loss=1.0833 val_acc=0.4317 | time=39.0s
[Fold 0] Epoch 010 train_loss=1.0670 train_acc=0.4311 | val_loss=1.0839 val_acc=0.43

[I 2025-04-30 13:20:23,847] Trial 8 pruned. 


[Fold 0] Epoch 021 train_loss=1.0659 train_acc=0.4311 | val_loss=1.0839 val_acc=0.4317 | time=38.9s
[Fold 0] early stopping at epoch 21

===== Trial 9 start =====
  lr=2.54e-06, weight_decay=1.58e-05, num_blocks=3, num_heads=3, num_segments=25



--- Fold 0 ---
[Fold 0] Epoch 001 train_loss=1.0822 train_acc=0.4311 | val_loss=1.0845 val_acc=0.4317 | time=38.9s
[Fold 0] Epoch 002 train_loss=1.0763 train_acc=0.4311 | val_loss=1.0824 val_acc=0.4317 | time=38.8s
[Fold 0] Epoch 003 train_loss=1.0735 train_acc=0.4311 | val_loss=1.0813 val_acc=0.4317 | time=38.8s
[Fold 0] Epoch 004 train_loss=1.0708 train_acc=0.4311 | val_loss=1.0809 val_acc=0.4317 | time=39.0s
[Fold 0] Epoch 005 train_loss=1.0691 train_acc=0.4311 | val_loss=1.0808 val_acc=0.4317 | time=39.0s
[Fold 0] Epoch 006 train_loss=1.0681 train_acc=0.4311 | val_loss=1.0809 val_acc=0.4317 | time=39.0s
[Fold 0] Epoch 007 train_loss=1.0680 train_acc=0.4311 | val_loss=1.0811 val_acc=0.4317 | time=39.1s
[Fold 0] Epoch 008 train_loss=1.0673 train_acc=0.4311 | val_loss=1.0813 val_acc=0.4317 | time=38.7s
[Fold 0] Epoch 009 train_loss=1.0668 train_acc=0.4311 | val_loss=1.0815 val_acc=0.4317 | time=38.6s
[Fold 0] Epoch 010 train_loss=1.0672 train_acc=0.4311 | val_loss=1.0817 val_acc=0.43

[I 2025-04-30 13:36:46,965] Trial 9 pruned. 


[Fold 0] Epoch 025 train_loss=1.0668 train_acc=0.4311 | val_loss=1.0835 val_acc=0.4317 | time=39.0s
[Fold 0] early stopping at epoch 25


ValueError: Record does not exist.

In [None]:
import os
import json
import time
import gc
import multiprocessing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import StratifiedKFold

import optuna
import wandb

from eeg_dataset import EEGDataset
from models import EEGformer

# ─── Constants ─────────────────────────
LR_MIN, LR_MAX = 2e-5, 8e-5
WD_MIN, WD_MAX = 3e-5, 1e-4
FILTER_MIN, FILTER_MAX, FILTER_STEP = 60, 180, 60
HEAD_CHOICES    = [2, 4]
SEGMENT_CHOICES = [5, 15, 25]

N_FOLDS     = 5
MAX_EPOCHS  = 100
PATIENCE    = 30
BATCH_SIZE  = 32
NUM_WORKERS = max(1, min(4, os.cpu_count() - 1))
DATA_DIR    = '/content/drive/MyDrive/2025_Lab_Research/model-data'
LABEL_FILE  = "labels.json"

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def objective(trial):
    # Hyperparameter search space setup
    lr = trial.suggest_float("lr", LR_MIN, LR_MAX, log=True)
    weight_decay = trial.suggest_float("weight_decay", WD_MIN, WD_MAX, log=True)
    num_filters = trial.suggest_int("num_filters", FILTER_MIN, FILTER_MAX, step=FILTER_STEP)
    rtm_blocks = trial.suggest_int("rtm_blocks", 1, 3)
    stm_blocks = trial.suggest_int("stm_blocks", 1, 3)
    ttm_blocks = trial.suggest_int("ttm_blocks", 1, 3)
    rtm_heads = trial.suggest_categorical("rtm_heads", HEAD_CHOICES)
    stm_heads = trial.suggest_categorical("stm_heads", HEAD_CHOICES)
    ttm_heads = trial.suggest_categorical("ttm_heads", HEAD_CHOICES)
    num_segments = trial.suggest_categorical("num_segments", SEGMENT_CHOICES)

    # Pruning condition: only proceed if num_filters is divisible by heads
    for h in (rtm_heads, stm_heads, ttm_heads):
        if num_filters % h != 0:
            raise optuna.TrialPruned()

    # Initialize wandb
    wandb.init(project="eeg-cv-tuning-trial_9", config=trial.params)

    print(f"\n========================= Trial {trial.number} =========================")
    print(f"Testing with hyperparameters: {trial.params}")

    # Data preparation
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    train_meta = [d for d in all_meta if d["type"] == "train"]
    full_ds = EEGDataset(DATA_DIR, train_meta)
    labels = [d["label"] for d in train_meta]
    n_samples = len(full_ds)

    # StratifiedKFold setup
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    fold_metrics = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": [], "best_epoch": []}

    for fold, (train_idx, val_idx) in enumerate(skf.split(range(n_samples), labels)):
        # Fold separation
        print(f"\n========================= Fold {fold} =========================")

        # Data loader setup
        train_loader = DataLoader(Subset(full_ds, train_idx), batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
        val_loader = DataLoader(Subset(full_ds, val_idx), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

        # Model and optimizer setup
        model = EEGformer(
            num_classes=3,
            in_channels=19,
            kernel_size=10,
            num_filters=num_filters,
            rtm_blocks=rtm_blocks,
            stm_blocks=stm_blocks,
            ttm_blocks=ttm_blocks,
            rtm_heads=rtm_heads,
            stm_heads=stm_heads,
            ttm_heads=ttm_heads,
            num_segments=num_segments
        ).to(DEVICE)

        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        criterion = nn.CrossEntropyLoss()

        best_val_loss = float("inf")
        epochs_no_improve = 0
        best_epoch = 0
        best_train_l = best_train_a = best_val_a = None
        last_log_time = time.time()

        # Epoch-wise training
        for epoch in range(1, MAX_EPOCHS + 1):
            model.train()
            tl_sum = t_corr = t_tot = 0
            for X, y in train_loader:
                X, y = X.to(DEVICE), y.to(DEVICE)
                optimizer.zero_grad()
                logits = model(X)
                loss = criterion(logits, y)
                loss.backward()
                optimizer.step()
                tl_sum += loss.item()
                t_corr += (logits.argmax(1) == y).sum().item()
                t_tot += y.size(0)

            train_loss = tl_sum / len(train_loader)
            train_acc = t_corr / t_tot

            model.eval()
            vl_sum = v_corr = v_tot = 0
            with torch.no_grad():
                for X, y in val_loader:
                    X, y = X.to(DEVICE), y.to(DEVICE)
                    logits = model(X)
                    loss = criterion(logits, y)
                    vl_sum += loss.item()
                    v_corr += (logits.argmax(1) == y).sum().item()
                    v_tot += y.size(0)

            val_loss = vl_sum / len(val_loader)
            val_acc = v_corr / v_tot

            step = fold * MAX_EPOCHS + epoch
            trial.report(val_loss, step=step)

            # Pruning check
            if trial.should_prune():
                print(f"\u274c Trial {trial.number} pruned at fold {fold}, epoch {epoch}")
                # Report the metrics before returning early
                for k, v in zip(["train_loss", "train_acc", "val_loss", "val_acc", "best_epoch"],
                                 [best_train_l, best_train_a, best_val_loss, best_val_a, best_epoch]):
                    fold_metrics[k].append(v)
                    trial.set_user_attr(f"fold{fold}_{k}", v)
                raise optuna.TrialPruned()  # End trial completely if pruned

            now = time.time()
            print(f"[Fold {fold}] Epoch {epoch:03d} | train_loss={train_loss:.4f} train_acc={train_acc:.4f} | "
                  f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} | time={now - last_log_time:.1f}s")
            last_log_time = now

            # Early stopping: if validation loss does not improve
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_epoch = epoch
                epochs_no_improve = 0
                best_train_l = train_loss
                best_train_a = train_acc
                best_val_a = val_acc
            else:
                epochs_no_improve += 1
                if epochs_no_improve >= PATIENCE:
                    print(f"[Fold {fold}] Early stopping at epoch {epoch}, best was {best_epoch}")
                    # Report the metrics before returning early
                    for k, v in zip(["train_loss", "train_acc", "val_loss", "val_acc", "best_epoch"],
                                     [best_train_l, best_train_a, best_val_loss, best_val_a, best_epoch]):
                        fold_metrics[k].append(v)
                        trial.set_user_attr(f"fold{fold}_{k}", v)
                    return best_val_loss  # End trial completely if early stopping

        # Record results for the fold
        for k, v in zip(["train_loss", "train_acc", "val_loss", "val_acc", "best_epoch"],
                         [best_train_l, best_train_a, best_val_loss, best_val_a, best_epoch]):
            fold_metrics[k].append(v)
            trial.set_user_attr(f"fold{fold}_{k}", v)

        del model, optimizer, train_loader, val_loader
        torch.mps.empty_cache() if DEVICE.type == "mps" else torch.cuda.empty_cache()
        gc.collect()

    # Calculate average metrics
    avg = lambda k: sum(fold_metrics[k]) / N_FOLDS
    for key in ["train_loss", "train_acc", "val_loss", "val_acc", "best_epoch"]:
        trial.set_user_attr(f"avg_{key}", avg(key))

    wandb.finish()
    return avg("val_loss")


if __name__ == "__main__":
    multiprocessing.freeze_support()
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=1),
        study_name="eegformer_optuna_cv_3",
        storage="sqlite:////content/drive/MyDrive/2025_Lab_Research/eegformer_optuna_cv_3.db",
        load_if_exists=True
    )
    study.optimize(objective, n_trials=8)

    best = study.best_trial
    print("\n===== Best Trial Results =====")
    print(f"avg_val_loss   = {best.value:.6f}")
    print(f"avg_train_loss = {best.user_attrs['avg_train_loss']:.6f}")
    print(f"avg_train_acc  = {best.user_attrs['avg_train_acc']:.4f}")
    print(f"avg_val_acc    = {best.user_attrs['avg_val_acc']:.4f}")
    print(f"avg_best_epoch = {best.user_attrs['avg_best_epoch']:.1f}")
    print("best hyperparameters:")
    for k, v in best.params.items():
        print(f"  {k}: {v}")
    print("per-fold best metrics:")
    for f in range(N_FOLDS):
        print(
            f"  Fold {f}: epoch={best.user_attrs[f'fold{f}_best_epoch']}, "
            f"t_loss={best.user_attrs[f'fold{f}_train_loss']:.4f}, "
            f"t_acc={best.user_attrs[f'fold{f}_train_acc']:.4f}, "
            f"v_loss={best.user_attrs[f'fold{f}_val_loss']:.4f}, "
            f"v_acc={best.user_attrs[f'fold{f}_val_acc']:.4f}"
        )


[I 2025-04-28 01:54:11,503] A new study created in RDB with name: eegformer_optuna_cv_3



Testing with hyperparameters: {'lr': 6.273724067568952e-05, 'weight_decay': 5.8339962733943407e-05, 'num_filters': 120, 'rtm_blocks': 1, 'stm_blocks': 3, 'ttm_blocks': 1, 'rtm_heads': 4, 'stm_heads': 2, 'ttm_heads': 2, 'num_segments': 5}

[Fold 0] Epoch 001 | train_loss=1.0687 train_acc=0.4318 | val_loss=1.0820 val_acc=0.4317 | time=1308.5s
[Fold 0] Epoch 002 | train_loss=1.0534 train_acc=0.4590 | val_loss=1.0284 val_acc=0.5435 | time=61.5s
[Fold 0] Epoch 003 | train_loss=1.0157 train_acc=0.5153 | val_loss=1.0012 val_acc=0.5481 | time=64.1s
[Fold 0] Epoch 004 | train_loss=0.9863 train_acc=0.5480 | val_loss=0.9923 val_acc=0.5528 | time=62.7s
[Fold 0] Epoch 005 | train_loss=0.9767 train_acc=0.5515 | val_loss=0.9871 val_acc=0.5652 | time=60.3s
[Fold 0] Epoch 006 | train_loss=0.9585 train_acc=0.5666 | val_loss=0.9754 val_acc=0.5652 | time=60.8s
[Fold 0] Epoch 007 | train_loss=0.9426 train_acc=0.5689 | val_loss=0.9539 val_acc=0.5668 | time=61.8s
[Fold 0] Epoch 008 | train_loss=0.9240 train

[W 2025-04-28 03:07:14,457] Trial 0 failed with parameters: {'lr': 6.273724067568952e-05, 'weight_decay': 5.8339962733943407e-05, 'num_filters': 120, 'rtm_blocks': 1, 'stm_blocks': 3, 'ttm_blocks': 1, 'rtm_heads': 4, 'stm_heads': 2, 'ttm_heads': 2, 'num_segments': 5} because of the following error: The value None could not be cast to float..
[W 2025-04-28 03:07:14,458] Trial 0 failed with value None.


[Fold 0] Epoch 051 | train_loss=0.2137 train_acc=0.9227 | val_loss=0.5547 val_acc=0.8199 | time=61.4s
[Fold 0] Early stopping at epoch 51, best was 36



Testing with hyperparameters: {'lr': 2.1557163203444662e-05, 'weight_decay': 0.000159517315156258, 'num_filters': 120, 'rtm_blocks': 1, 'stm_blocks': 1, 'ttm_blocks': 3, 'rtm_heads': 2, 'stm_heads': 4, 'ttm_heads': 4, 'num_segments': 25}

[Fold 0] Epoch 001 | train_loss=1.0685 train_acc=0.4311 | val_loss=1.0841 val_acc=0.4317 | time=59.4s
[Fold 0] Epoch 002 | train_loss=1.0668 train_acc=0.4311 | val_loss=1.0834 val_acc=0.4317 | time=60.0s
[Fold 0] Epoch 003 | train_loss=1.0662 train_acc=0.4311 | val_loss=1.0836 val_acc=0.4317 | time=62.5s
[Fold 0] Epoch 004 | train_loss=1.0651 train_acc=0.4311 | val_loss=1.0864 val_acc=0.4317 | time=61.6s
[Fold 0] Epoch 005 | train_loss=1.0668 train_acc=0.4311 | val_loss=1.0853 val_acc=0.4317 | time=59.7s
[Fold 0] Epoch 006 | train_loss=1.0657 train_acc=0.4311 | val_loss=1.0818 val_acc=0.4317 | time=60.3s
[Fold 0] Epoch 007 | train_loss=1.0655 train_acc=0.4311 | val_loss=1.0818 val_acc=0.4317 | time=61.5s
[Fold 0] Epoch 008 | train_loss=1.0634 train_a

[W 2025-04-28 04:39:30,295] Trial 1 failed with parameters: {'lr': 2.1557163203444662e-05, 'weight_decay': 0.000159517315156258, 'num_filters': 120, 'rtm_blocks': 1, 'stm_blocks': 1, 'ttm_blocks': 3, 'rtm_heads': 2, 'stm_heads': 4, 'ttm_heads': 4, 'num_segments': 25} because of the following error: The value None could not be cast to float..
[W 2025-04-28 04:39:30,297] Trial 1 failed with value None.


[Fold 0] Epoch 094 | train_loss=0.1312 train_acc=0.9546 | val_loss=0.6095 val_acc=0.8199 | time=56.7s
[Fold 0] Early stopping at epoch 94, best was 79



Testing with hyperparameters: {'lr': 1.2400547483173527e-05, 'weight_decay': 7.79511927028319e-05, 'num_filters': 120, 'rtm_blocks': 3, 'stm_blocks': 3, 'ttm_blocks': 1, 'rtm_heads': 2, 'stm_heads': 4, 'ttm_heads': 4, 'num_segments': 15}

[Fold 0] Epoch 001 | train_loss=1.0852 train_acc=0.4155 | val_loss=1.0800 val_acc=0.4317 | time=72.3s
[Fold 0] Epoch 002 | train_loss=1.0672 train_acc=0.4311 | val_loss=1.0815 val_acc=0.4317 | time=72.3s
[Fold 0] Epoch 003 | train_loss=1.0667 train_acc=0.4311 | val_loss=1.0817 val_acc=0.4317 | time=72.3s
[Fold 0] Epoch 004 | train_loss=1.0649 train_acc=0.4311 | val_loss=1.0811 val_acc=0.4317 | time=71.7s
[Fold 0] Epoch 005 | train_loss=1.0639 train_acc=0.4311 | val_loss=1.0798 val_acc=0.4317 | time=72.2s
[Fold 0] Epoch 006 | train_loss=1.0617 train_acc=0.4330 | val_loss=1.0696 val_acc=0.4317 | time=71.7s
[Fold 0] Epoch 007 | train_loss=1.0396 train_acc=0.4913 | val_loss=1.0129 val_acc=0.5668 | time=72.2s
[Fold 0] Epoch 008 | train_loss=1.0156 train_a

In [None]:
from eeg_dataset import EEGDataset
from models import EEGformer

Attempting to create new mne-python configuration file:
/root/.mne/mne-python.json
Now using CUDA device 0
Enabling CUDA with 39.14 GiB available memory


In [None]:
import os
import json
import time
import gc
import multiprocessing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import StratifiedKFold

import optuna
import wandb

from eeg_dataset import EEGDataset
from models import EEGformer

# ─── Constants ─────────────────────────
LR_MIN, LR_MAX = 2e-5, 8e-5
WD_MIN, WD_MAX = 3e-5, 1e-4
FILTER_MIN = FILTER_MAX = 120

# blocks -> [1,2,3] / heads -> [2,3]
RTM_BLOCK_CHOICES = [1, 2, 3]
STM_BLOCK_CHOICES = [1, 2, 3]
TTM_BLOCK_CHOICES = [1, 2, 3]

RTM_HEAD_CHOICES = [2, 3, 4]
STM_HEAD_CHOICES = [2, 3, 4]
TTM_HEAD_CHOICES = [2, 3, 4]

SEGMENT_CHOICES = [5]

N_FOLDS     = 5
MAX_EPOCHS  = 100
PATIENCE    = 20
BATCH_SIZE  = 32
NUM_WORKERS = max(1, min(4, os.cpu_count() - 1))
DATA_DIR    = '/content/drive/MyDrive/2025_Lab_Research/model-data'
LABEL_FILE  = "labels.json"

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def objective(trial):
    # Hyperparameter search space setup
    lr = trial.suggest_float("lr", LR_MIN, LR_MAX, log=True)
    weight_decay = trial.suggest_float("weight_decay", WD_MIN, WD_MAX, log=True)
    num_filters = 120
    rtm_blocks = trial.suggest_categorical("rtm_blocks", RTM_BLOCK_CHOICES)
    stm_blocks = trial.suggest_categorical("stm_blocks", STM_BLOCK_CHOICES)
    ttm_blocks = trial.suggest_categorical("ttm_blocks", TTM_BLOCK_CHOICES)
    rtm_heads = trial.suggest_categorical("rtm_heads", RTM_HEAD_CHOICES)
    stm_heads = trial.suggest_categorical("stm_heads", STM_HEAD_CHOICES)
    ttm_heads = trial.suggest_categorical("ttm_heads", TTM_HEAD_CHOICES)
    num_segments = trial.suggest_categorical("num_segments", SEGMENT_CHOICES)

    # Pruning condition: only proceed if num_filters is divisible by heads
    for h in (rtm_heads, stm_heads, ttm_heads):
        if num_filters % h != 0:
            raise optuna.TrialPruned()

    # Initialize wandb
    wandb.init(project="eeg-cv-tuning-trial_10", config=trial.params)

    print(f"\n========================= Trial {trial.number} =========================")
    print(f"Testing with hyperparameters: {trial.params}")

    # Data preparation
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    train_meta = [d for d in all_meta if d["type"] == "train"]
    full_ds = EEGDataset(DATA_DIR, train_meta)
    labels = [d["label"] for d in train_meta]
    n_samples = len(full_ds)

    # StratifiedKFold setup
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    fold_metrics = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": [], "best_epoch": []}

    for fold, (train_idx, val_idx) in enumerate(skf.split(range(n_samples), labels)):
        # Fold separation
        print(f"\n========================= Fold {fold} =========================")

        # Data loader setup
        train_loader = DataLoader(Subset(full_ds, train_idx), batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
        val_loader = DataLoader(Subset(full_ds, val_idx), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

        # Model and optimizer setup
        model = EEGformer(
            num_classes=3,
            in_channels=19,
            kernel_size=10,
            num_filters=num_filters,
            rtm_blocks=rtm_blocks,
            stm_blocks=stm_blocks,
            ttm_blocks=ttm_blocks,
            rtm_heads=rtm_heads,
            stm_heads=stm_heads,
            ttm_heads=ttm_heads,
            num_segments=num_segments
        ).to(DEVICE)

        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        criterion = nn.CrossEntropyLoss()

        best_val_loss = float("inf")
        epochs_no_improve = 0
        best_epoch = 0
        best_train_l = best_train_a = best_val_a = None
        last_log_time = time.time()

        # Epoch-wise training
        for epoch in range(1, MAX_EPOCHS + 1):
            model.train()
            tl_sum = t_corr = t_tot = 0
            for X, y in train_loader:
                X, y = X.to(DEVICE), y.to(DEVICE)
                optimizer.zero_grad()
                logits = model(X)
                loss = criterion(logits, y)
                loss.backward()
                optimizer.step()
                tl_sum += loss.item()
                t_corr += (logits.argmax(1) == y).sum().item()
                t_tot += y.size(0)

            train_loss = tl_sum / len(train_loader)
            train_acc = t_corr / t_tot

            model.eval()
            vl_sum = v_corr = v_tot = 0
            with torch.no_grad():
                for X, y in val_loader:
                    X, y = X.to(DEVICE), y.to(DEVICE)
                    logits = model(X)
                    loss = criterion(logits, y)
                    vl_sum += loss.item()
                    v_corr += (logits.argmax(1) == y).sum().item()
                    v_tot += y.size(0)

            val_loss = vl_sum / len(val_loader)
            val_acc = v_corr / v_tot

            step = fold * MAX_EPOCHS + epoch
            trial.report(val_loss, step=step)

            # Pruning check
            if trial.should_prune():
                print(f"\u274c Trial {trial.number} pruned at fold {fold}, epoch {epoch}")
                # Report the metrics before returning early
                for k, v in zip(["train_loss", "train_acc", "val_loss", "val_acc", "best_epoch"],
                                 [best_train_l, best_train_a, best_val_loss, best_val_a, best_epoch]):
                    fold_metrics[k].append(v)
                    trial.set_user_attr(f"fold{fold}_{k}", v)
                raise optuna.TrialPruned()  # End trial completely if pruned

            now = time.time()
            print(f"[Fold {fold}] Epoch {epoch:03d} | train_loss={train_loss:.4f} train_acc={train_acc:.4f} | "
                  f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} | time={now - last_log_time:.1f}s")
            last_log_time = now

            # Early stopping: if validation loss does not improve
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_epoch = epoch
                epochs_no_improve = 0
                best_train_l = train_loss
                best_train_a = train_acc
                best_val_a = val_acc
            else:
                epochs_no_improve += 1
                if epochs_no_improve >= PATIENCE:
                    print(f"[Fold {fold}] Early stopping at epoch {epoch}, best was {best_epoch}")
                    # Report the metrics before returning early
                    for k, v in zip(["train_loss", "train_acc", "val_loss", "val_acc", "best_epoch"],
                                     [best_train_l, best_train_a, best_val_loss, best_val_a, best_epoch]):
                        fold_metrics[k].append(v)
                        trial.set_user_attr(f"fold{fold}_{k}", v)
                    return best_val_loss  # End trial completely if early stopping

        # Record results for the fold
        for k, v in zip(["train_loss", "train_acc", "val_loss", "val_acc", "best_epoch"],
                         [best_train_l, best_train_a, best_val_loss, best_val_a, best_epoch]):
            fold_metrics[k].append(v)
            trial.set_user_attr(f"fold{fold}_{k}", v)

        del model, optimizer, train_loader, val_loader
        torch.mps.empty_cache() if DEVICE.type == "mps" else torch.cuda.empty_cache()
        gc.collect()

    # Calculate average metrics
    avg = lambda k: sum(fold_metrics[k]) / N_FOLDS
    for key in ["train_loss", "train_acc", "val_loss", "val_acc", "best_epoch"]:
        trial.set_user_attr(f"avg_{key}", avg(key))

    wandb.finish()
    return avg("val_loss")


if __name__ == "__main__":
    multiprocessing.freeze_support()
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=1),
        study_name="eegformer_optuna_cv_4",
        storage="sqlite:////content/drive/MyDrive/2025_Lab_Research/eegformer_optuna_cv_4.db",
        load_if_exists=True
    )
    study.optimize(objective, n_trials=10)

    best = study.best_trial
    print("\n===== Best Trial Results =====")
    print(f"avg_val_loss   = {best.value:.6f}")
    print(f"avg_train_loss = {best.user_attrs['avg_train_loss']:.6f}")
    print(f"avg_train_acc  = {best.user_attrs['avg_train_acc']:.4f}")
    print(f"avg_val_acc    = {best.user_attrs['avg_val_acc']:.4f}")
    print(f"avg_best_epoch = {best.user_attrs['avg_best_epoch']:.1f}")
    print("best hyperparameters:")
    for k, v in best.params.items():
        print(f"  {k}: {v}")
    print("per-fold best metrics:")
    for f in range(N_FOLDS):
        print(
            f"  Fold {f}: epoch={best.user_attrs[f'fold{f}_best_epoch']}, "
            f"t_loss={best.user_attrs[f'fold{f}_train_loss']:.4f}, "
            f"t_acc={best.user_attrs[f'fold{f}_train_acc']:.4f}, "
            f"v_loss={best.user_attrs[f'fold{f}_val_loss']:.4f}, "
            f"v_acc={best.user_attrs[f'fold{f}_val_acc']:.4f}"
        )

    # ─── 전체 데이터(train)로 재학습 + loss·accuracy 출력 + 저장 ─────────────────
    print("\nRetraining on full TRAIN dataset with best params…")

    # 1) train 메타만 골라서 로드
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    full_meta   = [d for d in all_meta if d["type"] == "train"]
    full_ds     = EEGDataset(DATA_DIR, full_meta)
    full_loader = DataLoader(full_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)

    # 2) 모델·옵티마이저 재설정
    model = EEGformer(
        num_classes=3,
        in_channels=19,
        kernel_size=10,
        num_filters=120,
        rtm_blocks=best.params["rtm_blocks"],
        stm_blocks=best.params["stm_blocks"],
        ttm_blocks=best.params["ttm_blocks"],
        rtm_heads= best.params["rtm_heads"],
        stm_heads= best.params["stm_heads"],
        ttm_heads= best.params["ttm_heads"],
        num_segments=best.params["num_segments"]
    ).to(DEVICE)
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=best.params["lr"],
        weight_decay=best.params["weight_decay"]
    )
    criterion = nn.CrossEntropyLoss()

    # 3) MAX_EPOCHS 만큼 전체 학습하며 loss·accuracy 출력
    for epoch in range(1, MAX_EPOCHS + 1):
        model.train()
        loss_sum = 0.0
        correct  = 0
        total    = 0
        for X, y in full_loader:
            X, y = X.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            logits = model(X)
            loss   = criterion(logits, y)
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            correct  += (logits.argmax(1) == y).sum().item()
            total    += y.size(0)

        avg_loss = loss_sum / len(full_loader)
        acc      = correct / total
        print(f"[Full Train] Epoch {epoch:03d} | loss={avg_loss:.4f} | acc={acc:.4f}")

    # 4) 체크포인트 저장
    ckpt_dir = '/content/drive/MyDrive/2025_Lab_Research/checkpoints'
    os.makedirs(ckpt_dir, exist_ok=True)
    ckpt_path = os.path.join(ckpt_dir, 'eegformer_best.pth')
    torch.save(model.state_dict(), ckpt_path)
    print(f"💾 Saved best model to {ckpt_path}")


[I 2025-04-28 07:20:44,501] A new study created in RDB with name: eegformer_optuna_cv_4



Testing with hyperparameters: {'lr': 7.00045782400034e-05, 'weight_decay': 8.734906501291418e-05, 'rtm_blocks': 1, 'stm_blocks': 2, 'ttm_blocks': 2, 'rtm_heads': 2, 'stm_heads': 3, 'ttm_heads': 2, 'num_segments': 5}

[Fold 0] Epoch 001 | train_loss=1.0691 train_acc=0.4225 | val_loss=1.0870 val_acc=0.4317 | time=726.1s
[Fold 0] Epoch 002 | train_loss=1.0673 train_acc=0.4311 | val_loss=1.0835 val_acc=0.4317 | time=18.7s
[Fold 0] Epoch 003 | train_loss=1.0675 train_acc=0.4311 | val_loss=1.0879 val_acc=0.4317 | time=18.4s
[Fold 0] Epoch 004 | train_loss=1.0671 train_acc=0.4311 | val_loss=1.0844 val_acc=0.4317 | time=18.6s
[Fold 0] Epoch 005 | train_loss=1.0672 train_acc=0.4311 | val_loss=1.0842 val_acc=0.4317 | time=18.2s
[Fold 0] Epoch 006 | train_loss=1.0672 train_acc=0.4311 | val_loss=1.0854 val_acc=0.4317 | time=18.2s
[Fold 0] Epoch 007 | train_loss=1.0665 train_acc=0.4311 | val_loss=1.0836 val_acc=0.4317 | time=18.2s
[Fold 0] Epoch 008 | train_loss=1.0662 train_acc=0.4311 | val_loss=

[I 2025-04-28 07:52:34,910] Trial 0 finished with value: 0.7909612684022813 and parameters: {'lr': 7.00045782400034e-05, 'weight_decay': 8.734906501291418e-05, 'rtm_blocks': 1, 'stm_blocks': 2, 'ttm_blocks': 2, 'rtm_heads': 2, 'stm_heads': 3, 'ttm_heads': 2, 'num_segments': 5}. Best is trial 0 with value: 0.7909612684022813.


[Fold 0] Epoch 065 | train_loss=0.1856 train_acc=0.9262 | val_loss=1.1634 val_acc=0.7189 | time=17.5s
[Fold 0] Early stopping at epoch 65, best was 45



Testing with hyperparameters: {'lr': 4.9554878139823925e-05, 'weight_decay': 8.001690176093237e-05, 'rtm_blocks': 1, 'stm_blocks': 3, 'ttm_blocks': 2, 'rtm_heads': 2, 'stm_heads': 2, 'ttm_heads': 2, 'num_segments': 5}

[Fold 0] Epoch 001 | train_loss=1.0684 train_acc=0.4311 | val_loss=1.0891 val_acc=0.4317 | time=17.9s
[Fold 0] Epoch 002 | train_loss=1.0669 train_acc=0.4311 | val_loss=1.0846 val_acc=0.4317 | time=17.7s
[Fold 0] Epoch 003 | train_loss=1.0657 train_acc=0.4311 | val_loss=1.0803 val_acc=0.4317 | time=17.6s
[Fold 0] Epoch 004 | train_loss=1.0626 train_acc=0.4338 | val_loss=1.0817 val_acc=0.4317 | time=17.9s
[Fold 0] Epoch 005 | train_loss=1.0621 train_acc=0.4361 | val_loss=1.0733 val_acc=0.4410 | time=17.8s
[Fold 0] Epoch 006 | train_loss=1.0556 train_acc=0.4408 | val_loss=1.0692 val_acc=0.4394 | time=17.7s
[Fold 0] Epoch 007 | train_loss=1.0537 train_acc=0.4396 | val_loss=1.0701 val_acc=0.4410 | time=17.8s
[Fold 0] Epoch 008 | train_loss=1.0549 train_acc=0.4381 | val_loss

[I 2025-04-28 08:17:28,502] Trial 1 finished with value: 0.659756180076372 and parameters: {'lr': 4.9554878139823925e-05, 'weight_decay': 8.001690176093237e-05, 'rtm_blocks': 1, 'stm_blocks': 3, 'ttm_blocks': 2, 'rtm_heads': 2, 'stm_heads': 2, 'ttm_heads': 2, 'num_segments': 5}. Best is trial 1 with value: 0.659756180076372.


[Fold 0] Epoch 084 | train_loss=0.3070 train_acc=0.8819 | val_loss=0.7979 val_acc=0.7438 | time=17.8s
[Fold 0] Early stopping at epoch 84, best was 64



Testing with hyperparameters: {'lr': 4.573338352305618e-05, 'weight_decay': 6.452939826551289e-05, 'rtm_blocks': 1, 'stm_blocks': 2, 'ttm_blocks': 3, 'rtm_heads': 2, 'stm_heads': 2, 'ttm_heads': 3, 'num_segments': 5}

[Fold 0] Epoch 001 | train_loss=1.0695 train_acc=0.4272 | val_loss=1.0843 val_acc=0.4317 | time=18.0s
[Fold 0] Epoch 002 | train_loss=1.0662 train_acc=0.4311 | val_loss=1.0843 val_acc=0.4317 | time=18.1s
[Fold 0] Epoch 003 | train_loss=1.0664 train_acc=0.4311 | val_loss=1.0828 val_acc=0.4317 | time=17.9s
[Fold 0] Epoch 004 | train_loss=1.0670 train_acc=0.4311 | val_loss=1.0826 val_acc=0.4317 | time=17.6s
[Fold 0] Epoch 005 | train_loss=1.0650 train_acc=0.4330 | val_loss=1.0809 val_acc=0.4317 | time=17.7s
[Fold 0] Epoch 006 | train_loss=1.0398 train_acc=0.4835 | val_loss=0.9997 val_acc=0.5730 | time=17.5s
[Fold 0] Epoch 007 | train_loss=0.9702 train_acc=0.5588 | val_loss=0.9545 val_acc=0.5885 | time=17.4s
[Fold 0] Epoch 008 | train_loss=0.9485 train_acc=0.5775 | val_loss=

[I 2025-04-28 08:38:38,644] Trial 2 finished with value: 0.6143529968602317 and parameters: {'lr': 4.573338352305618e-05, 'weight_decay': 6.452939826551289e-05, 'rtm_blocks': 1, 'stm_blocks': 2, 'ttm_blocks': 3, 'rtm_heads': 2, 'stm_heads': 2, 'ttm_heads': 3, 'num_segments': 5}. Best is trial 2 with value: 0.6143529968602317.


[Fold 0] Epoch 072 | train_loss=0.2957 train_acc=0.8734 | val_loss=0.7236 val_acc=0.7609 | time=17.2s
[Fold 0] Early stopping at epoch 72, best was 52



Testing with hyperparameters: {'lr': 4.6507943057617634e-05, 'weight_decay': 5.965386496463809e-05, 'rtm_blocks': 2, 'stm_blocks': 2, 'ttm_blocks': 1, 'rtm_heads': 2, 'stm_heads': 3, 'ttm_heads': 3, 'num_segments': 5}

[Fold 0] Epoch 001 | train_loss=1.0724 train_acc=0.4237 | val_loss=1.0827 val_acc=0.4317 | time=17.7s
[Fold 0] Epoch 002 | train_loss=1.0665 train_acc=0.4311 | val_loss=1.0806 val_acc=0.4317 | time=17.9s
[Fold 0] Epoch 003 | train_loss=1.0670 train_acc=0.4311 | val_loss=1.0829 val_acc=0.4317 | time=18.0s
[Fold 0] Epoch 004 | train_loss=1.0665 train_acc=0.4346 | val_loss=1.0801 val_acc=0.4317 | time=17.7s
[Fold 0] Epoch 005 | train_loss=1.0662 train_acc=0.4311 | val_loss=1.0803 val_acc=0.4317 | time=17.7s
[Fold 0] Epoch 006 | train_loss=1.0669 train_acc=0.4311 | val_loss=1.0760 val_acc=0.4317 | time=17.8s
[Fold 0] Epoch 007 | train_loss=1.0528 train_acc=0.4641 | val_loss=1.0627 val_acc=0.4736 | time=17.7s
[Fold 0] Epoch 008 | train_loss=1.0134 train_acc=0.5309 | val_loss

[I 2025-04-28 09:00:30,740] Trial 3 finished with value: 0.6841881253889629 and parameters: {'lr': 4.6507943057617634e-05, 'weight_decay': 5.965386496463809e-05, 'rtm_blocks': 2, 'stm_blocks': 2, 'ttm_blocks': 1, 'rtm_heads': 2, 'stm_heads': 3, 'ttm_heads': 3, 'num_segments': 5}. Best is trial 2 with value: 0.6143529968602317.


[Fold 0] Epoch 073 | train_loss=0.2454 train_acc=0.9076 | val_loss=0.8950 val_acc=0.7469 | time=18.0s
[Fold 0] Early stopping at epoch 73, best was 53



Testing with hyperparameters: {'lr': 3.230793658846425e-05, 'weight_decay': 5.926736468642624e-05, 'rtm_blocks': 3, 'stm_blocks': 3, 'ttm_blocks': 3, 'rtm_heads': 3, 'stm_heads': 3, 'ttm_heads': 2, 'num_segments': 5}

[Fold 0] Epoch 001 | train_loss=1.0760 train_acc=0.4190 | val_loss=1.0810 val_acc=0.4317 | time=19.2s
[Fold 0] Epoch 002 | train_loss=1.0667 train_acc=0.4311 | val_loss=1.0843 val_acc=0.4317 | time=19.2s
[Fold 0] Epoch 003 | train_loss=1.0673 train_acc=0.4311 | val_loss=1.0840 val_acc=0.4317 | time=18.9s
[Fold 0] Epoch 004 | train_loss=1.0674 train_acc=0.4311 | val_loss=1.0829 val_acc=0.4317 | time=19.1s
[Fold 0] Epoch 005 | train_loss=1.0661 train_acc=0.4311 | val_loss=1.0794 val_acc=0.4317 | time=19.0s
[Fold 0] Epoch 006 | train_loss=1.0636 train_acc=0.4342 | val_loss=1.0596 val_acc=0.4876 | time=19.4s
[Fold 0] Epoch 007 | train_loss=1.0326 train_acc=0.5091 | val_loss=1.0104 val_acc=0.5512 | time=19.0s
[Fold 0] Epoch 008 | train_loss=0.9775 train_acc=0.5647 | val_loss=

[I 2025-04-28 10:30:31,085] Trial 4 finished with value: 0.6840153265567053 and parameters: {'lr': 3.230793658846425e-05, 'weight_decay': 5.926736468642624e-05, 'rtm_blocks': 3, 'stm_blocks': 3, 'ttm_blocks': 3, 'rtm_heads': 3, 'stm_heads': 3, 'ttm_heads': 2, 'num_segments': 5}. Best is trial 2 with value: 0.6143529968602317.


[Fold 2] Epoch 079 | train_loss=0.4713 train_acc=0.7973 | val_loss=0.7145 val_acc=0.6910 | time=19.2s
[Fold 2] Early stopping at epoch 79, best was 59



Testing with hyperparameters: {'lr': 7.961654507774643e-05, 'weight_decay': 6.2742708650523e-05, 'rtm_blocks': 1, 'stm_blocks': 1, 'ttm_blocks': 3, 'rtm_heads': 2, 'stm_heads': 2, 'ttm_heads': 2, 'num_segments': 5}



[I 2025-04-28 10:30:52,207] Trial 5 pruned. 


❌ Trial 5 pruned at fold 0, epoch 1



Testing with hyperparameters: {'lr': 4.900194549517511e-05, 'weight_decay': 5.1682408123998705e-05, 'rtm_blocks': 1, 'stm_blocks': 3, 'ttm_blocks': 1, 'rtm_heads': 3, 'stm_heads': 3, 'ttm_heads': 3, 'num_segments': 5}



[I 2025-04-28 10:31:13,790] Trial 6 pruned. 


❌ Trial 6 pruned at fold 0, epoch 1



Testing with hyperparameters: {'lr': 7.757394242051711e-05, 'weight_decay': 4.7076266580008185e-05, 'rtm_blocks': 1, 'stm_blocks': 2, 'ttm_blocks': 2, 'rtm_heads': 3, 'stm_heads': 2, 'ttm_heads': 3, 'num_segments': 5}

[Fold 0] Epoch 001 | train_loss=1.0689 train_acc=0.4311 | val_loss=1.0835 val_acc=0.4317 | time=17.6s
[Fold 0] Epoch 002 | train_loss=1.0670 train_acc=0.4311 | val_loss=1.0840 val_acc=0.4317 | time=17.7s


[I 2025-04-28 10:32:10,128] Trial 7 pruned. 


❌ Trial 7 pruned at fold 0, epoch 3



Testing with hyperparameters: {'lr': 2.8656576209052045e-05, 'weight_decay': 6.975343786437316e-05, 'rtm_blocks': 2, 'stm_blocks': 2, 'ttm_blocks': 2, 'rtm_heads': 3, 'stm_heads': 3, 'ttm_heads': 3, 'num_segments': 5}

[Fold 0] Epoch 001 | train_loss=1.0683 train_acc=0.4311 | val_loss=1.0800 val_acc=0.4317 | time=18.0s
[Fold 0] Epoch 002 | train_loss=1.0678 train_acc=0.4311 | val_loss=1.0813 val_acc=0.4317 | time=17.8s
[Fold 0] Epoch 003 | train_loss=1.0669 train_acc=0.4311 | val_loss=1.0842 val_acc=0.4317 | time=18.0s
[Fold 0] Epoch 004 | train_loss=1.0670 train_acc=0.4315 | val_loss=1.0857 val_acc=0.4317 | time=18.1s
[Fold 0] Epoch 005 | train_loss=1.0650 train_acc=0.4311 | val_loss=1.0864 val_acc=0.4317 | time=18.2s


[I 2025-04-28 10:34:01,857] Trial 8 pruned. 


❌ Trial 8 pruned at fold 0, epoch 6



Testing with hyperparameters: {'lr': 5.956687649545858e-05, 'weight_decay': 3.544220567159663e-05, 'rtm_blocks': 2, 'stm_blocks': 1, 'ttm_blocks': 3, 'rtm_heads': 2, 'stm_heads': 2, 'ttm_heads': 3, 'num_segments': 5}

[Fold 0] Epoch 001 | train_loss=1.0691 train_acc=0.4311 | val_loss=1.0825 val_acc=0.4317 | time=17.7s
[Fold 0] Epoch 002 | train_loss=1.0667 train_acc=0.4311 | val_loss=1.0808 val_acc=0.4317 | time=17.7s
[Fold 0] Epoch 003 | train_loss=1.0668 train_acc=0.4311 | val_loss=1.0794 val_acc=0.4317 | time=17.6s
[Fold 0] Epoch 004 | train_loss=1.0630 train_acc=0.4439 | val_loss=1.0588 val_acc=0.5326 | time=17.7s
[Fold 0] Epoch 005 | train_loss=1.0626 train_acc=0.4276 | val_loss=1.0787 val_acc=0.4317 | time=17.5s
[Fold 0] Epoch 006 | train_loss=1.0507 train_acc=0.4567 | val_loss=1.1327 val_acc=0.3432 | time=17.5s
[Fold 0] Epoch 007 | train_loss=1.0730 train_acc=0.4175 | val_loss=1.0827 val_acc=0.4317 | time=17.8s


[I 2025-04-28 10:36:27,042] Trial 9 pruned. 


❌ Trial 9 pruned at fold 0, epoch 8

===== Best Trial Results =====
avg_val_loss   = 0.614353


KeyError: 'avg_train_loss'

In [None]:
import os
import json
import time
import gc
import multiprocessing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import StratifiedKFold

import optuna
import wandb

from eeg_dataset import EEGDataset
from models import EEGformer

# ─── Constants ─────────────────────────
LR_MIN, LR_MAX = 2e-5, 8e-5
WD_MIN, WD_MAX = 3e-5, 1e-4
FILTER_MIN = FILTER_MAX = 120

# blocks -> [1,2,3] / heads -> [2,3,4]
RTM_BLOCK_CHOICES = [1, 2, 3]
STM_BLOCK_CHOICES = [1, 2, 3]
TTM_BLOCK_CHOICES = [1, 2, 3]

RTM_HEAD_CHOICES = [2, 3, 4]
STM_HEAD_CHOICES = [2, 3, 4]
TTM_HEAD_CHOICES = [2, 3, 4]

SEGMENT_CHOICES = [5, 15]

N_FOLDS     = 5
MAX_EPOCHS  = 100
PATIENCE    = 20
BATCH_SIZE  = 32
NUM_WORKERS = max(1, min(4, os.cpu_count() - 1))
DATA_DIR    = '/content/drive/MyDrive/2025_Lab_Research/model-data'
LABEL_FILE  = "labels.json"

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def objective(trial):
    # Hyperparameter search space setup
    lr = trial.suggest_float("lr", LR_MIN, LR_MAX, log=True)
    weight_decay = trial.suggest_float("weight_decay", WD_MIN, WD_MAX, log=True)
    num_filters = 120
    rtm_blocks = trial.suggest_categorical("rtm_blocks", RTM_BLOCK_CHOICES)
    stm_blocks = trial.suggest_categorical("stm_blocks", STM_BLOCK_CHOICES)
    ttm_blocks = trial.suggest_categorical("ttm_blocks", TTM_BLOCK_CHOICES)
    rtm_heads = trial.suggest_categorical("rtm_heads", RTM_HEAD_CHOICES)
    stm_heads = trial.suggest_categorical("stm_heads", STM_HEAD_CHOICES)
    ttm_heads = trial.suggest_categorical("ttm_heads", TTM_HEAD_CHOICES)
    num_segments = trial.suggest_categorical("num_segments", SEGMENT_CHOICES)

    # Pruning condition: only proceed if num_filters is divisible by heads
    for h in (rtm_heads, stm_heads, ttm_heads):
        if num_filters % h != 0:
            raise optuna.TrialPruned()

    # Initialize wandb
    wandb.init(project="eeg-cv-tuning-trial_12", config=trial.params)

    print(f"\n========================= Trial {trial.number} =========================")
    print(f"Testing with hyperparameters: {trial.params}")

    # Data preparation
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    train_meta = [d for d in all_meta if d["type"] == "train"]
    full_ds = EEGDataset(DATA_DIR, train_meta)
    labels = [d["label"] for d in train_meta]
    n_samples = len(full_ds)

    # StratifiedKFold setup
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    fold_metrics = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": [], "best_epoch": []}

    for fold, (train_idx, val_idx) in enumerate(skf.split(range(n_samples), labels)):
        # Fold separation
        print(f"\n========================= Fold {fold} =========================")

        # Data loader setup
        train_loader = DataLoader(Subset(full_ds, train_idx), batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
        val_loader = DataLoader(Subset(full_ds, val_idx), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

        # Model and optimizer setup
        model = EEGformer(
            num_classes=3,
            in_channels=19,
            kernel_size=10,
            num_filters=num_filters,
            rtm_blocks=rtm_blocks,
            stm_blocks=stm_blocks,
            ttm_blocks=ttm_blocks,
            rtm_heads=rtm_heads,
            stm_heads=stm_heads,
            ttm_heads=ttm_heads,
            num_segments=num_segments
        ).to(DEVICE)

        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        criterion = nn.CrossEntropyLoss()

        best_val_loss = float("inf")
        epochs_no_improve = 0
        best_epoch = 0
        best_train_l = best_train_a = best_val_a = None
        last_log_time = time.time()

        # Epoch-wise training
        for epoch in range(1, MAX_EPOCHS + 1):
            model.train()
            tl_sum = t_corr = t_tot = 0
            for X, y in train_loader:
                X, y = X.to(DEVICE), y.to(DEVICE)
                optimizer.zero_grad()
                logits = model(X)
                loss = criterion(logits, y)
                loss.backward()
                optimizer.step()
                tl_sum += loss.item()
                t_corr += (logits.argmax(1) == y).sum().item()
                t_tot += y.size(0)

            train_loss = tl_sum / len(train_loader)
            train_acc = t_corr / t_tot

            model.eval()
            vl_sum = v_corr = v_tot = 0
            with torch.no_grad():
                for X, y in val_loader:
                    X, y = X.to(DEVICE), y.to(DEVICE)
                    logits = model(X)
                    loss = criterion(logits, y)
                    vl_sum += loss.item()
                    v_corr += (logits.argmax(1) == y).sum().item()
                    v_tot += y.size(0)

            val_loss = vl_sum / len(val_loader)
            val_acc = v_corr / v_tot

            step = fold * MAX_EPOCHS + epoch
            trial.report(val_loss, step=step)

            # Pruning check
            if trial.should_prune():
                print(f"\u274c Trial {trial.number} pruned at fold {fold}, epoch {epoch}")
                # Report the metrics before returning early
                for k, v in zip(["train_loss", "train_acc", "val_loss", "val_acc", "best_epoch"],
                                 [best_train_l, best_train_a, best_val_loss, best_val_a, best_epoch]):
                    fold_metrics[k].append(v)
                    trial.set_user_attr(f"fold{fold}_{k}", v)
                raise optuna.TrialPruned()  # End trial completely if pruned

            now = time.time()
            print(f"[Fold {fold}] Epoch {epoch:03d} | train_loss={train_loss:.4f} train_acc={train_acc:.4f} | "
                  f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} | time={now - last_log_time:.1f}s")
            last_log_time = now

            # Early stopping: if validation loss does not improve
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_epoch = epoch
                epochs_no_improve = 0
                best_train_l = train_loss
                best_train_a = train_acc
                best_val_a = val_acc
            else:
                epochs_no_improve += 1
                if epochs_no_improve >= PATIENCE:
                    print(f"[Fold {fold}] Early stopping at epoch {epoch}, best was {best_epoch}")
                    # Report the metrics before returning early
                    for k, v in zip(["train_loss", "train_acc", "val_loss", "val_acc", "best_epoch"],
                                     [best_train_l, best_train_a, best_val_loss, best_val_a, best_epoch]):
                        fold_metrics[k].append(v)
                        trial.set_user_attr(f"fold{fold}_{k}", v)
                    break  # Break if early stopping

        # Record results for the fold
        for k, v in zip(["train_loss", "train_acc", "val_loss", "val_acc", "best_epoch"],
                         [best_train_l, best_train_a, best_val_loss, best_val_a, best_epoch]):
            fold_metrics[k].append(v)
            trial.set_user_attr(f"fold{fold}_{k}", v)

        del model, optimizer, train_loader, val_loader
        torch.mps.empty_cache() if DEVICE.type == "mps" else torch.cuda.empty_cache()
        gc.collect()

    # Calculate average metrics
    avg = lambda k: sum(fold_metrics[k]) / N_FOLDS
    for key in ["train_loss", "train_acc", "val_loss", "val_acc", "best_epoch"]:
        trial.set_user_attr(f"avg_{key}", avg(key))

    wandb.finish()
    return avg("val_loss")


if __name__ == "__main__":
    multiprocessing.freeze_support()
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=1),
        study_name="eegformer_optuna_cv_4",
        storage="sqlite:////content/drive/MyDrive/2025_Lab_Research/eegformer_optuna_cv_5.db",
        load_if_exists=True
    )
    study.optimize(objective, n_trials=10)

    best = study.best_trial
    print("\n===== Best Trial Results =====")
    print(f"avg_val_loss   = {best.value:.6f}")
    print(f"avg_train_loss = {best.user_attrs['avg_train_loss']:.6f}")
    print(f"avg_train_acc  = {best.user_attrs['avg_train_acc']:.4f}")
    print(f"avg_val_acc    = {best.user_attrs['avg_val_acc']:.4f}")
    print(f"avg_best_epoch = {best.user_attrs['avg_best_epoch']:.1f}")
    print("best hyperparameters:")
    for k, v in best.params.items():
        print(f"  {k}: {v}")
    print("per-fold best metrics:")
    for f in range(N_FOLDS):
        print(
            f"  Fold {f}: epoch={best.user_attrs[f'fold{f}_best_epoch']}, "
            f"t_loss={best.user_attrs[f'fold{f}_train_loss']:.4f}, "
            f"t_acc={best.user_attrs[f'fold{f}_train_acc']:.4f}, "
            f"v_loss={best.user_attrs[f'fold{f}_val_loss']:.4f}, "
            f"v_acc={best.user_attrs[f'fold{f}_val_acc']:.4f}"
        )

    # ─── 전체 데이터(train)로 재학습 + loss·accuracy 출력 + 저장 ─────────────────
    print("\nRetraining on full TRAIN dataset with best params…")

    # 1) train 메타만 골라서 로드
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    full_meta   = [d for d in all_meta if d["type"] == "train"]
    full_ds     = EEGDataset(DATA_DIR, full_meta)
    full_loader = DataLoader(full_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)

    # 2) 모델·옵티마이저 재설정
    model = EEGformer(
        num_classes=3,
        in_channels=19,
        kernel_size=10,
        num_filters=120,
        rtm_blocks=best.params["rtm_blocks"],
        stm_blocks=best.params["stm_blocks"],
        ttm_blocks=best.params["ttm_blocks"],
        rtm_heads= best.params["rtm_heads"],
        stm_heads= best.params["stm_heads"],
        ttm_heads= best.params["ttm_heads"],
        num_segments=best.params["num_segments"]
    ).to(DEVICE)
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=best.params["lr"],
        weight_decay=best.params["weight_decay"]
    )
    criterion = nn.CrossEntropyLoss()

    # 3) MAX_EPOCHS 만큼 전체 학습하며 loss·accuracy 출력
    for epoch in range(1, MAX_EPOCHS + 1):
        model.train()
        loss_sum = 0.0
        correct  = 0
        total    = 0
        for X, y in full_loader:
            X, y = X.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            logits = model(X)
            loss   = criterion(logits, y)
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            correct  += (logits.argmax(1) == y).sum().item()
            total    += y.size(0)

        avg_loss = loss_sum / len(full_loader)
        acc      = correct / total
        print(f"[Full Train] Epoch {epoch:03d} | loss={avg_loss:.4f} | acc={acc:.4f}")

    # 4) 체크포인트 저장
    ckpt_dir = '/content/drive/MyDrive/2025_Lab_Research/checkpoints'
    os.makedirs(ckpt_dir, exist_ok=True)
    ckpt_path = os.path.join(ckpt_dir, 'eegformer_best.pth')
    torch.save(model.state_dict(), ckpt_path)
    print(f"💾 Saved best model to {ckpt_path}")


Attempting to create new mne-python configuration file:
/root/.mne/mne-python.json
Now using CUDA device 0
Enabling CUDA with 39.14 GiB available memory


[I 2025-04-28 21:33:58,944] A new study created in RDB with name: eegformer_optuna_cv_4



Testing with hyperparameters: {'lr': 3.306335426726131e-05, 'weight_decay': 6.37539648704434e-05, 'rtm_blocks': 3, 'stm_blocks': 2, 'ttm_blocks': 3, 'rtm_heads': 3, 'stm_heads': 3, 'ttm_heads': 2, 'num_segments': 15}

[Fold 0] Epoch 001 | train_loss=1.0683 train_acc=0.4311 | val_loss=1.0845 val_acc=0.4317 | time=250.0s
[Fold 0] Epoch 002 | train_loss=1.0669 train_acc=0.4311 | val_loss=1.0853 val_acc=0.4317 | time=18.0s
[Fold 0] Epoch 003 | train_loss=1.0669 train_acc=0.4311 | val_loss=1.0818 val_acc=0.4317 | time=18.1s
[Fold 0] Epoch 004 | train_loss=1.0666 train_acc=0.4311 | val_loss=1.0820 val_acc=0.4317 | time=18.1s
[Fold 0] Epoch 005 | train_loss=1.0663 train_acc=0.4311 | val_loss=1.0851 val_acc=0.4317 | time=18.2s
[Fold 0] Epoch 006 | train_loss=1.0663 train_acc=0.4311 | val_loss=1.0822 val_acc=0.4317 | time=18.1s
[Fold 0] Epoch 007 | train_loss=1.0677 train_acc=0.4311 | val_loss=1.0823 val_acc=0.4317 | time=17.9s
[Fold 0] Epoch 008 | train_loss=1.0672 train_acc=0.4311 | val_loss

[I 2025-04-28 23:45:36,764] Trial 0 finished with value: 1.23324632417588 and parameters: {'lr': 3.306335426726131e-05, 'weight_decay': 6.37539648704434e-05, 'rtm_blocks': 3, 'stm_blocks': 2, 'ttm_blocks': 3, 'rtm_heads': 3, 'stm_heads': 3, 'ttm_heads': 2, 'num_segments': 15}. Best is trial 0 with value: 1.23324632417588.



Testing with hyperparameters: {'lr': 5.4741533345840924e-05, 'weight_decay': 6.0702711367702194e-05, 'rtm_blocks': 1, 'stm_blocks': 1, 'ttm_blocks': 2, 'rtm_heads': 3, 'stm_heads': 4, 'ttm_heads': 2, 'num_segments': 5}

[Fold 0] Epoch 001 | train_loss=1.0832 train_acc=0.4171 | val_loss=1.0814 val_acc=0.4317 | time=17.3s
[Fold 0] Epoch 002 | train_loss=1.0687 train_acc=0.4276 | val_loss=1.0813 val_acc=0.4317 | time=17.2s
[Fold 0] Epoch 003 | train_loss=1.0653 train_acc=0.4361 | val_loss=1.0778 val_acc=0.4317 | time=17.5s
[Fold 0] Epoch 004 | train_loss=1.0536 train_acc=0.4571 | val_loss=1.0373 val_acc=0.5435 | time=17.5s
[Fold 0] Epoch 005 | train_loss=1.0091 train_acc=0.5297 | val_loss=1.0274 val_acc=0.5233 | time=17.6s
[Fold 0] Epoch 006 | train_loss=0.9954 train_acc=0.5371 | val_loss=0.9988 val_acc=0.5512 | time=17.8s
[Fold 0] Epoch 007 | train_loss=0.9675 train_acc=0.5623 | val_loss=0.9701 val_acc=0.5745 | time=17.4s
[Fold 0] Epoch 008 | train_loss=0.9667 train_acc=0.5550 | val_los

[W 2025-04-29 00:46:21,576] Trial 1 failed with parameters: {'lr': 5.4741533345840924e-05, 'weight_decay': 6.0702711367702194e-05, 'rtm_blocks': 1, 'stm_blocks': 1, 'ttm_blocks': 2, 'rtm_heads': 3, 'stm_heads': 4, 'ttm_heads': 2, 'num_segments': 5} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-8-d413b19da3d1>", line 123, in objective
    tl_sum += loss.item()
              ^^^^^^^^^^^
KeyboardInterrupt
[W 2025-04-29 00:46:21,579] Trial 1 failed with value None.


KeyboardInterrupt: 

### Optuna 범위 줄여보기

- RTM Block = [1, 2]
- STM Block = [2, 3]
- TTM Block = [1, 2]

- RTM Head = [2, 4]
- STM Head = [2, 4]
- TTM Head = [2, 4]

- Segment Choices = [5]

In [None]:
import os
import json
import time
import gc
import multiprocessing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import StratifiedKFold

import optuna
import wandb

from eeg_dataset import EEGDataset
from models import EEGformer

# ─── Constants ─────────────────────────
LR_MIN, LR_MAX = 2e-5, 8e-5
WD_MIN, WD_MAX = 3e-5, 1e-4
FILTER_MIN = FILTER_MAX = 120

# blocks -> [1,2,3] / heads -> [2,3,4]
RTM_BLOCK_CHOICES = [1, 2]
STM_BLOCK_CHOICES = [2, 3]
TTM_BLOCK_CHOICES = [1, 2]

RTM_HEAD_CHOICES = [2, 4]
STM_HEAD_CHOICES = [2, 4]
TTM_HEAD_CHOICES = [2, 4]

SEGMENT_CHOICES = [5]

N_FOLDS     = 5
MAX_EPOCHS  = 100
PATIENCE    = 20
BATCH_SIZE  = 32
NUM_WORKERS = max(1, min(4, os.cpu_count() - 1))
DATA_DIR    = '/content/drive/MyDrive/2025_Lab_Research/model-data'
LABEL_FILE  = "labels.json"

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def objective(trial):
    # Hyperparameter search space setup
    lr = trial.suggest_float("lr", LR_MIN, LR_MAX, log=True)
    weight_decay = trial.suggest_float("weight_decay", WD_MIN, WD_MAX, log=True)
    num_filters = 120
    rtm_blocks = trial.suggest_categorical("rtm_blocks", RTM_BLOCK_CHOICES)
    stm_blocks = trial.suggest_categorical("stm_blocks", STM_BLOCK_CHOICES)
    ttm_blocks = trial.suggest_categorical("ttm_blocks", TTM_BLOCK_CHOICES)
    rtm_heads = trial.suggest_categorical("rtm_heads", RTM_HEAD_CHOICES)
    stm_heads = trial.suggest_categorical("stm_heads", STM_HEAD_CHOICES)
    ttm_heads = trial.suggest_categorical("ttm_heads", TTM_HEAD_CHOICES)
    num_segments = trial.suggest_categorical("num_segments", SEGMENT_CHOICES)

    # Pruning condition: only proceed if num_filters is divisible by heads
    for h in (rtm_heads, stm_heads, ttm_heads):
        if num_filters % h != 0:
            raise optuna.TrialPruned()

    # Initialize wandb
    wandb.init(project="eeg-cv-tuning-trial_12", config=trial.params)

    print(f"\n========================= Trial {trial.number} =========================")
    print(f"Testing with hyperparameters: {trial.params}")

    # Data preparation
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    train_meta = [d for d in all_meta if d["type"] == "train"]
    full_ds = EEGDataset(DATA_DIR, train_meta)
    labels = [d["label"] for d in train_meta]
    n_samples = len(full_ds)

    # StratifiedKFold setup
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    fold_metrics = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": [], "best_epoch": []}

    for fold, (train_idx, val_idx) in enumerate(skf.split(range(n_samples), labels)):
        # Fold separation
        print(f"\n========================= Fold {fold} =========================")

        # Data loader setup
        train_loader = DataLoader(Subset(full_ds, train_idx), batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
        val_loader = DataLoader(Subset(full_ds, val_idx), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

        # Model and optimizer setup
        model = EEGformer(
            num_classes=3,
            in_channels=19,
            kernel_size=10,
            num_filters=num_filters,
            rtm_blocks=rtm_blocks,
            stm_blocks=stm_blocks,
            ttm_blocks=ttm_blocks,
            rtm_heads=rtm_heads,
            stm_heads=stm_heads,
            ttm_heads=ttm_heads,
            num_segments=num_segments
        ).to(DEVICE)

        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        criterion = nn.CrossEntropyLoss()

        best_val_loss = float("inf")
        epochs_no_improve = 0
        best_epoch = 0
        best_train_l = best_train_a = best_val_a = None
        last_log_time = time.time()

        # Epoch-wise training
        for epoch in range(1, MAX_EPOCHS + 1):
            model.train()
            tl_sum = t_corr = t_tot = 0
            for X, y in train_loader:
                X, y = X.to(DEVICE), y.to(DEVICE)
                optimizer.zero_grad()
                logits = model(X)
                loss = criterion(logits, y)
                loss.backward()
                optimizer.step()
                tl_sum += loss.item()
                t_corr += (logits.argmax(1) == y).sum().item()
                t_tot += y.size(0)

            train_loss = tl_sum / len(train_loader)
            train_acc = t_corr / t_tot

            model.eval()
            vl_sum = v_corr = v_tot = 0
            with torch.no_grad():
                for X, y in val_loader:
                    X, y = X.to(DEVICE), y.to(DEVICE)
                    logits = model(X)
                    loss = criterion(logits, y)
                    vl_sum += loss.item()
                    v_corr += (logits.argmax(1) == y).sum().item()
                    v_tot += y.size(0)

            val_loss = vl_sum / len(val_loader)
            val_acc = v_corr / v_tot

            step = fold * MAX_EPOCHS + epoch
            trial.report(val_loss, step=step)

            # Pruning check
            if trial.should_prune():
                print(f"\u274c Trial {trial.number} pruned at fold {fold}, epoch {epoch}")
                # Report the metrics before returning early
                for k, v in zip(["train_loss", "train_acc", "val_loss", "val_acc", "best_epoch"],
                                 [best_train_l, best_train_a, best_val_loss, best_val_a, best_epoch]):
                    fold_metrics[k].append(v)
                    trial.set_user_attr(f"fold{fold}_{k}", v)
                raise optuna.TrialPruned()  # End trial completely if pruned

            now = time.time()
            print(f"[Fold {fold}] Epoch {epoch:03d} | train_loss={train_loss:.4f} train_acc={train_acc:.4f} | "
                  f"val_loss={val_loss:.4f} val_acc={val_acc:.4f} | time={now - last_log_time:.1f}s")
            last_log_time = now

            # Early stopping: if validation loss does not improve
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_epoch = epoch
                epochs_no_improve = 0
                best_train_l = train_loss
                best_train_a = train_acc
                best_val_a = val_acc
            else:
                epochs_no_improve += 1
                if epochs_no_improve >= PATIENCE:
                    print(f"[Fold {fold}] Early stopping at epoch {epoch}, best was {best_epoch}")
                    # Report the metrics before returning early
                    for k, v in zip(["train_loss", "train_acc", "val_loss", "val_acc", "best_epoch"],
                                     [best_train_l, best_train_a, best_val_loss, best_val_a, best_epoch]):
                        fold_metrics[k].append(v)
                        trial.set_user_attr(f"fold{fold}_{k}", v)
                    break  # Break if early stopping

        # Record results for the fold
        for k, v in zip(["train_loss", "train_acc", "val_loss", "val_acc", "best_epoch"],
                         [best_train_l, best_train_a, best_val_loss, best_val_a, best_epoch]):
            fold_metrics[k].append(v)
            trial.set_user_attr(f"fold{fold}_{k}", v)

        del model, optimizer, train_loader, val_loader
        torch.mps.empty_cache() if DEVICE.type == "mps" else torch.cuda.empty_cache()
        gc.collect()

    # Calculate average metrics
    avg = lambda k: sum(fold_metrics[k]) / N_FOLDS
    for key in ["train_loss", "train_acc", "val_loss", "val_acc", "best_epoch"]:
        trial.set_user_attr(f"avg_{key}", avg(key))

    wandb.finish()
    return avg("val_loss")


if __name__ == "__main__":
    multiprocessing.freeze_support()
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=1),
        study_name="eegformer_optuna_cv_4",
        storage="sqlite:////content/drive/MyDrive/2025_Lab_Research/eegformer_optuna_cv_5.db",
        load_if_exists=True
    )
    study.optimize(objective, n_trials=10)

    best = study.best_trial
    print("\n===== Best Trial Results =====")
    print(f"avg_val_loss   = {best.value:.6f}")
    print(f"avg_train_loss = {best.user_attrs['avg_train_loss']:.6f}")
    print(f"avg_train_acc  = {best.user_attrs['avg_train_acc']:.4f}")
    print(f"avg_val_acc    = {best.user_attrs['avg_val_acc']:.4f}")
    print(f"avg_best_epoch = {best.user_attrs['avg_best_epoch']:.1f}")
    print("best hyperparameters:")
    for k, v in best.params.items():
        print(f"  {k}: {v}")
    print("per-fold best metrics:")
    for f in range(N_FOLDS):
        print(
            f"  Fold {f}: epoch={best.user_attrs[f'fold{f}_best_epoch']}, "
            f"t_loss={best.user_attrs[f'fold{f}_train_loss']:.4f}, "
            f"t_acc={best.user_attrs[f'fold{f}_train_acc']:.4f}, "
            f"v_loss={best.user_attrs[f'fold{f}_val_loss']:.4f}, "
            f"v_acc={best.user_attrs[f'fold{f}_val_acc']:.4f}"
        )

    # ─── 전체 데이터(train)로 재학습 + loss·accuracy 출력 + 저장 ─────────────────
    print("\nRetraining on full TRAIN dataset with best params…")

    # 1) train 메타만 골라서 로드
    with open(os.path.join(DATA_DIR, LABEL_FILE), "r") as f:
        all_meta = json.load(f)
    full_meta   = [d for d in all_meta if d["type"] == "train"]
    full_ds     = EEGDataset(DATA_DIR, full_meta)
    full_loader = DataLoader(full_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)

    # 2) 모델·옵티마이저 재설정
    model = EEGformer(
        num_classes=3,
        in_channels=19,
        kernel_size=10,
        num_filters=120,
        rtm_blocks=best.params["rtm_blocks"],
        stm_blocks=best.params["stm_blocks"],
        ttm_blocks=best.params["ttm_blocks"],
        rtm_heads= best.params["rtm_heads"],
        stm_heads= best.params["stm_heads"],
        ttm_heads= best.params["ttm_heads"],
        num_segments=best.params["num_segments"]
    ).to(DEVICE)
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=best.params["lr"],
        weight_decay=best.params["weight_decay"]
    )
    criterion = nn.CrossEntropyLoss()

    # 3) MAX_EPOCHS 만큼 전체 학습하며 loss·accuracy 출력
    for epoch in range(1, MAX_EPOCHS + 1):
        model.train()
        loss_sum = 0.0
        correct  = 0
        total    = 0
        for X, y in full_loader:
            X, y = X.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            logits = model(X)
            loss   = criterion(logits, y)
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            correct  += (logits.argmax(1) == y).sum().item()
            total    += y.size(0)

        avg_loss = loss_sum / len(full_loader)
        acc      = correct / total
        print(f"[Full Train] Epoch {epoch:03d} | loss={avg_loss:.4f} | acc={acc:.4f}")

    # 4) 체크포인트 저장
    ckpt_dir = '/content/drive/MyDrive/2025_Lab_Research/checkpoints'
    os.makedirs(ckpt_dir, exist_ok=True)
    ckpt_path = os.path.join(ckpt_dir, 'eegformer_best.pth')
    torch.save(model.state_dict(), ckpt_path)
    print(f"💾 Saved best model to {ckpt_path}")
