In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!git clone https://github.com/NguyenPhanNhatLan/medical_re.git

Cloning into 'medical_re'...
remote: Enumerating objects: 149, done.[K
remote: Counting objects: 100% (149/149), done.[K
remote: Compressing objects: 100% (114/114), done.[K
remote: Total 149 (delta 65), reused 93 (delta 30), pack-reused 0 (from 0)[K
Receiving objects: 100% (149/149), 4.75 MiB | 3.36 MiB/s, done.
Resolving deltas: 100% (65/65), done.


In [3]:
!mv /content/medical_re /content/drive/MyDrive/medical_re


mv: inter-device move failed: '/content/medical_re' to '/content/drive/MyDrive/medical_re/medical_re'; unable to remove target: Directory not empty


In [4]:
!find /content -maxdepth 2 -type d | grep medical_re


/content/medical_re
/content/medical_re/data
/content/medical_re/.git
/content/medical_re/train
/content/medical_re/datasets
/content/medical_re/encoder
/content/medical_re/configs
/content/medical_re/clean
/content/medical_re/models


In [5]:
%cd /content/drive/MyDrive/medical_re


/content/drive/MyDrive/medical_re


In [6]:
import torch
print(torch.cuda.is_available()) # Phải trả về True

True


In [8]:
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from encoder.vihealth_encoder import ViHealthBERTEncoder
from datasets.bert_es_dataset import BERTESDataset
from models.bert_es import BERTES
import config as cfg
import json
import torch.nn as nn
import random
import numpy as np
from typing import Dict, List

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [10]:
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything(42)

In [11]:
TRAIN_PATH ="./data/train.json"
DEV_PATH ="./data/dev.json"
TEST_PATH ="./data/test.json"

In [12]:
encoder = ViHealthBERTEncoder()
tokenizer = encoder.tokenizer

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/817 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 78ac893e-2e69-4977-8a01-c4828de80cac)')' thrown while requesting HEAD https://huggingface.co/demdecuong/vihealthbert-base-word/resolve/main/tokenizer.json
Retrying in 1s [Retry 1/5].


pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [13]:
with open(TRAIN_PATH, "r", encoding="utf-8") as f:
    train_data = json.load(f)

labels = sorted({ex["relation"] for ex in train_data})
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

print("num labels:", len(label2id))
print("example:", list(label2id.items())[:10])

num labels: 5
example: [('CAUSE_DISEASE', 0), ('DIAGNOSTIC_DISEASE', 1), ('DISEASE_SYMPTOM', 2), ('DISEASE_TREATMENT', 3), ('NO_RELATION', 4)]


In [14]:
train_ds = BERTESDataset(
    json_path=TRAIN_PATH,
    encoder=encoder,
    label2id=label2id,
    max_length=cfg.MODEL_MAX_LEN,
)


In [15]:
def bertes_collate_fn(batch: List[Dict], pad_token_id: int) -> Dict[str, torch.Tensor]:

    input_ids = [x["input_ids"] for x in batch]
    attention_mask = [x["attention_mask"] for x in batch]

    e1_pos = torch.tensor([x["e1_pos"] for x in batch], dtype=torch.long)
    e2_pos = torch.tensor([x["e2_pos"] for x in batch], dtype=torch.long)
    labels = torch.tensor([x["label_id"] for x in batch], dtype=torch.long)

    # Pad về cùng độ dài
    input_ids = pad_sequence(
        input_ids, batch_first=True, padding_value=pad_token_id
    )
    attention_mask = pad_sequence(
        attention_mask, batch_first=True, padding_value=0
    )

    return {
        "input_ids": input_ids,          # [B, L]
        "attention_mask": attention_mask,  # [B, L]
        "e1_pos": e1_pos,               # [B]
        "e2_pos": e2_pos,               # [B]
        "label_id": labels,             # [B]
    }


In [16]:
pad_id = encoder.tokenizer.pad_token_id
if pad_id is None:
    # một số tokenizer không có pad_token -> fallback
    pad_id = 0

train_loader = DataLoader(
    train_ds,
    batch_size=cfg.BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    collate_fn=lambda b: bertes_collate_fn(b, pad_token_id=pad_id),
)


In [17]:
dev_ds = BERTESDataset(
    json_path=DEV_PATH,
    encoder=encoder,
    label2id=label2id,
    max_length=cfg.MODEL_MAX_LEN
)

dev_loader = DataLoader(
    dev_ds,
    batch_size=cfg.BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    collate_fn=lambda b: bertes_collate_fn(b, pad_token_id=pad_id)
)

In [18]:
import copy
import itertools
import torch
from tqdm.auto import tqdm
from transformers import get_linear_schedule_with_warmup

# ===== GRID =====
LR_GRID = [1e-5, 2e-5, 3e-5, 5e-5]
DROPOUT_GRID = [0.1, 0.2, 0.3]
WARMUP_RATIO_GRID = [0.0, 0.05, 0.1]

# ===== FIX PARAMS =====
WEIGHT_DECAY = 0.01
GRAD_CLIP_NORM = 1.0

# Search nhanh để khỏi quá lâu
SEARCH_EPOCHS = 1          # 2 hoặc 3
SEARCH_PATIENCE = 1        # early-stop mạnh trong search
SEARCH_MIN_DELTA = 1e-4

USE_AMP = torch.cuda.is_available()
print("USE_AMP:", USE_AMP)


USE_AMP: True


In [19]:
def build_model(dropout):
    return BERTES(
        encoder=encoder,
        hidden_size=encoder.hidden_size,
        num_labels=len(label2id),
        dropout_rate=dropout
    )


In [20]:
@torch.no_grad()
def evaluate_loss(model, dataloader, criterion, device):
    model.eval()
    total_loss, n = 0.0, 0

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        e1_pos = batch["e1_pos"].to(device)
        e2_pos = batch["e2_pos"].to(device)
        labels = batch["label_id"].to(device)

        logits = model(input_ids, attention_mask, e1_pos, e2_pos)
        loss = criterion(logits, labels)

        total_loss += loss.item()
        n += 1

    return total_loss / max(n, 1)


In [21]:
def train_one_config(lr, dropout, warmup_ratio, train_loader, dev_loader, criterion, device):
    model = build_model(dropout).to(device)

    # (Tùy bạn) đang đóng băng encoder
    for p in model.encoder.parameters():
        p.requires_grad = False

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=WEIGHT_DECAY
    )

    total_steps = len(train_loader) * SEARCH_EPOCHS
    warmup_steps = int(total_steps * warmup_ratio)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    scaler = torch.amp.GradScaler('cuda', enabled=USE_AMP)

    best_val = float("inf")
    best_state = None
    patience = 0

    for epoch in range(1, SEARCH_EPOCHS + 1):
        model.train()
        train_bar = tqdm(
            train_loader,
            desc=f"train e{epoch} (lr={lr:.1e}, dr={dropout})",
            dynamic_ncols=True
        )

        for batch in train_bar:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            e1_pos = batch["e1_pos"].to(device)
            e2_pos = batch["e2_pos"].to(device)
            labels = batch["label_id"].to(device)

            optimizer.zero_grad(set_to_none=True)

            with torch.amp.autocast('cuda', enabled=USE_AMP):
                logits = model(input_ids, attention_mask, e1_pos, e2_pos)
                loss = criterion(logits, labels)

            scaler.scale(loss).backward()

            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP_NORM)

            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            train_bar.set_postfix(loss=f"{loss.item():.4f}")

        # ===== VALIDATION =====
        val_loss = evaluate_loss(model, dev_loader, criterion, device)
        print(f"Epoch {epoch} - Val Loss: {val_loss:.4f}")

        if (best_val - val_loss) > SEARCH_MIN_DELTA:
            best_val = val_loss
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            patience = 0
        else:
            patience += 1
            if patience >= SEARCH_PATIENCE:
                print("Early stopping triggered!")
                break

    return best_val, best_state


Random Search


In [22]:
criterion = nn.CrossEntropyLoss()

In [23]:
batch = next(iter(train_loader))
for k, v in batch.items():
    print(k, v.shape)

input_ids torch.Size([16, 256])
attention_mask torch.Size([16, 256])
e1_pos torch.Size([16])
e2_pos torch.Size([16])
label_id torch.Size([16])


In [None]:
# combos = list(itertools.product(LR_GRID, DROPOUT_GRID, WARMUP_RATIO_GRID))
# combos = random.sample(combos, k=6)
# best = {"val_loss": float("inf"), "cfg": None, "state": None}
# results = []

# grid_bar = tqdm(combos, desc="GRID", total=len(combos))

# for lr, dr, wr in grid_bar:
#     grid_bar.set_postfix(lr=f"{lr:.1e}", dr=dr, wr=wr)

#     val_loss, state = train_one_config(
#         lr=lr,
#         dropout=dr,
#         warmup_ratio=wr,
#         train_loader=train_loader,
#         dev_loader=dev_loader,
#         criterion=criterion,
#         device=device
#     )

#     results.append({"lr": lr, "dropout": dr, "warmup_ratio": wr, "val_loss": val_loss})
#     tqdm.write(f"[DONE] lr={lr:.1e}, dr={dr}, wr={wr} => best_val={val_loss:.4f}")

#     if val_loss < best["val_loss"]:
#         best["val_loss"] = val_loss
#         best["cfg"] = {"lr": lr, "dropout": dr, "warmup_ratio": wr}
#         best["state"] = copy.deepcopy(state)

# print("\nBEST CONFIG:", best["cfg"], "best_val_loss=", best["val_loss"])


In [None]:
# import pandas as pd

# # Tạo bảng tổng hợp
# df_results = pd.DataFrame(results)
# df_results = df_results.sort_values(by="val_loss", ascending=False)

# print("\nBảng xếp hạng các cấu hình:")
# print(df_results)

Optuna

In [None]:
import optuna

In [None]:
def objective(trial):
    # 1. Khai báo không gian tìm kiếm (Search Space)
    # suggest_float cho phép thử các giá trị liên tục, không bị bó buộc như Grid Search
    lr = trial.suggest_float("lr", 1e-5, 5e-5, log=True)
    dropout = trial.suggest_float("dropout", 0.0, 0.4)
    warmup_ratio = trial.suggest_float("warmup_ratio", 0.0, 0.2)

    # Bạn có thể thử cả các kiến trúc khác nếu muốn
    # weight_decay = trial.suggest_float("weight_decay", 1e-4, 1e-2, log=True)

    # 2. Gọi hàm train (Dùng SEARCH_EPOCHS nhỏ để tìm kiếm nhanh)
    # Lưu ý: train_one_config cần trả về val_loss
    val_loss, _ = train_one_config(
        lr=lr,
        dropout=dropout,
        warmup_ratio=warmup_ratio,
        train_loader=train_loader,
        dev_loader=dev_loader,
        criterion=criterion,
        device=device
    )

    # 3. Giải phóng bộ nhớ sau mỗi Trial để tránh tràn VRAM
    torch.cuda.empty_cache()
    import gc
    gc.collect()

    return val_loss

In [None]:
# Tạo Study: mục tiêu là minimize (giảm tối đa) Val Loss
study = optuna.create_study(direction="minimize", study_name="vihealthbert_re_tuning")
study.enqueue_trial({
    "lr": 2e-05,
    "dropout": 0.2,
    "warmup_ratio": 0.1
})
# Bắt đầu chạy
study.optimize(objective, n_trials=15)

print("-" * 30)
print("KẾT QUẢ TỐI ƯU NHẤT:")
print(f"  Val Loss thấp nhất: {study.best_value:.4f}")
print(f"  Bộ tham số: {study.best_params}")

{'lr': 4.826450477069401e-05, 'dropout': 0.14933945953848662, 'warmup_ratio': 0.19920969783107204}

In [33]:

def get_predictions(model, dataloader, device):
    model.eval()

    all_labels = []
    all_preds = []
    all_probs = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            e1_pos = batch["e1_pos"].to(device)
            e2_pos = batch["e2_pos"].to(device)
            labels = batch["label_id"].to(device)

            logits = model(input_ids, attention_mask, e1_pos, e2_pos)
            probs = torch.softmax(logits, dim=-1)
            preds = torch.argmax(probs, dim=-1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())

    return np.array(all_labels), np.array(all_preds), np.array(all_probs)


In [34]:
def precision_at_k(y_true, y_probs, k=5):
    correct = 0
    total = len(y_true)

    topk_preds = np.argsort(-y_probs, axis=1)[:, :k]

    for i in range(total):
        if y_true[i] in topk_preds[i]:
            correct += 1

    return correct / max(total, 1)


In [35]:
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    average_precision_score
)

def evaluate_full_metrics(model, dataloader, device, num_classes):
    y_true, y_pred, y_probs = get_predictions(model, dataloader, device)

    macro_precision = precision_score(y_true, y_pred, average="macro", zero_division=0)
    micro_precision = precision_score(y_true, y_pred, average="micro", zero_division=0)

    macro_recall = recall_score(y_true, y_pred, average="macro", zero_division=0)
    micro_recall = recall_score(y_true, y_pred, average="micro", zero_division=0)

    macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
    micro_f1 = f1_score(y_true, y_pred, average="micro", zero_division=0)

    y_true_onehot = np.eye(num_classes)[y_true]
    pr_auc = average_precision_score(y_true_onehot, y_probs, average="macro")

    metrics = {
        "macro_precision": macro_precision,
        "micro_precision": micro_precision,
        "macro_recall": macro_recall,
        "micro_recall": micro_recall,
        "macro_f1": macro_f1,
        "micro_f1": micro_f1,
        "pr_auc": pr_auc
    }

    print("\n===== VALIDATION METRICS =====")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")
    print("==============================\n")

    return metrics


In [36]:
BEST_PARAMS = {
    "lr": 4.826450477069401e-05,
    "dropout": 0.14933945953848662,
    "warmup_ratio": 0.19920969783107204
}


In [37]:
import copy
from tqdm import tqdm

def train_final_model_bertes(
    lr, dropout, warmup_ratio,
    epochs, train_loader, dev_loader,
    criterion, device,
    select_metric="macro_f1"
):
    # ----- BUILD MODEL -----
    model = build_model(dropout).to(device)

    # mở hết parameters để fine-tune
    for p in model.parameters():
        p.requires_grad = True

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=WEIGHT_DECAY
    )

    total_steps = len(train_loader) * epochs
    warmup_steps = int(total_steps * warmup_ratio)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    scaler = torch.amp.GradScaler("cuda", enabled=USE_AMP)

    best_score = -1.0
    best_state = None

    for epoch in range(1, epochs + 1):
        model.train()
        total_train_loss = 0.0

        train_bar = tqdm(
            train_loader,
            desc=f"[Final Train] Epoch {epoch}/{epochs}",
            dynamic_ncols=True
        )

        for batch in train_bar:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            e1_pos = batch["e1_pos"].to(device)
            e2_pos = batch["e2_pos"].to(device)
            labels = batch["label_id"].to(device)

            optimizer.zero_grad(set_to_none=True)

            with torch.amp.autocast("cuda", enabled=USE_AMP):
                logits = model(input_ids, attention_mask, e1_pos, e2_pos)
                loss = criterion(logits, labels)

            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP_NORM)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            total_train_loss += loss.item()
            train_bar.set_postfix(loss=f"{loss.item():.4f}")

        # -------- VALIDATION FULL METRICS --------
        val_results = evaluate_full_metrics(
            model, dev_loader, device, num_classes=len(label2id)
        )

        current_score = val_results.get(select_metric, 0.0)
        avg_train_loss = total_train_loss / len(train_loader)

        print(f"\n[Epoch {epoch}] "
              f"Train Loss: {avg_train_loss:.4f} | "
              f"Dev {select_metric}: {current_score:.4f}")

        if current_score > best_score:
            best_score = current_score
            best_state = copy.deepcopy(model.state_dict())
            print(f"--> [SAVE] New best model: {select_metric} = {best_score:.4f}")

    # Load best weights
    if best_state is not None:
        model.load_state_dict(best_state)
        print(f"\n✅ Restored best model with {select_metric}: {best_score:.4f}")

    return model


In [38]:
final_model = train_final_model_bertes(
    lr=BEST_PARAMS["lr"],
    dropout=BEST_PARAMS["dropout"],
    warmup_ratio=BEST_PARAMS["warmup_ratio"],
    epochs=10,
    train_loader=train_loader,
    dev_loader=dev_loader,
    criterion=criterion,
    device=device,
    select_metric="macro_f1"
)


[Final Train] Epoch 1/10: 100%|██████████| 1950/1950 [07:36<00:00,  4.27it/s, loss=0.0226]



===== VALIDATION METRICS =====
macro_precision: 0.8662
micro_precision: 0.9174
macro_recall: 0.8510
micro_recall: 0.9174
macro_f1: 0.8576
micro_f1: 0.9174
pr_auc: 0.9216


[Epoch 1] Train Loss: 0.4050 | Dev macro_f1: 0.8576
--> [SAVE] New best model: macro_f1 = 0.8576


[Final Train] Epoch 2/10: 100%|██████████| 1950/1950 [07:33<00:00,  4.30it/s, loss=0.1098]



===== VALIDATION METRICS =====
macro_precision: 0.9280
micro_precision: 0.9320
macro_recall: 0.8451
micro_recall: 0.9320
macro_f1: 0.8809
micro_f1: 0.9320
pr_auc: 0.9273


[Epoch 2] Train Loss: 0.2594 | Dev macro_f1: 0.8809
--> [SAVE] New best model: macro_f1 = 0.8809


[Final Train] Epoch 3/10: 100%|██████████| 1950/1950 [07:34<00:00,  4.29it/s, loss=0.1347]



===== VALIDATION METRICS =====
macro_precision: 0.9021
micro_precision: 0.9315
macro_recall: 0.8688
micro_recall: 0.9315
macro_f1: 0.8829
micro_f1: 0.9315
pr_auc: 0.9426


[Epoch 3] Train Loss: 0.2238 | Dev macro_f1: 0.8829
--> [SAVE] New best model: macro_f1 = 0.8829


[Final Train] Epoch 4/10: 100%|██████████| 1950/1950 [07:32<00:00,  4.31it/s, loss=0.0545]



===== VALIDATION METRICS =====
macro_precision: 0.8954
micro_precision: 0.9351
macro_recall: 0.8781
micro_recall: 0.9351
macro_f1: 0.8862
micro_f1: 0.9351
pr_auc: 0.9359


[Epoch 4] Train Loss: 0.1783 | Dev macro_f1: 0.8862
--> [SAVE] New best model: macro_f1 = 0.8862


[Final Train] Epoch 5/10: 100%|██████████| 1950/1950 [07:32<00:00,  4.31it/s, loss=0.3974]



===== VALIDATION METRICS =====
macro_precision: 0.9371
micro_precision: 0.9477
macro_recall: 0.8870
micro_recall: 0.9477
macro_f1: 0.9083
micro_f1: 0.9477
pr_auc: 0.9455


[Epoch 5] Train Loss: 0.1502 | Dev macro_f1: 0.9083
--> [SAVE] New best model: macro_f1 = 0.9083


[Final Train] Epoch 6/10: 100%|██████████| 1950/1950 [07:31<00:00,  4.32it/s, loss=0.0434]



===== VALIDATION METRICS =====
macro_precision: 0.9527
micro_precision: 0.9554
macro_recall: 0.8991
micro_recall: 0.9554
macro_f1: 0.9223
micro_f1: 0.9554
pr_auc: 0.9535


[Epoch 6] Train Loss: 0.1168 | Dev macro_f1: 0.9223
--> [SAVE] New best model: macro_f1 = 0.9223


[Final Train] Epoch 7/10: 100%|██████████| 1950/1950 [07:31<00:00,  4.32it/s, loss=0.1867]



===== VALIDATION METRICS =====
macro_precision: 0.9495
micro_precision: 0.9556
macro_recall: 0.8989
micro_recall: 0.9556
macro_f1: 0.9213
micro_f1: 0.9556
pr_auc: 0.9535


[Epoch 7] Train Loss: 0.0976 | Dev macro_f1: 0.9213


[Final Train] Epoch 8/10: 100%|██████████| 1950/1950 [07:31<00:00,  4.32it/s, loss=0.0387]



===== VALIDATION METRICS =====
macro_precision: 0.9445
micro_precision: 0.9546
macro_recall: 0.9038
micro_recall: 0.9546
macro_f1: 0.9209
micro_f1: 0.9546
pr_auc: 0.9693


[Epoch 8] Train Loss: 0.0868 | Dev macro_f1: 0.9209


[Final Train] Epoch 9/10: 100%|██████████| 1950/1950 [07:32<00:00,  4.31it/s, loss=0.0785]



===== VALIDATION METRICS =====
macro_precision: 0.9554
micro_precision: 0.9577
macro_recall: 0.9022
micro_recall: 0.9577
macro_f1: 0.9257
micro_f1: 0.9577
pr_auc: 0.9637


[Epoch 9] Train Loss: 0.0791 | Dev macro_f1: 0.9257
--> [SAVE] New best model: macro_f1 = 0.9257


[Final Train] Epoch 10/10: 100%|██████████| 1950/1950 [07:31<00:00,  4.32it/s, loss=0.0003]



===== VALIDATION METRICS =====
macro_precision: 0.9560
micro_precision: 0.9579
macro_recall: 0.9035
micro_recall: 0.9579
macro_f1: 0.9266
micro_f1: 0.9579
pr_auc: 0.9644


[Epoch 10] Train Loss: 0.0724 | Dev macro_f1: 0.9266
--> [SAVE] New best model: macro_f1 = 0.9266

✅ Restored best model with macro_f1: 0.9266
