In [2]:
''' 
================================================================================
XLM-RoBERTa Aspect-Based Sentiment Classification
================================================================================
PROJECT : NLP-Driven ABSA for Gastronomy Tourism Insights in Malaysia
PIPELINE: Pipelined-ABSA (Decoupled) — Step 3: Sentiment Classification
INPUT   : Dataset/aspect_categorization.pkl   (output of Notebook 5)
OUTPUT  : models/xlm_roberta_absa_best.pt     (best checkpoint)
          results/training_metrics.json        (loss/acc curves)

ACADEMIC JUSTIFICATION
----------------------
- XLM-RoBERTa (Conneau et al., 2020): Pre-trained on 100 languages including
  Malay and Chinese. Superior zero/few-shot cross-lingual transfer vs.
  monolingual BERT, critical for Manglish code-switching.
- Aspect-Conditioned Input (Sun et al., 2019): We prepend the aspect category
  to the segment text as "[aspect] [SEP] [segment]". This forces the model to
  learn aspect-specific sentiment representations rather than general polarity.
- Class-Weighted Loss (Japkowicz & Stephen, 2002): Our dataset is severely
  imbalanced (89% positive). We use the inverse-frequency weights computed in
  Notebook 4 to prevent the model from trivially predicting "positive".
- Weak Supervision (Ratner et al., 2016): Star ratings are noisy proxies for
  sentiment. The consistency filtering in Notebook 4 already removed the worst
  offenders (4.1% noise). Residual noise is tolerable for fine-tuning.
================================================================================
'''



# STAGE 0: Environment & Dependency Verification

In [3]:
# pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0

In [4]:
# Fix PyTorch DLL loading issue on Windows
import os
import platform
if platform.system() == "Windows":
    import ctypes
    from importlib.util import find_spec
    try:
        if (spec := find_spec("torch")) and spec.origin and os.path.exists(
            dll_path := os.path.join(os.path.dirname(spec.origin), "lib", "c10.dll")
        ):
            ctypes.CDLL(os.path.normpath(dll_path))
    except Exception:
        pass

In [5]:
import sys
import importlib

REQUIRED = {
    "torch": "PyTorch",
    "transformers": "HuggingFace Transformers",
    "pandas": "Pandas",
    "numpy": "NumPy",
    "sklearn": "Scikit-Learn",
}


def check_environment():
    """Verify all required packages are installed and print versions.

    Why:
        Explicit environment checks prevent cryptic import errors mid-training,
        which is especially costly when running on GPU with long epoch times.
    """
    print("=" * 70)
    print("ENVIRONMENT CHECK")
    print("=" * 70)
    all_ok = True
    for module_name, display_name in REQUIRED.items():
        try:
            mod = importlib.import_module(module_name)
            version = getattr(mod, "__version__", "unknown")
            print(f"  ✓  {display_name:<30} v{version}")
        except ImportError:
            print(f"  ✗  {display_name:<30} NOT INSTALLED")
            all_ok = False

    # Special check: torch CUDA availability
    import torch

    cuda_avail = torch.cuda.is_available()
    device_name = torch.cuda.get_device_name(0) if cuda_avail else "CPU only"
    print(f"\n  GPU Available: {cuda_avail}  →  {device_name}")
    print(f"  Python:        {sys.version}")
    print("=" * 70)

    if not all_ok:
        raise RuntimeError(
            "Some packages are missing. Install them before continuing."
        )
    return torch.device("cuda" if cuda_avail else "cpu")


DEVICE = check_environment()

ENVIRONMENT CHECK
  ✓  PyTorch                        v2.8.0+cpu
  ✓  HuggingFace Transformers       v4.41.2
  ✓  Pandas                         v2.2.3
  ✓  NumPy                          v2.2.0
  ✓  Scikit-Learn                   v1.5.1

  GPU Available: False  →  CPU only
  Python:        3.10.11 (tags/v3.10.11:7d4cc5a, Apr  5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]


# STAGE 1: Hyperparameter Configuration (Single Source of Truth)

In [6]:
from dataclasses import dataclass, field
from typing import List


@dataclass
class TrainingConfig:
    """Central configuration object for the entire training run.

    Why a dataclass:
        Keeps hyperparameters serializable (can be logged to JSON for
        reproducibility) and gives IDE auto-completion — important when
        iterating quickly on a GPU budget.
    """

    # --- Model -----------------------------------------------------------
    model_name: str = "xlm-roberta-base"
    # "xlm-roberta-base" (278M params) is the default.
    # Switch to "xlm-roberta-large" (550M) if GPU memory allows (≥16 GB).

    num_labels: int = 2  # 0 = negative, 1 = positive

    # --- Data ------------------------------------------------------------
    data_path: str = r"C:/Users/Ong Hui Ling/Dropbox/PC/Documents/Github/Aspect-Based-Sentiment-Analysis/Dataset/aspect_categorization.pkl"
    max_seq_length: int = 128
    # 128 covers ~95th percentile of your segment lengths (median ~54 words).
    # Increase to 256 only if you see significant truncation in logs.

    test_size: float = 0.15  # 15% held out for evaluation
    val_size: float = 0.10   # 10% for early-stopping validation
    random_seed: int = 42

    # --- Training --------------------------------------------------------
    batch_size: int = 32       # Reduce to 16 if OOM on your GPU
    learning_rate: float = 2e-5  # Classic BERT fine-tuning sweet spot
    num_epochs: int = 5
    warmup_ratio: float = 0.1  # 10% of total steps used for LR warm-up
    weight_decay: float = 0.01

    # --- Class Weights (from Notebook 4: compute_class_weight) -----------
    # Negative (class 0) weight: 4.5448
    # Positive (class 1) weight: 0.5618
    class_weights: List[float] = field(default_factory=lambda: [4.5448, 0.5618])

    # --- Output ----------------------------------------------------------
    output_dir: str = "models"
    best_model_path: str = "models/xlm_roberta_absa_best.pt"
    metrics_path: str = "results/training_metrics.json"


CFG = TrainingConfig()
print(f"\n✓ Config loaded. Model: {CFG.model_name} | Epochs: {CFG.num_epochs} | "
      f"Batch: {CFG.batch_size} | LR: {CFG.learning_rate}")



✓ Config loaded. Model: xlm-roberta-base | Epochs: 5 | Batch: 32 | LR: 2e-05


# STAGE 2: Data Loading & Preprocessing

In [7]:
# ==============================================================================
# Transforms aspect_categorization.pkl → train/val/test splits
# ready for the PyTorch DataLoader.
# ==============================================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Label encoding map (explicit is better than implicit)
LABEL2ID = {"negative": 0, "positive": 1}
ID2LABEL = {v: k for k, v in LABEL2ID.items()}


def load_and_prepare_data(cfg: TrainingConfig) -> dict:
    """Load aspect segments and split into train / val / test.

    Why stratified split:
        With 89/11 class imbalance, a random split can accidentally create
        a val/test set with zero or near-zero negative samples. Stratification
        guarantees each split mirrors the overall class ratio.

    Why we filter out multi-aspect segments for training:
        When a segment maps to multiple aspects (e.g., [FOOD, AMBIENCE]),
        the weak label (derived from the whole review) is even noisier for
        that segment. We keep only single-aspect segments for cleaner
        supervision signal. Multi-aspect segments can still be predicted
        at inference time.

    Args:
        cfg: TrainingConfig instance.

    Returns:
        Dict with keys: 'train', 'val', 'test' — each a DataFrame.
    """
    print("\n" + "=" * 70)
    print("LOADING & SPLITTING DATA")
    print("=" * 70)

    df = pd.read_pickle(cfg.data_path)
    print(f"  Raw segments loaded: {len(df):,}")

    # --- Filter to single-aspect segments for cleaner weak supervision ----
    df["num_aspects"] = df["Aspect_Labels"].apply(len)
    df_single = df[df["num_aspects"] == 1].copy()
    df_single["aspect"] = df_single["Aspect_Labels"].apply(lambda x: x[0])
    print(f"  Single-aspect segments (for training): {len(df_single):,}")
    print(f"  Multi-aspect segments (dropped for training): "
          f"{len(df) - len(df_single):,}")

    # --- Encode labels ---------------------------------------------------
    df_single["label"] = df_single["Sentiment_Label"].map(LABEL2ID)

    # Sanity check: no NaN labels
    assert df_single["label"].isna().sum() == 0, (
        "Found NaN labels! Check Sentiment_Label column values."
    )

    print(f"\n  Label distribution:")
    print(df_single["label"].value_counts().sort_index().to_string(
        index=True).replace("0", "  0 (negative)").replace("1", "  1 (positive)")
    )

    # --- Train / Val / Test split (two-stage stratified) -----------------
    # Stage 1: Separate test set
    df_trainval, df_test = train_test_split(
        df_single,
        test_size=cfg.test_size,
        stratify=df_single["label"],
        random_state=cfg.random_seed,
    )
    # Stage 2: Split remainder into train + val
    # Adjust val_size relative to the remaining data
    adjusted_val_size = cfg.val_size / (1.0 - cfg.test_size)
    df_train, df_val = train_test_split(
        df_trainval,
        test_size=adjusted_val_size,
        stratify=df_trainval["label"],
        random_state=cfg.random_seed,
    )

    splits = {"train": df_train, "val": df_val, "test": df_test}

    print(f"\n  Split sizes:")
    for name, split_df in splits.items():
        pos = (split_df["label"] == 1).sum()
        neg = (split_df["label"] == 0).sum()
        print(f"    {name:<6}: {len(split_df):>7,} rows "
              f"| pos: {pos:,} ({pos/len(split_df)*100:.1f}%) "
              f"| neg: {neg:,} ({neg/len(split_df)*100:.1f}%)")

    return splits


DATA = load_and_prepare_data(CFG)



LOADING & SPLITTING DATA
  Raw segments loaded: 132,637
  Single-aspect segments (for training): 100,557
  Multi-aspect segments (dropped for training): 32,080

  Label distribution:
label
  0 (negative)     7673
  1 (positive)    92884

  Split sizes:
    train :  75,417 rows | pos: 69,662 (92.4%) | neg: 5,755 (7.6%)
    val   :  10,056 rows | pos: 9,289 (92.4%) | neg: 767 (7.6%)
    test  :  15,084 rows | pos: 13,933 (92.4%) | neg: 1,151 (7.6%)


# STAGE 3: PyTorch Dataset Class

In [9]:
# ==============================================================================
# Wraps a DataFrame split into a tokenized, aspect-conditioned dataset.
# ==============================================================================

import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer


class ABSADataset(Dataset):
    """Aspect-conditioned sentiment dataset for XLM-RoBERTa.

    Input Format (Sun et al., 2019 — Aspect-Based Sentiment):
        Input:  "[aspect] </s></s> [segment text]"
        Label:  0 (negative) or 1 (positive)

    Why "</s></s>" (double SEP):
        XLM-RoBERTa uses </s> as its separator token (unlike BERT's [SEP]).
        The double </s></s> pattern is the standard way RoBERTa-family models
        denote a sentence boundary — this is baked into its pre-training.

    Example:
        Input text:  "FOOD </s></s> the nasi lemak was incredibly sedap"
        Tokenized:   <s> FOOD </s> </s> the nasi lemak was incredibly sedap </s>
        Label:       1 (positive)

    Args:
        df: DataFrame with columns ['aspect', 'Segment', 'label'].
        tokenizer: HuggingFace tokenizer for xlm-roberta.
        max_length: Maximum token length (default 128).
    """

    def __init__(
        self,
        df: pd.DataFrame,
        tokenizer: AutoTokenizer,
        max_length: int = 128,
    ):
        self.texts = df["Segment"].tolist()
        self.aspects = df["aspect"].tolist()
        self.labels = df["label"].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        return len(self.texts)

    def __getitem__(self, idx: int) -> dict:
        """Tokenize a single (aspect, segment) pair on-the-fly.

        Why on-the-fly tokenization (not pre-tokenized):
            For datasets of this size (~100K), pre-tokenizing and caching
            in memory is faster but uses ~2-3 GB RAM. On-the-fly keeps
            memory footprint low and simplifies the code. If training
            speed becomes the bottleneck, switch to a pre-tokenized cache.

        Returns:
            Dict with 'input_ids', 'attention_mask', 'labels' tensors.
        """
        aspect = self.aspects[idx]
        segment = self.texts[idx]
        label = self.labels[idx]

        # --- Construct aspect-conditioned input --------------------------
        # Format: "ASPECT_LABEL </s></s> segment_text"
        # The aspect is uppercased to visually distinguish it as a
        # "prompt token" — the model learns to treat it as a conditioning
        # signal rather than natural language.
        conditioned_text = f"{aspect.upper()} </s></s> {segment}"

        # --- Tokenize ----------------------------------------------------
        encoding = self.tokenizer(
            conditioned_text,
            max_length=self.max_length,
            padding="max_length",       # Pad all sequences to max_length
            truncation=True,            # Truncate if longer
            return_tensors="pt",        # Return PyTorch tensors directly
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),       # (max_length,)
            "attention_mask": encoding["attention_mask"].squeeze(0),  # (max_length,)
            "labels": torch.tensor(label, dtype=torch.long),     # scalar
        }


def build_dataloaders(data: dict, cfg: TrainingConfig) -> dict:
    """Instantiate tokenizer, datasets, and DataLoaders.

    Args:
        data: Dict with 'train', 'val', 'test' DataFrames.
        cfg: TrainingConfig.

    Returns:
        Dict with 'train', 'val', 'test' DataLoaders and 'tokenizer'.
    """
    print("\n" + "=" * 70)
    print("BUILDING DATALOADERS")
    print("=" * 70)

    # Load tokenizer (downloads ~1 MB vocab file on first run)
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    print(f"  ✓ Tokenizer loaded: {cfg.model_name}")
    print(f"    Vocab size: {tokenizer.vocab_size:,}")

    loaders = {}
    datasets_info = {}

    for split_name, df in data.items():
        dataset = ABSADataset(df, tokenizer, cfg.max_seq_length)

        # Training set uses shuffle; val/test do not
        is_train = split_name == "train"
        loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=cfg.batch_size,
            shuffle=is_train,
            num_workers=0,        # 0 = main process (safe on Windows)
            pin_memory=True,      # Faster CPU→GPU transfer
        )
        loaders[split_name] = loader
        datasets_info[split_name] = len(dataset)

    loaders["tokenizer"] = tokenizer

    print(f"\n  Dataset sizes & batches:")
    for name, size in datasets_info.items():
        n_batches = size // cfg.batch_size + (1 if size % cfg.batch_size else 0)
        print(f"    {name:<6}: {size:>7,} samples → {n_batches:>4,} batches "
              f"(batch_size={cfg.batch_size})")

    # --- Quick sanity check: decode one sample ---------------------------
    sample_batch = next(iter(loaders["train"]))
    sample_text = tokenizer.decode(
        sample_batch["input_ids"][0], skip_special_tokens=False
    )
    print(f"\n  Sample input (decoded):")
    print(f"    \"{sample_text}\"")
    print(f"    Label: {ID2LABEL[sample_batch['labels'][0].item()]}")

    return loaders


LOADERS = build_dataloaders(DATA, CFG)


BUILDING DATALOADERS


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

  ✓ Tokenizer loaded: xlm-roberta-base
    Vocab size: 250,002

  Dataset sizes & batches:
    train :  75,417 samples → 2,357 batches (batch_size=32)
    val   :  10,056 samples →  315 batches (batch_size=32)
    test  :  15,084 samples →  472 batches (batch_size=32)

  Sample input (decoded):
    "<s> FOOD</s></s> blue cheese panini</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>"
    Label: positive




# STAGE 4: Model Definition

In [10]:
# ==============================================================================
# XLM-RoBERTa + a 2-class classification head + class-weighted loss.
# ==============================================================================

from transformers import AutoModelForSequenceClassification
import torch.nn as nn


class ABSASentimentClassifier(nn.Module):
    """XLM-RoBERTa with a classification head for binary sentiment.

    Why AutoModelForSequenceClassification instead of raw AutoModel:
        The "ForSequenceClassification" variant already includes:
          - The [CLS] token pooling (first token representation)
          - A dropout layer
          - A linear projection to num_labels
        Building these manually adds no value and risks subtle bugs
        (e.g., forgetting dropout → overfitting).

    Why we store class_weights on the model:
        This ensures the weights move to the correct device (CPU/GPU)
        alongside the model when .to(device) is called. Forgetting this
        is one of the most common PyTorch bugs.

    Args:
        cfg: TrainingConfig.
    """

    def __init__(self, cfg: TrainingConfig):
        super().__init__()
        self.backbone = AutoModelForSequenceClassification.from_pretrained(
            cfg.model_name,
            num_labels=cfg.num_labels,
        )
        # Register class weights as a buffer (not a parameter —
        # it won't be updated by the optimizer, but WILL move with .to())
        self.register_buffer(
            "class_weights",
            torch.tensor(cfg.class_weights, dtype=torch.float),
        )
        self.loss_fn = nn.CrossEntropyLoss(weight=self.class_weights)

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        labels: torch.Tensor = None,
    ) -> dict:
        """Forward pass.

        Args:
            input_ids: Token IDs (batch_size, seq_len).
            attention_mask: 1 for real tokens, 0 for padding (batch_size, seq_len).
            labels: Ground truth labels (batch_size,). Optional — if None,
                    only logits are returned (useful for inference).

        Returns:
            Dict with 'loss' (if labels provided) and 'logits'.
        """
        outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        logits = outputs.logits  # (batch_size, num_labels)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return {"loss": loss, "logits": logits}


def build_model(cfg: TrainingConfig) -> ABSASentimentClassifier:
    """Instantiate model and move to device."""
    print("\n" + "=" * 70)
    print("BUILDING MODEL")
    print("=" * 70)

    model = ABSASentimentClassifier(cfg)
    model = model.to(DEVICE)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"  ✓ Model loaded on {DEVICE}")
    print(f"    Total params:     {total_params:>12,}")
    print(f"    Trainable params: {trainable_params:>12,}")
    print(f"    Class weights:    {cfg.class_weights}")

    return model


MODEL = build_model(CFG)


BUILDING MODEL


W0131 14:57:07.220000 20524 torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  ✓ Model loaded on cpu
    Total params:      278,045,186
    Trainable params:  278,045,186
    Class weights:    [4.5448, 0.5618]


# STAGE 5: Training Loop

In [11]:
import os
import json
import math
import time
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, f1_score, classification_report


def compute_metrics(predictions: list, labels: list) -> dict:
    """Compute accuracy, macro-F1, and per-class F1.

    Why macro-F1 over accuracy:
        With 89/11 imbalance, a model predicting "positive" always gets
        89% accuracy. Macro-F1 weights both classes equally, so it
        actually measures whether the model learned the minority class.

    Args:
        predictions: List of predicted class IDs.
        labels: List of ground-truth class IDs.

    Returns:
        Dict with 'accuracy', 'macro_f1', 'neg_f1', 'pos_f1'.
    """
    acc = accuracy_score(labels, predictions)
    macro_f1 = f1_score(labels, predictions, average="macro")
    per_class_f1 = f1_score(labels, predictions, average=None)  # [neg_f1, pos_f1]
    return {
        "accuracy": round(acc, 4),
        "macro_f1": round(macro_f1, 4),
        "neg_f1": round(per_class_f1[0], 4),
        "pos_f1": round(per_class_f1[1], 4),
    }


def train_epoch(model, dataloader, optimizer, scheduler, device) -> dict:
    """Run one full training epoch.

    Returns:
        Dict with 'loss' (average over all batches).
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs["loss"]

        # Backward pass
        loss.backward()
        # Gradient clipping: prevents exploding gradients in transformers
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        num_batches += 1

    return {"loss": round(total_loss / num_batches, 6)}


@torch.no_grad()
def evaluate(model, dataloader, device) -> dict:
    """Run evaluation on val or test set (no gradient computation).

    Returns:
        Dict with 'loss', 'accuracy', 'macro_f1', 'neg_f1', 'pos_f1'.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    all_preds = []
    all_labels = []

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

        total_loss += outputs["loss"].item()
        num_batches += 1

        # Argmax → predicted class
        preds = torch.argmax(outputs["logits"], dim=-1)
        all_preds.extend(preds.cpu().numpy().tolist())
        all_labels.extend(labels.cpu().numpy().tolist())

    metrics = compute_metrics(all_preds, all_labels)
    metrics["loss"] = round(total_loss / num_batches, 6)

    return metrics


def train(model, loaders, cfg, device) -> dict:
    """Full training loop with early stopping on val macro-F1.

    Why early stopping on macro-F1 (not loss):
        Validation loss can continue decreasing even as the model starts
        overfitting to the majority class. Macro-F1 directly measures
        what we care about: balanced performance on both classes.

    Args:
        model: ABSASentimentClassifier (already on device).
        loaders: Dict with 'train', 'val', 'test' DataLoaders.
        cfg: TrainingConfig.
        device: torch.device.

    Returns:
        Dict with full training history (for plotting / logging).
    """
    print("\n" + "=" * 70)
    print("TRAINING")
    print("=" * 70)

    # --- Optimizer: AdamW (standard for transformer fine-tuning) --------
    optimizer = AdamW(
        model.parameters(),
        lr=cfg.learning_rate,
        weight_decay=cfg.weight_decay,
    )

    # --- Learning rate scheduler: linear warm-up then linear decay -------
    # Why: Transformers are sensitive to LR. A warm-up phase prevents
    # catastrophic early updates to pre-trained weights.
    total_steps = len(loaders["train"]) * cfg.num_epochs
    warmup_steps = int(total_steps * cfg.warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )
    print(f"  Total training steps:  {total_steps:,}")
    print(f"  Warmup steps:          {warmup_steps:,}")

    # --- Output directory -------------------------------------------------
    os.makedirs(os.path.dirname(cfg.best_model_path) or ".", exist_ok=True)
    os.makedirs(os.path.dirname(cfg.metrics_path) or ".", exist_ok=True)

    # --- Training history -------------------------------------------------
    history = {"train": [], "val": [], "test": None}
    best_val_f1 = -1.0
    patience = 2   # Stop if val macro-F1 doesn't improve for 2 epochs
    patience_counter = 0

    for epoch in range(1, cfg.num_epochs + 1):
        epoch_start = time.time()

        # --- Train ---
        train_metrics = train_epoch(model, loaders["train"], optimizer,
                                    scheduler, device)
        history["train"].append(train_metrics)

        # --- Validate ---
        val_metrics = evaluate(model, loaders["val"], device)
        history["val"].append(val_metrics)

        epoch_time = time.time() - epoch_start

        # --- Log -------------------------------------------------------------
        print(f"\n  Epoch {epoch}/{cfg.num_epochs}  ({epoch_time:.1f}s)")
        print(f"    Train Loss:      {train_metrics['loss']:.6f}")
        print(f"    Val  Loss:       {val_metrics['loss']:.6f}")
        print(f"    Val  Accuracy:   {val_metrics['accuracy']:.4f}")
        print(f"    Val  Macro-F1:   {val_metrics['macro_f1']:.4f}  "
              f"(neg: {val_metrics['neg_f1']:.4f} | "
              f"pos: {val_metrics['pos_f1']:.4f})")

        # --- Early stopping & best-model checkpoint -------------------------
        if val_metrics["macro_f1"] > best_val_f1:
            best_val_f1 = val_metrics["macro_f1"]
            patience_counter = 0
            # Save only the model state_dict (not the whole object)
            torch.save(model.state_dict(), cfg.best_model_path)
            print(f"    ★ New best model saved  (macro-F1: {best_val_f1:.4f})")
        else:
            patience_counter += 1
            print(f"    ✗ No improvement. Patience: {patience_counter}/{patience}")
            if patience_counter >= patience:
                print(f"\n  ⚡ Early stopping at epoch {epoch}.")
                break

    # --- Final evaluation on TEST set (only once, after training) -----------
    # Load the best checkpoint before evaluating
    print("\n" + "-" * 70)
    print("  LOADING BEST MODEL FOR FINAL TEST EVALUATION")
    print("-" * 70)
    model.load_state_dict(torch.load(cfg.best_model_path, map_location=device))

    test_metrics = evaluate(model, loaders["test"], device)
    history["test"] = test_metrics

    print(f"\n  ★ FINAL TEST RESULTS (best checkpoint):")
    print(f"    Test Loss:       {test_metrics['loss']:.6f}")
    print(f"    Test Accuracy:   {test_metrics['accuracy']:.4f}")
    print(f"    Test Macro-F1:   {test_metrics['macro_f1']:.4f}")
    print(f"      Negative F1:   {test_metrics['neg_f1']:.4f}")
    print(f"      Positive  F1:  {test_metrics['pos_f1']:.4f}")

    # --- Full classification report (for the report / thesis) --------------
    # Re-run test set to collect all predictions for the report
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loaders["test"]:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs["logits"], dim=-1)
            all_preds.extend(preds.cpu().numpy().tolist())
            all_labels.extend(batch["labels"].numpy().tolist())

    print(f"\n  Classification Report:")
    print(classification_report(
        all_labels, all_preds,
        target_names=["Negative", "Positive"],
    ))

    # --- Save training history as JSON ------------------------------------
    with open(cfg.metrics_path, "w") as f:
        json.dump(history, f, indent=2)
    print(f"  ✓ Training metrics saved to: {cfg.metrics_path}")

    return history

# STAGE 6: Execute Training

In [None]:
HISTORY = train(MODEL, LOADERS, CFG, DEVICE)

print("\n" + "=" * 70)
print("TRAINING COMPLETE")
print("=" * 70)
print(f"  Best model:   {CFG.best_model_path}")
print(f"  Metrics file: {CFG.metrics_path}")
print("=" * 70)


TRAINING
  Total training steps:  11,785
  Warmup steps:          1,178


