# XLM-RoBERTa Aspect-Based Sentiment Classification

- INPUT     : Dataset/aspect_categorization.pkl
- OUTPUT    : 
    - models/xlm_roberta_absa_best.pt     (best checkpoint)
    - results/training_metrics.json        (loss/acc curves)

## ACADEMIC JUSTIFICATION
- XLM-RoBERTa (Conneau et al., 2020): Pre-trained on 100 languages including Malay and Chinese. Superior zero/few-shot cross-lingual transfer vs. monolingual BERT, critical for Manglish code-switching.
- Aspect-Conditioned Input (Sun et al., 2019): We prepend the aspect category to the segment text as "[aspect] </s></s> [segment]" (XLM-RoBERTa uses `</s>` as its separator, where BERT uses `[SEP]`). This forces the model to learn aspect-specific sentiment representations rather than general polarity.
- Class-Weighted Loss (Japkowicz & Stephen, 2002): Our dataset is severely imbalanced. We apply per-class loss weights (inverse-frequency weights, or a manual override set in the training config), combined with a focal loss, to prevent the model from trivially predicting "positive".
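- Weak Supervision (Ratner et al., 2016): Star ratings are noisy proxies for sentiment. Segment labels derived from the whole review are therefore treated as weak labels, and the final evaluation uses a separate, manually annotated gold-standard set.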

----------------------

# STAGE 0: Environment & Dependency Verification

In [None]:
# Colab-only setup cell: mount Google Drive and install the NLP dependencies.
# NOTE(review): `google.colab` is only importable inside a Colab runtime, so
# this cell is expected to fail when the notebook is run locally.
from google.colab import drive
import os

# 1. Mount Google Drive at /content/drive (used to save the model checkpoints).
drive.mount('/content/drive')

# 2. Install libraries. The leading "!" is IPython shell syntax, so this line
#    is only valid inside a notebook cell; -q asks pip for quiet output.
!pip install transformers accelerate tokenizers -q

Mounted at /content/drive
/content/drive/MyDrive/Aspect-Based-Sentiment-Analysis
üîÑ Updating your repo...
/content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Aspect-Based-Sentiment-Analysis
Already up to date.
/content/drive/MyDrive/Aspect-Based-Sentiment-Analysis
/content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Aspect-Based-Sentiment-Analysis

READY! We are now running on Google's Computer.


In [None]:
# pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0

In [None]:
# Fix PyTorch DLL loading issue on Windows.
#
# On Windows only, locate the installed torch package and pre-load its
# lib/c10.dll through ctypes. The workaround is best-effort: a failure must
# never abort the notebook, but it is reported as a warning instead of being
# silently swallowed so that a later "DLL load failed" error can be traced.
import os
import platform
if platform.system() == "Windows":
    import ctypes
    import warnings
    from importlib.util import find_spec
    try:
        spec = find_spec("torch")
        if spec and spec.origin:
            dll_path = os.path.join(os.path.dirname(spec.origin), "lib", "c10.dll")
            if os.path.exists(dll_path):
                ctypes.CDLL(os.path.normpath(dll_path))
    except Exception as exc:  # deliberate catch-all: this is only a workaround
        warnings.warn(
            f"Could not pre-load torch's c10.dll ({exc!r}); continuing anyway.",
            RuntimeWarning,
        )

In [None]:
import sys
import importlib

# Importable module name -> human-readable name shown in the report.
REQUIRED = {
    "torch": "PyTorch",
    "transformers": "HuggingFace Transformers",
    "pandas": "Pandas",
    "numpy": "NumPy",
    "sklearn": "Scikit-Learn",
}


def check_environment():
    """Verify all required packages are installed and print versions.

    Why:
        Explicit environment checks prevent cryptic import errors mid-training,
        which is especially costly when running on GPU with long epoch times.

    Returns:
        ``torch.device("cuda")`` when CUDA is available, otherwise
        ``torch.device("cpu")``.

    Raises:
        RuntimeError: If any module in ``REQUIRED`` (or torch itself) cannot
            be imported. The full report is printed before raising.
    """
    print("=" * 70)
    print("ENVIRONMENT CHECK")
    print("=" * 70)
    all_ok = True
    for module_name, display_name in REQUIRED.items():
        try:
            mod = importlib.import_module(module_name)
        except ImportError:
            print(f"  ‚úó  {display_name:<30} NOT INSTALLED")
            all_ok = False
        else:
            version = getattr(mod, "__version__", "unknown")
            print(f"  ‚úì  {display_name:<30} v{version}")

    # Special check: torch CUDA availability.
    # The import is guarded so that a missing torch ends in the friendly
    # RuntimeError below instead of a raw ImportError from this line.
    try:
        import torch
    except ImportError:
        torch = None
        all_ok = False

    cuda_avail = torch is not None and torch.cuda.is_available()
    device_name = torch.cuda.get_device_name(0) if cuda_avail else "CPU only"
    print(f"\n  GPU Available: {cuda_avail}  ‚Üí  {device_name}")
    print(f"  Python:        {sys.version}")
    print("=" * 70)

    if not all_ok:
        raise RuntimeError(
            "Some packages are missing. Install them before continuing."
        )
    return torch.device("cuda" if cuda_avail else "cpu")


DEVICE = check_environment()

ENVIRONMENT CHECK
  ‚úì  PyTorch                        v2.8.0+cpu
  ‚úì  HuggingFace Transformers       v4.41.2
  ‚úì  Pandas                         v2.2.3
  ‚úì  NumPy                          v2.2.0
  ‚úì  Scikit-Learn                   v1.5.1

  GPU Available: False  ‚Üí  CPU only
  Python:        3.10.11 (tags/v3.10.11:7d4cc5a, Apr  5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]


# STAGE 1: Hyperparameter Configuration (Single Source of Truth)

In [None]:
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class TrainingConfig:
    """Central configuration object for the entire training run.

    Why a dataclass:
        Keeps hyperparameters serializable (can be logged to JSON for
        reproducibility) and gives IDE auto-completion - important when
        iterating quickly on a GPU budget.

    Raises:
        ValueError: On construction, if the split fractions are not usable
            (each must lie strictly between 0 and 1 and together leave room
            for a training set) or if ``class_weights`` is given with a
            length different from ``num_labels``.
    """

    # --- Model -----------------------------------------------------------
    model_name: str = "xlm-roberta-base"
    # "xlm-roberta-base" (278M params) is the default.
    # Switch to "xlm-roberta-large" (550M) if GPU memory allows (>= 16 GB).

    num_labels: int = 2  # 0 = negative, 1 = positive

    # --- Data ------------------------------------------------------------
    data_path: str = r"/content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Dataset/aspect_categorization_before_filtering.pkl"
    max_seq_length: int = 128
    # 128 covers ~95th percentile of the segment lengths (median ~54 words).
    # Increase to 256 only if you see significant truncation in logs.

    test_size: float = 0.15  # 15% held out for evaluation
    val_size: float = 0.10   # 10% for early-stopping validation
    random_seed: int = 42

    # --- Training --------------------------------------------------------
    batch_size: int = 32
    learning_rate: float = 1e-5  # Reduced from 2e-5 for better minority class learning
    num_epochs: int = 7        # Increased from 5 to give model more time
    warmup_ratio: float = 0.1  # 10% of total steps used for LR warm-up
    weight_decay: float = 0.01

    # --- Class Weights ---------------------------------------------------
    # Default is a manual override for stronger minority-class emphasis,
    # ordered by class id: [negative, positive].
    # Set to None to have the weights computed from the training split with
    # the balanced formula: n_samples / (n_classes * np.bincount(y)).
    # field(default_factory=...) is required because a list is mutable and
    # cannot be used directly as a dataclass default.
    class_weights: Optional[List[float]] = field(default_factory=lambda: [6.0, 1.0])

    # --- Output ----------------------------------------------------------
    output_dir: str = "models"
    best_model_path: str = "/content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Modelling/models/xlm_roberta_absa_best_before_filtering.pt"
    metrics_path: str = "/content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Modelling/results/training_metrics_before_filtering.json"

    # --- Gold Standard Dataset (for final evaluation) --------------------
    # Manually-annotated ground truth
    gold_data_path: str = r"/content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Modelling/Dataset/Final_Gold_Standard.csv"
    gold_results_path: str = "/content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Modelling/results/gold_evaluation.json"

    def __post_init__(self) -> None:
        """Fail fast on settings that would otherwise break mid-pipeline."""
        for name in ("test_size", "val_size"):
            value = getattr(self, name)
            if not 0.0 < value < 1.0:
                raise ValueError(f"{name} must be in (0, 1), got {value!r}")
        # The validation fraction is later rescaled by 1 / (1 - test_size),
        # so the two fractions together must leave data for training.
        if self.test_size + self.val_size >= 1.0:
            raise ValueError(
                "test_size + val_size must be < 1.0, got "
                f"{self.test_size} + {self.val_size}"
            )
        if (
            self.class_weights is not None
            and len(self.class_weights) != self.num_labels
        ):
            raise ValueError(
                f"class_weights has {len(self.class_weights)} entries but "
                f"num_labels is {self.num_labels}"
            )


CFG = TrainingConfig()
print(f"\n‚úì Config loaded. Model: {CFG.model_name} | Epochs: {CFG.num_epochs} | "

      f"Batch: {CFG.batch_size} | LR: {CFG.learning_rate}")


‚úì Config loaded. Model: xlm-roberta-base | Epochs: 5 | Batch: 32 | LR: 2e-05


# STAGE 2: Data Loading & Preprocessing

In [None]:
# ==============================================================================
# Transforms aspect_categorization.pkl ‚Üí train/val/test splits
# ready for the PyTorch DataLoader.
# ==============================================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

# Label encoding map (explicit is better than implicit)
LABEL2ID = {"negative": 0, "positive": 1}
ID2LABEL = {v: k for k, v in LABEL2ID.items()}


def load_and_prepare_data(cfg: TrainingConfig) -> dict:
    """Load aspect segments and split into train / val / test.

    Why stratified split:
        With 89/11 class imbalance, a random split can accidentally create
        a val/test set with zero or near-zero negative samples. Stratification
        guarantees each split mirrors the overall class ratio.

    Why we filter out multi-aspect segments for training:
        When a segment maps to multiple aspects (e.g., [FOOD, AMBIENCE]),
        the weak label (derived from the whole review) is even noisier for
        that segment. We keep only single-aspect segments for cleaner
        supervision signal. Multi-aspect segments can still be predicted
        at inference time.

    Why we exclude gold standard review IDs:
        To prevent data leakage, we filter out any training segments that
        come from reviews already in the gold standard dataset. This is
        best-effort: if the gold file cannot be read or the ID column is
        missing, a message is printed and the data is used unfiltered.

    Side effects:
        When ``cfg.class_weights`` is None it is replaced in place with the
        balanced weights computed from the training split.

    Args:
        cfg: TrainingConfig instance.

    Returns:
        Dict with keys: 'train', 'val', 'test' - each a DataFrame.

    Raises:
        ValueError: If no segments remain after gold filtering, or if any
            ``Sentiment_Label`` value is not a key of ``LABEL2ID``.
    """
    print("\n" + "=" * 70)
    print("LOADING & SPLITTING DATA")
    print("=" * 70)

    # NOTE(security): unpickling can execute arbitrary code; only point
    # cfg.data_path at files you produced yourself.
    df = pd.read_pickle(cfg.data_path)
    print(f"  Raw segments loaded: {len(df):,}")

    # --- PREVENT DATA LEAKAGE: Exclude gold standard review IDs ----------
    print(f"\n  ‚ö†Ô∏è  DATA LEAKAGE PREVENTION:")
    print(f"  Loading gold standard to identify held-out review IDs...")

    try:
        gold_df = pd.read_csv(cfg.gold_data_path)

        # Gold dataset may have 'Review_ID' or 'Original_Review_ID' column
        if 'Original_Review_ID' in gold_df.columns:
            gold_review_ids = set(gold_df['Original_Review_ID'].unique())
        elif 'Review_ID' in gold_df.columns:
            gold_review_ids = set(gold_df['Review_ID'].unique())
        else:
            print(f"  ‚ö†Ô∏è  Warning: Could not find review ID column in gold dataset")
            print(f"     Available columns: {list(gold_df.columns)}")
            print(f"     Proceeding without filtering (may cause data leakage!)")
            gold_review_ids = set()

        print(f"  ‚úì Gold dataset loaded: {len(gold_df):,} annotations")
        print(f"  ‚úì Unique review IDs in gold: {len(gold_review_ids):,}")

        # Filter out segments from gold review IDs
        n_before = len(df)
        df = df[~df['Original_Review_ID'].isin(gold_review_ids)].copy()
        n_after = len(df)
        n_removed = n_before - n_after
        # Guard the percentage so an empty input frame cannot divide by zero.
        pct_removed = (n_removed / n_before * 100) if n_before else 0.0

        print(f"  ‚úì Filtered out {n_removed:,} segments from gold reviews ({pct_removed:.1f}%)")
        print(f"  ‚úì Training segments remaining: {n_after:,}")

    except Exception as e:
        # Deliberate catch-all: gold filtering is best-effort and a failure
        # here is reported but must not stop training.
        print(f"  ‚úó Error loading gold dataset: {e}")
        print(f"     Proceeding without filtering (may cause data leakage!)")

    print(f"\n  Final training data: {len(df):,} segments")

    if df.empty:
        raise ValueError(
            "No segments available after loading/filtering; cannot build splits."
        )

    # --- Filter to single-aspect segments for cleaner weak supervision ----
    df["num_aspects"] = df["Aspect_Labels"].apply(len)
    df_single = df[df["num_aspects"] == 1].copy()
    df_single["aspect"] = df_single["Aspect_Labels"].apply(lambda x: x[0])

    n_multi = len(df) - len(df_single)
    pct_retained = (len(df_single) / len(df)) * 100

    print(f"\n  ‚ö†Ô∏è  FILTERING STRATEGY (for training only):")
    print(f"    Single-aspect segments:  {len(df_single):>7,} ({pct_retained:>5.1f}%) ‚Üí KEPT for training")
    print(f"    Multi-aspect segments:   {n_multi:>7,} ({100-pct_retained:>5.1f}%) ‚Üí DROPPED from training")
    print(f"    Segments remaining:      {len(df_single):>7,}")
    print(f"\n  üìä For inference/visualization: Use the FULL dataset (all segments)")
    print(f"     including multi-aspect ones to get complete review coverage.")

    # --- Encode labels ---------------------------------------------------
    df_single["label"] = df_single["Sentiment_Label"].map(LABEL2ID)

    # Sanity check: no NaN labels. Raised explicitly (not via assert) so the
    # check survives `python -O`.
    if df_single["label"].isna().any():
        raise ValueError(
            "Found NaN labels! Check Sentiment_Label column values."
        )

    # Print one line per class. The previous implementation ran
    # str.replace("0", ...) / ("1", ...) over the whole rendered table, which
    # also rewrote every 0 and 1 digit inside the counts themselves.
    print(f"\n  Label distribution:")
    label_counts = df_single["label"].value_counts().sort_index()
    for class_id, count in label_counts.items():
        class_id = int(class_id)
        print(f"    {class_id} ({ID2LABEL[class_id]}): {count:>8,}")

    # --- Train / Val / Test split (two-stage stratified) -----------------
    # Stage 1: Separate test set
    df_trainval, df_test = train_test_split(
        df_single,
        test_size=cfg.test_size,
        stratify=df_single["label"],
        random_state=cfg.random_seed,
    )
    # Stage 2: Split remainder into train + val.
    # val_size is a fraction of the full data, so rescale it to the fraction
    # of what is left after removing the test set.
    adjusted_val_size = cfg.val_size / (1.0 - cfg.test_size)
    df_train, df_val = train_test_split(
        df_trainval,
        test_size=adjusted_val_size,
        stratify=df_trainval["label"],
        random_state=cfg.random_seed,
    )

    splits = {"train": df_train, "val": df_val, "test": df_test}

    print(f"\n  Split sizes:")
    for name, split_df in splits.items():
        pos = (split_df["label"] == 1).sum()
        neg = (split_df["label"] == 0).sum()
        print(f"    {name:<6}: {len(split_df):>7,} rows "
              f"| pos: {pos:,} ({pos/len(split_df)*100:.1f}%) "
              f"| neg: {neg:,} ({neg/len(split_df)*100:.1f}%)")

    # --- Compute Class Weights Dynamically -------------------------------
    # Only when no manual weights are set in the config.
    if cfg.class_weights is None:
        # Using inverse frequency weighting: n_samples / (n_classes * np.bincount(y))
        print(f"\n  Computing class weights from training data...")
        class_weights = compute_class_weight(
            class_weight='balanced',
            classes=np.unique(df_train["label"]),
            y=df_train["label"]
        )
        cfg.class_weights = class_weights.tolist()
        print(f"  Class weights (auto-computed balanced):")
    else:
        print(f"\n  Using manually-set class weights from config:")

    for class_id, weight in enumerate(cfg.class_weights):
        print(f"    {ID2LABEL[class_id]:<10} (class {class_id}): {weight:.4f}")

    print(f"\n  Interpretation:")
    print(f"    Higher weight = minority class ‚Üí model penalized more for errors")
    if cfg.class_weights == [6.0, 1.0]:
        print(f"    ‚ö†Ô∏è  AGGRESSIVE WEIGHTING: Negative class errors cost 6x more!")
    print(f"    Formula (when auto): n_samples / (n_classes √ó count_per_class)")

    return splits

DATA = load_and_prepare_data(CFG)




LOADING & SPLITTING DATA
  Raw segments loaded: 132,637
  Single-aspect segments (for training): 100,557
  Multi-aspect segments (dropped for training): 32,080

  Label distribution:
label
  0 (negative)     7673
  1 (positive)    92884

  Split sizes:
    train :  75,417 rows | pos: 69,662 (92.4%) | neg: 5,755 (7.6%)
    val   :  10,056 rows | pos: 9,289 (92.4%) | neg: 767 (7.6%)
    test  :  15,084 rows | pos: 13,933 (92.4%) | neg: 1,151 (7.6%)


# STAGE 3: PyTorch Dataset Class

In [None]:
# ==============================================================================
# Wraps a DataFrame split into a tokenized, aspect-conditioned dataset.
# ==============================================================================

import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding


class ABSADataset(Dataset):
    """Map-style dataset yielding aspect-conditioned, tokenized segments.

    Each item is built from one DataFrame row as the single string
    ``"<ASPECT> </s></s> <segment>"`` (aspect upper-cased), which is then
    passed through the tokenizer. ``</s>`` is XLM-RoBERTa's separator token
    (the counterpart of BERT's [SEP]); the doubled form marks the boundary
    between the aspect prompt and the segment text.

    Example:
        Input text:  "FOOD </s></s> the nasi lemak was incredibly sedap"
        Label:       1 (positive)

    Args:
        df: DataFrame with columns ['aspect', 'Segment', 'label'].
        tokenizer: HuggingFace tokenizer for xlm-roberta.
        max_length: Maximum token length (default 128); longer inputs are
            truncated.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        tokenizer: AutoTokenizer,
        max_length: int = 128,
    ):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Plain Python lists make per-index access cheap in __getitem__.
        self.texts = df["Segment"].tolist()
        self.aspects = df["aspect"].tolist()
        self.labels = df["label"].tolist()

    def __len__(self) -> int:
        return len(self.texts)

    def __getitem__(self, idx: int) -> dict:
        """Tokenize the (aspect, segment) pair at ``idx`` on the fly.

        No padding is applied here (``padding=False``) and plain Python
        lists are returned (``return_tensors=None``), so batching and
        padding are left to whatever collate function the DataLoader uses.

        Returns:
            Dict with 'input_ids', 'attention_mask' and 'labels'.
        """
        # The aspect is upper-cased so it stands out as a conditioning
        # prompt rather than ordinary review text.
        prompt = "{} </s></s> {}".format(
            self.aspects[idx].upper(), self.texts[idx]
        )

        tokens = self.tokenizer(
            prompt,
            max_length=self.max_length,
            truncation=True,
            padding=False,
            return_tensors=None,
        )

        item = {key: tokens[key] for key in ("input_ids", "attention_mask")}
        item["labels"] = self.labels[idx]
        return item


def build_dataloaders(data: dict, cfg: TrainingConfig) -> dict:
    """Instantiate tokenizer, datasets, and DataLoaders.

    Args:
        data: Dict with 'train', 'val', 'test' DataFrames.
        cfg: TrainingConfig.

    Returns:
        Dict with one DataLoader per key of ``data`` plus the tokenizer
        under the extra key 'tokenizer'.
    """
    print("\n" + "=" * 70)
    print("BUILDING DATALOADERS")
    print("=" * 70)

    # Load tokenizer (downloads the tokenizer files on first run)
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    print(f"  ‚úì Tokenizer loaded: {cfg.model_name}")
    print(f"    Vocab size: {tokenizer.vocab_size:,}")

    # Dynamic padding: each batch is padded only to its own longest sample.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Loader settings that do not depend on the split are computed once,
    # outside the loop.
    import os
    # Worker processes on POSIX hosts (Linux/Colab); 0 elsewhere (Windows).
    workers = 2 if os.name == 'posix' else 0
    # Pinned host memory only helps host-to-GPU copies; requesting it
    # without CUDA just makes PyTorch emit a warning.
    use_pinned_memory = torch.cuda.is_available()

    loaders = {}
    datasets_info = {}

    for split_name, df in data.items():
        dataset = ABSADataset(df, tokenizer, cfg.max_seq_length)

        loaders[split_name] = torch.utils.data.DataLoader(
            dataset,
            batch_size=cfg.batch_size,
            shuffle=(split_name == "train"),  # only the training set is shuffled
            collate_fn=data_collator,
            num_workers=workers,
            pin_memory=use_pinned_memory,
        )
        datasets_info[split_name] = len(dataset)

    print(f"\n  Dataset sizes & batches:")
    for name, size in datasets_info.items():
        # len(DataLoader) is the number of batches (last partial batch kept).
        n_batches = len(loaders[name])
        print(f"    {name:<6}: {size:>7,} samples ‚Üí {n_batches:>4,} batches "
              f"(batch_size={cfg.batch_size})")

    # Added after the report loop so it is never mistaken for a data split.
    loaders["tokenizer"] = tokenizer

    # --- Quick sanity check: decode one sample ---------------------------
    sample_batch = next(iter(loaders["train"]))
    sample_text = tokenizer.decode(
        sample_batch["input_ids"][0], skip_special_tokens=False
    )
    print(f"\n  Sample input (decoded):")
    print(f"    \"{sample_text}\"")
    print(f"    Label: {ID2LABEL[sample_batch['labels'][0].item()]}")

    return loaders


LOADERS = build_dataloaders(DATA, CFG)


BUILDING DATALOADERS
  ‚úì Tokenizer loaded: xlm-roberta-base
    Vocab size: 250,002

  Dataset sizes & batches:
    train :  75,417 samples ‚Üí 2,357 batches (batch_size=32)
    val   :  10,056 samples ‚Üí  315 batches (batch_size=32)
    test  :  15,084 samples ‚Üí  472 batches (batch_size=32)

  Sample input (decoded):
    "<s> FOOD</s></s> cooked to perfection</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>"
    Label: positive


# STAGE 4: Model Definition

In [None]:
# ==============================================================================
# XLM-RoBERTa + a 2-class classification head + class-weighted loss.
# ==============================================================================

from transformers import AutoModelForSequenceClassification
import torch.nn as nn
import torch.nn.functional as F


class FocalLoss(nn.Module):
    """Focal Loss for handling extreme class imbalance (Lin et al., 2017).

    Academic Justification:
        Standard cross-entropy gives equal weight to all samples. With a
        heavily skewed label distribution most gradients come from
        easy-to-classify majority samples. Focal loss down-weights easy
        examples and focuses learning on hard cases.

    Formula (per sample, then averaged over the batch):
        FL = alpha * w[y] * (1 - p_t)^gamma * (-log p_t)
        where p_t is the softmax probability assigned to the true class y
        and w[y] is the optional per-class weight.

    Parameters:
        - alpha: Scalar factor applied to every sample. Being a single
          scalar it rescales the loss uniformly; per-class emphasis comes
          from ``weight``.
        - gamma: Focusing parameter (2.0 = strongly down-weight easy samples).
        - weight: Optional 1-D tensor of per-class weights.

    Why gamma=2.0:
        - Easy sample (p_t=0.99): factor = (1-0.99)^2 = 0.0001 (nearly ignored)
        - Hard sample (p_t=0.51): factor = (1-0.51)^2 = 0.24
    """
    def __init__(self, alpha=0.25, gamma=2.0, weight=None):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        # Registered as a buffer so it follows the module across devices.
        # Named 'class_weight' to stay clear of the conventional 'weight'
        # attribute name. register_buffer accepts None, which keeps the
        # attribute defined in both cases.
        self.register_buffer('class_weight', weight)

    def forward(self, inputs, targets):
        """Compute focal loss.

        Args:
            inputs: Logits from model (batch_size, num_classes)
            targets: Ground truth labels (batch_size,)

        Returns:
            Scalar loss value
        """
        # p_t must be the model's actual probability for the true class, so
        # it is derived from the UNWEIGHTED cross-entropy: p_t = exp(-CE).
        # Deriving it from the weighted CE would give p_t ** w[y] instead,
        # which distorts the focal term whenever w[y] != 1.
        plain_ce = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-plain_ce)

        # The class weight scales the loss term only.
        if self.class_weight is not None:
            ce_loss = F.cross_entropy(
                inputs, targets, reduction='none', weight=self.class_weight
            )
        else:
            ce_loss = plain_ce

        # Focal term (1 - p_t)^gamma: easy samples (high p_t) contribute
        # little, hard samples contribute most.
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss

        return focal_loss.mean()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



BUILDING MODEL
  ‚úì Model loaded on cuda
    Total params:      278,045,186
    Trainable params:  278,045,186
    Class weights:    [4.5448, 0.5618]


In [None]:
class ABSASentimentClassifier(nn.Module):
    """Sequence classifier wrapping a pre-trained backbone plus a focal loss.

    The backbone is loaded through ``AutoModelForSequenceClassification``,
    so the classification head comes with the checkpoint wrapper instead of
    being hand-built here. The forward pass asks the backbone for logits
    only and computes the loss itself with ``FocalLoss``.

    The class weights from the config are registered as a buffer so that
    they follow the module when ``.to(device)`` is called.

    Args:
        cfg: TrainingConfig (reads ``model_name``, ``num_labels`` and
            ``class_weights``).
    """

    def __init__(self, cfg: TrainingConfig):
        super().__init__()
        self.backbone = AutoModelForSequenceClassification.from_pretrained(
            cfg.model_name,
            num_labels=cfg.num_labels,
        )
        # A buffer is not a parameter: the optimizer never updates it, but
        # it moves with the module across devices.
        weight_tensor = torch.tensor(cfg.class_weights, dtype=torch.float)
        self.register_buffer("class_weights", weight_tensor)
        # Focal loss (alpha=0.25, gamma=2.0) using the registered weights.
        self.loss_fn = FocalLoss(alpha=0.25, gamma=2.0, weight=self.class_weights)

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        labels: torch.Tensor = None,
    ) -> dict:
        """Run the backbone and, when labels are given, compute the loss.

        Args:
            input_ids: Token IDs (batch_size, seq_len).
            attention_mask: 1 for real tokens, 0 for padding (batch_size, seq_len).
            labels: Ground truth labels (batch_size,). Optional; when None
                the returned loss is None (useful for inference).

        Returns:
            Dict with 'loss' (None without labels) and 'logits'.
        """
        # Labels are deliberately not forwarded to the backbone: the loss is
        # computed here with the focal loss instead.
        logits = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).logits  # (batch_size, num_labels)

        loss = self.loss_fn(logits, labels) if labels is not None else None
        return {"loss": loss, "logits": logits}


def build_model(cfg: TrainingConfig) -> ABSASentimentClassifier:
    """Create the classifier, move it to ``DEVICE`` and print a summary.

    Args:
        cfg: TrainingConfig passed straight to ``ABSASentimentClassifier``.

    Returns:
        The model, already placed on the module-level ``DEVICE``.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("BUILDING MODEL")
    print(banner)

    model = ABSASentimentClassifier(cfg).to(DEVICE)

    # Parameter counts: everything vs. only what the optimizer may update.
    sizes = [(param.numel(), param.requires_grad) for param in model.parameters()]
    total_params = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, trainable in sizes if trainable)

    print(f"  ‚úì Model loaded on {DEVICE}")
    print(f"    Total params:     {total_params:>12,}")
    print(f"    Trainable params: {trainable_params:>12,}")
    print(f"    Class weights:    {cfg.class_weights}")
    print(f"    Loss function:    Focal Loss (alpha=0.25, gamma=2.0)")

    return model


MODEL = build_model(CFG)

# STAGE 5: Training Loop

In [None]:
import os
import json
import math
import time
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, f1_score, classification_report
from torch.cuda.amp import autocast, GradScaler
from tqdm.auto import tqdm  

def compute_metrics(predictions: list, labels: list) -> dict:
    """Compute accuracy, macro-F1, and per-class F1.

    Why macro-F1 over accuracy:
        With a heavily imbalanced dataset, a model that always predicts
        "positive" still scores a high accuracy. Macro-F1 weights both
        classes equally, so it actually measures whether the model learned
        the minority class.

    Args:
        predictions: List of predicted class IDs.
        labels: List of ground-truth class IDs.

    Returns:
        Dict with 'accuracy', 'macro_f1', 'neg_f1', 'pos_f1', each a plain
        Python float rounded to 4 decimals.
    """
    # Both F1 calls pin labels=[0, 1] so that macro-F1 and the per-class
    # scores always describe the same two classes, even when one of them is
    # absent from the batch of predictions/labels. zero_division=0 keeps the
    # score of an undefined class at 0 without emitting a warning.
    class_ids = [0, 1]
    acc = accuracy_score(labels, predictions)
    macro_f1 = f1_score(
        labels, predictions, average="macro", labels=class_ids, zero_division=0
    )
    per_class_f1 = f1_score(
        labels, predictions, average=None, labels=class_ids, zero_division=0
    )
    # float() turns NumPy scalars into builtin floats for clean logging.
    return {
        "accuracy": round(float(acc), 4),
        "macro_f1": round(float(macro_f1), 4),
        "neg_f1": round(float(per_class_f1[0]), 4),
        "pos_f1": round(float(per_class_f1[1]), 4),
    }


def train_epoch(model, dataloader, optimizer, scheduler, device) -> dict:
    """Run one full training epoch.

    Args:
        model: Module whose forward returns a dict containing 'loss'.
        dataloader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, after the optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        Dict with 'loss' (average over all batches, rounded to 6 decimals).

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    # Wrap the dataloader so the loop below drives the progress bar.
    # Iterating the bar itself (not the raw dataloader) is what makes it
    # advance and close when the epoch ends.
    progress_bar = tqdm(dataloader, desc="  Training", leave=False)

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs["loss"]

        # Backward pass
        loss.backward()
        # Gradient clipping: prevents exploding gradients in transformers
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # Read the scalar once; every .item() call forces a device sync.
        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1

        progress_bar.set_postfix({'loss': batch_loss})

    if num_batches == 0:
        raise ValueError("train_epoch received an empty dataloader.")

    return {"loss": round(total_loss / num_batches, 6)}


@torch.no_grad()
def evaluate(model, dataloader, device) -> dict:
    """Score the model on a dataloader without tracking gradients.

    The model is switched to eval mode, every batch is run with its labels
    so the model also returns a loss, and the argmax predictions are
    collected for ``compute_metrics``.

    Returns:
        Dict with 'loss', 'accuracy', 'macro_f1', 'neg_f1', 'pos_f1'.
    """
    model.eval()
    loss_sum = 0.0
    batch_count = 0
    predicted_ids = []
    gold_ids = []

    for batch in dataloader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        targets = batch["labels"].to(device)

        result = model(input_ids=ids, attention_mask=mask, labels=targets)

        loss_sum += result["loss"].item()
        batch_count += 1

        # Highest-scoring class per sample is the prediction.
        batch_preds = torch.argmax(result["logits"], dim=-1)
        predicted_ids += batch_preds.cpu().numpy().tolist()
        gold_ids += targets.cpu().numpy().tolist()

    metrics = compute_metrics(predicted_ids, gold_ids)
    # Mean of the per-batch losses, added alongside the classification scores.
    metrics["loss"] = round(loss_sum / batch_count, 6)

    return metrics


def train(model, loaders, cfg, device) -> dict:
    """Full training loop with early stopping on val macro-F1.

    Why early stopping on macro-F1 (not loss):
        Validation loss can continue decreasing even as the model starts
        overfitting to the majority class. Macro-F1 directly measures
        what we care about: balanced performance on both classes.

    After training, the best checkpoint written during THIS run is reloaded
    and evaluated once on the weak test split; the history is then written
    to ``cfg.metrics_path`` as JSON.

    Args:
        model: ABSASentimentClassifier (already on device).
        loaders: Dict with 'train', 'val', 'test' DataLoaders.
        cfg: TrainingConfig. Reads learning_rate, weight_decay, num_epochs,
            warmup_ratio, best_model_path and metrics_path.
        device: torch.device.

    Returns:
        Dict with full training history (for plotting / logging):
        {'train': [per-epoch metrics], 'val': [per-epoch metrics],
         'test': metrics of the final test evaluation}.
    """
    print("\n" + "=" * 70)
    print("TRAINING")
    print("=" * 70)

    # --- Optimizer: AdamW (standard for transformer fine-tuning) --------
    optimizer = AdamW(
        model.parameters(),
        lr=cfg.learning_rate,
        weight_decay=cfg.weight_decay,
    )

    # --- Learning rate scheduler: linear warm-up then linear decay -------
    # Why: Transformers are sensitive to LR. A warm-up phase prevents
    # catastrophic early updates to pre-trained weights.
    total_steps = len(loaders["train"]) * cfg.num_epochs
    warmup_steps = int(total_steps * cfg.warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )
    print(f"  Total training steps:  {total_steps:,}")
    print(f"  Warmup steps:          {warmup_steps:,}")

    # --- Output directory -------------------------------------------------
    os.makedirs(os.path.dirname(cfg.best_model_path) or ".", exist_ok=True)
    os.makedirs(os.path.dirname(cfg.metrics_path) or ".", exist_ok=True)

    # --- Training history -------------------------------------------------
    history = {"train": [], "val": [], "test": None}
    best_val_f1 = -1.0
    patience = 5   # Increased from 2 to 5 - minority class needs more time
    patience_counter = 0
    # Tracks whether THIS run wrote a checkpoint, so a stale file left at
    # cfg.best_model_path by an earlier run is never loaded by mistake.
    checkpoint_saved = False

    for epoch in range(1, cfg.num_epochs + 1):
        epoch_start = time.time()

        # --- Train ---
        train_metrics = train_epoch(model, loaders["train"], optimizer,
                                    scheduler, device)
        history["train"].append(train_metrics)

        # --- Validate ---
        val_metrics = evaluate(model, loaders["val"], device)
        history["val"].append(val_metrics)

        epoch_time = time.time() - epoch_start

        # --- Log -------------------------------------------------------------
        print(f"\n  Epoch {epoch}/{cfg.num_epochs}  ({epoch_time:.1f}s)")
        print(f"    Train Loss:      {train_metrics['loss']:.6f}")
        print(f"    Val  Loss:       {val_metrics['loss']:.6f}")
        print(f"    Val  Accuracy:   {val_metrics['accuracy']:.4f}")
        print(f"    Val  Macro-F1:   {val_metrics['macro_f1']:.4f}  "
              f"(neg: {val_metrics['neg_f1']:.4f} | "
              f"pos: {val_metrics['pos_f1']:.4f})")

        # --- Early stopping & best-model checkpoint -------------------------
        if val_metrics["macro_f1"] > best_val_f1:
            best_val_f1 = val_metrics["macro_f1"]
            patience_counter = 0
            # Save only the model state_dict (not the whole object)
            torch.save(model.state_dict(), cfg.best_model_path)
            checkpoint_saved = True
            print(f"    ★ New best model saved  (macro-F1: {best_val_f1:.4f})")
        else:
            patience_counter += 1
            print(f"    ✗ No improvement. Patience: {patience_counter}/{patience}")
            if patience_counter >= patience:
                print(f"\n  ⚡ Early stopping at epoch {epoch}.")
                break

    # --- Final evaluation on WEAK TEST set (only once, after training) ------
    # Load the best checkpoint before evaluating
    print("\n" + "-" * 70)
    print("  LOADING BEST MODEL FOR WEAK TEST SET EVALUATION")
    print("-" * 70)
    if checkpoint_saved:
        model.load_state_dict(torch.load(cfg.best_model_path, map_location=device))
    else:
        # Happens when no epoch ran or val macro-F1 never compared greater
        # than -1 (e.g. NaN). Fall back to the weights currently in memory.
        print("  WARNING: no checkpoint was saved during this run; "
              "evaluating the current in-memory weights instead.")

    test_metrics = evaluate(model, loaders["test"], device)
    history["test"] = test_metrics

    print(f"\n  ★ WEAK TEST SET RESULTS (15% split from training data):")
    print(f"    Test Loss:       {test_metrics['loss']:.6f}")
    print(f"    Test Accuracy:   {test_metrics['accuracy']:.4f}")
    print(f"    Test Macro-F1:   {test_metrics['macro_f1']:.4f}")
    print(f"      Negative F1:   {test_metrics['neg_f1']:.4f}")
    print(f"      Positive  F1:  {test_metrics['pos_f1']:.4f}")

    # ------------ Full classification report --------------
    # evaluate() only returns aggregate metrics, so the test set is run a
    # second time here to collect the raw predictions for the report.
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loaders["test"]:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

            preds = torch.argmax(outputs["logits"], dim=-1)
            all_preds.extend(preds.cpu().numpy().tolist())
            all_labels.extend(labels.cpu().numpy().tolist())

    print(f"\n  Classification Report:")
    # Pin the label ids so the report does not raise when only one class
    # occurs in labels + predictions.
    # NOTE(review): assumes ids 0 = negative, 1 = positive, which is the order
    # the two target_names already relied on - confirm against LABEL2ID.
    print(classification_report(
        all_labels, all_preds,
        labels=[0, 1],
        target_names=["Negative", "Positive"],
    ))

    # --- Save training history as JSON ------------------------------------
    with open(cfg.metrics_path, "w", encoding="utf-8") as f:
        json.dump(history, f, indent=2)
    print(f"  ✓ Training metrics saved to: {cfg.metrics_path}")

    return history

# STAGE 6: Execute Training

In [None]:
# Run the full fine-tuning loop; HISTORY keeps the dict returned by train().
HISTORY = train(MODEL, LOADERS, CFG, DEVICE)

# Closing banner, followed by the artifact locations taken from CFG.
print("\n" + "=" * 70, "TRAINING COMPLETE", "=" * 70, sep="\n")
print(
    f"  Best model:   {CFG.best_model_path}",
    f"  Metrics file: {CFG.metrics_path}",
    "=" * 70,
    sep="\n",
)

# STAGE 7: Model Evaluation on the Gold Standard Set

In [None]:
def evaluate_on_gold(model, cfg: TrainingConfig, tokenizer, device) -> dict:
    """Evaluate trained model on the Gold Standard (manually-annotated) dataset.

    Why separate gold evaluation:
        - Training uses WEAK labels (star ratings) with inherent noise
        - Gold standard has HUMAN-ANNOTATED labels (ground truth)
        - We evaluate on gold separately to measure TRUE model performance

    Academic Justification:
        Following the evaluation protocol of Sun et al. (2019) and Pontiki et al.
        (2016), we report performance on a gold standard test set annotated by
        domain experts. This accounts for label noise in the weak supervision
        training set and provides trustworthy F1 scores for the final thesis.

    Pipeline:
        1. Read the CSV at ``cfg.gold_data_path`` and rename the
           ``Manual_Aspect`` / ``Manual_Sentiment`` columns.
        2. Parse the aspect column and explode multi-aspect rows into one
           row per (segment, aspect) pair.
        3. Map sentiment strings through ``LABEL2ID``; rows that cannot be
           mapped are reported and dropped.
        4. Run the model over the rows (no shuffling, no gradients) and
           compute overall and per-aspect metrics with ``compute_metrics``.

    Args:
        model: Trained ABSASentimentClassifier (already on best checkpoint)
        cfg: TrainingConfig; reads gold_data_path, max_seq_length, batch_size
        tokenizer: XLM-RoBERTa tokenizer
        device: torch.device (GPU or CPU)

    Returns:
        On success, a dict with 'overall' (metrics dict), 'per_aspect'
        (aspect -> metrics dict), 'n_samples' and 'aspects'.
        If the file cannot be read, or no row has a usable sentiment label,
        ``{"error": <message>}`` is returned instead.
    """
    import ast
    from collections import Counter, defaultdict

    print(f"\n  Loading gold standard from: {cfg.gold_data_path}")

    # --- Load and preprocess gold data ---
    try:
        gold_df = pd.read_csv(cfg.gold_data_path)
    except Exception as e:
        # Deliberate best-effort: report and hand the error back to the caller.
        print(f"  ✗ Error loading gold data: {e}")
        return {"error": str(e)}
    print(f"  ✓ Gold dataset loaded: {len(gold_df):,} rows (before exploding multi-aspect)")

    # Rename columns to match training format
    # Input columns: Segment, Manual_Aspect, Manual_Sentiment
    gold_df_prep = gold_df.rename(columns={
        "Manual_Aspect": "aspect",
        "Manual_Sentiment": "Sentiment_Label",
    })

    # Normalize sentiment labels: strip stray whitespace and lowercase
    # (handles 'POSITIVE'/'NEGATIVE' vs 'positive'/'negative' and 'Positive ').
    gold_df_prep["Sentiment_Label"] = (
        gold_df_prep["Sentiment_Label"].str.strip().str.lower()
    )

    # --- Handle multi-aspect segments: explode into separate rows -------
    # Aspect is stored as string representation of list (e.g., "['FOOD', 'VALUE']"),
    # convert to actual list
    def parse_aspect(val):
        """Return the aspect cell as a list of aspects."""
        if isinstance(val, (list, tuple)):
            return list(val)
        if not isinstance(val, str):
            return [str(val)]
        try:
            # Try to parse as Python literal (handles "['FOOD', 'VALUE']")
            parsed = ast.literal_eval(val)
        except (ValueError, SyntaxError):
            # Already a plain string like "FOOD"
            return [val]
        if isinstance(parsed, (list, tuple)):
            # A tuple literal such as "('FOOD', 'VALUE')" is a multi-aspect
            # cell too; expand it instead of treating it as a single aspect.
            return list(parsed)
        return [parsed]  # Single literal value

    gold_df_prep["aspect"] = gold_df_prep["aspect"].apply(parse_aspect)

    # Count single vs multi-aspect rows BEFORE exploding
    n_single_aspect = sum(len(aspects) == 1 for aspects in gold_df_prep["aspect"])
    n_multi_aspect = len(gold_df_prep) - n_single_aspect

    # Explode: one row per aspect (same segment can appear multiple times)
    gold_df_exploded = gold_df_prep.explode("aspect").reset_index(drop=True)

    print(f"    Original rows:        {len(gold_df):,}")
    print(f"      Single-aspect:      {n_single_aspect:,}")
    print(f"      Multi-aspect:       {n_multi_aspect:,}")
    print(f"    After exploding:      {len(gold_df_exploded):,} aspect-segment pairs")

    # Encode sentiment labels to numeric format
    gold_df_exploded["label"] = gold_df_exploded["Sentiment_Label"].map(LABEL2ID)

    # Sanity check: warn if any labels couldn't be mapped
    n_unmapped = gold_df_exploded["label"].isna().sum()
    if n_unmapped > 0:
        print(f"  ⚠️  Warning: {n_unmapped} rows with unmapped sentiment labels")
        print(f"     Available values: {gold_df_exploded['Sentiment_Label'].unique()}")
        print(f"     Expected: {list(LABEL2ID.keys())}")
        # Drop unmapped rows and re-number the index so positional and
        # label-based row access agree again.
        gold_df_exploded = (
            gold_df_exploded.dropna(subset=["label"]).reset_index(drop=True)
        )
        print(f"     Proceeding with {len(gold_df_exploded):,} valid samples")

    if gold_df_exploded.empty:
        message = "Gold dataset contains no rows with a valid sentiment label"
        print(f"  ✗ {message}")
        return {"error": message}

    # The NaNs introduced by unmapped labels turn the column into floats;
    # cast back so downstream code receives integer class ids.
    gold_df_exploded["label"] = gold_df_exploded["label"].astype(int)

    print(f"\n  Gold dataset label distribution:")
    for label_id in sorted(gold_df_exploded["label"].unique()):
        count = (gold_df_exploded["label"] == label_id).sum()
        pct = count / len(gold_df_exploded) * 100
        print(f"    {ID2LABEL[label_id]:<10}: {count:>5,} samples ({pct:>5.1f}%)")

    # Check aspect distribution in gold data
    print(f"\n  Gold dataset aspect distribution:")
    aspect_counts = gold_df_exploded["aspect"].value_counts()
    for aspect, count in aspect_counts.items():
        print(f"    {aspect:<16}: {count:>4} samples")

    # Use the exploded dataframe for evaluation
    gold_df_prep = gold_df_exploded

    # --- Create dataset and dataloader ---
    gold_dataset = ABSADataset(gold_df_prep, tokenizer, cfg.max_seq_length)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    gold_loader = torch.utils.data.DataLoader(
        gold_dataset,
        batch_size=cfg.batch_size,
        shuffle=False,  # keeps predictions aligned with the dataframe rows
        collate_fn=data_collator,
        num_workers=0,
    )

    # --- Inference on gold set (no gradients) ---
    print(f"\n  Running inference on gold set...")
    model.eval()
    all_preds = []
    all_labels = []
    # Report progress roughly five times over the whole pass.
    progress_every = max(1, len(gold_loader) // 5)

    with torch.no_grad():
        for batch_idx, batch in enumerate(gold_loader):
            if (batch_idx + 1) % progress_every == 0:
                print(f"    Progress: {batch_idx + 1}/{len(gold_loader)} batches")

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

            # Get predicted classes (argmax over logits)
            preds = torch.argmax(outputs["logits"], dim=-1)
            all_preds.extend(preds.cpu().numpy().tolist())
            all_labels.extend(labels.cpu().numpy().tolist())

    # Get corresponding aspects for per-aspect breakdown (same row order as
    # the un-shuffled loader above)
    all_aspects = gold_df_prep["aspect"].tolist()

    # --- DIAGNOSTIC: Check prediction distribution ---
    pred_counts = Counter(all_preds)
    label_counts = Counter(all_labels)

    print(f"\n  ⚠️  PREDICTION DIAGNOSTIC:")
    print(f"    Ground truth distribution:")
    for label_id in sorted(label_counts.keys()):
        count = label_counts[label_id]
        pct = count / len(all_labels) * 100
        print(f"      {ID2LABEL[label_id]:<10}: {count:>4} ({pct:>5.1f}%)")

    print(f"\n    Model prediction distribution:")
    for label_id in sorted(pred_counts.keys()):
        count = pred_counts[label_id]
        pct = count / len(all_preds) * 100
        print(f"      {ID2LABEL[label_id]:<10}: {count:>4} ({pct:>5.1f}%)")

    if len(pred_counts) == 1:
        only_class = list(pred_counts.keys())[0]
        print(f"\n    ❌ PROBLEM: Model predicts ONLY {ID2LABEL[only_class].upper()}!")
        print(f"       This indicates severe overfitting to the majority class.")
        print(f"       Possible causes:")
        print(f"         1. Domain shift: Gold data differs from training data")
        print(f"         2. Aspect mismatch: Check if aspects in gold match training")
        print(f"         3. Decision threshold: Model confidence too skewed")

    # --- Compute overall metrics ---
    overall_metrics = compute_metrics(all_preds, all_labels)

    # --- Per-aspect breakdown ---
    # Group sample positions by aspect in a single pass.
    indices_by_aspect = defaultdict(list)
    for idx, aspect in enumerate(all_aspects):
        indices_by_aspect[aspect].append(idx)

    aspects_unique = sorted(indices_by_aspect)
    per_aspect_metrics = {}

    for aspect in aspects_unique:
        sample_idx = indices_by_aspect[aspect]
        aspect_preds = [all_preds[i] for i in sample_idx]
        aspect_labels = [all_labels[i] for i in sample_idx]

        # Try to compute metrics; fall back if aspect has too few samples or predictions
        try:
            per_aspect_metrics[aspect] = compute_metrics(aspect_preds, aspect_labels)
        except (IndexError, ValueError):
            # Aspect has predictions of only one class - compute what we can
            acc = accuracy_score(aspect_labels, aspect_preds)
            per_aspect_metrics[aspect] = {
                "accuracy": round(acc, 4),
                "macro_f1": 0.0,  # Can't compute macro-F1 with single class
                "neg_f1": 0.0,
                "pos_f1": 0.0,
                "note": f"Single-class predictions ({len(sample_idx)} samples)"
            }

    # --- Format results for saving ---
    gold_results = {
        "overall": overall_metrics,
        "per_aspect": per_aspect_metrics,
        "n_samples": len(gold_df_prep),
        "aspects": aspects_unique,
    }

    # --- Print results for immediate feedback ---
    print(f"\n  {'='*70}")
    print(f"  ★ GOLD TEST SET RESULTS (Human-Annotated Ground Truth)")
    print(f"  ★ Total samples: {len(gold_df_prep):,} aspect-segment pairs")
    print(f"  {'='*70}")
    print(f"\n  OVERALL PERFORMANCE:")
    print(f"    Accuracy:  {overall_metrics['accuracy']:.4f}")
    print(f"    Macro-F1:  {overall_metrics['macro_f1']:.4f}")
    # These values are F1 scores, not recall, so they are labelled as such.
    print(f"      Negative F1:  {overall_metrics['neg_f1']:.4f}")
    print(f"      Positive F1:  {overall_metrics['pos_f1']:.4f}")

    print(f"\n  PER-ASPECT BREAKDOWN (for Kano Model analysis):")
    print(f"  {'Aspect':<16} {'Samples':>8} {'Accuracy':>10} {'Macro-F1':>10}")
    print(f"  {'-'*16} {'-'*8} {'-'*10} {'-'*10}")
    for aspect in aspects_unique:
        metrics = per_aspect_metrics[aspect]
        n_samples = len(indices_by_aspect[aspect])
        print(f"  {aspect:<16} {n_samples:>8} {metrics['accuracy']:>10.4f} {metrics['macro_f1']:>10.4f}")

    print(f"\n  FULL CLASSIFICATION REPORT (for thesis):")
    # Pin the label ids so the report does not raise when only one class
    # occurs in labels + predictions.
    # NOTE(review): assumes ids 0 = negative, 1 = positive, which is the order
    # the two target_names already relied on - confirm against LABEL2ID.
    print(classification_report(
        all_labels, all_preds,
        labels=[0, 1],
        target_names=["Negative", "Positive"],
        digits=4,
    ))

    return gold_results


In [None]:
# ==============================================================================
# GOLD STANDARD EVALUATION
# ==============================================================================
import json

print("\n" + "=" * 70)
print("EVALUATING ON GOLD STANDARD (GROUND TRUTH LABELS)")
print("=" * 70)

# Load best checkpoint for gold evaluation
MODEL.load_state_dict(torch.load(CFG.best_model_path, map_location=DEVICE))

# Run evaluation on manually-annotated gold dataset
gold_metrics = evaluate_on_gold(MODEL, CFG, LOADERS["tokenizer"], DEVICE)

# Save gold evaluation results to JSON (an error dict is saved as well, so a
# failed run leaves a trace on disk)
os.makedirs(os.path.dirname(CFG.gold_results_path) or ".", exist_ok=True)
with open(CFG.gold_results_path, "w", encoding="utf-8") as f:
    json.dump(gold_metrics, f, indent=2)

if "error" in gold_metrics:
    # evaluate_on_gold returns {"error": ...} instead of raising; report the
    # failure here rather than announcing a successful evaluation.
    print(f"\n✗ Gold evaluation FAILED: {gold_metrics['error']}")
    print(f"  Error details saved to: {CFG.gold_results_path}")
else:
    print(f"\n✓ Gold evaluation saved to: {CFG.gold_results_path}")
    print("\n" + "=" * 70)
    print("ALL EVALUATIONS COMPLETE")
    print("=" * 70)
    print(f"\n📊 RESULTS SUMMARY:")
    print(f"  Weak Test Set:   {CFG.metrics_path}")
    print(f"  Gold Test Set:   {CFG.gold_results_path}")
    print(f"  Best Model:      {CFG.best_model_path}")
    print("=" * 70)