In [1]:
import json, os, torch
import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import torch.nn as nn
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup
)
from sklearn.model_selection import train_test_split

In [14]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Build category lists
Enumerate every entity#attribute combination for restaurant, laptop, hotel, and OUT_OF_SCOPE domains, then prepare the bidirectional label maps `category2id` and `id2category` used by the classifier.

In [38]:
def build_category_list():
    # Restaurant
    restaurant_entities = [
        "RESTAURANT", "FOOD", "DRINKS", "AMBIENCE", "SERVICE", "LOCATION"
    ]
    restaurant_attrs = [
        "GENERAL", "PRICES", "QUALITY", "STYLE_OPTIONS", "MISCELLANEOUS"
    ]

    # Laptop
    laptop_entities = [
        "LAPTOP", "DISPLAY", "KEYBOARD", "MOUSE", "MOTHERBOARD", "CPU",
        "FANS_COOLING", "PORTS", "MEMORY", "POWER_SUPPLY", "OPTICAL_DRIVES",
        "BATTERY", "GRAPHICS", "HARD_DISK", "MULTIMEDIA_DEVICES", "HARDWARE",
        "SOFTWARE", "OS", "WARRANTY", "SHIPPING", "SUPPORT", "COMPANY"
    ]
    laptop_attrs = [
        "GENERAL", "PRICE", "QUALITY", "DESIGN_FEATURES",
        "OPERATION_PERFORMANCE", "USABILITY", "PORTABILITY",
        "CONNECTIVITY", "MISCELLANEOUS"
    ]

    # Hotel
    hotel_entities = [
        "HOTEL", "ROOMS", "FACILITIES", "ROOM_AMENITIES",
        "SERVICE", "LOCATION", "FOOD_DRINKS"
    ]
    hotel_attrs = [
        "GENERAL", "PRICE", "COMFORT", "CLEANLINESS",
        "QUALITY", "DESIGN_FEATURES",
        "STYLE_OPTIONS", "MISCELLANEOUS"
    ]

    # OUT_OF_SCOPE
    out_of_scope_entities = ["OUT_OF_SCOPE"]
    out_of_scope_attrs = [
        "GENERAL",
        "OPERATION_PERFORMANCE",
        "DESIGN_FEATURES",
        "USABILITY"
    ]

    categories = []

    # Restaurant
    for e in restaurant_entities:
        for a in restaurant_attrs:
            categories.append(f"{e}#{a}")

    # Laptop
    for e in laptop_entities:
        for a in laptop_attrs:
            categories.append(f"{e}#{a}")

    # Hotel
    for e in hotel_entities:
        for a in hotel_attrs:
            categories.append(f"{e}#{a}")

    # OUT_OF_SCOPE
    for e in out_of_scope_entities:
        for a in out_of_scope_attrs:
            categories.append(f"{e}#{a}")

    return categories


# Label mappings
ALL_CATEGORIES = build_category_list()
category2id = {cat: i for i, cat in enumerate(ALL_CATEGORIES)}
id2category = {i: cat for cat, i in category2id.items()}

NUM_LABELS = len(ALL_CATEGORIES)

## Input text helper
Concatenate the raw sentence with its aspect, opinion, and VA scores into a single prompt string expected by BERT.

In [39]:
def build_input(text, aspect, opinion, va):
    return (
        f"Text: {text} "
        f"Aspect: {aspect}. "
        f"Opinion: {opinion}. "
        f"VA: {va}"
    )

## PyTorch dataset for classification
Read the JSONL file, expand each quadruplet into a (text, label) pair, and tokenize with truncation/padding to fixed length for BERT inputs.

In [40]:
# DATASET PyTorch

class CategoryDataset(Dataset):
    def __init__(self, path, tokenizer, category2id, max_len=128):
        self.samples = []
        self.tokenizer = tokenizer
        self.max_len = max_len

        with open(path, encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                text = data["Text"]

                for t in data["Quadruplet"]:
                    input_text = build_input(
                        text,
                        t["Aspect"],
                        t["Opinion"],
                        t["VA"]
                    )

                    label = category2id[t["Category"]]

                    self.samples.append((input_text, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        text, label = self.samples[idx]

        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }


## Class weights helper
Compute inverse-frequency weights for cross-entropy to mitigate class imbalance.

In [41]:
def compute_class_weights(samples, num_labels):
    counts = Counter(label for _, label in samples)
    max_count = max(counts.values()) if counts else 1
    weights = [max_count / counts.get(i, 1) for i in range(num_labels)]
    return torch.tensor(weights, dtype=torch.float)

## Data splitting utility
Split the loaded samples into training and validation sets based on a specified ratio.

In [None]:
def split_data_from_file(file_path, train_ratio=0.8, random_state=42):
    samples = []
    labels = []

    with open(file_path, encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            text = data["Text"]

            for t in data["Quadruplet"]:
                input_text = build_input(
                    text,
                    t["Aspect"],
                    t["Opinion"],
                    t["VA"]
                )
                label = category2id[t["Category"]]
                samples.append((input_text, label))
                labels.append(label)

    try:
        train_samples, val_samples, _, _ = train_test_split(
            samples,
            labels,
            train_size=train_ratio,
            random_state=random_state,
            shuffle=True,
            stratify=labels
        )
        print("Using stratified split to preserve class balance")
    except ValueError as e:
        print(f"Stratification failed (some classes have too few samples), using regular split")
        train_samples, val_samples = train_test_split(
            samples,
            train_size=train_ratio,
            random_state=random_state,
            shuffle=True
        )

    return train_samples, val_samples


class CategoryDatasetFromSamples(Dataset):
    """
    Dataset that takes pre-loaded samples instead of reading from file.
    """
    def __init__(self, samples, tokenizer, max_len=128):
        self.samples = samples
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        text, label = self.samples[idx]

        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

## Initialize BERT model
Load `bert-base-uncased` for sequence classification and set the output dimension to match the number of category labels.

In [None]:
def build_model(num_labels):
    return BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=num_labels
    )

## Training loop and early stopping
Runs multiple epochs over the DataLoader and tracks loss + macro-F1.
- Per batch: (1) zero gradients, (2) forward pass with labels to get loss/logits, (3) backprop, (4) optimizer step, (5) accumulate loss and predictions.
- Per epoch: compute average loss, accuracy, and macro-F1 on all seen batches and log them.
- Early stopping: monitor macro-F1; if it fails to improve by at least `min_delta` over the best score for `patience` consecutive epochs, halt training to avoid overfitting and wasted compute.
- Returns: the best macro-F1 achieved before stopping.

In [None]:
def train_model(
    model,
    train_loader,
    val_loader,
    optimizer,
    scheduler,
    criterion,
    device,
    epochs=10,
    patience=3,
    min_delta=1e-4
):
    best_val_f1 = 0.0
    best_state = None
    epochs_no_improve = 0

    for epoch in range(epochs):
        model.train()

        total_loss = 0.0
        all_preds = []
        all_labels = []

        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            logits = outputs.logits
            loss = criterion(logits, labels)

            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        avg_loss = total_loss / len(train_loader)
        train_acc = accuracy_score(all_labels, all_preds)
        train_macro_f1 = f1_score(all_labels, all_preds, average="macro")

        # VALIDATION 
        model.eval()
        val_loss = 0.0
        val_preds = []
        val_labels = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

                logits = outputs.logits
                loss = criterion(logits, labels)

                val_loss += loss.item()

                preds = torch.argmax(logits, dim=1)
                val_preds.extend(preds.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

        val_avg_loss = val_loss / len(val_loader)
        val_acc = accuracy_score(val_labels, val_preds)
        val_macro_f1 = f1_score(val_labels, val_preds, average="macro")

        print(
            f"[Epoch {epoch+1}] "
            f"Train Loss: {avg_loss:.4f} | Train Acc: {train_acc:.4f} | Train Macro-F1: {train_macro_f1:.4f} "
            f"|| Val Loss: {val_avg_loss:.4f} | Val Acc: {val_acc:.4f} | Val Macro-F1: {val_macro_f1:.4f}"
        )

        # EARLY STOPPING ON VAL F1 
        if val_macro_f1 > best_val_f1 + min_delta:
            best_val_f1 = val_macro_f1
            best_state = {k: v.clone().detach().cpu() for k, v in model.state_dict().items()}
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            print(f"  - No validation improvement for {epochs_no_improve}/{patience} epochs")

        if epochs_no_improve >= patience:
            print(
                f"  - Early stopping at epoch {epoch+1}. "
                f"Best Val Macro-F1: {best_val_f1:.4f}"
            )
            break

    if best_state is not None:
        model.load_state_dict(best_state)

    return best_val_f1

## Prediction helper
Builds the combined input string for a single example, tokenizes it, performs a forward pass on the model, and returns the category with the highest logit (argmax).

In [45]:
def predict_category(model, tokenizer, device, text, aspect, opinion, va):
    model.eval()

    input_text = build_input(text, aspect, opinion, va)

    enc = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding=True
    )

    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        logits = model(**enc).logits

    pred_id = torch.argmax(logits, dim=1).item()
    return id2category[pred_id]

## Evaluate on validation samples
Evaluate the model on a pre-split set of validation samples (not from file).

In [47]:
def evaluate_on_samples(model, tokenizer, val_samples, device, batch_size=8):
    """
    Evaluate model on validation samples (list of (text, label) tuples).
    """
    val_dataset = CategoryDatasetFromSamples(
        samples=val_samples,
        tokenizer=tokenizer
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False
    )

    model.eval()

    total_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(val_loader)
    acc = accuracy_score(all_labels, all_preds)
    macro_f1 = f1_score(all_labels, all_preds, average="macro")

    print("\n=== VALIDATION RESULTS ===")
    print(f"Loss:      {avg_loss:.4f}")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Macro-F1:  {macro_f1:.4f}")

    return {
        "loss": avg_loss,
        "accuracy": acc,
        "macro_f1": macro_f1
    }

## Save model and tokenizer
Persist the trained weights and tokenizer configuration to disk so they can be reloaded without retraining.

In [48]:
def save_model(model, tokenizer, save_dir):
    os.makedirs(save_dir, exist_ok=True)
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

## Load a saved model
Recover the tokenizer and model from disk and move them to the active device for inference or evaluation.

In [49]:
def load_model(save_dir, device):
    tokenizer = BertTokenizer.from_pretrained(save_dir)
    model = BertForSequenceClassification.from_pretrained(save_dir)
    model.to(device)
    model.eval()
    return model, tokenizer

## End-to-end training pipeline
Define paths and device, instantiate tokenizer and dataset/dataloader, build the model and optimizer, run training with early stopping, save the best checkpoint, and finally evaluate the saved model on the chosen split.

In [None]:
def main():
    SAVE_DIR = "/content/drive/MyDrive/nlp_proiect/modele/bun_2"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    file_name = "/content/drive/MyDrive/nlp_proiect/nlp_data/eng_laptop_train_alltasks.jsonl"

    print("Splitting data into train (80%) and validation (20%)...")
    train_samples, val_samples = split_data_from_file(
        file_path=file_name,
        train_ratio=0.8,
        random_state=42
    )

    print(f"Train samples (80%): {len(train_samples)}")
    print(f"Validation samples (20%): {len(val_samples)}")

    class_weights = compute_class_weights(train_samples, NUM_LABELS).to(device)

    train_dataset = CategoryDatasetFromSamples(
        samples=train_samples,
        tokenizer=tokenizer
    )

    val_dataset = CategoryDatasetFromSamples(
        samples=val_samples,
        tokenizer=tokenizer
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=8,
        shuffle=True
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=8,
        shuffle=False
    )

    model = build_model(NUM_LABELS)
    model.to(device)

    optimizer = AdamW(
        model.parameters(),
        lr=3e-5,
        weight_decay=0.05
    )

    total_steps = len(train_loader) * 30
    warmup_steps = int(total_steps * 0.1)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    criterion = nn.CrossEntropyLoss(weight=class_weights)

    print("\nTRAINING WITH VAL MONITORING (80/20)")
    best_val_f1 = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        criterion=criterion,
        device=device,
        epochs=35,
        patience=3
    )

    print(f"\nBest validation Macro-F1: {best_val_f1:.4f}")

    save_model(model, tokenizer, SAVE_DIR)

    # Evaluate on remaining 20% of data
    print("\nFINAL EVALUATION ON 20% VALIDATION SET")
    evaluate_on_samples(
        model=model,
        tokenizer=tokenizer,
        val_samples=val_samples,
        device=device,
        batch_size=8
    )

In [27]:
if __name__ == "__main__":
    main()

Splitting data into train (80%) and validation (20%)...
⚠ Stratification failed (some classes have too few samples), using regular split
Train samples (80%): 4618
Validation samples (20%): 1155


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



TRAINING WITH VAL MONITORING (80/20)
[Epoch 1] Train Loss: 5.4258 | Train Acc: 0.0048 | Train Macro-F1: 0.0006 || Val Loss: 4.9643 | Val Acc: 0.0199 | Val Macro-F1: 0.0011
[Epoch 2] Train Loss: 4.5988 | Train Acc: 0.0784 | Train Macro-F1: 0.0183 || Val Loss: 4.4677 | Val Acc: 0.1628 | Val Macro-F1: 0.0421
[Epoch 3] Train Loss: 3.7936 | Train Acc: 0.3149 | Train Macro-F1: 0.0722 || Val Loss: 3.5926 | Val Acc: 0.4130 | Val Macro-F1: 0.1155
[Epoch 4] Train Loss: 2.9864 | Train Acc: 0.4465 | Train Macro-F1: 0.1340 || Val Loss: 3.1067 | Val Acc: 0.4424 | Val Macro-F1: 0.1490
[Epoch 5] Train Loss: 2.3891 | Train Acc: 0.5288 | Train Macro-F1: 0.1970 || Val Loss: 2.7612 | Val Acc: 0.4918 | Val Macro-F1: 0.1966
[Epoch 6] Train Loss: 2.0045 | Train Acc: 0.5760 | Train Macro-F1: 0.2551 || Val Loss: 2.6845 | Val Acc: 0.5186 | Val Macro-F1: 0.2163
[Epoch 7] Train Loss: 1.6937 | Train Acc: 0.6464 | Train Macro-F1: 0.3141 || Val Loss: 2.6461 | Val Acc: 0.5506 | Val Macro-F1: 0.2503
[Epoch 8] Train L

In [None]:
### PT INTEGRARE IN PROIECT


SAVE_DIR = "task_3_saved_model"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model, tokenizer = load_model(SAVE_DIR, device)

text = "the trackpad works well and the screen display is great too ."
aspect = "trackpad"
opinion = "well"
va = "6.50#6.62"

category = predict_category(model, tokenizer, device, text, aspect, opinion, va)

print(f"Predicted category: {category}")

Predicted category: HARDWARE#OPERATION_PERFORMANCE


In [None]:
def generate_predictions(input_file, output_file, model, tokenizer, device, category2id, id2category):
    """
    Citește fișierul JSONL cu Quadruplets (Aspect + Opinion + VA),
    prezice Category pentru fiecare quadruplet,
    și scrie rezultatele în format JSON complet.
    """

    predictions = []

    with open(input_file, encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            text = data["Text"]
            sample_id = data.get("ID", "")

            quadruplets = []

            # Preluăm Quadruplets din fișierul de intrare (conține Aspect + Opinion + VA)
            if "Quadruplets" in data and data["Quadruplets"]:
                for quad in data["Quadruplets"]:
                    aspect = quad.get("Aspect", "NULL")
                    opinion = quad.get("Opinion", "NULL")
                    va = quad.get("VA", "5.00#5.00")

                    # Fac predicția pentru Category folosind valorile din fișier
                    predicted_category = predict_category(
                        model, tokenizer, device, text, aspect, opinion, va
                    )

                    quadruplets.append({
                        "Aspect": aspect,
                        "Category": predicted_category,
                        "Opinion": opinion,
                        "VA": va
                    })

            # Fallback: dacă avem Quadruplet (format singular)
            elif "Quadruplet" in data and data["Quadruplet"]:
                for quad in data["Quadruplet"]:
                    aspect = quad.get("Aspect", "NULL")
                    opinion = quad.get("Opinion", "NULL")
                    va = quad.get("VA", "5.00#5.00")

                    # Fac predicția pentru Category
                    predicted_category = predict_category(
                        model, tokenizer, device, text, aspect, opinion, va
                    )

                    quadruplets.append({
                        "Aspect": aspect,
                        "Category": predicted_category,
                        "Opinion": opinion,
                        "VA": va
                    })

            else:
                # Pentru cazul fără niciun Quadruplet, cream placeholder
                quadruplets.append({
                    "Aspect": "NULL",
                    "Category": "OUT_OF_SCOPE#GENERAL",
                    "Opinion": "NULL",
                    "VA": "5.00#5.00"
                })

            predictions.append({
                "ID": sample_id,
                "Quadruplet": quadruplets
            })

    # Scriu predicțiile în fișierul JSON
    with open(output_file, "w", encoding="utf-8") as f:
        for pred in predictions:
            f.write(json.dumps(pred) + "\n")

    print(f"✓ Predicții generate și salvate în: {output_file}")
    print(f"Total instanțe procesate: {len(predictions)}")
    return predictions



SAVE_DIR = "task_3_saved_model"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model, tokenizer = load_model(SAVE_DIR, device)

# Generez predicții pentru test_pred_with_va.jsonl
generate_predictions(
    input_file="test_pred_with_va.jsonl",
    output_file="submit_data_task3.json",
    model=model,
    tokenizer=tokenizer,
    device=device,
    category2id=category2id,
    id2category=id2category
)

✓ Predicții generate și salvate în: submit_data_task3.json
Total instanțe procesate: 200


[{'ID': 'lap26_asqp_dev_1',
  'Quadruplet': [{'Aspect': 'perforemce',
    'Category': 'SHIPPING#PRICE',
    'Opinion': 'great',
    'VA': '6.39#6.20'},
   {'Aspect': 'price',
    'Category': 'LAPTOP#PRICE',
    'Opinion': 'great',
    'VA': '6.39#6.20'}]},
 {'ID': 'lap26_asqp_dev_2',
  'Quadruplet': [{'Aspect': 'display',
    'Category': 'DISPLAY#OPERATION_PERFORMANCE',
    'Opinion': 'Very bright',
    'VA': '6.99#6.97'},
   {'Aspect': 'color gamut',
    'Category': 'DISPLAY#DESIGN_FEATURES',
    'Opinion': 'wide',
    'VA': '5.00#4.80'}]},
 {'ID': 'lap26_asqp_dev_3',
  'Quadruplet': [{'Aspect': 'Battery life',
    'Category': 'BATTERY#OPERATION_PERFORMANCE',
    'Opinion': 'bad',
    'VA': '3.61#6.20'}]},
 {'ID': 'lap26_asqp_dev_4',
  'Quadruplet': [{'Aspect': 'Chromebook',
    'Category': 'LAPTOP#DESIGN_FEATURES',
    'Opinion': 'very clean',
    'VA': '5.00#5.56'},
   {'Aspect': ' Chromebook',
    'Category': 'LAPTOP#DESIGN_FEATURES',
    'Opinion': 'NULL',
    'VA': '5.00#4.80'}]}