In [None]:
import json, os, torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
)

Mounted at /content/drive


In [None]:
# Mount Google Drive at /content/drive (Colab only); main() reads its data
# from, and saves its checkpoint to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")

In [29]:
def build_category_list():
    """Build the flat list of ``ENTITY#ATTRIBUTE`` category labels.

    Labels are emitted domain by domain (restaurant, laptop, hotel,
    out-of-scope); inside a domain every entity is paired with every
    attribute, entities varying slowest.

    Note that ``SERVICE`` and ``LOCATION`` are entities of both the
    restaurant and the hotel domain, and the two domains share the attributes
    ``GENERAL``, ``QUALITY``, ``STYLE_OPTIONS`` and ``MISCELLANEOUS``, so
    eight label strings occur twice in the returned list.

    Returns:
        list[str]: 288 label strings (280 of them distinct).
    """
    # One (entities, attributes) pair per domain, in emission order.
    domain_vocab = (
        # Restaurant
        (
            ("RESTAURANT", "FOOD", "DRINKS", "AMBIENCE", "SERVICE", "LOCATION"),
            ("GENERAL", "PRICES", "QUALITY", "STYLE_OPTIONS", "MISCELLANEOUS"),
        ),
        # Laptop
        (
            (
                "LAPTOP", "DISPLAY", "KEYBOARD", "MOUSE", "MOTHERBOARD", "CPU",
                "FANS_COOLING", "PORTS", "MEMORY", "POWER_SUPPLY",
                "OPTICAL_DRIVES", "BATTERY", "GRAPHICS", "HARD_DISK",
                "MULTIMEDIA_DEVICES", "HARDWARE", "SOFTWARE", "OS", "WARRANTY",
                "SHIPPING", "SUPPORT", "COMPANY",
            ),
            (
                "GENERAL", "PRICE", "QUALITY", "DESIGN_FEATURES",
                "OPERATION_PERFORMANCE", "USABILITY", "PORTABILITY",
                "CONNECTIVITY", "MISCELLANEOUS",
            ),
        ),
        # Hotel
        (
            (
                "HOTEL", "ROOMS", "FACILITIES", "ROOM_AMENITIES",
                "SERVICE", "LOCATION", "FOOD_DRINKS",
            ),
            (
                "GENERAL", "PRICE", "COMFORT", "CLEANLINESS",
                "QUALITY", "DESIGN_FEATURES",
                "STYLE_OPTIONS", "MISCELLANEOUS",
            ),
        ),
        # OUT_OF_SCOPE
        (
            ("OUT_OF_SCOPE",),
            ("GENERAL", "OPERATION_PERFORMANCE", "DESIGN_FEATURES", "USABILITY"),
        ),
    )

    return [
        f"{entity}#{attribute}"
        for entities, attributes in domain_vocab
        for entity in entities
        for attribute in attributes
    ]


# Label mappings
ALL_CATEGORIES = build_category_list()

# label string -> class id. A label that occurs more than once in
# ALL_CATEGORIES (SERVICE#… / LOCATION#… are emitted for both the restaurant
# and the hotel domain) maps to the id of its LAST occurrence.
category2id = {cat: i for i, cat in enumerate(ALL_CATEGORIES)}

# class id -> label string. Built straight from the list rather than by
# inverting category2id: the inversion dropped the ids of the earlier
# duplicates, so id2category had fewer entries than NUM_LABELS and a model
# output landing on one of those ids raised KeyError in predict_category.
id2category = dict(enumerate(ALL_CATEGORIES))

# Size of the classification head; kept equal to len(ALL_CATEGORIES) so class
# ids stay stable for already-saved checkpoints.
NUM_LABELS = len(ALL_CATEGORIES)

In [30]:
def build_input(text, aspect, opinion, va):
    """Serialize one (text, aspect, opinion, VA) tuple into the model prompt.

    The four fields are rendered as labelled segments separated by single
    spaces; the aspect and opinion segments end with a period.

    Args:
        text: The review sentence.
        aspect: The aspect term.
        opinion: The opinion term.
        va: The VA value, inserted as-is.

    Returns:
        str: ``"Text: <text> Aspect: <aspect>. Opinion: <opinion>. VA: <va>"``.
    """
    segments = (
        f"Text: {text}",
        f"Aspect: {aspect}.",
        f"Opinion: {opinion}.",
        f"VA: {va}",
    )
    return " ".join(segments)

In [31]:
# DATASET PyTorch

class CategoryDataset(Dataset):
    """Torch dataset of (prompt, category id) pairs read from a JSONL file.

    Each non-blank line of the file is a JSON object with a ``"Text"`` field
    and a ``"Quadruplet"`` list; every quadruplet (``"Aspect"``, ``"Opinion"``,
    ``"VA"``, ``"Category"``) becomes one sample. Prompts are built with
    ``build_input`` at load time and tokenized lazily in ``__getitem__``.

    Args:
        path: Path to the UTF-8 encoded JSONL file.
        tokenizer: Callable tokenizer (called with ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keywords).
        category2id: Mapping from category label string to integer class id.
        max_len: Length every sample is padded/truncated to.

    Raises:
        KeyError: If a quadruplet's category is not present in
            ``category2id``; the message names the file and line.
    """

    def __init__(self, path, tokenizer, category2id, max_len=128):
        self.samples = []
        self.tokenizer = tokenizer
        self.max_len = max_len

        with open(path, encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                # Blank lines (e.g. a trailing empty line or a separator)
                # are not records; skip them instead of failing in json.loads.
                if not line.strip():
                    continue

                data = json.loads(line)
                text = data["Text"]

                for t in data["Quadruplet"]:
                    input_text = build_input(
                        text,
                        t["Aspect"],
                        t["Opinion"],
                        t["VA"]
                    )

                    category = t["Category"]
                    try:
                        label = category2id[category]
                    except KeyError as exc:
                        # Same exception type as before, but say where the
                        # offending label came from.
                        raise KeyError(
                            f"Unknown category {category!r} at {path}:{line_no}"
                        ) from exc

                    self.samples.append((input_text, label))

    def __len__(self):
        """Return the number of (prompt, label) samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it as a dict of tensors.

        Returns:
            dict: ``"input_ids"`` and ``"attention_mask"`` with the leading
            batch dimension squeezed away, and ``"labels"`` as a scalar
            ``torch.long`` tensor.
        """
        text, label = self.samples[idx]

        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            # return_tensors="pt" yields a leading batch dim of 1; drop it so
            # the DataLoader can stack samples itself.
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }


In [32]:
# MODEL BERT + CLS CLASSIFICATION

def build_model(num_labels):
    """Create a BERT sequence classifier from the ``bert-base-uncased`` weights.

    Args:
        num_labels: Number of output classes for the classification head.

    Returns:
        BertForSequenceClassification: The loaded model.
    """
    checkpoint = "bert-base-uncased"
    classifier = BertForSequenceClassification.from_pretrained(
        checkpoint, num_labels=num_labels
    )
    return classifier

In [33]:
def train_model(model,
                dataloader,
                optimizer,
                device,
                epochs=10,
                patience=3,
                min_delta=1e-4):
    """Fine-tune ``model`` with early stopping on training macro-F1.

    Accuracy and macro-F1 are accumulated from the same forward passes used
    for the gradient updates (the model is in train mode), so they are
    training-set metrics, not validation metrics. The model is left with the
    weights of the last epoch that ran; the best epoch is not restored.

    Args:
        model: Model returning an object with ``loss`` and ``logits`` when
            called with ``input_ids``, ``attention_mask`` and ``labels``.
        dataloader: Iterable of batches (dicts with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors) supporting ``len``.
        optimizer: Optimizer bound to the model's parameters.
        device: Device the batch tensors are moved to.
        epochs: Maximum number of epochs.
        patience: Stop after this many consecutive epochs without improvement.
        min_delta: Minimum macro-F1 gain that counts as an improvement.

    Returns:
        float: The best macro-F1 observed over the epochs that ran.
    """
    best_macro_f1 = 0.0
    stale_epochs = 0

    for epoch_idx in range(epochs):
        model.train()

        running_loss = 0.0
        epoch_preds, epoch_targets = [], []

        for batch in dataloader:
            optimizer.zero_grad()

            targets = batch["labels"].to(device)
            result = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=targets,
            )

            batch_loss = result.loss
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()

            # Predicted class = index of the largest logit per sample.
            epoch_preds.extend(result.logits.argmax(dim=1).cpu().numpy())
            epoch_targets.extend(targets.cpu().numpy())

        # Mean of the per-batch losses.
        avg_loss = running_loss / len(dataloader)
        acc = accuracy_score(epoch_targets, epoch_preds)
        macro_f1 = f1_score(epoch_targets, epoch_preds, average="macro")

        print(
            f"[Epoch {epoch_idx+1}] "
            f"Loss: {avg_loss:.4f} | "
            f"Acc: {acc:.4f} | "
            f"Macro-F1: {macro_f1:.4f}"
        )

        # ---- EARLY STOPPING ----
        improved = macro_f1 > best_macro_f1 + min_delta
        if improved:
            best_macro_f1 = macro_f1
            stale_epochs = 0
        else:
            stale_epochs += 1
            print(
                f"  - No improvement for {stale_epochs}/{patience} epochs"
            )

        # Checked after every epoch, including improving ones, so that
        # patience <= 0 stops after the first epoch.
        if stale_epochs >= patience:
            print(
                f"!! Early stopping triggered at epoch {epoch_idx+1}. "
                f"Best Macro-F1: {best_macro_f1:.4f}"
            )
            break

    return best_macro_f1


In [34]:
def predict_category(model, tokenizer, device, text, aspect, opinion, va,
                     max_len=128):
    """Predict the category label for a single (text, aspect, opinion, VA).

    The prompt is built with ``build_input`` and truncated to ``max_len``
    tokens, the same default limit ``CategoryDataset`` applies during
    training and evaluation, so inference sees inputs preprocessed the same
    way as the data the model was fitted and scored on.

    Args:
        model: Sequence classifier whose output exposes ``logits``.
        tokenizer: Tokenizer matching the model.
        device: Device the encoded tensors are moved to.
        text: The review sentence.
        aspect: The aspect term.
        opinion: The opinion term.
        va: The VA value, inserted into the prompt as-is.
        max_len: Maximum number of tokens kept after truncation; defaults to
            the ``CategoryDataset`` default.

    Returns:
        str: The label looked up in the module-level ``id2category`` for the
        highest-scoring class.
    """
    model.eval()

    input_text = build_input(text, aspect, opinion, va)

    enc = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_len,
        # Single sequence: padding=True pads to the longest in the batch,
        # i.e. adds nothing here.
        padding=True
    )

    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        logits = model(**enc).logits

    pred_id = torch.argmax(logits, dim=1).item()
    return id2category[pred_id]

In [35]:
def evaluate_saved_model(
    save_dir,
    data_path,
    category2id,
    device,
    batch_size=8
):
    """Load a saved model and report loss, accuracy and macro-F1 on a file.

    Args:
        save_dir: Directory passed to ``load_model``.
        data_path: JSONL file read through ``CategoryDataset``.
        category2id: Mapping from category label to class id.
        device: Device used for the model and the batches.
        batch_size: Evaluation batch size.

    Returns:
        dict: ``{"loss", "accuracy", "macro_f1"}``, where ``loss`` is the
        mean of the per-batch losses.
    """
    # Load model & tokenizer
    model, tokenizer = load_model(save_dir, device)

    # Dataset & DataLoader (fixed order: no shuffling for evaluation)
    eval_loader = DataLoader(
        CategoryDataset(
            path=data_path,
            tokenizer=tokenizer,
            category2id=category2id,
        ),
        batch_size=batch_size,
        shuffle=False,
    )

    model.eval()

    loss_sum = 0.0
    predicted, expected = [], []

    with torch.no_grad():
        for batch in eval_loader:
            targets = batch["labels"].to(device)
            result = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=targets,
            )

            loss_sum += result.loss.item()

            predicted.extend(result.logits.argmax(dim=1).cpu().numpy())
            expected.extend(targets.cpu().numpy())

    mean_loss = loss_sum / len(eval_loader)
    accuracy = accuracy_score(expected, predicted)
    macro_f1 = f1_score(expected, predicted, average="macro")

    print("\n EVALUATE ON TEST DATA")
    print(f"Loss:      {mean_loss:.4f}")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Macro-F1:  {macro_f1:.4f}")

    return {
        "loss": mean_loss,
        "accuracy": accuracy,
        "macro_f1": macro_f1,
    }


In [36]:
def save_model(model, tokenizer, save_dir):
    """Write the model and then the tokenizer into ``save_dir``.

    The directory is created if it does not exist yet.

    Args:
        model: Object exposing ``save_pretrained(directory)``.
        tokenizer: Object exposing ``save_pretrained(directory)``.
        save_dir: Target directory.
    """
    os.makedirs(save_dir, exist_ok=True)
    for artifact in (model, tokenizer):
        artifact.save_pretrained(save_dir)

In [37]:
def load_model(save_dir, device):
    """Load a saved tokenizer and classifier, ready for inference.

    Args:
        save_dir: Directory previously written by ``save_model``.
        device: Device the model is moved to.

    Returns:
        tuple: ``(model, tokenizer)`` with the model on ``device`` and in
        eval mode.
    """
    tokenizer = BertTokenizer.from_pretrained(save_dir)
    classifier = BertForSequenceClassification.from_pretrained(save_dir)
    classifier.to(device)
    classifier.eval()
    return classifier, tokenizer

In [38]:
def main():
    """Train the category classifier, save it, then evaluate the saved copy.

    Steps: tokenize the laptop training file, fine-tune
    ``bert-base-uncased`` with AdamW (up to 30 epochs, patience 3), save the
    model and tokenizer to Drive, and reload them for evaluation.
    """
    save_dir = "/content/drive/MyDrive/nlp_proiect/modele/25_epoci"
    train_path = "/content/drive/MyDrive/nlp_proiect/nlp_data/eng_laptop_train_alltasks.jsonl"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    train_loader = DataLoader(
        CategoryDataset(
            path=train_path,
            tokenizer=tokenizer,
            category2id=category2id,
        ),
        batch_size=8,
        shuffle=True,
    )

    model = build_model(NUM_LABELS)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

    train_model(
        model=model,
        dataloader=train_loader,
        optimizer=optimizer,
        device=device,
        epochs=30,
        patience=3,
    )

    save_model(model, tokenizer, save_dir)

    # NOTE(review): this evaluates on the same file the model was trained on,
    # although evaluate_saved_model labels its output as test data.
    evaluate_saved_model(
        save_dir=save_dir,
        data_path=train_path,
        category2id=category2id,
        device=device,
    )

In [39]:
# Run the train -> save -> evaluate pipeline defined in main() when this
# cell (or the exported script) is executed directly.
if __name__ == "__main__":
    main()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 1] Loss: 3.2856 | Acc: 0.3113 | Macro-F1: 0.0247
[Epoch 2] Loss: 2.0165 | Acc: 0.5117 | Macro-F1: 0.0729
[Epoch 3] Loss: 1.5237 | Acc: 0.6139 | Macro-F1: 0.1152
[Epoch 4] Loss: 1.2098 | Acc: 0.6858 | Macro-F1: 0.1557
[Epoch 5] Loss: 0.9784 | Acc: 0.7362 | Macro-F1: 0.1951
[Epoch 6] Loss: 0.7886 | Acc: 0.7911 | Macro-F1: 0.2530
[Epoch 7] Loss: 0.6398 | Acc: 0.8320 | Macro-F1: 0.3016
[Epoch 8] Loss: 0.5201 | Acc: 0.8670 | Macro-F1: 0.3455
[Epoch 9] Loss: 0.4154 | Acc: 0.8992 | Macro-F1: 0.4174
[Epoch 10] Loss: 0.3494 | Acc: 0.9130 | Macro-F1: 0.4522
[Epoch 11] Loss: 0.2989 | Acc: 0.9293 | Macro-F1: 0.5093
[Epoch 12] Loss: 0.2444 | Acc: 0.9456 | Macro-F1: 0.5557
[Epoch 13] Loss: 0.1891 | Acc: 0.9595 | Macro-F1: 0.6249
[Epoch 14] Loss: 0.1766 | Acc: 0.9598 | Macro-F1: 0.6367
[Epoch 15] Loss: 0.1555 | Acc: 0.9667 | Macro-F1: 0.6767
[Epoch 16] Loss: 0.1429 | Acc: 0.9692 | Macro-F1: 0.7111
[Epoch 17] Loss: 0.1066 | Acc: 0.9777 | Macro-F1: 0.7248
[Epoch 18] Loss: 0.1191 | Acc: 0.9709 | 

In [None]:
### FOR INTEGRATION INTO THE PROJECT
# Minimal inference example: load a saved checkpoint and classify one
# (text, aspect, opinion, VA) tuple.

# NOTE(review): relative directory, whereas main() saves under a Drive path;
# confirm the checkpoint has been copied here.
SAVE_DIR = "task_3_saved_model"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load_model returns the model already moved to `device` and set to eval mode.
model, tokenizer = load_model(SAVE_DIR, device)

# Sample input; the fields are combined into one prompt by build_input.
text = "the trackpad works well and the screen display is great too ."
aspect = "trackpad"
opinion = "well"
# presumably "valence#arousal", matching the dataset's "VA" field; TODO confirm
va = "6.50#6.62"

category = predict_category(model, tokenizer, device, text, aspect, opinion, va)

print(f"Predicted category: {category}")