In [None]:
import os
import sys

PROJECT_MARKERS = ("src", "data", "prompts", "results")

def find_project_root(start_path, markers=PROJECT_MARKERS):
    """Return the nearest directory at or above ``start_path`` that holds every marker.

    The search begins at the absolute form of ``start_path`` and climbs one
    parent at a time. A directory qualifies when each name in ``markers``
    exists inside it as a sub-directory.

    Args:
        start_path: Path (file or directory) where the upward search begins.
        markers: Sub-directory names that must all be present in the root.
            Defaults to ``PROJECT_MARKERS``.

    Returns:
        Absolute path of the first qualifying directory.

    Raises:
        ValueError: If ``markers`` is empty (every directory would qualify).
        RuntimeError: If the filesystem root is reached without a match.
    """
    markers = tuple(markers)
    if not markers:
        raise ValueError("markers must not be empty")

    start = os.path.abspath(start_path)
    current = start

    while True:
        if all(os.path.isdir(os.path.join(current, m)) for m in markers):
            return current

        parent = os.path.dirname(current)
        # dirname() of the filesystem root is the root itself: nothing left to climb.
        if parent == current:
            raise RuntimeError(
                f"Project root not found: no directory at or above {start!r} "
                f"contains all of {markers}"
            )

        current = parent


# ---- execution directory (cwd) ----
cwd = os.getcwd()

# ---- safe starting point ----
# Prefer this file's own directory; when `__file__` is undefined (the NameError
# branch) the search starts from the working directory instead.
try:
    start_path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    start_path = cwd


# ---- resolve canonical paths ----
project_root = find_project_root(start_path)

# Put the project root at the front of the import path, but only once.
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

# Shared prefix of the two dataset directories below.
_mams_jsonl_dir = os.path.join(project_root, "data", "MAMS-ACSA", "raw", "data_jsonl")

src_root     = os.path.join(project_root, "src", "daniel", "gemini")
data_root    = os.path.join(_mams_jsonl_dir, "annotated")
schemas_root = os.path.join(_mams_jsonl_dir, "schema")
prompts_root = os.path.join(project_root, "prompts", "daniel", "llama")
utils_root   = os.path.join(project_root, "utils")
results_root = os.path.join(project_root, "results", "daniel")

# Echo every resolved location so a wrong root is visible before any data is read.
print(
    f"📂 cwd          : {cwd}\n"
    f"📂 Project root : {project_root}\n"
    f"📂 Source root  : {src_root}\n"
    f"📂 Data root    : {data_root}\n"
    f"📂 Schemas root : {schemas_root}\n"
    f"📂 Prompts root : {prompts_root}\n"
    f"📂 Utils root   : {utils_root}\n"
    f"📂 Results root : {results_root}"
)

In [None]:
import json
from pathlib import Path

import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments
)

In [None]:
data_file = Path(data_root) / "train.jsonl"   # adjust filename if needed

# Parse one JSON object per line. Blank lines (for example a trailing empty line
# at the end of the file) are skipped, and a malformed line is reported together
# with the file and line number so it can be located.
rows = []
with open(data_file, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            continue
        try:
            rows.append(json.loads(line))
        except json.JSONDecodeError as exc:
            raise ValueError(f"{data_file}:{line_no}: invalid JSON ({exc})") from exc

len(rows)


In [None]:
def light_check(rows, name):
    """Assert that every record in ``rows`` has the expected input/output layout.

    Each record must be a dict with a non-blank string under ``"input"`` and a
    non-empty list under ``"output"``. Every item of that list must be a dict
    carrying non-blank string values for ``"aspect"``, ``"polarity"`` and
    ``"emotion"``. A one-line summary is printed when all records pass.

    Args:
        rows: Sized iterable of parsed records to validate.
        name: Label used as the prefix of failure messages and in the summary.

    Raises:
        AssertionError: On the first record that violates the layout.
    """
    label_fields = ("aspect", "polarity", "emotion")

    for row_idx, record in enumerate(rows):
        where = f"{name}[{row_idx}]"
        assert isinstance(record, dict), f"{where} is not a dict"

        # input text
        assert "input" in record, f"{where} missing 'input'"
        sentence = record["input"]
        assert isinstance(sentence, str), f"{where}['input'] not a string"
        assert sentence.strip(), f"{where} empty 'input'"

        # label list
        assert "output" in record, f"{where} missing 'output'"
        labels = record["output"]
        assert isinstance(labels, list), f"{where}['output'] not a list"
        assert len(labels) > 0, f"{where} empty 'output' list"

        # individual label entries
        for label_idx, label in enumerate(labels):
            item = f"{where}['output'][{label_idx}]"
            assert isinstance(label, dict), f"{item} not a dict"
            for field in label_fields:
                assert field in label, f"{item} missing '{field}'"
                value = label[field]
                assert isinstance(value, str), f"{item}['{field}'] not a string"
                assert value.strip(), f"{item} empty '{field}'"

    print(f"{name}: {len(rows)} rows passed")


# Validate the freshly loaded records before they are reshaped.
light_check(rows, name="train")


In [None]:
# Flatten to one record per (sentence, label entry): a sentence with several
# label entries contributes several records that share the same text.
records = [
    {
        "text": r["input"],
        "aspect": o["aspect"],
        "emotion": o["emotion"],
        "polarity": o["polarity"],
    }
    for r in rows
    for o in r["output"]
]

df = pd.DataFrame(records)
df.head()

In [None]:
# Fold the "mentioned_only" emotion label into "neutral".
df["emotion"] = df["emotion"].replace(to_replace="mentioned_only", value="neutral")

In [None]:
# One encoder per target, fitted on the full frame's label columns.
emotion_encoder = LabelEncoder().fit(df["emotion"])
polarity_encoder = LabelEncoder().fit(df["polarity"])

# Integer ids used as training targets.
df["emotion_id"] = emotion_encoder.transform(df["emotion"])
df["polarity_id"] = polarity_encoder.transform(df["polarity"])

# Class counts size the two classification heads later on.
num_emotions = len(emotion_encoder.classes_)
num_polarity = len(polarity_encoder.classes_)

emotion_encoder.classes_, polarity_encoder.classes_

In [None]:
# Split on distinct sentences rather than on rows, so that every row sharing a
# sentence ends up on the same side of the split.
unique_texts = df["text"].unique()

train_texts, val_texts = train_test_split(unique_texts, test_size=0.2, random_state=42)

in_train = df["text"].isin(train_texts)
in_val = df["text"].isin(val_texts)

train_df = df.loc[in_train]
val_df = df.loc[in_val]

In [None]:
# Checkpoint identifier shared by the tokenizer here and the encoder built later.
MODEL_NAME = "distilroberta-base"

# Tokenizer matching MODEL_NAME; passed to the dataset wrappers below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
class EmotionDataset(Dataset):
    """Torch dataset that tokenizes one (aspect, text) row per item.

    Args:
        df: Frame with ``text``, ``aspect``, ``emotion_id`` and ``polarity_id``
            columns. Its index is reset so items are addressed by position.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=..., padding=..., max_length=..., return_tensors="pt")``
            and returning ``input_ids`` / ``attention_mask`` with a leading
            batch dimension of 1.
        max_len: Length every sequence is truncated or padded to.
    """

    def __init__(self, df, tokenizer, max_len=256):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the tokenized input and both label ids for row ``idx``.

        Raises:
            IndexError: If ``idx`` is outside the dataset.
        """
        # Positional lookup: an out-of-range index raises IndexError (which is
        # what lets plain `for item in dataset` terminate) and negative indices
        # count from the end. Label-based `.loc` raised KeyError in both cases.
        row = self.df.iloc[idx]

        text = f"ASPECT: {row['aspect']} | TEXT: {row['text']}"

        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            # squeeze(0) drops the batch dimension added by return_tensors="pt".
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            # Explicit integer dtype: these are used as class-index targets.
            "emotion_labels": torch.tensor(int(row["emotion_id"]), dtype=torch.long),
            "polarity_labels": torch.tensor(int(row["polarity_id"]), dtype=torch.long)
        }

In [None]:
class EmotionPolarityModel(torch.nn.Module):
    """Shared encoder with two linear heads: emotion (main) and polarity (auxiliary).

    Args:
        model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
        num_emotions: Output size of the emotion head.
        num_polarity: Output size of the polarity head.
        polarity_weight: Factor applied to the polarity loss before it is added
            to the emotion loss. Defaults to 0.3.
    """

    def __init__(self, model_name, num_emotions, num_polarity, polarity_weight=0.3):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)

        hidden = self.encoder.config.hidden_size

        self.emotion_head  = torch.nn.Linear(hidden, num_emotions)
        self.polarity_head = torch.nn.Linear(hidden, num_polarity)
        self.polarity_weight = polarity_weight

    def forward(self, input_ids, attention_mask, emotion_labels=None, polarity_labels=None):
        """Run the encoder and both heads; compute a loss for whichever labels are given.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            emotion_labels: Optional class indices for the emotion head.
            polarity_labels: Optional class indices for the polarity head.

        Returns:
            Dict with ``loss``, ``emotion_logits`` and ``polarity_logits``.
            ``loss`` is ``None`` when no labels are supplied, the emotion
            cross-entropy when only emotion labels are supplied, and
            ``emotion + polarity_weight * polarity`` when both are supplied.
        """
        out = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Hidden state at sequence position 0 is the sentence representation.
        cls = out.last_hidden_state[:, 0]

        emotion_logits  = self.emotion_head(cls)
        polarity_logits = self.polarity_head(cls)

        # Each term is computed only when its labels exist, so supplying emotion
        # labels without polarity labels no longer fails inside cross_entropy.
        loss = None
        if emotion_labels is not None:
            loss = torch.nn.functional.cross_entropy(emotion_logits, emotion_labels)
        if polarity_labels is not None:
            # polarity = auxiliary
            aux = self.polarity_weight * torch.nn.functional.cross_entropy(
                polarity_logits, polarity_labels
            )
            loss = aux if loss is None else loss + aux

        return {
            "loss": loss,
            "emotion_logits": emotion_logits,
            "polarity_logits": polarity_logits
        }

In [None]:
train_ds = EmotionDataset(train_df, tokenizer)
val_ds   = EmotionDataset(val_df, tokenizer)

# The two classification heads are newly created torch.nn.Linear layers, so
# their starting weights are drawn from torch's random generator. Seed it
# first (same value as the split's random_state) so a fresh-kernel re-run
# starts from identical head weights.
torch.manual_seed(42)

model = EmotionPolarityModel(
    MODEL_NAME,
    num_emotions=num_emotions,
    num_polarity=num_polarity
)

In [None]:
def compute_metrics(eval_pred):
    """Score predictions with macro-averaged F1.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row
            is the argmax of ``logits`` along axis 1.

    Returns:
        Dict with the single key ``"macro_f1"``.
    """
    scores, gold_ids = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    macro = f1_score(gold_ids, predicted_ids, average="macro")
    return {"macro_f1": macro}

In [None]:
class EmotionTrainer(Trainer):
    """Trainer for the two-head model: trains on the joint loss, evaluates the emotion head."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the model's combined loss for one batch.

        Args:
            model: The model being trained.
            inputs: Batch dict with ``input_ids``, ``attention_mask``,
                ``emotion_labels`` and ``polarity_labels``.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: Accepted and ignored. Newer transformers
                releases pass this keyword to ``compute_loss``; without the
                parameter the call fails with a TypeError.
        """
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            emotion_labels=inputs["emotion_labels"],
            polarity_labels=inputs["polarity_labels"]
        )
        return (outputs["loss"], outputs) if return_outputs else outputs["loss"]

    def prediction_step(self, model, inputs, prediction_loss_only=False, ignore_keys=None):
        """Evaluate one batch and return ``(loss, emotion_logits, emotion_labels)``.

        All three values are torch tensors: the Trainer evaluation loop pads,
        gathers and concatenates them itself and converts to NumPy before
        calling ``compute_metrics``. When ``prediction_loss_only`` is true the
        logits and labels are returned as ``None``.
        """
        # The overridden base method is what normally moves the batch to the
        # model's device; do the same here since it is no longer called.
        inputs = self._prepare_inputs(inputs)

        with torch.no_grad():
            outputs = model(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                emotion_labels=inputs["emotion_labels"],
                polarity_labels=inputs["polarity_labels"]
            )

        # mean() keeps the loss a scalar even if several per-replica losses come back.
        loss = outputs["loss"].mean().detach()
        if prediction_loss_only:
            return loss, None, None

        logits = outputs["emotion_logits"].detach()
        labels = inputs["emotion_labels"].detach()

        return loss, logits, labels

In [None]:
# The evaluation-schedule argument is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Pick whichever
# name the installed TrainingArguments dataclass declares.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

args = TrainingArguments(
    output_dir=results_root,
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    logging_steps=50,
    report_to="none",
    **{_eval_strategy_key: "epoch"}   # evaluate at the end of every epoch
)

trainer = EmotionTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics
)

trainer.train()