In [None]:
import os
import sys

PROJECT_MARKERS = ("src", "data", "prompts", "results")

def find_project_root(start_path):
    current = os.path.abspath(start_path)

    while True:
        if all(os.path.isdir(os.path.join(current, m)) for m in PROJECT_MARKERS):
            return current

        parent = os.path.dirname(current)
        if parent == current:
            raise RuntimeError("Project root not found")

        current = parent


# ---- execution directory (cwd) ----
cwd = os.getcwd()

# ---- safe starting point ----
try:
    start_path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    start_path = cwd


# ---- resolve canonical paths ----
project_root = find_project_root(start_path)

# âœ… THIS IS THE IMPORTANT PART
if project_root not in sys.path:
    sys.path.insert(0, project_root)

src_root     = os.path.join(project_root, "src", "daniel", "gemini")
data_root    = os.path.join(project_root, "data", "MAMS-ACSA", "raw", "data_jsonl", "annotated")
schemas_root = os.path.join(project_root, "data", "MAMS-ACSA", "raw", "data_jsonl", "schema")
prompts_root = os.path.join(project_root, "prompts", "daniel", "gemini")
utils_root   = os.path.join(project_root, "utils")
results_root = os.path.join(project_root, "results", "daniel", "gemini")

print(
    f"ðŸ“‚ cwd          : {cwd}\n"
    f"ðŸ“‚ Project root : {project_root}\n"
    f"ðŸ“‚ Source root  : {src_root}\n"
    f"ðŸ“‚ Data root    : {data_root}\n"
    f"ðŸ“‚ Prompts root : {prompts_root}\n"
    f"ðŸ“‚ Utils root   : {utils_root}\n"
    f"ðŸ“‚ Results root : {results_root}"
)

ðŸ“‚ Project root: /Users/hd/Desktop/EMOTION-PRED
ðŸ“‚ Source root: /Users/hd/Desktop/EMOTION-PRED/src
ðŸ“‚ Results root: /Users/hd/Desktop/EMOTION-PRED/src/results
ðŸ“‚ Data root: /Users/hd/Desktop/EMOTION-PRED/src/data/MAMS-ACSA/raw/data_jsonl
Using dataset directory: /Users/hd/Desktop/EMOTION-PRED/src/data/MAMS-ACSA/raw/data_jsonl


In [None]:
## For EMOTION-ONLY evaluation

import json
import os
import pandas as pd
from sklearn.metrics import f1_score, classification_report
from collections import Counter

# -----------------------------
# Paths
# -----------------------------

#GOLD_PATH = os.path.join(data_root, "daniel_50.jsonl")
#PRED_PATH = os.path.join(results_root, "gemini-flash", "gemini_emotion_only_daniel_50.jsonl")

GOLD_PATH = os.path.join(data_root, "02_iteration_cleaned_300.jsonl")

# ---- derive input identifier ----
input_name = os.path.splitext(os.path.basename(GOLD_PATH))[0]

#PRED_PATH = os.path.join(results_root, "gemini-flash", "{input_name}__gemini_emotion_only.jsonl")
PRED_PATH = os.path.join(results_root, "claude", "claude_output.jsonl")
# -----------------------------
# Load JSONL
# -----------------------------
def load_jsonl(path):
    return [json.loads(line) for line in open(path, "r", encoding="utf-8")]

gold = load_jsonl(GOLD_PATH)
pred = load_jsonl(PRED_PATH)

# Map gold/pred by input text
gold_map = {row["input"]: row["output"] for row in gold}
pred_map = {row["input"]: row["output"] for row in pred}

print(f"Loaded gold rows: {len(gold_map)}")
print(f"Loaded pred rows: {len(pred_map)}")

# -----------------------------
# EMOTION-ONLY ALIGNMENT
# -----------------------------
gold_emotions = []
pred_emotions = []

for text, gold_list in gold_map.items():

    pred_list = pred_map.get(text, [])

    # In emotion-only mode:
    # - Number of triples must be identical (same aspects + polarities)
    # - We ignore gold emotions and only evaluate the predicted ones

    if len(pred_list) != len(gold_list):
        raise ValueError(f"Triple count mismatch for: {text[:50]}...")

    for gold_triple, pred_triple in zip(gold_list, pred_list):

        # Gold label = gold triple's emotion
        gold_emotions.append(gold_triple["emotion"])

        # Pred emotion
        pred_emotions.append(pred_triple["emotion"])


# -----------------------------
# Compute Metrics (Emotion Only)
# -----------------------------
emotion_macro = f1_score(gold_emotions, pred_emotions, average="macro", zero_division=0)
emotion_micro = f1_score(gold_emotions, pred_emotions, average="micro", zero_division=0)

# Full match accuracy = correct emotion only
correct = sum(1 for g, p in zip(gold_emotions, pred_emotions) if g == p)
full_match_acc = correct / len(gold_emotions)

# -----------------------------
# PRINT REPORT
# -----------------------------
print("\n==============================")
print("EMOTION-ONLY EVALUATION REPORT")
print("==============================\n")

print("Emotion Macro-F1 :", round(emotion_macro, 4))
print("Emotion Micro-F1 :", round(emotion_micro, 4))
print("Exact Emotion Accuracy:", round(full_match_acc, 4))

print("\n==============================")
print("DETAILED EMOTION REPORT")
print("==============================\n")
print(classification_report(gold_emotions, pred_emotions, zero_division=0))

# Save CSV
df = pd.DataFrame({
    "Emotion Macro F1": [emotion_macro],
    "Emotion Micro F1": [emotion_micro],
    "Emotion Accuracy": [full_match_acc]
})
df.to_csv(f"emotion_eval_results_{GOLD_PATH.split('/')[-1].replace('.jsonl', '')}.csv", index=False)

print(f"\nSaved â†’ emotion_eval_results_{GOLD_PATH.split('/')[-1].replace('.jsonl', '')}.csv")

Loaded gold rows: 300
Loaded pred rows: 300

EMOTION-ONLY EVALUATION REPORT

Emotion Macro-F1 : 0.3731
Emotion Micro-F1 : 0.5799
Exact Emotion Accuracy: 0.5799

DETAILED EMOTION REPORT

                precision    recall  f1-score   support

    Admiration       0.50      0.25      0.33        52
     Annoyance       0.56      0.49      0.52        82
  Appreciation       0.00      0.00      0.00         0
      Approval       0.74      0.59      0.66        44
     Confusion       1.00      0.56      0.71         9
Disappointment       0.56      0.39      0.46        70
   Disapproval       0.52      0.46      0.49        24
       Disgust       0.00      0.00      0.00         0
    Excitement       0.12      0.50      0.20         2
          Fear       0.00      0.00      0.00         1
   Frustration       0.33      0.70      0.45        37
     Gratitude       0.58      0.64      0.61        11
     Impressed       0.16      0.33      0.22        18
   Indifferent       0.99    

In [42]:
## Aspect-Polarity-Emotion evaluation

import json
import os
import pandas as pd
from sklearn.metrics import f1_score, classification_report
from collections import Counter

# -----------------------------
# Paths
# -----------------------------

#GOLD_PATH = os.path.join(data_root, "daniel_50.jsonl")
#PRED_PATH = os.path.join(results_root, "gemini-flash", "gemini_annotated_aspect_polarity_emotions_daniel_50.jsonl")

GOLD_PATH = os.path.join(data_root, "cleaned_300.jsonl")
PRED_PATH = os.path.join(results_root, "gemini-flash", "gemini_annotated_aspect_polarity_emotions_300.jsonl")
# -----------------------------
# Load JSONL
# -----------------------------
def load_jsonl(path):
    return [json.loads(line) for line in open(path, "r", encoding="utf-8")]

gold = load_jsonl(GOLD_PATH)
pred = load_jsonl(PRED_PATH)

# Map gold/pred by input text
gold_map = {row["input"]: row["output"] for row in gold}
pred_map = {row["input"]: row["output"] for row in pred}

print(f"Loaded gold rows: {len(gold_map)}")
print(f"Loaded pred rows: {len(pred_map)}")


# -----------------------------------
# Duplicate detection helper
# -----------------------------------
def find_duplicates(rows, name):
    counter = Counter([row["input"] for row in rows])
    dups = [inp for inp, c in counter.items() if c > 1]

    print(f"\n==============================")
    print(f"DUPLICATE CHECK â†’ {name}")
    print("==============================")

    if not dups:
        print("No duplicates found.\n")
        return {}

    print(f"Found {len(dups)} duplicate input texts:")
    for d in dups:
        print(f" - {d!r}  (x{counter[d]})")
    print()

    # return mapping of input â†’ list of rows
    dup_map = {d: [r for r in rows if r["input"] == d] for d in dups}
    return dup_map


from collections import defaultdict

def find_duplicate_indices_1_based(rows, name):
    index_map = defaultdict(list)

    for i, row in enumerate(rows):
        index_map[row["input"]].append(i + 1)   # convert to 1-based

    duplicates = {text: idxs for text, idxs in index_map.items() if len(idxs) > 1}

    print(f"\n==============================")
    print(f"DUPLICATE ROW INDICES â†’ {name}")
    print("==============================")

    if not duplicates:
        print("No duplicates found.\n")
        return duplicates

    for text, idxs in duplicates.items():
        print(f"\nTEXT: {text!r}")
        print(f"1-based row numbers: {idxs}")

    print()
    return duplicates


gold_dup_indices = find_duplicate_indices_1_based(gold, "GOLD FILE")
pred_dup_indices = find_duplicate_indices_1_based(pred, "PRED FILE")

# -----------------------------------
# Duplicates appearing in BOTH (with indices)
# -----------------------------------
print("\n==============================")
print("DUPLICATES IN BOTH FILES (WITH ROW NUMBERS)")
print("==============================")

common = set(gold_dup_indices.keys()) & set(pred_dup_indices.keys())

if not common:
    print("No overlapping duplicates.\n")
else:
    for text in common:
        print(f"\nTEXT: {text!r}")
        print(f" â†’ GOLD rows: {gold_dup_indices[text]}")
        print(f" â†’ PRED rows: {pred_dup_indices[text]}")
        print("\n-----------------------------------")

# -----------------------------
# Triple alignment
# -----------------------------
def align_triples(gold_list, pred_list):
    """
    Ensures aligned triples for scoring:
    - If pred has MORE triples â†’ truncate
    - If pred has FEWER triples â†’ pad with ('none','none','none')
    """
    g = [(t["aspect"], t["polarity"], t["emotion"]) for t in gold_list]
    p = [(t["aspect"], t["polarity"], t["emotion"]) for t in pred_list]

    gold_n = len(g)
    pred_n = len(p)

    # truncate hallucinations
    if pred_n > gold_n:
        p = p[:gold_n]

    # pad missing predictions
    if pred_n < gold_n:
        pad = [("none", "none", "none")] * (gold_n - pred_n)
        p.extend(pad)

    return g, p


# -----------------------------
# Collect aligned labels
# -----------------------------
all_gold_as, all_pred_as = [], []
all_gold_pol, all_pred_pol = [], []
all_gold_emo, all_pred_emo = [], []

gold_triples_full = []
pred_triples_full = []

for text, gold_list in gold_map.items():

    pred_list = pred_map.get(text, [])

    g_aligned, p_aligned = align_triples(gold_list, pred_list)

    for (ga, gp, ge), (pa, pp, pe) in zip(g_aligned, p_aligned):
        all_gold_as.append(ga)
        all_gold_pol.append(gp)
        all_gold_emo.append(ge)

        all_pred_as.append(pa)
        all_pred_pol.append(pp)
        all_pred_emo.append(pe)

        gold_triples_full.append((ga, gp, ge))
        pred_triples_full.append((pa, pp, pe))


# -----------------------------
# Compute metrics
# -----------------------------
aspect_f1 = f1_score(all_gold_as, all_pred_as, average="macro", zero_division=0)
polarity_f1 = f1_score(all_gold_pol, all_pred_pol, average="macro", zero_division=0)
emotion_macro = f1_score(all_gold_emo, all_pred_emo, average="macro", zero_division=0)
emotion_micro = f1_score(all_gold_emo, all_pred_emo, average="micro", zero_division=0)

# Full ABSA accuracy (exact triple match)
match = sum(1 for g, p in zip(gold_triples_full, pred_triples_full) if g == p)
exact_acc = match / len(gold_triples_full)


# -----------------------------
# Pretty PRINT
# -----------------------------
print("\n==============================")
print("FINAL EVALUATION REPORT")
print("==============================\n")

print("ASPECT CLASSIFICATION")
print("------------------------------")
print(f"Macro F1: {aspect_f1:.4f}\n")

print("POLARITY CLASSIFICATION")
print("------------------------------")
print(f"Macro F1: {polarity_f1:.4f}\n")

print("EMOTION CLASSIFICATION")
print("------------------------------")
print(f"Emotion Macro-F1 : {emotion_macro:.4f}")
print(f"Emotion Micro-F1 : {emotion_micro:.4f}\n")

print("FULL ABSA TRIPLE MATCH")
print("------------------------------")
print(f"Full ABSA Accuracy: {exact_acc:.4f}\n")

print("==============================")
print("DETAILED CLASSIFICATION REPORT")
print("==============================\n")
print(classification_report(all_gold_emo, all_pred_emo, zero_division=0))


# Save optional CSV
df = pd.DataFrame({
    "Aspect F1":      [aspect_f1],
    "Polarity F1":    [polarity_f1],
    "Emotion F1":     [emotion_macro],
    "Emotion Micro":  [emotion_micro],
    "Full ABSA Acc":  [exact_acc]
})
df.to_csv(f"eval_results_absa_{GOLD_PATH.split('/')[-1].replace('.jsonl', '')}.csv", index=False)

print(f"\nSaved â†’ eval_results_{GOLD_PATH.split('/')[-1].replace('.jsonl', '')}.csv")

Loaded gold rows: 300
Loaded pred rows: 300

DUPLICATE ROW INDICES â†’ GOLD FILE

TEXT: 'We were able to reserve a spot at the chef tasting bar with Morimoto who actually called in sick that night, but we were still charged full price.'
1-based row numbers: [1, 202]

TEXT: 'The bar was so crowded there was no point in ordering a drink, and when we were finally seated we got no service at all for 20 minutes.'
1-based row numbers: [152, 302]


DUPLICATE ROW INDICES â†’ PRED FILE

TEXT: 'We were able to reserve a spot at the chef tasting bar with Morimoto who actually called in sick that night, but we were still charged full price.'
1-based row numbers: [1, 202]

TEXT: 'The bar was so crowded there was no point in ordering a drink, and when we were finally seated we got no service at all for 20 minutes.'
1-based row numbers: [152, 302]


DUPLICATES IN BOTH FILES (WITH ROW NUMBERS)

TEXT: 'The bar was so crowded there was no point in ordering a drink, and when we were finally seated we got