# Evaluation Script
This script was used to evaluate the results of both techniques and to compute the metrics discussed in the paper.

## Settings
The script makes the following assumptions when computing the metrics for the *must-have* and *should-have* classes:
* Each concept may be matched once (by its name or by one of its synonyms); any further term that maps to the same concept counts as a **False Positive**
* As an exception, terms marked as no-punishment in the gold & silver standard are not counted as **False Positives**
* If the class name is plural, the script also checks for the singular version. It does not do this the other way around (for example "products" -> "product", but not "product" -> "products")
* **True Positives**: predicted terms that appear either in the ground truth or in the synonyms

## Use
Note that the directory layout of the replication package differs from the layout the script expects. The `results/LLM/predictions_*.csv` files must be placed in the same folder as the script. The `ground_truth.csv`, `notpunish.csv` and `synonyms.csv` files must also be in this folder.

## Use of LLM
The script was produced using ChatGPT; the authors have reviewed it to make sure that it contains no errors or misinterpretations.

In [None]:
import pandas as pd
import os

# === USER SETTINGS ===
# Domains to evaluate; each one is looked up as "predictions_<domain>.csv".
domains = [
    "camperplus",
    "supermarket",
    "fish&chips",
    "planningpoker",
    "grocery",
    "school",
    "sports",
    "ticket",
]
base_predictions_pattern = "predictions_{}.csv"

# === LOAD BASE FILES ===
# The three reference tables are read from the current working directory.
ground_truth_all, synonyms_all, notpunish_all = (
    pd.read_csv(csv_name)
    for csv_name in ("ground_truth.csv", "synonyms.csv", "notpunish.csv")
)

# === NORMALIZE COLUMN NAMES ===
# Lower-case and trim the headers so later lookups such as ['domain'] do not
# depend on how the CSV headers were capitalised or padded.
for table in (ground_truth_all, synonyms_all, notpunish_all):
    table.columns = table.columns.str.lower().str.strip()

# === NORMALIZE STRINGS ===
def norm(s):
    """Return a canonical text form of a single cell value.

    Missing values (anything ``pd.isna`` reports as missing) become the empty
    string; every other value is converted to ``str``, lower-cased and
    stripped of surrounding whitespace.
    """
    return "" if pd.isna(s) else str(s).lower().strip()

# Apply norm() to every cell of the three reference tables.
# DataFrame.applymap is deprecated since pandas 2.1 (it was renamed to
# DataFrame.map); use the new name when this pandas version provides it and
# fall back to applymap on older versions.
_ELEMENTWISE = "map" if hasattr(pd.DataFrame, "map") else "applymap"

ground_truth_all = getattr(ground_truth_all, _ELEMENTWISE)(norm)
synonyms_all = getattr(synonyms_all, _ELEMENTWISE)(norm)
notpunish_all = getattr(notpunish_all, _ELEMENTWISE)(norm)

# === SAFE SAVE ===
def safe_save(df, path):
    """Write ``df`` to ``path`` as CSV, falling back to a sibling file if locked.

    Args:
        df: Object exposing ``to_csv(path, index=False)`` (a DataFrame here).
        path: Destination file path.

    If the first write raises ``PermissionError`` (for example because the
    file is open in another program), the data is written to the same
    location with ``_new`` inserted before the file extension instead. A
    ``PermissionError`` raised by that second write is not caught.
    """
    try:
        df.to_csv(path, index=False)
    except PermissionError:
        # Only touch the final extension: a plain str.replace(".csv", ...)
        # would also rewrite ".csv" inside directory names and would leave
        # the path unchanged for any other extension.
        root, ext = os.path.splitext(path)
        alt_path = f"{root}_new{ext}"
        print(f"⚠️ File locked: {path} — saving as {alt_path}")
        df.to_csv(alt_path, index=False)
# === COLLECT OVERALL SUMMARY ===
# One "Average" row per evaluated domain; aggregated after the loop.
overall_rows = []

# DataFrame.applymap is deprecated since pandas 2.1 (it was renamed to
# DataFrame.map); use the new name when this pandas version provides it.
_ELEMENTWISE = "map" if hasattr(pd.DataFrame, "map") else "applymap"


def safe_div(a, b):
    """Return ``a / b``, or 0 when ``b`` is zero/falsy."""
    return a / b if b else 0


def fscore(p, r, beta):
    """Return the F-beta score of precision ``p`` and recall ``r``.

    Returns 0 when both ``p`` and ``r`` are 0 (avoids a zero division).
    """
    return (1 + beta**2) * (p * r) / ((beta**2 * p) + r) if (p + r) > 0 else 0


# === MAIN LOOP ===
# For every domain: load its predictions (one column per round), build the
# lookup tables from the reference files, score each round, and write the
# annotated predictions plus a per-domain summary.
for domain in domains:
    print(f"\n=== Evaluating domain: {domain} ===")

    pred_file = base_predictions_pattern.format(domain)
    if not os.path.exists(pred_file):
        print(f"⚠️ No predictions file for '{domain}', skipping.")
        continue

    predictions = getattr(pd.read_csv(pred_file), _ELEMENTWISE)(norm)
    predictions = predictions.dropna(axis=1, how='all')
    # norm() turned missing cells into "", so drop rounds that contain no term at all
    predictions = predictions.loc[:, (predictions != "").any(axis=0)]
    print(f"→ Using {len(predictions.columns)} valid rounds for '{domain}'")

    if predictions.shape[1] == 0:
        # Without any round there is nothing to score, and concatenating an
        # empty list of annotated columns further down would raise.
        print(f"⚠️ No valid prediction rounds for '{domain}', skipping.")
        continue

    # === Filter for current domain ===
    ground_truth = ground_truth_all[ground_truth_all['domain'] == domain]
    synonyms = synonyms_all[synonyms_all['domain'] == domain]
    notpunish = notpunish_all[notpunish_all['domain'] == domain]

    if ground_truth.empty and synonyms.empty:
        print(f"⚠️ No ground truth or synonyms for '{domain}', skipping.")
        continue

    # === BUILD LOOKUPS ===
    concept_lookup = {}  # accepted term -> canonical concept
    type_lookup = {}     # canonical concept -> 'must-have' / 'should-have'
    group_lookup = {}    # term -> set of all terms in its synonym group

    # --- Ground truth base ---
    for _, row in ground_truth.iterrows():
        cls = row['class']
        singular = row['singular']
        ttype = row.get('type', 'must-have').lower()
        for term in [cls, singular]:
            if term:  # 'singular' may be empty
                concept_lookup[term] = cls
        type_lookup[cls] = ttype

    # --- Synonyms (includes Type column, cross-check with ground truth) ---
    for _, row in synonyms.iterrows():
        ttype_syn = row.get('type', 'must-have').lower()
        concept = row['concept']
        # NOTE(review): assumes the first three columns are the non-synonym
        # columns (domain / concept / type) — confirm against synonyms.csv.
        synonym_terms = [v for v in row.iloc[3:] if v]
        if concept not in synonym_terms:
            synonym_terms.append(concept)

        # Check consistency with ground truth
        gt_type = type_lookup.get(concept)
        if gt_type and gt_type != ttype_syn:
            print(f"⚠️ Type mismatch for concept '{concept}' in domain '{domain}': "
                  f"Ground truth = '{gt_type}', Synonyms file = '{ttype_syn}'. Using ground truth.")
            ttype_syn = gt_type  # enforce ground truth

        group = set(synonym_terms)
        for term in group:
            group_lookup[term] = group
            concept_lookup[term] = concept
        type_lookup[concept] = ttype_syn

    # --- Not punish lookup ---
    notpunish_lookup = {}
    for _, row in notpunish.iterrows():
        concept = row['concept']
        # NOTE(review): every non-empty cell after the first column is treated
        # as a no-punishment term — confirm the column order of notpunish.csv.
        terms = [v for v in row.iloc[1:] if v]
        if concept not in terms:
            terms.append(concept)
        for t in terms:
            notpunish_lookup[t] = concept

    # --- Accepted variants per ground-truth class (same for every round) ---
    gt_variants = []
    for _, row in ground_truth.iterrows():
        cls = row['class']
        ttype = row.get('type', 'must-have').lower()
        # Skip empty names: an empty 'singular' must never count as a variant.
        variants = {v for v in (cls, row['singular']) if v}
        for v in list(variants):
            if v in group_lookup:
                variants.update(group_lookup[v])
        gt_variants.append((cls, ttype, variants))

    # === EVALUATE ROUNDS ===
    results = []
    annotated_columns = []

    for col in predictions.columns:
        preds = predictions[col].tolist()
        # Empty cells are padding, not predictions. Keeping "" here would make
        # every class with an empty 'singular' look detected and hide its FN.
        detected_terms = {term for term in preds if term}
        status_list = []

        tp_concepts = set()
        fp_terms = set()
        nopunish_terms = set()
        used_concepts = set()

        # --- Evaluate TPs and FPs ---
        for term in preds:
            if not term:
                status_list.append("empty/na")
                continue

            canonical = concept_lookup.get(term)
            if canonical:
                canonical_type = type_lookup.get(canonical, 'must-have')
                if canonical not in used_concepts:
                    # First hit for this concept in the round: true positive
                    used_concepts.add(canonical)
                    tp_concepts.add(canonical)
                    # Use the correct type label from type_lookup
                    if canonical_type == 'must-have':
                        status_list.append(f"TP (M: {canonical})")
                    else:
                        status_list.append(f"TP (S: {canonical})")
                else:
                    # Repeated concept
                    if term in notpunish_lookup:
                        nopunish_terms.add(term)
                        status_list.append("NOPUNISH")
                    else:
                        fp_terms.add(term)
                        status_list.append("FP (redundant synonym)")
            else:
                if term in notpunish_lookup:
                    nopunish_terms.add(term)
                    status_list.append("NOPUNISH")
                else:
                    fp_terms.add(term)
                    status_list.append("FP (unknown term)")

        # --- Compute FNs ---
        # A class is missed when none of its accepted variants was predicted.
        fn_m, fn_s = set(), set()
        for cls, ttype, variants in gt_variants:
            if detected_terms.isdisjoint(variants):
                if ttype == 'must-have':
                    fn_m.add(cls)
                else:
                    fn_s.add(cls)

        # --- Metrics ---
        tp_m = sum(1 for c in tp_concepts if type_lookup.get(c) == 'must-have')
        tp_s = sum(1 for c in tp_concepts if type_lookup.get(c) == 'should-have')
        fp_count = len(fp_terms)
        nopunish_count = len(nopunish_terms)
        fn_m_count, fn_s_count = len(fn_m), len(fn_s)
        tp_total, fn_total = tp_m + tp_s, fn_m_count + fn_s_count

        # overall
        precision = safe_div(tp_total, tp_total + fp_count)
        recall = safe_div(tp_total, tp_total + fn_total)

        # must-have (the full FP count is used; FPs carry no type)
        precision_m = safe_div(tp_m, tp_m + fp_count)
        recall_m = safe_div(tp_m, tp_m + fn_m_count)

        # should-have (the full FP count is used; FPs carry no type)
        precision_s = safe_div(tp_s, tp_s + fp_count)
        recall_s = safe_div(tp_s, tp_s + fn_s_count)

        # --- Add results ---
        results.append({
            "Domain": domain,
            "Round": col,
            "TP (M)": tp_m,
            "TP (S)": tp_s,
            "FP": fp_count,
            "FN (M)": fn_m_count,
            "FN (S)": fn_s_count,
            "NOPUNISH": nopunish_count,
            "PRECISION (overall)": precision,
            "RECALL (overall)": recall,
            "F0.5 (overall)": fscore(precision, recall, 0.5),
            "F1 (overall)": fscore(precision, recall, 1),
            "F2 (overall)": fscore(precision, recall, 2),
            "PRECISION (M)": precision_m,
            "RECALL (M)": recall_m,
            "F0.5 (M)": fscore(precision_m, recall_m, 0.5),
            "F1 (M)": fscore(precision_m, recall_m, 1),
            "F2 (M)": fscore(precision_m, recall_m, 2),
            "PRECISION (S)": precision_s,
            "RECALL (S)": recall_s,
            "F0.5 (S)": fscore(precision_s, recall_s, 0.5),
            "F1 (S)": fscore(precision_s, recall_s, 1),
            "F2 (S)": fscore(precision_s, recall_s, 2),
            "Missed_Classes": ", ".join(sorted(fn_m)),
            "Missed_ShouldHave": ", ".join(sorted(fn_s))
        })

        # Each round contributes two columns: the terms and their verdicts.
        annotated_columns.append(pd.Series(preds, name=col))
        annotated_columns.append(pd.Series(status_list, name=f"{col}_status"))

    # === DOMAIN SUMMARY (with Average row) ===
    summary = pd.DataFrame(results)
    # Numeric columns are averaged over the rounds; text columns are left blank.
    avg_row = {col: summary[col].mean() if pd.api.types.is_numeric_dtype(summary[col]) else "" for col in summary.columns}
    avg_row["Domain"] = domain
    avg_row["Round"] = "Average"
    summary = pd.concat([summary, pd.DataFrame([avg_row])], ignore_index=True)

    # === SAVE DOMAIN FILES ===
    annotated_predictions = pd.concat(annotated_columns, axis=1)
    safe_save(annotated_predictions, f"annotated_predictions_{domain}.csv")
    safe_save(summary, f"evaluation_summary_{domain}.csv")

    # === ADD DOMAIN AVG TO OVERALL SUMMARY ===
    overall_rows.append(avg_row)

# === BUILD OVERALL DOMAIN SUMMARY ===
# Stack the per-domain "Average" rows, append one row holding the mean of
# every numeric column across domains, and write the result to disk.
if not overall_rows:
    print("\n⚠️ No domain summaries to aggregate.")
else:
    overall_summary = pd.DataFrame(overall_rows)
    avg_row_all = {
        name: (
            overall_summary[name].mean()
            if pd.api.types.is_numeric_dtype(overall_summary[name])
            else ""
        )
        for name in overall_summary.columns
    }
    avg_row_all["Domain"] = "OVERALL_AVERAGE"
    overall_summary = pd.concat(
        [overall_summary, pd.DataFrame([avg_row_all])],
        ignore_index=True,
    )
    safe_save(overall_summary, "overall_domain_summary.csv")
    print("\n✅ overall_domain_summary.csv created.")


  ground_truth_all = ground_truth_all.applymap(norm)
  synonyms_all = synonyms_all.applymap(norm)
  notpunish_all = notpunish_all.applymap(norm)
  predictions = pd.read_csv(pred_file).applymap(norm)
  predictions = pd.read_csv(pred_file).applymap(norm)
  predictions = pd.read_csv(pred_file).applymap(norm)



=== Evaluating domain: camperplus ===
→ Using 10 valid rounds for 'camperplus'

=== Evaluating domain: supermarket ===
→ Using 10 valid rounds for 'supermarket'

=== Evaluating domain: fish&chips ===
→ Using 10 valid rounds for 'fish&chips'

=== Evaluating domain: planningpoker ===
→ Using 10 valid rounds for 'planningpoker'

=== Evaluating domain: grocery ===
→ Using 10 valid rounds for 'grocery'

=== Evaluating domain: school ===
→ Using 10 valid rounds for 'school'


  predictions = pd.read_csv(pred_file).applymap(norm)
  predictions = pd.read_csv(pred_file).applymap(norm)
  predictions = pd.read_csv(pred_file).applymap(norm)



=== Evaluating domain: sports ===
→ Using 10 valid rounds for 'sports'

=== Evaluating domain: ticket ===
→ Using 10 valid rounds for 'ticket'

✅ overall_domain_summary.csv created.


  predictions = pd.read_csv(pred_file).applymap(norm)
  predictions = pd.read_csv(pred_file).applymap(norm)
