In [11]:

import pandas as pd
import os

# === USER SETTINGS ===
domains = ["camperplus", "supermarket", "fish&chips", "planningpoker", "grocery", "school", "sports", "ticket"]  # update this list with your actual domains
base_predictions_pattern = "predictions_{}.csv"  # naming pattern for predictions files

# === STEP 1: LOAD BASE FILES ===
ground_truth_all = pd.read_csv("ground_truth.csv")  # columns: domain, Class, Singular, Type
synonyms_all = pd.read_csv("synonyms.csv")          # columns: domain, concept, synonym_1, ...
notpunish_all = pd.read_csv("notpunish.csv")        # columns: domain, concept, synonym_1, ...

# === STEP 2: NORMALIZE STRINGS ===
def norm(s):
    if pd.isna(s):
        return ""
    return str(s).lower().strip()

ground_truth_all = ground_truth_all.applymap(norm)
synonyms_all = synonyms_all.applymap(lambda x: norm(x) if pd.notna(x) else "")
notpunish_all = notpunish_all.applymap(lambda x: norm(x) if pd.notna(x) else "")

# === STEP 3: LOOP OVER DOMAINS ===
for domain in domains:
    print(f"\n=== Evaluating domain: {domain} ===")

    # --- Load predictions file ---
    pred_file = base_predictions_pattern.format(domain)
    if not os.path.exists(pred_file):
        print(f"⚠️ Predictions file not found for domain '{domain}' — skipping.")
        continue
    predictions = pd.read_csv(pred_file).applymap(norm)

    # 🧹 Remove empty or all-NaN columns (fixes the 27-row issue)
    predictions = predictions.dropna(axis=1, how='all')
    predictions = predictions.loc[:, (predictions != "").any(axis=0)]
    print(f"→ Using {len(predictions.columns)} valid rounds for domain '{domain}'")

    # --- Filter for current domain ---
    ground_truth = ground_truth_all[ground_truth_all['domain'] == domain]
    synonyms = synonyms_all[synonyms_all['domain'] == domain]
    notpunish = notpunish_all[notpunish_all['domain'] == domain]

    if ground_truth.empty:
        print(f"⚠️ No ground truth found for domain '{domain}', skipping.")
        continue

    # === STEP 4: BUILD SYNONYM GROUPS ===
    synonym_groups = []
    group_lookup = {}
    concept_lookup = {}

    for _, row in synonyms.iterrows():
        concept = row['concept']
        terms = [v for v in list(row[1:]) if v]
        if concept not in terms:
            terms.append(concept)
        group = frozenset(terms)
        synonym_groups.append((concept, group))
        for t in group:
            group_lookup[t] = group
            concept_lookup[t] = concept

    # === STEP 5: BUILD NOTPUNISH LOOKUP ===
    notpunish_lookup = {}
    for _, row in notpunish.iterrows():
        concept = row['concept']
        terms = [v for v in list(row[1:]) if v]
        if concept not in terms:
            terms.append(concept)
        for t in terms:
            notpunish_lookup[t] = concept

    # === STEP 6: BUILD GROUND TRUTH GROUPS ===
    gt_groups = []
    gt_lookup = {}
    type_lookup = {}
    concepts_gt = set()

    for _, row in ground_truth.iterrows():
        cls = row['Class']
        singular = row['Singular']
        type_value = row.get('Type', 'Must-have').lower()
        concept = cls
        group = frozenset([cls, singular])
        gt_groups.append((concept, group))
        type_lookup[concept] = type_value
        concepts_gt.add(concept)
        for t in group:
            gt_lookup[t] = group
            concept_lookup[t] = concept

    # Add synonym groups as valid GT concepts
    for concept, group in synonym_groups:
        gt_groups.append((concept, group))
        concepts_gt.add(concept)
        for t in group:
            gt_lookup[t] = group
            concept_lookup[t] = concept

    # === STEP 7: EVALUATE ROUNDS ===
    results = []
    annotated_columns = []

    for col in predictions.columns:
        preds_list = predictions[col].tolist()
        tp_concepts = set()
        fp_terms = set()
        nopunish_terms = set()
        used_concepts = set()
        detected_terms = set(preds_list)
        status_list = []

        # --- Evaluate predictions ---
        for term in preds_list:
            if not term:
                status_list.append("empty/na")
                continue

            concept = concept_lookup.get(term)
            group = gt_lookup.get(term)

            if concept and group:
                if concept not in used_concepts:
                    tp_concepts.add(concept)
                    used_concepts.add(concept)
                    status_list.append(f"TP (concept: {concept})")
                else:
                    if term in notpunish_lookup:
                        nopunish_terms.add(term)
                        status_list.append(f"NOPUNISH (redundant synonym of {concept})")
                    else:
                        fp_terms.add(term)
                        status_list.append(f"FP (redundant synonym of {concept})")
            else:
                if term in notpunish_lookup:
                    nopunish_terms.add(term)
                    status_list.append("NOPUNISH (not in GT but allowed)")
                else:
                    fp_terms.add(term)
                    status_list.append("FP (not in synonyms or GT)")

        # --- Compute FN & SHOULDHAVE ---
        fn_concepts = set()
        shouldhave_concepts = set()

        for _, row in ground_truth.iterrows():
            cls = row['Class']
            singular = row['Singular']
            concept = cls

            variants = {cls, singular}
            for v in list(variants):
                if v in group_lookup:
                    variants.update(group_lookup[v])

            # check if all variants are missed
            if not any(v in detected_terms for v in variants):
                type_value = type_lookup.get(concept, 'must-have')
                if type_value == 'must-have':
                    fn_concepts.add(concept)
                else:
                    shouldhave_concepts.add(concept)

        # --- Metrics ---
        tp_count = len(tp_concepts)
        fp_count = len(fp_terms)
        fn_count = len(fn_concepts)

        precision = tp_count / (tp_count + fp_count) if (tp_count + fp_count) > 0 else 0
        recall = tp_count / (tp_count + fn_count) if (tp_count + fn_count) > 0 else 0

        def fscore(beta):
            return (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall) if (precision + recall) > 0 else 0

        results.append({
            "Domain": domain,
            "Round": col,
            "True_Positive": tp_count,
            "False_Positive": fp_count,
            "False_Negative": fn_count,
            "SHOULDHAVE": len(shouldhave_concepts),
            "NOPUNISH": len(nopunish_terms),
            "Precision": round(precision, 4),
            "Recall": round(recall, 4),
            "F0.5": round(fscore(0.5), 4),
            "F1": round(fscore(1), 4),
            "F2": round(fscore(2), 4),
            "Missed_Classes": ", ".join(sorted(fn_concepts)),
            "Missed_ShouldHave": ", ".join(sorted(shouldhave_concepts))
        })

        annotated_columns.append(pd.Series(preds_list, name=col))
        annotated_columns.append(pd.Series(status_list, name=f"{col}_status"))

    # === STEP 8: ADD AVERAGE ROW ===
    summary = pd.DataFrame(results)

    avg_row = {
        "Domain": domain,
        "Round": "Average",
        "True_Positive": summary["True_Positive"].mean(),
        "False_Positive": summary["False_Positive"].mean(),
        "False_Negative": summary["False_Negative"].mean(),
        "SHOULDHAVE": summary["SHOULDHAVE"].mean(),
        "NOPUNISH": summary["NOPUNISH"].mean(),
        "Precision": summary["Precision"].mean(),
        "Recall": summary["Recall"].mean(),
        "F0.5": summary["F0.5"].mean(),
        "F1": summary["F1"].mean(),
        "F2": summary["F2"].mean(),
        "Missed_Classes": "",
        "Missed_ShouldHave": ""
    }
    summary = pd.concat([summary, pd.DataFrame([avg_row])], ignore_index=True)

    # === STEP 9: SAVE OUTPUT FILES ===
    annotated_predictions = pd.concat(annotated_columns, axis=1)
    annotated_predictions.to_csv(f"annotated_predictions_{domain}.csv", index=False)
    summary.to_csv(f"evaluation_summary_{domain}.csv", index=False)

    print(f"✅ Done: {domain} → evaluation_summary_{domain}.csv (with 11th row average)")




=== Evaluating domain: camperplus ===
→ Using 10 valid rounds for domain 'camperplus'
✅ Done: camperplus → evaluation_summary_camperplus.csv (with 11th row average)

=== Evaluating domain: supermarket ===
⚠️ Predictions file not found for domain 'supermarket' — skipping.

=== Evaluating domain: fish&chips ===
⚠️ Predictions file not found for domain 'fish&chips' — skipping.

=== Evaluating domain: planningpoker ===
→ Using 10 valid rounds for domain 'planningpoker'
✅ Done: planningpoker → evaluation_summary_planningpoker.csv (with 11th row average)

=== Evaluating domain: grocery ===
⚠️ Predictions file not found for domain 'grocery' — skipping.

=== Evaluating domain: school ===
⚠️ Predictions file not found for domain 'school' — skipping.

=== Evaluating domain: sports ===
⚠️ Predictions file not found for domain 'sports' — skipping.

=== Evaluating domain: ticket ===
⚠️ Predictions file not found for domain 'ticket' — skipping.


  ground_truth_all = ground_truth_all.applymap(norm)
  synonyms_all = synonyms_all.applymap(lambda x: norm(x) if pd.notna(x) else "")
  notpunish_all = notpunish_all.applymap(lambda x: norm(x) if pd.notna(x) else "")
  predictions = pd.read_csv(pred_file).applymap(norm)
  predictions = pd.read_csv(pred_file).applymap(norm)
