In [None]:
import json
import pandas as pd

# -----------------------
# Paths (edit if needed)
# -----------------------
# Inputs: ground-truth annotations (JSON) and the student scores (CSV).
GROUND_TRUTH_JSON = "Codellama_groundtruth_thirty.json"
STUDENT_CSV = "cl_thirty.csv"
# Output: ground truth restricted to the questions found in the student file.
OUT_MIN_GT_JSON = "Codellama_groundtruth_minimized.json"

# Scored dimensions; each name is used as a column in both inputs.
MEASURES = [
    "accuracy",
    "completeness",
    "relevance",
    "clarity",
]

# -----------------------
# Helpers
# -----------------------
def norm_question(q: str) -> str:
    """Return *q* as a join key: stringified, outer whitespace removed, lower-cased.

    Non-string input is passed through ``str()`` first, so any value is accepted.
    """
    text = str(q)
    trimmed = text.strip()
    return trimmed.lower()

def get_score(obj):
    """Return the value stored under ``"score"`` in a ground-truth cell.

    Ground-truth cells are expected to look like ``{"score": 2}``. A value that
    is not a dict, or a dict without a ``"score"`` key, yields ``None``.
    """
    return obj.get("score") if isinstance(obj, dict) else None

# -----------------------
# Load files
# -----------------------
# Parse the ground-truth JSON; the handle is closed as soon as the payload
# has been read into memory.
with open(GROUND_TRUTH_JSON, encoding="utf-8") as gt_file:
    gt = json.load(gt_file)

# Hold both sources as DataFrames so they can be joined on the question text.
df_gt = pd.DataFrame(gt)
df_st = pd.read_csv(STUDENT_CSV)

# -----------------------
# Normalize join key
# -----------------------
# Give both frames the same normalized key so that differences in casing or
# surrounding whitespace in the question text do not prevent a match.
for frame in (df_gt, df_st):
    frame["question_norm"] = frame["question"].apply(norm_question)

# -----------------------
# DEDUPE to avoid many-to-many merges
# (keep first occurrence per question)
# -----------------------
# Flag every row whose normalized question has already appeared, then keep
# only the unflagged (first) rows so later merges cannot fan out.
gt_is_repeat = df_gt.duplicated("question_norm", keep="first")
st_is_repeat = df_st.duplicated("question_norm", keep="first")
df_gt_uniq = df_gt.loc[~gt_is_repeat].copy()
df_st_uniq = df_st.loc[~st_is_repeat].copy()

# -----------------------
# Extract GT scores
# -----------------------
# Unwrap each measure's ground-truth cell into a plain "<measure>_gt" column.
for measure in MEASURES:
    df_gt_uniq[f"{measure}_gt"] = df_gt_uniq[measure].apply(get_score)

# Student scores are already numeric columns: df_st_uniq[m]

# -----------------------
# Minimize ground truth to only those present in student file
# -----------------------
# Inner-join against the student's question keys: only ground-truth rows whose
# question also appears in the student file survive. The right side carries
# the key column alone, so no student columns are pulled into the result.
df_gt_min = pd.merge(
    df_gt_uniq,
    df_st_uniq[["question_norm"]],
    on="question_norm",
    how="inner"
).copy()

# Save minimized GT without ANY helper columns. Besides the join key, the
# "<measure>_gt" columns were derived for evaluation only; leaving them in
# would add fields to the saved records that the original ground truth does
# not have. errors="ignore" tolerates helper columns that are not present.
helper_cols = ["question_norm"] + [m + "_gt" for m in MEASURES]
gt_min_records = (
    df_gt_min
    .drop(columns=helper_cols, errors="ignore")
    .to_dict(orient="records")
)
with open(OUT_MIN_GT_JSON, "w", encoding="utf-8") as f:
    json.dump(gt_min_records, f, ensure_ascii=False, indent=2)

print(f"Minimized ground truth saved to: {OUT_MIN_GT_JSON}")
print(f"Minimized GT examples: {len(df_gt_min)}")

# -----------------------
# Merge for evaluation (one-to-one by question_norm)
# -----------------------
# Pair each gold score with the student score for the same question. Both
# sides were de-duplicated on the key, so a question yields at most one row.
gt_score_cols = [f"{name}_gt" for name in MEASURES]
gold_side = df_gt_uniq[["question_norm", *gt_score_cols]]
student_side = df_st_uniq[["question_norm", *MEASURES]]
df_eval = gold_side.merge(student_side, how="inner", on="question_norm")

print(f"\nAligned examples for evaluation: {len(df_eval)}\n")

# -----------------------
# Agreement accuracy: student score == gold score
# -----------------------
for m in MEASURES:
    # Coerce both sides to numbers before comparing. One stray text cell is
    # enough for read_csv to load a whole column as strings, and "2" == 2 is
    # False, so without this every row of that measure would silently count
    # as a mismatch. Values that cannot be parsed become NaN and are left
    # out of the tally instead of being scored as disagreements.
    gold = pd.to_numeric(df_eval[m + "_gt"], errors="coerce")
    student = pd.to_numeric(df_eval[m], errors="coerce")

    # Only rows with a usable score on both sides take part in the comparison.
    valid = gold.notna() & student.notna()
    matched = (gold == student) & valid

    total = int(valid.sum())
    correct = int(matched.sum())
    # Report 0.0 rather than dividing by zero when nothing was comparable.
    acc = correct / total if total else 0.0

    print(f"{m.capitalize():12s} | Compared: {total:4d} | Matched: {correct:4d} | Agreement: {acc:.4f}")
