In [2]:
import pandas as pd
import numpy as np

# Compare model ratings against ground-truth ratings on four measures and
# report exact-match accuracy, both per measure and for all four at once.

# Load the comparison table. The file is expected to already be aligned
# 1:1 by code+question, so no joining happens here.
aligned_path = "mistral_vs_ground_truth_aligned.csv"
df = pd.read_csv(aligned_path)

# The four measures, plus the column names derived from them.
MEASURES = ["accuracy", "completeness", "relevance", "clarity"]
model_cols = [f"{m}_model" for m in MEASURES]
gt_cols = [f"{m}_gt" for m in MEASURES]
match_cols = [f"match_{m}" for m in MEASURES]

# Coerce every score column to a numeric dtype; values that cannot be
# parsed become NaN instead of raising.
df = df.assign(
    **{col: pd.to_numeric(df[col], errors="coerce") for col in model_cols + gt_cols}
)

# One boolean flag per measure: True only when model and ground truth are
# exactly equal. A NaN on either side compares unequal, so it yields False.
for match_col, model_col, gt_col in zip(match_cols, model_cols, gt_cols):
    df[match_col] = df[model_col] == df[gt_col]

# Rows usable for the strict metric: none of the four GT values is missing.
gt_complete_mask = ~df[gt_cols].isna().any(axis=1)

# Strict match: GT is complete AND every per-measure flag is True.
all_measures_match = df[match_cols].all(axis=1)
df["match_all_4"] = gt_complete_mask & all_measures_match

# --- Metrics ---
# Per-measure accuracy, each computed only over rows that have a GT value
# for that particular measure.
per_measure_accuracy = {}
for measure, gt_col, match_col in zip(MEASURES, gt_cols, match_cols):
    has_gt = df[gt_col].notna()
    per_measure_accuracy[measure] = df.loc[has_gt, match_col].mean()

# Strict overall accuracy, restricted to rows with complete GT.
overall_all4_accuracy = df.loc[gt_complete_mask, "match_all_4"].mean()

# --- Report ---
print("Per-measure accuracy (exact match):")
for m, acc in per_measure_accuracy.items():
    print(f"  {m:>12}: {acc:.2%}")

print(f"\nOverall accuracy (ALL 4 measures match): {overall_all4_accuracy:.2%}")
print(f"Rows evaluated for overall: {gt_complete_mask.sum()} / {len(df)}")

# Count of matching measures per row (0-4), tabulated over the rows with
# complete GT only.
df["num_measures_matched"] = df[match_cols].sum(axis=1)
print("\nDistribution of matched measures (0..4):")
matched_distribution = df.loc[gt_complete_mask, "num_measures_matched"].value_counts()
print(matched_distribution.sort_index())

# Persist the row-level comparison, including all derived columns.
out_path = "codeqa_all_mistral.csv"
df.to_csv(out_path, index=False)
print("\nSaved detailed results to:", out_path)


Per-measure accuracy (exact match):
      accuracy: 41.41%
  completeness: 43.95%
     relevance: 33.18%
       clarity: 20.93%

Overall accuracy (ALL 4 measures match): 5.83%
Rows evaluated for overall: 669 / 726

Distribution of matched measures (0..4):
num_measures_matched
0    234
1     93
2    225
3     78
4     39
Name: count, dtype: int64

Saved detailed results to: codeqa_all_mistral.csv
