In [2]:
import pandas as pd

# Score the model's ratings in PATH against the ground-truth ratings stored in
# the same CSV, print exact-match rates, and write the annotated frame to OUT.
PATH = "mistral_thirty_726_vs_ground_truth_726.csv"
OUT = "mistral_thirty_726_accuracy_results.csv"

MEASURES = ["accuracy", "completeness", "relevance", "clarity"]

# Column names derived once from the measure list.
model_cols = [f"{measure}_model" for measure in MEASURES]
gt_cols = [f"{measure}_gt" for measure in MEASURES]
match_cols = [f"match_{measure}" for measure in MEASURES]

df = pd.read_csv(PATH)

# Coerce every score column to numeric so that "4" and 4.0 compare equal;
# anything that cannot be parsed becomes NaN.
df = df.assign(
    **{
        col: pd.to_numeric(df[col], errors="coerce")
        for col in model_cols + gt_cols
    }
)

# One boolean column per measure: True where model and ground truth agree.
df = df.assign(
    **{
        match_col: df[model_col].eq(df[gt_col])
        for match_col, model_col, gt_col in zip(match_cols, model_cols, gt_cols)
    }
)

# A row is evaluated for the strict metric only if none of its 4 GT values is missing.
valid = ~df[gt_cols].isna().any(axis=1)

# Strict match: the row is valid and all four measures agree.
df["match_all_4"] = df[match_cols].all(axis=1) & valid

# ---- Metrics ----
print("Per-measure accuracy:")
for measure, gt_col, match_col in zip(MEASURES, gt_cols, match_cols):
    # Each measure is averaged over the rows that have a GT value for it.
    has_gt = df[gt_col].notna()
    measure_acc = df.loc[has_gt, match_col].mean()
    print(f"  {measure:>12}: {measure_acc:.2%}")

overall = df.loc[valid, "match_all_4"].mean()
print(f"\nOverall accuracy (ALL 4 must match): {overall:.2%}")
print(f"Rows evaluated: {valid.sum()} / {len(df)}")

# Optional: persist the frame, including the match_* columns, for inspection.
df.to_csv(OUT, index=False)
print("\nSaved detailed results to:", OUT)


Per-measure accuracy:
      accuracy: 60.24%
  completeness: 57.70%
     relevance: 47.23%
       clarity: 36.77%

Overall accuracy (ALL 4 must match): 21.38%
Rows evaluated: 669 / 726

Saved detailed results to: mistral_thirty_726_accuracy_results.csv


In [3]:
import pandas as pd

# Score the model's ratings for one run against the ground-truth ratings stored
# in the same CSV, print exact-match rates, and save the annotated frame.
#
# RUN_NAME is the single source of truth for both the input and the output
# file names, so the two cannot drift apart when this cell is copied for
# another run.
RUN_NAME = "mistral_sixty_726"
PATH = f"{RUN_NAME}_vs_ground_truth_726.csv"
df = pd.read_csv(PATH)

MEASURES = ["accuracy", "completeness", "relevance", "clarity"]

model_cols = [f"{m}_model" for m in MEASURES]
gt_cols    = [f"{m}_gt" for m in MEASURES]

# Ensure numeric comparison (prevents "4" vs 4.0 mismatches).
# Values that cannot be parsed as numbers become NaN.
for c in model_cols + gt_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Per-measure matches.
# NaN never compares equal, so a missing or unparseable model score is
# counted as a mismatch rather than being dropped.
for m in MEASURES:
    df[f"match_{m}"] = df[f"{m}_model"] == df[f"{m}_gt"]

# Only evaluate rows where all 4 GT values exist
valid = df[gt_cols].notna().all(axis=1)

# All-4 strict accuracy: the row is valid and every measure matches.
df["match_all_4"] = valid & df[[f"match_{m}" for m in MEASURES]].all(axis=1)

# ---- Metrics ----
print("Per-measure accuracy:")
for m in MEASURES:
    # Each measure is averaged over the rows that have a GT value for it.
    acc = df.loc[df[f"{m}_gt"].notna(), f"match_{m}"].mean()
    print(f"  {m:>12}: {acc:.2%}")

overall = df.loc[valid, "match_all_4"].mean()
print(f"\nOverall accuracy (ALL 4 must match): {overall:.2%}")
print(f"Rows evaluated: {valid.sum()} / {len(df)}")

# Optional: save detailed output.
# The output name follows RUN_NAME; it previously pointed at the "thirty"
# results file, which silently overwrote that run's saved results.
OUT = f"{RUN_NAME}_accuracy_results.csv"
df.to_csv(OUT, index=False)
print("\nSaved detailed results to:", OUT)


Per-measure accuracy:
      accuracy: 51.57%
  completeness: 51.87%
     relevance: 40.81%
       clarity: 28.40%

Overall accuracy (ALL 4 must match): 14.05%
Rows evaluated: 669 / 726

Saved detailed results to: mistral_thirty_726_accuracy_results.csv
