In [6]:
# setup - imports and paths

import ast
import json
from collections import Counter, defaultdict
from pathlib import Path

import numpy as np

# Relative location of the Phase 2 judge outputs (resolved against the
# current working directory when the file is accessed).
JUDGE_RESULTS_PATH = Path(
    "..",
    "results",
    "runs",
    "rag_judge_faithfulness",
    "phase2_judge_outputs_sample.jsonl",
)

print("Judge results path:", JUDGE_RESULTS_PATH)
print("Exists:", JUDGE_RESULTS_PATH.exists())


Judge results path: ../results/runs/rag_judge_faithfulness/phase2_judge_outputs_sample.jsonl
Exists: True


In [7]:
# Sanity check: stop before any metric work if the judge output file is absent.

judge_outputs_available = JUDGE_RESULTS_PATH.exists()
assert judge_outputs_available, "Phase 2 judge outputs not found."
print("Phase 2 judge outputs found. Proceeding to metrics.")


Phase 2 judge outputs found. Proceeding to metrics.


In [8]:
#Load judge outputs

def read_jsonl(path: Path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Location of a UTF-8 encoded file holding one JSON document
            per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with path.open("r", encoding="utf-8") as f:
        # Blank or whitespace-only lines (such as an extra newline at the end
        # of the file) hold no record; json.loads would raise on them.
        return [json.loads(line) for line in f if line.strip()]

rows = read_jsonl(JUDGE_RESULTS_PATH)

# Fail with a clear message rather than an IndexError from rows[0] below
# when the file contains no records.
assert rows, f"No judge outputs loaded from {JUDGE_RESULTS_PATH}"

print("Loaded rows:", len(rows))
print("Keys:", list(rows[0].keys()))


Loaded rows: 6
Keys: ['example_id', 'judge_model', 'judge_label', 'judge_confidence', 'judge_evidence', 'judge_notes', 'raw_judge_output', 'human_label']


In [9]:
# Sanity check: every record carries the fields the later cells read.

required = ["example_id", "judge_label", "judge_confidence", "human_label"]

for record in rows:
    for field in required:
        assert field in record, f"Missing key {field}"

print("All required fields present.")


All required fields present.


In [10]:
# Inspect raw label distributions

judge_labels = [r["judge_label"] for r in rows]
# Rows without a human annotation are left out of the human distribution.
human_labels = [r["human_label"] for r in rows if r["human_label"] is not None]

print("Judge label distribution:")
print(Counter(judge_labels))

print("\nHuman label distribution (raw, as-is from dataset):")
# Human labels may be dicts rather than stringified dicts; dicts are
# unhashable and would make Counter raise TypeError, so count by text form.
# For labels that are already strings, str() returns them unchanged.
print(Counter(str(label) for label in human_labels))


Judge label distribution:
Counter({'supported': 5, 'partially_supported': 1})

Human label distribution (raw, as-is from dataset):
Counter({"{'evident_conflict': 0, 'baseless_info': 0}": 5, "{'evident_conflict': 1, 'baseless_info': 1}": 1})


In [11]:
# Sanity check: each label list holds at least one entry.

assert len(judge_labels) >= 1
assert len(human_labels) >= 1
print("Both judge and human labels are non-empty.")


Both judge and human labels are non-empty.


In [37]:
# Normalize labels into binary faithfulness classes

def normalize_human_label(label):
    """
    Normalize a RAGTruth human annotation into a binary faithfulness class.

    Rule:
    - faithful        if evident_conflict == 0 AND baseless_info == 0
    - hallucinated    otherwise (a missing key reads as None, which is != 0)

    Args:
        label: None, a dict with "evident_conflict" / "baseless_info" entries,
            or the string form of such a dict written in Python literal syntax.

    Returns:
        "faithful", "hallucinated", or None when the label is None, is
        neither a dict nor a string, or is a string that does not parse to
        a dict.
    """
    if label is None:
        return None

    if isinstance(label, str):
        # The label comes from a data file, so it is parsed with
        # ast.literal_eval, which accepts only Python literals. eval() would
        # execute any expression embedded in the file.
        try:
            label = ast.literal_eval(label)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None

    if not isinstance(label, dict):
        return None

    evident_conflict = label.get("evident_conflict")
    baseless_info = label.get("baseless_info")

    # Apply RAGTruth rule
    if evident_conflict == 0 and baseless_info == 0:
        return "faithful"
    return "hallucinated"


In [28]:
# Pair each judge decision with its human decision, both as binary classes.
# Rows whose human label cannot be normalized are left out, so `normalized`
# can be shorter than `rows`.
# NOTE(review): normalize_judge_label is not defined in the cells visible
# here - confirm it is defined before this cell runs.
normalized = []

for record in rows:
    human_class = normalize_human_label(record["human_label"])
    if human_class is not None:
        normalized.append(
            {
                "judge": normalize_judge_label(record["judge_label"]),
                "human": human_class,
            }
        )


In [29]:
# Sanity check: only the two binary classes appear after normalization.

labels_seen = {
    entry[side]
    for entry in normalized
    for side in ("judge", "human")
}

print("Labels seen:", labels_seen)
assert labels_seen.issubset({"faithful", "hallucinated"})
print("Normalized labels look correct.")


Labels seen: {'hallucinated', 'faithful'}
Normalized labels look correct.


In [38]:
# Confusion matrix computation ("hallucinated" is the positive class)

confusion = {
    "TP": 0,  # human: hallucinated, judge: hallucinated
    "TN": 0,  # human: faithful,     judge: faithful
    "FP": 0,  # human: faithful,     judge: hallucinated
    "FN": 0,  # human: hallucinated, judge: faithful
}

for entry in normalized:
    truth = entry["human"]
    if truth == "hallucinated":
        predicted = entry["judge"]
        if predicted == "hallucinated":
            confusion["TP"] += 1
        elif predicted == "faithful":
            confusion["FN"] += 1
    elif truth == "faithful":
        predicted = entry["judge"]
        if predicted == "faithful":
            confusion["TN"] += 1
        elif predicted == "hallucinated":
            confusion["FP"] += 1
    # Any other label combination is counted in no cell.

print("Confusion matrix:")
for cell_name in confusion:
    print(f"{cell_name}: {confusion[cell_name]}")


Confusion matrix:
TP: 0
TN: 4
FP: 1
FN: 1


In [39]:
# Per-row view of each raw label next to its normalized decision.
# `normalized` leaves out rows whose human label could not be normalized, so
# the same filter is applied to `rows` before pairing. Zipping the unfiltered
# list would shift every entry after a skipped row onto the wrong record.
labeled_rows = [
    r for r in rows if normalize_human_label(r["human_label"]) is not None
]
assert len(labeled_rows) == len(normalized), "rows and normalized are out of sync"

for i, (r, n) in enumerate(zip(labeled_rows, normalized)):
    print("-" * 60)
    print("Index:", i)
    print("example_id (row):", r["example_id"])
    print("judge_label:", r["judge_label"])
    print("human_label_raw:", r["human_label"])
    print("normalized_human:", n["human"])
    print("normalized_judge:", n["judge"])


------------------------------------------------------------
Index: 0
example_id (row): 0
judge_label: supported
human_label_raw: {'evident_conflict': 0, 'baseless_info': 0}
normalized_human: faithful
normalized_judge: faithful
------------------------------------------------------------
Index: 1
example_id (row): 1
judge_label: partially_supported
human_label_raw: {'evident_conflict': 0, 'baseless_info': 0}
normalized_human: faithful
normalized_judge: hallucinated
------------------------------------------------------------
Index: 2
example_id (row): 2
judge_label: supported
human_label_raw: {'evident_conflict': 1, 'baseless_info': 1}
normalized_human: hallucinated
normalized_judge: faithful
------------------------------------------------------------
Index: 3
example_id (row): 6
judge_label: supported
human_label_raw: {'evident_conflict': 0, 'baseless_info': 0}
normalized_human: faithful
normalized_judge: faithful
------------------------------------------------------------
Index: 4


In [40]:
# Sanity check: the four cells together account for every normalized row.

total = sum(count for count in confusion.values())
print("Total evaluated rows:", total)
assert len(normalized) == total
print("Confusion matrix totals match.")


Total evaluated rows: 6
Confusion matrix totals match.


In [43]:
# Metric calculations (accuracy, precision, recall, F1); positive class = hallucinated

TP, TN, FP, FN = (confusion[cell] for cell in ("TP", "TN", "FP", "FN"))

# Denominators are floored (at 1, or 1e-8 for F1) so an empty cell yields 0.0
# instead of a ZeroDivisionError.
n_evaluated = TP + TN + FP + FN
n_flagged = TP + FP
n_hallucinated = TP + FN

accuracy = (TP + TN) / max(1, n_evaluated)
precision = TP / max(1, n_flagged)
recall = TP / max(1, n_hallucinated)
f1 = (2 * precision * recall) / max(1e-8, precision + recall)

print("Metrics (hallucination detection):")
print(f"Accuracy : {accuracy:.3f}")  # share of all rows where judge and human agree
print(f"Precision: {precision:.3f}")  # share of judge-flagged rows that are truly hallucinated
print(f"Recall   : {recall:.3f}")  # share of true hallucinations the judge flagged
print(f"F1-score : {f1:.3f}")  # harmonic mean of precision and recall


Metrics (hallucination detection):
Accuracy : 0.667
Precision: 0.000
Recall   : 0.000
F1-score : 0.000


In [17]:
# Sanity check: every metric is a proportion in [0, 1].

metric_values = (
    ("accuracy", accuracy),
    ("precision", precision),
    ("recall", recall),
    ("f1", f1),
)
for name, val in metric_values:
    assert 0.0 <= val <= 1.0, f"{name} out of bounds"

print("All metrics within expected bounds.")


All metrics within expected bounds.


In [18]:
# Confidence analysis (judge confidence vs correctness)

# `normalized` leaves out rows whose human label could not be normalized, so
# the same filter is applied to `rows` before pairing. Zipping the unfiltered
# list would attach confidences to the wrong decisions after a skipped row.
labeled_rows = [
    r for r in rows if normalize_human_label(r["human_label"]) is not None
]
assert len(labeled_rows) == len(normalized), "rows and normalized are out of sync"

conf_correct = []
conf_incorrect = []

for r, n in zip(labeled_rows, normalized):
    is_correct = (n["judge"] == n["human"])
    if is_correct:
        conf_correct.append(r["judge_confidence"])
    else:
        conf_incorrect.append(r["judge_confidence"])

# np.mean of an empty list emits a RuntimeWarning before returning nan;
# report nan directly when a bucket is empty.
avg_conf_correct = float(np.mean(conf_correct)) if conf_correct else float("nan")
avg_conf_incorrect = float(np.mean(conf_incorrect)) if conf_incorrect else float("nan")

print("Avg confidence (correct):", round(avg_conf_correct, 3))
print("Avg confidence (incorrect):", round(avg_conf_incorrect, 3))
print("Count correct:", len(conf_correct))
print("Count incorrect:", len(conf_incorrect))


Avg confidence (correct): 0.8
Avg confidence (incorrect): 0.98
Count correct: 1
Count incorrect: 5


In [19]:
# Sanity check: the confidence analysis covered at least one row.
# Either bucket alone may legitimately be empty (a judge that agrees with
# every human label leaves conf_incorrect empty), so only the combined count
# is required to be positive.

assert len(conf_correct) + len(conf_incorrect) > 0
print("Confidence analysis completed.")


Confidence analysis completed.


In [20]:
# Error analysis - collect false negatives and false positives

# `normalized` leaves out rows whose human label could not be normalized, so
# the same filter is applied to `rows` before pairing. Zipping the unfiltered
# list would file the wrong records under each error type after a skipped row.
labeled_rows = [
    r for r in rows if normalize_human_label(r["human_label"]) is not None
]
assert len(labeled_rows) == len(normalized), "rows and normalized are out of sync"

false_negatives = []
false_positives = []

for r, n in zip(labeled_rows, normalized):
    if n["human"] == "hallucinated" and n["judge"] == "faithful":
        false_negatives.append(r)
    elif n["human"] == "faithful" and n["judge"] == "hallucinated":
        false_positives.append(r)

print("False negatives (missed hallucinations):", len(false_negatives))
print("False positives (over-flagged):", len(false_positives))

def preview_errors(errs, title, k=2):
    """Print a short summary of the first ``k`` records in ``errs``.

    Args:
        errs: Sliceable sequence of judge-output records (dicts).
        title: Heading printed before the records.
        k: Maximum number of records to show.
    """
    divider = "-" * 80
    # (printed caption, record key) pairs, in display order
    summary_fields = (
        ("example_id:", "example_id"),
        ("judge_label:", "judge_label"),
        ("human_label:", "human_label"),
        ("notes:", "judge_notes"),
    )

    print("\n", title)
    for record in errs[:k]:
        print(divider)
        for caption, key in summary_fields:
            print(caption, record[key])
        # Only the first 300 characters of the raw judge output are shown.
        print("answer preview:", record["raw_judge_output"][:300])

# Preview each error type in turn, using preview_errors' default limit.
for error_rows, heading in (
    (false_negatives, "False Negatives"),
    (false_positives, "False Positives"),
):
    preview_errors(error_rows, heading)


False negatives (missed hallucinations): 5
False positives (over-flagged): 0

 False Negatives
--------------------------------------------------------------------------------
example_id: 0
judge_label: supported
human_label: {'evident_conflict': 0, 'baseless_info': 0}
notes: 
answer preview: {
  "label": "supported",
  "confidence": 1.0,
  "evidence": "The Anne Frank House has revealed that Anne Frank and her older sister, Margot, likely died at least a month earlier than previously believed.",
  "notes": ""
}
--------------------------------------------------------------------------------
example_id: 2
judge_label: supported
human_label: {'evident_conflict': 1, 'baseless_info': 1}
notes: 
answer preview: {
  "label": "supported",
  "confidence": 1.0,
  "evidence": "New research conducted by the Anne Frank House has revealed that Anne Frank and her sister Margot likely died in the Bergen-Belsen concentration camp at least a month earlier than previously believed.",
  "notes": ""
}

 F

In [22]:
# Phase 3 completion marker: prints a fixed message; no condition is checked here.

print("Computed quantitative metrics and qualitative error analysis.")



Computed quantitative metrics and qualitative error analysis.
