In [44]:
# setup - imports and paths

import json
from pathlib import Path
from collections import Counter, defaultdict

import numpy as np

# Location of the sampled phase-2 judge outputs, relative to the current working directory.
JUDGE_RESULTS_PATH = Path("..").joinpath(
    "results",
    "runs",
    "rag_judge_faithfulness",
    "phase2_judge_outputs_sample.jsonl",
)

# Echo the path and whether the file is present before any cell tries to read it.
print("Judge results path:", JUDGE_RESULTS_PATH)
print("Exists:", JUDGE_RESULTS_PATH.exists())


Judge results path: ../results/runs/rag_judge_faithfulness/phase2_judge_outputs_sample.jsonl
Exists: True


In [47]:
# Sanity Check: Judge output file availability

# Stop here if the judge output file is missing; every later cell reads from it.
judge_outputs_present = JUDGE_RESULTS_PATH.exists()
assert judge_outputs_present, "Phase 2 judge outputs not found."
print("judge outputs found. Proceeding to metrics.")


judge outputs found. Proceeding to metrics.


In [48]:
# Load judge outputs

def read_jsonl(path: Path):
    """
    Read a JSON Lines file into a list of parsed records.

    Args:
        path: Location of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one parsed value per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The message
            is prefixed with the file path and 1-based line number.
    """
    rows = []
    with path.open("r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            # Blank lines (e.g. a trailing newline at end of file) are not records;
            # passing them to json.loads would raise.
            if not line.strip():
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type as before, with the offending location added.
                raise json.JSONDecodeError(
                    f"{path}:{line_no}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return rows

rows = read_jsonl(JUDGE_RESULTS_PATH)

print("Loaded rows:", len(rows))
# An empty file would make rows[0] fail with a bare IndexError; fail with a
# message that names the file instead.
assert rows, f"No judge outputs loaded from {JUDGE_RESULTS_PATH}"
print("Keys in one row:", list(rows[0].keys()))


Loaded rows: 10
Keys in one row: ['example_id', 'judge_model', 'evaluation', 'nll', 'reason', 'raw_judge_output', 'human_label']


In [49]:
# Sanity Check: Required fields present (PASS / FAIL judge)

# Keys every judge record must carry:
#   example_id  - record identifier
#   evaluation  - PASS / FAIL verdict
#   nll         - uncertainty signal
#   reason      - short explanation
#   human_label - human annotation
required = ["example_id", "evaluation", "nll", "reason", "human_label"]

# Walk every (record, key) pair, rows outermost, and stop at the first missing key.
for record, k in ((row, key) for row in rows for key in required):
    assert k in record, f"Missing key {k}"

print("All required fields present.")


All required fields present.


In [52]:
# Inspect raw label distributions

# Judge verdict of every row, plus each human annotation that is not None.
judge_labels = list(record["evaluation"] for record in rows)
human_labels = list(
    filter(
        lambda annotation: annotation is not None,
        (record["human_label"] for record in rows),
    )
)

print("Judge label distribution:")
print(Counter(judge_labels))

print("\nHuman label distribution (raw, as-is from dataset):")
print(Counter(human_labels))


Judge label distribution:
Counter({'PASS': 10})

Human label distribution (raw, as-is from dataset):
Counter({"{'evident_conflict': 0, 'baseless_info': 0}": 7, "{'evident_conflict': 1, 'baseless_info': 1}": 2, "{'evident_conflict': 0, 'baseless_info': 1}": 1})


In [54]:
# Sanity Check: Non-empty judge evaluations and human labels

# Both label lists need at least one entry for the comparison below to mean anything.
assert len(judge_labels) >= 1, "No judge evaluations found."
assert len(human_labels) >= 1, "No human labels found."

print("Both judge evaluations and human labels are non-empty.")


Both judge evaluations and human labels are non-empty.


In [55]:
# Normalize labels into binary faithfulness classes

def normalize_human_label(label):
    """
    Normalize RAGTruth human annotations into binary faithfulness.

    Rule:
    - faithful        if evident_conflict == 0 AND baseless_info == 0
    - hallucinated    if any field that is present is non-zero
    - None            if the label cannot be interpreted

    Args:
        label: A dict with "evident_conflict" and "baseless_info" entries, a
            string holding the Python literal of such a dict, or None.

    Returns:
        "faithful", "hallucinated", or None. None is returned for a None label,
        an unsupported type, a string that is not a dict literal, or a dict
        that has a missing field and no non-zero field (nothing to decide on).
    """
    import ast  # local import keeps this cell runnable on its own

    if label is None:
        return None

    # Stringified dict (common in JSONL). literal_eval only accepts Python
    # literals, so text read from the data file is never executed as code.
    if isinstance(label, str):
        try:
            label = ast.literal_eval(label)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None

    if not isinstance(label, dict):
        return None

    fields = (label.get("evident_conflict"), label.get("baseless_info"))

    # Any positive flag is enough to call the answer hallucinated.
    if any(value is not None and value != 0 for value in fields):
        return "hallucinated"

    # No positive flag, but a field is absent: the annotation is incomplete,
    # so do not guess a class for it.
    if any(value is None for value in fields):
        return None

    return "faithful"


In [56]:
def normalize_judge_evaluation(evaluation):
    """
    Map the judge's verdict onto the binary faithfulness classes.

    "PASS" becomes "faithful" and "FAIL" becomes "hallucinated"; the match is
    exact. Any other value yields None.
    """
    verdict_table = (
        ("PASS", "faithful"),
        ("FAIL", "hallucinated"),
    )
    for verdict, faithfulness_class in verdict_table:
        if evaluation == verdict:
            return faithfulness_class
    return None


In [57]:
# One {"judge", "human"} pair per row for which both labels normalize to a class.
# Rows where either side normalizes to None are left out.
normalized = []

for record in rows:
    human_class = normalize_human_label(record["human_label"])
    if human_class is not None:
        judge_class = normalize_judge_evaluation(record["evaluation"])
        if judge_class is not None:
            normalized.append(dict(judge=judge_class, human=human_class))


In [59]:
# Sanity Check: Normalized labels sanity

# Every distinct class that appears on either side of the normalized pairs.
labels_seen = {pair[side] for pair in normalized for side in ("judge", "human")}

print("Labels seen:", labels_seen)
assert labels_seen.issubset({"faithful", "hallucinated"})
print("Normalized labels look correct.")


Labels seen: {'hallucinated', 'faithful'}
Normalized labels look correct.


In [61]:
# Confusion matrix computation

# "hallucinated" is the positive class throughout.
confusion = dict.fromkeys(("TP", "TN", "FP", "FN"), 0)

# (human class, judge class) -> confusion cell
cell_for_outcome = {
    ("hallucinated", "hallucinated"): "TP",  # hallucinated correctly detected
    ("faithful", "faithful"): "TN",          # faithful correctly detected
    ("faithful", "hallucinated"): "FP",      # faithful predicted hallucinated
    ("hallucinated", "faithful"): "FN",      # hallucinated predicted faithful
}

for pair in normalized:
    cell = cell_for_outcome.get((pair["human"], pair["judge"]))
    if cell is not None:
        confusion[cell] += 1

print("Confusion matrix:")
for k, v in confusion.items():
    print(f"{k}: {v}")


Confusion matrix:
TP: 0
TN: 7
FP: 0
FN: 3


In [64]:
# Checking each row's corresponding decision
#
# `normalized` leaves out rows whose labels could not be normalized, so pairing
# `rows` with `normalized` by position would attach every later pair to the
# wrong row. Each row is normalized here instead, and rows that do not
# normalize are skipped, so "Index" still counts positions in `normalized`.

shown = 0
for r in rows:
    human_norm = normalize_human_label(r["human_label"])
    judge_norm = normalize_judge_evaluation(r["evaluation"])
    if human_norm is None or judge_norm is None:
        continue

    print("-" * 60)
    print("Index:", shown)
    print("example_id (row):", r["example_id"])
    print("judge_evaluation:", r["evaluation"])
    print("human_label_raw:", r["human_label"])
    print("normalized_human:", human_norm)
    print("normalized_judge:", judge_norm)
    shown += 1


------------------------------------------------------------
Index: 0
example_id (row): 0
judge_evaluation: PASS
human_label_raw: {'evident_conflict': 0, 'baseless_info': 0}
normalized_human: faithful
normalized_judge: faithful
------------------------------------------------------------
Index: 1
example_id (row): 1
judge_evaluation: PASS
human_label_raw: {'evident_conflict': 0, 'baseless_info': 0}
normalized_human: faithful
normalized_judge: faithful
------------------------------------------------------------
Index: 2
example_id (row): 2
judge_evaluation: PASS
human_label_raw: {'evident_conflict': 1, 'baseless_info': 1}
normalized_human: hallucinated
normalized_judge: faithful
------------------------------------------------------------
Index: 3
example_id (row): 3
judge_evaluation: PASS
human_label_raw: {'evident_conflict': 0, 'baseless_info': 1}
normalized_human: hallucinated
normalized_judge: faithful
------------------------------------------------------------
Index: 4
example_id

In [65]:
# Sanity Check: Confusion matrix totals

# The four confusion cells together must account for every normalized pair.
total = 0
for cell_count in confusion.values():
    total += cell_count

print("Total evaluated rows:", total)
assert len(normalized) == total
print("Confusion matrix totals match.")


Total evaluated rows: 10
Confusion matrix totals match.


In [66]:
# Metric calculations (accuracy, precision, recall, F1)

TP, TN, FP, FN = (confusion[cell] for cell in ("TP", "TN", "FP", "FN"))

n_evaluated = TP + TN + FP + FN
n_flagged = TP + FP        # answers the judge called hallucinated
n_hallucinated = TP + FN   # answers the humans called hallucinated

# Denominators are floored so an empty group gives 0.0 instead of a ZeroDivisionError.
accuracy = (TP + TN) / max(1, n_evaluated)
precision = TP / max(1, n_flagged)
recall = TP / max(1, n_hallucinated)
f1 = (2 * precision * recall) / max(1e-8, precision + recall)

print("Metrics (hallucination detection):")
# Accuracy: out of all answers, how often the judge got it right.
print(f"Accuracy : {accuracy:.3f}")
# Precision: when the judge says hallucinated, how often it really is.
print(f"Precision: {precision:.3f}")
# Recall: out of all real hallucinations, how many the judge caught.
print(f"Recall   : {recall:.3f}")
# F1: a single number that balances precision and recall.
print(f"F1-score : {f1:.3f}")


Metrics (hallucination detection):
Accuracy : 0.700
Precision: 0.000
Recall   : 0.000
F1-score : 0.000


In [67]:
# Sanity Check Cell 12: Metric bounds

# Each metric is a ratio and must lie in [0, 1].
bounded_metrics = (
    ("accuracy", accuracy),
    ("precision", precision),
    ("recall", recall),
    ("f1", f1),
)
for name, val in bounded_metrics:
    assert 0.0 <= val <= 1.0, f"{name} out of bounds"

print("All metrics within expected bounds.")


All metrics within expected bounds.


In [69]:
# Cell 13: Uncertainty analysis (NLL vs correctness)

nll_correct = []
nll_incorrect = []

# `normalized` leaves out rows whose labels could not be normalized, so pairing
# `rows` with `normalized` by position would attach NLL values to the wrong
# decisions. Each row is normalized here instead.
for r in rows:
    human_norm = normalize_human_label(r["human_label"])
    judge_norm = normalize_judge_evaluation(r["evaluation"])
    if human_norm is None or judge_norm is None:
        continue
    if r["nll"] is None:
        # No uncertainty value recorded for this row; it cannot enter an average.
        continue

    if judge_norm == human_norm:
        nll_correct.append(r["nll"])
    else:
        nll_incorrect.append(r["nll"])

# np.mean of an empty list returns nan and emits a RuntimeWarning; report nan
# directly so the cell stays quiet when one group is empty.
avg_nll_correct = float(np.mean(nll_correct)) if nll_correct else float("nan")
avg_nll_incorrect = float(np.mean(nll_incorrect)) if nll_incorrect else float("nan")

print("Avg NLL (correct predictions):", round(avg_nll_correct, 3))
print("Avg NLL (incorrect predictions):", round(avg_nll_incorrect, 3))
print("Count correct:", len(nll_correct))
print("Count incorrect:", len(nll_incorrect))


Avg NLL (correct predictions): 0.043
Avg NLL (incorrect predictions): 0.06
Count correct: 7
Count incorrect: 3


## Note: NLL (Uncertainty) Analysis Results

Results:
- Avg NLL (correct predictions): 0.043
- Avg NLL (incorrect predictions): 0.060

Lower NLL corresponds to higher confidence.  
All three incorrect decisions are hallucinated answers that the judge passed (false negatives), and those predictions had **higher NLL on average** than the correct ones (0.060 vs. 0.043).

This suggests that:
- The judge is often *confidently accepting* answers (both averages are close to zero),
- But it is, on average, **less confident when it is wrong**.

Although the decision boundary (PASS / FAIL) is currently weak, the uncertainty signal itself is informative.
This suggests that NLL can be leveraged for:
- threshold-based filtering,
- risk-aware evaluation,
- or prioritizing low-confidence answers for review.

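In short, the judge's PASS / FAIL decisions are unreliable on this sample (it passed all 10 answers and caught none of the 3 hallucinations), but its uncertainty estimates may already contain useful signal. Treat this as preliminary: the gap rests on only 7 correct vs. 3 incorrect predictions, and at least one missed hallucination (example 2) had an NLL of just 0.01, below the average for correct predictions.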


In [71]:
# Sanity Check Cell 14: NLL arrays non-empty

# Both groups need at least one NLL value for the averages to be comparable.
assert len(nll_correct) >= 1, "No correct predictions found for NLL analysis."
assert len(nll_incorrect) >= 1, "No incorrect predictions found for NLL analysis."

print("NLL uncertainty analysis completed.")


NLL uncertainty analysis completed.


In [75]:
# Cell 15: Error analysis - show a few false negatives and false positives

false_negatives = []
false_positives = []

# `normalized` leaves out rows whose labels could not be normalized, so pairing
# `rows` with `normalized` by position could file the wrong row as an error.
# Each row is normalized here instead; a row that does not normalize matches
# neither branch and is ignored.
for r in rows:
    human_norm = normalize_human_label(r["human_label"])
    judge_norm = normalize_judge_evaluation(r["evaluation"])
    if human_norm == "hallucinated" and judge_norm == "faithful":
        false_negatives.append(r)
    elif human_norm == "faithful" and judge_norm == "hallucinated":
        false_positives.append(r)

print("False negatives (missed hallucinations):", len(false_negatives))
print("False positives (over-flagged):", len(false_positives))

def preview_errors(errs, title, k=2):
    """
    Print a short report for the first `k` records in `errs`.

    The title is printed once (after a newline), then, per record, a divider,
    the id / verdict / raw human label / reason / NLL fields, and the first
    300 characters of the raw judge output.
    """
    divider = "-" * 80
    captioned_fields = (
        ("example_id:", "example_id"),
        ("judge_evaluation:", "evaluation"),
        ("human_label_raw:", "human_label"),
        ("reason:", "reason"),
        ("nll:", "nll"),
    )

    print("\n", title)
    for record in errs[:k]:
        print(divider)
        for caption, field in captioned_fields:
            print(caption, record[field])
        print("raw_judge_output preview:")
        print(record["raw_judge_output"][:300])

# Show the default number of examples from each error bucket.
preview_errors(errs=false_negatives, title="False Negatives")
preview_errors(errs=false_positives, title="False Positives")


False negatives (missed hallucinations): 3
False positives (over-flagged): 0

 False Negatives
--------------------------------------------------------------------------------
example_id: 2
judge_evaluation: PASS
human_label_raw: {'evident_conflict': 1, 'baseless_info': 1}
reason: The answer restates facts that are explicitly mentioned in the provided context, including the new research findings, the conditions at Bergen-Belsen concentration camp, and the likely dates of death for Anne Frank and her sister Margot.
nll: 0.01
raw_judge_output preview:
EVALUATION: PASS
REASON: The answer restates facts that are explicitly mentioned in the provided context, including the new research findings, the conditions at Bergen-Belsen concentration camp, and the likely dates of death for Anne Frank and her sister Margot.
NLL: 0.01
--------------------------------------------------------------------------------
example_id: 3
judge_evaluation: PASS
human_label_raw: {'evident_conflict': 0, 'baseless_in

In [73]:
# Sanity Check Cell 16: Phase 3 completion check
# NOTE: nothing is asserted here. The cell only prints a completion marker, so
# it signals success only when the notebook has been run top to bottom and
# every earlier cell finished without raising.

print("Computed quantitative metrics and qualitative error analysis.")



Computed quantitative metrics and qualitative error analysis.
