In [10]:
# setup - imports, paths, and model config

import json
import time
import re
from pathlib import Path

import requests

# Model tag passed to Ollama for every judge call in this notebook.
LLM_MODEL = "llama3:8b"

# Input files, addressed relative to the notebook's working directory.
DATA_ROOT = Path("..").joinpath("data")
PROCESSED_PATH = DATA_ROOT.joinpath("processed", "ragtruth_processed.jsonl")
SAMPLE_PATH = DATA_ROOT.joinpath("samples", "ragtruth_sample.jsonl")

# Echo the configuration so a missing input file is visible immediately.
print("LLM_MODEL:", LLM_MODEL)
print("Processed path exists:", PROCESSED_PATH.exists(), "|", PROCESSED_PATH)
print("Sample path exists:", SAMPLE_PATH.exists(), "|", SAMPLE_PATH)


LLM_MODEL: llama3:8b
Processed path exists: True | ../data/processed/ragtruth_processed.jsonl
Sample path exists: True | ../data/samples/ragtruth_sample.jsonl


In [11]:
# Sanity Check: Basic file presence checks
# Both inputs are checked: SAMPLE_PATH is the small dry-run file, and
# PROCESSED_PATH is the file the loading cell actually reads.

assert SAMPLE_PATH.exists(), "Sample file not found. Make sure Phase 1 ran successfully."
assert PROCESSED_PATH.exists(), f"Processed file not found: {PROCESSED_PATH}"
print(" outputs found.")


 outputs found.


In [12]:
# GPU awareness quick check (system-level)
#
# nvidia-smi is run on its own and truncated in Python instead of being piped
# through `head`: a shell pipeline reports only the exit status of its last
# command, so a missing or failing nvidia-smi would otherwise look like success.

import subprocess

print("nvidia-smi (first 20 lines):")
try:
    # `bash -lc` is kept so the binary is resolved through the login-shell PATH.
    smi_full = subprocess.check_output(["bash", "-lc", "nvidia-smi"], text=True)
    out = "\n".join(smi_full.splitlines()[:20])
    print(out)
except (OSError, subprocess.CalledProcessError) as e:
    # OSError: bash itself could not be started.
    # CalledProcessError: nvidia-smi is missing or exited non-zero.
    print("Could not run nvidia-smi:", e)


nvidia-smi (first 20 lines):
Mon Dec 29 14:16:00 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX 6000 Ada Gene...    Off |   00000000:AC:00.0  On |                  Off |
| 30%   42C    P8             29W /  300W |     831MiB /  49140MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA RTX 6

In [13]:
# Confirm we can run shell commands
# NOTE(review): this cell only prints a fixed message; it does not run a shell
# command itself, so it is informative only if the nvidia-smi cell above was run.

print("Shell OK. If nvidia-smi printed, GPU is visible to the OS.")


Shell OK. If nvidia-smi printed, GPU is visible to the OS.


In [14]:
# Check Ollama is reachable and model is available

import subprocess

OLLAMA_HOST = "http://localhost:11434"
print("Ollama host:", OLLAMA_HOST)

try:
    # This requires ollama CLI to be installed in the environment.
    models = subprocess.check_output(["bash", "-lc", "ollama list"], text=True)
    print("ollama list:\n", models)
except Exception as e:
    print("Could not run `ollama list` (CLI not available?). Error:", e)

# Also check the HTTP endpoint is reachable
try:
    tags_response = requests.get(f"{OLLAMA_HOST}/api/tags", timeout=10)
    # Fail on an error status instead of trying to parse an error body as a model list.
    tags_response.raise_for_status()
    tags = tags_response.json()
    visible_models = [m.get("name") for m in tags.get("models", [])]
    print("Ollama HTTP reachable. Number of models visible:", len(visible_models))

    # The server being up does not mean the judge model has been pulled.
    if LLM_MODEL in visible_models:
        print("Judge model available:", LLM_MODEL)
    else:
        print("Judge model NOT listed by Ollama:", LLM_MODEL, "| visible:", visible_models)
except Exception as e:
    print("Ollama HTTP not reachable. Start Ollama and retry. Error:", e)


Ollama host: http://localhost:11434
ollama list:
 NAME         ID              SIZE      MODIFIED   
llama3:8b    365c0bd3c000    4.7 GB    9 days ago    

Ollama HTTP reachable. Number of models visible: 1


In [15]:
# Sanity Check: Basic Ollama HTTP reachability
# A connection failure raises inside requests.get; the assertion below covers the
# case where the server answers but with a non-200 status.

resp = requests.get(f"{OLLAMA_HOST}/api/tags", timeout=10)
print("HTTP status:", resp.status_code)
# Report the configured host so the message stays correct if OLLAMA_HOST changes.
assert resp.status_code == 200, f"Ollama HTTP server not reachable at {OLLAMA_HOST}"
print("Ollama server is reachable.")


HTTP status: 200
Ollama server is reachable.


In [28]:
# Load a small set of examples to judge

def read_jsonl(path: Path, limit=None):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: File to read; opened as UTF-8 text.
        limit: Maximum number of records to return, or None for all of them.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    rows = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            # Count records rather than physical lines so blank lines do not
            # eat into the limit.
            if limit is not None and len(rows) >= limit:
                break
            # Blank lines (e.g. a trailing newline at end of file) are not records.
            if not line.strip():
                continue
            rows.append(json.loads(line))
    return rows

# For a quick dry run, load the small sample file instead:
# examples = read_jsonl(SAMPLE_PATH, limit=10)
examples = read_jsonl(PROCESSED_PATH, limit=200)

print("Loaded examples:", len(examples))
# Fail with a clear message instead of an IndexError on the preview below.
assert examples, f"No examples loaded from {PROCESSED_PATH}"

first_example = examples[0]
print("Keys:", list(first_example.keys()))
print("Preview example_id:", first_example.get("example_id"))
# `or ""` keeps the slicing safe when a field is present but None.
print("Question preview:", (first_example.get("question") or "")[:200])
print("Context preview:", (first_example.get("context") or "")[:200])
print("Answer preview:", (first_example.get("answer") or "")[:200])
print("Human label preview (if any):", first_example.get("label"))


Loaded examples: 200
Keys: ['example_id', 'task', 'question', 'context', 'answer', 'label', 'meta']
Preview example_id: 0
Question preview: Summarize the following news within 116 words:
Context preview: Seventy years ago, Anne Frank died of typhus in a Nazi concentration camp at the age of 15. Just two weeks after her supposed death on March 31, 1945, the Bergen-Belsen concentration camp where she ha
Answer preview: The Anne Frank House has revealed that Anne Frank and her older sister, Margot, likely died at least a month earlier than previously believed. The sisters, who were imprisoned in Nazi concentration ca
Human label preview (if any): {'evident_conflict': 0, 'baseless_info': 0}


In [17]:
# Sanity Check: Example schema looks usable for judging
# Every loaded row is checked, not just the first, because the batch loop reads
# these keys from each example.

req = ["example_id", "question", "context", "answer"]
assert examples, "No examples loaded; nothing to validate."
for row_index, row in enumerate(examples):
    for k in req:
        assert k in row, f"Missing key in sample: {k} (row {row_index})"
print("Sample schema OK for judge.")


Sample schema OK for judge.


In [29]:
# Define the judge rubric and prompt builder (faithfulness/groundedness)

# Instruction text that build_judge_prompt() places at the top of every judge
# prompt. It states the PASS/FAIL decision rule and the three-line reply format
# (EVALUATION / REASON / NLL) that extract_judge_fields() parses with regexes.
# NOTE(review): the NLL is a number the model is asked to write out itself;
# nothing in this notebook computes it from token log-probabilities.
JUDGE_RUBRIC = """
You are evaluating a Retrieval-Augmented Generation (RAG) answer.

Your ONLY source of truth is the provided CONTEXT.
Do not use outside knowledge.

Your task is to determine whether the ANSWER is fully supported by the CONTEXT.

Decision rules:
- PASS: All factual claims in the answer are directly supported by the context.
- FAIL: Any claim is unsupported, missing from the context, or contradicted by the context.

Then:
1. Give a short reason (1–2 sentences).
2. Provide a numeric uncertainty score as negative log likelihood (NLL).
   - Lower NLL = more confident
   - Higher NLL = less confident

Use the following format EXACTLY:

EVALUATION: PASS or FAIL
REASON: <short explanation>
NLL: <floating point number>

Examples:

EVALUATION: PASS
REASON: The answer restates facts that are explicitly mentioned in the provided context.
NLL: 0.12

EVALUATION: FAIL
REASON: The answer introduces details that do not appear in the provided context.
NLL: 1.05

Return ONLY the three lines above.
Do not include JSON, markdown, code blocks, or extra text.

""".strip()

def build_judge_prompt(question: str, context: str, answer: str) -> str:
    """Assemble the full judge prompt for one example.

    The rubric comes first, followed by QUESTION, CONTEXT and ANSWER sections,
    each separated by a blank line. Surrounding whitespace is removed from every
    input and from the final prompt.

    Args:
        question: The task or question given to the RAG system.
        context: The retrieved text the answer must be grounded in.
        answer: The generated answer to evaluate.

    Returns:
        The prompt string to send to the judge model.
    """
    sections = (
        JUDGE_RUBRIC,
        "",
        "QUESTION:",
        question.strip(),
        "",
        "CONTEXT:",
        context.strip(),
        "",
        "ANSWER:",
        answer.strip(),
    )
    return "\n".join(sections).strip()

# Print the rubric's character count as a quick confirmation that it is defined.
print("Judge rubric ready. Length:", len(JUDGE_RUBRIC), "chars")


Judge rubric ready. Length: 1053 chars


In [20]:
# Sanity Check: Build and preview a judge prompt

# Build the prompt from the first loaded example.
p = build_judge_prompt(
    **{field: examples[0][field] for field in ("question", "context", "answer")}
)

print("Prompt preview (first 800 chars):\n")
print(p if len(p) <= 800 else p[:800] + "\n... (truncated)")

# The rubric's output-format section must have made it into the prompt.
assert all(marker in p for marker in ("EVALUATION:", "REASON:", "NLL:"))

print("Prompt looks OK.")


Prompt preview (first 800 chars):

You are evaluating a Retrieval-Augmented Generation (RAG) answer.

The provided CONTEXT is the ONLY source of truth.
Do NOT use outside knowledge.
Do NOT assume missing facts are true.

IMPORTANT DECISION RULE:
- The answer may contain multiple factual claims.
- You must evaluate EACH factual claim independently.
- If ANY factual claim is not explicitly supported by the context, the final evaluation MUST be FAIL.

Evaluation steps you must follow internally:
1. Identify all factual claims in the answer.
2. For each claim, check whether it is directly supported by the context.
3. If even one claim is unsupported, incomplete, or contradicted, output FAIL.

Output format (exactly):

EVALUATION: PASS or FAIL
REASON: One short sentence explaining the decision.
NLL: A numeric uncertainty value (
... (truncated)
Prompt looks OK.


In [21]:
# Ollama generate call helper (non-streaming for easy parsing)

def ollama_generate(prompt: str, model: str, temperature: float = 0.0, timeout_s: int = 120) -> str:
    """Send one non-streaming generate request to Ollama and return the reply text.

    Args:
        prompt: Full prompt text.
        model: Ollama model tag to run.
        temperature: Sampling temperature forwarded in the request options.
        timeout_s: Timeout, in seconds, passed to requests.post.

    Returns:
        The "response" field of the JSON reply with surrounding whitespace
        removed, or "" when that field is absent.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    generation_options = {
        "temperature": temperature,
        "stop": ["\n\n"],  # generation halts at the first blank line
    }
    request_body = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": generation_options,
    }
    response = requests.post(
        f"{OLLAMA_HOST}/api/generate",
        json=request_body,
        timeout=timeout_s,
    )
    response.raise_for_status()
    return response.json().get("response", "").strip()



In [22]:
# Sanity Check: Quick smoke test using judge-style prompt

# A trivially answerable example, used only to confirm the model replies in the
# three-field layout.
dummy_prompt = """
You are evaluating a Retrieval-Augmented Generation (RAG) answer.

Your ONLY source of truth is the CONTEXT.

QUESTION:
What is 2 + 2?

CONTEXT:
2 + 2 equals 4.

ANSWER:
2 + 2 is 4.

Respond in the following format:

EVALUATION: PASS or FAIL
REASON: <short explanation>
NLL: <number>
"""

smoke = ollama_generate(dummy_prompt, LLM_MODEL, temperature=0.0, timeout_s=120)

print("Raw model output:\n", smoke)

# Each field label must appear somewhere in the raw reply.
for marker, complaint in (
    ("EVALUATION:", "Missing EVALUATION field"),
    ("REASON:", "Missing REASON field"),
    ("NLL:", "Missing NLL field"),
):
    assert marker in smoke, complaint

print("Smoke test passed: judge-style output produced.")


Raw model output:
 EVALUATION: PASS
REASON: The answer matches the context, which provides the correct solution to the arithmetic problem.
NLL: 0.0
Smoke test passed: judge-style output produced.


In [23]:
# Robust text extraction for judge output (PASS / FAIL format)

def extract_judge_fields(text: str):
    """
    Extracts evaluation, reason, and NLL from judge output.
    Expected format:

    EVALUATION: PASS or FAIL
    REASON: <text>
    NLL: <float>

    Field labels and the PASS/FAIL verdict are matched case-insensitively; the
    verdict is returned upper-cased. The NLL may be written as a plain decimal
    ("0.12", ".5", "2.") or in exponent notation ("1e-2").

    Args:
        text: Raw judge output.

    Returns:
        dict with keys "evaluation" ("PASS" or "FAIL"), "reason" (str) and
        "nll" (float).

    Raises:
        ValueError: If any of the three fields is missing or empty.
    """
    text = text.strip()

    eval_match = re.search(r"EVALUATION:\s*(PASS|FAIL)", text, re.IGNORECASE)
    # `\s*` may step onto the next line (a wrapped reason), but the lookahead stops
    # an empty REASON from swallowing the following field's line as its value.
    reason_match = re.search(
        r"REASON:\s*(?!(?:EVALUATION|NLL):)(\S.*)", text, re.IGNORECASE
    )
    # The optional exponent keeps "1e-2" from being truncated to "1".
    nll_match = re.search(
        r"NLL:\s*((?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?)",
        text,
        re.IGNORECASE,
    )

    if not eval_match or not reason_match or not nll_match:
        raise ValueError("Failed to extract required judge fields.")

    return {
        "evaluation": eval_match.group(1).upper(),
        "reason": reason_match.group(1).strip(),
        "nll": float(nll_match.group(1)),
    }


def judge_one(example: dict, model: str, temperature: float = 0.0):
    """Judge a single example and return a flat result record.

    Builds the prompt from the example's "question", "context" and "answer",
    queries the model with a 180 s timeout, and parses the three-field reply.

    Args:
        example: Record with at least "question", "context" and "answer";
            "example_id" and "label" are copied into the result when present.
        model: Ollama model tag used as the judge.
        temperature: Sampling temperature for the judge call.

    Returns:
        dict with example_id, judge_model, evaluation, nll, reason,
        raw_judge_output and human_label.

    Raises:
        KeyError: If a required example field is missing.
        ValueError: If the reply cannot be parsed or fails validation.
    """
    raw = ollama_generate(
        build_judge_prompt(example["question"], example["context"], example["answer"]),
        model=model,
        temperature=temperature,
        timeout_s=180,
    )
    parsed = extract_judge_fields(raw)
    verdict = parsed["evaluation"]
    nll_value = parsed["nll"]

    # light validation
    if verdict not in ("PASS", "FAIL"):
        raise ValueError(f"Unexpected evaluation: {verdict}")
    if not isinstance(nll_value, float):
        raise ValueError("NLL must be a float.")

    return {
        "example_id": example.get("example_id"),
        "judge_model": model,
        "evaluation": verdict,
        "nll": nll_value,
        "reason": parsed["reason"],
        "raw_judge_output": raw,
        "human_label": example.get("label", None),
    }



In [24]:
# Run judge on 1 example and print the result

one = examples[0]
res = judge_one(one, model=LLM_MODEL, temperature=0.0)

# Print the fields of interest, one per line, as "<field>: <value>".
for field in ("example_id", "evaluation", "nll", "reason", "human_label"):
    print(f"{field}:", res[field])

assert res["evaluation"] in ("PASS", "FAIL")
assert isinstance(res["nll"], float)
print("Single-example judging works.")


example_id: 0
evaluation: PASS
nll: 0.9
reason: Every factual claim in the answer is fully supported by the context.
human_label: {'evident_conflict': 0, 'baseless_info': 0}
Single-example judging works.


In [None]:
# Batch judge a small set of examples and save results
#
# Each result is written and flushed as soon as it is produced, so an
# interrupted run (kernel restart, Ctrl-C, Ollama crash) keeps the rows judged
# so far instead of losing the whole batch. Note that the output file is
# therefore truncated at the start of the run rather than at the end.

OUT_DIR = Path("..") / "results" / "runs" / "rag_judge_faithfulness"
OUT_DIR.mkdir(parents=True, exist_ok=True)

JUDGE_OUT_PATH = OUT_DIR / "phase2_judge_outputs_200.jsonl"

batch_results = []
batch_failures = []  # one {"example_id", "error"} entry per example that could not be judged

with JUDGE_OUT_PATH.open("w", encoding="utf-8") as f:
    for ex in examples:
        try:
            r = judge_one(ex, model=LLM_MODEL, temperature=0.0)
        except Exception as e:
            # Best-effort batch: record the failure and move on to the next example.
            batch_failures.append({"example_id": ex.get("example_id"), "error": str(e)})
            print(f"FAIL | {ex.get('example_id')} | error={e}")
            continue

        batch_results.append(r)
        f.write(json.dumps(r, ensure_ascii=False) + "\n")
        f.flush()

        print(
            f"OK | {r['example_id']} | "
            f"evaluation={r['evaluation']} | "
            f"human={r['human_label']} | "
            f"nll={r['nll']:.3f}"
        )

print(f"\nSaved judge outputs: {JUDGE_OUT_PATH}")
print("Rows saved:", len(batch_results))
print("Rows failed:", len(batch_failures))


OK | 0 | evaluation=PASS | human={'evident_conflict': 0, 'baseless_info': 0} | nll=0.120
OK | 1 | evaluation=PASS | human={'evident_conflict': 0, 'baseless_info': 0} | nll=0.050
OK | 2 | evaluation=PASS | human={'evident_conflict': 1, 'baseless_info': 1} | nll=0.010
OK | 3 | evaluation=PASS | human={'evident_conflict': 0, 'baseless_info': 1} | nll=0.120
OK | 4 | evaluation=PASS | human={'evident_conflict': 0, 'baseless_info': 0} | nll=0.050
OK | 5 | evaluation=PASS | human={'evident_conflict': 1, 'baseless_info': 1} | nll=0.010
OK | 6 | evaluation=PASS | human={'evident_conflict': 0, 'baseless_info': 0} | nll=0.010
OK | 7 | evaluation=PASS | human={'evident_conflict': 0, 'baseless_info': 0} | nll=0.050
OK | 8 | evaluation=PASS | human={'evident_conflict': 0, 'baseless_info': 0} | nll=0.010
OK | 9 | evaluation=PASS | human={'evident_conflict': 0, 'baseless_info': 0} | nll=0.010
OK | 10 | evaluation=PASS | human={'evident_conflict': 1, 'baseless_info': 0} | nll=0.050
OK | 11 | evaluation

In [26]:
# Sanity Check: Read back judge results and summarize evaluation counts

def count_evaluations(rows):
    """Tally rows by their "evaluation" value.

    Rows without an "evaluation" key are counted under "missing".

    Args:
        rows: Iterable of dict-like result records.

    Returns:
        dict mapping each evaluation value to its number of occurrences, in
        order of first appearance.
    """
    tally = {}
    for row in rows:
        verdict = row.get("evaluation", "missing")
        if verdict in tally:
            tally[verdict] += 1
        else:
            tally[verdict] = 1
    return tally

# Reload what was written to disk, so the check covers the saved file rather
# than the in-memory results.
rows_back = read_jsonl(JUDGE_OUT_PATH)
print("Read back rows:", len(rows_back))

counts = count_evaluations(rows_back)
print("Evaluation counts:", counts)

assert rows_back, "No judge outputs saved. Check Ollama and prompts."
# At least one row must carry a recognised verdict.
assert not {"PASS", "FAIL"}.isdisjoint(counts), "No valid evaluations found."
print("Sample run looks good.")


Read back rows: 10
Evaluation counts: {'PASS': 10}
Sample run looks good.


## Note: Why the Judge Output Format Was Changed

Originally, the LLM judge was asked to return a strict JSON object (label, confidence, evidence, notes).  
In practice, this caused frequent failures because the model often produced slightly invalid JSON
(extra text, missing commas, formatting issues), especially when running batch evaluations with a local model.

To make the pipeline more robust, the judge output contract was changed.

Instead of JSON, the judge now returns structured plain text with three fields:
- **EVALUATION**: PASS or FAIL (whether the answer is supported by the retrieved context)
- **REASON**: a short explanation of the decision
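- **NLL**: a numeric uncertainty value that the model writes out itself, framed as a negative log likelihood (lower = more confident); it is not computed from token log-probabilities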

This text output is easier for the model to produce reliably.  
Regular expressions then extract these fields, and the parsed results are stored as JSON Lines (one JSON object per row).

This change improves stability without changing the judge’s reasoning logic.
It separates *generation robustness* (LLM output) from *data structure* (storage and evaluation).

In short:
- JSON generation was fragile
- Text + regex is more reliable
- Structured data is still preserved after parsing
