In [3]:
# setup - imports, paths, and model config

import json
import time
import re
from pathlib import Path

import requests

# Judge model tag and the dataset files this notebook reads.
LLM_MODEL = "llama3:8b"
DATA_ROOT = Path("..", "data")
PROCESSED_PATH = DATA_ROOT.joinpath("processed", "ragtruth_processed.jsonl")
SAMPLE_PATH = DATA_ROOT.joinpath("samples", "ragtruth_sample.jsonl")

# Echo the configuration so a missing input file is visible immediately.
print("LLM_MODEL:", LLM_MODEL)
for description, data_file in (
    ("Processed path exists:", PROCESSED_PATH),
    ("Sample path exists:", SAMPLE_PATH),
):
    print(description, data_file.exists(), "|", data_file)


LLM_MODEL: llama3:8b
Processed path exists: True | ../data/processed/ragtruth_processed.jsonl
Sample path exists: True | ../data/samples/ragtruth_sample.jsonl


In [None]:
# Sanity Check: Basic file presence checks

# Fail fast with an actionable message when the sample file is absent;
# the examples judged in this notebook are loaded from SAMPLE_PATH.
assert SAMPLE_PATH.exists(), "Sample file not found. Make sure Phase 1 ran successfully."
print("Phase 1 outputs found:", SAMPLE_PATH)


 outputs found


In [6]:
# GPU awareness quick check (system-level)

import subprocess

# Call nvidia-smi directly rather than through `bash -lc "nvidia-smi | head -n 20"`:
# a shell pipeline reports the exit status of `head`, so a missing or failing
# nvidia-smi would not raise and the except branch below would never run.
print("nvidia-smi (first 20 lines):")
try:
    full_report = subprocess.check_output(["nvidia-smi"], text=True, timeout=30)
    # Keep only the top of the report, as `head -n 20` did.
    out = "\n".join(full_report.splitlines()[:20])
    print(out)
except Exception as e:
    # Best-effort diagnostic: covers a missing binary, a non-zero exit, and a timeout.
    print("Could not run nvidia-smi:", e)


nvidia-smi (first 20 lines):
Fri Dec 26 19:46:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX 6000 Ada Gene...    Off |   00000000:AC:00.0  On |                  Off |
| 30%   41C    P8             29W /  300W |    1407MiB /  49140MiB |     10%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA RTX 6

In [7]:
# Status note only: this cell does not execute any shell command itself.
# NOTE(review): the message is printed unconditionally, so it confirms nothing on
# its own; it only restates what a successful nvidia-smi printout would imply.

print("Shell OK. If nvidia-smi printed, GPU is visible to the OS.")


Shell OK. If nvidia-smi printed, GPU is visible to the OS.


In [8]:
# Check Ollama is reachable and model is available

import subprocess

OLLAMA_HOST = "http://localhost:11434"
print("Ollama host:", OLLAMA_HOST)

# CLI check (best effort): needs the `ollama` binary to be resolvable from a bash login shell.
try:
    models = subprocess.check_output(["bash", "-lc", "ollama list"], text=True)
    print("ollama list:\n", models)
except Exception as e:
    print("Could not run `ollama list` (CLI not available?). Error:", e)

# HTTP check: confirm the server answers AND that the configured judge model is listed.
# Counting models alone would pass even when LLM_MODEL has not been pulled.
try:
    tags_response = requests.get(f"{OLLAMA_HOST}/api/tags", timeout=10)
    tags_response.raise_for_status()  # an error status is reported by the except branch below
    tags = tags_response.json()
    available_models = [m.get("name") for m in tags.get("models", [])]
    print("Ollama HTTP reachable. Number of models visible:", len(available_models))
    # Exact-name comparison; a tag mismatch (e.g. "llama3" vs "llama3:8b") shows up as a warning.
    if LLM_MODEL in available_models:
        print("Judge model available:", LLM_MODEL)
    else:
        print(f"WARNING: judge model {LLM_MODEL!r} is not listed by Ollama. Available: {available_models}")
except Exception as e:
    print("Ollama HTTP not reachable. Start Ollama and retry. Error:", e)


Ollama host: http://localhost:11434
ollama list:
 NAME         ID              SIZE      MODIFIED   
llama3:8b    365c0bd3c000    4.7 GB    7 days ago    

Ollama HTTP reachable. Number of models visible: 1


In [9]:
# Sanity Check: Basic Ollama HTTP reachability

# A connection failure raises from requests.get itself, so the assert below only
# fires when the server answered with a non-200 status. Report that status and the
# configured host instead of a hard-coded "localhost:11434".
resp = requests.get(f"{OLLAMA_HOST}/api/tags", timeout=10)
print("HTTP status:", resp.status_code)
assert resp.status_code == 200, (
    f"Ollama HTTP server at {OLLAMA_HOST} returned status {resp.status_code}"
)
print("Ollama server is reachable.")


HTTP status: 200
Ollama server is reachable.


In [10]:
# Load a small set of examples to judge

def read_jsonl(path: Path, limit=None):
    """
    Load records from a JSON Lines file.

    Args:
        path: File opened as UTF-8 text; each non-blank line must hold one JSON value.
        limit: Maximum number of records to return, or None to read the whole file.

    Returns:
        A list with the decoded records, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON. The message is
            prefixed with the file path and the 1-based line number.
    """
    rows = []
    with path.open("r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            # Count collected records, not lines seen, so skipped blank lines
            # do not eat into the limit.
            if limit is not None and len(rows) >= limit:
                break
            if not line.strip():
                # Blank lines (e.g. a trailing newline or a hand-edited file) are not records.
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type as before, with enough context to locate the bad row.
                raise json.JSONDecodeError(
                    f"{path}:{line_no}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return rows

# Pull the first ten rows of the sample file and show the fields the judge consumes.
examples = read_jsonl(SAMPLE_PATH, limit=10)

print("Loaded examples:", len(examples))
first_example = examples[0]
print("Keys:", list(first_example.keys()))
print("Preview example_id:", first_example.get("example_id"))
for heading, field in (
    ("Question preview:", "question"),
    ("Context preview:", "context"),
    ("Answer preview:", "answer"),
):
    # A missing or None field previews as an empty string; long text is cut to 200 characters.
    print(heading, (first_example.get(field) or "")[:200])
print("Human label preview (if any):", first_example.get("label"))


Loaded examples: 10
Keys: ['example_id', 'task', 'question', 'context', 'answer', 'label', 'meta']
Preview example_id: 0
Question preview: Summarize the following news within 116 words:
Context preview: Seventy years ago, Anne Frank died of typhus in a Nazi concentration camp at the age of 15. Just two weeks after her supposed death on March 31, 1945, the Bergen-Belsen concentration camp where she ha
Answer preview: The Anne Frank House has revealed that Anne Frank and her older sister, Margot, likely died at least a month earlier than previously believed. The sisters, who were imprisoned in Nazi concentration ca
Human label preview (if any): {'evident_conflict': 0, 'baseless_info': 0}


In [21]:
# Sanity Check: Example schema looks usable for judging

# Keys the judging cells read from each example.
req = ["example_id", "question", "context", "answer"]

# An empty list would otherwise pass the per-row loop without checking anything.
assert examples, "No examples loaded; nothing to validate."

# Validate every loaded row, not just the first: one malformed row would
# otherwise only show up later as a failure in the middle of the batch run.
for row_idx, row in enumerate(examples):
    for k in req:
        assert k in row, f"Missing key in sample (row {row_idx}): {k}"
print("Sample schema OK for judge.")


Sample schema OK for judge.


In [34]:
# Define the judge rubric and prompt builder (faithfulness/groundedness)

# Instructions sent to the judge model ahead of each example. The label set and
# the JSON keys named here are the ones the parsing code validates and reads.
JUDGE_RUBRIC = """
You are evaluating a Retrieval-Augmented Generation (RAG) answer.

Your ONLY source of truth is the provided CONTEXT.
Do not use outside knowledge. If the context does not support a claim, treat it as unsupported.

Classify the ANSWER into one of these labels:
- supported: All claims in the answer are supported by the context.
- partially_supported: Some claims are supported, but the answer also includes unsupported claims OR important details are missing.
- unsupported: The answer is not supported by the context (the context does not contain the needed facts).
- contradicted: The answer contradicts the context.

Return a JSON object with these keys exactly:
{
  "label": one of ["supported","partially_supported","unsupported","contradicted"],
  "confidence": a number from 0.0 to 1.0,
  "evidence": a short quote or pointer from the context that supports your decision (or "" if none),
  "notes": a short explanation (1-3 sentences)
}

Return ONLY valid JSON. No extra text.
""".strip()


def _as_prompt_text(value) -> str:
    """Return value as stripped text, mapping None to an empty string."""
    return "" if value is None else str(value).strip()


def build_judge_prompt(question: str, context: str, answer: str) -> str:
    """
    Assemble the full judge prompt: the rubric followed by QUESTION, CONTEXT and ANSWER sections.

    Args:
        question: Task or question text shown to the judge.
        context: Source text the answer must be grounded in.
        answer: Answer to be evaluated.

    Each field is stripped of surrounding whitespace. A None field is rendered as
    an empty section instead of raising AttributeError.

    Returns:
        The prompt string, with no leading or trailing whitespace.
    """
    question = _as_prompt_text(question)
    context = _as_prompt_text(context)
    answer = _as_prompt_text(answer)
    prompt = f"""
{JUDGE_RUBRIC}

QUESTION:
{question}

CONTEXT:
{context}

ANSWER:
{answer}
""".strip()
    return prompt

print("Judge rubric ready. Length:", len(JUDGE_RUBRIC), "chars")


Judge rubric ready. Length: 984 chars


In [35]:
# Sanity Check: Build and preview a judge prompt

# Render the prompt for the first example and show its opening section.
first_for_prompt = examples[0]
p = build_judge_prompt(
    first_for_prompt["question"],
    first_for_prompt["context"],
    first_for_prompt["answer"],
)

print("Prompt preview (first 800 chars):\n")
prompt_preview = p[:800]
if len(p) > 800:
    prompt_preview += "\n... (truncated)"
print(prompt_preview)

# The closing instruction of the rubric must survive prompt assembly.
assert "Return ONLY valid JSON" in p
print("Prompt looks OK.")


Prompt preview (first 800 chars):

You are evaluating a Retrieval-Augmented Generation (RAG) answer.

Your ONLY source of truth is the provided CONTEXT.
Do not use outside knowledge. If the context does not support a claim, treat it as unsupported.

Classify the ANSWER into one of these labels:
- supported: All claims in the answer are supported by the context.
- partially_supported: Some claims are supported, but the answer also includes unsupported claims OR important details are missing.
- unsupported: The answer is not supported by the context (the context does not contain the needed facts).
- contradicted: The answer contradicts the context.

Return a JSON object with these keys exactly:
{
  "label": one of ["supported","partially_supported","unsupported","contradicted"],
  "confidence": a number from 0.0 to 1.0,
  "ev
... (truncated)
Prompt looks OK.


In [36]:
# Ollama generate call helper (non-streaming for easy parsing)

def ollama_generate(prompt: str, model: str, temperature: float = 0.0, timeout_s: int = 120,
                    response_format=None) -> str:
    """
    Send one non-streaming generation request to the Ollama server at OLLAMA_HOST.

    Args:
        prompt: Full prompt text.
        model: Ollama model tag to run.
        temperature: Sampling temperature passed in the request options.
        timeout_s: Timeout in seconds handed to requests.post.
        response_format: Optional value for the request's "format" field
            (e.g. "json"). When None, the field is omitted and the request is
            identical to the previous behaviour.
            NOTE(review): per the Ollama API docs this constrains the reply
            format; confirm against the installed Ollama version.

    Returns:
        The "response" field of the JSON reply, or "" when that field is absent.

    Raises:
        requests.HTTPError: when the server answers with an error status.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,  # one JSON reply instead of a stream of chunks
        "options": {
            "temperature": temperature,
        },
    }
    if response_format is not None:
        payload["format"] = response_format

    r = requests.post(f"{OLLAMA_HOST}/api/generate", json=payload, timeout=timeout_s)
    r.raise_for_status()
    return r.json().get("response", "")

print("Ollama generate helper ready.")


Ollama generate helper ready.


In [37]:
# Sanity Check: Quick smoke test with a tiny prompt

# Minimal round trip through the model; only the first 300 characters of the reply are shown.
smoke = ollama_generate('Return JSON: {"ok": true}', model=LLM_MODEL, temperature=0.0, timeout_s=120)
smoke_preview = smoke[:300]
if len(smoke) > 300:
    smoke_preview += "..."
print("Raw model output:\n", smoke_preview)


Raw model output:
 Here is the JSON:

```json
{
  "ok": true
}
```


In [38]:
# Robust JSON extraction for judge output

def extract_json_object(text: str):
    """
    Return the first parseable JSON object (as a dict) found in the model output.

    The model may wrap the JSON in prose or Markdown code fences, or keep writing
    after the closing brace. Decoding is attempted at each "{" in turn, so text
    after the object, including further braces, does not break parsing.

    Args:
        text: Raw model output.

    Returns:
        The decoded JSON object as a dict.

    Raises:
        json.JSONDecodeError: if the text contains "{" but no position decodes to
            a valid object; the error from the first "{" is re-raised.
        ValueError: if the text contains no "{" at all.
    """
    text = text.strip()

    # Fast path: the whole output is one JSON document.
    try:
        whole = json.loads(text)
    except ValueError:
        whole = None
    # Only a JSON object is usable by callers; a bare list, number or string is not.
    if isinstance(whole, dict):
        return whole

    decoder = json.JSONDecoder()
    first_error = None
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the first complete JSON value, so
            # trailing prose, fences or extra objects are ignored.
            candidate, _consumed = decoder.raw_decode(text[start:])
        except json.JSONDecodeError as exc:
            if first_error is None:
                first_error = exc
        else:
            if isinstance(candidate, dict):
                return candidate
        start = text.find("{", start + 1)

    if first_error is not None:
        # Surface the most informative failure: the one at the first brace.
        raise first_error
    raise ValueError("No JSON object found in model output.")

def judge_one(example: dict, model: str, temperature: float = 0.0):
    """
    Judge one example with the LLM and return a flat result record.

    Args:
        example: Row with "question", "context" and "answer" keys; "example_id"
            and "label" are copied into the result when present.
        model: Ollama model tag used as the judge.
        temperature: Sampling temperature for the judge call.

    Returns:
        A dict with the judge's label, confidence, evidence and notes, the raw
        model output, the example id and the human label.

    Raises:
        KeyError: if a required example key is missing.
        ValueError: if the model output has no parseable JSON object, the label is
            not one of the four allowed labels, or the confidence is not a number
            within [0.0, 1.0].
    """
    prompt = build_judge_prompt(example["question"], example["context"], example["answer"])
    raw = ollama_generate(prompt, model=model, temperature=temperature, timeout_s=180)

    parsed = extract_json_object(raw)
    if not isinstance(parsed, dict):
        # Valid JSON that is not an object (e.g. a list) has no fields to read.
        raise ValueError(f"Judge output is not a JSON object: {type(parsed)}")

    # light validation
    label = parsed.get("label")
    conf = parsed.get("confidence")
    if label not in ["supported", "partially_supported", "unsupported", "contradicted"]:
        raise ValueError(f"Unexpected label: {label}")
    # bool is a subclass of int, so True/False would otherwise pass as 1.0/0.0.
    if isinstance(conf, bool) or not isinstance(conf, (int, float)):
        raise ValueError(f"Invalid confidence type: {type(conf)}")
    # The rubric asks for 0.0-1.0; this comparison also rejects NaN.
    if not 0.0 <= conf <= 1.0:
        raise ValueError(f"Confidence out of range [0.0, 1.0]: {conf}")

    result = {
        "example_id": example.get("example_id"),
        "judge_model": model,
        "judge_label": label,
        "judge_confidence": float(conf),
        "judge_evidence": parsed.get("evidence", ""),
        "judge_notes": parsed.get("notes", ""),
        "raw_judge_output": raw,
        "human_label": example.get("label", None),
    }
    return result

print("Judge parsing helpers ready.")


Judge parsing helpers ready.


In [39]:
# Run judge on 1 example and print the result

# Judge the first example end to end and show the parsed fields.
one = examples[0]
res = judge_one(one, model=LLM_MODEL, temperature=0.0)

for shown_field in ("example_id", "judge_label", "judge_confidence"):
    print(f"{shown_field}:", res[shown_field])

# Long evidence quotes are cut to 200 characters for display.
evidence_text = res["judge_evidence"]
evidence_suffix = "..." if len(evidence_text) > 200 else ""
print("judge_evidence:", evidence_text[:200] + evidence_suffix)
print("judge_notes:", res["judge_notes"])
print("human_label:", res["human_label"])

allowed_labels = ["supported", "partially_supported", "unsupported", "contradicted"]
assert res["judge_label"] in allowed_labels
assert 0.0 <= res["judge_confidence"] <= 1.0
print("Single-example judging works.")


example_id: 0
judge_label: supported
judge_confidence: 1.0
judge_evidence: The Anne Frank House has revealed that Anne Frank and her older sister, Margot, likely died at least a month earlier than previously believed.
judge_notes: 
human_label: {'evident_conflict': 0, 'baseless_info': 0}
Single-example judging works.


In [27]:
# Batch judge a small set of examples and save results

OUT_DIR = Path("..") / "results" / "runs" / "rag_judge_faithfulness"
OUT_DIR.mkdir(parents=True, exist_ok=True)

JUDGE_OUT_PATH = OUT_DIR / "phase2_judge_outputs_sample.jsonl"

batch_results = []
# Failed examples are kept (id + error text) so the failure rate is visible in
# the summary and the rows can be inspected, instead of only scrolling past.
batch_failures = []
for ex in examples:
    try:
        r = judge_one(ex, model=LLM_MODEL, temperature=0.0)
    except Exception as e:
        # Best effort: one bad judge reply must not abort the whole batch.
        batch_failures.append({"example_id": ex.get("example_id"), "error": str(e)})
        print(f"FAIL | {ex.get('example_id')} | error={e}")
    else:
        batch_results.append(r)
        print(
            f"OK | {r['example_id']} | "
            f"judge={r['judge_label']} | "
            f"human={r['human_label']} | "
            f"conf={r['judge_confidence']:.2f}"
        )

# Only successfully judged rows are written; failures stay in batch_failures.
with JUDGE_OUT_PATH.open("w", encoding="utf-8") as f:
    for r in batch_results:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

print(f"\nSaved judge outputs: {JUDGE_OUT_PATH}")
print("Rows saved:", len(batch_results))
print(f"Judged OK: {len(batch_results)} / {len(examples)} | failed: {len(batch_failures)}")


OK | 0 | judge=supported | human={'evident_conflict': 0, 'baseless_info': 0} | conf=1.00
OK | 1 | judge=partially_supported | human={'evident_conflict': 0, 'baseless_info': 0} | conf=0.80
OK | 2 | judge=supported | human={'evident_conflict': 1, 'baseless_info': 1} | conf=1.00
FAIL | 3 | error=Expecting ',' delimiter: line 4 column 18 (char 74)
FAIL | 4 | error=Expecting ',' delimiter: line 4 column 17 (char 73)
FAIL | 5 | error=Expecting ',' delimiter: line 4 column 17 (char 73)
OK | 6 | judge=supported | human={'evident_conflict': 0, 'baseless_info': 0} | conf=1.00
OK | 7 | judge=supported | human={'evident_conflict': 0, 'baseless_info': 0} | conf=1.00
OK | 8 | judge=supported | human={'evident_conflict': 0, 'baseless_info': 0} | conf=0.90
FAIL | 9 | error=No JSON object found in model output.

Saved judge outputs: ../results/runs/rag_judge_faithfulness/phase2_judge_outputs_sample.jsonl
Rows saved: 6


In [25]:
# Sanity Check: Read back judge results and summarize label counts

def count_labels(rows):
    """
    Tally how many rows carry each judge label.

    Rows without a "judge_label" key are counted under "missing". Keys appear in
    the result in the order the labels are first seen.
    """
    seen_labels = [row.get("judge_label", "missing") for row in rows]
    tally = {}
    for name in seen_labels:
        tally.setdefault(name, 0)
        tally[name] += 1
    return tally

# Re-read the saved file to confirm it round-trips, then summarize the labels.
rows_back = read_jsonl(JUDGE_OUT_PATH, limit=None)
n_rows_back = len(rows_back)
print("Read back rows:", n_rows_back)

counts = count_labels(rows_back)
print("Label counts:", counts)

# An empty file means every judge call failed upstream.
assert n_rows_back > 0, "No judge outputs saved. Check Ollama and prompts."
print("sample run looks good.")


Read back rows: 6
Label counts: {'supported': 5, 'partially_supported': 1}
sample run looks good.
