# GLACIS Judge Pipeline Demo

End-to-end: **attested QA generation** &rarr; **LLM judge review** &rarr; **human edit** &rarr; **re-judge** &rarr; **attestation chain**

Every LLM call goes through `attested_openai()` &mdash; the control pipeline (staged input/output PII scanning, jailbreak detection), evidence hashing, signing, and storage all happen automatically.

```bash
pip install glacis openai anthropic
```

Set `OPENAI_API_KEY` and `ANTHROPIC_API_KEY` in your environment or in `tests/.env`.

In [None]:
import json, os, sys
from pathlib import Path

# Treat the parent of the current working directory as the repository root and
# put it on sys.path so that imports resolved relative to it can be found.
REPO_ROOT = Path(".").resolve().parent
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

# Load KEY=VALUE pairs from tests/.env (if present). Values already set in the
# process environment win, because setdefault never overwrites them.
env_path = REPO_ROOT / "tests" / ".env"
if env_path.exists():
    for line in env_path.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        # Skip blanks, comments, and anything that is not an assignment.
        if not line or line.startswith("#") or "=" not in line:
            continue
        # Accept the shell-style "export KEY=VALUE" form as well.
        if line.startswith("export "):
            line = line[len("export "):]
        k, v = line.split("=", 1)
        v = v.strip()
        # Drop one matching pair of surrounding quotes so KEY="value" does not
        # end up with literal quote characters inside the secret.
        if len(v) >= 2 and v[0] == v[-1] and v[0] in "\"'":
            v = v[1:-1]
        os.environ.setdefault(k.strip(), v)

# Both keys are required; a missing one raises KeyError here rather than
# failing later inside an API call.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
ANTHROPIC_API_KEY = os.environ["ANTHROPIC_API_KEY"]
SIGNING_SEED = b"demo-pipeline-seed-0000000000000"  # 32 bytes

from glacis.integrations.openai import attested_openai, get_last_receipt
from glacis import Glacis
from glacis.config import load_config
from glacis.storage import create_storage
from glacis.judges import JudgeRunner
from judges.qa_explain_judge import OpenAIJudge, AnthropicJudge

# Parse the pipeline settings that the components below are built from.
config = load_config("glacis.yaml")

# OpenAI client wrapped so that controls + attestation are built in.
client = attested_openai(
    config="glacis.yaml",
    signing_seed=SIGNING_SEED,
    openai_api_key=OPENAI_API_KEY,
)

# Offline Glacis client for chain operations (decompose, review, sampling).
storage_cfg = config.evidence_storage
glacis = Glacis(
    mode="offline",
    signing_seed=SIGNING_SEED,
    storage_backend=storage_cfg.backend,
    storage_path=Path(storage_cfg.path),
    sampling_config=config.sampling,
)

# Evidence store that persists the full input/output payloads.
evidence_store = create_storage(
    backend=storage_cfg.backend,
    path=Path(storage_cfg.path),
)

# Judge runner over both judges; thresholds come from glacis.yaml.
judge_panel = [
    OpenAIJudge(api_key=OPENAI_API_KEY),
    AnthropicJudge(api_key=ANTHROPIC_API_KEY),
]
judge_runner = JudgeRunner(judges=judge_panel, config=config.judges)

# Echo the effective settings so the run is easy to audit.
sampling_cfg = config.sampling
judge_cfg = config.judges
print(f"Config:  {config.policy.id} | storage: {storage_cfg.backend}")
print(f"Sampling: L1={sampling_cfg.l1_rate}, L2={sampling_cfg.l2_rate}")
print(f"Judges:  uphold >= {judge_cfg.uphold_threshold}, borderline >= {judge_cfg.borderline_threshold}")

## 1. Attested QA Generation

Swap `OpenAI()` for `attested_openai()` &mdash; every LLM call is automatically attested with the staged control pipeline (input PII scanning + jailbreak detection, output controls), evidence hashing, and local signing. Zero changes to your prompt logic.

In [None]:
import re

# Sample policy text for the demo. It is embedded in the generation and
# correction prompts and passed to the judges as their reference. The trailing
# .strip() removes the newlines that follow/precede the triple quotes.
SOURCE_DOCUMENT = """
ACME Health \u2014 Patient Data Handling Policy (v2.1)

1. All patient records must be encrypted at rest using AES-256.
2. Access to PHI requires two-factor authentication (2FA).
3. Data retention: patient records are kept for 7 years after last visit.
4. De-identification follows the HIPAA Safe Harbor method (18 identifiers removed).
5. Breach notification must occur within 72 hours of discovery.
6. Business associates must sign a BAA before accessing any PHI.
7. Minimum necessary standard: staff access only the PHI needed for their role.
8. Audit logs of all PHI access are retained for 6 years.
""".strip()

def _strip_code_fence(raw):
    """Return *raw* stripped of whitespace and of one surrounding Markdown fence."""
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line, which may carry a language tag ("```json").
        text = text.split("\n", 1)[1] if "\n" in text else text[3:]
        if text.endswith("```"):
            text = text[:-3]
        text = text.strip()
    return text


def _decode_first(raw, opener):
    """Decode the first JSON value in *raw* that starts with *opener*.

    Each occurrence of *opener* is tried in order with ``raw_decode``, which
    stops at the end of the value. Unlike a greedy first-opener-to-last-closer
    match, stray brackets or braces in surrounding prose do not break decoding.
    If no occurrence decodes, the whole (fence-stripped) text is handed to
    ``json.loads`` so that failures still surface as ``json.JSONDecodeError``.
    """
    text = _strip_code_fence(raw)
    decoder = json.JSONDecoder()
    start = text.find(opener)
    while start != -1:
        try:
            value, _end = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            # Not a JSON value here (e.g. "[note]" in prose); try the next opener.
            start = text.find(opener, start + 1)
        else:
            return value
    return json.loads(text)


def parse_json_array(raw):
    """Extract a JSON array from an LLM response.

    Handles Markdown code fences and prose before or after the array.

    Args:
        raw: The raw response text.

    Returns:
        The decoded value of the first decodable ``[...]`` span. If the text
        contains no decodable array, the result of decoding the whole text.

    Raises:
        json.JSONDecodeError: If nothing in the response decodes as JSON.
    """
    return _decode_first(raw, "[")


def parse_json_object(raw):
    """Extract a JSON object from an LLM response.

    Handles Markdown code fences and prose before or after the object.

    Args:
        raw: The raw response text.

    Returns:
        The decoded value of the first decodable ``{...}`` span. If the text
        contains no decodable object, the result of decoding the whole text.

    Raises:
        json.JSONDecodeError: If nothing in the response decodes as JSON.
    """
    return _decode_first(raw, "{")

# Build the prompt first, then make a single request through the attested
# client, so PII scanning + attestation happen automatically.
generation_prompt = f"""Generate exactly 5 question-answer pairs from this document.
Each pair tests factual knowledge. Include one pair with an intentionally wrong answer.
Return JSON array: [{{"question": "...", "answer": "..."}}]

Document:
{SOURCE_DOCUMENT}"""

response = client.chat.completions.create(
    model="gpt-4o-mini",
    temperature=0.3,
    messages=[{"role": "user", "content": generation_prompt}],
)

# Fetch the receipt for the call above and confirm that it verifies.
receipt = get_last_receipt()
print(f"Attested: {receipt.id}")
print(f"  hash:      {receipt.evidence_hash[:32]}...")
print(f"  signature: {receipt.signature[:32]}...")
verification = glacis.verify(receipt)
assert verification.valid

# Decode the model's reply into a list of QA dicts and display them.
reply_text = response.choices[0].message.content
qa_pairs = parse_json_array(reply_text)

print(f"\n{len(qa_pairs)} QA pairs:\n")
for idx, pair in enumerate(qa_pairs):
    print(f"  [{idx}] Q: {pair['question']}")
    print(f"      A: {pair['answer']}\n")

## 2. Decompose, Sample & Judge

Break the batch into individual attestations (shared `operation_id`), run deterministic L1 sampling, then score each pair with GPT-4o-mini + Claude Haiku judges.

In [None]:
# Split the batch receipt into one attestation per QA pair.
qa_attestations = glacis.decompose(
    receipt,
    qa_pairs,
    source_data={"source_document": SOURCE_DOCUMENT},
    operation_type="qa_pair",
)

# L1 sampling — deterministic, auditor-reproducible, rate from glacis.yaml
for pair_att in qa_attestations:
    decision = glacis.should_review(pair_att)
    assert decision.level == "L1"  # l1_rate=1.0 in config

print(f"{len(qa_attestations)} pairs decomposed, all sampled for L1 review\n")

# Judge every pair and attest each review. Review sequence numbers continue
# right after the last decomposed pair.
reviews = []
seq = qa_attestations[-1].operation_sequence + 1
service_id = config.attestation.service_id

for offset, (pair, pair_att) in enumerate(zip(qa_pairs, qa_attestations)):
    verdict = judge_runner.run(item=pair, reference=SOURCE_DOCUMENT)

    review_input = {"qa_pair": pair, "reference": SOURCE_DOCUMENT}
    review_output = verdict.model_dump()

    review_att = glacis.attest(
        service_id=service_id,
        operation_type="qa_review",
        input=review_input,
        output=review_output,
        operation_id=receipt.operation_id,
        operation_sequence=seq + offset,
    )

    # Persist the full review payload (judge scores, explanations, recommendation).
    evidence_store.store_evidence(
        attestation_id=review_att.id,
        attestation_hash=review_att.evidence_hash,
        mode="offline",
        service_id=service_id,
        operation_type="qa_review",
        timestamp=review_att.timestamp or 0,
        input_data=review_input,
        output_data=review_output,
    )

    reviews.append((pair, verdict, pair_att, review_att))
    print(f"  [{offset}] {verdict.final_score:.1f}/{verdict.max_score:.0f}  {verdict.recommendation:>10}   {pair['question'][:55]}")

# Tally the outcomes; anything that is neither upheld nor escalated is
# reported as borderline.
uphold = len([1 for _, res, _, _ in reviews if res.recommendation == "uphold"])
escalate = len([1 for _, res, _, _ in reviews if res.recommendation == "escalate"])
borderline = len(reviews) - uphold - escalate
print(f"\n  {uphold} upheld, {borderline} borderline, {escalate} escalated")

## 3. Human Edit & Re-Judge

Find the lowest-scoring pair, correct it, re-attest with `supersedes` (revision chain), and re-judge to verify the fix.

In [None]:
# Locate the review with the lowest final score (first one wins on ties).
worst_i, _worst_entry = min(enumerate(reviews), key=lambda entry: entry[1][1].final_score)
worst_qa, worst_result, worst_att, _ = reviews[worst_i]

print(f"Flagged [{worst_i}]: {worst_result.final_score}/{worst_result.max_score} ({worst_result.recommendation})")
print(f"  Q: {worst_qa['question']}")
print(f"  A: {worst_qa['answer']}")

# Ask the model for a correction; the call goes through the attested client,
# so it is attested automatically as well.
correction_prompt = f"""Correct this answer using ONLY the source document.

Question: {worst_qa['question']}
Wrong answer: {worst_qa['answer']}
Source: {SOURCE_DOCUMENT}

Return JSON: {{"corrected_answer": "..."}}"""

fix = client.chat.completions.create(
    model="gpt-4o-mini",
    temperature=0.1,
    messages=[{"role": "user", "content": correction_prompt}],
)

fix_payload = parse_json_object(fix.choices[0].message.content)
corrected = fix_payload["corrected_answer"]
edited_qa = {"question": worst_qa["question"], "answer": corrected}
print(f"  Corrected: {corrected}")

# Re-attest the edited pair; `supersedes` links it to the attestation it
# replaces (revision chain). Sequence numbers continue after the last review.
last_review_att = reviews[-1][3]
next_seq = last_review_att.operation_sequence + 1
service_id = config.attestation.service_id

edited_input = {"source_document": SOURCE_DOCUMENT}
edited_att = glacis.attest(
    service_id=service_id,
    operation_type="qa_pair",
    input=edited_input,
    output=edited_qa,
    operation_id=receipt.operation_id,
    operation_sequence=next_seq,
    supersedes=worst_att.id,
)

# Persist the evidence for the edited pair.
evidence_store.store_evidence(
    attestation_id=edited_att.id,
    attestation_hash=edited_att.evidence_hash,
    mode="offline",
    service_id=service_id,
    operation_type="qa_pair",
    timestamp=edited_att.timestamp or 0,
    input_data=edited_input,
    output_data=edited_qa,
)

# Run the judges again on the corrected pair and attest that review too.
re_result = judge_runner.run(item=edited_qa, reference=SOURCE_DOCUMENT)

re_review_input = {"qa_pair": edited_qa, "reference": SOURCE_DOCUMENT}
re_review_output = re_result.model_dump()

re_review_att = glacis.attest(
    service_id=service_id,
    operation_type="qa_review",
    input=re_review_input,
    output=re_review_output,
    operation_id=receipt.operation_id,
    operation_sequence=next_seq + 1,
)

# Persist the evidence for the re-review.
evidence_store.store_evidence(
    attestation_id=re_review_att.id,
    attestation_hash=re_review_att.evidence_hash,
    mode="offline",
    service_id=service_id,
    operation_type="qa_review",
    timestamp=re_review_att.timestamp or 0,
    input_data=re_review_input,
    output_data=re_review_output,
)

# Report the before/after scores and the revision link.
print(f"\nRe-judge: {re_result.final_score}/{re_result.max_score} ({re_result.recommendation})")
print(f"Improvement: {worst_result.final_score} -> {re_result.final_score}")
print(f"Chain: {edited_att.id[:22]}... --supersedes--> {worst_att.id[:22]}...")

## 4. Attestation Chain

Every operation &mdash; generation, decomposition, review, edit, re-review &mdash; is linked in one attestation chain.

In [None]:
# Gather every attestation produced above and order them by sequence number.
review_atts = [entry[3] for entry in reviews]
all_atts = sorted(
    [receipt] + qa_attestations + review_atts + [edited_att, re_review_att],
    key=lambda a: a.operation_sequence,
)

# Print the chain as a fixed-width table.
print(f"Operation {receipt.operation_id}\n")
print(f"{'Seq':<5} {'Type':<14} {'ID':<26} {'Supersedes'}")
print("-" * 75)
for row in all_atts:
    if row.supersedes:
        sup = row.supersedes[:22] + "..."
    else:
        sup = ""
    print(f"{row.operation_sequence:<5} {row.operation_type:<14} {row.id[:24]:<26} {sup}")

print(f"\n{len(all_atts)} attestations, 1 operation\n")

# Evidence storage summary (JSONL format — one line per record); blank lines
# are not counted.
evidence_base = Path(config.evidence_storage.path)
for name in ("receipts.jsonl", "evidence.jsonl"):
    p = evidence_base / name
    if not p.exists():
        continue
    record_count = sum(1 for entry in p.read_text().splitlines() if entry.strip())
    print(f"{name}: {record_count} records")

# Release the resources held by the store, the client, and the judge runner.
evidence_store.close()
glacis.close()
judge_runner.close()