# FACTR_06 — Verify claims against Ground-Truth KB

For each extracted **claim**, retrieve KB passages and judge **supports / contradicts / insufficient**, with evidence.

**Inputs**: `CLAIMS_raw.jsonl` + `KB.faiss`/`KB.index.json`/`KB_passages.jsonl`

**Output**: `VERIFICATION.jsonl`


In [1]:
# Mount Google Drive at /content/drive (Colab only); the data paths used by
# the later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# --- Setup & load artefacts
import os, json, faiss, numpy as np

# NOTE(review): the Drive folder is spelled "FATCR" while the notebook title
# says "FACTR" — confirm this matches the real folder name before changing it.
ROOT = "/content/drive/MyDrive/FATCR"
DATA_DIR = os.path.join(ROOT, "data", "processed")


def _read_jsonl(path):
    """Parse a JSON Lines file into a list of objects.

    Blank / whitespace-only lines are skipped. The file is opened inside a
    ``with`` block so the handle is closed as soon as parsing finishes.
    """
    with open(path, "r", encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


# Claims side
CLAIMS = os.path.join(DATA_DIR, "CLAIMS_raw.jsonl")
claims = _read_jsonl(CLAIMS)

# KB side
KB_FAISS = os.path.join(DATA_DIR, "KB.faiss")
KB_MAP   = os.path.join(DATA_DIR, "KB.index.json")
KB_PASS  = os.path.join(DATA_DIR, "KB_passages.jsonl")

index_kb = faiss.read_index(KB_FAISS)
# kb_map is later queried with str(faiss_id) keys to find a position in kb_rows.
with open(KB_MAP, "r", encoding="utf-8") as fh:
    kb_map = json.load(fh)
kb_rows = _read_jsonl(KB_PASS)

print("Loaded:", len(claims), "claims;", index_kb.ntotal, "KB passages.")


## Embedder (copy of 05 logic)

In [None]:
import numpy as np, os, json
try:
    from google.colab import userdata  # type: ignore
    openai_api_key = userdata.get("OPENAI_API_KEY")
except Exception:
    # Deliberate best-effort fallback: outside Colab the import fails, and the
    # secret lookup itself may raise; in either case use the environment.
    openai_api_key = os.getenv("OPENAI_API_KEY")

# Metadata written next to the claim embeddings; queries must be embedded the
# same way (same model, same normalisation) to be comparable.
EMBED_META = os.path.join(DATA_DIR, "CLAIMS_embeddings.meta.json")
with open(EMBED_META, "r", encoding="utf-8") as fh:
    embed_meta = json.load(fh)

# Chain with `or` so a key that is present but null/empty still falls through
# to the default instead of leaving model_name as None.
model_name = (
    embed_meta.get("model_name")
    or embed_meta.get("model")
    or "text-embedding-3-small"
)
was_normalized = bool(embed_meta.get("normalized", True))
metric = (embed_meta.get("faiss_metric") or embed_meta.get("metric") or "ip").lower()

def _l2_normalize(vecs: np.ndarray) -> np.ndarray:
    """Scale each row of a 2-D array to (approximately) unit L2 length.

    A small epsilon (1e-8) is added to every row norm so an all-zero row
    divides cleanly and stays zero instead of producing NaNs.
    """
    row_norms = np.linalg.norm(vecs, axis=1, keepdims=True)
    return vecs / (row_norms + 1e-8)

def _is_openai_model(name: str) -> bool:
    """Return True when the model name begins with the "text-embedding-" prefix."""
    openai_prefix = "text-embedding-"
    return name.startswith(openai_prefix)

# Pick the embedding backend: OpenAI only when the model name looks like an
# OpenAI embedding model AND an API key was found; otherwise SentenceTransformers.
use_openai = _is_openai_model(model_name) and bool(openai_api_key)
print("Loading embedding model:", model_name)
print("Using provider:", "OpenAI" if use_openai else "SentenceTransformers")

if not use_openai:
    # Reaching this branch with an OpenAI model name means the key is missing.
    if _is_openai_model(model_name):
        print("Warning: OpenAI model requested but no key; falling back to MiniLM for testing only.")
        model_name = "sentence-transformers/all-MiniLM-L6-v2"
    from sentence_transformers import SentenceTransformer
    st_model = SentenceTransformer(model_name)

    def embed_queries(texts):
        """Encode texts with the local SentenceTransformers model as float32.

        Normalisation is delegated to the encoder and follows `was_normalized`.
        """
        encoded = st_model.encode(
            texts,
            convert_to_numpy=True,
            normalize_embeddings=was_normalized,
        )
        return encoded.astype("float32")
else:
    from openai import OpenAI
    client = OpenAI(api_key=openai_api_key)

    def embed_queries(texts):
        """Embed texts through the OpenAI embeddings endpoint as float32.

        Requests are sent in slices of 96 texts; rows are L2-normalised
        afterwards when `was_normalized` is set.
        """
        embeddings = []
        for start in range(0, len(texts), 96):
            resp = client.embeddings.create(model=model_name, input=texts[start:start+96])
            embeddings.extend(item.embedding for item in resp.data)
        vecs = np.asarray(embeddings, dtype="float32")
        if was_normalized:
            return _l2_normalize(vecs)
        return vecs


## Retrieval

In [None]:
def kb_search(text, k=20, prefilter=None):
    """Retrieve up to `k` KB passages for `text` from the FAISS index.

    Args:
        text: Query string; embedded with `embed_queries`.
        k: Number of neighbours requested from `index_kb`.
        prefilter: Optional minimum score. Hits whose score is not
            `>= prefilter` are dropped; `None` keeps every hit.

    Returns:
        A list of dicts in the order FAISS returned them, each holding
        `kb_id` (the FAISS id), `score` (the raw FAISS score) and every
        field of the matching `kb_rows` entry.
        NOTE(review): row fields are merged last, so a row that itself has a
        "kb_id" or "score" key overrides these — confirm the KB schema.
    """
    v = embed_queries([text])
    scores, ids = index_kb.search(v, k)
    out = []
    for s, fid in zip(scores[0], ids[0]):
        fid = int(fid)
        if fid < 0:
            # FAISS pads the result with id -1 when fewer than k neighbours
            # exist; without this guard -1 would index the LAST kb_rows entry
            # and surface it as a bogus hit.
            continue
        score = float(s)
        # Written as `not >=` so NaN scores are excluded, as before.
        if prefilter is not None and not score >= prefilter:
            continue
        # Map the FAISS id to a row position; fall back to the id itself.
        ridx = kb_map.get(str(fid), fid)
        row = kb_rows[ridx]
        out.append({"kb_id": fid, "score": score, **row})
    return out


## Option A: simple threshold verdict

In [None]:
def verify_claim_simple(claim_text, k=20, cosine_thresh=0.30):
    """Judge a claim purely by retrieval score (no NLI).

    Retrieves `k` passages via `kb_search`. If any hit scores at least
    `cosine_thresh` the verdict is "supports", with a confidence built from
    the first qualifying hit's score plus a small bonus per additional
    qualifying hit, capped at 1.0. Otherwise the verdict is "insufficient"
    with a fixed confidence of 0.2. Up to three passages are returned as
    evidence in both cases.
    """
    retrieved = kb_search(claim_text, k=k)
    above = [hit for hit in retrieved if hit["score"] >= cosine_thresh]

    if not above:
        return {"verdict":"insufficient", "confidence": 0.2, "evidence": retrieved[:3]}

    confidence = min(1.0, above[0]["score"] * 1.5 + 0.1 * (len(above)-1))
    return {"verdict":"supports", "confidence": confidence, "evidence": above[:3]}


## Option B: NLI judge (supports/contradicts/insufficient)

In [None]:
# If needed on Colab:
# !pip -q install transformers torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch, torch.nn.functional as F

# Checkpoint name for the NLI judge; tokenizer and classifier are both loaded
# from it, and the classifier is put into evaluation mode.
nli_model = "roberta-large-mnli"  # or "microsoft/deberta-large-mnli"
tok = AutoTokenizer.from_pretrained(nli_model)
nli = AutoModelForSequenceClassification.from_pretrained(nli_model)
nli.eval()

def _nli_label_indices(model):
    """Return the logit position of each NLI class for `model`.

    Positions are read from `model.config.id2label` by matching label names
    (case-insensitive) that start with "entail", "contradict" or "neutral".
    If the config is missing, incomplete or ambiguous, the roberta-large-mnli
    order [contradiction, neutral, entailment] is used.
    """
    fallback = {"contradict": 0, "neutral": 1, "entail": 2}
    config = getattr(model, "config", None)
    id2label = getattr(config, "id2label", None) or {}

    found = {}
    for idx, label in id2label.items():
        lowered = str(label).lower()
        for key in fallback:
            if lowered.startswith(key):
                found[key] = int(idx)

    # Only trust the config when all three classes map to distinct positions.
    if set(found) == set(fallback) and len(set(found.values())) == len(fallback):
        return found
    return fallback


def nli_label(premise, hypothesis):
    """Score how `premise` relates to `hypothesis` with the NLI model.

    Args:
        premise: Evidence text (first segment of the pair).
        hypothesis: Claim text (second segment of the pair).

    Returns:
        Dict with softmax probabilities under the keys "entail",
        "contradict" and "neutral".
    """
    x = tok(premise, hypothesis, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = nli(**x).logits
        probs = F.softmax(logits, dim=-1)[0].tolist()
    # Look the class order up from the loaded checkpoint instead of assuming
    # the roberta order, so swapping `nli_model` cannot silently swap labels.
    pos = _nli_label_indices(nli)
    return {
        "entail": probs[pos["entail"]],
        "contradict": probs[pos["contradict"]],
        "neutral": probs[pos["neutral"]],
    }

def verify_claim_nli(claim_text, k=12, prefilter=0.25, judge_top=5,
                     entail_thr=0.65, contra_thr=0.65):
    """Judge a claim by running NLI over its top retrieved passages.

    Retrieves `k` passages (dropping those scoring below `prefilter`), keeps
    the first `judge_top`, and scores each one as premise against the claim
    as hypothesis. Each judged passage dict gains the "entail",
    "contradict" and "neutral" probabilities.

    Returns a dict with "verdict", "confidence" and "evidence" (at most
    three passages):
      * no passages survive retrieval -> "insufficient", confidence 0.2
      * best entailment >= `entail_thr` -> "supports"
      * else best contradiction >= `contra_thr` -> "contradicts"
      * otherwise -> "insufficient", confidence 0.4

    Entailment is checked first, so a claim with both a strong entailing and
    a strong contradicting passage is reported as "supports".
    """
    candidates = kb_search(claim_text, k=k, prefilter=prefilter)[:judge_top]
    if not candidates:
        return {"verdict":"insufficient", "confidence":0.2, "evidence":[]}

    judged = []
    for passage in candidates:
        # A missing or empty "text" field is judged as an empty premise.
        passage.update(nli_label(passage.get("text") or "", claim_text))
        judged.append(passage)

    top_entail = max(row["entail"] for row in judged)
    if top_entail >= entail_thr:
        ranked = sorted(judged, key=lambda row: row["entail"], reverse=True)
        return {"verdict":"supports", "confidence": float(top_entail), "evidence": ranked[:3]}

    top_contra = max(row["contradict"] for row in judged)
    if top_contra >= contra_thr:
        ranked = sorted(judged, key=lambda row: row["contradict"], reverse=True)
        return {"verdict":"contradicts", "confidence": float(top_contra), "evidence": ranked[:3]}

    return {"verdict":"insufficient", "confidence": 0.4, "evidence": judged[:3]}


## Run across all claims + save

In [None]:
OUT = os.path.join(DATA_DIR, "VERIFICATION.jsonl")
use_nli = True  # set False to use the simple threshold


def _claim_id(record):
    """Return the claim's id from "row_id", else "id", else None.

    Only None and "" count as missing, so a legitimate id of 0 is kept
    (a plain `or` chain would discard it).
    """
    for key in ("row_id", "id"):
        value = record.get(key)
        if value is not None and value != "":
            return value
    return None


out_rows = []
for r in claims:
    txt = r.get("claim_text") or r.get("claim") or r.get("text") or ""
    if isinstance(txt, str) and not txt.strip():
        # Nothing to verify: skip retrieval (embedding backends may reject
        # empty input) and record the same shape the judges return when
        # they find no evidence.
        v = {"verdict": "insufficient", "confidence": 0.2, "evidence": []}
    else:
        v = verify_claim_nli(txt, k=12) if use_nli else verify_claim_simple(txt, k=20)
    out_rows.append({
        "claim_id": _claim_id(r),
        "claim_text": txt,
        **v
    })

# One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
with open(OUT, "w", encoding="utf-8") as f:
    for row in out_rows:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("Wrote:", OUT, "| rows:", len(out_rows))


## Quick browse + summary

In [None]:
import pandas as pd
# Reload the verification output, show the first ten rows, then print each
# verdict's share of all rows (rounded to three decimals).
dfv = pd.read_json(OUT, lines=True)
display(dfv.head(10))
print(
    "\nVerdict distribution:",
    dfv["verdict"].value_counts(normalize=True).round(3),
    sep="\n",
)
