# FACTR — Claims Extraction + Embeddings
**Version:** v2025-09-07_1.0  
**Purpose:** Read UTTERANCES.parquet → extract claims (OpenAI) → write CLAIMS_raw.jsonl → compute embeddings (stub; saved to CLAIMS_embeddings.npy).


In [None]:
# Config
# OpenAI model identifiers: one for chat-based extraction, one for embeddings.
MODEL_EMB = "text-embedding-3-small"
MODEL_CHAT = "gpt-4o-mini"

# Optional external prompt file; when it does not exist the notebook
# falls back to an inline prompt.
PROMPT_PATH = "claim_extraction_prompt.txt"

# Number of utterances grouped into a single extraction request.
BATCH = 25


In [None]:
import os, json, pandas as pd
from openai import OpenAI

# Fail fast when the upstream step has not produced its output yet.
assert os.path.exists("UTTERANCES.parquet"), "Run ASR+Diarize first."
df = pd.read_parquet("UTTERANCES.parquet")
print("Utterances:", len(df))

# Load prompt: prefer the external file, otherwise use the inline default.
if os.path.exists(PROMPT_PATH):
    # A context manager closes the handle right away instead of leaving
    # it open until garbage collection.
    with open(PROMPT_PATH, "r", encoding="utf-8") as prompt_file:
        prompt_text = prompt_file.read()
else:
    prompt_text = (
        "Extract theological claims as JSON list with fields: "
        "claim_text, type, topic, stance, confidence (0-1). If none, return []."
    )

# OpenAI: the key must come from the environment; stop the cell cleanly
# when it is missing.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise SystemExit("OPENAI_API_KEY not set. Add it to Colab/Env and rerun.")
client = OpenAI(api_key=api_key)

def extract_claims_batch(texts):
    """Ask the chat model to extract claims from one batch of utterances.

    Args:
        texts: Sequence of utterance strings. They are joined with blank
            lines and sent as a single user message after the system prompt.

    Returns:
        The JSON list parsed from the model's reply. Returns ``[]`` when
        ``texts`` is empty, when the reply is missing or not valid JSON, or
        when the reply parses to something other than a list.
    """
    if not texts:
        # Nothing to extract; skip the API round-trip entirely.
        return []

    resp = client.chat.completions.create(
        model=MODEL_CHAT,
        messages=[{"role":"system","content":prompt_text},
                  {"role":"user","content":"\n\n".join(texts)}],
        temperature=0.2,
    )
    try:
        raw = (resp.choices[0].message.content or "").strip()
        if raw.startswith("```"):
            # Chat models often wrap JSON in a Markdown code fence
            # (```json ... ```). Drop the opening fence line and the
            # closing fence so the payload parses instead of being
            # silently discarded.
            raw = raw.partition("\n")[2].rsplit("```", 1)[0]
        data = json.loads(raw)
    except (IndexError, AttributeError, TypeError, ValueError):
        # Empty choices, missing content, or malformed JSON: treat as
        # "no claims" (json.JSONDecodeError is a ValueError).
        return []
    return data if isinstance(data, list) else []

# Walk the utterances in fixed-size batches, extract claims for each batch,
# and collect one JSON line per claim.
out_lines = []
for i in range(0, len(df), BATCH):
    batch = df.iloc[i:i+BATCH]
    texts = [f"{r.speaker}: {r.text}" for r in batch.itertuples()]
    # Positional (iloc) offsets of the first and last utterance in the batch.
    utterance_range = [int(i), int(i+len(batch)-1)]
    for c in extract_claims_batch(texts):
        if not isinstance(c, dict):
            # The model may return bare strings or numbers inside the list;
            # they have no fields to read, so skip them.
            continue
        try:
            confidence = float(c.get("confidence", 0))
        except (TypeError, ValueError):
            # null or non-numeric confidence (e.g. "high"): fall back to 0
            # rather than aborting the whole run.
            confidence = 0.0
        out_lines.append(json.dumps({
            "utterance_range": utterance_range,
            # Coerce to str so a null or non-string claim_text cannot break
            # the slice; the text is capped at 300 characters.
            "claim_text": str(c.get("claim_text") or "")[:300],
            "type": c.get("type","other"),
            "topic": c.get("topic","other"),
            "stance": c.get("stance","neutral"),
            "confidence": confidence,
        }, ensure_ascii=False))

with open("CLAIMS_raw.jsonl","w",encoding="utf-8") as f:
    f.write("\n".join(out_lines))
print("✅ Wrote CLAIMS_raw.jsonl:", len(out_lines), "items")


In [None]:
# Embeddings stub (plug FAISS/Chroma later)
from openai import OpenAI
import json, numpy as np, os

assert os.path.exists("CLAIMS_raw.jsonl"), "Run extraction first."
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise SystemExit("OPENAI_API_KEY not set.")
client = OpenAI(api_key=api_key)

# Iterate the file object rather than read().splitlines(): str.splitlines()
# also breaks on U+2028/U+2029/U+0085, which json.dumps(ensure_ascii=False)
# writes unescaped, so a claim containing one would be split mid-record.
with open("CLAIMS_raw.jsonl","r",encoding="utf-8") as f:
    records = [json.loads(x) for x in f if x.strip()]
texts = [r["claim_text"] for r in records]
print("Claims:", len(texts))

# The embeddings endpoint accepts a list of inputs, so send the claims in
# chunks instead of issuing one request per claim.
EMB_BATCH = 256
vecs = []
for start in range(0, len(texts), EMB_BATCH):
    resp = client.embeddings.create(model=MODEL_EMB, input=texts[start:start + EMB_BATCH])
    # Each result carries the index of its input; sort on it so the rows
    # stay aligned with `texts`.
    vecs.extend(item.embedding for item in sorted(resp.data, key=lambda item: item.index))
# Keep the array 2-D even when there are no claims, so downstream shape
# checks do not see a 1-D empty array.
vecs = np.array(vecs, dtype="float32") if vecs else np.empty((0, 0), dtype="float32")
print("Embeddings shape:", vecs.shape)
np.save("CLAIMS_embeddings.npy", vecs)
print("✅ Saved embeddings → CLAIMS_embeddings.npy")


In [None]:
# Smoke test
import os, json, numpy as np
# Verify that both artifacts exist and that there is exactly one embedding
# row per non-blank claim line.
assert os.path.exists("CLAIMS_raw.jsonl"), "Missing CLAIMS_raw.jsonl"
# Iterate the file (newline-delimited) inside a context manager:
# str.splitlines() would also split on U+2028/U+2029/U+0085 inside a claim
# and inflate the count, and the bare open() leaked the handle.
with open("CLAIMS_raw.jsonl","r",encoding="utf-8") as f:
    lines = [x.rstrip("\n") for x in f if x.strip()]
assert len(lines) > 0, "No claims extracted"
assert os.path.exists("CLAIMS_embeddings.npy"), "Missing embeddings file"
arr = np.load("CLAIMS_embeddings.npy")
assert arr.ndim == 2 and arr.shape[0] == len(lines), "Embeddings size mismatch"
print("✅ Claims+Embeddings smoke test passed.")


In [None]:
# Snapshot
import json, time, os, subprocess
import sys

# Count the claim lines inside a context manager so the handle is closed.
claims_lines = 0
if os.path.exists("CLAIMS_raw.jsonl"):
    with open("CLAIMS_raw.jsonl","r",encoding="utf-8") as f:
        claims_lines = sum(1 for _ in f)

# Record a small, timestamped description of this run's outputs and
# environment.
snap = {
  "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
  "claims_lines": claims_lines,
  "emb_file": os.path.exists("CLAIMS_embeddings.npy"),
  # Run pip through the current interpreter so the listing reflects this
  # kernel's environment, not whichever `pip` is first on PATH. Only the
  # first 150 entries are kept.
  "pip_freeze": subprocess.check_output([sys.executable, "-m", "pip", "freeze"], text=True).splitlines()[:150],
}
os.makedirs("snapshots", exist_ok=True)
import time as _t
# Epoch seconds in the file name keep successive snapshots distinct.
p = f"snapshots/CLAIMS_EMB_SNAPSHOT_{int(_t.time())}.json"
with open(p,"w",encoding="utf-8") as f:
    json.dump(snap,f,indent=2)
print("📸 Saved:", p)
