# FACTR — Claims Extraction + Embeddings
**Version:** v2025-09-07_1.0  
**Purpose:** Read UTTERANCES.parquet → extract claims (OpenAI) → write CLAIMS_raw.jsonl → compute embeddings (stub).


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Config
BATCH = 25
MODEL_CHAT = "gpt-4o-mini"
MODEL_EMB = "text-embedding-3-small"
PROMPT_PATH = "claim_extraction_prompt.txt"  # optional; else fall back to inline prompt


In [None]:
from google.colab import userdata

api_key = userdata.get("OPENAI_API_KEY")
print("Loaded?", bool(api_key))
print("First 6 chars:", api_key[:6] if api_key else None)


Loaded? True
First 6 chars: sk-pro


In [None]:
# --- Robust claims extraction (JSON only, with fallbacks & logging) ---
import os, json, time, pandas as pd
from openai import OpenAI
from google.colab import userdata

# ---- Config (use existing vars if already defined) ----
MODEL_CHAT   = globals().get("MODEL_CHAT", "gpt-4o-mini")  # pick any chat model you have access to
BATCH        = globals().get("BATCH", 20)                  # number of utterances per request
PROMPT_PATH  = globals().get("PROMPT_PATH", "prompts/claims_prompt.txt")
UTTS_PARQUET = "/content/drive/MyDrive/FATCR/data/processed/UTTERANCES.parquet"

# ---- Load utterances ----
assert os.path.exists(UTTS_PARQUET), "Run ASR+Diarize first."
df = pd.read_parquet(UTTS_PARQUET)
print("Utterances:", len(df))

# ---- Prompt (force JSON-only) ----
# Resolve the system prompt: prefer the text stored at PROMPT_PATH, and use the
# inline JSON-only prompt when that file is missing or contains only whitespace
# (an empty system prompt would leave the model without any instructions).
prompt_text = ""
if os.path.exists(PROMPT_PATH):
    # Context manager so the file handle is closed as soon as the text is read.
    with open(PROMPT_PATH, "r", encoding="utf-8") as prompt_file:
        prompt_text = prompt_file.read().strip()
if not prompt_text:
    prompt_text = """You are a strict JSON generator.

Extract theological claims as a **valid JSON array** only.
Each array item MUST be a JSON object with the fields:
  "claim_text"  (string)
  "type"        (string: e.g., "doctrine", "ethics", "history", or "other")
  "topic"       (string, brief topic label)
  "stance"      (string: "affirm", "deny", "neutral")
  "confidence"  (number 0..1)

Rules:
- Output **JSON only**, no prose, no markdown, no preamble, no trailing text.
- If there are no claims, output [].
- Never wrap in code fences.
"""

# ---- OpenAI client (from Colab Secrets) ----
api_key = userdata.get("OPENAI_API_KEY")
if not api_key:
    raise SystemExit("OPENAI_API_KEY not set. Add it in Colab Secrets and rerun.")
client = OpenAI(api_key=api_key)

# ---- Helper: safe JSON parse with logging ----
def parse_json_or_log(raw: str, dbg_tag: str) -> list:
    """Parse a model reply that should be a JSON array; log and dump it otherwise.

    A Markdown code fence wrapped around the payload (```json ... ```) is
    removed before parsing, because chat models sometimes add one even when the
    prompt forbids it; without this a whole batch of claims would be discarded.

    Args:
        raw: Text returned by the model. ``None`` is treated as empty.
        dbg_tag: Short label embedded in the name of the debug dump file.

    Returns:
        The decoded list. ``[]`` when the reply is empty, is not valid JSON, or
        decodes to something other than a list; in the latter two cases the
        full reply is written to ``snapshots/CLAIMS_DEBUG_<tag>_<epoch>.txt``
        (relative to the current working directory).
    """
    raw = (raw or "").strip()
    if not raw:
        print("‚ö†Ô∏è Empty model response.")
        return []

    payload = raw
    if payload.startswith("```"):
        # Drop the opening fence line (including any language tag) ...
        _, _, payload = payload.partition("\n")
        payload = payload.rstrip()
        # ... and the closing fence, if there is one.
        if payload.endswith("```"):
            payload = payload[:-3]
        payload = payload.strip()

    try:
        data = json.loads(payload)
    except (ValueError, RecursionError) as e:
        # json.JSONDecodeError is a ValueError; RecursionError covers absurdly
        # deep nesting. Anything else is a programming error and should surface.
        print("‚ö†Ô∏è Parse failed:", e)
        print("   Raw head (first 300 chars):\n", raw[:300])
    else:
        if isinstance(data, list):
            return data
        print("‚ö†Ô∏è Model returned non-list JSON. Type =", type(data).__name__)

    # Save full raw to snapshots for inspection
    os.makedirs("snapshots", exist_ok=True)
    dbg_path = f"snapshots/CLAIMS_DEBUG_{dbg_tag}_{int(time.time())}.txt"
    with open(dbg_path, "w", encoding="utf-8") as f:
        f.write(raw)
    print("   Saved raw to:", dbg_path)
    return []

# ---- Call model for a batch of texts ----
def extract_claims_batch(texts):
    """
    Ask the chat model for the claims contained in one batch of utterances.

    texts: list[str]; the items are joined with blank lines into a single user
    message, and the system prompt is relied on to make the reply a JSON list.
    Returns whatever parse_json_or_log produces for the model's reply.
    """
    user_message = "\n\n".join(texts)
    conversation = [
        {"role": "system", "content": prompt_text},
        {"role": "user", "content": user_message},
    ]
    completion = client.chat.completions.create(
        model=MODEL_CHAT,
        messages=conversation,
        temperature=0.2,
    )
    reply = completion.choices[0].message.content
    # The debug tag carries the batch size so a dump can be correlated later.
    tag = f"b{len(texts)}"
    return parse_json_or_log(reply, dbg_tag=tag)

# ---- Drive the batching & write JSONL lines ----
# Extract claims batch by batch and collect one JSON line per usable claim.
out_lines = []
skipped = 0  # model items dropped because they were malformed or had no text
for i in range(0, len(df), BATCH):
    batch = df.iloc[i:i+BATCH]
    # keep speaker label (helps the model separate claims)
    texts = [f"{r.speaker}: {r.text}" for r in batch.itertuples()]
    claims = extract_claims_batch(texts)
    # Inclusive positional bounds (as used with df.iloc) of this batch.
    utt_range = [int(i), int(i + len(batch) - 1)]

    for c in claims:
        # The model's output is untrusted. Nothing is written to disk until the
        # loop finishes, so a single malformed item must not raise here and
        # throw away every claim collected so far.
        if not isinstance(c, dict):
            skipped += 1
            continue
        claim_text = c.get("claim_text")
        if not isinstance(claim_text, str) or not claim_text.strip():
            # An item without claim text carries nothing to embed or search.
            skipped += 1
            continue
        try:
            confidence = float(c.get("confidence", 0))
        except (TypeError, ValueError):
            confidence = 0.0  # null / non-numeric confidence
        out_lines.append(json.dumps({
            "utterance_range": utt_range,
            "claim_text":  claim_text[:300],
            "type":        c.get("type", "other"),
            "topic":       c.get("topic", "other"),
            "stance":      c.get("stance", "neutral"),
            "confidence":  confidence,
        }, ensure_ascii=False))

if skipped:
    print(f"Skipped {skipped} malformed or empty claim item(s).")

# ---- Save JSONL ----
OUT_PATH = "/content/drive/MyDrive/FATCR/data/processed/CLAIMS_raw.jsonl"
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)

with open(OUT_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(out_lines))

print(f"‚úÖ Wrote {OUT_PATH}:", len(out_lines), "items")


Utterances: 1242
‚úÖ Wrote /content/drive/MyDrive/FATCR/data/processed/CLAIMS_raw.jsonl: 109 items


Batching embeddings is faster, cheaper, and far less likely to hit rate limits. Drop this single cell into FACTR-04 right after you create CLAIMS_raw.jsonl (or wherever you want to embed), then run it.

In [None]:
# === Batch embeddings for CLAIMS_raw.jsonl (fast & robust) ===
import os, json, time, math, numpy as np
from datetime import datetime, timezone # Import timezone here
from openai import OpenAI
from google.colab import userdata

# ---- Config ----
MODEL_EMB   = "text-embedding-3-small"   # or "text-embedding-3-large"
BATCH_EMB   = 64                         # tune for your quota/rate limits
TRUNC_CHARS = 8000                       # hard cap per text to avoid 8192 token issues

# Resolve paths (prefer FACTR/processed; fall back to CWD)
DATA_DIR = "/content/drive/MyDrive/FATCR/data/processed"
RAW_PATH = os.path.join(DATA_DIR, "CLAIMS_raw.jsonl") if os.path.exists(DATA_DIR) else "CLAIMS_raw.jsonl"
EMB_NPY  = os.path.join(DATA_DIR, "CLAIMS_embeddings.npy") if os.path.exists(DATA_DIR) else "CLAIMS_embeddings.npy"
META_JSON= os.path.join(DATA_DIR, "CLAIMS_embeddings.meta.json") if os.path.exists(DATA_DIR) else "CLAIMS_embeddings.meta.json"

assert os.path.exists(RAW_PATH), f"Not found: {RAW_PATH}. Run the claims extraction step first."

# ---- Load OpenAI key from Colab Secrets ----
api_key = userdata.get("OPENAI_API_KEY")
assert api_key, "OPENAI_API_KEY missing in Colab Secrets."
client = OpenAI(api_key=api_key)

# ---- Load claims texts in order ----
with open(RAW_PATH, "r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

texts = [(rec.get("claim_text") or "").strip()[:TRUNC_CHARS] for rec in records]
print(f"Claims to embed: {len(texts)} | model={MODEL_EMB}")

# ---- Helper with retry/backoff ----
def embed_batch(batch_texts, max_retries=5, base_sleep=2.0):
    """Embed one batch of texts, retrying failed requests with exponential backoff.

    Args:
        batch_texts: Strings sent together in a single embeddings request.
        max_retries: Total number of attempts before giving up.
        base_sleep: Seconds to wait before the first retry; the wait doubles
            after every further failure.

    Returns:
        One embedding per element of ``resp.data``, in the order returned.

    Raises:
        RuntimeError: When every attempt failed. The last underlying error is
            attached as ``__cause__`` so the real reason is not lost.
    """
    last_err = None
    for attempt in range(max_retries):
        try:
            resp = client.embeddings.create(model=MODEL_EMB, input=batch_texts)
        except Exception as e:
            last_err = e
            failure = f"‚ö†Ô∏è  Embed call failed (attempt {attempt+1}/{max_retries}): {e}"
            if attempt + 1 >= max_retries:
                # No retry follows, so sleeping here would only delay the error.
                print(failure)
                break
            wait = base_sleep * (2 ** attempt)
            print(f"{failure} ‚Üí sleeping {wait:.1f}s")
            time.sleep(wait)
        else:
            return [d.embedding for d in resp.data]
    raise RuntimeError("Embedding failed after retries.") from last_err

# ---- Run in batches ----
# Embed every claim text in order, BATCH_EMB texts per request, collecting the
# vectors in all_vecs so that row i corresponds to texts[i].
all_vecs = []
n = len(texts)
# With n == 0 this is 0, so the loop below does not run and all_vecs stays empty.
num_batches = math.ceil(n / BATCH_EMB)

t0 = time.time()
for bi in range(num_batches):
    # Half-open slice [lo, hi); hi is clamped so the last batch may be shorter.
    lo, hi = bi*BATCH_EMB, min((bi+1)*BATCH_EMB, n)
    batch = texts[lo:hi]
    vecs  = embed_batch(batch)
    all_vecs.extend(vecs)
    # Progress line after every 5th batch and always after the final one.
    if (bi+1) % 5 == 0 or (bi+1) == num_batches:
        elapsed = time.time() - t0
        print(f"‚Ä¶ {hi}/{n} embedded | elapsed {elapsed:.1f}s")

# ---- Save outputs ----
arr = np.array(all_vecs, dtype="float32")
os.makedirs(os.path.dirname(EMB_NPY) or ".", exist_ok=True)
np.save(EMB_NPY, arr)

meta = {
    "ts": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    "model": MODEL_EMB,
    "source": RAW_PATH,
    "count": int(arr.shape[0]),
    "dim": int(arr.shape[1]) if arr.size else 0,
    "batch_size": BATCH_EMB,
    "trunc_chars": TRUNC_CHARS,
}
with open(META_JSON, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

print(f"‚úÖ Saved embeddings ‚Üí {EMB_NPY}  shape={arr.shape}")
print(f"üóÇÔ∏è  Meta ‚Üí {META_JSON}")

Claims to embed: 109 | model=text-embedding-3-small
‚Ä¶ 109/109 embedded | elapsed 1.9s
‚úÖ Saved embeddings ‚Üí /content/drive/MyDrive/FATCR/data/processed/CLAIMS_embeddings.npy  shape=(109, 1536)
üóÇÔ∏è  Meta ‚Üí /content/drive/MyDrive/FATCR/data/processed/CLAIMS_embeddings.meta.json


## Suggestion: before running, quickly confirm your variables:

In [None]:
# Embeddings stub (plug FAISS/Chroma later)
from openai import OpenAI
import json, numpy as np, os
from google.colab import userdata # Import userdata

assert os.path.exists("CLAIMS_raw.jsonl"), "Run extraction first."
# api_key = os.getenv("OPENAI_API_KEY") # Use userdata.get instead
api_key = userdata.get("OPENAI_API_KEY")
if not api_key:
    raise SystemExit("OPENAI_API_KEY not set.")
client = OpenAI(api_key=api_key)

records = [json.loads(x) for x in open("CLAIMS_raw.jsonl","r",encoding="utf-8").read().splitlines() if x.strip()]
texts = [r["claim_text"] for r in records]
print("Claims:", len(texts))

vecs = []
for t in texts:
    emb = client.embeddings.create(model=MODEL_EMB, input=t).data[0].embedding
    vecs.append(emb)
vecs = np.array(vecs, dtype="float32")
print("Embeddings shape:", vecs.shape)
np.save("CLAIMS_embeddings.npy", vecs)
print("‚úÖ Saved embeddings ‚Üí CLAIMS_embeddings.npy")

Claims: 108
Embeddings shape: (108, 1536)
‚úÖ Saved embeddings ‚Üí CLAIMS_embeddings.npy


In [None]:
# === Embeddings with metadata (save to Drive/processed) ===
from openai import OpenAI
import os, json, time, numpy as np
from google.colab import userdata

# ---- Config ----
DATA_DIR   = "/content/drive/MyDrive/FATCR/data/processed"
RAW_JSON   = os.path.join(DATA_DIR, "CLAIMS_raw.jsonl")
EMB_NPY    = os.path.join(DATA_DIR, "CLAIMS_embeddings.npy")
META_JSON  = os.path.join(DATA_DIR, "CLAIMS_embeddings.meta.json")

MODEL_EMB  = "text-embedding-3-small"   # or "text-embedding-3-large"

# ---- API key ----
api_key = userdata.get("OPENAI_API_KEY")
if not api_key:
    raise SystemExit("OPENAI_API_KEY not set. Add it in Colab Secrets and rerun.")
client = OpenAI(api_key=api_key)

# ---- Load claims ----
assert os.path.exists(RAW_JSON), f"Not found: {RAW_JSON}. Run claims extraction first."
records = [json.loads(x) for x in open(RAW_JSON, "r", encoding="utf-8").read().splitlines() if x.strip()]
texts   = [r["claim_text"] for r in records]
print("Claims:", len(texts))

# ---- Embed ----
vecs = []
for t in texts:
    emb = client.embeddings.create(model=MODEL_EMB, input=t).data[0].embedding
    vecs.append(emb)

arr = np.array(vecs, dtype="float32")
os.makedirs(DATA_DIR, exist_ok=True)
np.save(EMB_NPY, arr)
print(f"‚úÖ Saved embeddings ‚Üí {EMB_NPY} shape={arr.shape}")

# ---- Save metadata ----
meta = {
    "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "model": MODEL_EMB,
    "source": RAW_JSON,
    "count": int(arr.shape[0]),
    "dim": int(arr.shape[1]) if arr.size > 0 else 0,
}
with open(META_JSON, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

print(f"üóÇÔ∏è  Meta ‚Üí {META_JSON}")


Claims: 109
‚úÖ Saved embeddings ‚Üí /content/drive/MyDrive/FATCR/data/processed/CLAIMS_embeddings.npy shape=(109, 1536)
üóÇÔ∏è  Meta ‚Üí /content/drive/MyDrive/FATCR/data/processed/CLAIMS_embeddings.meta.json


In [None]:
# ‚úÖ Smoke test for FACTR_04
import os, json, numpy as np

DATA_DIR = "/content/drive/MyDrive/FATCR/data/processed"
RAW_JSON = os.path.join(DATA_DIR, "CLAIMS_raw.jsonl")
EMB_NPY  = os.path.join(DATA_DIR, "CLAIMS_embeddings.npy")

# ---- Checks ----
assert os.path.exists(RAW_JSON), "Missing CLAIMS_raw.jsonl"
lines = [x for x in open(RAW_JSON, "r", encoding="utf-8").read().splitlines() if x.strip()]
assert len(lines) > 0, "No claims extracted"

assert os.path.exists(EMB_NPY), "Missing embeddings file"
arr = np.load(EMB_NPY)

assert arr.ndim == 2 and arr.shape[0] == len(lines), "Embeddings size mismatch"

print(f"‚úÖ Claims+Embeddings smoke test passed. {len(lines)} claims, embeddings shape = {arr.shape}")


‚úÖ Claims+Embeddings smoke test passed. 109 claims, embeddings shape = (109, 1536)


## Snapshot (versions, row count, duration) + pointer JSON

In [None]:
# === FACTR_04 Claims+Embeddings Snapshot ===
import os, json, time, numpy as np

ROOT = "/content/drive/MyDrive/FATCR"
DATA_DIR = f"{ROOT}/data/processed"
SNAP_DIR = f"{ROOT}/snapshots"
CLAIMS_JSON = f"{DATA_DIR}/CLAIMS_raw.jsonl"
EMB_NPY = f"{DATA_DIR}/CLAIMS_embeddings.npy"
META_JSON = f"{DATA_DIR}/CLAIMS_embeddings.meta.json"
PTR_PATH  = f"{DATA_DIR}/LAST_CLAIMS.json"

# ---- Checks ----
assert os.path.exists(CLAIMS_JSON), f"Missing {CLAIMS_JSON}"
assert os.path.exists(EMB_NPY), f"Missing {EMB_NPY}"

lines = [x for x in open(CLAIMS_JSON, "r", encoding="utf-8").read().splitlines() if x.strip()]
arr = np.load(EMB_NPY)

assert len(lines) > 0, "No claims extracted"
assert arr.ndim == 2 and arr.shape[0] == len(lines), "Embeddings size mismatch"

print("‚úÖ Claims+Embeddings snapshot")
print("   Claims     :", len(lines))
print("   Embeddings :", arr.shape)

# ---- Save snapshot ----
snap = {
    "ts"    : time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "claims": len(lines),
    "embeddings_shape": arr.shape,
    "raw_json": os.path.relpath(CLAIMS_JSON, ROOT),
    "embeddings_npy": os.path.relpath(EMB_NPY, ROOT),
    "meta_json": os.path.relpath(META_JSON, ROOT),
}

os.makedirs(SNAP_DIR, exist_ok=True)
snap_path = f"{SNAP_DIR}/CLAIMS_SNAPSHOT_{int(time.time())}.json"
with open(snap_path, "w") as f:
    json.dump(snap, f, indent=2)
print("üìù Saved snapshot ->", os.path.relpath(snap_path, ROOT))

# also write a small pointer JSON for git commits
with open(PTR_PATH, "w") as f:
    json.dump({
        "ts"    : snap["ts"],
        "claims": len(lines),
        "shape" : arr.shape,
        "path"  : os.path.relpath(CLAIMS_JSON, ROOT),
    }, f, indent=2)
print("üîó Wrote pointer JSON ->", os.path.relpath(PTR_PATH, ROOT))


‚úÖ Claims+Embeddings snapshot
   Claims     : 109
   Embeddings : (109, 1536)
üìù Saved snapshot -> snapshots/CLAIMS_SNAPSHOT_1757874346.json
üîó Wrote pointer JSON -> data/processed/LAST_CLAIMS.json


## Git push helper (commit notebook + pointer JSON + snapshots)

In [None]:
# === FACTR push (commit notebook + pointer JSON + snapshots + optional tag) ===
from google.colab import userdata
import urllib.parse, os, subprocess, shlex, time, re

# Install ipynbname if not already installed
try:
    import ipynbname
except ImportError:
    print("Installing ipynbname...")
    !pip install ipynbname -q
    import ipynbname


# Commit and push the notebook, snapshots and pointer JSONs from the Drive
# checkout. All git commands below run with ROOT as the working directory.
ROOT = "/content/drive/MyDrive/FATCR"
os.chdir(ROOT)

# ---- Config ----
# Change this string if you want a milestone tag (otherwise leave as "")
MILESTONE = "FACTR_04: End-to-end pipeline (Claims+Embeddings working)"

# ---- Show repo status first ----
print("üìÇ Repo status:")
!git status -sb

# ---- Pull (rebase) to avoid non-fast-forward errors ----
print("\nüîÑ Pulling (rebase)‚Ä¶")
pat = userdata.get("GITHUB_PAT")
assert pat, "Missing GITHUB_PAT in Colab Secrets."
# URL-encode the token so characters with special meaning in a URL are escaped.
enc_pat = urllib.parse.quote(pat, safe="")
# PULL_URL contains the encoded PAT and is interpolated into the shell command
# lines below.
# NOTE(review): confirm git never echoes this URL into cell output that is
# later committed together with the notebook.
PULL_URL = f"https://LukmaanViscomi:{enc_pat}@github.com/LukmaanViscomi/FATCR.git"
# `|| true` means a failed pull does not stop the cell.
# NOTE(review): the pull runs before anything is staged or committed, so local
# modifications can make `--rebase` refuse to run - confirm this ordering is intended.
!git pull --rebase {PULL_URL} main || true

# ---- Stage files ----
print("\n‚ûï Staging files‚Ä¶")
# Track notebooks + snapshots + pointer JSONs + standard top-level files
# (stderr is discarded and failures are ignored, so problems here are silent)
!git add notebooks snapshots data/processed/LAST_UTTERANCES.json data/processed/LAST_CLAIMS.json README.md .gitignore 2>/dev/null || true

# include the notebook you're running:
nb = ipynbname.path().name  # current .ipynb filename
!git add notebooks/{nb} 2>/dev/null || true

# ---- Commit only if there are changes ----
# A non-zero exit status from `git diff --cached --quiet` is treated as
# "there are staged changes".
changed = subprocess.run(["git", "diff", "--cached", "--quiet"]).returncode != 0
if changed:
    msg = f"FACTR snapshot + pointer update [{int(time.time())}]"
    print("\n‚úèÔ∏è Commit:", msg)
    # shlex.quote keeps the message a single shell argument.
    !git commit -m {shlex.quote(msg)}
else:
    print("\n‚ÑπÔ∏è Nothing new to commit.")

# ---- Push (inject PAT only for the network call) ----
print("\n‚¨ÜÔ∏è Pushing to main‚Ä¶")
!git push {PULL_URL} HEAD:main

# ---- Optional: Milestone tag ----
def make_tag_slug(name: str) -> str:
    """Turn a free-form milestone title into a string usable as a git tag name.

    Args:
        name: Human-readable milestone title.

    Returns:
        A slug made only of letters, digits, ``.``, ``_`` and ``-``, with no
        leading/trailing ``.`` or ``-``, no ``..`` sequence and no ``.lock``
        suffix (git rejects such ref names). ``"milestone"`` is returned when
        nothing usable is left.
    """
    # keep letters/numbers and . _ - ; replace every other run of chars with -
    slug = re.sub(r"[^A-Za-z0-9._-]+", "-", name.strip())
    # ".." is not allowed anywhere in a ref name
    slug = re.sub(r"\.{2,}", ".", slug)
    slug = slug.strip(".-")  # trim leading/trailing invalid chars
    # a ref name may not end with ".lock"
    while slug.endswith(".lock"):
        slug = slug[: -len(".lock")].rstrip(".-")
    return slug or "milestone"

if MILESTONE:
    tag = make_tag_slug(MILESTONE)
    print(f"\nüè∑Ô∏è Creating tag: {tag}")
    subprocess.run(["git", "tag", "-f", tag], check=True)        # lightweight tag at HEAD
    subprocess.run(["git", "push", "origin", tag], check=True)   # push to remote
    print("‚úÖ Tag pushed:", tag)

print("\n‚úÖ Push complete.")

Installing ipynbname...
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.6/1.6 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25hüìÇ Repo status:
## [32mmain[m...[31morigin/main[m [ahead [32m3[m]
 [31mM[m notebooks/FACTR_04_Claims+Embeddings_v2025-09-07_1.0.ipynb
[31m??[m data/processed/CLAIMS_embeddings.meta.json

üîÑ Pulling (rebase)‚Ä¶
error: cannot pull with rebase: You have unstaged changes.
error: please commit or stash them.

‚ûï Staging files‚Ä¶

‚ÑπÔ∏è Nothing new to commit.

‚¨ÜÔ∏è Pushing to main‚Ä¶
Everything up-to-date

üè∑Ô∏è Creating tag: FACTR_04-End-to-end-pipeline-Claims-Embeddings-working
‚úÖ Tag pushed: FACTR_04-End-to-end-pipeline-Claims-Embeddings-working

‚úÖ Push complete.


## Cell 1 — Build & save FAISS index (cosine)

In [None]:
# # === Build & save FAISS index for CLAIMS embeddings (cosine) ===
# # Saves: /content/drive/MyDrive/FATCR/data/processed/CLAIMS.faiss
# #        /content/drive/MyDrive/FATCR/data/processed/CLAIMS.index.json  (row metadata)

# !pip -q install faiss-cpu

# import os, json, numpy as np, faiss, time

# ROOT      = "/content/drive/MyDrive/FATCR"
# DATA_DIR  = f"{ROOT}/data/processed"
# EMB_NPY   = f"{DATA_DIR}/CLAIMS_embeddings.npy"
# RAW_JSONL = f"{DATA_DIR}/CLAIMS_raw.jsonl"
# FAISS_IDX = f"{DATA_DIR}/CLAIMS.faiss"
# META_JSON = f"{DATA_DIR}/CLAIMS.index.json"

# assert os.path.exists(EMB_NPY),   f"Missing embeddings: {EMB_NPY}"
# assert os.path.exists(RAW_JSONL), f"Missing claims: {RAW_JSONL}"

# # 1) Load vectors
# vecs = np.load(EMB_NPY).astype("float32")  # (N, D)
# N, D = vecs.shape
# print(f"Embeddings: {vecs.shape}")

# # 2) Cosine similarity via inner product on L2-normalized vectors
# faiss.normalize_L2(vecs)
# index = faiss.IndexFlatIP(D)   # inner-product (with normalized vectors == cosine)
# index.add(vecs)
# print("Index type:", type(index).__name__)
# print("Index size:", index.ntotal)

# # 3) Minimal row metadata (keep it small + easy to join later)
# rows = []
# with open(RAW_JSONL, "r", encoding="utf-8") as f:
#     for i, line in enumerate(f):
#         obj = json.loads(line)
#         rows.append({
#             "row_id": i,
#             "utterance_range": obj.get("utterance_range", [None, None]),
#             "claim_text": obj.get("claim_text", ""),
#             "type": obj.get("type", "other"),
#             "topic": obj.get("topic", "other"),
#             "stance": obj.get("stance", "neutral"),
#             "confidence": float(obj.get("confidence", 0.0)),
#         })
# assert len(rows) == N, f"Metadata rows ({len(rows)}) != embeddings ({N})"

# # 4) Save
# os.makedirs(DATA_DIR, exist_ok=True)
# faiss.write_index(index, FAISS_IDX)
# with open(META_JSON, "w", encoding="utf-8") as f:
#     json.dump({
#         "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
#         "index": os.path.basename(FAISS_IDX),
#         "vectors": os.path.basename(EMB_NPY),
#         "rows": N,
#         "dim": D,
#         "sim": "cosine (via IP on L2-normalized vectors)",
#         "meta_sample": rows[0] if rows else None,
#     }, f, indent=2)

# # store a compact, query-time metadata file (row-wise)
# META_ROWS_JSON = f"{DATA_DIR}/CLAIMS.rows.min.json"
# with open(META_ROWS_JSON, "w", encoding="utf-8") as f:
#     json.dump(rows, f, ensure_ascii=False)

# print("‚úÖ Saved FAISS index ->", FAISS_IDX)
# print("üóÇÔ∏è  Saved index meta  ->", META_JSON)
# print("üóÇÔ∏è  Saved row meta    ->", META_ROWS_JSON)


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m31.4/31.4 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[?25hEmbeddings: (109, 1536)
Index type: IndexFlatIP
Index size: 109
‚úÖ Saved FAISS index -> /content/drive/MyDrive/FATCR/data/processed/CLAIMS.faiss
üóÇÔ∏è  Saved index meta  -> /content/drive/MyDrive/FATCR/data/processed/CLAIMS.index.json
üóÇÔ∏è  Saved row meta    -> /content/drive/MyDrive/FATCR/data/processed/CLAIMS.rows.min.json


## Here's a clean, copy-paste full cell you can drop into Cell 1 — Build & save FAISS index (cosine). It rebuilds the FAISS index and a consistent CLAIMS.index.json from your saved embeddings, updates LAST_FAISS.json, and has guardrails so mismatches can't be written again.

In [None]:
# Cell 1 ‚Äî Build & save FAISS index (cosine)
# Rebuild FAISS + id_map from saved embeddings (no re-embedding)

# Optional: install faiss-cpu if missing (Colab safety net)
try:
    import faiss  # noqa
except Exception:
    !pip -q install faiss-cpu
    import faiss

import os, json, numpy as np
from datetime import datetime, timezone

# Resolve ROOT/DATA_DIR (reuse if already defined)
try:
    ROOT
except NameError:
    ROOT = "/content/drive/MyDrive/FATCR"
try:
    DATA_DIR
except NameError:
    DATA_DIR = os.path.join(ROOT, "data", "processed")

EMB_PATH  = os.path.join(DATA_DIR, "CLAIMS_embeddings.npy")
META_PATH = os.path.join(DATA_DIR, "CLAIMS_embeddings.meta.json")
FAISS_PATH = os.path.join(DATA_DIR, "CLAIMS.faiss")
MAP_PATH   = os.path.join(DATA_DIR, "CLAIMS.index.json")

# 1) Load embeddings + meta
emb = np.load(EMB_PATH).astype("float32")
with open(META_PATH, "r", encoding="utf-8") as f:
    meta = json.load(f)

print("Embeddings:", emb.shape)

# 2) Choose metric and normalise if your pipeline used cosine/IP
was_normalized = bool(meta.get("normalized", True))
d = emb.shape[1]
if was_normalized:
    faiss.normalize_L2(emb)
    index = faiss.IndexFlatIP(d)   # cosine via inner product on L2-normalised vecs
    index_type = "IndexFlatIP"
else:
    index = faiss.IndexFlatL2(d)
    index_type = "IndexFlatL2"

# 3) Build index in order
index.add(emb)
print("Index type:", index_type)
print("Index size:", index.ntotal)

# Guardrail before writing
assert index.ntotal == emb.shape[0], "Index count != embedding rows. Refusing to write."

# 4) Persist index + a fresh identity id_map (faiss_id -> row_index)
faiss.write_index(index, FAISS_PATH)
id_map = {str(i): i for i in range(index.ntotal)}
with open(MAP_PATH, "w", encoding="utf-8") as f:
    json.dump(id_map, f, ensure_ascii=False, indent=2)

print("‚úÖ Saved FAISS index  ->", FAISS_PATH)
print("‚úÖ Saved index map    ->", MAP_PATH)

# 5) Refresh pointer meta (optional but recommended)
LAST_FAISS = {
    "time": datetime.now(timezone.utc).isoformat(),
    "artefacts": {
        "faiss": os.path.relpath(FAISS_PATH, ROOT),
        "index_json": os.path.relpath(MAP_PATH, ROOT),
    },
    "dim": d,
    "normalized": was_normalized,
    "model": meta.get("model_name") or meta.get("model"),
}
with open(os.path.join(DATA_DIR, "LAST_FAISS.json"), "w", encoding="utf-8") as f:
    json.dump(LAST_FAISS, f, ensure_ascii=False, indent=2)
print("‚úÖ Updated LAST_FAISS.json")

# Final guardrail
assert len(id_map) == index.ntotal, "id_map size != index size. Refusing to proceed."
print("‚úÖ Rebuild complete and consistent.")


## 🔎 Cell 2 — Query helper (embed query → top-k claims + source)

In [None]:
# === Query FAISS index by text ===
# Requires OPENAI_API_KEY in Colab Secrets (already used in this notebook).
# Prints top-k claims with claim fields + originating utterance text + timestamps.

import os, json, numpy as np, faiss, pandas as pd
from openai import OpenAI
from google.colab import userdata

ROOT      = "/content/drive/MyDrive/FATCR"
DATA_DIR  = f"{ROOT}/data/processed"
UTTS_PARQ = f"{DATA_DIR}/UTTERANCES.parquet"
FAISS_IDX = f"{DATA_DIR}/CLAIMS.faiss"
ROWS_MIN  = f"{DATA_DIR}/CLAIMS.rows.min.json"

MODEL_EMB = "text-embedding-3-small"  # keep consistent with build step
TOP_K     = 5

# Load artifacts
assert os.path.exists(FAISS_IDX), f"FAISS idx missing: {FAISS_IDX}"
assert os.path.exists(ROWS_MIN),  f"Row meta missing: {ROWS_MIN}"
assert os.path.exists(UTTS_PARQ), f"Utterances parquet missing: {UTTS_PARQ}"

index   = faiss.read_index(FAISS_IDX)
rows    = json.load(open(ROWS_MIN, "r", encoding="utf-8"))
df_utts = pd.read_parquet(UTTS_PARQ)

# OpenAI client
api_key = userdata.get("OPENAI_API_KEY")
assert api_key, "OPENAI_API_KEY missing in Colab Secrets."
client = OpenAI(api_key=api_key)

def embed_text(text: str) -> np.ndarray:
    """Embed ``text`` with MODEL_EMB and return it as a float32 row of shape (1, D).

    The row is passed through ``faiss.normalize_L2`` before being returned
    (that call's return value is not used; the row itself is what is returned).
    """
    response = client.embeddings.create(model=MODEL_EMB, input=text)
    embedding = response.data[0].embedding
    row = np.array(embedding, dtype="float32").reshape(1, -1)  # (1, D)
    faiss.normalize_L2(row)
    return row

def pretty_sec(s):
    """Format ``s`` as seconds with two decimals (e.g. ``"1.50s"``).

    Anything that cannot be converted with ``float()`` is returned as
    ``str(s)`` instead, so ``None`` becomes ``"None"``.
    """
    try:
        seconds = float(s)
    except Exception:
        return str(s)
    return f"{seconds:.2f}s"

def search_claims(query: str, k: int = TOP_K):
    """Return up to ``k`` claims most similar to ``query``.

    The query is embedded with ``embed_text`` and searched in the module-level
    FAISS ``index``; each hit is joined with its entry in ``rows`` and with the
    first utterance of the claim's ``utterance_range`` from ``df_utts``.

    Args:
        query: Free-text search string.
        k: Maximum number of hits requested from the index.

    Returns:
        A list of dicts with keys ``score``, ``row_id``, ``claim_text``,
        ``type``, ``topic``, ``stance``, ``confidence``, ``utterance_range``,
        ``source_text``, ``t_start``, ``t_end`` and ``video_id``. The four
        source fields are ``None`` when the utterance cannot be looked up.
        The list is shorter than ``k`` when the index holds fewer vectors.

    Raises:
        IndexError: If the index returns a row id beyond ``len(rows)``, i.e.
            the index and the row metadata are out of sync.
    """
    q = embed_text(query)
    sims, idxs = index.search(q, k)  # cosine scores because vectors are normalized

    results = []
    for score, ridx in zip(sims[0], idxs[0]):
        ridx = int(ridx)
        if ridx < 0:
            # FAISS pads the result with label -1 when it has fewer than k
            # vectors; rows[-1] would silently report the last claim as a hit.
            continue
        meta = rows[ridx]
        utt_range = meta.get("utterance_range", [None, None])
        u0 = utt_range[0] if utt_range else None

        # fetch source utterance text & timestamps (first utterance of the range only)
        src_txt = None
        t_start = None
        t_end = None
        video_id = None

        row0 = None
        if u0 is not None:
            try:
                row0 = df_utts.iloc[int(u0)]
            except (IndexError, TypeError, ValueError):
                # Out-of-range or non-integer position: leave source fields as None.
                row0 = None
        if row0 is not None:
            src_txt = row0.get("text", "")
            t_start = row0.get("t_start", None)
            t_end = row0.get("t_end", None)
            video_id = row0.get("video_id", None)

        results.append({
            "score": float(score),
            "row_id": ridx,
            "claim_text": meta.get("claim_text", ""),
            "type": meta.get("type", "other"),
            "topic": meta.get("topic", "other"),
            "stance": meta.get("stance", "neutral"),
            "confidence": meta.get("confidence", 0.0),
            "utterance_range": meta.get("utterance_range", [None, None]),
            "source_text": src_txt,
            "t_start": t_start,
            "t_end": t_end,
            "video_id": video_id,
        })
    return results

def show(results):
    """Print each search hit as a short, numbered, human-readable block.

    ``results`` is an iterable of dicts shaped like those built by
    ``search_claims``; the utterance line is printed only when
    ``source_text`` is truthy, and is cut to 220 characters on one line.
    """
    for rank, hit in enumerate(results, start=1):
        header = f"\n#{rank}  score={hit['score']:.3f}  row={hit['row_id']}  conf={hit['confidence']:.2f}"
        print(header)
        print("    claim :", hit["claim_text"])
        print(f"    meta  : type={hit['type']} | topic={hit['topic']} | stance={hit['stance']}")
        start = pretty_sec(hit['t_start'])
        end = pretty_sec(hit['t_end'])
        print(f"    src   : {start}‚Äì{end} | video={hit['video_id']}")
        utterance = hit["source_text"]
        if utterance:
            flattened = utterance[:220].replace("\n", " ")
            print("    utt   :", flattened)
        print("    range :", hit["utterance_range"])

# Example:
q = "atonement or salvation by faith"
print("Query:", q)
res = search_claims(q, k=5)
show(res)


Query: atonement or salvation by faith

#1  score=0.471  row=31  conf=0.90
    claim : He voluntarily gave himself as the atonement.
    meta  : type=doctrine | topic=atonement | stance=affirm
    src   : 188.28s‚Äì189.88s | video=speFWRuuJNs_16k_mono.wav
    utt   : Thank you, was the cup taken away?
    range : [150, 174]

#2  score=0.424  row=2  conf=0.90
    claim : Jesus voluntarily gave himself as the atonement for us.
    meta  : type=doctrine | topic=atonement | stance=affirm
    src   : 0.00s‚Äì2.28s | video=speFWRuuJNs_16k_mono.wav
    utt   : part of a pagan practice in part of history.
    range : [0, 24]

#3  score=0.341  row=1  conf=0.70
    claim : Just because we believe in the human sacrifice does not mean that we are pagan.
    meta  : type=doctrine | topic=human sacrifice | stance=affirm
    src   : 0.00s‚Äì2.28s | video=speFWRuuJNs_16k_mono.wav
    utt   : part of a pagan practice in part of history.
    range : [0, 24]

#4  score=0.319  row=29  conf=0.80
    claim 

drop-in FAISS snapshot cell that matches the style of your 02/03/04 snapshots.
## It validates the index, logs key stats, and writes both a timestamped snapshot and a tiny pointer JSON you can commit.

In [None]:
# === FACTR_FAISS Snapshot (index health + pointer) ===
# Logs key stats about the FAISS index & related files, and writes:
#  - snapshots/FAISS_SNAPSHOT_*.json
#  - data/processed/LAST_FAISS.json  (tiny pointer you can commit)

import os, json, time, platform
import numpy as np
import faiss

ROOT      = "/content/drive/MyDrive/FATCR"
DATA_DIR  = f"{ROOT}/data/processed"
SNAP_DIR  = f"{ROOT}/snapshots"

FAISS_IDX = f"{DATA_DIR}/CLAIMS.faiss"
IDX_META  = f"{DATA_DIR}/CLAIMS.index.json"     # written in your FAISS build cell
ROWS_MIN  = f"{DATA_DIR}/CLAIMS.rows.min.json"  # compact row metadata (one per claim)
EMB_NPY   = f"{DATA_DIR}/CLAIMS_embeddings.npy" # optional: for extra consistency check
PTR_PATH  = f"{DATA_DIR}/LAST_FAISS.json"       # tiny pointer JSON

# --- Basic existence checks
for p in [FAISS_IDX, IDX_META, ROWS_MIN]:
    assert os.path.exists(p), f"Missing required file: {p}"

# --- Load artifacts
index = faiss.read_index(FAISS_IDX)
with open(IDX_META, "r", encoding="utf-8") as f:
    idx_meta = json.load(f)
with open(ROWS_MIN, "r", encoding="utf-8") as f:
    rows = json.load(f)

# Try loading embeddings for an optional extra check (not required)
emb_arr = None
if os.path.exists(EMB_NPY):
    try:
        emb_arr = np.load(EMB_NPY, mmap_mode="r")
    except Exception:
        emb_arr = None  # ignore if load fails; not critical

# --- Gather stats / health checks
n_index = index.ntotal
dim     = index.d

n_rows  = len(rows)
ok_rows = (n_rows == n_index)

emb_shape = None
ok_emb    = True
if emb_arr is not None:
    emb_shape = tuple(emb_arr.shape)
    ok_emb = (emb_arr.shape[0] == n_index and emb_arr.shape[1] == dim)

# File sizes (human-ish)
def _size_mb(p):
    """Return the size of the file at ``p`` in units of 1024*1024 bytes, rounded to 2 decimals."""
    size_bytes = os.path.getsize(p)
    bytes_per_mb = 1024 * 1024
    return round(size_bytes / bytes_per_mb, 2)

sz_faiss   = _size_mb(FAISS_IDX)
sz_rows    = _size_mb(ROWS_MIN)
sz_meta    = _size_mb(IDX_META)

# Try to grab faiss version (not always present)
try:
    faiss_ver = faiss.__version__
except Exception:
    faiss_ver = "unknown"

# --- Print a short summary
print("‚úÖ FAISS snapshot")
print("  index file :", os.path.relpath(FAISS_IDX, ROOT), f"({sz_faiss} MB)")
print("  rows file  :", os.path.relpath(ROWS_MIN, ROOT),  f"({sz_rows} MB)")
print("  meta file  :", os.path.relpath(IDX_META, ROOT),   f"({sz_meta} MB)")
print("  vectors    :", n_index)
print("  dim        :", dim)
print("  rows match :", ok_rows, f"(rows={n_rows} vs index={n_index})")
if emb_shape:
    print("  emb shape  :", emb_shape, "match:", ok_emb)

# --- Compose snapshot record
snap = {
    "ts"       : time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "python"   : platform.python_version(),
    "numpy"    : np.__version__,
    "faiss"    : faiss_ver,
    "index"    : os.path.relpath(FAISS_IDX, ROOT),
    "rows_min" : os.path.relpath(ROWS_MIN, ROOT),
    "idx_meta" : os.path.relpath(IDX_META, ROOT),
    "vectors"  : int(n_index),
    "dim"      : int(dim),
    "rows_ok"  : bool(ok_rows),
    "emb_ok"   : bool(ok_emb),
    "sizes_mb" : {
        "faiss" : sz_faiss,
        "rows"  : sz_rows,
        "meta"  : sz_meta,
    },
}

# --- Save snapshot + pointer
os.makedirs(SNAP_DIR, exist_ok=True)
snap_path = f"{SNAP_DIR}/FAISS_SNAPSHOT_{int(time.time())}.json"
with open(snap_path, "w", encoding="utf-8") as f:
    json.dump(snap, f, indent=2)
print("üìù Saved snapshot ->", os.path.relpath(snap_path, ROOT))

os.makedirs(os.path.dirname(PTR_PATH), exist_ok=True)
with open(PTR_PATH, "w", encoding="utf-8") as f:
    json.dump({
        "ts"      : snap["ts"],
        "index"   : snap["index"],
        "rows"    : snap["rows_min"],
        "vectors" : snap["vectors"],
        "dim"     : snap["dim"],
    }, f, indent=2)
print("üîó Wrote pointer JSON ->", os.path.relpath(PTR_PATH, ROOT))


‚úÖ FAISS snapshot
  index file : data/processed/CLAIMS.faiss (0.64 MB)
  rows file  : data/processed/CLAIMS.rows.min.json (0.02 MB)
  meta file  : data/processed/CLAIMS.index.json (0.0 MB)
  vectors    : 109
  dim        : 1536
  rows match : True (rows=109 vs index=109)
  emb shape  : (109, 1536) match: True
üìù Saved snapshot -> snapshots/FAISS_SNAPSHOT_1757884186.json
üîó Wrote pointer JSON -> data/processed/LAST_FAISS.json


## Unified push helper (works for 04 and FAISS)
Paste this once (in any notebook); it will stage notebooks, snapshots, pointer JSONs, and FAISS artifacts. It also supports an optional milestone tag.

In [None]:
# === FACTR universal push (commit notebook + snapshots + pointers + FAISS + optional tag) ===
from google.colab import userdata
import urllib.parse, os, subprocess, shlex, time, re

ROOT = "/content/drive/MyDrive/FATCR"
os.chdir(ROOT)

# --- Optional milestone tag: set to "" to skip tagging ---
MILESTONE = "FAISS: initial cosine index working 2025-09-14"  # or e.g. "FACTR_04: end-to-end 2025-09-14" / ""

print("üìÇ Repo status:")
!git status -sb

# --- Pull first (rebase) ---
print("\nüîÑ Pulling (rebase)‚Ä¶")
pat = userdata.get("GITHUB_PAT")
assert pat, "Missing GITHUB_PAT in Colab Secrets."
enc_pat = urllib.parse.quote(pat, safe="")
PULL_URL = f"https://LukmaanViscomi:{enc_pat}@github.com/LukmaanViscomi/FATCR.git"
!git pull --rebase {PULL_URL} main || true

# --- Stage everything we might create across 04+05 (missing files are fine) ---
print("\n‚ûï Staging files‚Ä¶")
!git add notebooks snapshots \
  data/processed/LAST_UTTERANCES.json \
  data/processed/LAST_CLAIMS.json \
  data/processed/UTTERANCES.parquet \
  data/processed/CLAIMS_raw.jsonl \
  data/processed/CLAIMS_embeddings.npy \
  data/processed/CLAIMS_embeddings.meta.json \
  data/processed/CLAIMS.faiss \
  data/processed/CLAIMS.index.json \
  data/processed/CLAIMS.rows.min.json \
  data/processed/LAST_FAISS.json \
  README.md .gitignore 2>/dev/null || true

# --- (Optional) also stage the notebook you're running, if ipynbname is present ---
try:
    import ipynbname, sys
    nb = ipynbname.path().name
    os.system(f"git add notebooks/{nb} 2>/dev/null || true")
except Exception:
    pass

# --- Commit if needed ---
changed = subprocess.run(["git", "diff", "--cached", "--quiet"]).returncode != 0
if changed:
    msg = f"FACTR: snapshot/index/pointers update [{int(time.time())}]"
    print("\n‚úèÔ∏è Commit:", msg)
    !git commit -m {shlex.quote(msg)}
else:
    print("\n‚ÑπÔ∏è Nothing new to commit.")

# --- Push commit ---
print("\n‚¨ÜÔ∏è Pushing to main‚Ä¶")
!git push {PULL_URL} HEAD:main

# --- Optional: Milestone tag ---
def make_tag_slug(name: str) -> str:
    """Turn a free-form milestone label into a string usable as a git tag name.

    Every character outside ``A-Z a-z 0-9 . _ -`` becomes ``-``, and leading
    or trailing ``-``, ``_`` and ``.`` are removed. Two patterns that git
    rejects in ref names are also neutralised: runs of dots are collapsed to
    a single dot, and a trailing ``.lock`` is dropped.

    Args:
        name: Human-readable milestone text.

    Returns:
        The sanitised slug, or ``"milestone"`` when nothing usable remains.
    """
    edge_chars = "-_."
    slug = re.sub(r"[^A-Za-z0-9._-]", "-", name.strip())
    slug = re.sub(r"\.{2,}", ".", slug)  # ".." is not allowed in a ref name
    slug = slug.strip(edge_chars)
    while slug.endswith(".lock"):  # ref names may not end with ".lock"
        slug = slug[: -len(".lock")].rstrip(edge_chars)
    return slug or "milestone"

# Create (or move) the milestone tag locally and publish it. The tag is pushed
# through PULL_URL, the same PAT-authenticated URL used for the branch push,
# rather than through "origin".
if MILESTONE:
    tag = make_tag_slug(MILESTONE)
    print(f"\nüè∑Ô∏è Creating tag: {tag}")
    # -f moves the tag if it already exists locally.
    subprocess.run(["git", "tag", "-f", tag], check=True)
    # No check=True here: a CalledProcessError would echo the full command,
    # including the token embedded in PULL_URL, into the notebook output.
    tag_push = subprocess.run(["git", "push", PULL_URL, tag, "--force"])
    if tag_push.returncode != 0:
        raise RuntimeError(f"git push of tag {tag!r} failed (exit code {tag_push.returncode})")
    print(f"‚úÖ Tag pushed: {tag}")

print("\n‚úÖ Push complete.")


üìÇ Repo status:
## [32mmain[m...[31morigin/main[m [ahead [32m3[m]
[32mA[m  notebooks/FACTR_01_Setup_v2025-09-09_V2.0.ipynb
[32mD[m  notebooks/FACTR_01_Setup_v2025-09-09_test.ipynb
[32mM[m  notebooks/FACTR_02_Ingest_v2025-09-07_2.0.ipynb
[32mA[m  notebooks/FACTR_03_ASR+Diarize_v2025-09-07_2.0.ipynb
[32mD[m  notebooks/FACTR_04_Claims+Embeddings_v2025-09-07_1.0.ipynb
[32mA[m  notebooks/FACTR_04_Claims+Embeddings_v2025-09-07_2.0.ipynb
[32mR[m  notebooks/FACTR_03_ASR+Diarize_v2025-09-07_1.0.ipynb -> notebooks/old/FACTR_03_ASR+Diarize_v2025-09-07_1.0.ipynb
[31m??[m data/processed/CLAIMS.faiss
[31m??[m data/processed/CLAIMS.index.json
[31m??[m data/processed/CLAIMS.rows.min.json
[31m??[m data/processed/CLAIMS_embeddings.meta.json
[31m??[m data/processed/LAST_FAISS.json

üîÑ Pulling (rebase)‚Ä¶
error: cannot pull with rebase: Your index contains uncommitted changes.
error: please commit or stash them.

‚ûï Staging files‚Ä¶

‚úèÔ∏è Commit: FACTR: snapshot/index/