# FACTR ‚Äî Claims Extraction + Embeddings
**Version:** v2025-09-07_1.0  
**Purpose:** Read UTTERANCES.parquet ‚Üí extract claims (OpenAI) ‚Üí write CLAIMS_raw.jsonl ‚Üí compute embeddings (stub).


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Config
BATCH = 25
MODEL_CHAT = "gpt-4o-mini"
MODEL_EMB = "text-embedding-3-small"
PROMPT_PATH = "claim_extraction_prompt.txt"  # optional; else fall back to inline prompt


In [13]:
from google.colab import userdata

api_key = userdata.get("OPENAI_API_KEY")
print("Loaded?", bool(api_key))
print("First 6 chars:", api_key[:6] if api_key else None)


Loaded? True
First 6 chars: sk-pro


In [17]:
# --- Robust claims extraction (JSON only, with fallbacks & logging) ---
import os, json, time, pandas as pd
from openai import OpenAI
from google.colab import userdata

# ---- Config (use existing vars if already defined) ----
MODEL_CHAT   = globals().get("MODEL_CHAT", "gpt-4o-mini")  # pick any chat model you have access to
BATCH        = globals().get("BATCH", 20)                  # number of utterances per request
PROMPT_PATH  = globals().get("PROMPT_PATH", "prompts/claims_prompt.txt")
UTTS_PARQUET = "/content/drive/MyDrive/FATCR/data/processed/UTTERANCES.parquet"

# ---- Load utterances ----
assert os.path.exists(UTTS_PARQUET), "Run ASR+Diarize first."
df = pd.read_parquet(UTTS_PARQUET)
print("Utterances:", len(df))

# ---- Prompt (force JSON-only) ----
if os.path.exists(PROMPT_PATH):
    prompt_text = open(PROMPT_PATH, "r", encoding="utf-8").read().strip()
else:
    prompt_text = """You are a strict JSON generator.

Extract theological claims as a **valid JSON array** only.
Each array item MUST be a JSON object with the fields:
  "claim_text"  (string)
  "type"        (string: e.g., "doctrine", "ethics", "history", or "other")
  "topic"       (string, brief topic label)
  "stance"      (string: "affirm", "deny", "neutral")
  "confidence"  (number 0..1)

Rules:
- Output **JSON only**, no prose, no markdown, no preamble, no trailing text.
- If there are no claims, output [].
- Never wrap in code fences.
"""

# ---- OpenAI client (from Colab Secrets) ----
api_key = userdata.get("OPENAI_API_KEY")
if not api_key:
    raise SystemExit("OPENAI_API_KEY not set. Add it in Colab Secrets and rerun.")
client = OpenAI(api_key=api_key)

# ---- Helper: safe JSON parse with logging ----
def parse_json_or_log(raw: str, dbg_tag: str) -> list:
    raw = (raw or "").strip()
    if not raw:
        print("‚ö†Ô∏è Empty model response.")
        return []
    try:
        data = json.loads(raw)
        if isinstance(data, list):
            return data
        else:
            print("‚ö†Ô∏è Model returned non-list JSON. Type =", type(data).__name__)
    except Exception as e:
        print("‚ö†Ô∏è Parse failed:", e)
        print("   Raw head (first 300 chars):\n", raw[:300])
    # Save full raw to snapshots for inspection
    os.makedirs("snapshots", exist_ok=True)
    dbg_path = f"snapshots/CLAIMS_DEBUG_{dbg_tag}_{int(time.time())}.txt"
    with open(dbg_path, "w", encoding="utf-8") as f:
        f.write(raw)
    print("   Saved raw to:", dbg_path)
    return []

# ---- Call model for a batch of texts ----
def extract_claims_batch(texts):
    """
    texts: list[str]  (concatenated utterances; we rely on the system prompt to return JSON list)
    """
    resp = client.chat.completions.create(
        model=MODEL_CHAT,
        messages=[
            {"role": "system", "content": prompt_text},
            {"role": "user", "content": "\n\n".join(texts)},
        ],
        temperature=0.2,
    )
    raw = resp.choices[0].message.content
    # dbg tag includes batch size so you can correlate later
    return parse_json_or_log(raw, dbg_tag=f"b{len(texts)}")

# ---- Drive the batching & write JSONL lines ----
out_lines = []
for i in range(0, len(df), BATCH):
    batch = df.iloc[i:i+BATCH]
    # keep speaker label (helps the model separate claims)
    texts = [f"{r.speaker}: {r.text}" for r in batch.itertuples()]
    claims = extract_claims_batch(texts)

    if claims:
        for c in claims:
            out_lines.append(json.dumps({
                "utterance_range": [int(i), int(i + len(batch) - 1)],
                "claim_text":  c.get("claim_text", "")[:300],
                "type":        c.get("type", "other"),
                "topic":       c.get("topic", "other"),
                "stance":      c.get("stance", "neutral"),
                "confidence":  float(c.get("confidence", 0)),
            }, ensure_ascii=False))

# ---- Save JSONL ----
OUT_PATH = "/content/drive/MyDrive/FATCR/data/processed/CLAIMS_raw.jsonl"
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)

with open(OUT_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(out_lines))

print(f"‚úÖ Wrote {OUT_PATH}:", len(out_lines), "items")


Utterances: 1242
‚úÖ Wrote /content/drive/MyDrive/FATCR/data/processed/CLAIMS_raw.jsonl: 109 items


batching embeddings will be faster, cheaper, and far less likely to hit rate limits. Drop this single cell into FACTR-04 right after you create CLAIMS_raw.jsonl (or wherever you want to embed), then run it.

In [20]:
# === Batch embeddings for CLAIMS_raw.jsonl (fast & robust) ===
import os, json, time, math, numpy as np
from datetime import datetime, timezone # Import timezone here
from openai import OpenAI
from google.colab import userdata

# ---- Config ----
MODEL_EMB   = "text-embedding-3-small"   # or "text-embedding-3-large"
BATCH_EMB   = 64                         # tune for your quota/rate limits
TRUNC_CHARS = 8000                       # hard cap per text to avoid 8192 token issues

# Resolve paths (prefer FACTR/processed; fall back to CWD)
DATA_DIR = "/content/drive/MyDrive/FATCR/data/processed"
RAW_PATH = os.path.join(DATA_DIR, "CLAIMS_raw.jsonl") if os.path.exists(DATA_DIR) else "CLAIMS_raw.jsonl"
EMB_NPY  = os.path.join(DATA_DIR, "CLAIMS_embeddings.npy") if os.path.exists(DATA_DIR) else "CLAIMS_embeddings.npy"
META_JSON= os.path.join(DATA_DIR, "CLAIMS_embeddings.meta.json") if os.path.exists(DATA_DIR) else "CLAIMS_embeddings.meta.json"

assert os.path.exists(RAW_PATH), f"Not found: {RAW_PATH}. Run the claims extraction step first."

# ---- Load OpenAI key from Colab Secrets ----
api_key = userdata.get("OPENAI_API_KEY")
assert api_key, "OPENAI_API_KEY missing in Colab Secrets."
client = OpenAI(api_key=api_key)

# ---- Load claims texts in order ----
with open(RAW_PATH, "r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

texts = [(rec.get("claim_text") or "").strip()[:TRUNC_CHARS] for rec in records]
print(f"Claims to embed: {len(texts)} | model={MODEL_EMB}")

# ---- Helper with retry/backoff ----
def embed_batch(batch_texts, max_retries=5, base_sleep=2.0):
    for attempt in range(max_retries):
        try:
            resp = client.embeddings.create(model=MODEL_EMB, input=batch_texts)
            return [d.embedding for d in resp.data]
        except Exception as e:
            wait = base_sleep * (2 ** attempt)
            print(f"‚ö†Ô∏è  Embed call failed (attempt {attempt+1}/{max_retries}): {e} ‚Üí sleeping {wait:.1f}s")
            time.sleep(wait)
    raise RuntimeError("Embedding failed after retries.")

# ---- Run in batches ----
all_vecs = []
n = len(texts)
num_batches = math.ceil(n / BATCH_EMB)

t0 = time.time()
for bi in range(num_batches):
    lo, hi = bi*BATCH_EMB, min((bi+1)*BATCH_EMB, n)
    batch = texts[lo:hi]
    vecs  = embed_batch(batch)
    all_vecs.extend(vecs)
    if (bi+1) % 5 == 0 or (bi+1) == num_batches:
        elapsed = time.time() - t0
        print(f"‚Ä¶ {hi}/{n} embedded | elapsed {elapsed:.1f}s")

# ---- Save outputs ----
arr = np.array(all_vecs, dtype="float32")
os.makedirs(os.path.dirname(EMB_NPY) or ".", exist_ok=True)
np.save(EMB_NPY, arr)

meta = {
    "ts": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    "model": MODEL_EMB,
    "source": RAW_PATH,
    "count": int(arr.shape[0]),
    "dim": int(arr.shape[1]) if arr.size else 0,
    "batch_size": BATCH_EMB,
    "trunc_chars": TRUNC_CHARS,
}
with open(META_JSON, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

print(f"‚úÖ Saved embeddings ‚Üí {EMB_NPY}  shape={arr.shape}")
print(f"üóÇÔ∏è  Meta ‚Üí {META_JSON}")

Claims to embed: 109 | model=text-embedding-3-small
‚Ä¶ 109/109 embedded | elapsed 1.9s
‚úÖ Saved embeddings ‚Üí /content/drive/MyDrive/FATCR/data/processed/CLAIMS_embeddings.npy  shape=(109, 1536)
üóÇÔ∏è  Meta ‚Üí /content/drive/MyDrive/FATCR/data/processed/CLAIMS_embeddings.meta.json


## Suggestion: before running, quickly confirm your variables:

In [22]:
# Embeddings stub (plug FAISS/Chroma later)
from openai import OpenAI
import json, numpy as np, os
from google.colab import userdata # Import userdata

assert os.path.exists("CLAIMS_raw.jsonl"), "Run extraction first."
# api_key = os.getenv("OPENAI_API_KEY") # Use userdata.get instead
api_key = userdata.get("OPENAI_API_KEY")
if not api_key:
    raise SystemExit("OPENAI_API_KEY not set.")
client = OpenAI(api_key=api_key)

records = [json.loads(x) for x in open("CLAIMS_raw.jsonl","r",encoding="utf-8").read().splitlines() if x.strip()]
texts = [r["claim_text"] for r in records]
print("Claims:", len(texts))

vecs = []
for t in texts:
    emb = client.embeddings.create(model=MODEL_EMB, input=t).data[0].embedding
    vecs.append(emb)
vecs = np.array(vecs, dtype="float32")
print("Embeddings shape:", vecs.shape)
np.save("CLAIMS_embeddings.npy", vecs)
print("‚úÖ Saved embeddings ‚Üí CLAIMS_embeddings.npy")

Claims: 108
Embeddings shape: (108, 1536)
‚úÖ Saved embeddings ‚Üí CLAIMS_embeddings.npy


In [23]:
# === Embeddings with metadata (save to Drive/processed) ===
from openai import OpenAI
import os, json, time, numpy as np
from google.colab import userdata

# ---- Config ----
DATA_DIR   = "/content/drive/MyDrive/FATCR/data/processed"
RAW_JSON   = os.path.join(DATA_DIR, "CLAIMS_raw.jsonl")
EMB_NPY    = os.path.join(DATA_DIR, "CLAIMS_embeddings.npy")
META_JSON  = os.path.join(DATA_DIR, "CLAIMS_embeddings.meta.json")

MODEL_EMB  = "text-embedding-3-small"   # or "text-embedding-3-large"

# ---- API key ----
api_key = userdata.get("OPENAI_API_KEY")
if not api_key:
    raise SystemExit("OPENAI_API_KEY not set. Add it in Colab Secrets and rerun.")
client = OpenAI(api_key=api_key)

# ---- Load claims ----
assert os.path.exists(RAW_JSON), f"Not found: {RAW_JSON}. Run claims extraction first."
records = [json.loads(x) for x in open(RAW_JSON, "r", encoding="utf-8").read().splitlines() if x.strip()]
texts   = [r["claim_text"] for r in records]
print("Claims:", len(texts))

# ---- Embed ----
vecs = []
for t in texts:
    emb = client.embeddings.create(model=MODEL_EMB, input=t).data[0].embedding
    vecs.append(emb)

arr = np.array(vecs, dtype="float32")
os.makedirs(DATA_DIR, exist_ok=True)
np.save(EMB_NPY, arr)
print(f"‚úÖ Saved embeddings ‚Üí {EMB_NPY} shape={arr.shape}")

# ---- Save metadata ----
meta = {
    "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "model": MODEL_EMB,
    "source": RAW_JSON,
    "count": int(arr.shape[0]),
    "dim": int(arr.shape[1]) if arr.size > 0 else 0,
}
with open(META_JSON, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

print(f"üóÇÔ∏è  Meta ‚Üí {META_JSON}")


Claims: 109
‚úÖ Saved embeddings ‚Üí /content/drive/MyDrive/FATCR/data/processed/CLAIMS_embeddings.npy shape=(109, 1536)
üóÇÔ∏è  Meta ‚Üí /content/drive/MyDrive/FATCR/data/processed/CLAIMS_embeddings.meta.json


In [24]:
# ‚úÖ Smoke test for FACTR_04
import os, json, numpy as np

DATA_DIR = "/content/drive/MyDrive/FATCR/data/processed"
RAW_JSON = os.path.join(DATA_DIR, "CLAIMS_raw.jsonl")
EMB_NPY  = os.path.join(DATA_DIR, "CLAIMS_embeddings.npy")

# ---- Checks ----
assert os.path.exists(RAW_JSON), "Missing CLAIMS_raw.jsonl"
lines = [x for x in open(RAW_JSON, "r", encoding="utf-8").read().splitlines() if x.strip()]
assert len(lines) > 0, "No claims extracted"

assert os.path.exists(EMB_NPY), "Missing embeddings file"
arr = np.load(EMB_NPY)

assert arr.ndim == 2 and arr.shape[0] == len(lines), "Embeddings size mismatch"

print(f"‚úÖ Claims+Embeddings smoke test passed. {len(lines)} claims, embeddings shape = {arr.shape}")


‚úÖ Claims+Embeddings smoke test passed. 109 claims, embeddings shape = (109, 1536)


## Snapshot (versions, row count, duration) + pointer JSON

In [25]:
# === FACTR_04 Claims+Embeddings Snapshot ===
import os, json, time, numpy as np

ROOT = "/content/drive/MyDrive/FATCR"
DATA_DIR = f"{ROOT}/data/processed"
SNAP_DIR = f"{ROOT}/snapshots"
CLAIMS_JSON = f"{DATA_DIR}/CLAIMS_raw.jsonl"
EMB_NPY = f"{DATA_DIR}/CLAIMS_embeddings.npy"
META_JSON = f"{DATA_DIR}/CLAIMS_embeddings.meta.json"
PTR_PATH  = f"{DATA_DIR}/LAST_CLAIMS.json"

# ---- Checks ----
assert os.path.exists(CLAIMS_JSON), f"Missing {CLAIMS_JSON}"
assert os.path.exists(EMB_NPY), f"Missing {EMB_NPY}"

lines = [x for x in open(CLAIMS_JSON, "r", encoding="utf-8").read().splitlines() if x.strip()]
arr = np.load(EMB_NPY)

assert len(lines) > 0, "No claims extracted"
assert arr.ndim == 2 and arr.shape[0] == len(lines), "Embeddings size mismatch"

print("‚úÖ Claims+Embeddings snapshot")
print("   Claims     :", len(lines))
print("   Embeddings :", arr.shape)

# ---- Save snapshot ----
snap = {
    "ts"    : time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "claims": len(lines),
    "embeddings_shape": arr.shape,
    "raw_json": os.path.relpath(CLAIMS_JSON, ROOT),
    "embeddings_npy": os.path.relpath(EMB_NPY, ROOT),
    "meta_json": os.path.relpath(META_JSON, ROOT),
}

os.makedirs(SNAP_DIR, exist_ok=True)
snap_path = f"{SNAP_DIR}/CLAIMS_SNAPSHOT_{int(time.time())}.json"
with open(snap_path, "w") as f:
    json.dump(snap, f, indent=2)
print("üìù Saved snapshot ->", os.path.relpath(snap_path, ROOT))

# also write a small pointer JSON for git commits
with open(PTR_PATH, "w") as f:
    json.dump({
        "ts"    : snap["ts"],
        "claims": len(lines),
        "shape" : arr.shape,
        "path"  : os.path.relpath(CLAIMS_JSON, ROOT),
    }, f, indent=2)
print("üîó Wrote pointer JSON ->", os.path.relpath(PTR_PATH, ROOT))


‚úÖ Claims+Embeddings snapshot
   Claims     : 109
   Embeddings : (109, 1536)
üìù Saved snapshot -> snapshots/CLAIMS_SNAPSHOT_1757874346.json
üîó Wrote pointer JSON -> data/processed/LAST_CLAIMS.json


## Git push helper (commit notebook + pointer JSON + snapshots)

In [27]:
# # === FACTR push (commit notebook + pointer JSON + snapshots) ===
# from google.colab import userdata
# import urllib.parse, os, subprocess, shlex, time

# ROOT = "/content/drive/MyDrive/FATCR"
# os.chdir(ROOT)

# # Ensure git identity (set once per runtime)
# !git config --global user.email "lukmaan@example.com"
# !git config --global user.name "Lukmaan Viscomi"

# print("üìÇ Repo status:")
# !git status -sb

# print("\nüîÑ Pulling (rebase)‚Ä¶")
# pat = userdata.get("GITHUB_PAT")
# assert pat, "Missing GITHUB_PAT in Colab Secrets."
# enc_pat = urllib.parse.quote(pat, safe="")
# PULL_URL = f"https://LukmaanViscomi:{enc_pat}@github.com/LukmaanViscomi/FATCR.git"
# !git pull --rebase --autostash {PULL_URL} main || true

# print("\n‚ûï Staging files‚Ä¶")
# !git add notebooks snapshots data/processed/LAST_UTTERANCES.json data/processed/LAST_CLAIMS.json README.md .gitignore 2>/dev/null || true

# changed = subprocess.run(["git", "diff", "--cached", "--quiet"]).returncode != 0
# if changed:
#     msg = f"FACTR_04: snapshots + pointers update [{int(time.time())}]"
#     print("\n‚úèÔ∏è Commit:", msg)
#     !git commit -m {shlex.quote(msg)}
# else:
#     print("\n‚ÑπÔ∏è Nothing new to commit.")

# print("\n‚¨ÜÔ∏è Pushing to main‚Ä¶")
# !git push {PULL_URL} HEAD:main

# print("\n‚úÖ Push complete.")



üìÇ Repo status:
## [32mmain[m...[31morigin/main[m [ahead [32m1[m]
[32mA[m  data/processed/LAST_ASR.json
 [31mD[m notebooks/FACTR_02_Ingest_v2025-09-07_1.0.ipynb
[32mM[m  notebooks/FACTR_03_ASR+Diarize_v2025-09-07_1.0.ipynb
 [31mM[m notebooks/FACTR_04_Claims+Embeddings_v2025-09-07_1.0.ipynb
[31m??[m data/processed/CLAIMS_embeddings.meta.json
[31m??[m data/processed/LAST_CLAIMS.json
[31m??[m data/processed/LAST_INGEST.json
[31m??[m data/processed/LAST_INGEST.json,old
[31m??[m notebooks/FACTR_02_Ingest_v2025-09-07_2.0.ipynb

üîÑ Pulling (rebase)‚Ä¶
From https://github.com/LukmaanViscomi/FATCR
 * branch            main       -> FETCH_HEAD
Already up to date.

‚ûï Staging files‚Ä¶

‚úèÔ∏è Commit: FACTR_04: snapshots + pointers update [1757874763]
[main 310cdab] FACTR_04: snapshots + pointers update [1757874763]
 2 files changed, 7 insertions(+), 1 deletion(-)
 create mode 100644 data/processed/LAST_ASR.json
 rewrite notebooks/FACTR_03_ASR+Diarize_v2025-09-07_1.0.i

In [28]:
# !git tag -a v2025-09-14 -m "FACTR_03 ‚Üí FACTR_04 pipeline working end-to-end"


In [29]:
# !git push origin v2025-09-14


Enumerating objects: 1, done.
Counting objects: 100% (1/1)Counting objects: 100% (1/1), done.
Writing objects: 100% (1/1)Writing objects: 100% (1/1), 204 bytes | 20.00 KiB/s, done.
Total 1 (delta 0), reused 0 (delta 0), pack-reused 0
To https://github.com/LukmaanViscomi/FATCR.git
 * [new tag]         v2025-09-14 -> v2025-09-14


In [30]:
# !git tag


v2025-09-14


In [None]:
# === FACTR push (commit notebook + pointer JSON + snapshots + optional tag) ===
from google.colab import userdata
import urllib.parse, os, subprocess, shlex

ROOT = "/content/drive/MyDrive/FATCR"
os.chdir(ROOT)

# --- Config ---
MILESTONE = ""   # e.g., "v1.0-2025-09-14" (leave blank if no tag wanted)

# Show current status first
print("üìÇ Repo status:")
!git status -sb

# Pull (rebase) to avoid non-fast-forward errors
print("\nüîÑ Pulling (rebase)‚Ä¶")
pat = userdata.get("GITHUB_PAT")
assert pat, "Missing GITHUB_PAT in Colab Secrets."
enc_pat = urllib.parse.quote(pat, safe="")
PULL_URL = f"https://LukmaanViscomi:{enc_pat}@github.com/LukmaanViscomi/FATCR.git"
!git pull --rebase {PULL_URL} main || true

# Stage tracked files
print("\n‚ûï Staging files‚Ä¶")
!git add notebooks snapshots data/processed/LAST_*.json README.md .gitignore 2>/dev/null || true

# Commit only if there are changes
changed = subprocess.run(["git", "diff", "--cached", "--quiet"]).returncode != 0
if changed:
    msg = "FACTR: snapshot + pointer update"
    print("\n‚úèÔ∏è Commit:", msg)
    !git commit -m {shlex.quote(msg)}
else:
    print("\n‚ÑπÔ∏è Nothing new to commit.")

# Push (inject PAT only for the network call)
print("\n‚¨ÜÔ∏è Pushing to main‚Ä¶")
!git push {PULL_URL} HEAD:main

# --- Optional: Milestone tag ---
if MILESTONE:
    print(f"\nüè∑Ô∏è Creating tag: {MILESTONE}")
    # create/update the tag locally
    subprocess.run(["git", "tag", "-f", MILESTONE], check=True)
    # push the tag to GitHub
    subprocess.run(["git", "push", "origin", MILESTONE], check=True)
    print("‚úÖ Tag pushed:", MILESTONE)

print("\n‚úÖ Push complete.")
