# FACTR_05 — Search + Eval (MVP)
_Loads FAISS index and claim metadata, embeds queries using the same model noted in meta, and returns top‑k matches. Includes quick sanity checks and a tiny eval harness._

### Install dependencies

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

# If running in Colab: FAISS is usually present; otherwise install CPU build.
try:
    import faiss  # noqa
except Exception:
    !pip -q install faiss-cpu

# Embeddings + utils
!pip -q install sentence-transformers pandas pyarrow openai



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[?25h

### Paths & environment detection

In [None]:

import os, json, glob, pathlib

# Try the canonical Colab path, then fall back to current working directory
CANDIDATES = [
    "/content/drive/MyDrive/FATCR",
    os.getcwd(),
    os.path.abspath(os.path.join(os.getcwd(), "FATCR")),
]

def find_root(cands):
    """Pick the project root from a list of candidate directories.

    The first candidate that contains both a ``.git`` and a ``data``
    sub-directory wins. Failing that, the current working directory is used
    when it has a ``data`` folder; otherwise the first candidate is returned
    as-is (whether or not it exists on disk).
    """
    def _looks_like_repo(path):
        return all(os.path.isdir(os.path.join(path, sub)) for sub in (".git", "data"))

    match = next((cand for cand in cands if _looks_like_repo(cand)), None)
    if match is not None:
        return match

    # last resort: a bare data/ folder next to the notebook
    here = os.getcwd()
    if os.path.isdir(os.path.join(here, "data")):
        return here
    return cands[0]

ROOT = find_root(CANDIDATES)
DATA_DIR = os.path.join(ROOT, "data", "processed")
SNAP_DIR = os.path.join(ROOT, "snapshots")
NB_NAME = "FACTR_05_Search+Eval_v2025-09-16.ipynb"

print("ROOT:", ROOT)
print("DATA_DIR:", DATA_DIR)
print("SNAP_DIR:", SNAP_DIR)


ROOT: /content/drive/MyDrive/FATCR
DATA_DIR: /content/drive/MyDrive/FATCR/data/processed
SNAP_DIR: /content/drive/MyDrive/FATCR/snapshots


## (Optional but recommended) Make the meta explicit

In [None]:
import json, os
META_PATH = os.path.join(DATA_DIR, "CLAIMS_embeddings.meta.json")
with open(META_PATH, "r", encoding="utf-8") as f:
    meta = json.load(f)

meta.setdefault("model_name", meta.get("model"))  # keep both keys
meta["normalized"] = True                         # you rebuilt with L2-normalised vectors
meta["faiss_metric"] = "ip"                       # using IndexFlatIP (cosine via inner product)

with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("✅ Updated meta:", meta)


✅ Updated meta: {'ts': '2025-09-14T18:17:28Z', 'model': 'text-embedding-3-small', 'source': '/content/drive/MyDrive/FATCR/data/processed/CLAIMS_raw.jsonl', 'count': 109, 'dim': 1536, 'model_name': 'text-embedding-3-small', 'normalized': True, 'faiss_metric': 'ip'}


### Load artefacts (FAISS index, mapping, rows, meta)

In [None]:

import json, os, faiss, numpy as np, pandas as pd

# Expected artefacts from 04
FAISS_PATH = os.path.join(DATA_DIR, "CLAIMS.faiss")
MAP_PATH   = os.path.join(DATA_DIR, "CLAIMS.index.json")
ROWS_MIN   = os.path.join(DATA_DIR, "CLAIMS.rows.min.json")
META_PATH  = os.path.join(DATA_DIR, "CLAIMS_embeddings.meta.json")
RAW_CLAIMS = os.path.join(DATA_DIR, "CLAIMS_raw.jsonl")

assert os.path.exists(FAISS_PATH), f"Missing {FAISS_PATH}"
assert os.path.exists(MAP_PATH),   f"Missing {MAP_PATH}"
assert os.path.exists(META_PATH),  f"Missing {META_PATH}"

index = faiss.read_index(FAISS_PATH)
with open(MAP_PATH, "r", encoding="utf-8") as f:
    id_map = json.load(f)  # {str(faiss_id): row_index}

# Optional rows file (UI-friendly)
rows_min = []
if os.path.exists(ROWS_MIN):
    with open(ROWS_MIN, "r", encoding="utf-8") as f:
        rows_min = json.load(f)

# Raw claims (for full text fallback)
raw_claims = []
if os.path.exists(RAW_CLAIMS):
    with open(RAW_CLAIMS, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                raw_claims.append(json.loads(line))

with open(META_PATH, "r", encoding="utf-8") as f:
    embed_meta = json.load(f)

dim = index.d
print("FAISS dim:", dim)
print("id_map size:", len(id_map))
print("rows_min len:", len(rows_min))
print("raw_claims len:", len(raw_claims))
print("embed meta:", embed_meta)


FAISS dim: 1536
id_map size: 109
rows_min len: 109
raw_claims len: 109
embed meta: {'ts': '2025-09-14T18:17:28Z', 'model': 'text-embedding-3-small', 'source': '/content/drive/MyDrive/FATCR/data/processed/CLAIMS_raw.jsonl', 'count': 109, 'dim': 1536, 'model_name': 'text-embedding-3-small', 'normalized': True, 'faiss_metric': 'ip'}


## Quick diagnosis cell

In [None]:
import os, json, itertools

print("FAISS vectors:", index.ntotal)
print("len(id_map):", len(id_map))
print("len(rows_min):", len(rows_min))
print("len(raw_claims):", len(raw_claims))
print("MAP_PATH:", MAP_PATH)

# Peek at the first 10 mapping entries
items = list(itertools.islice(id_map.items(), 10))
print("id_map sample:", items)


FAISS vectors: 109
len(id_map): 109
len(rows_min): 109
len(raw_claims): 109
MAP_PATH: /content/drive/MyDrive/FATCR/data/processed/CLAIMS.index.json
id_map sample: [('0', 0), ('1', 1), ('2', 2), ('3', 3), ('4', 4), ('5', 5), ('6', 6), ('7', 7), ('8', 8), ('9', 9)]


### Build query embedder (model inferred from meta; fallback to MiniLM)

In [None]:
import os
import numpy as np

# Try Colab Secrets first, then env var
try:
    from google.colab import userdata  # type: ignore
    openai_api_key = userdata.get("OPENAI_API_KEY")
except Exception:
    openai_api_key = os.getenv("OPENAI_API_KEY")

model_name = embed_meta.get("model_name") or embed_meta.get("model", "")
was_normalized = bool(embed_meta.get("normalized", True))
metric = (embed_meta.get("metric") or embed_meta.get("faiss_metric") or "ip").lower()

def _l2_normalize(vecs: np.ndarray) -> np.ndarray:
    """Scale each row of ``vecs`` to (approximately) unit L2 length.

    The input is returned untouched when the module-level ``was_normalized``
    flag is false. A small epsilon is added to every row norm so that an
    all-zero row does not cause a division by zero.
    """
    if was_normalized:
        row_norms = np.linalg.norm(vecs, axis=1, keepdims=True)
        return vecs / (row_norms + 1e-8)
    return vecs

def _is_openai_model(name: str) -> bool:
    return name.startswith("text-embedding-")  # covers text-embedding-3-small/large

if not model_name:
    # Only safe if your index was built with MiniLM; otherwise reindex first
    model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Decide provider once and log it
use_openai = _is_openai_model(model_name) and bool(openai_api_key)
print("Loading embedding model:", model_name)
print("Using provider:", "OpenAI" if use_openai else "SentenceTransformers")

# Define embed_queries(texts) for whichever provider was selected above.
# Both variants return a float32 NumPy array with one row per input text.
if use_openai:
    # --- OpenAI Embeddings path (matches index built with OpenAI) ---
    from openai import OpenAI
    client = OpenAI(api_key=openai_api_key)

    def embed_queries(texts):
        """Embed ``texts`` through the OpenAI embeddings endpoint.

        Requests are sent in batches of 96 texts; the collected vectors are
        converted to float32 and passed through ``_l2_normalize``.
        """
        out = []
        for i in range(0, len(texts), 96):  # reasonable batching
            batch = texts[i:i+96]
            resp = client.embeddings.create(model=model_name, input=batch)
            out.extend([d.embedding for d in resp.data])
        vecs = np.asarray(out, dtype="float32")
        return _l2_normalize(vecs)
else:
    # --- Sentence-Transformers path (HF/ST models) ---
    # If an OpenAI model was requested but no key, fall back to MiniLM for testing only
    # NOTE(review): the fallback model presumably does not produce vectors of the
    # index's dimension (index.d), in which case searching would fail — confirm.
    if _is_openai_model(model_name) and not openai_api_key:
        print("Warning: OpenAI model specified but no API key found; falling back to MiniLM (for testing).")
        model_name = "sentence-transformers/all-MiniLM-L6-v2"

    from sentence_transformers import SentenceTransformer
    st_model = SentenceTransformer(model_name)

    def embed_queries(texts):
        """Embed ``texts`` with the loaded SentenceTransformer model.

        Normalisation is delegated to ``encode`` and follows ``was_normalized``;
        the result is cast to float32.
        """
        vecs = st_model.encode(texts, convert_to_numpy=True, normalize_embeddings=was_normalized)
        return vecs.astype("float32")


Loading embedding model: text-embedding-3-small
Using provider: OpenAI


## Search helpers

What this improves:

Robust mapping: if id_map is missing a key, it falls back to identity (faiss_id == row_index) so testing never explodes.

Better text selection: prioritises claim_text (if present), then claim, then text/content.

Useful defaults: results auto-show claim_text/type/topic/stance/confidence etc. when they exist.

Customizable: pass include_cols=[...] to enforce a specific ordering per use case.

In [None]:
# --- Search helpers (safer mapping + nicer defaults) ---
import numpy as np
import pandas as pd

# Which columns to show first (only those that exist are kept)
_DEFAULT_COLS = [
    "faiss_id", "score", "text", "claim_text",
    "type", "topic", "stance", "confidence",
    "row_id", "utterance_range", "speaker", "start", "end", "source", "timestamp",
]

def faiss_search(query: str, k: int = 10):
    """Embed ``query`` and return its nearest neighbours from the FAISS index.

    Args:
        query: Free-text query; embedded with ``embed_queries``.
        k: Maximum number of neighbours to request.

    Returns:
        ``(scores, ids)`` for the single query, as two aligned 1-D arrays.
        FAISS pads the result with id ``-1`` when it finds fewer than ``k``
        neighbours (for example when ``k`` exceeds the number of indexed
        vectors); those placeholder slots are dropped, so the arrays may be
        shorter than ``k``.
    """
    vec = embed_queries([query])             # respects 'was_normalized'
    scores, ids = index.search(vec, k)
    found = ids[0] >= 0                      # -1 marks "no neighbour" padding
    return scores[0][found], ids[0][found]

def row_from_faiss_id(fid: int):
    """
    Map FAISS ID -> row.

    - Falls back to identity mapping if id_map has no key.
    - Prefers rows_min when available, else raw_claims.

    Returns:
        The row found at the mapped index; ``{}`` when ``fid`` cannot be
        converted to an integer; ``{"row_index": <mapped index>}`` when the
        mapped index does not point at a row in either table.
    """
    try:
        fid_int = int(fid)
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric strings, NaN and infinities are not usable ids
        return {}

    ridx = id_map.get(str(fid_int), fid_int)  # fallback to identity

    def _get_safe(seq, i):
        # A non-integer index (e.g. a string stored in the mapping file) is
        # treated like an out-of-range one instead of raising in the comparison.
        if not isinstance(i, int):
            return None
        return seq[i] if 0 <= i < len(seq) else None

    row = None
    if rows_min:
        row = _get_safe(rows_min, ridx)
    if row is None and raw_claims:
        row = _get_safe(raw_claims, ridx)

    if row is None:
        return {"row_index": ridx}  # last-resort breadcrumb
    return row

def _pick_text(row: dict) -> str:
    """Choose the best text field to display."""
    return (
        row.get("claim")
        or row.get("claim_text")
        or row.get("text")
        or row.get("content")
        or ""
    )

def search(query: str, k: int = 10, include_cols=None) -> pd.DataFrame:
    """
    Return a tidy DataFrame with best-guess text and useful metadata.

    Args:
        query: Free-text query.
        k: Number of neighbours to request from the index.
        include_cols: Optional list of column names to place first, in that
            order; defaults to ``_DEFAULT_COLS``. Names that are not present
            in the result are ignored, and all remaining columns follow.

    Returns:
        One row per hit with ``faiss_id``, ``score``, a ``text`` preview
        (at most 300 characters) and the non-text fields of the matched row.
    """
    text_keys = {"claim", "claim_text", "text", "content"}
    scores, ids = faiss_search(query, k=k)
    out = []
    for s, fid in zip(scores, ids):
        if int(fid) < 0:
            # FAISS uses -1 for "no neighbour found"; there is no row to show
            continue
        row = row_from_faiss_id(fid)
        text = _pick_text(row)
        base = {
            "faiss_id": int(fid),
            "score": float(s),      # cosine if vectors were normalized
            "text": text[:300],     # short preview
        }
        # Merge the row's metadata, but never let it duplicate the text field
        # or overwrite the values computed for this hit (faiss_id / score).
        extra = {
            key: val
            for key, val in row.items()
            if key not in text_keys and key not in base
        }
        out.append({**base, **extra})

    df = pd.DataFrame(out)

    # Nice default column ordering
    cols = include_cols if include_cols is not None else _DEFAULT_COLS
    ordered = [c for c in cols if c in df.columns]
    remainder = [c for c in df.columns if c not in ordered]
    return df[ordered + remainder]


## Try a search

In [None]:
df = search("main thesis", k=10)
cols = [c for c in ["faiss_id","score","speaker","start","end","source","timestamp","text","claim"] if c in df.columns]
df[cols].head(10)


Unnamed: 0,faiss_id,score,text
0,35,0.2388,His human nature...
1,44,0.210141,One divine nature.
2,105,0.20954,The human nature and divine nature of Jesus ar...
3,43,0.209186,The interpretation is addition.
4,49,0.207735,Nature is essence.
5,53,0.206796,The nature of Christ is to his personhood.
6,5,0.205755,He has one divine will.
7,101,0.204216,He alone describes the divine essence.
8,47,0.194516,He doesn't change in essence.
9,19,0.187369,"The Father has the Hyposthesis, which is a per..."


## Vector + dimension check

In [None]:
# Identify which path is active and check dimensions
provider = "openai" if model_name.startswith("text-embedding-") and bool(openai_api_key) else "sentence-transformers"
print("provider:", provider)

vec = embed_queries(["hello world"])     # uses your function from the cell
print("vec shape:", vec.shape)           # should be (1, 1536) for text-embedding-3-small
print("L2 norm:", float(np.linalg.norm(vec[0])))  # ~1.0 if normalized=True

# Must match the FAISS index dimension
assert vec.shape[1] == index.d, f"dim mismatch: {vec.shape[1]} vs index.d={index.d}"
print("✅ Embedding dimension matches FAISS (index.d).")


provider: openai
vec shape: (1, 1536)
L2 norm: 1.0
✅ Embedding dimension matches FAISS (index.d).


## Smoke test the search

In [None]:
df = search(
    "numerical claims",
    k=8,
    include_cols=["faiss_id","score","claim_text","type","topic","stance","confidence","text"]
)

# A bare `df` is only rendered when it is the last expression of a cell, so
# the smoke-test result is displayed explicitly before the extra demo.
try:
    display(df)
    # demo (remove if you don't want auto-output)
    display(search("main thesis", k=5))
except NameError:
    # `display` is not defined when this runs outside a notebook
    pass


Unnamed: 0,faiss_id,score,text,type,topic,stance,confidence,row_id,utterance_range
0,35,0.2388,His human nature...,doctrine,Christology,neutral,0.7,35,"[175, 199]"
1,44,0.210141,One divine nature.,doctrine,nature of God,affirm,0.9,44,"[500, 524]"
2,105,0.20954,The human nature and divine nature of Jesus ar...,doctrine,nature of Christ,affirm,0.8,105,"[1125, 1149]"
3,43,0.209186,The interpretation is addition.,doctrine,theological interpretation,affirm,0.8,43,"[425, 449]"
4,49,0.207735,Nature is essence.,doctrine,Nature and essence,affirm,0.8,49,"[525, 549]"


## Handy extras (drop-in, optional)

## 1) Quick top-k to CSV (for sharing or labeling)

In [None]:
df = search("numerical claims", k=20,
            include_cols=["faiss_id","score","claim_text","type","topic","stance","confidence","text"])
out_path = os.path.join(DATA_DIR, "search_numerical_claims_top20.csv")
df.to_csv(out_path, index=False)
print("Saved ->", out_path)


Saved -> /content/drive/MyDrive/FATCR/data/processed/search_numerical_claims_top20.csv


## 2) Tiny helper to inspect the source utterances
(uses your UTTERANCES.parquet with row_id/utterance_range if present)

In [None]:
import pandas as pd, os

UTT_PATH = os.path.join(DATA_DIR, "UTTERANCES.parquet")
utts = pd.read_parquet(UTT_PATH) if os.path.exists(UTT_PATH) else None

def show_context(row_id: int, window: int = 1):
    """Display the utterances around a claim.

    Args:
        row_id: Identifier passed to ``row_from_faiss_id`` to fetch the claim
            row (assumes row ids and FAISS ids coincide — TODO confirm).
        window: Number of extra utterances to include on each side.

    The claim's ``utterance_range`` is used when it is a two-element
    list/tuple; otherwise ``row_id`` itself is used as the utterance position.
    Only the columns among speaker/start/end/text that exist in ``utts`` are
    shown.
    """
    if utts is None:
        print("No UTTERANCES.parquet found")
        return

    # assumes row_id points to a row index in rows_min/raw_claims
    r = row_from_faiss_id(row_id)
    rng = r.get("utterance_range")
    if isinstance(rng, (list, tuple)) and len(rng) == 2:
        a, b = int(rng[0]), int(rng[1])
    else:
        # missing or malformed range: fall back to the row id itself
        a = b = int(row_id)
    a = max(a - window, 0)
    b = min(b + window, len(utts) - 1)

    # Check for existing columns before displaying
    cols_to_display = [col for col in ["speaker","start","end","text"] if col in utts.columns]
    if cols_to_display:
        display(utts.iloc[a:b+1][cols_to_display])
    else:
        print("No relevant columns to display in UTTERANCES.parquet")


# Example: show context for the top hit
top = df.iloc[0]
show_context(top["row_id"] if "row_id" in df.columns else int(top["faiss_id"]))

Unnamed: 0,speaker,text
424,SPEAKER_00,You're the one conveying my question.
425,SPEAKER_00,I just told you that his addition of limitations
426,SPEAKER_00,was strictly the coming into the world
427,SPEAKER_00,with his human nature.
428,SPEAKER_00,And you still don't understand
429,SPEAKER_00,because you are not prepared for the philosoph...
430,SPEAKER_00,You said it was added to the human nature?
431,SPEAKER_00,The addition?
432,SPEAKER_00,"Yes, that's exactly what I said."
433,SPEAKER_00,It wasn't added to the divine nature.


## What is Precision@K (P@K)?

For a query, look at the top K results your search returns.
P@K = (how many of those K are actually relevant) / K.

Example: top-10 has 3 relevant hits → P@10 = 0.3.

## How to set it up in your 05 notebook
# Step 1 — Pick a few queries

Choose 3–5 realistic queries (e.g., “main thesis”, “numerical claims”, “theological assertions”).

# Step 2 — Generate a labeling CSV (template)

Run this in a cell in FACTR_05 (below your search helpers). It exports the top-10 results per query to a CSV you can label.

In [None]:
# Build a labeling template from your current index
import os
import pandas as pd

QUERIES = [
    "main thesis",
    "numerical claims",
    "theological assertions",
]

K = 10
rows = []
for q in QUERIES:
    df = search(q, k=K)
    for fid in df["faiss_id"].astype(int).tolist():
        rows.append({"query": q, "faiss_id": int(fid), "relevant": 0})  # default 0

LABELS_PATH = os.path.join(DATA_DIR, "eval_labels.csv")
template = pd.DataFrame(rows, columns=["query", "faiss_id", "relevant"]).drop_duplicates()

# Re-running this cell must not reset labels that were already saved: keep
# the existing rows and only append (query, faiss_id) pairs that are new.
if os.path.exists(LABELS_PATH):
    existing = pd.read_csv(LABELS_PATH)
    template = pd.concat([existing, template], ignore_index=True).drop_duplicates(
        subset=["query", "faiss_id"], keep="first"
    )

template.to_csv(LABELS_PATH, index=False)
print("Wrote labeling file →", LABELS_PATH)


Wrote labeling file → /content/drive/MyDrive/FATCR/data/processed/eval_labels.csv


In [None]:
q = "main thesis"   # change to your query
K = 10
df = search(q, k=K)
df_preview = df[[c for c in ["faiss_id","score","claim_text","text","type","topic","stance","confidence"] if c in df.columns]]
df_preview


Unnamed: 0,faiss_id,score,text,type,topic,stance,confidence
0,35,0.2388,His human nature...,doctrine,Christology,neutral,0.7
1,44,0.210141,One divine nature.,doctrine,nature of God,affirm,0.9
2,105,0.20954,The human nature and divine nature of Jesus ar...,doctrine,nature of Christ,affirm,0.8
3,43,0.209186,The interpretation is addition.,doctrine,theological interpretation,affirm,0.8
4,49,0.207735,Nature is essence.,doctrine,Nature and essence,affirm,0.8
5,53,0.206796,The nature of Christ is to his personhood.,doctrine,Christology,affirm,0.9
6,5,0.205755,He has one divine will.,doctrine,Christology,affirm,0.9
7,101,0.204216,He alone describes the divine essence.,doctrine,divine essence,affirm,0.8
8,47,0.194516,He doesn't change in essence.,doctrine,nature of God,affirm,0.9
9,19,0.187369,"The Father has the Hyposthesis, which is a per...",doctrine,Nature of God,affirm,0.9


In [None]:
q = "main thesis"
df = search(q, k=10)
df_preview = df[[c for c in ["faiss_id","score","claim_text","text","type","topic","stance","confidence"] if c in df.columns]]
df_preview


Unnamed: 0,faiss_id,score,text,type,topic,stance,confidence
0,35,0.2388,His human nature...,doctrine,Christology,neutral,0.7
1,44,0.210141,One divine nature.,doctrine,nature of God,affirm,0.9
2,105,0.20954,The human nature and divine nature of Jesus ar...,doctrine,nature of Christ,affirm,0.8
3,43,0.209186,The interpretation is addition.,doctrine,theological interpretation,affirm,0.8
4,49,0.207735,Nature is essence.,doctrine,Nature and essence,affirm,0.8
5,53,0.206796,The nature of Christ is to his personhood.,doctrine,Christology,affirm,0.9
6,5,0.205755,He has one divine will.,doctrine,Christology,affirm,0.9
7,101,0.204216,He alone describes the divine essence.,doctrine,divine essence,affirm,0.8
8,47,0.194516,He doesn't change in essence.,doctrine,nature of God,affirm,0.9
9,19,0.187369,"The Father has the Hyposthesis, which is a per...",doctrine,Nature of God,affirm,0.9


In [None]:
import pandas as pd, os

LABELS_PATH = os.path.join(DATA_DIR, "eval_labels.csv")

def labeling_table(query, k=10):
    """Return the top-``k`` hits for ``query`` joined with their saved labels.

    Hits without a saved label get ``relevant`` = 0. When the labels file
    does not exist, every hit is reported as unlabeled.
    """
    preferred = ["faiss_id","score","claim_text","text","type","topic","stance","confidence"]
    hits = search(query, k=k)
    # keep only the preferred columns that are actually present
    hits = hits[[name for name in preferred if name in hits.columns]]

    if os.path.exists(LABELS_PATH):
        labels = pd.read_csv(LABELS_PATH)
    else:
        labels = pd.DataFrame(columns=["query","faiss_id","relevant"])

    for_query = labels.loc[labels["query"] == query, ["faiss_id","relevant"]].drop_duplicates()
    return hits.merge(for_query, on="faiss_id", how="left").fillna({"relevant":0})

# Query strings must match the ones stored in eval_labels.csv exactly (it is
# generated from QUERIES), otherwise no saved labels are found for them.
display(labeling_table("main thesis", k=10))
display(labeling_table("numerical claims", k=10))
display(labeling_table("theological assertions", k=10))

Unnamed: 0,faiss_id,score,text,type,topic,stance,confidence,relevant
0,35,0.2388,His human nature...,doctrine,Christology,neutral,0.7,0
1,44,0.210141,One divine nature.,doctrine,nature of God,affirm,0.9,0
2,105,0.20954,The human nature and divine nature of Jesus ar...,doctrine,nature of Christ,affirm,0.8,0
3,43,0.209186,The interpretation is addition.,doctrine,theological interpretation,affirm,0.8,0
4,49,0.207735,Nature is essence.,doctrine,Nature and essence,affirm,0.8,0
5,53,0.206796,The nature of Christ is to his personhood.,doctrine,Christology,affirm,0.9,0
6,5,0.205755,He has one divine will.,doctrine,Christology,affirm,0.9,0
7,101,0.204216,He alone describes the divine essence.,doctrine,divine essence,affirm,0.8,0
8,47,0.194516,He doesn't change in essence.,doctrine,nature of God,affirm,0.9,0
9,19,0.187369,"The Father has the Hyposthesis, which is a per...",doctrine,Nature of God,affirm,0.9,0


Unnamed: 0,faiss_id,score,text,type,topic,stance,confidence,relevant
0,43,0.280309,The interpretation is addition.,doctrine,theological interpretation,affirm,0.8,0
1,44,0.175022,One divine nature.,doctrine,nature of God,affirm,0.9,0
2,45,0.167951,Two natures.,doctrine,Christology,affirm,0.9,0
3,84,0.164874,"Even if they have a mustard seed of Iman, they...",doctrine,salvation,affirm,0.9,0
4,48,0.164766,Malachi six-three says God does not change in ...,doctrine,God's nature,affirm,0.9,0
5,24,0.164139,The fact that he prayed implies that he needs ...,doctrine,Prayer,affirm,0.6,0
6,80,0.150924,Immortal is an eternal attribute of God.,doctrine,God's attributes,affirm,0.9,0
7,5,0.148166,He has one divine will.,doctrine,Christology,affirm,0.9,0
8,41,0.144143,There were no limitations to his divine nature.,doctrine,divine nature,affirm,0.9,0
9,95,0.135634,"In 1 Timothy 6 verse 16, it says he alone is i...",doctrine,immortality,affirm,0.9,0


Unnamed: 0,faiss_id,score,text,type,topic,stance,confidence,relevant
0,81,0.373589,God's eternal attributes are his essence.,doctrine,God's essence,affirm,0.9,0
1,105,0.358954,The human nature and divine nature of Jesus ar...,doctrine,nature of Christ,affirm,0.8,0
2,62,0.344535,Jesus has a divine nature.,doctrine,nature of Christ,affirm,0.9,0
3,89,0.340367,The essence of God is distinct from human esse...,doctrine,God's nature,affirm,0.8,0
4,19,0.337264,"The Father has the Hyposthesis, which is a per...",doctrine,Nature of God,affirm,0.9,0
5,14,0.321624,"The Father, Son and Holy Spirit are one in ess...",doctrine,Trinity,affirm,0.9,0
6,48,0.315162,Malachi six-three says God does not change in ...,doctrine,God's nature,affirm,0.9,0
7,29,0.314876,Human prayers can be rejected.,doctrine,prayer,affirm,0.8,0
8,9,0.314561,Jesus demonstrated a posture of prayer by putt...,doctrine,prayer,affirm,0.8,0
9,53,0.314149,The nature of Christ is to his personhood.,doctrine,Christology,affirm,0.9,0


In [None]:
# ==== Inspect full text by faiss_id (with optional context) ====
import os, pandas as pd

UTT_PATH = os.path.join(DATA_DIR, "UTTERANCES.parquet")  # optional

def _get_row_by_id(fid: int):
    """Resolve a faiss_id to its row dict.

    The id is translated through ``id_map`` (identity when unmapped) and
    looked up in ``rows_min`` first, then ``raw_claims``. When nothing usable
    is found, ``{"row_index": <mapped index>}`` is returned instead.
    """
    key = int(fid)
    ridx = id_map.get(str(key), key)  # identity fallback

    row = None
    if isinstance(ridx, int):
        for table in (rows_min, raw_claims):
            if table and 0 <= ridx < len(table):
                row = table[ridx]
                if row is not None:
                    break
    return row if row else {"row_index": ridx}

def _pick_text(row: dict) -> str:
    return (
        row.get("claim_text")
        or row.get("claim")
        or row.get("text")
        or row.get("content")
        or ""
    )

def show_faiss_id(fid: int, context_window: int = 1):
    """Pretty-print the full text for a faiss_id and (if available) nearby utterances.

    Args:
        fid: FAISS id of the claim to inspect.
        context_window: Number of extra utterances to show on each side of
            the claim's ``utterance_range``.

    Context is only shown when UTTERANCES.parquet exists and the row has a
    two-element ``utterance_range``. Only the columns among
    speaker/start/end/text that the utterance table actually has are
    displayed, so a table without timing columns still yields context.
    """
    row = _get_row_by_id(fid)
    print(f"\n=== faiss_id: {fid} ===")
    for k in ["row_id","type","topic","stance","confidence","source","timestamp","utterance_range"]:
        if k in row:
            print(f"{k}: {row[k]}")
    full_text = _pick_text(row)
    print("\nTEXT:\n" + str(full_text))

    # Optional transcript context from UTTERANCES.parquet
    if not os.path.exists(UTT_PATH):
        return
    rng = row.get("utterance_range")
    if not (isinstance(rng, (list, tuple)) and len(rng) == 2):
        return
    try:
        utts = pd.read_parquet(UTT_PATH)
        a, b = int(rng[0]), int(rng[1])
        a = max(a - context_window, 0)
        b = min(b + context_window, len(utts) - 1)
        cols = [c for c in ["speaker","start","end","text"] if c in utts.columns]
        print("\nContext (utterances):")
        if cols:
            display(utts.iloc[a:b+1][cols])
        else:
            print("No relevant columns to display in UTTERANCES.parquet")
    except Exception as e:
        # best-effort: context is optional, report why it could not be shown
        print("[context unavailable]", e)

def show_many(ids, context_window: int = 1):
    """Run ``show_faiss_id`` on every entry of ``ids``, in order."""
    for raw_id in ids:
        show_faiss_id(int(raw_id), context_window=context_window)


In [None]:
LABELS_PATH = os.path.join(DATA_DIR, "eval_labels.csv")

def set_label(query: str, faiss_id: int, relevant: int):
    """Store ``relevant`` for the (query, faiss_id) pair in the labels CSV.

    An existing row for the pair is updated in place; otherwise a new row is
    appended. A missing labels file is treated as an empty table.
    """
    import pandas as pd

    fid, flag = int(faiss_id), int(relevant)
    if os.path.exists(LABELS_PATH):
        labels = pd.read_csv(LABELS_PATH)
    else:
        labels = pd.DataFrame(columns=["query","faiss_id","relevant"])

    hit = (labels["query"]==query) & (labels["faiss_id"]==fid)
    if hit.any():
        labels.loc[hit, "relevant"] = flag
    else:
        new_row = pd.DataFrame([{"query":query,"faiss_id":fid,"relevant":flag}])
        labels = pd.concat([labels, new_row], ignore_index=True)

    labels.to_csv(LABELS_PATH, index=False)
    print(f"Saved: ({query}, faiss_id={faiss_id}) -> relevant={relevant}")

# Example:
# show_faiss_id(49); set_label("main thesis", 49, 1)


## Attach full claim text to the labels

Drop this one cell into your 05 notebook (anywhere after you’ve loaded the artefacts and defined id_map, rows_min and raw_claims). It will:

read data/processed/eval_labels.csv

look up each faiss_id

attach the full claim text (plus useful metadata)

save eval_labels_with_text.csv

In [None]:
# === Expand eval_labels.csv with full text (and helpers) ===
import os, json, pandas as pd

# Resolve paths
try:
    DATA_DIR
except NameError:
    ROOT = "/content/drive/MyDrive/FATCR"
    DATA_DIR = os.path.join(ROOT, "data", "processed")

LABELS_PATH = os.path.join(DATA_DIR, "eval_labels.csv")
OUT_PATH    = os.path.join(DATA_DIR, "eval_labels_with_text.csv")
UTT_PATH    = os.path.join(DATA_DIR, "UTTERANCES.parquet")  # optional

# --- helpers to fetch rows/text by faiss_id ---
def _get_row_by_id(fid: int):
    """Look up the row for a faiss_id (rows_min first, raw_claims second).

    Ids missing from ``id_map`` map to themselves. If no usable row is found
    the result is ``{"row_index": <mapped index>}``.
    """
    fid_int = int(fid)
    ridx = id_map.get(str(fid_int), fid_int)  # identity fallback

    def _lookup(table):
        usable = table and isinstance(ridx, int) and 0 <= ridx < len(table)
        return table[ridx] if usable else None

    found = _lookup(rows_min)
    if found is None:
        found = _lookup(raw_claims)
    if found:
        return found
    return {"row_index": ridx}

def _pick_text(row: dict) -> str:
    return (
        row.get("claim_text")
        or row.get("claim")
        or row.get("text")
        or row.get("content")
        or ""
    )

# 1) Load labels
assert os.path.exists(LABELS_PATH), f"Missing labels file: {LABELS_PATH}"
labels = pd.read_csv(LABELS_PATH)
labels["faiss_id"] = labels["faiss_id"].astype(int)

# 2) Attach full text + handy metadata
aug = []
for _, r in labels.iterrows():
    row = _get_row_by_id(int(r["faiss_id"]))
    aug.append({
        "query": r["query"],
        "faiss_id": int(r["faiss_id"]),
        "relevant": int(r.get("relevant", 0)),
        "text_full": _pick_text(row),
        "type": row.get("type"),
        "topic": row.get("topic"),
        "stance": row.get("stance"),
        "confidence": row.get("confidence"),
        "row_id": row.get("row_id"),
        "utterance_range": row.get("utterance_range"),
        "source": row.get("source"),
        "timestamp": row.get("timestamp"),
    })

df_labels = pd.DataFrame(aug)
df_labels.to_csv(OUT_PATH, index=False)
print("✅ Wrote:", OUT_PATH)
display(df_labels.head(20))

# 3) Convenience viewers + quick label setter
def review_labels(query: str = None):
    """Return the labeled rows of ``df_labels`` (optionally for one query).

    Only the review columns that exist are kept, and the index is reset.
    """
    wanted = ["query","faiss_id","relevant","type","topic","stance","confidence","text_full"]
    if query is None:
        subset = df_labels
    else:
        subset = df_labels[df_labels["query"] == query]
    present = [name for name in wanted if name in subset.columns]
    return subset[present].reset_index(drop=True)

def show_faiss_id(fid: int, context_window: int = 1):
    """Print full text and optional nearby utterances from UTTERANCES.parquet.

    Args:
        fid: FAISS id of the claim to inspect.
        context_window: Number of extra utterances to show on each side of
            the claim's ``utterance_range``.

    Context is only shown when the parquet file exists and the row has a
    two-element ``utterance_range``. Only the columns among
    speaker/start/end/text that the utterance table actually has are
    displayed, so a table without timing columns still yields context.
    """
    row = _get_row_by_id(int(fid))
    print(f"\n=== faiss_id: {fid} ===")
    for k in ["row_id","type","topic","stance","confidence","source","timestamp","utterance_range"]:
        if k in row:
            print(f"{k}: {row[k]}")
    print("\nTEXT:\n" + str(_pick_text(row)))

    if not os.path.exists(UTT_PATH):
        return
    rng = row.get("utterance_range")
    if not (isinstance(rng, (list, tuple)) and len(rng) == 2):
        return
    try:
        import pandas as pd
        utts = pd.read_parquet(UTT_PATH)
        a, b = int(rng[0]), int(rng[1])
        a = max(a - context_window, 0)
        b = min(b + context_window, len(utts) - 1)
        cols = [c for c in ["speaker","start","end","text"] if c in utts.columns]
        print("\nContext (utterances):")
        if cols:
            display(utts.iloc[a:b+1][cols])
        else:
            print("No relevant columns to display in UTTERANCES.parquet")
    except Exception as e:
        # best-effort: context is optional, report why it could not be shown
        print("[context unavailable]", e)

def set_label(query: str, faiss_id: int, relevant: int):
    """Update eval_labels.csv (so you can flip 0/1 without leaving the notebook).

    Args:
        query: Query string the label belongs to.
        faiss_id: FAISS id of the labeled claim.
        relevant: Label value (0 or 1); coerced with ``int``.

    An existing row for the (query, faiss_id) pair is updated; otherwise a
    new row is appended. A missing labels file is treated as an empty table
    rather than raising, matching the earlier ``set_label`` helper.
    """
    faiss_id = int(faiss_id)
    relevant = int(relevant)

    if os.path.exists(LABELS_PATH):
        labs = pd.read_csv(LABELS_PATH)
    else:
        labs = pd.DataFrame(columns=["query", "faiss_id", "relevant"])

    mask = (labs["query"] == query) & (labs["faiss_id"].astype(int) == faiss_id)
    if mask.any():
        labs.loc[mask, "relevant"] = relevant
    else:
        new_row = pd.DataFrame([{"query": query, "faiss_id": faiss_id, "relevant": relevant}])
        # skip the concat for an empty table so the new row keeps its dtypes
        labs = new_row if labs.empty else pd.concat([labs, new_row], ignore_index=True)
    labs.to_csv(LABELS_PATH, index=False)
    print(f"Saved label -> ({query}, faiss_id={faiss_id}) = {relevant}")


✅ Wrote: /content/drive/MyDrive/FATCR/data/processed/eval_labels_with_text.csv


Unnamed: 0,query,faiss_id,relevant,text_full,type,topic,stance,confidence,row_id,utterance_range,source,timestamp
0,main thesis,35,0,His human nature...,doctrine,Christology,neutral,0.7,35,"[175, 199]",,
1,main thesis,44,0,One divine nature.,doctrine,nature of God,affirm,0.9,44,"[500, 524]",,
2,main thesis,105,0,The human nature and divine nature of Jesus ar...,doctrine,nature of Christ,affirm,0.8,105,"[1125, 1149]",,
3,main thesis,43,0,The interpretation is addition.,doctrine,theological interpretation,affirm,0.8,43,"[425, 449]",,
4,main thesis,49,1,Nature is essence.,doctrine,Nature and essence,affirm,0.8,49,"[525, 549]",,
5,main thesis,53,0,The nature of Christ is to his personhood.,doctrine,Christology,affirm,0.9,53,"[550, 574]",,
6,main thesis,5,0,He has one divine will.,doctrine,Christology,affirm,0.9,5,"[25, 49]",,
7,main thesis,101,0,He alone describes the divine essence.,doctrine,divine essence,affirm,0.8,101,"[1125, 1149]",,
8,main thesis,47,0,He doesn't change in essence.,doctrine,nature of God,affirm,0.9,47,"[500, 524]",,
9,main thesis,19,0,"The Father has the Hyposthesis, which is a per...",doctrine,Nature of God,affirm,0.9,19,"[100, 124]",,


# Step 3 — Label in Drive

Open data/processed/eval_labels.csv (the file you just wrote) and mark relevant=1 for rows that truly answer the query. Leave others as 0.

Alternatively, the drop-in cell below applies a provisional set of labels to that file programmatically. Paste it into FACTR_05 (anywhere after DATA_DIR is defined) and run it.

The cell will:

set 1 for main thesis → faiss_id 44, 49

set 0 for the other main thesis IDs you listed

set 0 for all shown numerical claims

set 1 for the theological assertion IDs you listed

save back to data/processed/eval_labels.csv (and update eval_labels_with_text.csv if it exists)

In [None]:
# === Apply provisional labels to data/processed/eval_labels.csv ===
import os, pandas as pd

# Resolve path (uses your existing DATA_DIR)
LABELS_PATH = os.path.join(DATA_DIR, "eval_labels.csv")
LABELS_WITH_TEXT = os.path.join(DATA_DIR, "eval_labels_with_text.csv")  # update if present

assert os.path.exists(LABELS_PATH), f"Missing: {LABELS_PATH}"
labels = pd.read_csv(LABELS_PATH)
labels["faiss_id"] = labels["faiss_id"].astype(int)

# ---- Your requested edits ----
# For each query: the faiss_ids to mark relevant (*_ones) and not relevant (*_zeros).
# main thesis
mt_ones  = {44, 49}
mt_zeros = {35, 105, 43, 53, 5, 101, 47, 19}

# numerical claims
nc_ones  = set()  # none
nc_zeros = {43, 44, 45, 84, 48, 24, 80, 5, 41, 95}

# theological assertion
# NOTE(review): the top-10 table displayed earlier for this query lists
# faiss_id 9, not 3 — possibly a typo here; confirm before relying on it.
ta_ones  = {81, 105, 62, 89, 19, 14, 48, 29, 3, 53}
ta_zeros = set()  # none in this batch

def apply(df, query, ones, zeros):
    """Set the `relevant` flag in place for the rows belonging to one query.

    Args:
        df: Label frame with `query`, `faiss_id` and `relevant` columns;
            modified in place.
        query: Only rows whose `query` equals this value are touched.
        ones: faiss_ids to flag as relevant (1). An empty set is skipped.
        zeros: faiss_ids to flag as not relevant (0). An empty set is skipped.

    Returns:
        None.
    """
    in_query = df["query"].eq(query)
    # Zeros are written first, so an id listed in both sets ends up as 1.
    for id_set, flag in ((zeros, 0), (ones, 1)):
        if id_set:
            df.loc[in_query & df["faiss_id"].isin(id_set), "relevant"] = flag

# Write the judgements into `labels`; each call only touches rows of its own query.
apply(labels, "main thesis", mt_ones, mt_zeros)
apply(labels, "numerical claims", nc_ones, nc_zeros)
apply(labels, "theological assertion", ta_ones, ta_zeros)

# Overwrite the label file with the updated flags.
labels.to_csv(LABELS_PATH, index=False)
print("✅ Saved:", LABELS_PATH)

# Keep the optional text-bearing copy in step: discard its old `relevant`
# column and pull the fresh one in by (query, faiss_id).
if os.path.exists(LABELS_WITH_TEXT):
    df_txt = pd.read_csv(LABELS_WITH_TEXT).astype({"faiss_id": int})
    if "relevant" in df_txt.columns:
        df_txt = df_txt.drop(columns="relevant")
    df_txt = pd.merge(
        df_txt,
        labels[["query", "faiss_id", "relevant"]],
        how="left",
        on=["query", "faiss_id"],
    )
    df_txt.to_csv(LABELS_WITH_TEXT, index=False)
    print("✅ Updated:", LABELS_WITH_TEXT)

# Per-query preview: positive count, then up to 12 rows with positives listed first.
for q in ("main thesis", "numerical claims", "theological assertion"):
    sub = labels.loc[labels["query"] == q]
    if sub.empty:
        continue
    print(f"[{q}] positives:", int(sub["relevant"].eq(1).sum()), "/", len(sub))
    print(sub.sort_values(by=["relevant", "faiss_id"], ascending=[False, True]).head(12))


✅ Saved: /content/drive/MyDrive/FATCR/data/processed/eval_labels.csv
✅ Updated: /content/drive/MyDrive/FATCR/data/processed/eval_labels_with_text.csv
[main thesis] positives: 2 / 10
         query  faiss_id  relevant
1  main thesis        44         1
4  main thesis        49         1
6  main thesis         5         0
9  main thesis        19         0
0  main thesis        35         0
3  main thesis        43         0
8  main thesis        47         0
5  main thesis        53         0
7  main thesis       101         0
2  main thesis       105         0
[numerical claims] positives: 0 / 10
               query  faiss_id  relevant
17  numerical claims         5         0
15  numerical claims        24         0
18  numerical claims        41         0
10  numerical claims        43         0
11  numerical claims        44         0
12  numerical claims        45         0
14  numerical claims        48         0
16  numerical claims        80         0
13  numerical claims       

Now that everything is labeled, the next step is to compute your retrieval scores. Paste this cell into FACTR_05 (below your search helpers) and run it.

# What you’ll get

- A summary table with precision@K, recall@K, and mAP@K per query (also saved to data/processed/eval_summary_k10.csv). Note that the `mAP@K` column holds each query's own average precision (AP@K); it is not averaged across queries.

- A detailed CSV per query (e.g., eval_main_thesis_top10.csv) showing each hit’s faiss_id, score, and your relevant flag—handy for debugging.

In [None]:
# === Evaluate retrieval: Precision@K (+ Recall@K and mAP@K) ===
import os, pandas as pd

# Cut-off for every metric below; keep it equal to the depth used when the
# label file was built.
K = 10

# Reload the labels from disk so this cell also works in a fresh session.
LABELS_PATH = os.path.join(DATA_DIR, "eval_labels.csv")
labels = pd.read_csv(LABELS_PATH).astype({"faiss_id": int})

# Distinct non-null query strings, in order of first appearance.
queries = pd.unique(labels["query"].dropna()).tolist()

def evaluate_query(q, k=K):
    """Score the top-`k` results of `search(q)` against the labelled positives.

    Gold ids are the rows of the module-level `labels` frame whose `query`
    equals `q` and whose `relevant` flag is 1; every other retrieved id
    counts as a miss.

    Args:
        q: Query text, matched exactly against `labels["query"]`.
        k: Number of results requested from `search`. It is also the
            precision denominator, so a shorter result list lowers precision.

    Returns:
        A tuple `(precision, recall, ap, results)`:
        precision is hits / k; recall is hits / number of gold ids; ap is
        the sum of precision-at-rank over the hit ranks divided by the
        number of gold ids. recall and ap are None when the query has no
        gold ids. results is a copy of the search frame with a 0/1
        `relevant` column added.
    """
    results = search(q, k=k)
    ranked_ids = results["faiss_id"].astype(int).tolist()

    is_gold = (labels["query"] == q) & (labels["relevant"] == 1)
    gold = set(labels.loc[is_gold, "faiss_id"].astype(int))

    hits = [int(fid in gold) for fid in ranked_ids]
    n_hits = sum(hits)

    precision = n_hits / float(k)

    if gold:
        recall = n_hits / float(len(gold))
        # Average precision truncated at k: add precision-at-rank at each hit.
        found = 0
        ap_sum = 0.0
        for rank, hit in enumerate(hits, start=1):
            if hit:
                found += 1
                ap_sum += found / float(rank)
        ap = ap_sum / len(gold)
    else:
        # Without labelled positives recall and AP are undefined.
        recall = None
        ap = None

    return precision, recall, ap, results.assign(relevant=hits)

import re  # only needed here, to build safe per-query file names

# Evaluate every labelled query, collecting one summary row per query and
# writing that query's scored hit list to its own CSV.
summary_rows = []
for q in queries:
    p, r, ap, dfq = evaluate_query(q, k=K)
    summary_rows.append({
        "query": q,
        f"precision@{K}": round(p, 3),
        # recall / AP are None when the query has no labelled positives
        f"recall@{K}": (None if r is None else round(r, 3)),
        f"mAP@{K}": (None if ap is None else round(ap, 3)),
    })
    # Queries are free text: map every character that is not a word character,
    # dot or dash (spaces included) to "_" so that e.g. "/" cannot turn the
    # file name into a path. Names for plain "word word" queries are unchanged.
    safe_q = re.sub(r"[^\w.-]", "_", q)
    out_q = os.path.join(DATA_DIR, f"eval_{safe_q}_top{K}.csv")
    dfq.to_csv(out_q, index=False)
    print(f"Saved per-query results -> {out_q}")

# One row per query; also persisted next to the per-query files.
summary = pd.DataFrame(summary_rows)
sum_path = os.path.join(DATA_DIR, f"eval_summary_k{K}.csv")
summary.to_csv(sum_path, index=False)
print("\n=== Summary ===")
display(summary)
print("Saved summary ->", sum_path)


Saved per-query results -> /content/drive/MyDrive/FATCR/data/processed/eval_main_thesis_top10.csv
Saved per-query results -> /content/drive/MyDrive/FATCR/data/processed/eval_numerical_claims_top10.csv
Saved per-query results -> /content/drive/MyDrive/FATCR/data/processed/eval_theological_assertion_top10.csv

=== Summary ===


Unnamed: 0,query,precision@10,recall@10,mAP@10
0,main thesis,0.2,1.0,0.45
1,numerical claims,0.0,,
2,theological assertion,0.9,1.0,0.989


Saved summary -> /content/drive/MyDrive/FATCR/data/processed/eval_summary_k10.csv


### Quick sanity checks

In [None]:

# 1) FAISS size vs id_map
# The index and id_map must hold the same number of entries; a mismatch
# presumably means the two artefacts come from different builds — TODO confirm.
ntotal = index.ntotal
print("index.ntotal:", ntotal)
assert ntotal == len(id_map), f"Index size ({ntotal}) != id_map size ({len(id_map)})"

# 2) Embedding dimension matches FAISS
# Embed one throwaway string and compare its second dimension with index.d.
test_vec = embed_queries(["test"])
assert test_vec.shape[1] == index.d, f"Embedding dim {test_vec.shape[1]} != FAISS dim {index.d}"
print("✅ Sanity checks passed")


index.ntotal: 109
✅ Sanity checks passed


### (optional) Single-query demo

In [None]:

# Placeholder query text; replace it with a real question before running.
QUERY = "What did the speaker claim about X?"
# NOTE(review): this rebinds the notebook-wide K that the evaluation cell sets
# to 10 — re-run that cell before evaluating again.
K = 8

df = search(QUERY, k=K)
try:
    # Rich table rendering when IPython is available.
    import pandas as pd
    from IPython.display import display
    display(df)
except Exception:
    # Plain-text fallback, capped at the first 10 rows.
    print(df.head(10).to_string(index=False))


Unnamed: 0,faiss_id,score,text,type,topic,stance,confidence,row_id,utterance_range
0,35,0.218881,His human nature...,doctrine,Christology,neutral,0.7,35,"[175, 199]"
1,48,0.217896,Malachi six-three says God does not change in ...,doctrine,God's nature,affirm,0.9,48,"[525, 549]"
2,101,0.214911,He alone describes the divine essence.,doctrine,divine essence,affirm,0.8,101,"[1125, 1149]"
3,19,0.197645,"The Father has the Hyposthesis, which is a per...",doctrine,Nature of God,affirm,0.9,19,"[100, 124]"
4,39,0.196809,The word is he emptied something.,doctrine,Kenosis,neutral,0.7,39,"[250, 274]"
5,64,0.195878,Everyone saw Jesus' human nature.,doctrine,nature of Christ,affirm,0.8,64,"[675, 699]"
6,47,0.193349,He doesn't change in essence.,doctrine,nature of God,affirm,0.9,47,"[500, 524]"
7,53,0.190122,The nature of Christ is to his personhood.,doctrine,Christology,affirm,0.9,53,"[550, 574]"


### (Optional) Save LAST_SEARCH pointer

In [None]:
# Make sure NB_NAME exists even when the paths cell was skipped: keep an
# existing value, otherwise ask ipynbname, otherwise use a fixed label.
if "NB_NAME" not in globals():
    try:
        import ipynbname
        NB_NAME = ipynbname.path().name
    except Exception:
        # ipynbname is missing or could not resolve the running notebook.
        NB_NAME = "FACTR_05"


In [None]:
from datetime import datetime, timezone


def _relative_if_present(path):
    """Return `path` relative to ROOT, or None when nothing exists at `path`."""
    if not os.path.exists(path):
        return None
    return os.path.relpath(path, ROOT)


os.makedirs(DATA_DIR, exist_ok=True)

# Pointer record describing which artefacts and settings this search run used.
last = dict(
    notebook=NB_NAME,
    # UTC timestamp in ISO-8601 form with a trailing "Z" instead of "+00:00".
    time=datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
    artefacts=dict(
        faiss=os.path.relpath(FAISS_PATH, ROOT),
        map=os.path.relpath(MAP_PATH, ROOT),
        # These two are recorded as null when the file is absent.
        rows_min=_relative_if_present(ROWS_MIN),
        raw_claims=_relative_if_present(RAW_CLAIMS),
    ),
    embed_model=model_name,
    normalized=was_normalized,
    metric=metric,
)

with open(os.path.join(DATA_DIR, "LAST_SEARCH.json"), "w", encoding="utf-8") as f:
    json.dump(last, f, ensure_ascii=False, indent=2)
print("Wrote", os.path.join(DATA_DIR, "LAST_SEARCH.json"))


Wrote /content/drive/MyDrive/FATCR/data/processed/LAST_SEARCH.json


### Snapshot (05) — Save small run context

In [None]:

import json
import os
import time

# Each run gets its own folder under SNAP_DIR, named after the local
# wall-clock time at second resolution.
ts = time.strftime("%Y%m%d-%H%M%S")
snap_base = os.path.join(SNAP_DIR, f"05_search_{ts}")
os.makedirs(SNAP_DIR, exist_ok=True)
os.makedirs(snap_base, exist_ok=True)

# Small, human-readable record of what this search run was configured with.
ctx = dict(
    notebook=NB_NAME,
    # QUERIES is optional; record an empty list when no cell defined it.
    query_examples=globals().get("QUERIES", []),
    embed_model=model_name,
    normalized=was_normalized,
    metric=metric,
    faiss_path=os.path.relpath(FAISS_PATH, ROOT),
    map_path=os.path.relpath(MAP_PATH, ROOT),
)

with open(os.path.join(snap_base, "context.json"), "w", encoding="utf-8") as f:
    json.dump(ctx, f, ensure_ascii=False, indent=2)
print("Snapshot saved to:", snap_base)


Snapshot saved to: /content/drive/MyDrive/FATCR/snapshots/05_search_20250919-215157


### Push (commit notebooks + snapshots + pointers)

In [None]:

# === FACTR universal push (commit notebook + snapshots + pointers + FAISS + optional tag) ===
try:
    # In Colab, secrets come from the built-in userdata store.
    from google.colab import userdata
except Exception:
    import os

    class _UD:
        """Stand-in for `google.colab.userdata` that reads environment variables."""

        @staticmethod
        def get(k):
            """Return the value of environment variable `k`, or None if unset."""
            return os.environ.get(k)

    userdata = _UD()

import urllib.parse, os, subprocess, shlex, time, re

ROOT = ROOT  # reuse detected ROOT
os.chdir(ROOT)

# --- Optional milestone tag: set to "" to skip tagging ---
MILESTONE = ""  # e.g., "FACTR_05: search MVP 2025-09-16"

print("📂 Repo status:")
!git status -sb

# --- Pull first (rebase) ---
print("\n🔄 Pulling (rebase)…")
pat = userdata.get("GITHUB_PAT")
assert pat, "Missing GITHUB_PAT in Colab Secrets or env."
enc_pat = urllib.parse.quote(pat, safe="")
PULL_URL = f"https://LukmaanViscomi:{enc_pat}@github.com/LukmaanViscomi/FATCR.git"
!git pull --rebase {PULL_URL} main || true

# --- Stage likely artefacts ---
print("\n➕ Staging files…")
!git add notebooks snapshots       data/processed/LAST_SEARCH.json      data/processed/LAST_*       data/processed/UTTERANCES.parquet       data/processed/CLAIMS_raw.jsonl       data/processed/CLAIMS_embeddings.npy       data/processed/CLAIMS_embeddings.meta.json       data/processed/CLAIMS.faiss       data/processed/CLAIMS.index.json       data/processed/CLAIMS.rows.min.json       README.md .gitignore 2>/dev/null || true

# --- Stage this notebook if present under notebooks/ ---
try:
    import ipynbname, sys
    nb = ipynbname.path().name
    os.system(f"git add notebooks/{nb} 2>/dev/null || true")
except Exception:
    pass

# --- Commit if needed ---
changed = subprocess.run(["git", "diff", "--cached", "--quiet"]).returncode != 0
if changed:
    msg = f"FACTR: search/eval update [{int(time.time())}]"
    print("\n✏️ Commit:", msg)
    !git commit -m {shlex.quote(msg)}
else:
    print("\nℹ️ Nothing new to commit.")

# --- Push commit ---
print("\n⬆️ Pushing to main…")
!git push {PULL_URL} HEAD:main

# --- Optional: Milestone tag ---
def make_tag_slug(name: str) -> str:
    """Turn a free-text milestone name into a string usable as a git tag.

    Surrounding whitespace is dropped, every character other than ASCII
    letters, digits, ".", "_" and "-" becomes "-", and leading/trailing
    "-", "_" and "." are trimmed.

    Args:
        name: Human-readable milestone label.

    Returns:
        The cleaned slug, or "milestone" when nothing is left after cleaning.
    """
    replaced = re.sub(r"[^A-Za-z0-9._-]", "-", name.strip())
    trimmed = replaced.strip("-_.")
    if not trimmed:
        return "milestone"
    return trimmed

# Tagging only happens when MILESTONE is a non-empty string.
if MILESTONE:
    tag = make_tag_slug(MILESTONE)
    print(f"\n🏷️ Creating tag: {tag}")
    # -f moves the tag if it already exists locally.
    subprocess.run(["git", "tag", "-f", tag], check=True)
    # NOTE(review): this pushes to the `origin` remote, whereas the commit push
    # above uses the token-bearing PULL_URL — confirm `origin` is authenticated.
    # check=True raises CalledProcessError if the push fails.
    subprocess.run(["git", "push", "origin", tag, "--force"], check=True)
    print(f"✅ Tag pushed: {tag}")

print("\n✅ Push complete.")


📂 Repo status:
Refresh index: 100% (10/10), done.
## [32mmain[m...[31morigin/main[m [ahead [32m4[m]
 [31mM[m notebooks/FACTR_02_Ingest_v2025-09-07_2.0.ipynb
 [31mM[m notebooks/FACTR_03_ASR+Diarize_v2025-09-07_2.0.ipynb
 [31mD[m notebooks/FACTR_04_Claims+Embeddings_v2025-09-07_2.0.ipynb
[31m??[m data/processed/CLAIMS.faiss
[31m??[m data/processed/CLAIMS.index.json
[31m??[m data/processed/CLAIMS.rows.min.json
[31m??[m data/processed/CLAIMS_embeddings.meta.json
[31m??[m data/processed/LAST_FAISS.json
[31m??[m data/processed/LAST_SEARCH.json
[31m??[m data/processed/eval_labels.csv
[31m??[m data/processed/eval_labels.gsheet
[31m??[m data/processed/eval_labels_with_text.csv
[31m??[m data/processed/eval_labels_with_text.gsheet
[31m??[m data/processed/eval_main_thesis_top10.csv
[31m??[m data/processed/eval_numerical_claims_top10.csv
[31m??[m data/processed/eval_summary_k10.csv
[31m??[m data/processed/eval_theological_assertion_top10.csv
[31m??[m "data/