# FACTR_02 — KB Ingest (Quran/Hadith/Tafsir/etc.)
Build a ground-truth vector store using the **same embedding model** and **normalisation** as your claims index.

**Outputs** under `data/processed/`:
- `KB_passages.jsonl`
- `KB_embeddings.npy`
- `KB_embeddings.meta.json`
- `KB.faiss`
- `KB.index.json`
- `LAST_KB.json`


In [None]:
# --- Setup
import os, json, re, numpy as np

# Project root; the path looks like a Colab-mounted Google Drive folder.
# NOTE(review): the folder is spelled "FATCR" while the notebook title says
# "FACTR" — confirm this is the intended directory name.
ROOT = "/content/drive/MyDrive/FATCR"
# Directory that receives every artefact this notebook writes.
DATA_DIR = os.path.join(ROOT, "data", "processed")
RAW_KB_DIR = os.path.join(ROOT, "data", "raw", "kb")  # put sources here
# Only the output directory is created here; RAW_KB_DIR is not.
os.makedirs(DATA_DIR, exist_ok=True)
print("RAW_KB_DIR:", RAW_KB_DIR)
print("DATA_DIR:", DATA_DIR)


## Loaders & normalisers

In [None]:
import pandas as pd

def _clean(s):
    import re
    return re.sub(r"\s+", " ", str(s)).strip()

def load_jsonl(path, source):
    """Read a JSON-Lines file and return its records as normalised passage dicts.

    Blank lines are ignored. A record is dropped when neither its ``text``
    nor its ``passage`` field yields non-empty text after whitespace cleanup.

    Args:
        path: Path of the ``.jsonl`` file (read as UTF-8).
        source: Label stored under ``"source"``; also the last-resort value
            for ``"collection"``.

    Returns:
        A list of dicts with the keys ``source``, ``collection``, ``book``,
        ``chapter``, ``verse``, ``number``, ``grade``, ``lang``, ``ref`` and
        ``text``.
    """

    def _to_passage(rec):
        # Returns None for records that carry no usable text.
        body = _clean(rec.get("text") or rec.get("passage") or "")
        if not body:
            return None
        return {
            "source": source,
            "collection": _clean(rec.get("collection") or rec.get("book") or source),
            "book": _clean(rec.get("book") or rec.get("surah") or rec.get("work") or ""),
            "chapter": rec.get("chapter"),
            "verse": rec.get("verse"),
            "number": rec.get("number"),
            "grade": rec.get("grade"),
            "lang": rec.get("lang") or "en",
            "ref": _clean(rec.get("ref") or ""),
            "text": body,
        }

    passages = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            passage = _to_passage(json.loads(raw_line))
            if passage is not None:
                passages.append(passage)
    return passages

def load_csv(path, source):
    """Read a CSV file and return its rows as normalised passage dicts.

    Empty CSV cells are parsed by pandas as NaN, which is truthy. Every
    field is therefore read through ``_cell`` so that a missing value behaves
    like an absent one: ``or`` fallbacks fire, rows without text are skipped
    instead of producing the passage ``"nan"``, and optional fields come out
    as ``None`` rather than NaN.

    Args:
        path: Path of the ``.csv`` file.
        source: Label stored under ``"source"``; also the last-resort value
            for ``"collection"``.

    Returns:
        A list of dicts with the keys ``source``, ``collection``, ``book``,
        ``chapter``, ``verse``, ``number``, ``grade``, ``lang``, ``ref`` and
        ``text``.
    """
    df = pd.read_csv(path)

    def _cell(rec, key):
        # None when the column does not exist or the cell is missing (NaN).
        value = rec.get(key)
        if value is None or pd.isna(value):
            return None
        return value

    rows = []
    for _, r in df.iterrows():
        text = _clean(_cell(r, "text") or _cell(r, "passage") or "")
        if not text:
            continue
        rows.append({
            "source": source,
            "collection": _clean(_cell(r, "collection") or _cell(r, "book") or source),
            "book": _clean(_cell(r, "book") or ""),
            "chapter": _cell(r, "chapter"),
            "verse": _cell(r, "verse"),
            "number": _cell(r, "number"),
            "grade": _cell(r, "grade"),
            "lang": _cell(r, "lang") or "en",
            "ref": _clean(_cell(r, "ref") or ""),
            "text": text,
        })
    return rows

def load_txt_paragraphs(path, source, book=""):
    """Split a plain-text file into paragraph passages.

    Paragraphs are separated by blank lines (good for tafsir/exegesis).
    Each paragraph is whitespace-cleaned, and paragraphs that are empty
    after cleaning are dropped.

    Args:
        path: Path of the ``.txt`` file (read as UTF-8).
        source: Label stored under both ``"source"`` and ``"collection"``.
        book: Value stored under ``"book"`` for every paragraph.

    Returns:
        A list of passage dicts, one per non-empty paragraph, with ``lang``
        fixed to ``"en"`` and the structural fields set to ``None``.
    """
    import re

    # Context manager so the handle is closed even if reading fails.
    with open(path, "r", encoding="utf-8") as fh:
        text = fh.read()

    # Clean each chunk once, then filter on the cleaned value.
    cleaned = (_clean(chunk) for chunk in re.split(r"\n\s*\n", text))
    return [
        {
            "source": source, "collection": source, "book": book,
            "chapter": None, "verse": None, "number": None, "grade": None,
            "lang": "en", "ref": "", "text": para,
        }
        for para in cleaned
        if para
    ]

def harvest_kb(raw_dir):
    """Walk *raw_dir* and load every ``.jsonl``, ``.csv`` and ``.txt`` file found.

    The name of the directory containing a file is used as its ``source``
    label. Files with other extensions are ignored. A file whose loader
    raises is reported on stdout and skipped, so one bad file does not abort
    the harvest.

    Directories and files are visited in sorted order. Row order decides the
    passage ids assigned later, and the raw ``os.walk`` order depends on the
    filesystem, so sorting keeps ids reproducible across runs and machines.

    Args:
        raw_dir: Root directory to scan recursively.

    Returns:
        A list with the passage dicts of all successfully loaded files.
    """
    all_rows = []
    for root, dirs, files in os.walk(raw_dir):
        # Sorting in place makes os.walk descend into subdirectories in order.
        dirs.sort()
        src = os.path.basename(root)
        for name in sorted(files):
            p = os.path.join(root, name)
            lname = name.lower()
            try:
                if lname.endswith(".jsonl"):
                    all_rows += load_jsonl(p, src)
                elif lname.endswith(".csv"):
                    all_rows += load_csv(p, src)
                elif lname.endswith(".txt"):
                    all_rows += load_txt_paragraphs(p, src, book=os.path.splitext(name)[0])
            except Exception as e:
                # Deliberate best-effort: report the file and keep going.
                print("Skipping", p, "->", e)
    return all_rows

# Collect every passage the loaders can parse from RAW_KB_DIR.
kb_rows = harvest_kb(RAW_KB_DIR)
print("Loaded KB rows:", len(kb_rows))
# Bare expression: shows the first two rows when this is the last line of a notebook cell.
kb_rows[:2]


## Persist as `KB_passages.jsonl`

In [None]:
KB_PASS = os.path.join(DATA_DIR, "KB_passages.jsonl")

# One JSON object per line; a row's position in kb_rows becomes its "id".
with open(KB_PASS, "w", encoding="utf-8") as out_fh:
    out_fh.writelines(
        json.dumps({**row, "id": idx}, ensure_ascii=False) + "\n"
        for idx, row in enumerate(kb_rows)
    )

print("Wrote:", KB_PASS, "| count:", len(kb_rows))


## Embed (same model + normalisation as CLAIMS)

In [None]:
# Try Colab Secrets first, then env var
try:
    from google.colab import userdata  # type: ignore
    openai_api_key = userdata.get("OPENAI_API_KEY")
except Exception:
    # Any failure above (for example the google.colab import) lands here and
    # the environment variable is read instead; os.getenv returns None when
    # the variable is not set.
    openai_api_key = os.getenv("OPENAI_API_KEY")

model_name = "text-embedding-3-small"  # keep same as claims
# Recorded in the metadata files; also selects the FAISS index type and the
# SentenceTransformers normalisation flag in the cells below.
was_normalized = True
# Written to the embeddings metadata as "faiss_metric".
metric = "ip"

def _is_openai_model(name: str) -> bool:
    return name.startswith("text-embedding-")

def embed_openai(texts, model, batch_size=96):
    """Embed *texts* with an OpenAI embedding model and L2-normalise the result.

    Requests are sent in batches. Within each batch the returned items are
    ordered by their ``index`` field, so row ``i`` of the result always
    corresponds to ``texts[i]``.

    Args:
        texts: Sequence of strings to embed.
        model: OpenAI embedding model name.
        batch_size: Number of texts sent per request (default 96, the
            previously hard-coded value).

    Returns:
        A float32 array of shape ``(len(texts), dim)`` whose rows are scaled
        to (approximately) unit length. For empty input an array of shape
        ``(0, 0)`` is returned without contacting the API.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    import numpy as np

    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    if len(texts) == 0:
        # Nothing to embed: return a 2-D array, since norm(axis=1) below
        # cannot handle the 1-D array that an empty list would produce.
        return np.zeros((0, 0), dtype="float32")

    from openai import OpenAI

    client = OpenAI(api_key=openai_api_key)
    out = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        resp = client.embeddings.create(model=model, input=batch)
        # Order by the reported position instead of relying on list order.
        ordered = sorted(resp.data, key=lambda item: item.index)
        out.extend(item.embedding for item in ordered)
    arr = np.asarray(out, dtype="float32")
    # Small epsilon keeps an all-zero vector from dividing by zero.
    norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-8
    return arr / norms

def embed_st(texts, model):
    """Embed *texts* with a SentenceTransformers model and return a float32 array.

    The model named by *model* is loaded on every call. Whether the vectors
    are normalised follows the module-level ``was_normalized`` flag.
    """
    from sentence_transformers import SentenceTransformer

    encoder = SentenceTransformer(model)
    vectors = encoder.encode(
        texts,
        convert_to_numpy=True,
        normalize_embeddings=was_normalized,
    )
    # float32 is the dtype the rest of the notebook saves and indexes.
    return vectors.astype("float32")

# Re-read the persisted passages so embedding order matches the ids written to KB_PASS.
with open(KB_PASS, "r", encoding="utf-8") as fh:
    texts = [json.loads(line)["text"] for line in fh if line.strip()]
if not texts:
    # Fail early with a clear message instead of an IndexError on vecs.shape[1].
    raise ValueError(f"No passages found in {KB_PASS}; add sources under {RAW_KB_DIR} and re-run the harvest cell.")

if _is_openai_model(model_name) and openai_api_key:
    print("Using provider: OpenAI")
    vecs = embed_openai(texts, model_name)
else:
    if _is_openai_model(model_name):
        # An OpenAI model was requested but no API key is available: fall
        # back to MiniLM and record the substituted name in the metadata.
        print("Using provider: SentenceTransformers (fallback MiniLM)")
        model_name = "sentence-transformers/all-MiniLM-L6-v2"
    else:
        # A non-OpenAI model was configured on purpose; keep it rather than
        # silently replacing it with MiniLM.
        print("Using provider: SentenceTransformers")
    vecs = embed_st(texts, model_name)

KB_EMB  = os.path.join(DATA_DIR, "KB_embeddings.npy")
np.save(KB_EMB, vecs)
KB_META = os.path.join(DATA_DIR, "KB_embeddings.meta.json")
meta = {"model_name": model_name, "normalized": was_normalized, "faiss_metric": metric,
        "count": len(texts), "dim": int(vecs.shape[1])}
# Context manager so the metadata is flushed and the handle closed before later cells run.
with open(KB_META, "w", encoding="utf-8") as fh:
    json.dump(meta, fh, ensure_ascii=False, indent=2)
print("Saved:", KB_EMB, "|", KB_META, "| shape:", vecs.shape)


## Build FAISS index + identity id_map + `LAST_KB.json` pointer

In [None]:
import faiss, json, os, numpy as np
# Reload from disk so the index is built from exactly what was persisted.
vecs = np.load(KB_EMB).astype("float32")
if was_normalized:
    # In-place L2 normalisation, so inner product on these rows equals cosine similarity.
    faiss.normalize_L2(vecs)
d = vecs.shape[1]
# Inner-product index for normalised vectors, L2 distance otherwise.
index = faiss.IndexFlatIP(d) if was_normalized else faiss.IndexFlatL2(d)
index.add(vecs)
print("index.ntotal:", index.ntotal)

# Identity mapping: FAISS row i corresponds to passage id i in KB_passages.jsonl.
id_map = {str(i): i for i in range(index.ntotal)}
# Check consistency before anything is written, so a mismatch never leaves
# inconsistent artefacts on disk.
assert index.ntotal == len(id_map) == meta["count"], "Inconsistent counts for KB index/id_map/meta"

KB_FAISS = os.path.join(DATA_DIR, "KB.faiss")
KB_MAP   = os.path.join(DATA_DIR, "KB.index.json")
faiss.write_index(index, KB_FAISS)
with open(KB_MAP, "w", encoding="utf-8") as fh:
    json.dump(id_map, fh, ensure_ascii=False, indent=2)

# Pointer file: artefact paths relative to ROOT plus the settings needed to query the index.
LAST_KB = {
    "artefacts": {
        "kb_faiss": os.path.relpath(KB_FAISS, ROOT),
        "kb_index_json": os.path.relpath(KB_MAP, ROOT),
        "kb_passages": os.path.relpath(KB_PASS, ROOT),
    },
    "dim": d, "normalized": was_normalized, "model": model_name
}
with open(os.path.join(DATA_DIR, "LAST_KB.json"), "w", encoding="utf-8") as fh:
    json.dump(LAST_KB, fh, ensure_ascii=False, indent=2)
print("✅ KB index built and consistent.")


## Smoke test

In [None]:
def kb_search(text, k=5):
    """Search the KB index for the passages closest to *text*.

    The query is embedded with the same provider rule as the embedding cell:
    OpenAI when the model name has the OpenAI prefix and an API key is
    available, SentenceTransformers otherwise.

    Args:
        text: Query string.
        k: Maximum number of hits to return.

    Returns:
        A pandas DataFrame with one row per hit, holding ``kb_id``, ``score``
        and the fields of the matching passage. It has fewer than *k* rows
        when the index holds fewer than *k* vectors.
    """
    import pandas as pd

    if _is_openai_model(model_name) and openai_api_key:
        # embed_openai returns already-normalised vectors
        v = embed_openai([text], model_name)
    else:
        v = embed_st([text], model_name)
    scores, ids = index.search(v, k)

    with open(KB_PASS, "r", encoding="utf-8") as fh:
        rows = [json.loads(line) for line in fh if line.strip()]

    out = []
    for s, fid in zip(scores[0], ids[0]):
        fid = int(fid)
        if fid < 0:
            # FAISS pads the result with -1 when fewer than k vectors exist;
            # these have no entry in id_map.
            continue
        out.append({"kb_id": fid, "score": float(s), **rows[id_map[str(fid)]]})
    return pd.DataFrame(out)

# Smoke query: run one search; as the last line of a notebook cell this displays the leading rows of the result.
kb_search("two natures", k=5).head()
