In [1]:
# Build combined text+metadata, embed with OpenAI, and save artifacts for FAISS search
# !pip install -q pandas numpy faiss-cpu openai==1.*

import os
import json
import time
import hashlib
import numpy as np
import pandas as pd
import faiss
from openai import OpenAI

# Configuration
INPUT_CSV = "CHUNKS_DB.csv"           # expects columns: id, text, metadata
CACHE_DIR = "rag_cache"
EMBED_MODEL = "text-embedding-3-small"  # 1536-dim vectors
BATCH_SIZE = 2048

if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError("Set OPENAI_API_KEY in your environment before running this cell.")

os.makedirs(CACHE_DIR, exist_ok=True)
INDEX_PATH = os.path.join(CACHE_DIR, "faiss.index")
IDMAP_PATH = os.path.join(CACHE_DIR, "id_map.npy")
TEXTS_PATH = os.path.join(CACHE_DIR, "rag_texts.jsonl")
MANIFEST_PATH = os.path.join(CACHE_DIR, "manifest.json")

# 1) Load data
chunks_df = pd.read_csv(INPUT_CSV)
required_cols = {"id", "text", "metadata"}
if not required_cols.issubset(set(chunks_df.columns)):
    raise ValueError(f"Input must contain columns: {sorted(required_cols)}")

# 2) Build a single combined field for RAG (text + JSON-serialized metadata)
def _normalize_text(value):
    try:
        if pd.isna(value):
            return ""
    except Exception:
        pass
    return str(value).strip()

def _metadata_to_string(value):
    if isinstance(value, dict):
        return json.dumps(value, ensure_ascii=False, sort_keys=True)
    if isinstance(value, str):
        v = value.strip()
        try:
            parsed = json.loads(v)
            if isinstance(parsed, dict):
                return json.dumps(parsed, ensure_ascii=False, sort_keys=True)
        except Exception:
            pass
        return v
    try:
        if pd.isna(value):
            return ""
    except Exception:
        pass
    return str(value)

chunks_df["rag_text"] = (
    chunks_df["text"].apply(_normalize_text)
    + "\n\nMETADATA: "
    + chunks_df["metadata"].apply(_metadata_to_string)
)

rag_texts = chunks_df["rag_text"].astype(str).tolist()
ids = chunks_df["id"].to_numpy()

# 3) Compute dataset hash to enable caching across runs
joined = "\n".join(rag_texts)
dataset_hash = hashlib.sha256(joined.encode("utf-8")).hexdigest()

# If cache exists and matches, skip embedding
use_cache = (
    os.path.exists(INDEX_PATH)
    and os.path.exists(IDMAP_PATH)
    and os.path.exists(TEXTS_PATH)
    and os.path.exists(MANIFEST_PATH)
)
if use_cache:
    try:
        with open(MANIFEST_PATH, "r", encoding="utf-8") as f:
            manifest = json.load(f)
        if manifest.get("dataset_hash") == dataset_hash and manifest.get("embed_model") == EMBED_MODEL:
            print("Cache up to date. Skipping embedding and index rebuild.")
        else:
            use_cache = False
    except Exception:
        use_cache = False

client = OpenAI()

if not use_cache:
    # 4) Create embeddings via OpenAI in batches
    vectors = []
    for start in range(0, len(rag_texts), BATCH_SIZE):
        end = min(start + BATCH_SIZE, len(rag_texts))
        resp = client.embeddings.create(model=EMBED_MODEL, input=rag_texts[start:end])
        vectors.extend([d.embedding for d in resp.data])
        print(f"Embedded {end}/{len(rag_texts)}")

    emb = np.array(vectors, dtype=np.float32)
    # Normalize for cosine similarity using inner product in FAISS
    emb = emb / np.linalg.norm(emb, axis=1, keepdims=True)

    # 5) Build FAISS index and save artifacts
    dim = emb.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(emb)

    faiss.write_index(index, INDEX_PATH)
    np.save(IDMAP_PATH, ids)
    with open(TEXTS_PATH, "w", encoding="utf-8") as f:
        for _id, txt in zip(ids, rag_texts):
            f.write(json.dumps({"id": int(_id), "rag_text": txt}, ensure_ascii=False) + "\n")

    manifest = {
        "dataset_hash": dataset_hash,
        "embed_model": EMBED_MODEL,
        "num_vectors": int(index.ntotal),
        "dim": int(dim),
        "created_at": int(time.time()),
        "source": os.path.abspath(INPUT_CSV),
    }
    with open(MANIFEST_PATH, "w", encoding="utf-8") as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)

    print(f"Built and saved FAISS index with {index.ntotal} vectors (dim={dim}).")
    print(f"Artifacts saved to: {CACHE_DIR}")
else:
    print("Artifacts already exist and match current data. Nothing to do.")



Cache up to date. Skipping embedding and index rebuild.
Artifacts already exist and match current data. Nothing to do.
