### Introduction

This notebook takes the manifest and uses it to embed all image captions available avaible in a FAISS vector databse with langchain and granite-embedding:30m

```
ollama pull granite-embedding:30m
``` 

### Importing and Paths

Change the ROOT  paths as needed. It should point to to the main knowledge pack dir.

In [2]:
# --- A. Imports & config ---
from pathlib import Path
import json, hashlib, uuid, yaml
from typing import List, Dict
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Paths (adapt for your pack root)
ROOT = Path("/Users/ktejwani/Personal CS Projects/Summer 2025/Offline AI Kiosk/Offline-AI-Kiosk/first_aid_pack_demo_v2")
MANIFEST = ROOT / "manifest.yaml"
print(ROOT)
print(MANIFEST)

/Users/ktejwani/Personal CS Projects/Summer 2025/Offline AI Kiosk/Offline-AI-Kiosk/first_aid_pack_demo_v2
/Users/ktejwani/Personal CS Projects/Summer 2025/Offline AI Kiosk/Offline-AI-Kiosk/first_aid_pack_demo_v2/manifest.yaml


### Parsing YAML, Embedding Documents, and Creating Vector Store

NOTES: 
1. Below cell will create a new directory inisde the knolwedge pack:
- Example: first_aid_pack_demo_v2/vector_db/text/faiss_index <br>
This directory will have the actual .faiss store and index pickle file

2. embeddings.jsonl, index.bin, and meta.json under first_aid_pack_demo_v2/vector_db/text/faiss_index will be overwritten.

In [None]:
with open(MANIFEST, "r", encoding="utf-8") as f:
    manifest = yaml.safe_load(f)

# Pull embedding config from manifest
embed_model_name = manifest["embedding_config"]["images"]["model"]     # e.g., "granite-embedding:278m"
normalize = bool(manifest["embedding_config"]["images"].get("normalize", True))

# Resolve precomputed index paths from manifest
images_idx_cfg   = manifest["precomputed_indices"]["images"]
embeddings_path = ROOT / images_idx_cfg["embeddings"]                  # "vector_db/images/embeddings.jsonl"
meta_path       = ROOT / images_idx_cfg["meta"]                        # "vector_db/images/meta.json"
faiss_dir       = ROOT / images_idx_cfg["faiss"]["dir"]                # "vector_db/images/faiss_index"
faiss_index_path    = ROOT / images_idx_cfg["faiss"]["index"]          # ".../index.faiss"
faiss_docstore_path = ROOT / images_idx_cfg["faiss"]["docstore"]       # ".../index.pkl"
captions_path = ROOT / images_idx_cfg["captions"]


faiss_dir.mkdir(parents=True, exist_ok=True)
embeddings_path.parent.mkdir(parents=True, exist_ok=True)
meta_path.parent.mkdir(parents=True, exist_ok=True)
captions_path.parent.mkdir(parents=True, exist_ok=True)

print("Using FAISS dir:", faiss_dir)
print("Embeddings JSONL:", embeddings_path)
print("Meta JSON:", meta_path)

# ---------- 1) Embeddings ----------
from langchain_ollama import OllamaEmbeddings

# Local Ollama embeddings (granite)
emb = OllamaEmbeddings(model=embed_model_name)


# ---------- 2) Build LangChain Documents with rich metadata ----------
docs: List[Document] = []
pack_name    = manifest["name"]
pack_ver     = manifest["version"]
pack_locales = manifest["locales"]

# citation id -> full object (already built)
citations = {c["id"]: c for c in manifest.get("citations", [])}

assets = manifest.get("assets", [])
for a in assets:
    caption = (a.get("image_description") or a.get("alt_text") or "").strip()
    tags    = " ".join(a.get("tags", [])).strip()
    text    = " | ".join(t for t in [caption, tags] if t)

    # --- expand image citation IDs to full objects (like text pipeline) ---
    c_ids  = a.get("citations", []) or []
    c_full = [citations[cid] for cid in c_ids if cid in citations]


    docs.append(Document(
        page_content=text,
        metadata={
            "pack_name": pack_name,
            "pack_version": pack_ver,
            "id": a["id"],
            "path": a["path"],
            "media_type": a.get("media_type", "image/png"),
            "locale": a.get("locale", "hi_en"),
            # Keep BOTH for convenience:
            "citation_ids": c_ids,   # raw IDs as in manifest
            "citations": c_full,     # expanded objects (title/url/license)
        }
    ))


print(f"Prepared {len(docs)} image-caption docs")


# ---------- 3) Create FAISS & persist (no extra copies) ----------
from langchain_community.vectorstores import FAISS

vs = FAISS.from_documents(docs, emb)
vs.save_local(str(faiss_dir))  # writes index.faiss + index.pkl (overwrites if they exist)

# sanity check
assert faiss_index_path.exists(), f"Missing {faiss_index_path}"
assert faiss_docstore_path.exists(), f"Missing {faiss_docstore_path}"
print("FAISS artifacts saved ✅", faiss_index_path.name, "&", faiss_docstore_path.name)

# ---------- 4) Export JSONL embeddings + meta (portable) ----------
# Note: this re-embeds each chunk for JSONL output. For big corpora, cache vectors during creation.
records = []
# Access the docstore that FAISS is holding (doc_id -> Document)
doc_items = getattr(vs.docstore, "_dict", {})  # internal but commonly used

for doc_id, doc in doc_items.items():
    vec = emb.embed_query(doc.page_content)  
    rec = {
        "id": doc_id,
        "embedding": vec,
        "metadata": doc.metadata,
        "text": doc.page_content
    }
    records.append(rec)

with open(embeddings_path, "w", encoding="utf-8") as f:
    for r in records:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

with open(meta_path, "w", encoding="utf-8") as f:
    json.dump({
        "model": embed_model_name,
        "dim": manifest["embedding_config"]["images"]["dim"],
        "normalize": normalize,
        "count": len(records),
        "pack": {"name": pack_name, "version": pack_ver, "locales": pack_locales}
    }, f, ensure_ascii=False, indent=2)

n = 0
with open(captions_path, "w", encoding="utf-8") as f:
    for doc_id, doc in doc_items.items():
        md = doc.metadata or {}
        # Build a compact record with helpful fields
        rec = {
            "pack_name": pack_name,
            "pack_version": pack_ver,
            "id": doc_id,                               
            "asset_id": md.get("id"),                  
            "path": md.get("path"),
            "locale": md.get("locale", "hi_en"),
            "media_type": md.get("media_type", "image/png"),
            "citation_ids": md.get("citation_ids", []),
            "citation_titles": [c.get("title", "") for c in md.get("citations", [])],
            "text": doc.page_content                    # caption 
        }
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
        n += 1

print(f"Captions JSONL saved ✅ {captions_path} ({n} rows)")

print("JSONL/meta saved ✅")


Using FAISS dir: /Users/ktejwani/Personal CS Projects/Summer 2025/Offline AI Kiosk/Offline-AI-Kiosk/first_aid_pack_demo_v2/vector_db/text/faiss_index
Embeddings JSONL: /Users/ktejwani/Personal CS Projects/Summer 2025/Offline AI Kiosk/Offline-AI-Kiosk/first_aid_pack_demo_v2/vector_db/text/embeddings.jsonl
Meta JSON: /Users/ktejwani/Personal CS Projects/Summer 2025/Offline AI Kiosk/Offline-AI-Kiosk/first_aid_pack_demo_v2/vector_db/text/meta.json
Prepared 11 chunks
FAISS artifacts saved ✅ index.faiss & index.pkl
JSONL/meta saved ✅


### Testing it out

In [28]:
# Typical retriever usage
retriever = vs.as_retriever(search_kwargs={"k": 4})  # if you used Option A 'vs'
query = "What to do for bleeding?"  #
hits = retriever.invoke(query)

for i, d in enumerate(hits, 1):
    print(d)
    print(f"\n[{i}]")
    print("Topic:", d.metadata["topic_id"])
    print("File:", d.metadata["file_id"])
    print("Locale:", d.metadata["locale"])
    print("Citations:", [c["title"] for c in d.metadata.get("citations", [])])
    print("Chunk text:")
    print(d.page_content[:300], "..." if len(d.page_content) > 300 else "")

# Filter to a topic or locale:
hits = retriever.invoke("tourniquet steps", filter={"topic_id": "bleed-control", "locale": "en"})


page_content='# Severe Bleeding Control
Severe bleeding can quickly become life-threatening if not controlled.  
Apply firm direct pressure with a clean cloth or sterile gauze.  
If bleeding soaks through, add more cloths without removing the first.  
Elevate the injured limb if possible while maintaining pressure.  
Use a tourniquet if direct pressure fails and bleeding is from an arm or leg.  
Note the time a tourniquet is applied and do not remove it until in a hospital.  
Check the person’s airway, breathing, and circulation while giving first aid.  
Reassure the patient and keep them warm to prevent shock.  
Transport urgently to the nearest medical facility if bleeding does not stop.  
Do not apply mud, ash, or unclean materials to the wound.' metadata={'pack_name': 'Eastern Bihar – First Aid & Community Health | पूर्वी बिहार – प्राथमिक चिकित्सा', 'pack_version': '0.3.1', 'topic_id': 'bleed-control', 'file_id': 'guide-bleed-overview', 'path': 'core/bleed-control/hi_en/bleeding_co

:)