### Introduction

This notebook embeds all available PDF documents and their citations into a FAISS vector database using LangChain and nomic-embed-text:v1.5. It expands on the previous text-embeddings notebook.

```
ollama pull nomic-embed-text:v1.5
``` 

### Importing and Paths

Change the ROOT path as needed. It should point to the main knowledge pack directory.

In [4]:
# --- A. Imports & config ---
from pathlib import Path
import json, hashlib, uuid, yaml
from typing import List, Dict
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Paths (adapt for your pack root).
# ROOT must point at the top-level knowledge-pack directory, i.e. the one that
# contains manifest.yaml. Set the KNOWLEDGE_PACK_ROOT environment variable to use
# a different location without editing the machine-specific default below.
import os

_DEFAULT_ROOT = '/Users/ktejwani/Personal CS Projects/Summer 2025/Offline AI Kiosk/Offline-AI-Kiosk/first_aid_knowledge_pack_v3'
# `or` (rather than a .get default) so an empty variable also falls back to the default.
ROOT = Path(os.environ.get("KNOWLEDGE_PACK_ROOT") or _DEFAULT_ROOT).expanduser()
MANIFEST = ROOT / "manifest.yaml"
print(ROOT)
print(MANIFEST)

/Users/ktejwani/Personal CS Projects/Summer 2025/Offline AI Kiosk/Offline-AI-Kiosk/first_aid_knowledge_pack_v3
/Users/ktejwani/Personal CS Projects/Summer 2025/Offline AI Kiosk/Offline-AI-Kiosk/first_aid_knowledge_pack_v3/manifest.yaml


### Parsing YAML, Embedding Documents, and Creating Vector Store

NOTES: 
1. The cell below will create a new directory inside the knowledge pack:
- Example: first_aid_pack_demo_v2/vector_db/text/faiss_index <br>
This directory will have the actual .faiss store and index pickle file

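2. Existing outputs will be overwritten: embeddings.jsonl and meta.json (e.g. under first_aid_pack_demo_v2/vector_db/text/) and the FAISS files index.faiss and index.pkl (under first_aid_pack_demo_v2/vector_db/text/faiss_index). The exact locations are read from the manifest.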

In [10]:
from __future__ import annotations

import json
from pathlib import Path
from typing import List, Dict, Any, Tuple

import yaml

# LangChain docs object (compat across versions)
try:
    from langchain_core.documents import Document
except ImportError:
    from langchain.schema import Document  # older LC

from langchain_ollama import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

# Building re-embeds every chunk and overwrites the saved index, so confirm first.
# "Y", "y", "yes" (any casing, surrounding whitespace ignored) run the build;
# any other answer skips it.
ans = input("Y/N")
if ans.strip().lower() in {"y", "yes"}:
    # === PDF chunk & embed (manifest-driven) ===
    # deps: pip install pypdf

    # -------------------- 0) Load manifest & resolve paths --------------------
    with open(MANIFEST, "r", encoding="utf-8") as f:
        manifest = yaml.safe_load(f)

    # Embedding + chunking config from manifest (with sensible fallbacks).
    # The model name is required; the chunking section and its keys are optional.
    text_cfg = manifest["embedding_config"]["text"]
    chunk_cfg = text_cfg.get("chunking") or {}  # missing or null section -> defaults below

    embed_model_name = text_cfg["model"]  # e.g., "nomic-embed-text:v1.5"
    # NOTE(review): in this notebook `normalize` is only written to meta.json;
    # nothing in the build applies it to the vectors.
    normalize    = bool(text_cfg.get("normalize", True))
    max_tokens   = int(chunk_cfg.get("max_tokens", 512))
    overlap_toks = int(chunk_cfg.get("overlap_tokens", 64))

    # very rough char≈token conversion for character-based splitter
    TOK_TO_CHAR = 4  # tune if your corpora are very different
    chunk_size_chars = max(64, max_tokens * TOK_TO_CHAR)
    overlap_chars    = max(0,  overlap_toks * TOK_TO_CHAR)

    # Resolve precomputed index paths from manifest (all relative to ROOT)
    text_idx_cfg        = manifest["precomputed_indices"]["text"]
    embeddings_path     = ROOT / text_idx_cfg["embeddings"]             # "vector_db/text/embeddings.jsonl"
    meta_path           = ROOT / text_idx_cfg["meta"]                   # "vector_db/text/meta.json"
    faiss_dir           = ROOT / text_idx_cfg["faiss"]["dir"]           # "vector_db/text/faiss_index"
    faiss_index_path    = ROOT / text_idx_cfg["faiss"]["index"]         # ".../index.faiss"
    faiss_docstore_path = ROOT / text_idx_cfg["faiss"]["docstore"]      # ".../index.pkl"

    # Create the output directories up front so the later writes cannot fail on a missing parent.
    faiss_dir.mkdir(parents=True, exist_ok=True)
    embeddings_path.parent.mkdir(parents=True, exist_ok=True)
    meta_path.parent.mkdir(parents=True, exist_ok=True)

    print("Using FAISS dir:", faiss_dir)
    print("Embeddings JSONL:", embeddings_path)
    print("Meta JSON:", meta_path)

    # -------------------- 1) PDF extraction helpers --------------------
    from pypdf import PdfReader

    def extract_pdf_pages(pdf_path: Path) -> List[Tuple[int, str]]:
        """Read a PDF and pair each page's text with its 1-based page number.

        Non-breaking spaces are replaced by plain spaces and the text is stripped.
        A page with no extractable text is reported as an empty string, so the
        returned list always has one entry per page.
        """
        pdf = PdfReader(str(pdf_path))
        return [
            (page_no, (pg.extract_text() or "").replace("\u00A0", " ").strip())
            for page_no, pg in enumerate(pdf.pages, start=1)
        ]

    # -------------------- 2) Chunker (character-based with overlap) --------------------
    # Separators are tried in order: paragraph break, line break, sentence end
    # (Devanagari danda, then Latin punctuation), and finally a single space.
    # The trailing "" is the last-resort fallback: a run of text that contains none
    # of the earlier separators is cut character by character, so no chunk exceeds
    # chunk_size. Without it such a run is emitted as one oversized chunk.
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", "। ", ". ", "?", "!", " ", ""],
        chunk_size=chunk_size_chars,
        chunk_overlap=overlap_chars,
        length_function=len,  # sizes are measured in characters, not tokens
    )

    def chunk_page_text(page_text: str) -> List[str]:
        """Split one page's text with ``splitter`` and return the stripped, non-empty chunks.

        An empty page yields an empty list.
        """
        pieces: List[str] = []
        if page_text:
            for raw in splitter.split_text(page_text):
                cleaned = raw.strip()
                if cleaned:
                    pieces.append(cleaned)
        return pieces

    # -------------------- 3) Build Documents (PDF-aware) --------------------
    # Walk every topic -> core file in the manifest, extract each PDF page by page,
    # chunk the page text, and wrap each chunk in a Document carrying pack/topic/file
    # metadata. Keys that are present but null in the YAML are treated as empty.
    docs: List[Document] = []
    pack_name    = manifest.get("name")
    pack_ver     = manifest.get("version")
    pack_locales = manifest.get("locales", [])

    # map citation id -> full citation object (preserves your manifest’s structure)
    citations_by_id = {c["id"]: c for c in (manifest.get("citations") or [])}

    num_files = 0          # PDFs that contributed at least one chunk
    num_skipped_empty = 0  # PDFs with no extractable text on any page

    for topic in manifest.get("index_of_topics") or []:
        topic_id = topic["id"]
        for fmeta in topic.get("core_files") or []:
            # Only process PDFs (by media_type or extension).
            # `or ""` also covers an explicit null media_type, which .get's default does not.
            media_type = (fmeta.get("media_type") or "").lower()
            fpath = ROOT / fmeta["path"]
            if not fpath.exists():
                print("! Skipping missing file:", fpath)
                continue
            if not (media_type == "pdf" or fpath.suffix.lower() == ".pdf"):
                # skip non-PDFs in this cell; handle elsewhere if needed
                continue

            pages = extract_pdf_pages(fpath)
            if not any(p_txt for _, p_txt in pages):
                print(f"! PDF has no extractable text (scanned images?): {fpath}")
                num_skipped_empty += 1
                continue

            # crude locale inference: keep your existing heuristic
            locale = "hi_en" if "/hi_en/" in fmeta["path"] else fmeta.get("locale", "en")

            # Expand citation ids into full citation objects. Ids that are not declared
            # in the manifest are reported instead of being dropped silently.
            cited_ids = fmeta.get("citations") or []
            unknown_ids = [cid for cid in cited_ids if cid not in citations_by_id]
            if unknown_ids:
                print(f"! Unknown citation ids for {fmeta['id']}: {unknown_ids}")
            c_full = [citations_by_id[cid] for cid in cited_ids if cid in citations_by_id]

            # Counter runs across all pages of the file, so chunk_id is unique per file.
            file_chunk_counter = 0
            for page_num, page_text in pages:
                for piece in chunk_page_text(page_text):
                    docs.append(
                        Document(
                            page_content=piece,
                            metadata={
                                "pack_name": pack_name,
                                "pack_version": pack_ver,
                                "topic_id": topic_id,
                                "file_id": fmeta["id"],
                                "path": str(fmeta["path"]),
                                "media_type": media_type or "pdf",
                                "locale": locale,
                                "citations": c_full,
                                "page": page_num,
                                "chunk_index": file_chunk_counter,
                                "chunk_id": f"{fmeta['id']}::p{page_num}::chunk::{file_chunk_counter}",
                                "doc_type": "pdf",
                            },
                        )
                    )
                    file_chunk_counter += 1

            num_files += 1

    print(f"Prepared {len(docs)} PDF chunks from {num_files} PDFs (skipped empty-text PDFs: {num_skipped_empty})")

    # -------------------- 4) Embeddings + FAISS persist --------------------
    # Fail early with a clear message: building a FAISS store from an empty list
    # otherwise fails inside LangChain with an opaque IndexError, and nothing
    # useful could be saved anyway.
    if not docs:
        raise ValueError(
            "No PDF chunks were prepared, so there is nothing to embed. "
            "Check the manifest file paths and that the PDFs contain extractable text."
        )

    # Local Ollama embeddings; the model name comes from the manifest.
    emb = OllamaEmbeddings(model=embed_model_name)

    vs = FAISS.from_documents(docs, emb)
    vs.save_local(str(faiss_dir))  # writes index.faiss + index.pkl (overwrites if they exist)

    # sanity check: the files the manifest points at must be the ones just written
    assert faiss_index_path.exists(), f"Missing {faiss_index_path}"
    assert faiss_docstore_path.exists(), f"Missing {faiss_docstore_path}"
    print("FAISS artifacts saved ✅", faiss_index_path.name, "&", faiss_docstore_path.name)

    # -------------------- 5) Export JSONL embeddings + meta (portable) --------------------
    # Vectors are read back from the FAISS index that was just built rather than
    # sending every chunk to the embedding model a second time. Rows are visited in
    # index order through the store's public row -> doc-id mapping, and each record
    # is written as soon as it is built so the full set is never held in memory.
    num_records = 0
    with open(embeddings_path, "w", encoding="utf-8") as f:
        for row, doc_id in sorted(vs.index_to_docstore_id.items()):
            doc = vs.docstore.search(doc_id)
            # reconstruct() returns the stored float32 vector; tolist() makes it JSON-friendly.
            vec = vs.index.reconstruct(int(row)).tolist()
            rec = {
                "id": doc_id,
                "embedding": vec,
                "metadata": doc.metadata,
                "text": doc.page_content,
            }
            # default=str keeps the export from failing on values YAML parses into
            # non-JSON types (e.g. dates inside citation objects).
            f.write(json.dumps(rec, ensure_ascii=False, default=str) + "\n")
            num_records += 1

    # Keep the manifest's declared dim when present; otherwise record the real one.
    declared_dim = manifest["embedding_config"]["text"].get("dim")
    actual_dim = vs.index.d
    if declared_dim is not None and declared_dim != actual_dim:
        print(f"! Manifest declares dim={declared_dim} but the index holds dim={actual_dim}")

    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "model": embed_model_name,
                "dim": declared_dim if declared_dim is not None else actual_dim,
                "normalize": normalize,
                "count": num_records,
                "pack": {"name": pack_name, "version": pack_ver, "locales": pack_locales},
                "chunking": {
                    "max_tokens": max_tokens,
                    "overlap_tokens": overlap_toks,
                    "approx_chars_per_token": TOK_TO_CHAR,
                    "chunk_size_chars": chunk_size_chars,
                    "overlap_chars": overlap_chars,
                },
            },
            f,
            ensure_ascii=False,
            indent=2,
        )

    print("JSONL/meta saved ✅")


### Testing it out

In [17]:
# Typical retriever usage
# `vs` only exists in memory when the build cell above was run and answered "Y".
# On a fresh kernel, or when the build was skipped, load the index saved on disk.
if "vs" not in globals():
    with open(MANIFEST, "r", encoding="utf-8") as f:
        _manifest = yaml.safe_load(f)
    _query_emb = OllamaEmbeddings(model=_manifest["embedding_config"]["text"]["model"])
    vs = FAISS.load_local(
        str(ROOT / _manifest["precomputed_indices"]["text"]["faiss"]["dir"]),
        _query_emb,
        # index.pkl is a pickle file: only load an index that you built yourself or trust.
        allow_dangerous_deserialization=True,
    )

retriever = vs.as_retriever(search_kwargs={"k": 10})
query = "Subdivisional Police Officers contacts S.D.P.O., Forbesganj?"  #
hits = retriever.invoke(query)

for i, d in enumerate(hits, 1):
    print(f"\n[{i}]")
    print("Topic:", d.metadata["topic_id"])
    print("File:", d.metadata["file_id"])
    print("Locale:", d.metadata["locale"])
    # Fall back to the citation id so a citation without a title does not abort the loop.
    print("Citations:", [c.get("title", c.get("id")) for c in d.metadata.get("citations", [])])
    print("Chunk text:")
    print(d.page_content[:300], "..." if len(d.page_content) > 300 else "")

# Filter to a topic or locale by passing a metadata filter to the retriever, e.g.:
# retriever = vs.as_retriever(search_kwargs={"k": 10, "filter": {"topic_id": "bleeding"}})
# hits = retriever.invoke("tourniquet steps")



[1]
Topic: bleeding
File: bleeding-ircs-manual
Locale: hi_en
Citations: ['First Aid Manual — Indian Red Cross Society']
Chunk text:
New Delhi – 110001 
Ph.: 011-23322237 
       011-23720143 
       011-23717789 
 
45 The Commissioner 
Tamil Nadu Brigade Wing St John 
Ambulance Association (India),  
No. 1, Mayor V.R. Ramanathan Road 
(East), Chetput,  
Chennai – 600031 
 
40 The Commissioner 
Haryana Brigade Wing St. John 
Ambu ...

[2]
Topic: bleeding
File: bleeding-ircs-manual
Locale: hi_en
Citations: ['First Aid Manual — Indian Red Cross Society']
Chunk text:
350 
37 Chief Medical Director, Honorary 
Secretary & The Addl. Commissioner 
Northern Railway Centre St. John 
Ambulance Association (India), 
Baroda House, Medical Dept, 1st 
Floor,  
New Delhi – 110001 
 
43 The Commissioner 
Punjab Brigade Wing St. John 
Ambulance Association (India), 
Punjab Re ...

[3]
Topic: bleeding
File: bleeding-ircs-manual
Locale: hi_en
Citations: ['First Aid Manual — Indian Red Cross Society']
Chu

:)