### Introduction

This notebook embeds all available PDF documents and their citations in a FAISS vector database using LangChain and nomic-embed-text:v1.5. It expands on the previous text-embeddings notebook.

```
ollama pull nomic-embed-text:v1.5
``` 

### Importing and Paths

Change the ROOT path as needed. It should point to the main knowledge pack directory.

In [2]:
# --- A. Imports & config ---
from pathlib import Path
import json, hashlib, uuid, yaml
from typing import List, Dict
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Paths (adapt for your pack root)
# Set the KNOWLEDGE_PACK_ROOT environment variable to point at a different pack
# without editing this cell; the literal below is only the fallback default.
import os

ROOT = Path(
    os.environ.get(
        "KNOWLEDGE_PACK_ROOT",
        '/Users/ktejwani/Personal CS Projects/Summer 2025/Offline AI Kiosk/Offline-AI-Kiosk/hurricane_disaster_response_pack',
    )
).expanduser()
MANIFEST = ROOT / "manifest.yaml"
print(ROOT)
print(MANIFEST)

/Users/ktejwani/Personal CS Projects/Summer 2025/Offline AI Kiosk/Offline-AI-Kiosk/hurricane_disaster_response_pack
/Users/ktejwani/Personal CS Projects/Summer 2025/Offline AI Kiosk/Offline-AI-Kiosk/hurricane_disaster_response_pack/manifest.yaml


### Parsing YAML, Embedding Documents, and Creating Vector Store

NOTES: 
1. The cell below will create a new directory inside the knowledge pack:
- Example: first_aid_pack_demo_v2/vector_db/text/faiss_index <br>
This directory will have the actual .faiss store and index pickle file

2. Existing files will be overwritten: embeddings.jsonl and meta.json under first_aid_pack_demo_v2/vector_db/text, and index.faiss and index.pkl under first_aid_pack_demo_v2/vector_db/text/faiss_index.

In [8]:
from __future__ import annotations

import json
import re
from pathlib import Path
from typing import List, Dict, Any, Tuple, Optional

import yaml

# LangChain docs object (compat across versions)
try:
    from langchain_core.documents import Document
except ImportError:
    from langchain.schema import Document  # older LC

from langchain_ollama import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

# Expect ROOT and MANIFEST to be defined in your environment
# ROOT = Path("first_aid_knowledge_pack_v3")
# MANIFEST = ROOT / "manifest.yaml"

ans = input("Y/N")
# Normalise the answer so "y" or " Y " also confirms; previously anything other than
# an exact "Y" silently skipped the whole build.
if ans.strip().upper() == "Y":
    # === PDF & Markdown chunk & embed (manifest-driven) ===
    # deps for PDFs: pip install pypdf

    # -------------------- 0) Load manifest & resolve paths --------------------
    with open(MANIFEST, "r", encoding="utf-8") as f:
        manifest = yaml.safe_load(f)

    # Embedding + chunking config from manifest (with sensible fallbacks).
    # The "chunking" section is optional: a missing or null section falls back to the
    # defaults below instead of raising KeyError.
    text_cfg = manifest["embedding_config"]["text"]
    chunking_cfg = text_cfg.get("chunking") or {}

    embed_model_name = text_cfg["model"]  # e.g., "granite-embedding:278m"
    normalize   = bool(text_cfg.get("normalize", True))
    max_tokens  = int(chunking_cfg.get("max_tokens", 512))
    overlap_toks= int(chunking_cfg.get("overlap_tokens", 64))

    # very rough char≈token conversion for character-based splitter
    TOK_TO_CHAR = 4  # tune if your corpora are very different
    chunk_size_chars = max(64, max_tokens * TOK_TO_CHAR)
    overlap_chars    = max(0,  overlap_toks * TOK_TO_CHAR)

    # Resolve precomputed index paths from manifest (all relative to ROOT)
    text_idx_cfg        = manifest["precomputed_indices"]["text"]
    embeddings_path     = ROOT / text_idx_cfg["embeddings"]             # "vector_db/text/embeddings.jsonl"
    meta_path           = ROOT / text_idx_cfg["meta"]                   # "vector_db/text/meta.json"
    faiss_dir           = ROOT / text_idx_cfg["faiss"]["dir"]           # "vector_db/text/faiss_index"
    faiss_index_path    = ROOT / text_idx_cfg["faiss"]["index"]         # ".../index.faiss"
    faiss_docstore_path = ROOT / text_idx_cfg["faiss"]["docstore"]      # ".../index.pkl"

    # Create every output directory up front so the later writes cannot fail on a missing parent.
    faiss_dir.mkdir(parents=True, exist_ok=True)
    embeddings_path.parent.mkdir(parents=True, exist_ok=True)
    meta_path.parent.mkdir(parents=True, exist_ok=True)

    print("Using FAISS dir:", faiss_dir)
    print("Embeddings JSONL:", embeddings_path)
    print("Meta JSON:", meta_path)

    # -------------------- 1) Extraction helpers --------------------
    # PDFs
    from pypdf import PdfReader

    def extract_pdf_pages(pdf_path: Path) -> List[Tuple[int, str]]:
        """Read a PDF and return one (page_number, text) tuple per page.

        Page numbers start at 1. Non-breaking spaces are replaced with regular
        spaces and the text is stripped; a page with no extractable text
        yields an empty string.
        """
        reader = PdfReader(str(pdf_path))
        return [
            (page_no, (page.extract_text() or "").replace("\u00A0", " ").strip())
            for page_no, page in enumerate(reader.pages, start=1)
        ]

    # Markdown
    FM_RE = re.compile(r"^\s*---\s*\n(.*?)\n---\s*\n?", re.DOTALL)

    def strip_markdown_syntax(md: str) -> str:
        """Lightweight Markdown-to-text conversion intended for embedding input.

        Removes fenced code blocks (including their contents), inline-code
        backticks, image/link markup (the label is kept, the URL is dropped),
        heading markers, emphasis markers, blockquote and list markers, and
        table pipes, then collapses runs of spaces/tabs and of blank lines.

        Line-anchored patterns only consume spaces and tabs, never newlines, so
        the blank line in front of a heading, list or quote survives and the
        splitter can still break on paragraph boundaries. Underscores inside
        words (e.g. snake_case identifiers) are left alone.

        Args:
            md: Raw Markdown text, without front matter.

        Returns:
            The plain-text rendering, stripped of leading/trailing whitespace.
        """
        # remove fenced code blocks together with their contents
        md = re.sub(r"```.*?```", "", md, flags=re.DOTALL)
        # remove inline code backticks
        md = md.replace("`", "")
        # images/links: keep the label, drop the URL
        md = re.sub(r"!\[([^\]]*)\]\([^\)]*\)", r"\1", md)
        md = re.sub(r"\[([^\]]+)\]\([^\)]*\)", r"\1", md)
        # headings ([ \t] rather than \s so the preceding blank line is not eaten)
        md = re.sub(r"^[ \t]{0,3}#{1,6}[ \t]*", "", md, flags=re.MULTILINE)
        # bold/italics: the opener must be followed by, and the closer preceded by,
        # non-whitespace, and both delimiters must be the same run. This keeps list
        # bullets ("* item") and spaced operators ("2 * 3") from pairing up.
        # Underscore emphasis additionally must not touch a letter/digit on the
        # outside, so snake_case words are untouched. Nested emphasis such as
        # **bold *it* more** needs more than one pass.
        for _ in range(3):
            before = md
            md = re.sub(r"(\*{1,3})(?=\S)([^*]+?)(?<=\S)\1", r"\2", md)
            md = re.sub(
                r"(?<![^\W_])(_{1,3})(?=\S)([^_]+?)(?<=\S)\1(?![^\W_])", r"\2", md
            )
            if md == before:
                break
        # blockquotes / lists / tables pipes
        md = re.sub(r"^[ \t]{0,3}>[ \t]?", "", md, flags=re.MULTILINE)
        md = re.sub(r"^[ \t]*[-*+][ \t]+", "", md, flags=re.MULTILINE)
        md = re.sub(r"^[ \t]*\d+\.[ \t]+", "", md, flags=re.MULTILINE)
        md = md.replace("|", " ")
        # collapse whitespace
        md = re.sub(r"[ \t]+", " ", md)
        md = re.sub(r"\n{3,}", "\n\n", md)
        return md.strip()

    def extract_markdown_blocks(md_path: Path) -> Tuple[Optional[dict], str]:
        """
        Read a Markdown file and split it into front matter and plain-text body.

        Args:
            md_path: Path of the Markdown file to read.

        Returns:
            (front_matter, plain_text_body). front_matter is None when the file
            has no leading '---' block, the parsed YAML (or {} when it is empty)
            when it does, and {"_parse_error": "front_matter"} when the block
            cannot be parsed. The body has the front matter removed and its
            Markdown syntax stripped.
        """
        # utf-8-sig drops a leading byte-order mark. Decoded as plain utf-8 the BOM
        # stays in front of the opening '---', FM_RE fails to match, and the front
        # matter ends up inside the embedded text.
        raw = md_path.read_text(encoding="utf-8-sig", errors="ignore")
        fm_match = FM_RE.match(raw)
        front = None
        if fm_match:
            try:
                front = yaml.safe_load(fm_match.group(1)) or {}
            except Exception:
                # Best effort: a malformed header must not abort the whole build,
                # so record a marker and still embed the body.
                front = {"_parse_error": "front_matter"}
            raw = raw[fm_match.end():]
        text = strip_markdown_syntax(raw)
        return front, text

    # -------------------- 2) Chunker (character-based with overlap) --------------------
    # Sizes are in characters (length_function=len); separators are passed from
    # coarsest (paragraph break) to finest (single space).
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size_chars,
        chunk_overlap=overlap_chars,
        length_function=len,
        separators=["\n\n", "\n", "। ", ". ", "?", "!", " "],
    )

    def chunk_text(text: str) -> List[str]:
        """Split text with the shared splitter and return the stripped, non-empty pieces.

        Falsy input (empty string or None) yields an empty list.
        """
        pieces: List[str] = []
        if text:
            for raw_piece in splitter.split_text(text):
                trimmed = raw_piece.strip()
                if trimmed:
                    pieces.append(trimmed)
        return pieces

    # -------------------- 3) Build Documents (PDF + MD aware) --------------------
    # Accumulator for every chunk Document produced by the manifest walk.
    docs: List[Document] = []

    # Pack-level fields read from the manifest.
    pack_name, pack_ver = manifest.get("name"), manifest.get("version")
    pack_locales = manifest.get("locales", [])

    # Lookup table so a file's citation ids can be expanded to full citation records.
    citations_by_id = {}
    for citation in manifest.get("citations", []):
        citations_by_id[citation["id"]] = citation

    # Counters reported in the summary line after the walk.
    num_pdf_files = num_md_files = num_skipped_empty = 0

    def infer_locale(path_str: str, default: str = "en") -> str:
        """Infer a chunk's locale from the directory components of its path.

        Returns "hi_en" when any directory in the path is named exactly
        "hi_en", otherwise `default`. Components are compared individually, so
        a relative path that starts with "hi_en/" and backslash-separated
        paths are recognised too. The final component (the file name) is
        ignored.

        Args:
            path_str: Manifest-relative or absolute file path.
            default: Locale to return when no locale directory is found.
        """
        # Keep your existing heuristic; extend if you add more
        directories = re.split(r"[\\/]+", path_str)[:-1]
        return "hi_en" if "hi_en" in directories else default

    # Walk every topic's core files and turn each PDF/Markdown file into chunk Documents.
    for topic in manifest.get("index_of_topics", []):
        topic_id = topic["id"]

        for fmeta in topic.get("core_files", []):
            fpath = ROOT / fmeta["path"]
            if not fpath.exists():
                print("! Skipping missing file:", fpath)
                continue

            # Classify by the declared media type, falling back to the file extension.
            suffix = fpath.suffix.lower()
            media_type = (fmeta.get("media_type") or suffix.lstrip(".")).lower()
            is_pdf = media_type == "pdf" or suffix == ".pdf"
            is_md = media_type in {"md", "markdown"} or suffix in {".md", ".markdown"}
            if not is_pdf and not is_md:
                # not a core text doc type we embed here
                continue

            locale = infer_locale(fmeta["path"], fmeta.get("locale", "en"))
            # Expand citation ids to full records; ids missing from the manifest are dropped.
            c_full = [
                citations_by_id[cid]
                for cid in fmeta.get("citations", [])
                if cid in citations_by_id
            ]

            # Running chunk number across the whole file (not reset per page).
            file_chunk_counter = 0

            if is_pdf:
                pages = extract_pdf_pages(fpath)
                if all(not p_txt for _, p_txt in pages):
                    print(f"! PDF has no extractable text (scanned images?): {fpath}")
                    num_skipped_empty += 1
                    continue

                for page_num, page_text in pages:
                    for piece in chunk_text(page_text):
                        pdf_metadata = {
                            "pack_name": pack_name,
                            "pack_version": pack_ver,
                            "topic_id": topic_id,
                            "file_id": fmeta["id"],
                            "path": str(fmeta["path"]),
                            "media_type": "pdf",
                            "locale": locale,
                            "citations": c_full,
                            "page": page_num,
                            "chunk_index": file_chunk_counter,
                            "chunk_id": f"{fmeta['id']}::p{page_num}::chunk::{file_chunk_counter}",
                            "doc_type": "pdf",
                        }
                        docs.append(Document(page_content=piece, metadata=pdf_metadata))
                        file_chunk_counter += 1

                num_pdf_files += 1
                continue

            # Anything that reaches this point is Markdown.
            front_matter, body_text = extract_markdown_blocks(fpath)
            if not body_text.strip():
                print(f"! Markdown empty after stripping syntax: {fpath}")
                num_skipped_empty += 1
                continue

            for piece in chunk_text(body_text):
                md_metadata = {
                    "pack_name": pack_name,
                    "pack_version": pack_ver,
                    "topic_id": topic_id,
                    "file_id": fmeta["id"],
                    "path": str(fmeta["path"]),
                    "media_type": "md",
                    "locale": locale,
                    "citations": c_full,
                    "page": None,  # no pages for MD
                    "front_matter": front_matter or {},
                    "chunk_index": file_chunk_counter,
                    "chunk_id": f"{fmeta['id']}::md::chunk::{file_chunk_counter}",
                    "doc_type": "markdown",
                }
                docs.append(Document(page_content=piece, metadata=md_metadata))
                file_chunk_counter += 1

            num_md_files += 1

    summary = (
        f"Prepared {len(docs)} text chunks "
        f"from {num_pdf_files} PDFs and {num_md_files} MD files "
        f"(skipped empty/scan-only: {num_skipped_empty})"
    )
    print(summary)

    # -------------------- 4) Embeddings + FAISS persist --------------------
    if not docs:
        # An index cannot be built from zero vectors; fail with a clear message
        # instead of an obscure error from deep inside the vector store.
        raise ValueError(
            "No text chunks were prepared, so there is nothing to embed. "
            "Check the manifest's index_of_topics/core_files entries."
        )

    emb = OllamaEmbeddings(model=embed_model_name)

    # Embed every chunk exactly once and reuse the vectors for both the FAISS index
    # and the JSONL export. (Previously each chunk was embedded a second time, one
    # request per chunk, just to write the export.)
    # NOTE(review): `normalize` is recorded in meta.json below but is not applied to
    # the vectors anywhere in this cell - confirm that is intended.
    texts = [doc.page_content for doc in docs]
    metadatas = [doc.metadata for doc in docs]
    vectors = emb.embed_documents(texts)

    vs = FAISS.from_embeddings(
        text_embeddings=list(zip(texts, vectors)),
        embedding=emb,
        metadatas=metadatas,
    )
    vs.save_local(str(faiss_dir))  # writes index.faiss + index.pkl (overwrites if they exist)

    # sanity check
    assert faiss_index_path.exists(), f"Missing {faiss_index_path}"
    assert faiss_docstore_path.exists(), f"Missing {faiss_docstore_path}"
    print("FAISS artifacts saved ✅", faiss_index_path.name, "&", faiss_docstore_path.name)

    # -------------------- 5) Export JSONL embeddings + meta (portable) --------------------
    # Index position i holds docs[i]; index_to_docstore_id maps that position to the
    # id the vector store assigned to the chunk.
    records = [
        {
            "id": vs.index_to_docstore_id[position],
            "embedding": vector,  # dim depends on your Ollama embedding model
            "metadata": doc.metadata,
            "text": doc.page_content,
        }
        for position, (doc, vector) in enumerate(zip(docs, vectors))
    ]

    with open(embeddings_path, "w", encoding="utf-8") as f:
        for r in records:
            # default=str: YAML front matter may contain dates, which json cannot
            # serialise natively.
            f.write(json.dumps(r, ensure_ascii=False, default=str) + "\n")

    # Prefer the dimension declared in the manifest; fall back to the measured one
    # so meta.json never records a null dim.
    declared_dim = manifest["embedding_config"]["text"].get("dim")

    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "model": embed_model_name,
                "dim": declared_dim if declared_dim is not None else len(vectors[0]),
                "normalize": normalize,
                "count": len(records),
                "pack": {"name": pack_name, "version": pack_ver, "locales": pack_locales},
                "chunking": {
                    "max_tokens": max_tokens,
                    "overlap_tokens": overlap_toks,
                    "approx_chars_per_token": TOK_TO_CHAR,
                    "chunk_size_chars": chunk_size_chars,
                    "overlap_chars": overlap_chars,
                },
                "supported_media_types": ["pdf", "md", "markdown"],
            },
            f,
            ensure_ascii=False,
            indent=2,
        )

    print("JSONL/meta saved ✅")


Using FAISS dir: /Users/ktejwani/Personal CS Projects/Summer 2025/Offline AI Kiosk/Offline-AI-Kiosk/hurricane_disaster_response_pack/vector_db/text/faiss_index
Embeddings JSONL: /Users/ktejwani/Personal CS Projects/Summer 2025/Offline AI Kiosk/Offline-AI-Kiosk/hurricane_disaster_response_pack/vector_db/text/embeddings.jsonl
Meta JSON: /Users/ktejwani/Personal CS Projects/Summer 2025/Offline AI Kiosk/Offline-AI-Kiosk/hurricane_disaster_response_pack/vector_db/text/meta.json
Prepared 263 text chunks from 8 PDFs and 8 MD files (skipped empty/scan-only: 0)
FAISS artifacts saved ✅ index.faiss & index.pkl
JSONL/meta saved ✅


### Testing it out

In [10]:
# Smoke test: fetch the 10 nearest chunks for a sample query and print a short
# summary of each. Requires `vs` from the build cell above.
query = "Show me evac center"
retriever = vs.as_retriever(search_kwargs={"k": 10})
hits = retriever.invoke(query)

for rank, hit in enumerate(hits, start=1):
    meta = hit.metadata
    body = hit.page_content
    print(f"\n[{rank}]")
    print("Topic:", meta["topic_id"])
    print("File:", meta["file_id"])
    print("Locale:", meta["locale"])
    print("Citations:", [citation["title"] for citation in meta.get("citations", [])])
    print("Chunk text:")
    # Show at most the first 300 characters, with an ellipsis when truncated.
    print(body[:300], "..." if len(body) > 300 else "")

# Another sample query to try:
# hits = retriever.invoke("tourniquet steps")



[1]
Topic: flooding-storm-safety
File: all-hazard-preparedness
Locale: en
Citations: ['Pinellas County: All-Hazard Preparedness Guide']
Chunk text:
PAGE 8
   ALL HAZARDS PREPAREDNESS GUIDE    ALL HAZARDS PREPAREDNESS GUIDE
PINELLAS COUNTY EVACUATION ZONE MAP
E
A
B
C
D
Up to 35’
Up to 28’
Up to 20’
Up to 15’
Up to 11’
Potential Surge Heights (in feet)
Areas shown in white are non-evacuation zones.
Surge height will vary depending on ground eleva ...

[2]
Topic: hurricane-readiness
File: all-hazard-guide
Locale: en
Citations: ['Pinellas County: All-Hazard Preparedness Guide (Liberty copy)']
Chunk text:
You can volunteer to help staff an Emergency Evacuation Shelter. The shelters  
need assistance once an evacuation is called and the shelters open, as well as during 
the storm and possibly weeks afterward, depending on the damage to personal 
property. Volunteers are trained to help with registrati ...

[3]
Topic: flooding-storm-safety
File: all-hazard-preparedness
Locale: en
Citations: 

:)