# Stage 2: Semantic chunking with Gemma embeddings

This notebook loads local PDFs, converts them into LangChain `Document` objects, and applies semantic chunking using the Gemma embedding model exposed via the local OpenAI-compatible endpoint.

**What you need running**
- Docker Model Runner exposing embeddings at `http://localhost:12434/engines/v1`
- Model name: `ai/embeddinggemma`
- PDFs located at `C:\Users\Admin\Desktop\rag-hootone\data\uploads\PDF`

Adjust the chunking thresholds as needed to tune chunk sizes for your corpus.


In [1]:
# If needed, install dependencies (uncomment to run in a fresh environment).
# !pip install -q "langchain>=0.2" "langchain-community>=0.2" "langchain-text-splitters>=0.2" "openai>=1.50" "pymupdf" "qdrant-client" "langchain-qdrant"
# use uv add langchain langchain-community langchain-text-splitters openai pymupdf qdrant-client langchain-qdrant


In [2]:
from __future__ import annotations

import os
from pathlib import Path
from typing import List

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
# from langchain_text_splitters import SemanticChunker
from langchain_experimental.text_splitter import SemanticChunker


# Endpoint/model configuration
MODEL_RUNNER_BASE_URL = os.getenv("MODEL_RUNNER_BASE_URL", "http://localhost:12434/engines/v1")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "docker")
EMBED_MODEL_NAME = "ai/embeddinggemma"

# Data location
PDF_DIR = Path(r"C:\Users\Admin\Desktop\rag-hootone\data\uploads\PDF")


In [3]:
import re
import unicodedata
import tempfile
import fitz


def _strip_control_chars(text: str) -> str:
    # Remove non-printable control chars excluding common whitespace
    return "".join(ch for ch in text if ch.isprintable() or ch in {"\n", "\t", " "})


def _collapse_whitespace(text: str) -> str:
    # Collapse runs of whitespace and trim per line
    lines = [re.sub(r"\s+", " ", line).strip() for line in text.splitlines()]
    return "\n".join(line for line in lines if line)


class PDFPreprocessor:
    """Clean PDF text and emit a temporary sanitized PDF for downstream loading."""

    def __init__(self, output_dir: Path | None = None):
        # Use a temp directory if none provided
        self.output_dir = Path(output_dir) if output_dir else Path(tempfile.gettempdir()) / "cleaned_pdfs"
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def clean_pdf(self, pdf_path: Path) -> Path:
        """Return path to cleaned PDF copy."""
        cleaned_path = self.output_dir / pdf_path.name
        with fitz.open(pdf_path) as src_doc, fitz.open() as new_doc:
            for page in src_doc:
                text = page.get_text("text")
                text = unicodedata.normalize("NFKC", text)
                text = _strip_control_chars(text)
                text = _collapse_whitespace(text)

                # Preserve page size; place cleaned text in a textbox covering the page
                rect = page.rect
                new_page = new_doc.new_page(width=rect.width, height=rect.height)
                new_page.insert_textbox(rect, text, fontsize=11, fontname="helv")

            new_doc.save(cleaned_path)
        return cleaned_path



In [5]:
CLEANED_PDF_DIR = PDF_DIR / "cleaned"
CLEANED_PDF_DIR.mkdir(parents=True, exist_ok=True)

pdf_preprocessor = PDFPreprocessor(output_dir=CLEANED_PDF_DIR)



### PDF preprocessing steps
- Normalize Unicode to NFKC to harmonize accents and symbols.
- Strip control characters while keeping newlines/tabs needed for structure.
- Collapse repeated whitespace and blank lines to reduce noise.
- Re-emit a cleaned PDF copy (page sizes preserved) for `PyMuPDFLoader`.



## Load PDFs into LangChain Documents
We load every PDF in the target directory with `PyMuPDFLoader`, which produces LangChain `Document` objects containing both page text and metadata (source path, page number).


In [6]:
def load_pdfs(pdf_dir: Path, preprocessor: PDFPreprocessor | None = None) -> List[Document]:
    if not pdf_dir.exists():
        raise FileNotFoundError(f"PDF directory not found: {pdf_dir}")

    documents: List[Document] = []
    for pdf_path in sorted(pdf_dir.glob("*.pdf")):
        target_path = preprocessor.clean_pdf(pdf_path) if preprocessor else pdf_path
        loader = PyMuPDFLoader(str(target_path))
        docs = loader.load()
        documents.extend(docs)
    if not documents:
        raise FileNotFoundError(f"No PDFs found in {pdf_dir}")
    return documents

raw_documents = load_pdfs(PDF_DIR, preprocessor=pdf_preprocessor)
print(f"Loaded {len(raw_documents)} pages from {PDF_DIR}")
# Peek at the first document metadata
raw_documents[0]


Loaded 7 pages from C:\Users\Admin\Desktop\rag-hootone\data\uploads\PDF


Document(metadata={'producer': '', 'creator': '', 'creationdate': '', 'source': 'C:\\Users\\Admin\\Desktop\\rag-hootone\\data\\uploads\\PDF\\cleaned\\Data[1].pdf', 'file_path': 'C:\\Users\\Admin\\Desktop\\rag-hootone\\data\\uploads\\PDF\\cleaned\\Data[1].pdf', 'total_pages': 7, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='REPRESENTATIVES DATA ? TEXT ONLY\nHeadquarters (India)\nName: Hootone Remedies\nAddress: 3A, Pitru Chaya, Beside Mehboob Film Studio, 23C, Hill Road, Bandra (W),\nMumbai, Maharashtra 400050, India.\nPhone: +919594000093\nEmail: info@hootone.org\nNorthern Nigeria\n? Representative Name: Pharm. Kabir Hamza Kankara\n? Address: No. 7, Gabasawa Qtrs, Kwado, Off Mani Road, Katsina, Nigeria.\n? Phone/WhatsApp: +2348036785766\n? Email: kabirkt2003@gmail.com\nSouthern Nigeria\n? Representative Name: Hootone Pharmacy Limited\n? Address: 182 Wetheral Road,

In [7]:
raw_documents

[Document(metadata={'producer': '', 'creator': '', 'creationdate': '', 'source': 'C:\\Users\\Admin\\Desktop\\rag-hootone\\data\\uploads\\PDF\\cleaned\\Data[1].pdf', 'file_path': 'C:\\Users\\Admin\\Desktop\\rag-hootone\\data\\uploads\\PDF\\cleaned\\Data[1].pdf', 'total_pages': 7, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='REPRESENTATIVES DATA ? TEXT ONLY\nHeadquarters (India)\nName: Hootone Remedies\nAddress: 3A, Pitru Chaya, Beside Mehboob Film Studio, 23C, Hill Road, Bandra (W),\nMumbai, Maharashtra 400050, India.\nPhone: +919594000093\nEmail: info@hootone.org\nNorthern Nigeria\n? Representative Name: Pharm. Kabir Hamza Kankara\n? Address: No. 7, Gabasawa Qtrs, Kwado, Off Mani Road, Katsina, Nigeria.\n? Phone/WhatsApp: +2348036785766\n? Email: kabirkt2003@gmail.com\nSouthern Nigeria\n? Representative Name: Hootone Pharmacy Limited\n? Address: 182 Wetheral Road

## Semantic chunking with Gemma embeddings
`SemanticChunker` uses the embedding model to find natural breakpoints. Tune the thresholds to trade off chunk granularity vs. coherence.


In [17]:
# Adjust this percentile to control chunk granularity.
# Lower values -> more/smaller chunks; higher values -> fewer/larger chunks.
BREAKPOINT_PERCENTILE = 75

embedding_model = OpenAIEmbeddings(
    model=EMBED_MODEL_NAME,
    base_url=MODEL_RUNNER_BASE_URL,
    api_key=OPENAI_API_KEY,
)

chunker = SemanticChunker(
    embedding_model,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=BREAKPOINT_PERCENTILE,
)


In [18]:
chunked_documents = chunker.split_documents(raw_documents)

print(f"Chunks created: {len(chunked_documents)}")

# Basic stats
lengths = [len(doc.page_content) for doc in chunked_documents]
print(
    f"Mean length: {sum(lengths)/len(lengths):.1f} chars | "
    f"Min: {min(lengths)} | Max: {max(lengths)}"
)

# Preview a sample chunk
sample_idx = min(3, len(chunked_documents) - 1)
chunked_documents[sample_idx]


Chunks created: 50
Mean length: 193.0 chars | Min: 1 | Max: 1012


Document(metadata={'producer': '', 'creator': '', 'creationdate': '', 'source': 'C:\\Users\\Admin\\Desktop\\rag-hootone\\data\\uploads\\PDF\\cleaned\\Data[1].pdf', 'file_path': 'C:\\Users\\Admin\\Desktop\\rag-hootone\\data\\uploads\\PDF\\cleaned\\Data[1].pdf', 'total_pages': 7, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='Phone/WhatsApp: +2348036785766\n? Email: kabirkt2003@gmail.com\nSouthern Nigeria\n? Representative Name: Hootone Pharmacy Limited\n?')

## Persist chunks to Qdrant
Use the running Qdrant instance (see `docker-compose.yml`) to store the semantic chunks. The collection name is `rag-hootone`.


In [19]:
from qdrant_client import QdrantClient
from langchain_qdrant import QdrantVectorStore

QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")  # leave unset if not configured
QDRANT_COLLECTION = "rag-hootone"

client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

vector_store = QdrantVectorStore.from_documents(
    documents=chunked_documents,
    embedding=embedding_model,
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    collection_name=QDRANT_COLLECTION,
    prefer_grpc=False, # switch to True if using gRPC at 6334, if you want to use HTTP then keep it False
) 

print(f"Upserted {len(chunked_documents)} chunks into Qdrant collection '{QDRANT_COLLECTION}' at {QDRANT_URL}")


Upserted 50 chunks into Qdrant collection 'rag-hootone' at http://localhost:6333


## Next steps
- Adjust `BREAKPOINT_PERCENTILE` (e.g., 85–98) if chunks are too small or too large.
- Persist `chunked_documents` to your vector store of choice (e.g., Chroma, PGVector, LanceDB) using the same `embedding_model` for consistency.
- Incorporate these chunks into your retrieval pipeline and LLM workflow.
