In [1]:
import os
import chromadb
from docling.document_converter import DocumentConverter
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import ollama

# Turn off the HuggingFace tokenizers' parallelism via its environment switch;
# the intent is to suppress some HuggingFace warnings for a cleaner notebook output.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Filesystem locations: the source PDF and the on-disk ChromaDB store
PDF_PATH = "data/raw_pdfs/travel_and_reimbursement_policy.pdf"
DB_PATH = "data/chromadb_store"

# Create the PDF folder and the database folder up front (no-op if they already exist)
for required_dir in (os.path.dirname(PDF_PATH), DB_PATH):
    os.makedirs(required_dir, exist_ok=True)

# Embedding model used to vectorise the chunks; it outputs 384-dimensional vectors
print("Loading embedding model...")
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")

# Persistent Chroma client rooted at DB_PATH, plus the collection the chunks are
# stored in; the collection metadata selects cosine as the HNSW distance space.
print("Initializing ChromaDB...")
chroma_client = chromadb.PersistentClient(path=DB_PATH)
collection_settings = {"hnsw:space": "cosine"}
collection = chroma_client.get_or_create_collection(
    name="corporate_policies",
    metadata=collection_settings,
)

Loading embedding model...
Initializing ChromaDB...


## The chunking strategy:

1. **Parsing the PDF into Markdown**: Using the `Docling` library to read the PDF and convert it directly into Markdown format. By converting it to Markdown, you preserve the structural hierarchy of the corporate policy (e.g., # 1.0 Introduction, ## 1.1 Scope)

2. **Semantic Chunking by Headers**: Uses LangChain to scan the file and split it every time it encounters an H1 (#), H2 (##), or H3 (###). This ensures that concepts stay together. If you have a section on "Flight Reimbursements," all the text under that header is grouped into a single chunk. The MarkdownHeaderTextSplitter also extracts the headers and turns them into dictionary metadata. So, a chunk about economy class flights might automatically get the metadata: {"Header_1": "Travel Policy", "Header_2": "Flights", "Header_3": "Class Types"}.

3. **The Fallback Splitter**: Semantic chunking is great, but what if `## General Rules` is a massive wall of text that is 5,000 characters long? That would exceed the 512-token input limit of the embedding model (BAAI/bge-small-en-v1.5), so everything past the limit would be truncated and never make it into the vector. To prevent this, the code initializes a second splitter (`RecursiveCharacterTextSplitter`) that breaks text into blocks of at most 600 characters, with up to 100 characters of overlap between consecutive blocks. The fallback splitter acts as a safety net, taking those massive semantic sections and chopping them down into manageable, embeddable sizes while maintaining context via the 100-character overlap.


In [3]:
# --- Stage 1: PDF -> Markdown -------------------------------------------------
print(f"Parsing PDF with Docling: {PDF_PATH}")
converter = DocumentConverter()
result = converter.convert(PDF_PATH)
markdown_text = result.document.export_to_markdown()

# --- Stage 2: split on Markdown headings --------------------------------------
print("Chunking text semantically by headers...")
# Each pair is (heading marker, metadata key the heading text is stored under)
headers_to_split_on = [
    ("#", "Header_1"),
    ("##", "Header_2"),
    ("###", "Header_3"),
]
# strip_headers=False keeps the heading line inside the chunk text as well
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,
)
header_splits = markdown_splitter.split_text(markdown_text)

# --- Stage 3: size-bound every section ----------------------------------------
# Sections longer than chunk_size are cut into overlapping character windows
fallback_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100)

# Three parallel lists; position k in each one describes the same chunk
chunks, metadatas, ids = [], [], []
source_name = os.path.basename(PDF_PATH)

for section_idx, section in enumerate(header_splits):
    pieces = fallback_splitter.split_text(section.page_content)
    chunks.extend(pieces)
    # Every piece gets its own copy of the section's header metadata plus the source file name
    metadatas.extend({**section.metadata, "source": source_name} for _ in pieces)
    # IDs encode (section index, piece index within that section)
    ids.extend(f"chunk_{section_idx}_{piece_idx}" for piece_idx in range(len(pieces)))

print(f"Total chunks created: {len(chunks)}")

Parsing PDF with Docling: data/raw_pdfs/travel_and_reimbursement_policy.pdf


[32m[INFO] 2026-02-20 14:54:29,085 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2026-02-20 14:54:29,105 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\lmbmo\.conda\envs\langraph_env\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2026-02-20 14:54:29,107 [RapidOCR] main.py:53: Using C:\Users\lmbmo\.conda\envs\langraph_env\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2026-02-20 14:54:29,264 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2026-02-20 14:54:29,269 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\lmbmo\.conda\envs\langraph_env\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2026-02-20 14:54:29,270 [RapidOCR] main.py:53: Using C:\Users\lmbmo\.conda\envs\langraph_env\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2026-02-20 14:54:29,353 [RapidOCR] base.py:2

Chunking text semantically by headers...
Total chunks created: 289


### Sanity check

In [None]:
import random

num_samples = 5

# Cap the request at the number of chunks that actually exist
sample_size = min(num_samples, len(chunks))
# Draw distinct positions; the same position indexes ids, metadatas and chunks
random_indices = random.sample(range(len(chunks)), sample_size)

print(f"--- Inspecting {sample_size} Random Chunks ---\n")

divider = "=" * 60 + "\n"
sampled_records = ((ids[pos], metadatas[pos], chunks[pos]) for pos in random_indices)
for chunk_id, chunk_meta, chunk_text in sampled_records:
    print(f"Chunk ID : {chunk_id}")
    print(f"Metadata : {chunk_meta}")
    print(f"Content  :\n{chunk_text}")
    print(divider)

--- Inspecting 5 Random Chunks ---

Chunk ID : chunk_32_0
Metadata : {'Header_2': 'Responsibility of Accounts Payable/ Business Office', 'source': 'travel_and_reimbursement_policy.pdf'}
Content  :
## Responsibility of Accounts Payable/ Business Office  
The Accounts Payable Department and the Business Office are responsible for the following:  
- Review of forms and attachments for completeness, accuracy, reasonableness and compliance with government regulations and University policies
- Verification of required approved signatures
- Ensuring proper tax treatment of taxable income items and compliance with IRS regulations

Chunk ID : chunk_14_0
Metadata : {'Header_2': 'Personal use of University-owned fleet vehicles is not permitted under any circumstances.', 'source': 'travel_and_reimbursement_policy.pdf'}
Content  :
## Personal use of University-owned fleet vehicles is not permitted under any circumstances.

Chunk ID : chunk_42_2
Metadata : {'Header_2': 'Special or Higher Risk Activi

In [9]:
# Write the Markdown produced from the PDF to disk so it can be inspected by hand
md_output_path = "data/raw_pdfs/travel_policy_inspected.md"

with open(md_output_path, mode="w", encoding="utf-8") as md_file:
    md_file.write(markdown_text)

## Generating embeddings

In [10]:
print("Generating embeddings...")
# Fail early with an actionable message: without this guard an empty corpus only
# surfaces later as an error raised from inside the vector store call.
if not chunks:
    raise ValueError("No chunks to embed - run the parsing/chunking cell first.")
# chunks / metadatas / ids are parallel lists; a length mismatch means the kernel
# holds stale state from an out-of-order run.
if not (len(chunks) == len(metadatas) == len(ids)):
    raise ValueError(
        f"Mismatched inputs: {len(chunks)} chunks, {len(metadatas)} metadatas, {len(ids)} ids."
    )

# Generate vectors for all chunks
embeddings = embedding_model.encode(chunks).tolist()

print("Upserting into ChromaDB...")
# Chunk IDs are positional (chunk_<section>_<piece>) and upsert never removes
# records, so a re-run after the PDF or the splitter settings changed would leave
# orphaned chunks from the previous run in the persistent collection. Drop the
# records previously stored for the same source file(s) first. This happens only
# after the embeddings were computed, so a failed encode does not wipe the data.
for source_name in sorted({meta["source"] for meta in metadatas if "source" in meta}):
    collection.delete(where={"source": source_name})

# Chroma limits how many records one call may carry (the exact limit depends on
# the installed version), so write in conservative fixed-size batches.
UPSERT_BATCH_SIZE = 1000
for batch_start in range(0, len(chunks), UPSERT_BATCH_SIZE):
    batch_end = batch_start + UPSERT_BATCH_SIZE
    collection.upsert(
        documents=chunks[batch_start:batch_end],
        embeddings=embeddings[batch_start:batch_end],
        metadatas=metadatas[batch_start:batch_end],
        ids=ids[batch_start:batch_end],
    )
print("Database populated successfully!")

Generating embeddings...
Upserting into ChromaDB...
Database populated successfully!
