# In [None]:  (notebook cell marker, kept as a comment so the file parses as Python)
# filename: nlp_chunking_demo.py
import re
from typing import List, Tuple, Optional

# --- NLP libraries ---
import spacy
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer
# Optional: for markdown heading parsing (cleaner section boundaries)
from markdown_it import MarkdownIt

# Load the spaCy "en_core_web_sm" pipeline once at import time; the
# spaCy-based sentence splitter below reuses this module-level object.
nlp = spacy.load("en_core_web_sm")

# Load the Hugging Face tokenizer for "gpt2" once at import time. It is used
# below to count tokens and to encode/decode token windows; any other model
# name could be substituted for the demo.
hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")

# -----------------------------
# Sample larger text for demos
# -----------------------------
# Markdown-formatted demo document (level 1-3 headings followed by
# multi-sentence paragraphs). The __main__ demo passes it to every chunker.
SAMPLE_TEXT = """
# Introduction

In 2024, enterprises accelerated the adoption of retrieval-augmented generation (RAG) to unlock knowledge embedded in sprawling document collections. 
However, real-world content is messy: PDFs mix tables and prose, wikis interleave code blocks with narratives, and emails include replies quoted in-line. 
Robust chunking strategies are essential to make downstream retrieval both accurate and efficient.

This guide surveys foundational chunking approaches, explains trade-offs, and shows how to implement them. 
We illustrate with a fictional case study of a product team documenting a cross-platform release. 
The goal is to choose chunk boundaries that preserve meaning while enabling fast indexing.

## Background

RAG systems attach embeddings to chunks. Embeddings are most effective when chunks are semantically coherent and not too long for the model's context window. 
Choosing chunk sizes is a balancing act: larger chunks capture more context but risk dilution; smaller chunks increase recall but may fragment meaning.

### Key Terms

Token: a unit used by language models; not the same as a word. Context window: the max tokens a model can process at once. Overlap: repeated text between adjacent chunks to preserve continuity.

# Chapter 1 — Requirements

The product targets desktop and mobile. The primary user stories include reading, annotating, and sharing rich documents. 
Accessibility must meet WCAG criteria. Localization covers five languages: English, Hindi, Telugu, Spanish, and German. 
Performance requirements specify cold-start under two seconds on mid-tier devices.

Paragraph one elaborates on security: the system enforces least-privilege, audits admin actions, and encrypts data at rest and in transit. 
Paragraph two explains telemetry: we record opt-in usage metrics, minimize personally identifiable information (PII), and provide dashboards.

The final paragraph enumerates integration points: identity (SSO), storage (cloud + on-prem), and collaboration (comments, mentions, notifications).

# Chapter 2 — Architecture

We adopt a modular architecture. The client layer handles UI and offline caching; the sync layer reconciles edits; the service layer persists data. 
An event bus connects modules via publish-subscribe.

Design decisions: we prefer eventual consistency where strict ordering is costly. 
We use idempotent operations so retries are safe. 
We separate read paths from write paths to optimize latencies.

Failures are expected: we budget for partial outages and degrade gracefully. 
Monitoring includes traces, metrics, and logs. 
We practice chaos testing to discover hidden coupling.

## Data Model

Documents contain sections, paragraphs, and blocks (text, table, image). 
Each block references its parent and maintains version history. 
Indexes store derived views for search.

# Appendix — Release Notes

v1.0.0 (Jan 2025): Initial release with editor, comments, and export to PDF. 
v1.1.0 (Mar 2025): Added offline mode, improved accessibility, and telemetry opt-in. 
v1.2.0 (Jun 2025): Introduced collaboration mentions and richer notifications.

Known issues: large tables render slowly on older devices. 
Workarounds: switch to compact table view, or split tables into multiple blocks.
"""

# -----------------------------
# Utilities
# -----------------------------
def add_overlap_windows(items: List[str], max_items: int, overlap: int) -> List[List[str]]:
    """Group ``items`` into consecutive windows that share ``overlap`` items.

    Each window holds up to ``max_items`` items and starts
    ``max_items - overlap`` positions after the previous one. The last window
    ends exactly at the end of ``items`` and may be shorter than ``max_items``.

    Args:
        items: Sequence to window. An empty list yields an empty result.
        max_items: Maximum number of items per window; must be at least 1.
        overlap: Number of trailing items repeated at the start of the next
            window. Values of zero or less mean no overlap. Values of
            ``max_items`` or more are treated as ``max_items - 1`` so that
            every window still advances by at least one item.

    Returns:
        A list of windows, each a list slice of ``items``.

    Raises:
        ValueError: If ``max_items`` is less than 1.
    """
    if max_items < 1:
        raise ValueError("max_items must be at least 1")

    # Guarantee forward progress: without the lower bound of 1 an overlap
    # greater than or equal to the window size would never advance.
    step = max(1, max_items - max(overlap, 0))

    total = len(items)
    windows: List[List[str]] = []
    start = 0
    while start < total:
        stop = min(start + max_items, total)
        windows.append(items[start:stop])
        if stop == total:
            # The window already reaches the end; a further window would only
            # repeat the tail.
            break
        start += step
    return windows

def preview(text: str, length: int = 220) -> str:
    """Return a one-line preview of ``text``.

    Leading/trailing whitespace is removed and every run of whitespace is
    collapsed to a single space. If the result is longer than ``length``
    characters it is cut to ``length`` characters and ``"..."`` is appended.
    """
    flattened = re.sub(r"\s+", " ", text.strip())
    if len(flattened) <= length:
        return flattened
    return flattened[:length] + "..."

# -----------------------------
# 1) Fixed-Length Chunking (characters)
# -----------------------------
def chunk_by_fixed_chars(text: str, max_chars: int = 800, overlap: int = 80) -> List[str]:
    """Split ``text`` into character windows of at most ``max_chars``.

    Consecutive chunks share ``overlap`` characters. The last chunk ends at
    the end of ``text`` and may be shorter than ``max_chars``.

    Args:
        text: Text to split. An empty string yields an empty list.
        max_chars: Maximum chunk length in characters; must be at least 1.
        overlap: Number of characters repeated between adjacent chunks.
            Values of zero or less mean no overlap. Values of ``max_chars``
            or more are treated as ``max_chars - 1`` so that every chunk
            still advances by at least one character.

    Returns:
        The chunks in document order.

    Raises:
        ValueError: If ``max_chars`` is less than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")

    # Guarantee forward progress: an overlap as large as the window would
    # otherwise keep restarting at the same position forever.
    step = max(1, max_chars - max(overlap, 0))

    total = len(text)
    chunks: List[str] = []
    start = 0
    while start < total:
        stop = min(start + max_chars, total)
        chunks.append(text[start:stop])
        if stop == total:
            break
        start += step
    return chunks

# -----------------------------
# 1b) Fixed-Length Chunking (tokens via HF tokenizer)
# -----------------------------
def chunk_by_fixed_tokens_hf(text: str, max_tokens: int = 350, overlap: int = 50) -> List[str]:
    """Split ``text`` into windows of at most ``max_tokens`` tokenizer tokens.

    The text is encoded with the module-level ``hf_tokenizer``; each window of
    token ids is decoded back to a string. Consecutive chunks share
    ``overlap`` tokens, and the last chunk may be shorter than ``max_tokens``.

    Args:
        text: Text to split.
        max_tokens: Maximum number of token ids per chunk; must be at least 1.
        overlap: Number of token ids repeated between adjacent chunks.
            Values of zero or less mean no overlap. Values of ``max_tokens``
            or more are treated as ``max_tokens - 1`` so that every chunk
            still advances by at least one token.

    Returns:
        The decoded chunks in document order. If the encoded text has no
        token ids the result is empty.

    Raises:
        ValueError: If ``max_tokens`` is less than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be at least 1")

    # Guarantee forward progress: an overlap as large as the window would
    # otherwise re-emit the same window forever.
    step = max(1, max_tokens - max(overlap, 0))

    ids = hf_tokenizer.encode(text)
    total = len(ids)
    chunks: List[str] = []
    start = 0
    while start < total:
        stop = min(start + max_tokens, total)
        chunks.append(hf_tokenizer.decode(ids[start:stop]))
        if stop == total:
            break
        start += step
    return chunks

# -----------------------------
# 2) Sentence-Based Chunking (spaCy or NLTK)
# -----------------------------
def sentences_spacy(text: str) -> List[str]:
    """Split ``text`` into sentences with the module-level spaCy pipeline.

    Each sentence is stripped of surrounding whitespace; sentences that are
    empty after stripping are omitted.
    """
    result = []
    for sentence in nlp(text).sents:
        cleaned = sentence.text.strip()
        if cleaned:
            result.append(cleaned)
    return result

def sentences_nltk(text: str) -> List[str]:
    """Split ``text`` into sentences with NLTK's ``sent_tokenize``.

    Whitespace runs (including newlines) are collapsed to single spaces before
    tokenizing. Each sentence is stripped, and empty results are omitted.
    """
    # Collapse all whitespace first so line breaks do not influence the split.
    single_line = re.sub(r"\s+", " ", text.strip())
    result = []
    for sentence in sent_tokenize(single_line):
        cleaned = sentence.strip()
        if cleaned:
            result.append(cleaned)
    return result

def chunk_by_sentences(text: str, max_sentences: int = 6, overlap: int = 2, use_spacy: bool = True) -> List[str]:
    """Chunk ``text`` into groups of sentences with sentence-level overlap.

    Sentences come from ``sentences_spacy`` when ``use_spacy`` is true and
    from ``sentences_nltk`` otherwise. They are grouped by
    ``add_overlap_windows`` and each group is joined with single spaces.
    """
    if use_spacy:
        sentence_list = sentences_spacy(text)
    else:
        sentence_list = sentences_nltk(text)

    chunks = []
    for group in add_overlap_windows(sentence_list, max_sentences, overlap):
        chunks.append(" ".join(group))
    return chunks

# -----------------------------
# 3) Paragraph-Based Chunking
# (Structure split + optional token capping inside)
# -----------------------------
def split_paragraphs(text: str) -> List[str]:
    """Split ``text`` into paragraphs separated by blank lines.

    A separator is a newline, optional whitespace, and another newline. Each
    paragraph is stripped of surrounding whitespace and empty pieces are
    dropped; single newlines inside a paragraph are kept.
    """
    paragraphs = []
    for piece in re.split(r"\n\s*\n", text.strip()):
        trimmed = piece.strip()
        if trimmed:
            paragraphs.append(trimmed)
    return paragraphs

def chunk_by_paragraphs(text: str, max_paragraphs: int = 2, overlap: int = 1, max_tokens_cap: Optional[int] = None) -> List[str]:
    """Chunk ``text`` into groups of paragraphs, optionally capped by tokens.

    Paragraphs from ``split_paragraphs`` are grouped by
    ``add_overlap_windows`` and each group is joined with a blank line.

    When ``max_tokens_cap`` is truthy, any chunk whose ``hf_tokenizer``
    encoding is longer than the cap is replaced by the pieces returned by
    ``chunk_by_fixed_tokens_hf`` (window size ``max_tokens_cap``, overlap of
    one tenth of the cap, floored and never negative). A cap of ``None`` or
    ``0`` disables capping.
    """
    groups = add_overlap_windows(split_paragraphs(text), max_paragraphs, overlap)
    joined = ["\n\n".join(group) for group in groups]

    # No cap requested: the paragraph groups are the final chunks.
    if not max_tokens_cap:
        return joined

    result = []
    for piece in joined:
        if len(hf_tokenizer.encode(piece)) > max_tokens_cap:
            # Too long for the cap: fall back to fixed-size token windows.
            pieces = chunk_by_fixed_tokens_hf(
                piece,
                max_tokens=max_tokens_cap,
                overlap=max(0, max_tokens_cap // 10),
            )
            result.extend(pieces)
        else:
            result.append(piece)
    return result

# -----------------------------
# 4) Section-Based Chunking (Markdown headings)
# -----------------------------
def split_sections_markdown(text: str) -> List[Tuple[str, str]]:
    """Split Markdown ``text`` into ``(heading, content)`` sections.

    The text is parsed with markdown-it-py and the flat token stream is walked
    once. Every heading token starts a new section whose heading is the
    stripped inline source text of that heading. The pieces of content that
    follow (paragraph text, plus the content of fenced code, indented code and
    raw HTML blocks) are joined with newlines to form the section content.

    Args:
        text: Markdown source to split.

    Returns:
        Sections in document order. Content that precedes the first heading
        is returned under the heading ``"Document"`` (only if it is
        non-empty). A heading with nothing under it yields an empty content
        string.
    """
    # Block tokens that carry their text directly in ``content`` instead of in
    # a nested ``inline`` token; skipping them would silently drop code
    # samples and raw HTML from the section content.
    literal_block_types = ("fence", "code_block", "html_block")

    sections: List[Tuple[str, str]] = []
    heading: Optional[str] = None
    body: List[str] = []

    def flush(title: Optional[str], lines: List[str]) -> None:
        """Record one finished section, if there is anything to record."""
        content = "\n".join(lines).strip()
        if title is not None:
            sections.append((title, content))
        elif content:
            # Text before the first heading.
            sections.append(("Document", content))

    # A shared iterator lets the inner loops consume the tokens that belong
    # to a heading or paragraph, so the outer loop resumes after the close.
    stream = iter(MarkdownIt().parse(text))
    for tok in stream:
        if tok.type == "heading_open":
            flush(heading, body)
            body = []
            heading = ""
            for inner in stream:
                if inner.type == "heading_close":
                    break
                if inner.type == "inline":
                    heading = inner.content.strip()
        elif tok.type == "paragraph_open":
            parts = []
            for inner in stream:
                if inner.type == "paragraph_close":
                    break
                if inner.type == "inline":
                    parts.append(inner.content)
            if parts:
                body.append(" ".join(parts))
        elif tok.type == "inline":
            # Inline token that is not wrapped in a paragraph or heading.
            body.append(tok.content)
        elif tok.type in literal_block_types:
            literal = tok.content.rstrip("\n")
            if literal:
                body.append(literal)

    # Record the trailing section.
    flush(heading, body)
    return sections

def chunk_by_sections(text: str, max_tokens_per_section: int = 400, overlap_tokens: int = 50) -> List[str]:
    """Chunk Markdown ``text`` by heading-delimited sections.

    Each section from ``split_sections_markdown`` is rendered as the heading,
    a blank line, and the content, then stripped. A section whose
    ``hf_tokenizer`` encoding is at most ``max_tokens_per_section`` tokens
    becomes one chunk. Longer sections are replaced by the pieces returned by
    ``chunk_by_fixed_tokens_hf`` with ``overlap_tokens`` tokens of overlap.
    """
    result = []
    for heading, content in split_sections_markdown(text):
        section_text = f"{heading}\n\n{content}".strip()
        token_count = len(hf_tokenizer.encode(section_text))
        if token_count <= max_tokens_per_section:
            result.append(section_text)
            continue
        # Section is too long: split it into overlapping token windows.
        result.extend(
            chunk_by_fixed_tokens_hf(
                section_text,
                max_tokens=max_tokens_per_section,
                overlap=overlap_tokens,
            )
        )
    return result

# -----------------------------
# 5) Document Splitting (each doc as a chunk)
# -----------------------------
def chunk_documents(docs: List[str]) -> List[str]:
    """Treat every document as a single chunk.

    Returns each document stripped of surrounding whitespace, skipping
    entries that are falsy (such as ``None`` or ``""``) or contain only
    whitespace.
    """
    result = []
    for doc in docs:
        if not doc:
            continue
        trimmed = doc.strip()
        if trimmed:
            result.append(trimmed)
    return result

# -----------------------------
# Demo main
# -----------------------------
if __name__ == "__main__":
    # Run every chunking strategy on SAMPLE_TEXT and print, for each one, the
    # number of chunks produced and a short preview of the first chunk.

    print("=== Fixed-Length Chunking (characters) ===")
    char_chunks = chunk_by_fixed_chars(SAMPLE_TEXT, max_chars=600, overlap=60)
    char_count = len(char_chunks)
    print(f"Chunks: {char_count}; Preview: {preview(char_chunks[0])}\n")

    print("=== Fixed-Length Chunking (HF tokens) ===")
    token_chunks = chunk_by_fixed_tokens_hf(SAMPLE_TEXT, max_tokens=300, overlap=40)
    token_count = len(token_chunks)
    print(f"Chunks: {token_count}; Preview: {preview(token_chunks[0])}\n")

    print("=== Sentence-Based (spaCy) ===")
    spacy_chunks = chunk_by_sentences(SAMPLE_TEXT, max_sentences=6, overlap=2, use_spacy=True)
    spacy_count = len(spacy_chunks)
    print(f"Chunks: {spacy_count}; Preview: {preview(spacy_chunks[0])}\n")

    print("=== Sentence-Based (NLTK) ===")
    nltk_chunks = chunk_by_sentences(SAMPLE_TEXT, max_sentences=6, overlap=2, use_spacy=False)
    nltk_count = len(nltk_chunks)
    print(f"Chunks: {nltk_count}; Preview: {preview(nltk_chunks[0])}\n")

    print("=== Paragraph-Based (with token cap) ===")
    paragraph_chunks = chunk_by_paragraphs(SAMPLE_TEXT, max_paragraphs=2, overlap=1, max_tokens_cap=350)
    paragraph_count = len(paragraph_chunks)
    print(f"Chunks: {paragraph_count}; Preview: {preview(paragraph_chunks[0])}\n")

    print("=== Section-Based (Markdown headings + HF tokenizer) ===")
    section_chunks = chunk_by_sections(SAMPLE_TEXT, max_tokens_per_section=400, overlap_tokens=50)
    first_section = section_chunks[0]
    print(f"Chunks: {len(section_chunks)}; First chunk starts with: {first_section.splitlines()[0]}")
    print(f"Preview: {preview(first_section)}\n")

    print("=== Document Splitting ===")
    # Two "documents": the sample text and an upper-cased copy of it.
    document_chunks = chunk_documents([SAMPLE_TEXT, SAMPLE_TEXT.upper()])
    document_count = len(document_chunks)
    print(f"Chunks: {document_count}; Preview: {preview(document_chunks[0])}\n")
