In [None]:
import re
from typing import List, Tuple

# -----------------------------
# Sample larger text for demos
# -----------------------------
# A Markdown-style document with '#' headings at several levels, paragraphs
# made of multiple sentences, and blank lines between paragraphs. The
# demonstration code in this file passes it to each chunker as input.
# Note that the literal begins with a newline (the line right after the
# opening triple quote is already document content).
sample_text = (
    """
# Introduction

In 2024, enterprises accelerated the adoption of retrieval-augmented generation (RAG) to unlock knowledge embedded in sprawling document collections. 
However, real-world content is messy: PDFs mix tables and prose, wikis interleave code blocks with narratives, and emails include replies quoted in-line. 
Robust chunking strategies are essential to make downstream retrieval both accurate and efficient.

This guide surveys foundational chunking approaches, explains trade-offs, and shows how to implement them. 
We illustrate with a fictional case study of a product team documenting a cross-platform release. 
The goal is to choose chunk boundaries that preserve meaning while enabling fast indexing.

## Background

RAG systems attach embeddings to chunks. Embeddings are most effective when chunks are semantically coherent and not too long for the model's context window. 
Choosing chunk sizes is a balancing act: larger chunks capture more context but risk dilution; smaller chunks increase recall but may fragment meaning.

### Key Terms

Token: a unit used by language models; not the same as a word. Context window: the max tokens a model can process at once. Overlap: repeated text between adjacent chunks to preserve continuity.

# Chapter 1 — Requirements

The product targets desktop and mobile. The primary user stories include reading, annotating, and sharing rich documents. 
Accessibility must meet WCAG criteria. Localization covers five languages: English, Hindi, Telugu, Spanish, and German. 
Performance requirements specify cold-start under two seconds on mid-tier devices.

Paragraph one elaborates on security: the system enforces least-privilege, audits admin actions, and encrypts data at rest and in transit. 
Paragraph two explains telemetry: we record opt-in usage metrics, minimize personally identifiable information (PII), and provide dashboards.

The final paragraph enumerates integration points: identity (SSO), storage (cloud + on-prem), and collaboration (comments, mentions, notifications).

# Chapter 2 — Architecture

We adopt a modular architecture. The client layer handles UI and offline caching; the sync layer reconciles edits; the service layer persists data. 
An event bus connects modules via publish-subscribe.

Design decisions: we prefer eventual consistency where strict ordering is costly. 
We use idempotent operations so retries are safe. 
We separate read paths from write paths to optimize latencies.

Failures are expected: we budget for partial outages and degrade gracefully. 
Monitoring includes traces, metrics, and logs. 
We practice chaos testing to discover hidden coupling.

## Data Model

Documents contain sections, paragraphs, and blocks (text, table, image). 
Each block references its parent and maintains version history. 
Indexes store derived views for search.

# Appendix — Release Notes

v1.0.0 (Jan 2025): Initial release with editor, comments, and export to PDF. 
v1.1.0 (Mar 2025): Added offline mode, improved accessibility, and telemetry opt-in. 
v1.2.0 (Jun 2025): Introduced collaboration mentions and richer notifications.

Known issues: large tables render slowly on older devices. 
Workarounds: switch to compact table view, or split tables into multiple blocks.

"""
)

In [None]:
sample_text

In [None]:

# -----------------------------
# Helpers
# -----------------------------
def simple_tokenize(text: str) -> List[str]:
    """Break *text* into rough tokens without any external tokenizer.

    A token is either a maximal run of word characters (letters, digits,
    underscore) or one single character that is neither a word character
    nor whitespace. Whitespace itself never appears in the output.

    This only approximates what a real model tokenizer would produce; it is
    meant for demonstrations, not for exact token budgeting.
    """
    token_pattern = re.compile(r"\w+|[^\w\s]", flags=re.UNICODE)
    return token_pattern.findall(text)

def tokens_to_text(tokens: List[str]) -> str:
    """Glue tokens back into a string using a deliberately naive rule.

    A single space is inserted only between two consecutive word/number
    tokens. Punctuation tokens are attached directly to their neighbours on
    both sides, so the original spacing around punctuation is not restored
    (for example ``["Hello", ",", "world"]`` becomes ``"Hello,world"``).
    """
    word_like = re.compile(r"^\w+$")
    pieces = []
    previous_was_word = False
    for token in tokens:
        current_is_word = word_like.match(token) is not None
        # Only a word directly following another word needs a separator.
        if current_is_word and previous_was_word:
            pieces.append(" ")
        pieces.append(token)
        previous_was_word = current_is_word
    return "".join(pieces)

def split_sentences(text: str) -> List[str]:
    """Split *text* into sentences, keeping the terminating punctuation.

    All whitespace runs (including newlines) are first collapsed to a single
    space so that line breaks inside a sentence do not matter. The text is
    then cut at whitespace that directly follows ``.``, ``!`` or ``?``.

    Because a boundary requires whitespace after the terminator:
      * runs such as ``...`` or ``?!`` stay attached to their sentence
        instead of being dropped, and
      * dots inside a token (``v1.0.0``, ``3.14``) do not end a sentence.

    This is still a heuristic: an abbreviation followed by a space
    (``"e.g. foo"``) is treated as a sentence end.

    Args:
        text: Arbitrary input text.

    Returns:
        The sentences in order, each stripped of surrounding whitespace.
        Trailing text without a terminator is returned as the last item.
        An empty or whitespace-only input yields an empty list.
    """
    normalized = re.sub(r"\s+", " ", text.strip())
    if not normalized:
        return []
    # The lookbehind keeps the punctuation on the left-hand sentence; only
    # the separating whitespace is consumed by the split.
    return [piece for piece in re.split(r"(?<=[.!?])\s+", normalized) if piece]

def split_paragraphs(text: str) -> List[str]:
    """Return the paragraphs of *text*, where blank lines act as separators.

    A separator is a newline, optional whitespace, and another newline.
    Each paragraph is stripped of surrounding whitespace, and pieces that
    are empty after stripping are left out.
    """
    paragraphs = []
    for block in re.split(r"\n\s*\n", text.strip()):
        cleaned = block.strip()
        if cleaned:
            paragraphs.append(cleaned)
    return paragraphs

def split_sections(text: str) -> List[Tuple[str, str]]:
    """Split *text* into ``(heading, content)`` pairs.

    A line starts a new section when it
      * begins with one to six ``#`` characters followed by whitespace
        (Markdown heading),
      * begins with the word ``Chapter`` or ``CHAPTER``, or
      * consists, after stripping, only of upper-case ASCII letters,
        whitespace and hyphens (at least two characters, starting with a
        letter).

    The heading stored for a section is the line with any leading ``#``
    marker removed and surrounding whitespace stripped. All other lines are
    collected as the content of the current section.

    Text that appears before the first heading is returned under the
    default heading ``"Document"``. That default section is only emitted
    when it has real content: blank lines in front of the first heading no
    longer create an empty ``("Document", "")`` entry.

    Args:
        text: The document to split.

    Returns:
        Sections in document order. A heading with no following text is
        kept with an empty content string. Empty or whitespace-only input
        yields an empty list.
    """
    sections: List[Tuple[str, str]] = []
    heading = None  # stays None until the first heading line is seen
    body: List[str] = []

    def flush() -> None:
        # Emit the section collected so far, if there is anything to emit.
        content = "\n".join(body).strip()
        if heading is not None:
            sections.append((heading, content))
        elif content:
            # Pre-heading text; skipped when it is only blank lines.
            sections.append(("Document", content))

    for line in text.splitlines():
        is_heading = (
            re.match(r"^\s*#{1,6}\s+", line)
            or re.match(r"^\s*(Chapter|CHAPTER)\b", line)
            or re.match(r"^[A-Z][A-Z\s\-]+$", line.strip())
        )
        if is_heading:
            flush()
            heading = re.sub(r"^\s*#{1,6}\s+", "", line).strip()
            body = []
        else:
            body.append(line)

    flush()
    return sections

# -----------------------------
# Chunkers
# -----------------------------
def chunk_by_fixed_chars(text: str, max_chars: int = 600, overlap: int = 60) -> List[str]:
    """Cut *text* into windows of at most ``max_chars`` characters.

    Each window starts ``max_chars - overlap`` characters after the previous
    one, so neighbouring chunks share ``overlap`` characters. The last chunk
    may be shorter than ``max_chars``.

    Args:
        text: The string to split.
        max_chars: Maximum length of one chunk; must be at least 1.
        overlap: Number of characters repeated at the start of the next
            chunk. Zero or negative values mean no overlap. Must be smaller
            than ``max_chars``.

    Returns:
        The chunks in order; an empty list for empty *text*.

    Raises:
        ValueError: If ``max_chars < 1`` or ``overlap >= max_chars``. With
            such values the window would never move forward, which used to
            result in an endless loop on any text longer than one window.
    """
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
    if overlap >= max_chars:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_chars ({max_chars})"
        )

    step = max_chars - max(overlap, 0)  # guaranteed >= 1 after validation
    total = len(text)
    chunks = []
    for start in range(0, total, step):
        chunks.append(text[start:start + max_chars])
        # Stop once a window reaches the end; later starts would only
        # repeat the tail inside the overlap region.
        if start + max_chars >= total:
            break
    return chunks

def chunk_by_fixed_tokens(text: str, max_tokens: int = 200, overlap: int = 20) -> List[str]:
    """Cut *text* into chunks of at most ``max_tokens`` approximate tokens.

    The text is tokenized with ``simple_tokenize``; each window of tokens is
    turned back into a string with ``tokens_to_text``. Consecutive windows
    start ``max_tokens - overlap`` tokens apart, so neighbouring chunks
    share ``overlap`` tokens.

    Args:
        text: The string to split.
        max_tokens: Maximum number of tokens per chunk; must be at least 1.
        overlap: Number of tokens repeated at the start of the next chunk.
            Zero or negative values mean no overlap. Must be smaller than
            ``max_tokens``.

    Returns:
        The chunk strings in order; an empty list when *text* contains no
        tokens.

    Raises:
        ValueError: If ``max_tokens < 1`` or ``overlap >= max_tokens``. With
            such values the window would never move forward, which used to
            result in an endless loop on inputs longer than one window.
    """
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be >= 1, got {max_tokens}")
    if overlap >= max_tokens:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_tokens ({max_tokens})"
        )

    step = max_tokens - max(overlap, 0)  # guaranteed >= 1 after validation
    tokens = simple_tokenize(text)
    total = len(tokens)
    chunks = []
    for start in range(0, total, step):
        chunks.append(tokens_to_text(tokens[start:start + max_tokens]))
        # Stop once a window reaches the last token.
        if start + max_tokens >= total:
            break
    return chunks

def chunk_by_sentences(text: str, max_sentences: int = 5, overlap: int = 1) -> List[str]:
    """Group the sentences of *text* into chunks of up to ``max_sentences``.

    Sentences come from ``split_sentences`` and are joined with a single
    space inside each chunk. Consecutive chunks start
    ``max_sentences - overlap`` sentences apart, so neighbouring chunks
    share ``overlap`` sentences.

    Args:
        text: The string to split.
        max_sentences: Maximum number of sentences per chunk; must be at
            least 1.
        overlap: Number of sentences repeated at the start of the next
            chunk. Zero or negative values mean no overlap. Must be smaller
            than ``max_sentences``.

    Returns:
        The chunk strings in order; an empty list when no sentence is found.

    Raises:
        ValueError: If ``max_sentences < 1`` or ``overlap >= max_sentences``.
            With such values the window would never move forward, which
            used to result in an endless loop on inputs longer than one
            window.
    """
    if max_sentences < 1:
        raise ValueError(f"max_sentences must be >= 1, got {max_sentences}")
    if overlap >= max_sentences:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_sentences ({max_sentences})"
        )

    step = max_sentences - max(overlap, 0)  # guaranteed >= 1 after validation
    sentences = split_sentences(text)
    total = len(sentences)
    chunks = []
    for start in range(0, total, step):
        chunks.append(" ".join(sentences[start:start + max_sentences]))
        # Stop once a window reaches the last sentence.
        if start + max_sentences >= total:
            break
    return chunks

def chunk_by_paragraphs(text: str, max_paragraphs: int = 2, overlap: int = 0) -> List[str]:
    """Group the paragraphs of *text* into chunks of up to ``max_paragraphs``.

    Paragraphs come from ``split_paragraphs`` and are joined with a blank
    line (``"\\n\\n"``) inside each chunk. Consecutive chunks start
    ``max_paragraphs - overlap`` paragraphs apart, so neighbouring chunks
    share ``overlap`` paragraphs.

    Args:
        text: The string to split.
        max_paragraphs: Maximum number of paragraphs per chunk; must be at
            least 1.
        overlap: Number of paragraphs repeated at the start of the next
            chunk. Zero or negative values mean no overlap. Must be smaller
            than ``max_paragraphs``.

    Returns:
        The chunk strings in order; an empty list when no paragraph is
        found.

    Raises:
        ValueError: If ``max_paragraphs < 1`` or
            ``overlap >= max_paragraphs``. With such values the window would
            never move forward, which used to result in an endless loop on
            inputs longer than one window.
    """
    if max_paragraphs < 1:
        raise ValueError(f"max_paragraphs must be >= 1, got {max_paragraphs}")
    if overlap >= max_paragraphs:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_paragraphs ({max_paragraphs})"
        )

    step = max_paragraphs - max(overlap, 0)  # guaranteed >= 1 after validation
    paragraphs = split_paragraphs(text)
    total = len(paragraphs)
    chunks = []
    for start in range(0, total, step):
        chunks.append("\n\n".join(paragraphs[start:start + max_paragraphs]))
        # Stop once a window reaches the last paragraph.
        if start + max_paragraphs >= total:
            break
    return chunks

def chunk_by_sections(text: str, max_chars_per_section: int = 1200, overlap: int = 100) -> List[str]:
    """Produce one chunk per section, splitting sections that are too long.

    Sections are obtained from ``split_sections``. For each one, the heading
    and the content are combined (heading, blank line, content) and
    stripped. If the combined text fits into ``max_chars_per_section``
    characters it becomes a single chunk; otherwise it is handed to
    ``chunk_by_fixed_chars`` with the same size limit and the given
    ``overlap``, and every resulting piece becomes its own chunk.
    """
    result = []
    for title, body in split_sections(text):
        combined = (title + "\n\n" + body).strip()
        if len(combined) > max_chars_per_section:
            # Oversized section: fall back to character windows.
            pieces = chunk_by_fixed_chars(
                combined, max_chars=max_chars_per_section, overlap=overlap
            )
            result.extend(pieces)
            continue
        result.append(combined)
    return result

def chunk_documents(docs: List[str]) -> List[str]:
    """Treat every document as exactly one chunk.

    Each entry is stripped of surrounding whitespace. Entries that are
    falsy (empty string, ``None``) or contain only whitespace are dropped.
    """
    kept = []
    for document in docs:
        if not document:
            continue
        trimmed = document.strip()
        if trimmed:
            kept.append(trimmed)
    return kept


In [None]:

# -----------------------------
# Demonstration
# -----------------------------
# Runs every chunker defined in this file on sample_text. For each strategy
# it prints a title, the number of chunks together with one size figure for
# the first chunk, and then the first 200 characters of that chunk.
if __name__ == "__main__":
    # Character windows of up to 500 characters, 50 characters of overlap.
    print("=== Fixed-Length Chunking (characters) ===")
    char_chunks = chunk_by_fixed_chars(sample_text, max_chars=500, overlap=50)
    print(f"Chunks: {len(char_chunks)}; First chunk length: {len(char_chunks[0])}")
    print(char_chunks[0][:200] + "...\n")

    # Token windows of up to 120 tokens, 20 tokens of overlap. The size
    # figure is obtained by re-tokenizing the first chunk's text.
    print("=== Fixed-Length Chunking (tokens) ===")
    tok_chunks = chunk_by_fixed_tokens(sample_text, max_tokens=120, overlap=20)
    print(f"Chunks: {len(tok_chunks)}; First chunk tokens: {len(simple_tokenize(tok_chunks[0]))}")
    print(tok_chunks[0][:200] + "...\n")

    # Up to 6 sentences per chunk, 2 sentences of overlap. The size figure
    # is obtained by re-splitting the first chunk into sentences.
    print("=== Sentence-Based Chunking ===")
    sent_chunks = chunk_by_sentences(sample_text, max_sentences=6, overlap=2)
    print(f"Chunks: {len(sent_chunks)}; First chunk sentences: {len(split_sentences(sent_chunks[0]))}")
    print(sent_chunks[0][:200] + "...\n")

    # Up to 2 paragraphs per chunk, 1 paragraph of overlap. The size figure
    # is obtained by re-splitting the first chunk into paragraphs.
    print("=== Paragraph-Based Chunking ===")
    para_chunks = chunk_by_paragraphs(sample_text, max_paragraphs=2, overlap=1)
    print(f"Chunks: {len(para_chunks)}; First chunk paragraphs: {len(split_paragraphs(para_chunks[0]))}")
    print(para_chunks[0][:200] + "...\n")

    # One chunk per section; sections above 800 characters are split into
    # character windows with 80 characters of overlap. The first line of the
    # first chunk is printed as its identifying figure.
    print("=== Section-Based Chunking ===")
    sec_chunks = chunk_by_sections(sample_text, max_chars_per_section=800, overlap=80)
    print(f"Chunks: {len(sec_chunks)}; First chunk starts with: {sec_chunks[0].splitlines()[0]}")
    print(sec_chunks[0][:200] + "...\n")

    # Whole-document chunks: the second "document" is simply the sample text
    # converted to upper case.
    print("=== Document Splitting (each doc as a chunk) ===")
    docs = [sample_text, sample_text.upper()]  # pretend we have two docs
    doc_chunks = chunk_documents(docs)
    print(f"Chunks: {len(doc_chunks)}; First chunk size: {len(doc_chunks[0])}")
    print(doc_chunks[0][:200] + "...\n")