# Chunking Pipeline (OCR → Children)

This notebook runs the current chunking stack end-to-end so you can tune chunk sizes and overlaps and inspect the resulting chunks before indexing.

What it covers:
- Loads OCR/plain text from a file or inline sample.
- Uses the production chunkers (`Semantic`, `Recursive`, `Markdown`, `Code`, or `Auto`).
- Emits chunk stats (counts, token lengths) and previews.
- Optional export to JSONL for quick indexing tests.


## How to use
1) Point `DOC_PATH` to your OCR text file (UTF-8) **or** edit `RAW_TEXT`.
2) Choose `STRATEGY` (`semantic` recommended for prose; `auto` lets the analyzer decide).
3) Tweak `CHUNK_SIZE`, `CHUNK_OVERLAP`, `MIN_CHUNK_SIZE` (defaults 550/100/120 per design).
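4) Run the cells in order (imports, parameters, helpers), then run the pipeline cell and inspect the stats/chunks; set `EXPORT = True` in the last cell to write JSONL if needed.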


In [None]:
import os, sys, json
from pathlib import Path

# Resolve the parent of the current working directory and put it at the
# front of sys.path (only once) so the `src.*` imports below can be found.
# NOTE(review): assumes the notebook is run from a directory one level below
# the project root — confirm.
PROJECT_ROOT = Path('..').resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.chunking.config import ChunkingConfig, ChunkingStrategy
from src.chunking.factory import ChunkerFactory
from src.chunking.tokenizer import build_counter
from src.retrieval import Document


In [None]:
# Parameters you can tweak for experiments
# load_text() reads DOC_PATH when it is non-empty; otherwise it falls back to RAW_TEXT.
DOC_PATH = ""  # e.g., "../data/ocr_sample.txt". Leave empty to use RAW_TEXT below.

# Inline multi-line sample used when DOC_PATH is empty; load_text() strips
# its surrounding whitespace before it is chunked.
RAW_TEXT = """\
هذا نص عربي تجريبي يمثل صفحة OCR واحدة مع أكثر من جملة لاختبار التقسيم.
يمكنك استبداله بنصك الفعلي أو ربط ملفك في DOC_PATH بالأعلى.
يوجد أسطر متعددة للتأكد من عمل تقسيم الفقرات والجمل كما نتوقع.
"""

# make_chunker() upper-cases this value and looks it up by member name in
# ChunkingStrategy, so it must match a member name (case-insensitively).
STRATEGY = "semantic"  # semantic | recursive | paragraph | sentence | code | markdown | auto
# The values below are forwarded to ChunkingConfig as chunk_size, chunk_overlap,
# min_chunk_size and max_document_chars respectively.
# NOTE(review): their units are defined by ChunkingConfig, which is not shown
# here — confirm before tuning.
CHUNK_SIZE = 550
CHUNK_OVERLAP = 100
MIN_CHUNK_SIZE = 120
MAX_DOC_CHARS = 2_000_000


In [None]:
from typing import Tuple, List


def load_text(doc_path: str, raw_text: str) -> str:
    """Return the text to chunk, preferring the file at ``doc_path``.

    Args:
        doc_path: Path to a UTF-8 text file (``~`` is expanded). An empty
            string selects ``raw_text`` instead.
        raw_text: Inline fallback text, returned with surrounding whitespace
            stripped.

    Returns:
        The file contents exactly as read (not stripped) when ``doc_path``
        is set, otherwise the stripped ``raw_text``.

    Raises:
        FileNotFoundError: If ``doc_path`` is set but nothing exists there.
        IsADirectoryError: If ``doc_path`` points at a directory.
    """
    if not doc_path:
        return raw_text.strip()

    path = Path(doc_path).expanduser()
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")
    # exists() is also true for directories; fail with a clear, uniform error
    # instead of the platform-dependent one read_text() would raise.
    if path.is_dir():
        raise IsADirectoryError(f"Expected a text file but got a directory: {path}")
    return path.read_text(encoding="utf-8")


def make_chunker(strategy: str, **overrides):
    """Build a chunker and its config for the named strategy.

    Args:
        strategy: Strategy name; it is upper-cased and looked up by member
            name in ``ChunkingStrategy``.
        **overrides: Optional ``chunk_size``, ``chunk_overlap``,
            ``min_chunk_size`` and ``max_document_chars`` values. Any key
            that is absent falls back to the matching notebook constant.

    Returns:
        A ``(chunker, cfg)`` tuple: the object returned by
        ``ChunkerFactory.create_chunker`` and the ``ChunkingConfig`` it was
        created with.

    Raises:
        KeyError: Presumably, when ``strategy`` does not name a
            ``ChunkingStrategy`` member (assumes a standard ``Enum`` —
            TODO confirm).
    """
    strategy_member = ChunkingStrategy[strategy.upper()]
    config_kwargs = {
        "chunk_size": overrides.get("chunk_size", CHUNK_SIZE),
        "chunk_overlap": overrides.get("chunk_overlap", CHUNK_OVERLAP),
        "min_chunk_size": overrides.get("min_chunk_size", MIN_CHUNK_SIZE),
        "strategy": strategy_member,
        "max_document_chars": overrides.get("max_document_chars", MAX_DOC_CHARS),
    }
    config = ChunkingConfig(**config_kwargs)
    return ChunkerFactory.create_chunker(strategy_member, config), config


def run_chunking(text: str, strategy: str = STRATEGY, **overrides):
    """Chunk ``text`` with the chosen strategy and summarise the chunk sizes.

    Args:
        text: Raw document text; wrapped in a single ``Document`` with id
            ``"doc-1"`` and doc type ``"text/ocr"``.
        strategy: Strategy name passed to ``make_chunker``.
        **overrides: Config overrides forwarded to ``make_chunker``.

    Returns:
        A ``(chunks, cfg, stats, token_counts)`` tuple. ``token_counts``
        holds one count per chunk as reported by the counter from
        ``build_counter``; ``stats`` has the keys ``num_chunks``,
        ``min_tokens``, ``max_tokens`` and ``mean_tokens`` (all 0 when no
        chunks were produced).
    """
    chunker, cfg = make_chunker(strategy, **overrides)
    document = Document(
        id="doc-1",
        content=text,
        doc_type="text/ocr",
        metadata={"source": "notebook"},
    )
    chunks = chunker.chunk_document(document)

    token_counter = build_counter(prefer_tiktoken=True)
    token_counts = []
    for chunk in chunks:
        token_counts.append(token_counter.count(chunk.content))

    # Guard the aggregates: min()/max() and the division fail on an empty list.
    if token_counts:
        smallest = min(token_counts)
        largest = max(token_counts)
        average = sum(token_counts) / len(token_counts)
    else:
        smallest = largest = average = 0

    stats = {
        "num_chunks": len(chunks),
        "min_tokens": smallest,
        "max_tokens": largest,
        "mean_tokens": average,
    }
    return chunks, cfg, stats, token_counts


In [None]:
# Run the chunking pipeline
# Source text: the file at DOC_PATH when it is set, otherwise the inline RAW_TEXT.
text = load_text(DOC_PATH, RAW_TEXT)
# Pass the current notebook parameters explicitly rather than relying on the
# helpers' defaults.
chunks, cfg, stats, token_counts = run_chunking(
    text,
    strategy=STRATEGY,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    min_chunk_size=MIN_CHUNK_SIZE,
    max_document_chars=MAX_DOC_CHARS,
)

# One-line summary: chunk count plus min/max/mean of the per-chunk token counts.
print(f"Strategy: {STRATEGY} | chunks={stats['num_chunks']} | min={stats['min_tokens']:.1f} | max={stats['max_tokens']:.1f} | mean={stats['mean_tokens']:.1f}")
# Echo the values held by the config object that was actually built.
print(f"chunk_size={cfg.chunk_size}, overlap={cfg.chunk_overlap}, min_chunk_size={cfg.min_chunk_size}")


In [None]:
# Inspect a few chunks
# Prints a header (index, token count, span metadata), up to the first 400
# characters of content, and a separator for each of the first few chunks.
counter = build_counter(prefer_tiktoken=True)
preview_limit = 5  # number of leading chunks to print
for i, ch in enumerate(chunks[:preview_limit]):
    # The leading "\n" puts a blank line between previews; it must stay an
    # escape sequence, because a raw line break inside the literal is a SyntaxError.
    print(f"\nChunk {i} | tokens≈{counter.count(ch.content)} | span=({ch.metadata.get('chunk_start')},{ch.metadata.get('chunk_end')})")
    print(ch.content[:400].strip())
    print("-" * 70)


In [None]:
# (Optional) Export to JSONL for quick indexing experiments
EXPORT = False  # flip to True to write the chunks to EXPORT_PATH
EXPORT_PATH = Path("../tmp/chunks_sample.jsonl")

if EXPORT:
    # Create the target directory if it does not exist yet.
    EXPORT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with EXPORT_PATH.open("w", encoding="utf-8") as f:
        for ch in chunks:
            rec = {"id": ch.id, "content": ch.content, "metadata": ch.metadata}
            # One JSON object per line. The "\n" terminator must stay an escape
            # sequence: a raw line break inside the literal is a SyntaxError.
            # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
    print(f"Saved {len(chunks)} chunks to {EXPORT_PATH}")
else:
    print("Set EXPORT = True to write JSONL to disk.")
