# 01 - Chunk and Embed Cleaned Books

This notebook loads cleaned Gutenberg text files, chunks each book into overlapping word windows, and computes sentence-transformer embeddings.

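## Outputs per book
Each book gets its own directory under `./data/processed/`. The directory name comes from the `processed_dir` column of `metadata.csv`, falling back to the raw file's stem and finally to the book ID. Caches from the older `./data/processed/{book_id}/` layout are copied over on first use.
- `./data/processed/{processed_dir}/chunks.jsonl` - one JSON object per chunk (index, word span, text preview)
- `./data/processed/{processed_dir}/embeddings.npy` - float32 array with one row per chunk
- `./data/processed/{processed_dir}/index.json` - chunking/model settings used to validate the cache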


In [1]:
# Install required packages if missing
import importlib
import subprocess
import sys

# Each entry is (importable module name, name passed to pip). The two differ
# for sentence-transformers: underscore when importing, hyphen when installing.
REQUIRED_PACKAGES = [
    ("numpy", "numpy"),
    ("pandas", "pandas"),
    ("sentence_transformers", "sentence-transformers"),
]

for module_name, pip_name in REQUIRED_PACKAGES:
    try:
        importlib.import_module(module_name)
    except ImportError:
        print(f"Installing {pip_name} ...")
        # Run pip through the kernel's own interpreter so the package is
        # installed into the environment this notebook is running in.
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name])
        # The import system caches what it has already looked up; refresh those
        # caches so the package installed a moment ago can be imported by the
        # following cells without restarting the kernel.
        importlib.invalidate_caches()

print("Dependency check complete.")


  from .autonotebook import tqdm as notebook_tqdm


Dependency check complete.


In [2]:
from datetime import datetime
from pathlib import Path
import json
import random
import shutil
import re

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Seed both Python's and NumPy's global random number generators.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# All paths are derived from the current working directory, so the notebook
# has to be started from the project root for ./data to resolve correctly.
PROJECT_ROOT = Path(".").resolve()
DATA_DIR = PROJECT_ROOT / "data"
RAW_DIR = DATA_DIR / "raw"                  # raw book text files, looked up via metadata.csv
PROCESSED_DIR = DATA_DIR / "processed"      # per-book output directories are created below this
METADATA_PATH = DATA_DIR / "metadata.csv"   # must provide an 'id' or 'pg_id' column

# Create the output root up front; harmless if it already exists.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# --- Tunable settings --------------------------------------------------------
# window_words, stride_words, model_name and batch_size are also recorded in
# each book's index.json and compared by _is_cache_valid, so changing any of
# them causes that book to be re-embedded on the next run.
window_words = 300          # words per chunk window
stride_words = 100          # step between consecutive window starts (overlap = window - stride)
model_name = "sentence-transformers/all-mpnet-base-v2"  # passed to SentenceTransformer(...)
batch_size = 64             # forwarded to model.encode
force_recompute = False     # True skips the cache check in load_or_embed_chunks
text_preview_chars = 220    # length of the 'text_preview' stored for each chunk
show_progress_bar = True    # forwarded to model.encode

print(f"window_words={window_words}, stride_words={stride_words}")
print(f"model={model_name}, batch_size={batch_size}")


window_words=300, stride_words=100
model=sentence-transformers/all-mpnet-base-v2, batch_size=64


In [3]:
# Resolve every book listed in metadata.csv to a raw text file and an output
# directory name. Produces `book_records` (one dict per book) and `book_ids`.
if not METADATA_PATH.exists():
    raise FileNotFoundError(f"Missing metadata file: {METADATA_PATH}")

metadata_df = pd.read_csv(METADATA_PATH)
if "id" not in metadata_df.columns:
    # Accept the Project Gutenberg style column name as an alias for 'id'.
    if "pg_id" in metadata_df.columns:
        metadata_df["id"] = metadata_df["pg_id"]
    else:
        raise ValueError("metadata.csv must contain an 'id' or 'pg_id' column.")

# Both columns are optional; default them to empty so the loop can read them.
if "raw_filename" not in metadata_df.columns:
    metadata_df["raw_filename"] = ""
if "processed_dir" not in metadata_df.columns:
    metadata_df["processed_dir"] = ""


def _cell_text(value) -> str:
    """Return a metadata cell as a stripped string, mapping None/NaN to ''.

    pandas reads blank CSV cells as NaN, and NaN is truthy, so the usual
    ``value or ""`` idiom would turn a blank cell into the literal text "nan".
    """
    if isinstance(value, str):
        return value.strip()
    if value is None or pd.isna(value):
        return ""
    return str(value).strip()


book_records = []
missing_raw = []

for row in metadata_df.sort_values("id").to_dict(orient="records"):
    book_id = int(row["id"])
    raw_filename = _cell_text(row.get("raw_filename"))

    # Prefer the filename given in the metadata, then fall back to "<id>.txt".
    candidate_paths = []
    if raw_filename:
        candidate_paths.append(RAW_DIR / raw_filename)
    candidate_paths.append(RAW_DIR / f"{book_id}.txt")

    resolved_path = next((p for p in candidate_paths if p.exists()), None)
    if resolved_path is None:
        missing_raw.append({"book_id": book_id, "checked": [str(p) for p in candidate_paths]})
        continue

    # Output directory name: explicit metadata value, else the raw file's stem.
    # Anything outside [a-zA-Z0-9_-] is collapsed to "_" so the name is safe to
    # use as a single path component; if nothing survives, use the book ID.
    processed_dir = _cell_text(row.get("processed_dir")) or resolved_path.stem
    processed_dir = re.sub(r"[^a-zA-Z0-9_\-]+", "_", processed_dir).strip("_")
    if not processed_dir:
        processed_dir = str(book_id)

    book_records.append({
        "book_id": book_id,
        "title": _cell_text(row.get("title")) or f"Book_{book_id}",
        "text_path": resolved_path,
        "raw_filename": resolved_path.name,
        "processed_dir": processed_dir,
    })

if not book_records:
    raise ValueError("No readable raw text files found from metadata.csv")

# Any missing raw file is fatal; only the first ten IDs are listed in the error.
if missing_raw:
    missing_ids = [m["book_id"] for m in missing_raw[:10]]
    raise FileNotFoundError(f"Missing raw text files for IDs: {missing_ids}")

book_ids = [r["book_id"] for r in book_records]
print(f"Books to process: {len(book_records)}")


Books to process: 20


In [4]:
def chunk_words(
    text: str,
    window_words: int = 300,
    stride_words: int = 100,
    preview_chars: "int | None" = None,
):
    """Split ``text`` into overlapping windows of whitespace-delimited words.

    Words are maximal runs of non-whitespace characters. Window ``k`` starts at
    word ``k * stride_words`` and holds up to ``window_words`` words, rejoined
    with single spaces.

    Args:
        text: Source text to chunk.
        window_words: Maximum number of words per chunk. Must be positive.
        stride_words: Distance in words between consecutive chunk starts.
            Must be positive.
        preview_chars: Number of leading characters of each chunk stored under
            ``"text_preview"``. When ``None`` the notebook-level
            ``text_preview_chars`` setting is used.

    Returns:
        A list of dicts with keys ``chunk_index``, ``start_word``, ``end_word``
        (exclusive), ``text`` and ``text_preview``. Empty when ``text`` contains
        no words. A text no longer than one window yields exactly one chunk.

    Raises:
        ValueError: If ``window_words`` or ``stride_words`` is not positive.

    Note:
        A final window that is shorter than the tail threshold is dropped
        rather than emitted, so a few trailing words can be left uncovered when
        ``stride_words`` is close to ``window_words``.
    """
    if window_words <= 0 or stride_words <= 0:
        raise ValueError(
            "window_words and stride_words must be positive, got "
            f"window_words={window_words}, stride_words={stride_words}"
        )

    words = re.findall(r"\S+", text)
    n = len(words)
    if n == 0:
        return []

    if preview_chars is None:
        # Fall back to the value defined in the notebook's configuration cell.
        preview_chars = text_preview_chars

    def build_chunk(index: int, start: int, end: int) -> dict:
        chunk_text = " ".join(words[start:end])
        return {
            "chunk_index": index,
            "start_word": int(start),
            "end_word": int(end),
            "text": chunk_text,
            "text_preview": chunk_text[:preview_chars],
        }

    if n <= window_words:
        return [build_chunk(0, 0, n)]

    # Threshold below which a trailing partial window is considered too short.
    # It is capped at window_words so that a *full-size* final window is never
    # discarded when the window itself is smaller than 80 words.
    min_tail_words = min(window_words, max(80, window_words // 3))

    chunks = []
    for start in range(0, n, stride_words):
        end = min(start + window_words, n)

        # Only the last window can be short; skip it if it is below threshold.
        if start > 0 and end == n and (end - start) < min_tail_words:
            break

        chunks.append(build_chunk(len(chunks), start, end))

        # This window reached the end of the text; later starts add nothing.
        if end >= n:
            break

    return chunks


def artifact_paths(book_id: int, processed_dir_name: str):
    """Build the dictionary of artifact locations for one book.

    The current layout lives in ``PROCESSED_DIR / processed_dir_name``; the
    legacy layout lives in ``PROCESSED_DIR / str(book_id)``. Both hold the same
    three files.

    Returns:
        A dict with ``book_dir``, ``chunks``, ``embeddings``, ``index`` for the
        current layout and the same keys prefixed with ``legacy_`` for the old one.
    """
    artifact_files = {
        "chunks": "chunks.jsonl",
        "embeddings": "embeddings.npy",
        "index": "index.json",
    }
    current_dir = PROCESSED_DIR / processed_dir_name
    previous_dir = PROCESSED_DIR / str(book_id)

    layout = {"book_dir": current_dir}
    for key, filename in artifact_files.items():
        layout[key] = current_dir / filename

    layout["legacy_dir"] = previous_dir
    for key, filename in artifact_files.items():
        layout[f"legacy_{key}"] = previous_dir / filename

    return layout


def _load_jsonl(path: Path):
    records = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            records.append(json.loads(line))
    return records


def _is_cache_valid(index_obj: dict):
    """Tell whether a stored index matches the notebook's current settings.

    The cache counts as valid only when the window size, stride, model name,
    batch size and dtype recorded in ``index_obj`` all equal the current values.
    A missing key compares as ``None`` and therefore fails the check.
    """
    required_settings = (
        ("window_words", window_words),
        ("stride_words", stride_words),
        ("model_name", model_name),
        ("batch_size", batch_size),
        ("dtype", "float32"),
    )
    return all(index_obj.get(key) == wanted for key, wanted in required_settings)


def _maybe_migrate_legacy_cache(paths: dict):
    if paths["book_dir"].exists():
        return

    if not paths["legacy_dir"].exists():
        return

    needed = ["legacy_chunks", "legacy_embeddings", "legacy_index"]
    if not all(paths[k].exists() for k in needed):
        return

    paths["book_dir"].mkdir(parents=True, exist_ok=True)
    shutil.copy2(paths["legacy_chunks"], paths["chunks"])
    shutil.copy2(paths["legacy_embeddings"], paths["embeddings"])
    shutil.copy2(paths["legacy_index"], paths["index"])


def load_or_embed_chunks(book_id: int, processed_dir_name: str, text: str, model):
    """Return chunk metadata and embeddings for one book, using the cache if valid.

    The cache is used when ``force_recompute`` is false, all three artifacts
    exist, the stored index matches the current settings, and the stored
    embeddings are a 2-D float32 array with one row per cached chunk. In every
    other case the text is re-chunked and re-embedded, and the three artifacts
    are rewritten.

    Args:
        book_id: Numeric book identifier; also names the legacy cache directory.
        processed_dir_name: Directory name under ``PROCESSED_DIR`` for this book.
        text: Full text of the book.
        model: Object whose ``encode`` method accepts a list of strings together
            with ``batch_size``, ``convert_to_numpy``, ``normalize_embeddings``
            and ``show_progress_bar`` keyword arguments.

    Returns:
        ``(chunks, embeddings, used_cache)``. ``chunks`` is a list of dicts with
        ``chunk_index``, ``start_word``, ``end_word`` and ``text_preview`` (the
        full chunk text is not kept); ``embeddings`` is a float32 array with one
        row per chunk; ``used_cache`` is True when both came from disk.

    Raises:
        ValueError: If chunking produces no chunks.
        RuntimeError: If the number of embedding rows differs from the number
            of chunks.
    """
    # Local import: only the timestamp below needs it, and it keeps this
    # function independent of the notebook's import cell.
    from datetime import timezone

    paths = artifact_paths(book_id=book_id, processed_dir_name=processed_dir_name)
    _maybe_migrate_legacy_cache(paths)
    paths["book_dir"].mkdir(parents=True, exist_ok=True)

    cache_files = (paths["chunks"], paths["embeddings"], paths["index"])
    if not force_recompute and all(p.exists() for p in cache_files):
        try:
            index_obj = json.loads(paths["index"].read_text(encoding="utf-8"))
            if _is_cache_valid(index_obj):
                cached_chunks = _load_jsonl(paths["chunks"])
                cached_embeddings = np.load(paths["embeddings"])
                if (
                    cached_embeddings.dtype == np.float32
                    and cached_embeddings.ndim == 2
                    and cached_embeddings.shape[0] == len(cached_chunks)
                ):
                    return cached_chunks, cached_embeddings, True
        except Exception as exc:
            # An unreadable cache is not fatal: fall through and recompute, but
            # say so, otherwise a corrupt file shows up only as a slow run.
            print(f"[cache] ignoring unreadable cache for book {book_id}: {exc!r}")

    chunks = chunk_words(text=text, window_words=window_words, stride_words=stride_words)
    if not chunks:
        raise ValueError(f"No chunks generated for book {book_id}")

    embeddings = model.encode(
        [c["text"] for c in chunks],
        batch_size=batch_size,
        convert_to_numpy=True,
        normalize_embeddings=False,
        show_progress_bar=show_progress_bar,
    )
    embeddings = np.asarray(embeddings, dtype=np.float32)

    if embeddings.shape[0] != len(chunks):
        raise RuntimeError(
            f"Embedding row mismatch for book {book_id}: {embeddings.shape[0]} vs {len(chunks)}"
        )

    # Persist and return the same lightweight rows (no full text), so a cache
    # hit and a fresh computation hand the caller identically shaped records.
    chunk_rows = [
        {
            "chunk_index": int(c["chunk_index"]),
            "start_word": int(c["start_word"]),
            "end_word": int(c["end_word"]),
            "text_preview": c["text_preview"],
        }
        for c in chunks
    ]

    with paths["chunks"].open("w", encoding="utf-8") as f:
        for row in chunk_rows:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    np.save(paths["embeddings"], embeddings)

    # The index is written last.
    # NOTE(review): an index left over from an earlier run is not removed before
    # the chunk/embedding files are rewritten, so an interrupted rewrite could
    # pair that older index with newer files - confirm whether this matters.
    index_obj = {
        "book_id": int(book_id),
        "processed_dir": str(processed_dir_name),
        "T": int(embeddings.shape[0]),
        "D": int(embeddings.shape[1]),
        "window_words": int(window_words),
        "stride_words": int(stride_words),
        "model_name": model_name,
        "batch_size": int(batch_size),
        "dtype": "float32",
        # Same "<naive UTC ISO timestamp>Z" format as before, without the
        # deprecated datetime.utcnow().
        "created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
    }
    paths["index"].write_text(json.dumps(index_obj, indent=2), encoding="utf-8")

    return chunk_rows, embeddings, False


In [None]:
# Embed every book listed in `book_records`, reusing cached artifacts where
# possible, then print a short summary table.
print("Loading embedding model...")
model = SentenceTransformer(model_name)

book_stats = []
total_chunks = 0
processed_books = 0

for rec in book_records:
    book_id = int(rec["book_id"])
    text_path = Path(rec["text_path"])
    title = rec.get("title", f"Book_{book_id}")
    processed_dir = rec.get("processed_dir", str(book_id))

    # Guard against a raw file that disappeared since the metadata was resolved.
    if not text_path.exists():
        print(f"[skip] missing raw text for {book_id}: {text_path.name}")
        continue

    # Undecodable bytes are dropped rather than raising.
    text = text_path.read_text(encoding="utf-8", errors="ignore")

    chunks, embeddings, used_cache = load_or_embed_chunks(
        book_id=book_id, processed_dir_name=processed_dir, text=text, model=model
    )

    # One embedding row per chunk, whether freshly computed or loaded from disk.
    if embeddings.shape[0] != len(chunks):
        raise RuntimeError(f"Validation failed for {book_id}: embeddings/chunks mismatch")

    processed_books += 1
    total_chunks += embeddings.shape[0]

    book_stats.append(
        dict(
            book_id=int(book_id),
            title=title,
            raw_filename=text_path.name,
            processed_dir=processed_dir,
            chunks=int(embeddings.shape[0]),
            dim=int(embeddings.shape[1]),
            cached=bool(used_cache),
        )
    )

    if used_cache:
        cache_tag = "cached"
    else:
        cache_tag = "computed"
    print(f"[{cache_tag}] {book_id} ({processed_dir}): T={embeddings.shape[0]}, D={embeddings.shape[1]}")

if processed_books == 0:
    raise RuntimeError("No books were processed in notebook 1.")

stats_df = pd.DataFrame(book_stats)
print("\nEmbedding summary")
print(f"Processed books: {processed_books}")
print(f"Total chunks: {total_chunks}")
print(f"Average chunks/book: {total_chunks / processed_books:.2f}")
display(stats_df.head(10))


Loading embedding model...


Loading weights: 100%|██████████| 199/199 [00:00<00:00, 1992.47it/s, Materializing param=pooler.dense.weight]                        
[1mMPNetModel LOAD REPORT[0m from: sentence-transformers/all-mpnet-base-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Batches: 100%|██████████| 5/5 [00:33<00:00,  6.74s/it]
  "created_at": datetime.utcnow().isoformat() + "Z",


[computed] 11 (alice_s_adventures_wonderland.txt): T=264, D=768


Batches: 100%|██████████| 8/8 [01:32<00:00, 11.55s/it]


[computed] 16 (peter_pan.txt): T=471, D=768


Batches: 100%|██████████| 6/6 [00:43<00:00,  7.23s/it]


[computed] 35 (time_machine.txt): T=323, D=768


Batches: 100%|██████████| 10/10 [01:45<00:00, 10.56s/it]


[computed] 36 (war_worlds.txt): T=599, D=768


Batches: 100%|██████████| 4/4 [00:53<00:00, 13.44s/it]


[computed] 43 (strange_case_dr_jekyll.txt): T=255, D=768


Batches: 100%|██████████| 7/7 [00:39<00:00,  5.58s/it]


[computed] 55 (wonderful_wizard_oz.txt): T=395, D=768


Batches: 100%|██████████| 12/12 [01:10<00:00,  5.88s/it]


[computed] 84 (frankenstein_modern_prometheus.txt): T=749, D=768


Batches:  20%|██        | 2/10 [06:16<24:32, 184.05s/it]