In [14]:
!pip uninstall -y google-adk langchain langchain-text-splitters

[0mFound existing installation: langchain 1.2.9
Uninstalling langchain-1.2.9:
  Successfully uninstalled langchain-1.2.9
Found existing installation: langchain-text-splitters 1.1.0
Uninstalling langchain-text-splitters-1.1.0:
  Successfully uninstalled langchain-text-splitters-1.1.0


In [4]:
!pip -q install -U chromadb sentence-transformers rank-bm25

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/111.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.2/111.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/496.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m491.5/496.3 kB[0m [31m22.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m496.3/496.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
from datasets import load_dataset
import chromadb
from chromadb.utils import embedding_functions

client = chromadb.PersistentClient(path="db")

dense_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

collection = client.get_or_create_collection(
    name="paul_graham",
    embedding_function=dense_ef
)

# 1) Load dataset safely
ds = load_dataset("chromadb/paul_graham_essay", split="data", streaming=True)

# 2) Build text without pulling everything into RAM
#    Also cap total chars to prevent runaway chunk counts
MAX_CHARS = 250_000  # adjust up if you want, but start here
texts = []
total = 0
for row in ds:
    # pick a text field robustly
    if "text" in row:
        t = row["text"]
    else:
        # first string-ish field
        t = next(v for v in row.values() if isinstance(v, str))
    if not t:
        continue
    if total + len(t) > MAX_CHARS:
        t = t[: max(0, MAX_CHARS - total)]
    texts.append(t)
    total += len(t)
    if total >= MAX_CHARS:
        break

text = "\n".join(texts)

# 3) Chunk
def chunk_text(s, chunk_size=1200, overlap=200):
    chunks = []
    start = 0
    n = len(s)
    while start < n:
        end = min(n, start + chunk_size)
        chunk = s[start:end].strip()
        if chunk:
            chunks.append(chunk)
        start = end - overlap
        if start < 0:
            start = 0
        if end == n:
            break
    return chunks

chunks = chunk_text(text, chunk_size=1200, overlap=200)

# 4) Batched add (prevents big embedding spikes)
def batched_add(coll, chunks, batch_size=64):
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i+batch_size]
        ids = [f"pg_{j}" for j in range(i, i+len(batch))]
        metas = [{"source": "chromadb/paul_graham_essay", "chunk": j} for j in range(i, i+len(batch))]
        coll.add(ids=ids, documents=batch, metadatas=metas)

batched_add(collection, chunks, batch_size=64)

collection.count()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


1

##Collection management (list / get / rename-by-copy / delete)

In [2]:
import chromadb

# List collections
print([c.name for c in client.list_collections()])

# Get the existing collection safely (reuses embedding function)
collection = client.get_collection("paul_graham", embedding_function=dense_ef)

print("count:", collection.count())

['paul_graham', 'demo_dense']
count: 1


Rename is not a direct operation; do copy + delete:

In [3]:
def rename_collection(client, old_name, new_name, embedding_function=None, batch_size=256):
    old = client.get_collection(old_name, embedding_function=embedding_function)
    new = client.get_or_create_collection(new_name, embedding_function=embedding_function)

    data = old.get(include=["documents","metadatas"])
    ids = data["ids"]
    docs = data["documents"]
    metas = data["metadatas"]

    for i in range(0, len(ids), batch_size):
        new.add(
            ids=ids[i:i+batch_size],
            documents=docs[i:i+batch_size],
            metadatas=metas[i:i+batch_size],
        )

    client.delete_collection(old_name)
    return new

# Example:
# collection = rename_collection(client, "paul_graham", "paul_graham_v2", embedding_function=dense_ef)

##Updating & removing data (update / upsert / delete-by-metadata)

Chroma supports `update()` and `delete()`. Some versions also support `upsert()`; this wrapper works either way.

In [4]:
def safe_upsert(coll, ids, documents=None, metadatas=None):
    # If upsert exists, use it; else do update then add missing ids.
    if hasattr(coll, "upsert"):
        return coll.upsert(ids=ids, documents=documents, metadatas=metadatas)

    existing = set(coll.get(ids=ids, include=[]).get("ids", []))
    to_update = [i for i in ids if i in existing]
    to_add = [i for i in ids if i not in existing]

    if to_update:
        idx = [ids.index(i) for i in to_update]
        coll.update(
            ids=to_update,
            documents=[documents[j] for j in idx] if documents else None,
            metadatas=[metadatas[j] for j in idx] if metadatas else None,
        )
    if to_add:
        idx = [ids.index(i) for i in to_add]
        coll.add(
            ids=to_add,
            documents=[documents[j] for j in idx] if documents else None,
            metadatas=[metadatas[j] for j in idx] if metadatas else None,
        )

# Example: update an existing chunk
collection.update(
    ids=["pg_0"],
    documents=["[UPDATED] " + collection.get(ids=["pg_0"], include=["documents"])["documents"][0]],
    metadatas=[{"source":"chromadb/paul_graham_essay","chunk":0,"updated":True}]
)

# Example: delete by ids
collection.delete(ids=["pg_1","pg_2"])

print("count:", collection.count())

count: 1


In [5]:
# delete all records where updated=True
collection.delete(where={"updated": True})
print("count:", collection.count())

count: 0


##Dense search (semantic)

In [6]:
def dense_search(query, k=5):
    r = collection.query(
        query_texts=[query],
        n_results=k,
        include=["documents","metadatas","distances"]
    )
    out = []
    for _id, doc, meta, dist in zip(r["ids"][0], r["documents"][0], r["metadatas"][0], r["distances"][0]):
        out.append({"id": _id, "distance": float(dist), "meta": meta, "preview": doc[:180]})
    return out

dense_search("advice about learning fast and writing", k=5)

[]

##Lexical search (BM25) + rebuild after changes

In [8]:
print("count:", collection.count())
sample = collection.get(limit=3, include=["documents","metadatas"])
print("ids:", len(sample["ids"]))
print("docs:", len(sample.get("documents") or []))
print("example doc:", (sample.get("documents") or [None])[0])

count: 0
ids: 0
docs: 0
example doc: None


In [9]:
from datasets import load_dataset

# (Re)create clean collection if you want
# client.delete_collection("paul_graham")
# collection = client.get_or_create_collection("paul_graham", embedding_function=dense_ef)

# Load the dataset (split is "data")
ds = load_dataset("chromadb/paul_graham_essay", split="data", streaming=True)

# Build text (cap to avoid too many chunks)
MAX_CHARS = 250_000
texts, total = [], 0
for row in ds:
    t = row.get("text") or next(v for v in row.values() if isinstance(v, str))
    if not t:
        continue
    if total + len(t) > MAX_CHARS:
        t = t[: max(0, MAX_CHARS - total)]
    texts.append(t)
    total += len(t)
    if total >= MAX_CHARS:
        break

text = "\n".join(texts)

def chunk_text(s, chunk_size=1200, overlap=200):
    chunks = []
    start = 0
    n = len(s)
    while start < n:
        end = min(n, start + chunk_size)
        chunk = s[start:end].strip()
        if chunk:
            chunks.append(chunk)
        start = end - overlap
        if start < 0:
            start = 0
        if end == n:
            break
    return chunks

chunks = chunk_text(text, chunk_size=1200, overlap=200)
print("chars:", len(text), "chunks:", len(chunks))

def batched_add(coll, chunks, batch_size=64):
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i+batch_size]
        ids = [f"pg_{j}" for j in range(i, i+len(batch))]
        metas = [{"source": "chromadb/paul_graham_essay", "chunk": j} for j in range(i, i+len(batch))]
        coll.add(ids=ids, documents=batch, metadatas=metas)

batched_add(collection, chunks, batch_size=64)

print("collection count:", collection.count())

chars: 305 chunks: 1
collection count: 1


In [10]:
from rank_bm25 import BM25Okapi
import re

def tokenize(text: str):
    return re.findall(r"[a-z0-9]+", text.lower())

def rebuild_bm25(coll):
    data = coll.get(include=["documents"])
    ids = data["ids"]
    docs = data["documents"]
    bm25 = BM25Okapi([tokenize(d) for d in docs])
    return bm25, ids, docs

bm25, bm25_ids, bm25_docs = rebuild_bm25(collection)

def bm25_search(query, k=5):
    scores = bm25.get_scores(tokenize(query))
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    return [{"id": bm25_ids[i], "score": float(scores[i]), "preview": bm25_docs[i][:180]} for i in ranked]

bm25_search("venture capital startup", k=5)

[{'id': 'pg_0',
  'score': 0.0,
  'preview': '0\n79\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n1\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\n41\n42\n43\n44\n45\n46\n47\n48\n49\n50\n51\n52\n53\n54\n55\n56\n57\n58\n59\n60\n61\n62\n6'}]

##Hybrid search (dense + lexical) using your existing RRF code

You already have dense_ranked_ids, rrf, hybrid_search. You just need bm25_search returning ids. Add this helper:

In [12]:
def bm25_search_ids(query, k=10):
    scores = bm25.get_scores(tokenize(query))
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    return [bm25_ids[i] for i in ranked]

In [14]:
def dense_ranked_ids(query, k=10):
    r = collection.query(
        query_texts=[query],
        n_results=k
    )
    return r["ids"][0]


def bm25_search_ids(query, k=10):
    if bm25 is None:
        return []
    scores = bm25.get_scores(tokenize(query))
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    return [bm25_ids[i] for i in ranked]


def rrf(rank_lists, k=60, weights=None):
    if weights is None:
        weights = [1.0] * len(rank_lists)
    scores = {}
    for w, ids in zip(weights, rank_lists):
        for rank, _id in enumerate(ids):
            scores[_id] = scores.get(_id, 0.0) + w * (1.0 / (k + rank))
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

In [15]:
def hybrid_search(query, k_dense=10, k_lex=10, k_final=5):
    dense_ids = dense_ranked_ids(query, k=k_dense)
    lex_ids = bm25_search_ids(query, k=k_lex)
    fused = rrf([dense_ids, lex_ids], k=60)

    top = [i for i, _ in fused[:k_final]]
    got = collection.get(ids=top, include=["documents","metadatas"])
    id_to_idx = {i: idx for idx, i in enumerate(got["ids"])}

    out = []
    for _id in top:
        idx = id_to_idx[_id]
        out.append((_id, got["metadatas"][idx], got["documents"][idx][:160]))
    return out


hybrid_search("startup advice about writing and learning", k_final=5)

[('pg_0',
  {'chunk': 0, 'source': 'chromadb/paul_graham_essay'},
  '0\n79\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n1\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\n41\n42\n43\n44\n45\n46\n47\n48\n49\n50\n51\n52\n53\n54\n55\n56')]

## Minimal “workflow” pattern you’ll use in the notebook

In [16]:
bm25, bm25_ids, bm25_docs = rebuild_bm25(collection)

In [17]:
dense_search("...", k=5)
bm25_search("...", k=5)
hybrid_search("...", k_final=5)

[('pg_0',
  {'chunk': 0, 'source': 'chromadb/paul_graham_essay'},
  '0\n79\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n1\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\n41\n42\n43\n44\n45\n46\n47\n48\n49\n50\n51\n52\n53\n54\n55\n56')]

-------------------------
##Metadata Filtering (pre-filter)

Decide what metadata is “filterable”

In real systems this is not arbitrary.

Typical filter dimensions:

- source (document / dataset / tenant)

- chunk or doc_id

- section, date, author

- access_level, user_id, org_id

We’ll use: `source` and `chunk (numeric)`

## Dense search WITH pre-filter (semantic + metadata)

This is the most common production pattern.

In [18]:
def dense_search_filtered(query, where, k=5):
    r = collection.query(
        query_texts=[query],
        n_results=k,
        where=where,
        include=["documents","metadatas","distances"]
    )
    out = []
    for _id, doc, meta, dist in zip(
        r["ids"][0], r["documents"][0], r["metadatas"][0], r["distances"][0]
    ):
        out.append({
            "id": _id,
            "distance": float(dist),
            "meta": meta,
            "preview": doc[:160]
        })
    return out

Example usage:

In [19]:
dense_search_filtered(
    "startup advice about writing",
    where={"source": "chromadb/paul_graham_essay"},
    k=5
)

[{'id': 'pg_0',
  'distance': 1.0570777654647827,
  'meta': {'chunk': 0, 'source': 'chromadb/paul_graham_essay'},
  'preview': '0\n79\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n1\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\n41\n42\n43\n44\n45\n46\n47\n48\n49\n50\n51\n52\n53\n54\n55\n56'}]

**Key point:**
Filtering happens before vector similarity → faster + correct.

## Lexical (BM25) WITH pre-filter

In [20]:
def bm25_search_filtered(query, where, k=5):
    if bm25 is None:
        return []

    filtered = [
        (i, bm25_docs[i])
        for i in range(len(bm25_docs))
        if all(bm25_docs[i] is not None and
               collection.get(ids=[bm25_ids[i]], include=["metadatas"])["metadatas"][0].get(key) == val
               for key, val in where.items())
    ]

    if not filtered:
        return []

    idxs, docs = zip(*filtered)
    scores = bm25.get_scores(tokenize(query))
    ranked = sorted(idxs, key=lambda i: scores[i], reverse=True)[:k]

    return [
        {"id": bm25_ids[i], "score": float(scores[i]), "preview": bm25_docs[i][:160]}
        for i in ranked
    ]

Example of usage:

In [21]:
bm25_search_filtered(
    "venture capital",
    where={"source": "chromadb/paul_graham_essay"},
    k=5
)

[{'id': 'pg_0',
  'score': 0.0,
  'preview': '0\n79\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n1\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\n41\n42\n43\n44\n45\n46\n47\n48\n49\n50\n51\n52\n53\n54\n55\n56'}]

## Hybrid search WITH pre-filter (real-world default)

This is the industry-standard configuration:

filter → recall → rerank → fuse

In [22]:
def hybrid_search_filtered(query, where, k_dense=10, k_lex=10, k_final=5):
    # Dense (already supports pre-filter)
    dense_ids = collection.query(
        query_texts=[query],
        n_results=k_dense,
        where=where
    )["ids"][0]

    # Lexical (manual filter)
    lex_ids = [
        r["id"]
        for r in bm25_search_filtered(query, where=where, k=k_lex)
    ]

    fused = rrf([dense_ids, lex_ids], k=60)
    top = [i for i, _ in fused[:k_final]]

    got = collection.get(ids=top, include=["documents","metadatas"])
    id_to_idx = {i: idx for idx, i in enumerate(got["ids"])}

    return [
        (_id, got["metadatas"][id_to_idx[_id]], got["documents"][id_to_idx[_id]][:160])
        for _id in top
    ]

In [23]:
hybrid_search_filtered(
    "startup advice about writing and learning",
    where={"source": "chromadb/paul_graham_essay"},
    k_final=5
)

[('pg_0',
  {'chunk': 0, 'source': 'chromadb/paul_graham_essay'},
  '0\n79\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n1\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\n41\n42\n43\n44\n45\n46\n47\n48\n49\n50\n51\n52\n53\n54\n55\n56')]

## Demo comparison

In [24]:
collection.add(
    ids=[f"other_{i}" for i in range(5)],
    documents=[
        "This document is about cooking and baking.",
        "This document is about skincare routines.",
        "This document is about cloud architecture.",
        "This document is about accounting standards.",
        "This document is about vector databases.",
    ],
    metadatas=[{"source":"other", "chunk": i} for i in range(5)]
)

In [25]:
dense_search("startup advice", k=5)

[{'id': 'other_0',
  'distance': 0.9585310220718384,
  'meta': {'chunk': 0, 'source': 'other'},
  'preview': 'This document is about cooking and baking.'},
 {'id': 'other_2',
  'distance': 0.9839558601379395,
  'meta': {'source': 'other', 'chunk': 2},
  'preview': 'This document is about cloud architecture.'},
 {'id': 'other_1',
  'distance': 0.9861060380935669,
  'meta': {'source': 'other', 'chunk': 1},
  'preview': 'This document is about skincare routines.'},
 {'id': 'other_4',
  'distance': 1.0113791227340698,
  'meta': {'source': 'other', 'chunk': 4},
  'preview': 'This document is about vector databases.'},
 {'id': 'other_3',
  'distance': 1.0553466081619263,
  'meta': {'chunk': 3, 'source': 'other'},
  'preview': 'This document is about accounting standards.'}]

In [35]:
import time

def dense_search_filtered_timed(query, where=None, k=5, repeat=1):
    # warm-up
    if where is None:
        collection.query(query_texts=[query], n_results=1)
    else:
        collection.query(query_texts=[query], n_results=1, where=where)

    t0 = time.perf_counter()
    for _ in range(repeat):
        if where is None:
            r = collection.query(
                query_texts=[query],
                n_results=k,
                include=["documents","metadatas","distances"]
            )
        else:
            r = collection.query(
                query_texts=[query],
                n_results=k,
                where=where,
                include=["documents","metadatas","distances"]
            )
    t1 = time.perf_counter()

    elapsed_ms = (t1 - t0) * 1000 / repeat

    out = []
    for _id, doc, meta, dist in zip(
        r["ids"][0], r["documents"][0], r["metadatas"][0], r["distances"][0]
    ):
        out.append({
            "id": _id,
            "distance": float(dist),
            "meta": meta,
            "preview": doc[:160]
        })

    print(f"where={where if where is not None else 'NO FILTER'} | avg latency: {elapsed_ms:.2f} ms")
    return out

In [36]:
dense_search_filtered_timed(
    "startup advice",
    where={"source":"chromadb/paul_graham_essay"},
    k=5,
    repeat=5
)

dense_search_filtered_timed(
    "startup advice",
    where={"source":"other"},
    k=5,
    repeat=5
)

where={'source': 'chromadb/paul_graham_essay'} | avg latency: 16.01 ms
where={'source': 'other'} | avg latency: 16.35 ms


[{'id': 'other_0',
  'distance': 0.9585310220718384,
  'meta': {'source': 'other', 'chunk': 0},
  'preview': 'This document is about cooking and baking.'},
 {'id': 'other_2',
  'distance': 0.9839558601379395,
  'meta': {'chunk': 2, 'source': 'other'},
  'preview': 'This document is about cloud architecture.'},
 {'id': 'other_1',
  'distance': 0.9861060380935669,
  'meta': {'chunk': 1, 'source': 'other'},
  'preview': 'This document is about skincare routines.'},
 {'id': 'other_4',
  'distance': 1.0113791227340698,
  'meta': {'source': 'other', 'chunk': 4},
  'preview': 'This document is about vector databases.'},
 {'id': 'other_3',
  'distance': 1.0553466081619263,
  'meta': {'source': 'other', 'chunk': 3},
  'preview': 'This document is about accounting standards.'}]

In [38]:
# no filter
dense_search_filtered_timed(
    "startup advice",
    where=None,
    k=5,
    repeat=5
)

# with filter
dense_search_filtered_timed(
    "startup advice",
    where={"source":"chromadb/paul_graham_essay"},
    k=5,
    repeat=5
)

where=NO FILTER | avg latency: 16.18 ms
where={'source': 'chromadb/paul_graham_essay'} | avg latency: 16.07 ms


[{'id': 'pg_0',
  'distance': 1.0795005559921265,
  'meta': {'source': 'chromadb/paul_graham_essay', 'chunk': 0},
  'preview': '0\n79\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n1\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\n41\n42\n43\n44\n45\n46\n47\n48\n49\n50\n51\n52\n53\n54\n55\n56'}]

[{'id': 'pg_0',
  'distance': 1.0795005559921265,
  'meta': {'source': 'chromadb/paul_graham_essay', 'chunk': 0},
  'preview': '0\n79\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n1\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\n41\n42\n43\n44\n45\n46\n47\n48\n49\n50\n51\n52\n53\n54\n55\n56'}]

In [39]:
def candidate_count(where=None):
    if where is None:
        return len(collection.get(include=[]).get("ids", []))
    return len(collection.get(where=where, include=[]).get("ids", []))

print("All candidates:", candidate_count())
print("PG candidates:", candidate_count({"source":"chromadb/paul_graham_essay"}))
print("Other candidates:", candidate_count({"source":"other"}))

All candidates: 6
PG candidates: 1
Other candidates: 5


What this demonstrates (say this explicitly)

Filtering happens before similarity search

Smaller candidate pool → lower latency

With tiny data, difference is small

At scale, this is non-optional

Use this sentence verbatim if you want:

“Metadata filtering doesn’t change ranking logic; it shrinks the search space.
The speedup compounds as the corpus grows.”

In [40]:
# --- CHUNKING STRATEGY TUNING + INCREMENTAL UPDATES (HF embeddings + BM25) ---
# Assumes you already have:
#   client (chromadb.PersistentClient)
#   dense_ef (SentenceTransformerEmbeddingFunction)
#   collection (current collection)
#   tokenize(), rebuild_bm25() from earlier
#   bm25, bm25_ids, bm25_docs already built at least once

import time
from datasets import load_dataset

# ----------------------------
# 1) Chunking strategy tuning
# ----------------------------

def chunk_text(s, chunk_size=1200, overlap=200):
    chunks = []
    start = 0
    n = len(s)
    while start < n:
        end = min(n, start + chunk_size)
        chunk = s[start:end].strip()
        if chunk:
            chunks.append(chunk)
        start = end - overlap
        if start < 0:
            start = 0
        if end == n:
            break
    return chunks


def create_collection_from_text(
    client,
    name,
    text,
    chunk_size,
    overlap,
    source="pg",
    embedding_function=None,
    batch_size=64,
    wipe_if_exists=True,
):
    # recreate the collection for a clean comparison
    if wipe_if_exists:
        try:
            client.delete_collection(name)
        except Exception:
            pass

    coll = client.get_or_create_collection(name=name, embedding_function=embedding_function)

    chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
    ids = [f"{source}_{i}" for i in range(len(chunks))]
    metas = [{"source": source, "chunk": i, "chunk_size": chunk_size, "overlap": overlap} for i in range(len(chunks))]

    # batched add to avoid Colab spikes
    for i in range(0, len(chunks), batch_size):
        coll.add(
            ids=ids[i:i+batch_size],
            documents=chunks[i:i+batch_size],
            metadatas=metas[i:i+batch_size],
        )
    return coll, len(chunks)


def timed_dense_query(coll, query, k=5, repeat=5):
    # warmup
    coll.query(query_texts=[query], n_results=1)
    t0 = time.perf_counter()
    for _ in range(repeat):
        r = coll.query(query_texts=[query], n_results=k)
    t1 = time.perf_counter()
    return (t1 - t0) * 1000 / repeat, r


# Load the essay text once (streaming, capped)
ds = load_dataset("chromadb/paul_graham_essay", split="data", streaming=True)
MAX_CHARS = 250_000
texts, total = [], 0
for row in ds:
    t = row.get("text") or next(v for v in row.values() if isinstance(v, str))
    if not t:
        continue
    if total + len(t) > MAX_CHARS:
        t = t[: max(0, MAX_CHARS - total)]
    texts.append(t)
    total += len(t)
    if total >= MAX_CHARS:
        break
essay_text = "\n".join(texts)

# Build 3 variants to compare
variants = [
    ("pg_cs400_ov50",  400,  50),
    ("pg_cs800_ov100", 800,  100),
    ("pg_cs1200_ov200",1200, 200),
]

query = "startup advice about writing and learning"
results = []

for name, cs, ov in variants:
    coll, n_chunks = create_collection_from_text(
        client=client,
        name=name,
        text=essay_text,
        chunk_size=cs,
        overlap=ov,
        source="pg",
        embedding_function=dense_ef,
        batch_size=64,
        wipe_if_exists=True,
    )
    ms, r = timed_dense_query(coll, query, k=5, repeat=5)
    results.append((name, cs, ov, n_chunks, ms, r["ids"][0]))

print("Chunking comparison (name, chunk_size, overlap, #chunks, avg_ms, top_ids):")
for row in results:
    print(row[0], row[1], row[2], row[3], f"{row[4]:.2f}ms", row[5][:3])


# -------------------------------------
# 2) Incremental updates (no full reindex)
# -------------------------------------
# Pattern:
#   - Add only new docs (new IDs)
#   - Chroma embeds only the new docs (because embedding_function is attached)
#   - Rebuild BM25 (or incrementally update; rebuild is simplest)

def incremental_add_documents(coll, new_documents, source="daily", start_id=0, batch_size=64):
    ids = [f"{source}_{start_id+i}" for i in range(len(new_documents))]
    metas = [{"source": source, "chunk": start_id+i} for i in range(len(new_documents))]

    for i in range(0, len(new_documents), batch_size):
        coll.add(
            ids=ids[i:i+batch_size],
            documents=new_documents[i:i+batch_size],
            metadatas=metas[i:i+batch_size],
        )
    return ids


# Example: simulate "daily new docs"
new_docs_day1 = [
    "Daily update: We shipped a new feature that improves retrieval quality for exact names.",
    "Daily update: Hybrid retrieval is now default for compliance-sensitive workflows.",
    "Daily update: Metadata filters reduced latency by restricting the candidate pool.",
]

before = collection.count()
added_ids = incremental_add_documents(collection, new_docs_day1, source="daily", start_id=0, batch_size=16)
after = collection.count()
print("Count before:", before, "after:", after, "added:", len(added_ids))

# Rebuild lexical index after updates (simple + standard)
bm25, bm25_ids, bm25_docs = rebuild_bm25(collection)
print("BM25 rebuilt on docs:", len(bm25_docs))


# Optional: quick verification queries
def bm25_search_ids(query, k=5):
    if bm25 is None:
        return []
    scores = bm25.get_scores(tokenize(query))
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    return [bm25_ids[i] for i in ranked]

print("Dense top ids:", collection.query(query_texts=["metadata filters latency"], n_results=5)["ids"][0])
print("Lexical top ids:", bm25_search_ids("metadata filters latency", k=5))

Chunking comparison (name, chunk_size, overlap, #chunks, avg_ms, top_ids):
pg_cs400_ov50 400 50 1 17.54ms ['pg_0']
pg_cs800_ov100 800 100 1 23.76ms ['pg_0']
pg_cs1200_ov200 1200 200 1 29.91ms ['pg_0']
Count before: 6 after: 9 added: 3
BM25 rebuilt on docs: 9
Dense top ids: ['daily_2', 'daily_1', 'daily_0', 'other_4', 'other_2']
Lexical top ids: ['daily_2', 'pg_0', 'other_0', 'other_1', 'other_2']
