# Notebook 02 — Chroma indexing + persistence (LlamaIndex + OpenAI embeddings)

This notebook loads the existing deterministic notes dataset, chunks it into nodes, and then **builds or loads** a persisted Chroma index using OpenAI embeddings.

It only validates indexing + retrieval behavior (no generation/answer synthesis yet).


In [None]:
from __future__ import annotations

import os
import sys
from pathlib import Path

import pandas as pd

def _find_project_root() -> Path:
    """Locate the project root by walking up from the current working directory.

    Every directory from the resolved cwd up to the filesystem root is
    inspected, nearest first. A directory qualifies when it contains
    ``src/config.py``; failing that, its ``agentic-rag-second-brain`` child
    qualifies when that child contains ``src/config.py``.

    Returns:
        The first qualifying directory found on the walk.

    Raises:
        RuntimeError: If no directory on the walk qualifies.
    """
    marker = Path("src") / "config.py"
    start = Path.cwd().resolve()
    for ancestor in [start, *start.parents]:
        # The directory itself takes precedence over its nested checkout.
        for candidate in (ancestor, ancestor / "agentic-rag-second-brain"):
            if (candidate / marker).exists():
                return candidate
    raise RuntimeError("Could not locate project root containing src/config.py")

# Resolve the project root once, put it on sys.path so the `src` package can
# be imported, and switch the working directory to it.
PROJECT_ROOT = _find_project_root()
_project_root_str = str(PROJECT_ROOT)
if _project_root_str not in sys.path:
    # Position 0 so this checkout is searched before any other sys.path entry.
    sys.path.insert(0, _project_root_str)
os.chdir(PROJECT_ROOT)

from src.config import settings
from src.dataset import ensure_dataset_exists
from src.ingestion import chunk_documents, load_markdown_documents
from src.index_store import build_or_load_index

# Runtime configuration: each value is read from the environment when the
# variable is set, otherwise from the matching attribute on `settings`.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "").strip()
EMBED_MODEL = os.getenv("EMBED_MODEL", settings.embed_model)
CHROMA_DIR = Path(os.getenv("CHROMA_DIR", settings.chroma_dir)).resolve()
# Only the exact string "1" (after trimming whitespace) enables a reset.
# NOTE(review): the fallback must be a str for `.strip()` to work — confirm
# the type of `settings.reset_index`.
_reset_flag = os.getenv("RESET_INDEX", settings.reset_index)
RESET_INDEX = _reset_flag.strip() == "1"
TOP_K = int(os.getenv("TOP_K", settings.top_k))

# Echo the effective configuration. The API key itself is never printed,
# only whether one was found.
_config_report = {
    "PROJECT_ROOT": PROJECT_ROOT,
    "EMBED_MODEL": EMBED_MODEL,
    "CHROMA_DIR": CHROMA_DIR,
    "RESET_INDEX": RESET_INDEX,
    "TOP_K": TOP_K,
    "OPENAI_API_KEY set": "yes" if OPENAI_API_KEY else "no",
}
print("Config:")
print("\n".join(f"- {name}: {value}" for name, value in _config_report.items()))

In [None]:
# Fail fast with an actionable message when no API key was found, instead of
# continuing into the cells that follow.
_missing_key_message = (
    "OPENAI_API_KEY is required for Notebook 02. "
    "Set it before running, for example: `export OPENAI_API_KEY='your-key'`."
)
if not OPENAI_API_KEY:
    raise EnvironmentError(_missing_key_message)

In [None]:
# Ensure the notes dataset is in place before it is loaded below, and show the
# summary the helper returns.
# NOTE(review): force_rebuild=False presumably reuses an existing dataset
# rather than regenerating it — confirm against src.dataset.
dataset_summary = ensure_dataset_exists(force_rebuild=False)
print(dataset_summary)

In [None]:
# Load the markdown notes under data/raw/notes and split them into nodes.
notes_dir = PROJECT_ROOT.joinpath("data", "raw", "notes")
documents = load_markdown_documents(notes_dir)
nodes = chunk_documents(documents)

# Report the counts, plus the metadata keys of the first node as a quick
# check of what each chunk carries.
print(f"Loaded documents: {len(documents)}")
print(f"Chunked nodes: {len(nodes)}")
if nodes:
    first_node = nodes[0]
    print("Sample metadata keys:", sorted(first_node.metadata.keys()))

In [None]:
# Build or load the index using the configuration resolved earlier in the
# notebook. The result is read as a mapping below.
index_info = build_or_load_index(
    nodes=nodes, reset=RESET_INDEX, chroma_dir=CHROMA_DIR, embed_model=EMBED_MODEL
)
index = index_info["index"]

# Summarise the outcome, one "Label: value" line per reported field.
for _label, _key in (
    ("Collection", "collection_name"),
    ("Built this run", "built"),
    ("Persist dir", "chroma_dir"),
    ("Vector count", "vector_count"),
):
    print(f"{_label}: {index_info[_key]}")

In [None]:
def _result_row(hit):
    """Flatten one retrieval result into a plain dict for tabular display."""
    source_node = hit.node
    metadata = source_node.metadata
    raw_score = hit.score
    return {
        # A missing score stays None rather than being coerced to a number.
        "score": None if raw_score is None else float(raw_score),
        "doc_date": metadata.get("doc_date", ""),
        "doc_title": metadata.get("doc_title", ""),
        "chunk_id": metadata.get("chunk_id", ""),
        "source_path": metadata.get("source_path", ""),
        # First 200 characters on a single line, to keep the table readable.
        "text_preview": source_node.get_content()[:200].replace("\n", " "),
    }


# Run one fixed query through the retriever and tabulate the top-k results.
query = "What embedding model is the most recent recommendation?"
retriever = index.as_retriever(similarity_top_k=TOP_K)
results = retriever.retrieve(query)

rows = [_result_row(hit) for hit in results]

results_df = pd.DataFrame(rows)
results_df

### Rebuild behavior for repeated demos

- Default behavior (`RESET_INDEX=0`) loads the existing persisted Chroma index if present.
- Set `RESET_INDEX=1` to wipe `CHROMA_DIR` and fully rebuild vectors.
