# Notebook 02 — Chroma indexing + persistence (LlamaIndex + OpenAI embeddings)

This notebook loads the existing deterministic notes dataset, chunks it into nodes, and then **builds or loads** a persisted Chroma index using OpenAI embeddings.

It only validates indexing + retrieval behavior (no generation/answer synthesis yet).


In [1]:
from __future__ import annotations

import os
import sys
from pathlib import Path

import pandas as pd

def _find_project_root() -> Path:
    """Locate the project directory that contains ``src/config.py``.

    The search starts at the resolved current working directory and walks up
    through each parent. At every level the directory itself is tried first,
    then its ``agentic-rag-second-brain`` child, so the notebook works whether
    it is launched from inside the project or from a folder that contains it.

    Returns:
        The first directory (closest to the working directory) that holds
        ``src/config.py``.

    Raises:
        RuntimeError: If no level of the walk contains the marker file.
    """
    marker = Path("src") / "config.py"
    start = Path.cwd().resolve()
    for ancestor in [start, *start.parents]:
        # Order matters: the ancestor itself wins over its nested checkout.
        for candidate in (ancestor, ancestor / "agentic-rag-second-brain"):
            if (candidate / marker).exists():
                return candidate
    raise RuntimeError("Could not locate project root containing src/config.py")

# Anchor the notebook to the project root: put it on sys.path (once) so the
# `src.*` imports below resolve, and make it the working directory so relative
# paths are interpreted from there.
PROJECT_ROOT = _find_project_root()
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.insert(0, str(PROJECT_ROOT))
os.chdir(PROJECT_ROOT)

from src.config import settings
from src.dataset import ensure_dataset_exists
from src.ingestion import chunk_documents, load_markdown_documents
from src.index_store import build_or_load_index

# Effective configuration: an environment variable, when present, takes
# precedence over the matching default on `settings`.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "").strip()
EMBED_MODEL = os.environ.get("EMBED_MODEL", settings.embed_model)
CHROMA_DIR = Path(os.environ.get("CHROMA_DIR", settings.chroma_dir)).resolve()
# Only the exact value "1" (after trimming whitespace) turns the reset on.
RESET_INDEX = os.environ.get("RESET_INDEX", settings.reset_index).strip() == "1"
TOP_K = int(os.environ.get("TOP_K", settings.top_k))

# Echo the resolved values; the API key is reported only as present/absent so
# the secret never lands in the saved notebook output.
print(
    "\n".join(
        (
            "Config:",
            f"- PROJECT_ROOT: {PROJECT_ROOT}",
            f"- EMBED_MODEL: {EMBED_MODEL}",
            f"- CHROMA_DIR: {CHROMA_DIR}",
            f"- RESET_INDEX: {RESET_INDEX}",
            f"- TOP_K: {TOP_K}",
            f"- OPENAI_API_KEY set: {'yes' if OPENAI_API_KEY else 'no'}",
        )
    )
)

Config:
- PROJECT_ROOT: C:\Repos\Intro-to-RAG-Agentic-RAG-2602\agentic-rag-second-brain
- EMBED_MODEL: text-embedding-3-small
- CHROMA_DIR: C:\Repos\Intro-to-RAG-Agentic-RAG-2602\agentic-rag-second-brain\data\processed\chroma
- RESET_INDEX: False
- TOP_K: 6
- OPENAI_API_KEY set: yes


In [2]:
# Fail fast before any embedding work is attempted when no API key is set.
# (EnvironmentError is an alias of OSError, so the raised type is unchanged.)
if not OPENAI_API_KEY:
    raise OSError(
        "OPENAI_API_KEY is required for Notebook 02. "
        "Set it before running, for example: `export OPENAI_API_KEY='your-key'`."
    )

In [3]:
# Check the notes dataset via src.dataset and print the summary it returns
# (in the captured run: notes_dir, num_notes, filenames, created_or_updated,
# force_rebuild).
# NOTE(review): force_rebuild=False presumably leaves existing note files
# untouched -- confirm against src.dataset.ensure_dataset_exists.
dataset_summary = ensure_dataset_exists(force_rebuild=False)
print(dataset_summary)

{'notes_dir': 'data\\raw\\notes', 'num_notes': 14, 'filenames': ['2025-01-10-embedding-model-cost-first.md', '2025-02-02-chunking-large-windows.md', '2025-03-07-meeting-search-quality.md', '2025-03-18-embedding-evaluation-q1.md', '2025-04-14-chunking-feedback.md', '2025-05-22-research-hybrid-retrieval.md', '2025-06-30-meeting-onboarding-notes.md', '2025-07-05-embedding-model-quality-shift.md', '2025-08-15-research-metadata-schema.md', '2025-09-03-chunking-small-overlap.md', '2025-10-02-meeting-demo-retro.md', '2025-10-21-embedding-rollout-postmortem.md', '2025-11-12-chunking-maintenance.md', '2025-12-01-roadmap-notes-q1.md'], 'created_or_updated': [], 'force_rebuild': False}


In [3]:
# Read the markdown notes from <project>/data/raw/notes and split them into
# nodes for indexing.
notes_dir = PROJECT_ROOT.joinpath("data", "raw", "notes")
documents = load_markdown_documents(notes_dir)
nodes = chunk_documents(documents)

# Report the counts, plus the metadata keys of the first node when any exist.
print(
    f"Loaded documents: {len(documents)}",
    f"Chunked nodes: {len(nodes)}",
    sep="\n",
)
if nodes:
    print("Sample metadata keys:", sorted(nodes[0].metadata.keys()))

Loaded documents: 14
Chunked nodes: 14
Sample metadata keys: ['chunk_id', 'doc_date', 'doc_id', 'doc_title', 'source_path', 'tags']


In [4]:
# Build the Chroma index or load the persisted one, as decided by
# build_or_load_index using the configuration resolved earlier.
index_info = build_or_load_index(
    nodes=nodes, reset=RESET_INDEX, chroma_dir=CHROMA_DIR, embed_model=EMBED_MODEL
)
index = index_info["index"]

# Summarise what the call reported about the collection.
print(
    f"Collection: {index_info['collection_name']}\n"
    f"Built this run: {index_info['built']}\n"
    f"Persist dir: {index_info['chroma_dir']}\n"
    f"Vector count: {index_info['vector_count']}"
)

Collection: notes
Built this run: False
Persist dir: C:\Repos\Intro-to-RAG-Agentic-RAG-2602\agentic-rag-second-brain\data\processed\chroma
Vector count: 14


In [5]:
# Retrieval smoke test: fetch the TOP_K most similar nodes for one query and
# show them as a table (no answer generation happens here).
query = "What embedding model is the most recent recommendation?"
retriever = index.as_retriever(similarity_top_k=TOP_K)
results = retriever.retrieve(query)

# Flatten each hit into one row; a missing metadata key becomes "" and a
# missing score stays None instead of raising in float().
rows = []
for result in results:
    node = result.node
    rows.append(
        {
            "score": float(result.score) if result.score is not None else None,
            "doc_date": node.metadata.get("doc_date", ""),
            "doc_title": node.metadata.get("doc_title", ""),
            "chunk_id": node.metadata.get("chunk_id", ""),
            "source_path": node.metadata.get("source_path", ""),
            # First 200 characters, flattened to one line for table display.
            "text_preview": node.get_content()[:200].replace("\n", " "),
        }
    )

# Naming the columns explicitly keeps the table schema stable even when the
# retriever returns no hits (an empty list alone yields a column-less frame).
results_df = pd.DataFrame(
    rows,
    columns=[
        "score",
        "doc_date",
        "doc_title",
        "chunk_id",
        "source_path",
        "text_preview",
    ],
)
results_df

Unnamed: 0,score,doc_date,doc_title,chunk_id,source_path,text_preview
0,0.420195,2025-07-05,Embedding Model Decision Update: Quality Priority,1843528f9966ef38e563dc60ec056795eab0a0b1:7,C:\Repos\Intro-to-RAG-Agentic-RAG-2602\agentic...,Query logs now show many semantically subtle q...
1,0.399181,2025-01-10,Embedding Model Decision: Cost-First Default,627528ea8b8af5f59df5fae9b902a22869e8e53f:0,C:\Repos\Intro-to-RAG-Agentic-RAG-2602\agentic...,We should standardize on EmbedLite-v1 for now ...
2,0.389766,2025-03-18,Q1 Embedding Evaluation Notes,aa23b701925fb16b6312fa7dc6f53474541c46fc:3,C:\Repos\Intro-to-RAG-Agentic-RAG-2602\agentic...,Compared EmbedLite-v1 and EmbedPro-v2 on histo...
3,0.345105,2025-10-21,Embedding Rollout Postmortem,3ce7ccdc6cae9be14a952f15d541a4f87c73ec51:11,C:\Repos\Intro-to-RAG-Agentic-RAG-2602\agentic...,"After switching to EmbedPro-v2, retrieval qual..."
4,0.273567,2025-10-02,Demo Retro: Internal Stakeholder Session,23d249bcab98910ef4133cf9ece7801e2b179a31:10,C:\Repos\Intro-to-RAG-Agentic-RAG-2602\agentic...,Stakeholders responded positively to timeline-...
5,0.272027,2025-05-22,Research Snippet: Hybrid Retrieval,79c9b8fbd7b9030b5f342d8f45587b0bb8c4a3fb:5,C:\Repos\Intro-to-RAG-Agentic-RAG-2602\agentic...,A short literature scan suggests dense+sparse ...


### Rebuild behavior for repeated demos

- Default behavior (`RESET_INDEX=0`) loads the existing persisted Chroma index if present.
- Set `RESET_INDEX=1` to wipe `CHROMA_DIR` and fully rebuild vectors.
