# Timeframe drift dataset + LlamaIndex ingestion

Create a deterministic markdown note corpus, ingest it with LlamaIndex, and preview timeframe drift.

In [2]:
from pathlib import Path
import os
import sys

# Make the local `src` package importable and run the notebook from the project root.
launch_dir = Path.cwd().resolve()
# When the kernel was started inside notebooks/, the project root is one level up.
project_root = launch_dir.parent if launch_dir.name == 'notebooks' else launch_dir
root_entry = str(project_root)
if root_entry not in sys.path:
    # Prepend so the project's own `src` is found before anything else on the path.
    sys.path.insert(0, root_entry)
os.chdir(project_root)

In [3]:
import pandas as pd
from src.config import config
from src.dataset import ensure_dataset_exists
from src.ingestion import load_markdown_documents, chunk_documents


In [4]:
# Forwarded to ensure_dataset_exists; presumably True regenerates the corpus — confirm in src.dataset.
force_rebuild = False
summary = ensure_dataset_exists(force_rebuild=force_rebuild)

# The summary dict carries the note count and the note filenames; show the first five names.
note_count, note_filenames = summary['num_notes'], summary['filenames']
print('Notes created:', note_count)
print('Sample filenames:', note_filenames[:5])


Notes created: 14
Sample filenames: ['2025-01-10-embedding-model-cost-first.md', '2025-02-02-chunking-large-windows.md', '2025-03-07-meeting-search-quality.md', '2025-03-18-embedding-evaluation-q1.md', '2025-04-14-chunking-feedback.md']


In [5]:
def parse_frontmatter(text):
    """Extract title, date and tags from a note's `---`-delimited frontmatter.

    The frontmatter is the text between the first and second `---` in `text`.
    Inside it, a line starting with `title:` or `date:` contributes the value
    after the first colon (stripped), and a `tags:` line is followed by list
    items, i.e. lines whose first non-blank character is `-`.

    Args:
        text: Full contents of a markdown note.

    Returns:
        A dict with keys 'title' (str), 'date' (str) and 'tags' (list of str).
        Fields that are absent keep their blank defaults ('' or []). A note
        without a closed frontmatter block yields all-blank metadata instead
        of raising.
    """
    meta = {'title': '', 'date': '', 'tags': []}

    # Only the first two delimiters matter; capping the split also leaves any
    # later '---' (e.g. a horizontal rule in the body) untouched.
    parts = text.split('---', 2)
    if len(parts) < 3:
        # No opening/closing pair: do not guess at metadata from the note body.
        return meta

    lines = parts[1].strip().splitlines()
    i = 0
    while i < len(lines):
        line = lines[i]
        if line.startswith('title:'):
            # Split on the first colon only, titles may contain colons themselves.
            meta['title'] = line.split(':', 1)[1].strip()
        elif line.startswith('date:'):
            meta['date'] = line.split(':', 1)[1].strip()
        elif line.startswith('tags:'):
            tags = []
            i += 1
            # Consume the run of '- item' lines that follows the tags key.
            while i < len(lines) and lines[i].lstrip().startswith('-'):
                tags.append(lines[i].split('-', 1)[1].strip())
                i += 1
            meta['tags'] = tags
            # i already points at the first line after the tag list.
            continue
        i += 1
    return meta


rows = []
for path in sorted(config.DATA_RAW_NOTES_DIR.glob('*.md')):
    text = path.read_text(encoding='utf-8')
    rows.append({'filename': path.name, **parse_frontmatter(text)})

# Naming the columns keeps sort_values('date') valid even when no notes were found.
notes_df = (
    pd.DataFrame(rows, columns=['filename', 'title', 'date', 'tags'])
    .sort_values('date')
    .reset_index(drop=True)
)
notes_df[['title', 'date', 'tags', 'filename']]


Unnamed: 0,title,date,tags,filename
0,Embedding Model Decision: Cost-First Default,2025-01-10,"[embeddings, architecture, cost]",2025-01-10-embedding-model-cost-first.md
1,Chunking Strategy v1: Large Windows,2025-02-02,"[chunking, retrieval]",2025-02-02-chunking-large-windows.md
2,Weekly Meeting: Search Quality Review,2025-03-07,"[meeting, search]",2025-03-07-meeting-search-quality.md
3,Q1 Embedding Evaluation Notes,2025-03-18,"[embeddings, evaluation]",2025-03-18-embedding-evaluation-q1.md
4,Chunking Feedback from Pilot,2025-04-14,"[chunking, evaluation]",2025-04-14-chunking-feedback.md
5,Research Snippet: Hybrid Retrieval,2025-05-22,"[research, retrieval]",2025-05-22-research-hybrid-retrieval.md
6,Onboarding Meeting Notes,2025-06-30,"[meeting, onboarding]",2025-06-30-meeting-onboarding-notes.md
7,Embedding Model Decision Update: Quality Priority,2025-07-05,"[embeddings, architecture, quality]",2025-07-05-embedding-model-quality-shift.md
8,Metadata Schema Research,2025-08-15,"[research, metadata]",2025-08-15-research-metadata-schema.md
9,Chunking Strategy v2: Smaller Chunks + Overlap,2025-09-03,"[chunking, retrieval, quality]",2025-09-03-chunking-small-overlap.md


In [6]:
# Load the notes from the raw notes directory, chunk them, and report both counts.
documents = load_markdown_documents(config.DATA_RAW_NOTES_DIR)
nodes = chunk_documents(documents)
for count_label, collection in (
    ('Number of documents:', documents),
    ('Number of chunks/nodes:', nodes),
):
    print(count_label, len(collection))


Number of documents: 14
Number of chunks/nodes: 14


In [7]:
# Print the metadata and the first 300 characters of text for the first three nodes.
separator = '-' * 80
for sample in nodes[:3]:
    meta = sample.metadata
    print(separator)
    # Each pair is (printed label, metadata key); missing keys print as None.
    for label, key in (
        ('doc_title:', 'doc_title'),
        ('doc_date:', 'doc_date'),
        ('tags:', 'tags'),
        ('source_path:', 'source_path'),
        ('chunk_id:', 'chunk_id'),
    ):
        print(label, meta.get(key))
    print('chunk_text:', sample.text[:300])


--------------------------------------------------------------------------------
doc_title: Embedding Model Decision: Cost-First Default
doc_date: 2025-01-10
tags: ['embeddings', 'architecture', 'cost']
source_path: C:\Repos\Intro-to-RAG-Agentic-RAG-2602\agentic-rag-second-brain\data\raw\notes\2025-01-10-embedding-model-cost-first.md
chunk_id: 627528ea8b8af5f59df5fae9b902a22869e8e53f:0
chunk_text: We should standardize on EmbedLite-v1 for now because the projected monthly query volume is high and token costs dominate. Retrieval quality is acceptable in internal tests for broad topical queries.

Decision: Use EmbedLite-v1 as default for all note ingestion pipelines until quality complaints inc
--------------------------------------------------------------------------------
doc_title: Chunking Strategy v1: Large Windows
doc_date: 2025-02-02
tags: ['chunking', 'retrieval']
source_path: C:\Repos\Intro-to-RAG-Agentic-RAG-2602\agentic-rag-second-brain\data\raw\notes\2025-02-02-chunking-large

In [8]:
# Collect every node whose text or document title mentions the keyword, then
# order the hits by document date to preview how the topic changes over time.
keyword = 'embedding model'
# Text and titles are lowercased before matching, so the keyword must be too;
# otherwise a keyword containing capitals would never match anything.
needle = keyword.lower()

drift_rows = []
for node in nodes:
    text = node.text.lower()
    title = str(node.metadata.get('doc_title', '')).lower()
    if needle in text or needle in title:
        drift_rows.append({
            'doc_date': node.metadata.get('doc_date'),
            'doc_title': node.metadata.get('doc_title'),
            'tags': node.metadata.get('tags'),
            'chunk_id': node.metadata.get('chunk_id'),
            # Flatten newlines so the preview stays on one table row.
            'preview': node.text[:180].replace('\n', ' '),
        })

# Naming the columns keeps sort_values('doc_date') valid when nothing matched,
# so an unmatched keyword shows an empty table instead of raising KeyError.
drift_df = (
    pd.DataFrame(
        drift_rows,
        columns=['doc_date', 'doc_title', 'tags', 'chunk_id', 'preview'],
    )
    .sort_values('doc_date')
    .reset_index(drop=True)
)
print('Drift preview for keyword:', keyword)
drift_df


Drift preview for keyword: embedding model


Unnamed: 0,doc_date,doc_title,tags,chunk_id,preview
0,2025-01-10,Embedding Model Decision: Cost-First Default,"[embeddings, architecture, cost]",627528ea8b8af5f59df5fae9b902a22869e8e53f:0,We should standardize on EmbedLite-v1 for now ...
1,2025-07-05,Embedding Model Decision Update: Quality Priority,"[embeddings, architecture, quality]",1843528f9966ef38e563dc60ec056795eab0a0b1:7,Query logs now show many semantically subtle q...
2,2025-10-21,Embedding Rollout Postmortem,"[embeddings, postmortem]",3ce7ccdc6cae9be14a952f15d541a4f87c73ec51:11,"After switching to EmbedPro-v2, retrieval qual..."
