# Timeframe drift dataset + LlamaIndex ingestion

Create a deterministic Markdown note corpus, ingest it with LlamaIndex, and preview timeframe drift.

In [None]:
import pandas as pd
from src.config import config
from src.dataset import ensure_dataset_exists
from src.ingestion import load_markdown_documents, chunk_documents


In [None]:
# Flag forwarded to ensure_dataset_exists; presumably True forces the corpus
# to be regenerated (confirm against src.dataset).
force_rebuild = False
summary = ensure_dataset_exists(force_rebuild=force_rebuild)

# Report the note count, then a sample of at most five filenames.
note_count = summary['num_notes']
print('Notes created:', note_count)
sample_filenames = summary['filenames'][:5]
print('Sample filenames:', sample_filenames)


In [None]:
def _parse_frontmatter(text):
    """Extract title, date and tags from a note's leading front matter.

    The front matter is the text between the first two ``---`` delimiters.
    Only three keys are recognised: ``title:``, ``date:`` and ``tags:``, where
    tags are given as ``- item`` lines directly below the ``tags:`` line.

    Args:
        text: Full contents of one note file.

    Returns:
        A dict with keys ``title`` (str), ``date`` (str) and ``tags`` (list of
        str). Missing keys keep their empty defaults; a note without a
        complete ``--- ... ---`` section yields all defaults instead of
        raising.
    """
    meta = {'title': '', 'date': '', 'tags': []}

    # maxsplit=2 is enough to isolate the front matter; fewer than three parts
    # means there is no closing delimiter, so there is nothing to parse.
    parts = text.split('---', 2)
    if len(parts) < 3:
        return meta

    collecting_tags = False
    for line in parts[1].strip().splitlines():
        if collecting_tags and line.lstrip().startswith('-'):
            meta['tags'].append(line.split('-', 1)[1].strip())
            continue
        # The first non-list line ends the tag list and is handled normally.
        collecting_tags = False
        if line.startswith('title:'):
            meta['title'] = line.split(':', 1)[1].strip()
        elif line.startswith('date:'):
            meta['date'] = line.split(':', 1)[1].strip()
        elif line.startswith('tags:'):
            # A repeated 'tags:' key replaces any earlier list.
            meta['tags'] = []
            collecting_tags = True
    return meta


# One row per note file, in filename order.
rows = [
    {'filename': path.name, **_parse_frontmatter(path.read_text(encoding='utf-8'))}
    for path in sorted(config.DATA_RAW_NOTES_DIR.glob('*.md'))
]

# Explicit columns keep the frame well-formed (and sortable) even when the
# notes directory is empty.
note_columns = ['filename', 'title', 'date', 'tags']
notes_df = (
    pd.DataFrame(rows, columns=note_columns)
    .sort_values('date')
    .reset_index(drop=True)
)
notes_df[['title', 'date', 'tags', 'filename']]


In [None]:
# Load the notes from the raw-notes directory and split them into nodes
# (chunking behaviour is defined in src.ingestion).
notes_dir = config.DATA_RAW_NOTES_DIR
documents = load_markdown_documents(notes_dir)
nodes = chunk_documents(documents)

# Summarise how many documents were loaded and how many nodes they produced.
document_count = len(documents)
print('Number of documents:', document_count)
node_count = len(nodes)
print('Number of chunks/nodes:', node_count)


In [None]:
# Show the metadata and the first 300 characters of the first three nodes.
preview_fields = ('doc_title', 'doc_date', 'tags', 'source_path', 'chunk_id')
separator = '-' * 80
for node in nodes[:3]:
    metadata = node.metadata
    print(separator)
    for field in preview_fields:
        # Missing metadata keys print as None rather than raising.
        print(field + ':', metadata.get(field))
    print('chunk_text:', node.text[:300])


In [None]:
# Collect every chunk whose text or document title mentions the keyword, then
# order the hits by document date to preview how the topic changes over time.
keyword = 'embedding model'

# Text and title are lowercased before matching, so the keyword must be too;
# otherwise a mixed-case keyword would never match.
needle = keyword.lower()

drift_rows = []
for node in nodes:
    metadata = node.metadata
    text = node.text.lower()
    title = str(metadata.get('doc_title', '')).lower()
    if needle in text or needle in title:
        drift_rows.append({
            'doc_date': metadata.get('doc_date'),
            'doc_title': metadata.get('doc_title'),
            'tags': metadata.get('tags'),
            'chunk_id': metadata.get('chunk_id'),
            # Single-line preview of the first 180 characters of the chunk.
            'preview': node.text[:180].replace('\n', ' '),
        })

# Explicit columns keep sort_values from raising KeyError when nothing matched.
drift_columns = ['doc_date', 'doc_title', 'tags', 'chunk_id', 'preview']
drift_df = (
    pd.DataFrame(drift_rows, columns=drift_columns)
    .sort_values('doc_date')
    .reset_index(drop=True)
)
print('Drift preview for keyword:', keyword)
drift_df
