# BabyCoach Data Exploration

This notebook explores the structure and content of the Evernote corpus to understand:

1. **Data Structure**: JSON format, fields, and schema
2. **Content Analysis**: Note lengths, tag distribution, date ranges
3. **Chunking Strategy**: Evaluate different approaches for RAG
4. **Embedding Preparation**: Identify optimal text processing pipeline

## Setup

In [None]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Display tweaks for pandas output: no cap on the number of columns shown,
# and up to 100 characters rendered per cell.
display_options = {
    'display.max_columns': None,
    'display.max_colwidth': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

print("✅ Setup complete")

## Load and Inspect Corpus

In [None]:
# Load the corpus data
corpus_path = Path('../data/corpus.json')

# Start from an empty corpus so the name is always bound. Later cells test
# `if corpus_data:`; without this default a missing file would leave the name
# undefined and those cells would raise NameError instead of reporting "no data".
corpus_data = []

if not corpus_path.exists():
    print(f"❌ Corpus file not found at {corpus_path}")
    print("Make sure the symbolic link is set up correctly:")
    print("ln -s ~/Desktop/evernote_rag_export.json ../data/corpus.json")
else:
    print(f"✅ Loading corpus from {corpus_path}")

    # The export is read as UTF-8 JSON in one go.
    with open(corpus_path, 'r', encoding='utf-8') as f:
        corpus_data = json.load(f)

    print(f"📊 Loaded {len(corpus_data)} notes")
    print(f"📁 File size: {corpus_path.stat().st_size / (1024*1024):.1f} MB")

In [None]:
# Examine the structure of the first note in the corpus
def format_note_field(key, value, max_chars=200):
    """Return the ``key: value`` line shown in the sample-note preview.

    Only the ``text`` field is shortened: when its string form is longer than
    ``max_chars`` characters, the first ``max_chars`` characters are kept and
    ``...`` is appended. Every other field is rendered in full.
    """
    if key == 'text':
        rendered = str(value)
        if len(rendered) > max_chars:
            return f"{key}: {rendered[:max_chars]}..."
    return f"{key}: {value}"


print("=== Sample Note Structure ===")
# Look the corpus up by name: if the load cell could not define it (file
# missing), a bare `corpus_data` reference would raise NameError and the
# "No data loaded" branch below would never be reached.
loaded_notes = globals().get('corpus_data')
if loaded_notes:
    sample_note = loaded_notes[0]
    print(f"Sample note keys: {list(sample_note.keys())}")
    print("\n=== Sample Note ===\n")
    for key, value in sample_note.items():
        print(format_note_field(key, value))
else:
    print("No data loaded")

## Data Analysis

In [None]:
# Convert to DataFrame for easier analysis
def note_to_row(note):
    """Flatten one corpus note into a row dict for the analysis DataFrame.

    A missing or null ``text``, ``metadata`` or ``tags`` entry is treated as
    empty, so a single incomplete note does not abort the whole conversion.
    ``id`` is still required and raises ``KeyError`` when absent.

    Returns a dict with the keys ``id``, ``text``, ``text_length``, ``title``,
    ``tags``, ``tag_count``, ``created`` and ``source``.
    """
    metadata = note.get('metadata') or {}
    text = note.get('text') or ''
    tags = metadata.get('tags') or []
    return {
        'id': note['id'],
        'text': text,
        'text_length': len(text),
        'title': metadata.get('title', ''),
        'tags': tags,
        'tag_count': len(tags),
        'created': metadata.get('created', ''),
        'source': metadata.get('source', ''),
    }


# Look the corpus up by name so that a failed load (name never defined) reaches
# the "No data to analyze" branch instead of raising NameError.
loaded_notes = globals().get('corpus_data')
if loaded_notes:
    # Build all rows first and construct the frame once.
    df = pd.DataFrame([note_to_row(note) for note in loaded_notes])
    print(f"📊 DataFrame created with {len(df)} notes")
    print(f"📋 Columns: {list(df.columns)}")

    # Basic statistics
    print("\n=== Basic Statistics ===")
    print(df[['text_length', 'tag_count']].describe())
else:
    print("No data to analyze")

In [None]:
# Text length distribution
if 'df' in locals():
    text_lengths = df['text_length']
    tag_totals = df['tag_count']

    # Two side-by-side panels on a single 12x4 figure
    fig, (length_ax, tags_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: note lengths in 50 bins, with counts on a log scale
    length_ax.hist(text_lengths, bins=50, alpha=0.7, edgecolor='black')
    length_ax.set_xlabel('Text Length (characters)')
    length_ax.set_ylabel('Number of Notes')
    length_ax.set_title('Distribution of Note Text Lengths')
    length_ax.set_yscale('log')

    # Right panel: integer bin edges from 0 up to (largest tag count + 1)
    tag_bin_edges = range(0, max(tag_totals) + 2)
    tags_ax.hist(tag_totals, bins=tag_bin_edges, alpha=0.7, edgecolor='black')
    tags_ax.set_xlabel('Number of Tags')
    tags_ax.set_ylabel('Number of Notes')
    tags_ax.set_title('Distribution of Tag Counts')

    fig.tight_layout()
    plt.show()

    # Headline numbers to accompany the figure
    print(f"📏 Text length range: {text_lengths.min()} - {text_lengths.max()} characters")
    print(f"📏 Median text length: {text_lengths.median():.0f} characters")
    print(f"🏷️  Tag count range: {tag_totals.min()} - {tag_totals.max()} tags")

## Tag Analysis

In [None]:
# Analyze tag usage
if 'df' in locals():
    # One list entry per tag occurrence, gathered across every note
    all_tags = [tag for note_tags in df['tags'] for tag in note_tags]
    tag_counts = Counter(all_tags)

    print(f"🏷️  Total unique tags: {len(tag_counts)}")
    print(f"🏷️  Total tag instances: {len(all_tags)}")

    # Most common tags
    print("\n=== Top 20 Tags ===")
    top_tags = tag_counts.most_common(20)
    for tag_name, occurrences in top_tags:
        print(f"{tag_name}: {occurrences}")

    # Notes without tags
    has_no_tags = df['tag_count'].eq(0)
    untagged = df[has_no_tags]
    untagged_pct = len(untagged)/len(df)*100
    print(f"\n📋 Notes without tags: {len(untagged)} ({untagged_pct:.1f}%)")

## Content Sampling

In [None]:
# Sample different types of notes
if 'df' in locals():
    print("=== Sample Notes by Length ===")

    note_lengths = df['text_length']

    # Short note: fewer than 100 characters, printed in full
    short_notes = df[note_lengths < 100]
    if len(short_notes) > 0:
        short_pick = short_notes.iloc[0]
        print(
            f"\n📝 SHORT NOTE (ID: {short_pick['id']})\n"
            f"Title: {short_pick['title']}\n"
            f"Tags: {short_pick['tags']}\n"
            f"Text: {short_pick['text']}"
        )

    # Medium note: 500 to 1500 characters (both ends included), first 500 shown
    medium_notes = df[note_lengths.between(500, 1500)]
    if len(medium_notes) > 0:
        medium_pick = medium_notes.iloc[0]
        print(
            f"\n📄 MEDIUM NOTE (ID: {medium_pick['id']})\n"
            f"Title: {medium_pick['title']}\n"
            f"Tags: {medium_pick['tags']}\n"
            f"Text: {medium_pick['text'][:500]}..."
        )

    # Long note: more than 2000 characters, first 500 shown plus the full length
    long_notes = df[note_lengths > 2000]
    if len(long_notes) > 0:
        long_pick = long_notes.iloc[0]
        print(
            f"\n📚 LONG NOTE (ID: {long_pick['id']})\n"
            f"Title: {long_pick['title']}\n"
            f"Tags: {long_pick['tags']}\n"
            f"Text: {long_pick['text'][:500]}...\n"
            f"(Full length: {long_pick['text_length']} characters)"
        )

## Chunking Strategy Analysis

In [None]:
# Analyze optimal chunking strategy
def estimate_chunk_count(text_lengths, char_limit):
    """Return how many chunks of at most ``char_limit`` characters are needed.

    Each note is split on its own, so a note of length ``n`` contributes
    ``ceil(n / char_limit)`` chunks; the per-note counts are summed. Dividing
    the summed length by the limit instead would give a fractional ratio that
    undercounts, because a chunk never spans two notes.

    ``text_lengths`` is any sequence of character counts (list, array or
    Series); an empty sequence yields 0.
    """
    per_note_chunks = np.ceil(np.asarray(text_lengths) / char_limit)
    return int(per_note_chunks.sum())


if 'df' in locals():
    print("=== Chunking Strategy Analysis ===")

    # Typical embedding model context windows
    embedding_limits = {
        'text-embedding-3-small': 8191,  # tokens
        'text-embedding-ada-002': 8191,
    }

    # Rough estimation: 1 token ≈ 4 characters for English text
    chars_per_token = 4

    for model, token_limit in embedding_limits.items():
        char_limit = token_limit * chars_per_token

        notes_over_limit = df[df['text_length'] > char_limit]
        print(f"\n{model} (≈{char_limit:,} chars):")
        print(f"  Notes over limit: {len(notes_over_limit)} ({len(notes_over_limit)/len(df)*100:.1f}%)")

        if not notes_over_limit.empty:
            print(f"  Longest note: {notes_over_limit['text_length'].max():,} chars")
            chunks_needed = estimate_chunk_count(notes_over_limit['text_length'], char_limit)
            print(f"  Would need chunking: {chunks_needed:,} chunks in total")

    # Recommendation
    # The counts use *_count names so they do not rebind `medium_notes`, which
    # the content-sampling cell uses for a DataFrame.
    print("\n=== Chunking Recommendations ===")
    small_count = len(df[df['text_length'] <= 1000])
    medium_count = len(df[(df['text_length'] > 1000) & (df['text_length'] <= 4000)])
    large_count = len(df[df['text_length'] > 4000])

    print(f"Small notes (≤1000 chars): {small_count} ({small_count/len(df)*100:.1f}%)")
    print(f"Medium notes (1000-4000 chars): {medium_count} ({medium_count/len(df)*100:.1f}%)")
    print(f"Large notes (>4000 chars): {large_count} ({large_count/len(df)*100:.1f}%)")

    print("\n💡 Recommendation: Start with note-level chunking, split only notes >4000 chars")

## Next Steps

Based on this analysis, the next steps for BabyCoach development are:

1. **Vector Store Setup**: Use ChromaDB for local development
2. **Embedding Strategy**: Start with note-level embeddings using OpenAI `text-embedding-3-small`
3. **Chunking**: Split only notes longer than 4000 characters
4. **Metadata**: Leverage tags, titles, and dates for hybrid search
5. **Testing**: Create queries that span different note types and topics

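Provided the checks above run cleanly on your export (every note has text and metadata, and only a small share of notes exceeds the chunking threshold), the data is well-structured and ready for RAG implementation.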