# Corpus Exploration

Interactive exploration of the Corpus class for development and testing.
Use this notebook to experiment with the ENEX parsing functionality.

In [None]:
# Fresh setup - drop cached enote modules and re-import from the local src tree
import sys
import os

# Print working directory to debug relative-path problems
print(f"Current working directory: {os.getcwd()}")

# Clear cached copies of the enote package so the import below re-executes it.
# Only the package itself and its submodules are evicted; a bare prefix match
# would also drop unrelated modules whose names merely start with "enote".
# The keys are snapshotted first because sys.modules may change size while
# it is being iterated.
modules_to_clear = [
    mod for mod in tuple(sys.modules)
    if mod == 'enote' or mod.startswith('enote.')
]
for mod in modules_to_clear:
    sys.modules.pop(mod, None)

# Set correct path
# NOTE(review): assumes the notebook's working directory sits one level below
# the project root, next to `src` - confirm against the printed cwd above.
project_root = os.path.abspath('..')
src_path = os.path.join(project_root, 'src')
print(f"Adding to path: {src_path}")

# Guarded so that re-running the cell does not stack duplicate entries
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Import fresh
import enote
print(f"Imported enote from: {enote.__file__}")
print(f"Corpus module: {enote.corpus.__file__}")

In [None]:
# Load a tiny corpus on freshly imported code and inspect every note's tag fields
corpus = enote.Corpus()
corpus.load(max_notes=3)

print(f"Loaded {len(corpus.notes)} notes")

# Walk each note and report on the new 'tags' list and the old 'tag' key
for key, record in corpus.notes.items():
    print(f"\n=== {key} ===")
    print(f"Title: {record['title']}")
    print(f"Available keys: {list(record.keys())}")

    # New-style field: expected to be present and to be a list
    if 'tags' not in record:
        print("❌ No 'tags' field found")
    else:
        tags = record['tags']
        tags_is_list = isinstance(tags, list)
        tag_count = len(tags) if tags_is_list else 'not a list'
        print(f"Tags: {tags} (type: {type(tags)}, length: {tag_count})")

        if not tags_is_list:
            print(f"❌ Tags is not a list: {type(tags)}")
        else:
            print(f"✅ Tags is a list as expected")
            if tags:
                print(f"First tag: '{tags[0]}'")

    # Old singular field: its presence is reported as a failure
    legacy_present = 'tag' in record
    if not legacy_present:
        print("✅ Old 'tag' field properly removed")
    else:
        print(f"❌ Old 'tag' field still present: {record['tag']}")

In [None]:
# Export the loaded corpus for RAG and confirm every entry carries a tags list
import json

rag_json = corpus.export_for_rag()
rag_data = json.loads(rag_json)

print(f"Exported {len(rag_data)} notes for RAG")
print(f"First note structure:")

# Pretty-print the first exported entry to eyeball its layout
first_note = rag_data[0]
print(json.dumps(first_note, indent=2))

# One line per entry: tick when metadata.tags is a list, cross otherwise
print(f"\nTags verification:")
for position, entry in enumerate(rag_data, start=1):
    tags = entry['metadata']['tags']
    if not isinstance(tags, list):
        print(f"Note {position}: ❌ tags is not list: {type(tags)} = {tags}")
        continue
    print(f"Note {position}: ✅ tags is list with {len(tags)} items: {tags}")

In [None]:
# Call load again on the existing corpus with a cap of two notes, then report
# how many notes it holds afterwards
corpus.load(max_notes=2)
note_count = len(corpus.notes)
print(f"Loaded {note_count} notes")

In [None]:
# Make the local src tree importable, load a sample corpus and show the tag
# fields of its first note.
# NOTE: this cell does not restart the kernel by itself; restart it manually
# first if the enote source changed, since an already-imported module is reused.
import sys
import os

# Add src directory to Python path - guarded so that re-running the cell does
# not keep stacking duplicate entries at the front of sys.path
src_path = os.path.abspath('../src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)

import enote

# Create corpus and load notes
corpus = enote.Corpus()
corpus.load(max_notes=10)

print(f"Loaded {len(corpus.notes)} notes")

# Test the tags field
for note_id, note_data in corpus.notes.items():
    print(f"=== {note_id} ===")
    print(f"Title: {note_data['title']}")
    print(f"Available keys: {list(note_data.keys())}")

    # Check both possible tag fields (new plural 'tags' and old singular 'tag')
    if 'tags' in note_data:
        print(f"Tags field: {note_data['tags']} (type: {type(note_data['tags'])})")
    if 'tag' in note_data:
        print(f"Tag field: {note_data['tag']} (type: {type(note_data['tag'])})")

    break  # Just show first one

In [None]:
# Take the first loaded note and list each field with its type and a short preview
all_notes = list(corpus.notes.values())
first_note = all_notes[0]
print("First note structure:")
for field, content in first_note.items():
    preview = str(content)[:50]  # cap the preview at 50 characters
    print(f"{field}: {type(content)} - {preview}...")

In [None]:
# Compare raw content with cleaned_text for the first note in the corpus
for key, record in corpus.notes.items():
    raw_text = record.get('content', '')
    cleaned = record.get('cleaned_text', '')
    raw_len = len(raw_text)
    cleaned_len = len(cleaned)

    print(f"\n=== {key}: {record['title']} ===")
    print(f"Original length: {raw_len} chars")
    print(f"Cleaned length: {cleaned_len} chars")

    # Percentage saved by cleaning; skipped for empty content to avoid dividing by zero
    if raw_len > 0:
        reduction = ((raw_len - cleaned_len) / raw_len) * 100
        print(f"Size reduction: {reduction:.1f}%")

    print(f"\nCleaned text preview:")
    # Show at most 200 characters, marking truncation with an ellipsis
    if cleaned_len > 200:
        preview = cleaned[:200] + "..."
    else:
        preview = cleaned
    print(preview)

    break  # only the first note is shown

## RAG Export

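Test the new RAG export functionality. The export is a JSON list with one entry per note — each entry has an `id`, the note `text`, and a `metadata` object (including `title` and `tags`) — intended for feeding into LLMs and vector databases.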

In [None]:
# Small-sample RAG export: load a few notes, export to a string and peek inside
import json

print("=== Testing RAG Export (Small Sample) ===")

# A separate corpus keeps this test independent of the one used above
export_corpus = enote.Corpus()
export_corpus.load(max_notes=3)  # a handful of notes is enough here

loaded_count = len(export_corpus.notes)
print(f"Loaded {loaded_count} notes for export test")

# No file path given, so the export comes back as a JSON string
rag_json = export_corpus.export_for_rag()
rag_data = json.loads(rag_json)

# Show the structure of the first entry
print(f"\nFirst exported entry:")
first_entry = rag_data[0]
print(json.dumps(first_entry, indent=2))

# List every exported id together with its title
print(f"\nNote IDs in export:")
for entry in rag_data:
    entry_id = entry['id']
    entry_title = entry['metadata']['title']
    print(f"  - {entry_id}: {entry_title}")

In [None]:
# Full export (uncomment when you want to export all notes)
# WARNING: This loads ALL notes and creates a large JSON file

def export_all_notes():
    """Load every note and write the RAG-format JSON export to the Desktop.

    Prints progress, the size of the written file and up to five sample note
    IDs along the way.

    Returns:
        The ``enote.Corpus`` instance that was loaded and exported.
    """
    from pathlib import Path

    print("=== Exporting ALL Notes to RAG Format ===")

    corpus = enote.Corpus()
    print(f"Loading notes from: {corpus.enex_path}")

    # No max_notes argument: load with no cap on the number of notes
    corpus.load()
    total_notes = len(corpus.notes)
    print(f"Loaded {total_notes} notes")

    # Write to the Desktop rather than the project tree, so private note data
    # cannot be committed by accident
    output_file = Path.home().joinpath("Desktop", "evernote_rag_export.json")
    corpus.export_for_rag(str(output_file))

    print(f"✅ Exported to {output_file}")
    print(f"🔒 Safely stored outside project directory")

    size_in_bytes = output_file.stat().st_size
    file_size_kb = size_in_bytes / 1024
    print(f"📊 File size: {file_size_kb:.1f} KB")

    # Show a numbered sample of the first five note IDs
    print(f"\nSample note IDs:")
    sample_ids = list(corpus.notes.keys())[:5]
    for rank, sample_id in enumerate(sample_ids, start=1):
        print(f"  {rank}. {sample_id}")

    remaining = len(corpus.notes) - 5
    if remaining > 0:
        print(f"  ... and {remaining} more")

    return corpus

# Uncomment the line below when you want to do the full export:
# full_corpus = export_all_notes()

In [None]:
# Analyze the exported JSON structure
def analyze_rag_export(json_file=None):
    """Print summary statistics for a RAG export file and return its entries.

    Each entry is read as a dict with an ``'id'``, a ``'text'`` string and a
    ``'metadata'`` dict holding ``'tags'`` (a list, or a single tag string).

    Args:
        json_file: Path (``str`` or ``Path``) of the exported JSON file. When
            omitted, ``~/Desktop/evernote_rag_export.json`` is used.

    Returns:
        The parsed JSON content, or ``None`` when the file does not exist.
        An empty export is returned as-is after a short notice, because the
        length statistics are undefined without at least one entry.
    """
    from pathlib import Path
    import json

    # Default to Desktop location for safety
    if json_file is None:
        json_file = Path.home() / "Desktop" / "evernote_rag_export.json"
    else:
        json_file = Path(json_file)

    if not json_file.exists():
        print(f"❌ File {json_file} not found. Run export_all_notes() first.")
        return

    print(f"=== Analyzing {json_file.name} ===")

    # Load and analyze
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"📊 Total notes: {len(data)}")

    # Guard: the average below would divide by zero and min()/max() would
    # raise on an empty export
    if not data:
        print("⚠️ Export contains no notes; nothing to analyze.")
        return data

    # Analyze text lengths
    text_lengths = [len(item['text']) for item in data]
    print(f"📝 Text length stats:")
    print(f"  - Average: {sum(text_lengths) / len(text_lengths):.0f} chars")
    print(f"  - Shortest: {min(text_lengths)} chars")
    print(f"  - Longest: {max(text_lengths)} chars")

    # Analyze tags
    all_tags = []
    for item in data:
        tags = item['metadata']['tags']
        if isinstance(tags, list):
            all_tags.extend(tags)
        elif tags:  # single tag as string
            all_tags.append(tags)

    unique_tags = set(all_tags)
    print(f"🏷️  Tag stats:")
    print(f"  - Total tag instances: {len(all_tags)}")
    print(f"  - Unique tags: {len(unique_tags)}")

    # Show sample entries with different characteristics
    shortest_entry = min(data, key=lambda x: len(x['text']))
    longest_entry = max(data, key=lambda x: len(x['text']))
    print(f"\nSample entries:")
    print(f"1. Shortest note: {shortest_entry['id']}")
    print(f"2. Longest note: {longest_entry['id']}")

    return data

# Uncomment to analyze after export:
# analysis = analyze_rag_export()

In [None]:
# Test that tags normalization fixes the bug
print("=== Testing Tags Normalization ===")

test_corpus = enote.Corpus()
test_corpus.load(max_notes=10)  # Load more notes to find edge cases

notes_without_original_tags = 0
for note_id, note_data in test_corpus.notes.items():
    # Every note must expose a 'tags' field (even if empty) ...
    assert 'tags' in note_data, f"Note {note_id} missing tags field!"
    # ... and the old singular 'tag' key must be gone. Asserting its presence
    # would contradict the earlier check in this notebook that reports a
    # remaining 'tag' key as a failure.
    assert 'tag' not in note_data, f"Note {note_id} still has old 'tag' field!"
    assert isinstance(note_data['tags'], list), f"Note {note_id} tags not a list!"

    # Count notes whose tags list is empty
    if len(note_data['tags']) == 0:
        notes_without_original_tags += 1

print(f"✅ All {len(test_corpus.notes)} notes have normalized tags")
print(f"📊 Notes with no tags: {notes_without_original_tags}")
print(f"📊 Notes with tags: {len(test_corpus.notes) - notes_without_original_tags}")

# Show examples
print(f"\nTag examples:")
for i, (note_id, note_data) in enumerate(test_corpus.notes.items()):
    if i >= 3:  # Just show first 3
        break
    tags = note_data['tags']
    title = note_data['title']
    print(f"  {note_id}: {len(tags)} tags - {tags} ('{title}')")