# Corpus Exploration

Interactive exploration of the Corpus class for development and testing.
Use this notebook to experiment with the ENEX parsing functionality.

In [1]:
# Test if workspace settings handle the import path
# If this fails, we'll need to add back the sys.path.insert() 

import enote

print("✅ Import successful! Workspace settings are working.")

✅ Import successful! Workspace settings are working.


In [2]:
# Explore the Corpus class
corpus = enote.Corpus()
print("Corpus attributes:")
print(dir(corpus))

Corpus attributes:
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__firstlineno__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__static_attributes__', '__str__', '__subclasshook__', '__weakref__', '_clean_enml', '_generate_note_id', '_parse_note_element', 'enex_path', 'export_for_rag', 'get_backlinks', 'get_linked_notes', 'load', 'notes', 'query']


In [3]:
# Look at the documentation
help(corpus.load)

Help on method load in module enote.corpus:

load(max_notes: Optional[int] = None) -> None method of enote.corpus.Corpus instance
    Load and parse notes from ENEX files into the corpus.

    Args:
        max_notes: Optional limit on number of notes to load
                  (useful for testing with large datasets)

    Note:
        Populates self.notes with the parsed note data.
        Each note is stored as: note_id -> {title, body, tags, metadata}

    Example:
        {
            "note_123": {
                "title": "Project Ideas",
                "body": "...",
                "tags": ["projects", "brainstorm"],
                "created": datetime(...),
                "updated": datetime(...)
            }
        }



In [4]:
# Load some notes and explore
corpus.load(max_notes=2)
print(f"Loaded {len(corpus.notes)} notes")

Loaded 2 notes


In [5]:
# Explore the notes structure
for note_id, note_data in corpus.notes.items():
    print(f"\n=== {note_id} ===")
    print(f"Title: {note_data['title']}")
    print(f"Tags: {note_data['tags']}")
    print(f"Keys: {list(note_data.keys())}")
    break  # Just show first one


=== band_practice_checklist ===
Title: Band Practice checklist


KeyError: 'tags'

In [None]:
# Try some operations
first_note = list(corpus.notes.values())[0]
print("First note structure:")
for key, value in first_note.items():
    print(f"{key}: {type(value)} - {str(value)[:50]}...")

In [None]:
# Test the new cleaned_text functionality
for note_id, note_data in corpus.notes.items():
    original = note_data.get('content', '')
    cleaned = note_data.get('cleaned_text', '')
    
    print(f"\n=== {note_id}: {note_data['title']} ===")
    print(f"Original length: {len(original)} chars")
    print(f"Cleaned length: {len(cleaned)} chars")
    
    if len(original) > 0:
        reduction = ((len(original) - len(cleaned)) / len(original)) * 100
        print(f"Size reduction: {reduction:.1f}%")
    
    print(f"\nCleaned text preview:")
    print(cleaned[:200] + "..." if len(cleaned) > 200 else cleaned)
    
    break  # Just show first one

## RAG Export

Test the new RAG export functionality. This generates JSON in the standard format for feeding into LLMs and vector databases.

In [None]:
# Test RAG export with a small sample first
print("=== Testing RAG Export (Small Sample) ===")

# Create a fresh corpus for export testing
export_corpus = enote.Corpus()
export_corpus.load(max_notes=3)  # Just a few notes for testing

print(f"Loaded {len(export_corpus.notes)} notes for export test")

# Export to string (not file) for preview
rag_json = export_corpus.export_for_rag()

# Show the structure
import json
rag_data = json.loads(rag_json)

print(f"\nFirst exported entry:")
print(json.dumps(rag_data[0], indent=2))

print(f"\nNote IDs in export:")
for item in rag_data:
    print(f"  - {item['id']}: {item['metadata']['title']}")

In [None]:
# Full export (uncomment when you want to export all notes)
# WARNING: This loads ALL notes and creates a large JSON file

def export_all_notes():
    """Export all notes to RAG format JSON file."""
    print("=== Exporting ALL Notes to RAG Format ===")
    
    corpus = enote.Corpus()
    print(f"Loading notes from: {corpus.enex_path}")
    
    # Load all notes (no max_notes limit)
    corpus.load()
    print(f"Loaded {len(corpus.notes)} notes")
    
    # Export to Desktop for safety (avoid accidentally committing private data)
    from pathlib import Path
    desktop_path = Path.home() / "Desktop"
    output_file = desktop_path / "evernote_rag_export.json"
    
    corpus.export_for_rag(str(output_file))
    
    print(f"✅ Exported to {output_file}")
    print(f"🔒 Safely stored outside project directory")
    
    file_size_kb = output_file.stat().st_size / 1024
    print(f"📊 File size: {file_size_kb:.1f} KB")
    
    # Show sample of what was exported
    print(f"\nSample note IDs:")
    for i, note_id in enumerate(list(corpus.notes.keys())[:5]):
        print(f"  {i+1}. {note_id}")
    
    if len(corpus.notes) > 5:
        print(f"  ... and {len(corpus.notes) - 5} more")
    
    return corpus

# Uncomment the line below when you want to do the full export:
# full_corpus = export_all_notes()

In [None]:
# Analyze the exported JSON structure
def analyze_rag_export(json_file=None):
    """Analyze the structure and content of the RAG export."""
    from pathlib import Path
    import json
    
    # Default to Desktop location for safety
    if json_file is None:
        json_file = Path.home() / "Desktop" / "evernote_rag_export.json"
    else:
        json_file = Path(json_file)
    
    if not json_file.exists():
        print(f"❌ File {json_file} not found. Run export_all_notes() first.")
        return
    
    print(f"=== Analyzing {json_file.name} ===")
    
    # Load and analyze
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    print(f"📊 Total notes: {len(data)}")
    
    # Analyze text lengths
    text_lengths = [len(item['text']) for item in data]
    print(f"📝 Text length stats:")
    print(f"  - Average: {sum(text_lengths) / len(text_lengths):.0f} chars")
    print(f"  - Shortest: {min(text_lengths)} chars")
    print(f"  - Longest: {max(text_lengths)} chars")
    
    # Analyze tags
    all_tags = []
    for item in data:
        tags = item['metadata']['tags']
        if isinstance(tags, list):
            all_tags.extend(tags)
        elif tags:  # single tag as string
            all_tags.append(tags)
    
    unique_tags = set(all_tags)
    print(f"🏷️  Tag stats:")
    print(f"  - Total tag instances: {len(all_tags)}")
    print(f"  - Unique tags: {len(unique_tags)}")
    
    # Show sample entries with different characteristics
    print(f"\nSample entries:")
    print(f"1. Shortest note: {min(data, key=lambda x: len(x['text']))['id']}")
    print(f"2. Longest note: {max(data, key=lambda x: len(x['text']))['id']}")
    
    return data

# Uncomment to analyze after export:
# analysis = analyze_rag_export()