# ENEX Performance Analysis & Element Discovery

Comprehensive analysis of our dynamic ENEX parsing implementation:

1. **Performance Testing**: Time how long it takes to load all ENEX files
2. **Element Discovery**: Find all unique element names across the entire corpus
3. **Data Structure Analysis**: Understand the complete ENEX schema

The results show how the parser performs on real-world data and reveal which data fields are available in the Evernote export.

In [None]:
# Import Required Libraries
import sys
import time
from pathlib import Path
from collections import Counter, defaultdict

# Add src to path for importing enote.
# The location is derived from the current working directory: it is the
# "src" directory next to the directory this code is run from.
src_path = Path.cwd().parent / "src"
# Insert at index 0 so this directory is searched before the other
# sys.path entries when the import below is resolved.
sys.path.insert(0, str(src_path))

# Imported only after the sys.path change above, which is what makes the
# package under "src" importable.
import enote

# Confirm the import worked and show the default ENEX location exposed by enote.
print("✅ Libraries imported successfully")
print(f"📂 ENEX path: {enote.DEFAULT_ENEX_PATH}")

In [None]:
# Initialize Corpus and Setup Timing
print("🔧 Initializing Corpus...")

# Build the corpus with its default settings and report where it reads from
corpus = enote.Corpus()
print(f"📍 Corpus path: {corpus.enex_path}")

# Collect the *.enex files that sit directly inside the corpus directory
enex_files = [*corpus.enex_path.glob("*.enex")]
print(f"📁 Found {len(enex_files)} ENEX files:")

# One line per file: name plus on-disk size in MB (1 MB = 2**20 bytes)
for export_file in enex_files:
    file_size_mb = export_file.stat().st_size / 2**20
    print(f"  - {export_file.name}: {file_size_mb:.1f} MB")

print("\n⏱️  Ready to measure performance...")

In [None]:
# Time ENEX File Loading
print("🚀 Starting full ENEX file loading...")
print("=" * 50)

# Measure with perf_counter rather than time.time(): it is monotonic and
# high-resolution, so the interval cannot be skewed (or go negative) if the
# system clock is adjusted while the load is running.
start_time = time.perf_counter()

# Load ALL notes (no max_notes limit)
corpus.load()

end_time = time.perf_counter()
elapsed_time = end_time - start_time

# Throughput; report 0 instead of dividing by a zero-length interval
notes_loaded = len(corpus.notes)
notes_per_second = notes_loaded / elapsed_time if elapsed_time > 0 else 0

# Display results
print("\n📊 PERFORMANCE RESULTS:")
print(f"⏱️  Total time: {elapsed_time:.2f} seconds")
print(f"📝 Notes loaded: {notes_loaded:,}")
print(f"🚀 Speed: {notes_per_second:.1f} notes/second")

# For long runs, also show the duration in minutes for readability
if elapsed_time > 60:
    minutes = elapsed_time / 60
    print(f"⌚ Time: {minutes:.1f} minutes")

print("=" * 50)

In [None]:
# Extract All Element Names from Notes
print("🔍 Discovering all element names across corpus...")

# Accumulators filled by the scan below:
#   all_element_names - every distinct element name seen
#   element_counts    - number of notes each element name appears in
#   element_examples  - up to three sample values (as display strings) per name
all_element_names = set()
element_counts = Counter()
element_examples = defaultdict(list)

# Only the note contents are needed here, not the note ids
for note_data in corpus.notes.values():
    for element_name, value in note_data.items():
        all_element_names.add(element_name)
        element_counts[element_name] += 1

        # Keep at most three examples per element name
        samples = element_examples[element_name]
        if len(samples) >= 3:
            continue

        if isinstance(value, list):
            # Lists: show the length and a preview of the first two items
            preview = value[:2] if value else '[]'
            example = f"list({len(value)}) - {preview}"
        else:
            # Everything else: string form, truncated to 50 characters
            text = str(value)
            example = text[:50] + "..." if len(text) > 50 else text
        samples.append(example)

print("✅ Analysis complete!")
print(f"🏷️  Found {len(all_element_names)} unique element names")
print(f"📝 Across {len(corpus.notes):,} notes")

In [None]:
# Analyze Performance Results
print("📈 DETAILED PERFORMANCE ANALYSIS")
print("=" * 60)

# File-level statistics
total_files = len(enex_files)
total_size_mb = sum(f.stat().st_size for f in enex_files) / (1024 * 1024)

# Every ratio below falls back to 0 when its denominator is zero (no ENEX
# files found, an empty corpus, or a zero timer reading) instead of raising
# ZeroDivisionError. This mirrors how notes_per_second is guarded where the
# load is timed.
avg_file_size_mb = total_size_mb / total_files if total_files else 0
mb_per_second = total_size_mb / elapsed_time if elapsed_time > 0 else 0
ms_per_note = (elapsed_time / notes_loaded) * 1000 if notes_loaded else 0

print("📁 File Statistics:")
print(f"   Files processed: {total_files}")
print(f"   Total size: {total_size_mb:.1f} MB")
print(f"   Average file size: {avg_file_size_mb:.1f} MB")

# Performance metrics
print("\n⚡ Performance Metrics:")
print(f"   Total parsing time: {elapsed_time:.2f} seconds")
print(f"   Notes per second: {notes_per_second:.1f}")
print(f"   MB per second: {mb_per_second:.1f}")
print(f"   Average time per note: {ms_per_note:.1f} ms")

# Data-structure size: count the elements once and derive the average from
# that exact total. Reconstructing the total as int(count * average) can
# truncate to one less than the true value because of float rounding.
note_count = len(corpus.notes)
total_elements = sum(len(note) for note in corpus.notes.values())
avg_elements_per_note = total_elements / note_count if note_count else 0

print("\n💾 Data Structure:")
print(f"   Average elements per note: {avg_elements_per_note:.1f}")
print(f"   Total note objects: {notes_loaded:,}")
print(f"   Est. total elements: {total_elements:,}")

print("=" * 60)

In [None]:
# Display Element Name Statistics
print("🏷️  COMPLETE ELEMENT NAME ANALYSIS")
print("=" * 80)

print(f"📊 Found {len(all_element_names)} unique element types:")
print()

# Rank the elements once (most frequent first); the same ranking feeds both
# the per-element listing and the most/least-common summary further down.
ranked_elements = element_counts.most_common()
total_notes = len(corpus.notes)

# Show ALL elements sorted by frequency (most common first)
for element_name, count in ranked_elements:
    # Share of notes containing this element; 0 if the note count is zero
    percentage = (count / total_notes) * 100 if total_notes else 0

    print(f"🔹 {element_name}")
    print(f"   Frequency: {count:,} notes ({percentage:.1f}%)")

    # Show examples (only the first two of the stored samples)
    examples = element_examples[element_name]
    if examples:
        print("   Examples:")
        for i, example in enumerate(examples[:2], 1):
            print(f"     {i}. {example}")
    print()

print("=" * 80)

# Summary insights
print("🎯 KEY INSIGHTS:")
print(f"✅ All {len(corpus.notes):,} notes loaded successfully")
print(f"✅ Dynamic extraction discovered {len(all_element_names)} element types")
print("✅ No hardcoded field limitations")

# Most/least common elements. With no elements at all there is nothing to
# index, so skip the ranking lines rather than raise IndexError.
if ranked_elements:
    most_common = ranked_elements[0]
    least_common = ranked_elements[-1]
    print(f"📈 Most common: '{most_common[0]}' in {most_common[1]:,} notes")
    print(f"📉 Least common: '{least_common[0]}' in {least_common[1]:,} notes")
else:
    print("⚠️  No elements found - nothing to rank")

# Show complete set of all element names
print("\n📋 COMPLETE ELEMENT LIST:")
sorted_elements = sorted(all_element_names)
print(f"   {', '.join(sorted_elements)}")