# Seismometer Performance Benchmark

Run this notebook to generate comprehensive performance benchmarks.

**What it does:**
1. Creates scaled datasets (100K, 1M, 10M rows) if needed
2. Runs a fixed set of seismometer operations (startup, cohort summaries, ordinal categorical metrics) on each dataset
3. Measures execution time and memory usage
4. Displays results as formatted tables

**Just run all cells!**

## Setup

In [None]:
import sys
import time
import tracemalloc
import psutil
import os
import gc
import pandas as pd
import polars as pl
from pathlib import Path
import shutil
import numpy as np
from IPython.display import display, HTML

# Ensure we use local seismometer
# Prepending to sys.path makes this directory win over any installed copy
# of the package when `seismometer` is imported on the next line.
# NOTE(review): the path is absolute and machine-specific; on another
# machine it presumably has to be edited before the notebook is run.
sys.path.insert(0, '/home/seismo/workspace/src')
import seismometer as sm

# Progress marker: reaching this line means every import above succeeded.
print("‚úÖ Imports loaded")

## 1. Create Scaled Datasets

In [None]:
def create_scaled_datasets():
    """Create the 100k / 1m / 10m benchmark datasets under ``data/scaled``.

    Reads ``data/predictions.parquet`` and ``data/events.parquet``, replicates
    them until the predictions frame reaches the target row count, and writes
    ``predictions_<suffix>.parquet`` / ``events_<suffix>.parquet`` for each
    size. Events are scaled by the same factor as predictions.

    Every replica gets a ``_r<k>`` suffix on ``encounter_id`` and
    ``patient_nbr`` so IDs stay unique across replicas while predictions and
    events of the same replica still share IDs.

    The function is a no-op when all six output files already exist.

    Raises:
        ValueError: if the base predictions file contains no rows.
    """
    suffixes = ['100k', '1m', '10m']
    scaled_dir = Path('data/scaled')

    # Check events as well as predictions: a partially written output
    # directory must be regenerated, not silently accepted.
    expected_files = [
        scaled_dir / f'{kind}_{s}.parquet'
        for kind in ('predictions', 'events')
        for s in suffixes
    ]
    if all(path.exists() for path in expected_files):
        print("‚úÖ Scaled datasets already exist")
        return

    print("Creating scaled datasets (vectorized)...")

    # Pre-cast IDs to string once so the replica suffix can be appended.
    id_casts = [
        pl.col('encounter_id').cast(pl.Utf8),
        pl.col('patient_nbr').cast(pl.Utf8),
    ]
    predictions = pl.read_parquet('data/predictions.parquet').with_columns(id_casts)
    events = pl.read_parquet('data/events.parquet').with_columns(id_casts)

    scaled_dir.mkdir(parents=True, exist_ok=True)
    base_len = len(predictions)
    if base_len == 0:
        raise ValueError("data/predictions.parquet is empty; nothing to scale")

    def replicate(df, target_rows):
        """Return ``df`` repeated and truncated to exactly ``target_rows`` rows.

        The replica count and the replica index are derived from ``df``'s own
        length, so a frame whose length differs from the predictions frame
        (i.e. events) is still tagged with one suffix per full copy.
        """
        source_len = len(df)
        if source_len == 0 or target_rows <= 0:
            return df.head(0)

        replicas = int(np.ceil(target_rows / source_len))
        # Row k of the concatenated frame belongs to copy number k // source_len.
        replica_no = (pl.col('_row') // source_len).cast(pl.Utf8)

        return (
            pl.concat([df] * replicas)
            # NOTE(review): newer Polars releases appear to prefer
            # `with_row_index`; kept as-is for compatibility with the
            # installed version - confirm before changing.
            .with_row_count('_row')
            .with_columns([
                (pl.col('encounter_id') + "_r" + replica_no).alias('encounter_id'),
                (pl.col('patient_nbr') + "_r" + replica_no).alias('patient_nbr'),
            ])
            .drop('_row')
            .head(target_rows)
        )

    print("  Creating datasets...")
    # The '100k' dataset is the unscaled base data (one replica).
    for target, suffix in [(base_len, '100k'), (1_000_000, '1m'), (10_000_000, '10m')]:
        print(f"    {suffix}...")
        replicate(predictions, target).write_parquet(f'data/scaled/predictions_{suffix}.parquet')
        # Scale events by the same factor that was applied to predictions.
        events_target = int(len(events) * target / base_len)
        replicate(events, events_target).write_parquet(f'data/scaled/events_{suffix}.parquet')

    print("‚úÖ Scaled datasets created (2-3x faster!)")

# Build the scaled datasets when this cell runs; the function returns early
# if the scaled files it checks for are already present.
create_scaled_datasets()

## 2. Helper Functions

In [None]:
def get_memory_usage():
    """Return the resident set size (RSS) of this process, in megabytes."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / 1024 / 1024

def benchmark_operation(name, func, *args, **kwargs):
    """Run ``func(*args, **kwargs)`` once and record its time and memory cost.

    Exceptions derived from ``Exception`` are captured and reported in the
    result instead of propagating, so one failing operation does not abort a
    whole benchmark run. Other ``BaseException`` types (for example
    ``KeyboardInterrupt``) propagate, but tracing is still switched off.

    Args:
        name: Label stored in the result under ``'operation'``.
        func: Callable to benchmark.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        dict with keys:
            ``operation``       - ``name`` as given,
            ``time_sec``        - wall-clock duration of the call,
            ``memory_delta_mb`` - change in process RSS across the call,
            ``peak_memory_mb``  - peak Python allocation seen by tracemalloc,
            ``total_memory_mb`` - process RSS after the call,
            ``success``         - False if ``func`` raised,
            ``error``           - ``None`` or ``"<Type>: <message>"`` cut to
                                  100 characters.
    """
    gc.collect()
    tracemalloc.start()
    try:
        mem_before = get_memory_usage()

        start_time = time.perf_counter()
        try:
            # `result` stays referenced until this function returns, so the
            # returned object is still alive when RSS is sampled below.
            result = func(*args, **kwargs)
            success = True
            error = None
        except Exception as e:
            result = None
            success = False
            # Include the exception type: str(e) alone can be empty or ambiguous.
            error = f"{type(e).__name__}: {e}"[:100]
        end_time = time.perf_counter()

        mem_after = get_memory_usage()
        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Always switch tracing off, even when the call is interrupted;
        # otherwise tracemalloc stays enabled for the rest of the session.
        tracemalloc.stop()

    return {
        'operation': name,
        'time_sec': end_time - start_time,
        'memory_delta_mb': mem_after - mem_before,
        'peak_memory_mb': peak / 1024 / 1024,
        'total_memory_mb': mem_after,
        'success': success,
        'error': error
    }

# Progress marker: reaching this line means the helper definitions above ran.
print("‚úÖ Helper functions defined")

## 3. Run Benchmarks

In [None]:
def run_dataset_benchmark(dataset_size, predictions_path, events_path):
    """Benchmark the seismometer operations against one scaled dataset.

    A temporary ``config_<dataset_size>`` directory is filled with copies of
    ``config.yml``, ``usage_config.yml`` and (if present)
    ``data_dictionary.yml`` from the working directory. In the copied
    ``config.yml`` the default data paths are replaced by ``predictions_path``
    and ``events_path``. Startup, cohort summaries and (when ordinal
    categorical metrics are available) a categorical plot are then timed with
    ``benchmark_operation``.

    The temporary directory is removed and the Seismogram singleton cache is
    cleared on every exit path, including failures.

    Args:
        dataset_size: Label such as ``'100K'``; used in printed output, in the
            temporary directory name and in every result row.
        predictions_path: Replacement for ``data/predictions.parquet`` in the
            copied ``config.yml``.
        events_path: Replacement for ``data/events.parquet`` in the copied
            ``config.yml``.

    Returns:
        list[dict]: One ``benchmark_operation`` result per operation that ran,
        each extended with a ``'dataset_size'`` key. Only the startup row is
        returned when startup fails.
    """
    print(f"\n{'='*60}")
    print(f"Benchmarking: {dataset_size} rows")
    print(f"{'='*60}")

    results = []

    def run_step(name, func, **kwargs):
        """Benchmark one operation and append its row to ``results``."""
        outcome = benchmark_operation(name, func, **kwargs)
        results.append({**outcome, 'dataset_size': dataset_size})
        return outcome

    def report(outcome, with_memory=False):
        """Print the timing line and, for a failed operation, the error."""
        line = f"     {outcome['time_sec']:.2f}s"
        if with_memory:
            line += f", {outcome['total_memory_mb']:.0f}MB"
        print(line)
        if not outcome['success']:
            print(f"     ‚ùå Failed: {outcome['error']}")

    # Create temp config
    config_dir = Path(f'config_{dataset_size}')
    config_dir.mkdir(exist_ok=True)

    try:
        shutil.copy('config.yml', config_dir / 'config.yml')
        shutil.copy('usage_config.yml', config_dir / 'usage_config.yml')
        if Path('data_dictionary.yml').exists():
            shutil.copy('data_dictionary.yml', config_dir / 'data_dictionary.yml')

        # Update paths in config
        config_file = config_dir / 'config.yml'
        config = config_file.read_text()
        config = config.replace('data/predictions.parquet', predictions_path)
        config = config.replace('data/events.parquet', events_path)
        config_file.write_text(config)

        # 1. Startup
        print("  1. Startup...")
        # NOTE(review): 30 is the numeric value of logging.WARNING in the
        # standard library; presumably seismometer interprets it that way.
        result = run_step('startup', sm.run_startup, config_path=str(config_dir), log_level=30)
        report(result, with_memory=True)

        if not result['success']:
            # Nothing else can run without a loaded Seismogram.
            return results

        sg = sm.Seismogram()
        print(f"     DataFrame: {sg.dataframe.shape}")

        # 2. Summaries
        print("  2. Cohort summaries...")
        result = run_step('cohort_summaries', sm.show_cohort_summaries, by_target=False, by_score=False)
        report(result)

        # 3. Categorical metrics (skipped when the dataset defines none)
        metrics = sg.get_ordinal_categorical_metrics(20)
        if metrics:
            print("  3. Categorical metrics...")
            from seismometer.controls.categorical import OrdinalCategoricalPlot

            def plot_categorical():
                # Only the first two metrics are plotted.
                plot = OrdinalCategoricalPlot(metrics=metrics[:2], cohort_dict={'All': ()})
                return plot.generate_plot()

            result = run_step('categorical_metrics', plot_categorical)
            report(result)

        return results
    finally:
        # Cleanup runs on every exit path so a failed run leaves neither a
        # stale config directory nor a cached Seismogram for the next dataset.
        # ignore_errors keeps a cleanup problem from masking the real error.
        shutil.rmtree(config_dir, ignore_errors=True)
        sm.Seismogram._instances = {}
        gc.collect()

# Progress marker: reaching this line means the benchmark function above was defined.
print("‚úÖ Benchmark function defined")

In [None]:
# Execute the benchmark suite once per scaled dataset and collect every
# per-operation result row into a single DataFrame.
print("\n" + "=" * 60)
print("RUNNING BENCHMARKS")
print("=" * 60)

# (label, predictions parquet, events parquet)
datasets = [
    ('100K', 'data/scaled/predictions_100k.parquet', 'data/scaled/events_100k.parquet'),
    ('1M', 'data/scaled/predictions_1m.parquet', 'data/scaled/events_1m.parquet'),
    ('10M', 'data/scaled/predictions_10m.parquet', 'data/scaled/events_10m.parquet'),
]

all_results = []
for size_label, predictions_file, events_file in datasets:
    all_results += run_dataset_benchmark(size_label, predictions_file, events_file)

# One row per (dataset size, operation)
df_results = pd.DataFrame(all_results)

print("\n‚úÖ Benchmarks complete!")

## 4. Results Summary

In [None]:
# Per-dataset totals: summed operation time plus the largest peak and
# largest total memory seen across that dataset's operations.
summary = (
    df_results.groupby('dataset_size')
    .agg({
        'time_sec': 'sum',
        'peak_memory_mb': 'max',
        'total_memory_mb': 'max',
    })
    .round(2)
    .rename(columns={
        'time_sec': 'Total Time (s)',
        'peak_memory_mb': 'Peak Memory (MB)',
        'total_memory_mb': 'Max Total Memory (MB)',
    })
    # Order rows by dataset size rather than alphabetically.
    .reindex(['100K', '1M', '10M'])
)

print("\n" + "=" * 60)
print("SUMMARY BY DATASET SIZE")
print("=" * 60)
display(
    summary.style.background_gradient(
        cmap='RdYlGn_r',
        subset=['Total Time (s)', 'Max Total Memory (MB)'],
    )
)

In [None]:
# Detailed operation breakdown: one row per operation, one column per
# dataset size, plus the growth ratio between consecutive sizes.
pivot_time = df_results.pivot_table(
    index='operation',
    columns='dataset_size',
    values='time_sec',
    aggfunc='first'
# reindex (rather than plain column selection) fixes the column order and
# yields NaN instead of a KeyError when a dataset size produced no rows.
).reindex(columns=['100K', '1M', '10M'])

# Ratios are computed from the unrounded timings; rounding first would turn
# very small timings into 0 and produce inf/NaN ratios.
pivot_time['1M/100K'] = (pivot_time['1M'] / pivot_time['100K']).round(2)
pivot_time['10M/1M'] = (pivot_time['10M'] / pivot_time['1M']).round(2)

# Round only the timing columns for display; the ratios are already rounded.
pivot_time = pivot_time.round({'100K': 3, '1M': 3, '10M': 3})

print("\n" + "="*60)
print("EXECUTION TIME BY OPERATION (seconds)")
print("="*60)
display(pivot_time.style.background_gradient(cmap='RdYlGn_r', subset=['100K', '1M', '10M']))

In [None]:
# Memory breakdown: peak traced memory per operation and dataset size, plus
# the growth ratio between consecutive sizes.
pivot_mem = df_results.pivot_table(
    index='operation',
    columns='dataset_size',
    values='peak_memory_mb',
    aggfunc='first'
# reindex (rather than plain column selection) fixes the column order and
# yields NaN instead of a KeyError when a dataset size produced no rows.
).reindex(columns=['100K', '1M', '10M'])

# Ratios are computed from the unrounded values; rounding first would turn a
# peak of a few KB into 0.00 MB and produce inf/NaN ratios.
pivot_mem['1M/100K'] = (pivot_mem['1M'] / pivot_mem['100K']).round(2)
pivot_mem['10M/1M'] = (pivot_mem['10M'] / pivot_mem['1M']).round(2)

# Round only the memory columns for display; the ratios are already rounded.
pivot_mem = pivot_mem.round({'100K': 2, '1M': 2, '10M': 2})

print("\n" + "="*60)
print("PEAK MEMORY BY OPERATION (MB)")
print("="*60)
display(pivot_mem.style.background_gradient(cmap='RdYlGn_r', subset=['100K', '1M', '10M']))

In [None]:
# Scaling analysis: startup time and total process memory per dataset size,
# each also expressed relative to the 100K baseline.
startup_data = (
    df_results.loc[
        df_results['operation'] == 'startup',
        ['dataset_size', 'time_sec', 'total_memory_mb'],
    ]
    .set_index('dataset_size')
    .reindex(['100K', '1M', '10M'])
    .rename(columns={
        'time_sec': 'Startup Time (s)',
        'total_memory_mb': 'Total Memory (MB)',
    })
)

# Ratios against the 100K row
startup_data['Time vs 100K'] = (
    startup_data['Startup Time (s)']
    .div(startup_data.loc['100K', 'Startup Time (s)'])
    .round(2)
)
startup_data['Memory vs 100K'] = (
    startup_data['Total Memory (MB)']
    .div(startup_data.loc['100K', 'Total Memory (MB)'])
    .round(2)
)

print("\n" + "=" * 60)
print("SCALING ANALYSIS (Startup Operation)")
print("=" * 60)
display(startup_data.style.background_gradient(cmap='RdYlGn_r'))

In [None]:
# Key findings: compare the 100K and 10M runs and print a verdict on how
# startup time and memory grow with the data.

# Growth factor of the data between the two compared runs (10M / 100K rows).
# Scaling is only "linear or worse" once a ratio reaches this factor; the
# verdicts below are judged against it rather than against small constants.
data_scale = 100

startup_100k = startup_data.loc['100K', 'Startup Time (s)']
startup_10m = startup_data.loc['10M', 'Startup Time (s)']
time_ratio = startup_10m / startup_100k

mem_100k = startup_data.loc['100K', 'Total Memory (MB)']
mem_10m = startup_data.loc['10M', 'Total Memory (MB)']
mem_ratio = mem_10m / mem_100k

peak_mem_100k = df_results[(df_results['dataset_size'] == '100K') & (df_results['operation'] == 'startup')]['peak_memory_mb'].iloc[0]
peak_mem_10m = df_results[(df_results['dataset_size'] == '10M') & (df_results['operation'] == 'startup')]['peak_memory_mb'].iloc[0]

print("\n" + "="*60)
print("KEY FINDINGS")
print("="*60)
print(f"\nüìä Dataset Scaling: 100K ‚Üí 10M rows (100x increase)")
print(f"\n‚è±Ô∏è  Time Scaling:")
print(f"   Startup: {startup_100k:.2f}s ‚Üí {startup_10m:.2f}s ({time_ratio:.2f}x)")
if time_ratio < 1.0:
    print(f"   ‚≠ê SUB-LINEAR: Faster with more data!")
elif time_ratio < 2.0:
    print(f"   ‚úÖ EXCELLENT: Much better than linear scaling")
elif time_ratio < data_scale:
    # Time grew, but by less than the data did: still sub-linear.
    print(f"   ‚úÖ GOOD: Better than linear")
else:
    print(f"   ‚ö†Ô∏è  LINEAR or worse")

print(f"\nüíæ Memory Scaling:")
print(f"   Total: {mem_100k:.0f}MB ‚Üí {mem_10m:.0f}MB ({mem_ratio:.2f}x)")
print(f"   Peak:  {peak_mem_100k:.0f}MB ‚Üí {peak_mem_10m:.0f}MB ({peak_mem_10m/peak_mem_100k:.2f}x)")
if mem_ratio < 2.0:
    print(f"   ‚úÖ EXCELLENT: Sub-linear memory growth")
elif mem_ratio < data_scale:
    # Memory grew by less than the data did.
    print(f"   ‚úÖ GOOD: Better than linear")
else:
    # Only reached when memory grew at least as much as the data.
    print(f"   ‚ö†Ô∏è  Memory grows faster than data size")

print(f"\nüìà Categorical Metrics Performance:")
cat_times = df_results[df_results['operation'] == 'categorical_metrics']['time_sec']
# Only reported when the categorical benchmark ran for all three datasets.
if len(cat_times) == 3:
    cat_min, cat_max = cat_times.min(), cat_times.max()
    print(f"   Range: {cat_min:.2f}s - {cat_max:.2f}s")
    if cat_max / cat_min < 1.1:
        print(f"   ‚≠ê PERFECT: Constant time regardless of data size!")
    else:
        print(f"   ‚úÖ GOOD: Minimal variation ({cat_max/cat_min:.2f}x)")

## 5. Save Results

In [None]:
import datetime

# Persist the raw result rows to a timestamped CSV in the working directory.
timestamp = f"{datetime.datetime.now():%Y%m%d_%H%M%S}"
csv_file = "benchmark_results_" + timestamp + ".csv"

df_results.to_csv(csv_file, index=False)
print(f"\n‚úÖ Results saved to: {csv_file}")
print("\n" + "=" * 60)
print("BENCHMARK COMPLETE")
print("=" * 60)