# Seismometer Performance Benchmark

## Purpose

Measures time and memory for **each operation** in the classifier_bin notebook:
- **18 operations** measured individually (startup + 17 operations)
- **2 configurations**: per_context OFF vs ON
- **Multiple data sizes**: base dataset + scaled versions

## Configuration

Edit `SCALE_SIZES` in cell 2 to control which scaled datasets to generate:
```python
SCALE_SIZES = [1_000_000, 10_000_000]  # 1M, 10M
```

The benchmark will run on:
- Base dataset (data/predictions.parquet as-is)
- Scaled datasets (generated in data/scaled/)

## Output

4 tables showing:
1. **Memory** - Config A (baseline) with each operation's peak as a % of the highest peak on the largest dataset (shown in the `% of Total` column)
2. **Memory** - Config B (per_context) with overhead %
3. **Time** - Config A (baseline) with % of total
4. **Time** - Config B (per_context) with overhead %

**Just run all cells!**

## Setup

In [1]:
import sys
import time
import psutil
import os
import gc
import threading
import pandas as pd
import polars as pl
from pathlib import Path
import shutil
import numpy as np

sys.path.insert(0, str(Path.cwd().parent.parent / 'src'))
import seismometer as sm

print("✓ Imports loaded")

# ============================================================================
# CONFIGURATION: Edit this list to change benchmark data sizes
# ============================================================================
# Target row counts for the generated (scaled) datasets; the base dataset is
# always benchmarked in addition to these.
SCALE_SIZES = [1_000_000, 2_000_000]  # 1M, 2M rows
# Examples:
#   [1_000_000]                           # Just 1M (base + 1M = 2 sizes)
#   [1_000_000, 10_000_000]               # 1M, 10M (base + 2 = 3 sizes)
#   [5_000_000, 10_000_000, 100_000_000]  # 5M, 10M, 100M (base + 3 = 4 sizes)
# ============================================================================

print(f"✓ Configuration: Base + {len(SCALE_SIZES)} scaled sizes")

✓ Imports loaded
✓ Configuration: Base + 2 scaled sizes


## 1. Create Scaled Datasets

In [2]:
def create_scaled_datasets(scale_sizes=(1_000_000, 3_000_000)):
    """Generate scaled datasets from base data.

    The base dataset (data/predictions.parquet, data/events.parquet) is used
    as-is. Scaled versions are written to the data/scaled/ directory as
    ``predictions_<suffix>.parquet`` and ``events_<suffix>.parquet``. A size is
    regenerated whenever either of its two files is missing.

    Parameters
    ----------
    scale_sizes : sequence of int
        Target prediction row counts for the scaled datasets
        (e.g., [1_000_000, 10_000_000]). Events are scaled by the same factor.

    Returns
    -------
    dict
        Maps each requested size to the suffix used in its file names
        (e.g., ``{1_000_000: "1m"}``).

    Raises
    ------
    ValueError
        If datasets must be generated but the base predictions file is empty.
    """
    scaled_dir = Path('data/scaled')
    scaled_dir.mkdir(parents=True, exist_ok=True)

    def size_to_label(size):
        # Abbreviate only exact multiples so two distinct sizes can never share
        # a label (1_500_000 -> "1500k" rather than colliding with "1m").
        if size >= 1_000_000 and size % 1_000_000 == 0:
            return f"{size // 1_000_000}m"
        if size >= 1000 and size % 1000 == 0:
            return f"{size // 1000}k"
        return str(size)

    size_to_suffix = {size: size_to_label(size) for size in scale_sizes}

    def dataset_paths(size):
        """Return the (predictions, events) output paths for a target size."""
        suffix = size_to_suffix[size]
        return (
            scaled_dir / f'predictions_{suffix}.parquet',
            scaled_dir / f'events_{suffix}.parquet',
        )

    # A dataset counts as present only when BOTH of its files exist.
    missing = [s for s in scale_sizes if not all(p.exists() for p in dataset_paths(s))]

    if not missing:
        print(f"✓ All scaled datasets exist: {[size_to_suffix[s] for s in scale_sizes]}")
        return size_to_suffix

    print(f"Creating scaled datasets: {[size_to_suffix[s] for s in missing]}...")

    # Load base data; IDs are cast to strings so a replica suffix can be appended.
    predictions = pl.read_parquet('data/predictions.parquet').with_columns([
        pl.col('encounter_id').cast(pl.Utf8), pl.col('patient_nbr').cast(pl.Utf8)
    ])
    events = pl.read_parquet('data/events.parquet').with_columns([
        pl.col('encounter_id').cast(pl.Utf8), pl.col('patient_nbr').cast(pl.Utf8)
    ])

    base_len = len(predictions)
    print(f"  Base dataset: {base_len:,} rows")
    if base_len == 0:
        raise ValueError("data/predictions.parquet is empty; cannot generate scaled datasets")

    def replicate(df, target_rows):
        """Replicate dataframe to target row count with unique IDs.

        Every full copy of ``df`` gets its own ``_r<k>`` ID suffix. The copy
        index is derived from ``df``'s own length, so copy k of the events
        carries the same suffix as copy k of the predictions.
        """
        source_len = len(df)
        if source_len == 0 or target_rows <= 0:
            return df.head(0)
        replicas = -(-target_rows // source_len)  # integer ceiling division
        replica_idx = (pl.col('_row') // source_len).cast(pl.Utf8)
        return (
            pl.concat([df] * replicas)
            .with_row_index('_row')
            .with_columns([
                (pl.col('encounter_id') + "_r" + replica_idx).alias('encounter_id'),
                (pl.col('patient_nbr') + "_r" + replica_idx).alias('patient_nbr'),
            ])
            .drop('_row')
            .head(target_rows)
        )

    # Generate only missing scaled datasets
    for target_size in missing:
        suffix = size_to_suffix[target_size]
        pred_out, event_out = dataset_paths(target_size)
        print(f"  Generating {suffix} ({target_size:,} rows)...", end=' ', flush=True)
        replicate(predictions, target_size).write_parquet(pred_out)
        # Scale events by the same factor as the predictions.
        replicate(events, int(len(events) * target_size / base_len)).write_parquet(event_out)
        print("✓")

    print(f"✓ Scaled datasets ready")
    return size_to_suffix

# Generate scaled datasets and get the suffix mapping.
# The mapping (size -> file-name suffix) is reused when the benchmark cell
# builds the list of dataset paths.
size_to_suffix = create_scaled_datasets(SCALE_SIZES)

Creating scaled datasets: ['2m']...
  Base dataset: 104,307 rows
  Generating 2m (2,000,000 rows)... ✓
✓ Scaled datasets ready


## 2. Benchmark Infrastructure

In [3]:
class MemoryMonitor:
    """Track the peak resident set size (RSS) of the current process.

    A daemon thread polls ``psutil.Process.memory_info().rss`` every
    ``interval`` seconds between ``start()`` and ``stop()``. One sample is also
    taken synchronously in both ``start()`` and ``stop()``, so even an
    operation shorter than the polling interval yields a non-zero reading.

    Parameters
    ----------
    interval : float
        Seconds between background samples (default 0.1).
    """

    def __init__(self, interval=0.1):
        self.interval = interval
        self.peak_rss = 0          # highest RSS seen since the last start(), in bytes
        self.measuring = False     # True between start() and stop()
        self.thread = None
        self.process = psutil.Process(os.getpid())
        self._stop_event = threading.Event()

    def _record(self):
        """Fold the current RSS into the running peak."""
        self.peak_rss = max(self.peak_rss, self.process.memory_info().rss)

    def start(self):
        """Reset the peak and begin background sampling."""
        self.peak_rss = 0
        # Sample here so the reading never depends on thread scheduling.
        self._record()
        self._stop_event.clear()
        self.measuring = True
        self.thread = threading.Thread(target=self._sample, daemon=True)
        self.thread.start()

    def stop(self):
        """Stop sampling and return the peak RSS in bytes."""
        self.measuring = False
        # Wake the sampler immediately instead of waiting out its sleep.
        self._stop_event.set()
        if self.thread:
            self.thread.join(timeout=1.0)
        # Final sample covers memory still held when the operation finished.
        self._record()
        return self.peak_rss

    def _sample(self):
        """Background loop: record RSS until stop() is requested."""
        while not self._stop_event.is_set():
            self._record()
            self._stop_event.wait(self.interval)


def get_operations(per_context=False):
    """Build the ordered list of benchmarked operations that follow startup.

    Each entry is an ``(operation_name, zero_argument_callable)`` pair; nothing
    on ``sm`` is looked up or invoked until the callable itself is called.
    ``per_context`` is forwarded only to ``sm.plot_model_evaluation``.

    Note: show_cohort_summaries appears TWICE - once with
    (by_target=False, by_score=False) and once with (True, True).
    """
    def call(attr, **kwargs):
        # Defer both the attribute lookup and the invocation to benchmark time.
        return lambda: getattr(sm, attr)(**kwargs)

    def run_plot_model_evaluation():
        # Written out explicitly so a fresh empty dict is passed on every call.
        return sm.plot_model_evaluation(
            {}, 'Readmitted within 30 Days', 'Risk30DayReadmission', (0.10, 0.20), per_context=per_context
        )

    operations = [
        ('show_info', call('show_info', plot_help=True)),
        ('ExploreSubgroups', call('ExploreSubgroups')),
        ('feature_alerts', call('feature_alerts')),
        ('feature_summary', call('feature_summary')),
        ('cohort_comparison_report', call('cohort_comparison_report')),
        ('target_feature_summary', call('target_feature_summary')),
        ('ExploreModelEvaluation', call('ExploreModelEvaluation')),
        ('plot_model_evaluation', run_plot_model_evaluation),
        ('ExploreFairnessAudit', call('ExploreFairnessAudit')),
        ('show_cohort_summaries_baseline', call('show_cohort_summaries', by_target=False, by_score=False)),
        ('show_cohort_summaries_per_context', call('show_cohort_summaries', by_target=True, by_score=True)),
        ('ExploreCohortEvaluation', call('ExploreCohortEvaluation')),
        ('ExploreAnalyticsTable', call('ExploreAnalyticsTable')),
        ('ExploreOrdinalMetrics', call('ExploreOrdinalMetrics')),
        ('ExploreCohortOrdinalMetrics', call('ExploreCohortOrdinalMetrics')),
        ('ExploreCohortLeadTime', call('ExploreCohortLeadTime')),
        ('ExploreCohortOutcomeInterventionTimes', call('ExploreCohortOutcomeInterventionTimes')),
    ]
    return operations


def _click_update_button(widget):
    """Search a widget tree depth-first for an "update" button and trigger it.

    A widget matches when its ``description`` contains "update"
    (case-insensitive). Returns True as soon as one match has been handled,
    False if the tree contains none.
    """
    if hasattr(widget, 'description') and 'update' in str(widget.description).lower():
        if hasattr(widget, 'click'):
            widget.click()
        elif hasattr(widget, '_click_handlers'):
            # No click() method: call the registered handlers directly.
            for handler in widget._click_handlers.callbacks:
                handler(widget)
        return True
    return any(_click_update_button(child) for child in getattr(widget, 'children', ()))


def benchmark_dataset(dataset_size, pred_path, event_path, per_context=False):
    """Measure all operations individually.

    Modifies config.yml temporarily to point to the specified data files,
    then restores the original after benchmarking (also on failure).

    Parameters
    ----------
    dataset_size : str
        Label printed in the section header for this run.
    pred_path, event_path : str
        Paths to the predictions / events parquet files. A leading ``data/``
        is stripped because the config stores paths relative to its data_dir.
    per_context : bool
        Forwarded to ``get_operations``; also selects the printed config name.

    Returns
    -------
    list of dict
        One entry per operation (``run_startup`` first) with keys
        ``operation``, ``time_sec`` and ``peak_rss_mb``. An operation that
        raised has ``None`` for both measurements.
    """
    import re
    from IPython.utils.capture import capture_output

    config_name = 'per_context' if per_context else 'baseline'
    print(f"\n{'='*70}")
    print(f"{dataset_size} - {config_name}")
    print(f"{'='*70}")

    # Backup original config.yml
    shutil.copy('config.yml', 'config.yml.backup')

    try:
        # Config has separate fields: data_dir, prediction_path, event_path.
        # Extract paths relative to data_dir (keeping data_dir as "data").
        prefix = 'data/'
        pred_relative = pred_path[len(prefix):] if pred_path.startswith(prefix) else pred_path
        event_relative = event_path[len(prefix):] if event_path.startswith(prefix) else event_path

        with open('config.yml', 'r') as f:
            config_original = f.read()

        # Callable replacements keep backslashes in a path from being read as
        # regex escape sequences.
        config, n_pred = re.subn(
            r'prediction_path:\s*"[^"]*"',
            lambda _m: f'prediction_path: "{pred_relative}"',
            config_original,
        )
        config, n_event = re.subn(
            r'event_path:\s*"[^"]*"',
            lambda _m: f'event_path: "{event_relative}"',
            config,
        )
        # The patterns only match double-quoted values. Warn instead of
        # silently benchmarking whatever files the config already names.
        unmatched = [
            field for field, count in (('prediction_path', n_pred), ('event_path', n_event))
            if count == 0
        ]
        if unmatched:
            print(f"  ⚠ config.yml: no double-quoted {', '.join(unmatched)} entry found; leaving as configured")

        with open('config.yml', 'w') as f:
            f.write(config)

        results = []

        # 1. Startup
        print("  run_startup...", end=' ', flush=True)
        # NOTE(review): presumably drops cached Seismogram instances so startup
        # really reloads the data - confirm against the seismometer package.
        sm.Seismogram._instances = {}
        gc.collect()

        monitor = MemoryMonitor()
        monitor.start()
        try:
            t0 = time.perf_counter()
            sm.run_startup(config_path='.', log_level=30, reset=True)
            t_startup = time.perf_counter() - t0
        finally:
            # Always stop the sampler so a failed startup cannot leave its
            # thread polling forever.
            rss_startup = monitor.stop()

        print(f"✓ {t_startup:.2f}s, {rss_startup/1024/1024:.0f}MB")
        results.append({'operation': 'run_startup', 'time_sec': t_startup, 'peak_rss_mb': rss_startup/1024/1024})

        # 2-18. Other operations
        for op_name, op_func in get_operations(per_context):
            print(f"  {op_name}...", end=' ', flush=True)
            gc.collect()

            monitor = MemoryMonitor()
            monitor.start()
            t0 = time.perf_counter()

            try:
                # Capture rich display output (widgets, HTML, plots);
                # stdout and stderr are deliberately left uncaptured.
                with capture_output(stdout=False, stderr=False, display=True):
                    result = op_func()

                    # For Explore widgets, trigger the update button click so
                    # the actual computation is part of the measurement.
                    if op_name.startswith('Explore') and result is not None:
                        if hasattr(result, 'children'):
                            _click_update_button(result)

                t_op = time.perf_counter() - t0
                rss_op = monitor.stop()
                print(f"✓ {t_op:.2f}s, {rss_op/1024/1024:.0f}MB")
                results.append({'operation': op_name, 'time_sec': t_op, 'peak_rss_mb': rss_op/1024/1024})
            except Exception as e:
                # A failing operation is recorded with empty measurements so
                # the remaining operations still run.
                monitor.stop()
                print(f"✗ {str(e)[:50]}")
                results.append({'operation': op_name, 'time_sec': None, 'peak_rss_mb': None})

        return results

    finally:
        # Restore original config.yml
        shutil.move('config.yml.backup', 'config.yml')
        sm.Seismogram._instances = {}
        gc.collect()

# Cell status message: the benchmark helpers are defined at this point.
print("✓ Infrastructure ready")

✓ Infrastructure ready


## 3. Run Benchmarks

In [None]:
# Auto-detect base dataset size
# NOTE: this reads the whole base predictions file just to count its rows.
base_pred_path = Path('data/predictions.parquet')
base_rows = len(pl.read_parquet(base_pred_path))

def format_size(n):
    """Format a row count as a short human-readable label.

    Counts below 1,000 are returned verbatim. Larger counts are rounded to a
    step that depends on magnitude (nearest 1K below 10K, nearest 10K below
    100K, nearest 100K otherwise). The result is shown in millions only when
    the rounded value is a whole number of millions, otherwise in thousands.
    The numeric part is therefore always an integer followed by "K" or "M"
    (e.g. 104_307 -> "100K", 1_500_000 -> "1500K", 2_000_000 -> "2M").

    Parameters
    ----------
    n : int
        Row count.

    Returns
    -------
    str
        Label such as "999", "50K", "1500K" or "2M".
    """
    if n < 1000:
        return str(n)

    if n >= 100_000:
        step = 100_000
    elif n >= 10_000:
        step = 10_000
    else:
        # A 10K step would round 1,000-4,999 down to "0K".
        step = 1000

    rounded = round(n / step) * step
    # Whole millions only: 1.5M must not share a label with 1M, and a value
    # rounding up to one million reads "1M", not "1000K".
    if rounded % 1_000_000 == 0:
        return f"{rounded // 1_000_000}M"
    return f"{rounded // 1000}K"

# Build dataset list: base + scaled
# Each entry is (display label, predictions path, events path).
datasets = [
    (format_size(base_rows), 'data/predictions.parquet', 'data/events.parquet'),
]

# Add scaled datasets using the size_to_suffix mapping returned by
# create_scaled_datasets
for scale_size in SCALE_SIZES:
    suffix = size_to_suffix[scale_size]
    datasets.append((
        format_size(scale_size),
        f'data/scaled/predictions_{suffix}.parquet',
        f'data/scaled/events_{suffix}.parquet'
    ))

# The result tables pivot on the label, so two datasets sharing one label
# would silently be merged into a single column.
dataset_labels = [d[0] for d in datasets]
duplicate_labels = sorted({lbl for lbl in dataset_labels if dataset_labels.count(lbl) > 1})
if duplicate_labels:
    print(f"⚠ Duplicate dataset labels {duplicate_labels}: their results will be merged in the tables")

# run_startup plus every operation returned by get_operations(), so the
# banner stays correct when the operation list changes.
n_ops = 1 + len(get_operations())

print("\n" + "="*70)
print(f"BENCHMARKS: {n_ops} ops × {len(datasets)} sizes × 2 configs = {n_ops*len(datasets)*2} measurements")
print("="*70)
print(f"Datasets: {', '.join(dataset_labels)}")
print()

all_results = []

# Run every dataset under both configurations; tag each measurement with the
# dataset label and configuration name so it can be pivoted later.
for dataset_size, pred_path, event_path in datasets:
    for per_context in [False, True]:
        results = benchmark_dataset(dataset_size, pred_path, event_path, per_context)
        for r in results:
            r['dataset_size'] = dataset_size
            r['config'] = 'per_context' if per_context else 'baseline'
            all_results.append(r)

df = pd.DataFrame(all_results)
print("\n" + "="*70)
print("✓ COMPLETE")
print("="*70)


BENCHMARKS: 18 ops × 3 sizes × 2 configs = 108 measurements
Datasets: 100K, 1M, 2M


100K - baseline
  run_startup... ✓ 2.27s, 2742MB
  show_info... ✓ 0.01s, 2646MB
  ExploreSubgroups... ✓ 0.21s, 2648MB
  feature_alerts... ✓ 0.11s, 2699MB
  feature_summary... ✓ 0.09s, 2699MB
  cohort_comparison_report... ✓ 0.20s, 2739MB
  target_feature_summary... ✓ 0.10s, 2739MB
  ExploreModelEvaluation... ✓ 3.04s, 2847MB
  plot_model_evaluation... ✓ 1.39s, 2856MB
  ExploreFairnessAudit... ✓ 0.98s, 2859MB
  show_cohort_summaries_baseline... ✓ 0.43s, 2877MB
  show_cohort_summaries_per_context... ✓ 3.09s, 2906MB
  ExploreCohortEvaluation... ✓ 1.28s, 2906MB
  ExploreAnalyticsTable... ✓ 0.65s, 2903MB
  ExploreOrdinalMetrics... ✓ 0.95s, 2903MB
  ExploreCohortOrdinalMetrics... ✓ 0.92s, 2910MB
  ExploreCohortLeadTime... ✓ 0.54s, 2911MB
  ExploreCohortOutcomeInterventionTimes... ✓ 3.29s, 2895MB

100K - per_context
  run_startup... ✓ 1.80s, 3112MB
  show_info... ✓ 0.01s, 3053MB
  ExploreSubgroups... ✓ 0.20s, 

## 4. Memory Tables

In [None]:
# Config A: Baseline
# One row per operation, one column per dataset label, values = peak RSS in MB.
baseline_rows = df[df['config'] == 'baseline']
mem_a = baseline_rows.pivot_table(
    index='operation', columns='dataset_size', values='peak_rss_mb', aggfunc='first'
)


def _label_to_rows(label):
    """Convert a size label such as "100K" or "2M" back to a row count."""
    number = int(label.rstrip('KM'))
    multiplier = 1000 if 'K' in label else 1_000_000 if 'M' in label else 1
    return number * multiplier


# Column order: smallest dataset first. `cols` and `largest_col` are reused by
# the later table cells.
cols = sorted(mem_a.columns, key=_label_to_rows)
mem_a = mem_a[cols]

# Each operation's peak relative to the highest peak on the largest dataset.
largest_col = cols[-1]
largest_mem = mem_a[largest_col]
mem_a['% of Total'] = (largest_mem / largest_mem.max() * 100).round(1)

rule = "=" * 80
peak_summary = ' / '.join(f'{col}={mem_a[col].max():.0f}MB' for col in cols)

print("\n" + rule)
print("PEAK RSS (MB) - Config A: Baseline (per_context=False)")
print(rule)
print(mem_a.to_string(float_format="{:,.0f}".format))
print(f"\nPeak: {peak_summary}")

In [None]:
# Config B: Per-context
per_context_rows = df[df['config'] == 'per_context']
mem_b = per_context_rows.pivot_table(
    index='operation', columns='dataset_size', values='peak_rss_mb', aggfunc='first'
)

# Same column order as the baseline table
mem_b = mem_b[cols]


def _mem_overhead_pct(op):
    """Mean % change in peak RSS vs. baseline across all dataset sizes."""
    if op not in mem_a.index:
        return 0
    base_vals = mem_a.loc[op, cols].values
    ctx_vals = mem_b.loc[op, cols].values
    return ((ctx_vals - base_vals) / base_vals * 100).mean()


# Computed before the text column is added, so only numeric columns are read.
overhead = [_mem_overhead_pct(op) for op in mem_b.index]
mem_b['vs Baseline'] = ['-' if np.isnan(x) else f"{x:+.0f}%" for x in overhead]

rule = "=" * 80
peak_summary = ' / '.join(f'{col}={mem_b[col].max():.0f}MB' for col in cols)
peak_ratio = mem_b[largest_col].max() / mem_a[largest_col].max()

print("\n" + rule)
print("PEAK RSS (MB) - Config B: Per-context (per_context=True)")
print(rule)
print(mem_b.to_string(float_format="{:,.0f}".format))
print(f"\nPeak: {peak_summary}")
print(f"Overhead: {(peak_ratio-1)*100:+.0f}%")

## 5. Time Tables

In [None]:
# Config A: Baseline
# One row per operation, one column per dataset label, values = seconds.
baseline_times = df[df['config'] == 'baseline'].pivot_table(
    index='operation', columns='dataset_size', values='time_sec', aggfunc='first'
)
time_a = baseline_times[cols]

# Each operation's share of the total runtime on the largest dataset.
largest_times = time_a[largest_col]
time_a['% of Total'] = (largest_times / largest_times.sum() * 100).round(1)

rule = "=" * 80
total_summary = ' / '.join(f'{col}={time_a[col].sum():.1f}s' for col in cols)

print("\n" + rule)
print("RUNTIME (seconds) - Config A: Baseline (per_context=False)")
print(rule)
print(time_a.to_string(float_format="{:,.2f}".format))
print(f"\nTotal: {total_summary}")

In [None]:
# Config B: Per-context
per_context_times = df[df['config'] == 'per_context'].pivot_table(
    index='operation', columns='dataset_size', values='time_sec', aggfunc='first'
)
time_b = per_context_times[cols]


def _time_overhead_pct(op):
    """Mean % change in runtime vs. baseline across all dataset sizes."""
    if op not in time_a.index:
        return 0
    base_vals = time_a.loc[op, cols].values
    ctx_vals = time_b.loc[op, cols].values
    return ((ctx_vals - base_vals) / base_vals * 100).mean()


# Computed before the text column is added, so only numeric columns are read.
time_oh = [_time_overhead_pct(op) for op in time_b.index]
time_b['vs Baseline'] = ['-' if np.isnan(x) else f"{x:+.0f}%" for x in time_oh]

rule = "=" * 80
total_summary = ' / '.join(f'{col}={time_b[col].sum():.1f}s' for col in cols)
total_ratio = time_b[largest_col].sum() / time_a[largest_col].sum()

print("\n" + rule)
print("RUNTIME (seconds) - Config B: Per-context (per_context=True)")
print(rule)
print(time_b.to_string(float_format="{:,.2f}".format))
print(f"\nTotal: {total_summary}")
print(f"Overhead: {(total_ratio-1)*100:+.0f}%")

## 6. Save Results

In [None]:
import datetime

# Write every raw measurement to a timestamped CSV in the working directory,
# so repeated runs never overwrite each other.
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
results_file = f"benchmark_results_{timestamp}.csv"
df.to_csv(results_file, index=False)
print(f"\n✓ Saved: {results_file}")