# I/O Benchmarking: Network vs Local Storage

This notebook benchmarks different storage configurations for data extraction workflows:

## Test Scenarios

1. **Network → Network**: Read from `\\rbo-s1`, write back to `\\rbo-s1`
2. **Network → Local**: Read from `\\rbo-s1`, write to local NVMe (`D:\demo\staging`)
3. **Local → Local**: Copy to local first, then read/write locally
4. **Parameter Effects**: Test different processing parameters (`fix_phase`, `use_fft`, `register_z`)

## Key Questions

- Is it faster to extract directly to network or local storage?
- Does copying raw files to local storage first improve total workflow time?
- How much do processing parameters affect I/O vs computation time?
- Which file format (`.bin`, `.tiff`, `.zarr`, `.h5`) is fastest for each scenario?

In [None]:
# Imports
from pathlib import Path
import numpy as np
import time
import shutil
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import mbo_utilities as mbo

## Configuration

In [None]:
# Root locations: the network share and the local staging area.
NETWORK_PATH = Path(r"\\rbo-s1\S1_DATA\lbm\demo_user")
LOCAL_PATH = Path(r"D:\demo\staging")

# Raw input TIFFs: the network original and the destination of its local copy.
RAW_TIFFS_NETWORK = NETWORK_PATH.joinpath("raw_scanimage_tiffs")
RAW_TIFFS_LOCAL = LOCAL_PATH.joinpath("raw_scanimage_tiffs")

# Benchmark outputs, one directory per storage target.
OUTPUT_NETWORK = NETWORK_PATH.joinpath("benchmark_output")
OUTPUT_LOCAL = LOCAL_PATH.joinpath("benchmark_output")

# Make sure every directory the benchmarks write into exists.
for required_dir in (LOCAL_PATH, RAW_TIFFS_LOCAL, OUTPUT_NETWORK, OUTPUT_LOCAL):
    required_dir.mkdir(parents=True, exist_ok=True)

# Report both roots first, then whether each one is reachable.
storage_roots = (("Network", NETWORK_PATH), ("Local", LOCAL_PATH))
for label, root in storage_roots:
    print(f"{label} path: {root}")
for label, root in storage_roots:
    print(f"{label} path exists: {root.exists()}")

## Test Parameters

Define the file formats, the processing-parameter combinations, and the plane/frame subset used by every benchmark below.

In [None]:
# Output formats every benchmark writes, in the order they are run.
FILE_FORMATS = ['.bin', '.tiff', '.zarr', '.h5']

# Named processing configurations, ordered from least to most processing.
# Each entry carries the three processing flags plus a human-readable label.
PARAM_SETS = dict(
    minimal=dict(
        fix_phase=False,
        use_fft=False,
        register_z=False,
        description='No processing (fastest)',
    ),
    phase_only=dict(
        fix_phase=True,
        use_fft=False,
        register_z=False,
        description='Scan-phase correction only',
    ),
    phase_fft=dict(
        fix_phase=True,
        use_fft=True,
        register_z=False,
        description='FFT scan-phase correction',
    ),
    full=dict(
        fix_phase=True,
        use_fft=True,
        register_z=True,
        description='Full processing pipeline',
    ),
)

# Keep each run short: only a few planes and a capped number of frames.
TEST_PLANES = [6, 7, 8]  # Test with 3 planes
TEST_FRAMES = 1000  # Number of frames to extract

# Echo the configuration so it is recorded alongside the results.
print("Parameter sets to test:")
for set_name, settings in PARAM_SETS.items():
    print(f"  {set_name}: {settings['description']}")
print(f"\nFile formats: {FILE_FORMATS}")
print(f"Test planes: {TEST_PLANES}")
print(f"Test frames: {TEST_FRAMES}")

## Utility Functions

In [None]:
def get_directory_size(path):
    """Return the combined size of all files under ``path``, in MB.

    The directory is walked recursively; only regular files contribute.
    One MB is taken as 1024 * 1024 bytes.
    """
    bytes_per_mb = 1024 * 1024
    byte_total = sum(
        entry.stat().st_size
        for entry in Path(path).rglob('*')
        if entry.is_file()
    )
    return byte_total / bytes_per_mb

def time_operation(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and measure how long it takes.

    Returns:
        A ``(result, elapsed_seconds)`` tuple, where ``result`` is whatever
        ``func`` returned. Exceptions raised by ``func`` propagate unchanged.
    """
    # perf_counter() is monotonic and uses the highest-resolution clock
    # available, so the measured interval cannot go negative or jump when the
    # system wall clock is adjusted (which time.time() is subject to).
    start_time = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed_time = time.perf_counter() - start_time
    return result, elapsed_time

def copy_files_timed(src, dst):
    """Copy the directory tree ``src`` to ``dst`` and time the copy.

    Any existing ``dst`` is removed first so the copy always starts from a
    clean destination. That removal is deliberately excluded from the timing:
    deleting a stale copy from a previous run would otherwise be counted as
    copy time and skew the reported copy throughput.

    Args:
        src: Source directory (path-like).
        dst: Destination directory; must support ``.exists()`` (e.g. a ``Path``).

    Returns:
        ``(elapsed_seconds, size_mb)`` - the duration of the copy itself and
        the size of the resulting destination tree in MB.
    """
    if dst.exists():
        shutil.rmtree(dst)

    # Time only the copy, using a monotonic high-resolution clock.
    start_time = time.perf_counter()
    shutil.copytree(src, dst)
    elapsed_time = time.perf_counter() - start_time

    size_mb = get_directory_size(dst)
    return elapsed_time, size_mb

def cleanup_output(path):
    """Reset ``path`` to an empty directory.

    Anything already at ``path`` is deleted recursively, then the directory
    (and any missing parents) is created again.
    """
    stale_output_present = path.exists()
    if stale_output_present:
        shutil.rmtree(path)
    path.mkdir(parents=True, exist_ok=True)


print("Utility functions loaded")

## Benchmark 1: Network → Network

Read from network storage, write back to network storage

In [None]:
# Benchmark 1: read the raw TIFFs from the network share and write the
# extracted output back to the network share. One result row is recorded per
# (parameter set, file format) combination, including failed runs.
results_network_to_network = []

# Get raw files (this list is reused by the Network → Local benchmark)
raw_files = mbo.get_files(RAW_TIFFS_NETWORK)
print(f"Found {len(raw_files)} TIFF files on network")

for param_name, param_set in PARAM_SETS.items():
    for file_format in FILE_FORMATS:
        print(f"\nTesting: {param_name} → {file_format}")

        # Start from an empty directory so the measured output size only
        # reflects this run (the directory is shared across formats).
        test_output = OUTPUT_NETWORK / f"net2net_{param_name}"
        cleanup_output(test_output)

        try:
            # Open the scan and apply this parameter set's reader options.
            # Opening the scan is intentionally outside the timed region.
            scan = mbo.imread(raw_files)
            scan.roi = None
            scan.fix_phase = param_set['fix_phase']
            scan.use_fft = param_set['use_fft']

            # Time only the imwrite call. perf_counter() is monotonic and
            # high resolution, unlike the wall clock returned by time.time().
            start_time = time.perf_counter()
            mbo.imwrite(
                scan,
                test_output,
                ext=file_format,
                num_frames=TEST_FRAMES,
                planes=TEST_PLANES,
                overwrite=True,
                register_z=param_set['register_z']
            )
            elapsed_time = time.perf_counter() - start_time

            # Size (MB) of everything written for this run
            output_size = get_directory_size(test_output)

            # Throughput in MB/s; guard against a zero-length interval
            throughput = output_size / elapsed_time if elapsed_time > 0 else 0

            results_network_to_network.append({
                'scenario': 'Network → Network',
                'params': param_name,
                'format': file_format,
                'time_sec': elapsed_time,
                'size_mb': output_size,
                'throughput_mb_per_sec': throughput,
                'success': True
            })

            print(f"  Time: {elapsed_time:.2f}s, Size: {output_size:.2f}MB, Throughput: {throughput:.2f}MB/s")

        except Exception as e:
            # A failing combination must not abort the whole sweep: report it
            # and keep a row marked unsuccessful so the failure stays visible.
            print(f"  ERROR: {e}")
            results_network_to_network.append({
                'scenario': 'Network → Network',
                'params': param_name,
                'format': file_format,
                'time_sec': None,
                'size_mb': None,
                'throughput_mb_per_sec': None,
                'success': False,
                'error': str(e)
            })

# Show only the runs that completed
df_net2net = pd.DataFrame(results_network_to_network)
print("\n" + "="*80)
print("Network → Network Results:")
print(df_net2net[df_net2net['success']])

## Benchmark 2: Network → Local

Read from network storage, write to local NVMe

In [None]:
# Benchmark 2: read the raw TIFFs from the network share and write the
# extracted output to local storage. Relies on `raw_files`, the network file
# list gathered in the Network → Network cell.
results_network_to_local = []

for param_name, param_set in PARAM_SETS.items():
    for file_format in FILE_FORMATS:
        print(f"\nTesting: {param_name} → {file_format}")

        # Start from an empty directory so the measured output size only
        # reflects this run (the directory is shared across formats).
        test_output = OUTPUT_LOCAL / f"net2local_{param_name}"
        cleanup_output(test_output)

        try:
            # Load scan from network and apply this parameter set's reader
            # options. Opening the scan is intentionally outside the timed region.
            scan = mbo.imread(raw_files)
            scan.roi = None
            scan.fix_phase = param_set['fix_phase']
            scan.use_fft = param_set['use_fft']

            # Time only the imwrite call to local storage. perf_counter() is
            # monotonic and high resolution, unlike the wall clock returned
            # by time.time().
            start_time = time.perf_counter()
            mbo.imwrite(
                scan,
                test_output,
                ext=file_format,
                num_frames=TEST_FRAMES,
                planes=TEST_PLANES,
                overwrite=True,
                register_z=param_set['register_z']
            )
            elapsed_time = time.perf_counter() - start_time

            # Size (MB) of everything written for this run
            output_size = get_directory_size(test_output)

            # Throughput in MB/s; guard against a zero-length interval
            throughput = output_size / elapsed_time if elapsed_time > 0 else 0

            results_network_to_local.append({
                'scenario': 'Network → Local',
                'params': param_name,
                'format': file_format,
                'time_sec': elapsed_time,
                'size_mb': output_size,
                'throughput_mb_per_sec': throughput,
                'success': True
            })

            print(f"  Time: {elapsed_time:.2f}s, Size: {output_size:.2f}MB, Throughput: {throughput:.2f}MB/s")

        except Exception as e:
            # A failing combination must not abort the whole sweep: report it
            # and keep a row marked unsuccessful so the failure stays visible.
            print(f"  ERROR: {e}")
            results_network_to_local.append({
                'scenario': 'Network → Local',
                'params': param_name,
                'format': file_format,
                'time_sec': None,
                'size_mb': None,
                'throughput_mb_per_sec': None,
                'success': False,
                'error': str(e)
            })

# Show only the runs that completed
df_net2local = pd.DataFrame(results_network_to_local)
print("\n" + "="*80)
print("Network → Local Results:")
print(df_net2local[df_net2local['success']])

## Benchmark 3: Copy First, Then Local → Local

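Copy the raw files to local storage once, then read and write locally. The one-time copy duration is recorded separately and is also added to each run's total time.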

In [None]:
# Benchmark 3: copy the raw TIFFs to local storage once, then read and write
# locally. Each row records the processing time on its own and the total time
# including the one-off copy.
results_copy_then_local = []

# First, copy raw files to local (only once)
print("Copying raw files from network to local...")
copy_time, copy_size = copy_files_timed(RAW_TIFFS_NETWORK, RAW_TIFFS_LOCAL)
print(f"Copy completed: {copy_time:.2f}s, {copy_size:.2f}MB")
# Same zero-interval guard as every other throughput calculation here, so a
# trivially small copy cannot raise ZeroDivisionError.
copy_throughput = copy_size / copy_time if copy_time > 0 else 0
print(f"Copy throughput: {copy_throughput:.2f}MB/s")

# Get local files
local_files = mbo.get_files(RAW_TIFFS_LOCAL)
print(f"Found {len(local_files)} TIFF files locally")

for param_name, param_set in PARAM_SETS.items():
    for file_format in FILE_FORMATS:
        print(f"\nTesting: {param_name} → {file_format}")

        # Start from an empty directory so the measured output size only
        # reflects this run (the directory is shared across formats).
        test_output = OUTPUT_LOCAL / f"local2local_{param_name}"
        cleanup_output(test_output)

        try:
            # Load scan from local and apply this parameter set's reader
            # options. Opening the scan is intentionally outside the timed region.
            scan = mbo.imread(local_files)
            scan.roi = None
            scan.fix_phase = param_set['fix_phase']
            scan.use_fft = param_set['use_fft']

            # Time only the imwrite call. perf_counter() is monotonic and
            # high resolution, unlike the wall clock returned by time.time().
            start_time = time.perf_counter()
            mbo.imwrite(
                scan,
                test_output,
                ext=file_format,
                num_frames=TEST_FRAMES,
                planes=TEST_PLANES,
                overwrite=True,
                register_z=param_set['register_z']
            )
            elapsed_time = time.perf_counter() - start_time

            # Size (MB) of everything written for this run
            output_size = get_directory_size(test_output)

            # Calculate throughput (just for processing)
            throughput = output_size / elapsed_time if elapsed_time > 0 else 0

            # Total time includes copy time
            total_time = copy_time + elapsed_time
            total_throughput = output_size / total_time if total_time > 0 else 0

            results_copy_then_local.append({
                'scenario': 'Copy + Local → Local',
                'params': param_name,
                'format': file_format,
                'copy_time_sec': copy_time,
                'process_time_sec': elapsed_time,
                'total_time_sec': total_time,
                'size_mb': output_size,
                'process_throughput_mb_per_sec': throughput,
                'total_throughput_mb_per_sec': total_throughput,
                'success': True
            })

            print(f"  Process time: {elapsed_time:.2f}s, Total time: {total_time:.2f}s")
            print(f"  Size: {output_size:.2f}MB, Process throughput: {throughput:.2f}MB/s")

        except Exception as e:
            # A failing combination must not abort the whole sweep: report it
            # and keep a row marked unsuccessful so the failure stays visible.
            print(f"  ERROR: {e}")
            results_copy_then_local.append({
                'scenario': 'Copy + Local → Local',
                'params': param_name,
                'format': file_format,
                'copy_time_sec': copy_time,
                'process_time_sec': None,
                'total_time_sec': None,
                'size_mb': None,
                'process_throughput_mb_per_sec': None,
                'total_throughput_mb_per_sec': None,
                'success': False,
                'error': str(e)
            })

# Show only the runs that completed
df_copy_local = pd.DataFrame(results_copy_then_local)
print("\n" + "="*80)
print("Copy + Local → Local Results:")
print(df_copy_local[df_copy_local['success']])

## Combined Results Analysis

In [None]:
# Collect every successful run from the three benchmarks into one flat list.
# Network → Network and Network → Local rows already share a schema.
all_results = [run for run in results_network_to_network if run['success']]
all_results.extend(run for run in results_network_to_local if run['success'])

# Copy + local rows are reshaped to that schema using the processing time
# only (copy time excluded) so the scenarios compare like for like.
all_results.extend(
    {
        'scenario': 'Local → Local (after copy)',
        'params': run['params'],
        'format': run['format'],
        'time_sec': run['process_time_sec'],
        'size_mb': run['size_mb'],
        'throughput_mb_per_sec': run['process_throughput_mb_per_sec'],
        'success': True
    }
    for run in results_copy_then_local
    if run['success']
)

df_all = pd.DataFrame(all_results)

# Persist the combined table under a timestamped name in the local staging area.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_file = LOCAL_PATH / f"benchmark_results_{timestamp}.csv"
df_all.to_csv(results_file, index=False)
print(f"Results saved to: {results_file}")

# Per-scenario summary of run time and throughput.
separator = "=" * 80
print("\n" + separator)
print("SUMMARY STATISTICS")
print(separator)

summary_stats = ['mean', 'std', 'min', 'max']
summary = (
    df_all.groupby('scenario')
    .agg({
        'time_sec': summary_stats,
        'throughput_mb_per_sec': summary_stats
    })
    .round(2)
)

print(summary)

## Visualization: Processing Time Comparison

In [None]:
# 2x2 overview figure: box plots per scenario on the top row, grouped bar
# charts of processing time on the bottom row.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Top row (panels 1 and 2): distribution of each metric by scenario.
box_panels = [
    (axes[0, 0], 'time_sec', 'Processing Time by Scenario', 'Time (seconds)'),
    (axes[0, 1], 'throughput_mb_per_sec', 'Throughput by Scenario', 'Throughput (MB/s)'),
]
for ax, metric, panel_title, y_label in box_panels:
    sns.boxplot(data=df_all, x='scenario', y=metric, ax=ax)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.set_ylabel(y_label, fontweight='bold')
    ax.set_xlabel('Scenario', fontweight='bold')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

# Bottom row (panels 3 and 4): processing time grouped by file format and by
# parameter set, with one bar colour per scenario.
bar_panels = [
    (axes[1, 0], 'format', 'Processing Time by File Format', 'File Format'),
    (axes[1, 1], 'params', 'Processing Time by Parameters', 'Parameter Set'),
]
for ax, group_column, panel_title, x_label in bar_panels:
    sns.barplot(data=df_all, x=group_column, y='time_sec', hue='scenario', ax=ax)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.set_ylabel('Time (seconds)', fontweight='bold')
    ax.set_xlabel(x_label, fontweight='bold')
    # Legend is anchored outside the axes so it does not cover the bars.
    ax.legend(title='Scenario', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3, axis='y')

# Save next to the CSV results, then display inline.
plt.tight_layout()
plt.savefig(LOCAL_PATH / f"benchmark_plots_{timestamp}.png", dpi=300, bbox_inches='tight')
plt.show()

## Speedup Analysis

In [None]:
# Baseline for all comparisons: mean run time of the Network → Network scenario.
is_baseline = df_all['scenario'] == 'Network → Network'
baseline = df_all.loc[is_baseline, 'time_sec'].mean()

# One row per scenario: its mean time, the speedup factor versus the baseline
# (values above 1 mean faster), and the same figure as a percentage.
speedup_data = []
for scenario in df_all['scenario'].unique():
    mean_time = df_all.loc[df_all['scenario'] == scenario, 'time_sec'].mean()
    speedup = baseline / mean_time
    speedup_data.append({
        'scenario': scenario,
        'mean_time_sec': mean_time,
        'speedup': speedup,
        'percent_faster': (speedup - 1) * 100
    })

# Fastest scenario first.
df_speedup = pd.DataFrame(speedup_data).sort_values('speedup', ascending=False)

separator = "=" * 80
print("\n" + separator)
print("SPEEDUP ANALYSIS (relative to Network → Network)")
print(separator)
print(df_speedup.to_string(index=False))

# Bar chart of the speedup factors: red for slower than baseline, green otherwise.
bar_colors = ['red' if factor < 1 else 'green' for factor in df_speedup['speedup']]
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(
    df_speedup['scenario'],
    df_speedup['speedup'],
    color=bar_colors,
    alpha=0.7,
    edgecolor='black',
)
ax.axhline(1, color='black', linestyle='--', linewidth=2, label='Baseline')
ax.set_ylabel('Speedup Factor', fontsize=12, fontweight='bold')
ax.set_xlabel('Scenario', fontsize=12, fontweight='bold')
ax.set_title('Speedup Relative to Network → Network', fontsize=14, fontweight='bold')
ax.tick_params(axis='x', rotation=45)
ax.grid(True, alpha=0.3, axis='y')
ax.legend()

# Annotate each bar with its factor, centred just above the bar top.
for rect, factor in zip(bars, df_speedup['speedup']):
    label_x = rect.get_x() + rect.get_width() / 2.
    ax.text(label_x, rect.get_height(), f'{factor:.2f}x',
            ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig(LOCAL_PATH / f"speedup_analysis_{timestamp}.png", dpi=300, bbox_inches='tight')
plt.show()

## Recommendations

In [None]:
# Find best configurations: the single fastest run and the single
# highest-throughput run across all scenarios.
best_overall = df_all.loc[df_all['time_sec'].idxmin()]
best_throughput = df_all.loc[df_all['throughput_mb_per_sec'].idxmax()]

# Best by file format: the fastest run for each format
best_by_format = df_all.groupby('format')['time_sec'].idxmin()
best_formats = df_all.loc[best_by_format]

print("\n" + "="*80)
print("RECOMMENDATIONS")
print("="*80)

print("\n1. FASTEST OVERALL CONFIGURATION:")
print(f"   Scenario: {best_overall['scenario']}")
print(f"   Format: {best_overall['format']}")
print(f"   Parameters: {best_overall['params']}")
print(f"   Time: {best_overall['time_sec']:.2f}s")
print(f"   Throughput: {best_overall['throughput_mb_per_sec']:.2f}MB/s")

print("\n2. HIGHEST THROUGHPUT:")
print(f"   Scenario: {best_throughput['scenario']}")
print(f"   Format: {best_throughput['format']}")
print(f"   Parameters: {best_throughput['params']}")
print(f"   Throughput: {best_throughput['throughput_mb_per_sec']:.2f}MB/s")

print("\n3. BEST FILE FORMATS:")
for _, row in best_formats.iterrows():
    print(f"   {row['format']}: {row['time_sec']:.2f}s ({row['scenario']})")

# Mean run time per scenario, averaged over every successful
# (parameter set, format) combination.
print("\n4. WORKFLOW COMPARISON:")
net2net_avg = df_all[df_all['scenario'] == 'Network → Network']['time_sec'].mean()
net2local_avg = df_all[df_all['scenario'] == 'Network → Local']['time_sec'].mean()
local2local_avg = df_all[df_all['scenario'] == 'Local → Local (after copy)']['time_sec'].mean()

print(f"   Network → Network: {net2net_avg:.2f}s (avg)")
print(f"   Network → Local: {net2local_avg:.2f}s (avg)")
print(f"   Local → Local: {local2local_avg:.2f}s (avg, excluding copy time)")

if net2local_avg < net2net_avg:
    improvement = ((net2net_avg - net2local_avg) / net2net_avg) * 100
    print(f"\n   ✓ Writing to local is {improvement:.1f}% faster than network")
else:
    print("\n   ✗ Network storage is faster in this configuration")

# Check if copy + process is worth it.
# Compare like with like: the Network → Network figure is an average over all
# successful runs, so the copy strategy is averaged over all of its successful
# runs too, rather than taking only the first run (one specific parameter
# set/format), which would also hide this section if that one run had failed.
successful_copy_runs = [r for r in results_copy_then_local if r['success']]
if successful_copy_runs:
    total_with_copy = (
        sum(r['total_time_sec'] for r in successful_copy_runs)
        / len(successful_copy_runs)
    )
    print("\n5. COPY STRATEGY:")
    print(f"   Copy time: {copy_time:.2f}s")
    print(f"   Total time (copy + process): {total_with_copy:.2f}s (avg)")
    if total_with_copy < net2net_avg:
        print("   ✓ Copying first is worth it for repeated processing")
    else:
        print("   ✗ Direct processing is faster for single runs")

## Cleanup (Optional)

Remove benchmark output files to free up space

In [None]:
# Uncomment to clean up test outputs
# shutil.rmtree(OUTPUT_NETWORK)
# shutil.rmtree(OUTPUT_LOCAL)
# shutil.rmtree(RAW_TIFFS_LOCAL)
# print("Cleanup complete")