# Benchmark: subsample_array Performance

This notebook benchmarks fastplotlib's `subsample_array`, and the NaN/Inf filtering applied to its output, on arrays returned by `mbo_utilities.imread`.

In [None]:
import numpy as np
from pathlib import Path
import mbo_utilities as mbo
from fastplotlib.utils import subsample_array

In [None]:
# Location of the benchmark fixtures - point this at your own data directory
TEST_DATA_DIR = Path("E:/tests/lbm/mbo_utilities")
TEST_INPUT = TEST_DATA_DIR.joinpath("test_input.tif")

# Report up front whether the input is present, before any cell tries to load it
print(f"Test input exists: {TEST_INPUT.exists()}")

In [None]:
# Load the test data and report its basic geometry
data = mbo.imread(TEST_INPUT)
print(f"Data shape: {data.shape}")
print(f"Data dtype: {data.dtype}")
# Force a 64-bit accumulator: np.prod on a shape tuple otherwise uses NumPy's
# default integer, which is 32-bit on Windows with NumPy < 2 and silently wraps
# once the stack exceeds 2**31 - 1 elements.
print(f"Total elements: {np.prod(data.shape, dtype=np.int64):,}")

## Benchmark: subsample_array alone

In [None]:
# For 2D display, exclude the last two axes (y, x) from subsampling;
# arrays with fewer than two dimensions get no exemption.
if data.ndim >= 2:
    ignore_dims = (-2, -1)
else:
    ignore_dims = None
print(f"ignore_dims: {ignore_dims}")

In [None]:
%%timeit -n 3 -r 5
# Baseline: cost of one subsample_array call on the loaded data
# (-n 3 loops per run, -r 5 runs).
# NOTE: names bound inside a %%timeit cell are not kept in the notebook
# namespace, so `sub` has to be recomputed in a regular cell before it is used.
sub = subsample_array(data, ignore_dims=ignore_dims)

In [None]:
# Check subsampled shape and how much smaller it is than the source
sub = subsample_array(data, ignore_dims=ignore_dims)
# Count elements with an explicit 64-bit accumulator: np.prod on a shape tuple
# otherwise uses NumPy's default integer, which is 32-bit on Windows with
# NumPy < 2 and wraps for large stacks, corrupting the reduction factor.
n_full = np.prod(data.shape, dtype=np.int64)
n_sub = np.prod(sub.shape, dtype=np.int64)
print(f"Subsampled shape: {sub.shape}")
print(f"Subsampled elements: {n_sub:,}")
print(f"Reduction factor: {n_full / n_sub:.1f}x")

## Benchmark: nan/inf filtering

In [None]:
%%timeit -n 3 -r 5
# End-to-end cost: subsample, then drop NaN/Inf entries with a boolean keep-mask
# (-n 3 loops per run, -r 5 runs).
sub = subsample_array(data, ignore_dims=ignore_dims)
sub_real = sub[~(np.isnan(sub) | np.isinf(sub))]

In [None]:
# Re-run the subsample + filter outside %%timeit so the results can be inspected
sub = subsample_array(data, ignore_dims=ignore_dims)
non_finite = np.isnan(sub) | np.isinf(sub)  # True where an entry is NaN or +/-Inf
sub_real = sub[~non_finite]
print(f"Filtered shape: {sub_real.shape}")
# Entries removed by the mask = total entries minus the ones that survived
print(f"NaN/Inf count: {sub.size - sub_real.size}")

## Benchmark: Alternative filtering approaches

In [None]:
# Subsample once up front so the filtering benchmarks below time only the
# masking step, not the subsampling itself
sub = subsample_array(data, ignore_dims=ignore_dims)
sub_shape = sub.shape
print(f"Testing filtering on array of shape {sub_shape}")

In [None]:
%%timeit -n 10 -r 5
# Method 1: Combined isnan/isinf
# Two elementwise tests are OR-ed together and inverted into a keep-mask;
# boolean-mask indexing then yields a 1-D array of the surviving values.
sub_real = sub[~(np.isnan(sub) | np.isinf(sub))]

In [None]:
%%timeit -n 10 -r 5
# Method 2: np.isfinite (should be faster)
# A single elementwise test that is False for NaN and +/-Inf, used directly
# as the keep-mask.
sub_real = sub[np.isfinite(sub)]

In [None]:
%%timeit -n 10 -r 5
# Method 3: Check if filtering is even needed first
# The dtype kind check skips the mask entirely for non-float arrays.
if sub.dtype.kind == 'f':  # Only floats can have nan/inf
    sub_real = sub[np.isfinite(sub)]
else:
    # Flatten so the result is 1-D, like the boolean-masked branch above
    sub_real = sub.ravel()

## Benchmark: Different max_size values

In [None]:
# Show how the max_size budget changes the subsampled shape and element count
for max_size in (1e5, 5e5, 1e6, 5e6, 1e7):
    sub = subsample_array(data, ignore_dims=ignore_dims, max_size=max_size)
    n_elements = np.prod(sub.shape)
    print(f"max_size={max_size:.0e}: shape={sub.shape}, elements={n_elements:,}")

In [None]:
import timeit

# Time subsample_array per max_size with the timeit module (rather than the
# %%timeit magic) so the budget can be varied from an ordinary Python loop.
print("\nTiming for different max_size values:")

calls_per_run = 3
for max_size in (1e5, 5e5, 1e6, 5e6):
    namespace = dict(
        subsample_array=subsample_array,
        data=data,
        max_size=max_size,
        ignore_dims=ignore_dims,
    )
    runs = timeit.Timer(
        stmt='subsample_array(data, max_size=max_size, ignore_dims=ignore_dims)',
        globals=namespace,
    ).repeat(repeat=3, number=calls_per_run)
    # Each entry in `runs` is the total for `calls_per_run` calls;
    # average the runs, then normalise to a single call.
    avg_time = np.mean(runs) / calls_per_run
    print(f"max_size={max_size:.0e}: {avg_time*1000:.2f} ms")

## Look for larger/different test data, if available

In [None]:
# Survey the test directory for other candidate inputs (TIFF and RAW files)
if TEST_DATA_DIR.exists():
    # Both common TIFF extensions are collected, ".tif" matches first
    tif_files = [*TEST_DATA_DIR.glob("*.tif"), *TEST_DATA_DIR.glob("*.tiff")]
    raw_files = [*TEST_DATA_DIR.glob("*.raw")]
    print(f"Found {len(tif_files)} TIFF files")
    print(f"Found {len(raw_files)} RAW files")
    # Only preview the first few TIFF names to keep the output short
    for tif_path in tif_files[:5]:
        print(f"  - {tif_path.name}")