In [1]:
!uv pip install jupyterlab-vim

[2mUsing Python 3.12.9 environment at: C:\Users\RBO\repos\mbo_utilities\.venv[0m
[2mAudited [1m1 package[0m [2min 93ms[0m[0m


In [6]:
%load_ext autoreload
%autoreload 2
import mbo_utilities as mbo
import numpy as np
import zarr

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Open the raw demo acquisition; the benchmark helpers further down unpack
# its shape as (t, z, y, x). The bare last expression displays that shape.
data = mbo.imread(r"D://demo//raw")
data.shape

(1574, 14, 550, 440)

In [4]:
import time

# Baseline: time mbo.imwrite for planes 1-3 with sharding disabled.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if the system
# time is adjusted mid-run. The benchmark helpers below use perf_counter too.
start = time.perf_counter()
mbo.imwrite(data, "./zarr", ext=".zarr", planes=[1, 2, 3], sharded=False)
end = time.perf_counter()
print(f"No sharding: {end - start}")

Saving plane01_stitched.zarr:   0%|          | 0/37 [00:00<?, ?it/s]

Saving plane02_stitched.zarr:   0%|          | 0/37 [00:00<?, ?it/s]

Saving plane03_stitched.zarr:   0%|          | 0/37 [00:00<?, ?it/s]

No sharding: 13.621733903884888


In [5]:
# Same write as the unsharded baseline, but with sharding enabled, so the two
# elapsed times can be compared directly.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump mid-run.
start = time.perf_counter()
mbo.imwrite(data, "./zarr-sharded", ext=".zarr", planes=[1, 2, 3], sharded=True)
end = time.perf_counter()
print(f"Sharding: {end - start}")

Saving plane01_stitched.zarr:   0%|          | 0/37 [00:00<?, ?it/s]

Saving plane02_stitched.zarr:   0%|          | 0/37 [00:00<?, ?it/s]

Saving plane03_stitched.zarr:   0%|          | 0/37 [00:00<?, ?it/s]

Sharding: 94.21228432655334


In [18]:
from pathlib import Path
import shutil
import time
import numpy as np

def make_codecs(sharded: bool, level: int, inner_shape=None):
    """Build the Zarr v3 codec list for one array.

    Args:
        sharded: When true, wrap the byte pipeline in a single ShardingCodec.
        level: Gzip level. Any value ``<= 0`` means "no compression" and the
            pipeline is just ``BytesCodec``.
        inner_shape: Shape of the chunks inside each shard. Required when
            ``sharded`` is true, ignored otherwise.

    Returns:
        A list of codec instances: ``[BytesCodec]`` or
        ``[BytesCodec, GzipCodec]`` for plain layouts, or a one-element list
        holding a ``ShardingCodec`` that carries that same pipeline plus a
        ``BytesCodec`` + ``Crc32cCodec`` index pipeline.

    Raises:
        ValueError: If ``sharded`` is true and ``inner_shape`` is None.
    """
    # Zarr v3: must have at least one ArrayBytesCodec (BytesCodec).
    from zarr.codecs import BytesCodec, GzipCodec

    def _byte_pipeline():
        # Serializer first, then the optional gzip stage.
        stages = [BytesCodec()]
        if not level <= 0:
            stages.append(GzipCodec(level=int(level)))
        return stages

    if not sharded:
        return _byte_pipeline()

    from zarr.codecs import ShardingCodec, Crc32cCodec
    if inner_shape is None:
        raise ValueError("inner_shape required for sharded layouts")

    shard_codec = ShardingCodec(
        chunk_shape=tuple(inner_shape),
        codecs=_byte_pipeline(),
        index_codecs=[BytesCodec(), Crc32cCodec()],
    )
    return [shard_codec]

def make_volumetric(path, shape, dtype, sharded=False, level=0, t_chunk=256):
    """Create one 4-D Zarr array holding the whole (t, z, y, x) volume.

    Args:
        path: Destination store location; passed to zarr as ``str(path)``.
        shape: Four-element (t, z, y, x) shape; each entry is cast to int.
        dtype: Element dtype forwarded to ``zarr.create``.
        sharded: Forwarded to ``make_codecs``; when true the inner chunk
            shape is one full (y, x) frame, i.e. ``(1, 1, y, x)``.
        level: Compression level forwarded to ``make_codecs``.
        t_chunk: Number of time points per chunk along the first axis.

    Returns:
        The array returned by ``zarr.create``. Any existing content at
        ``path`` is replaced (``overwrite=True``).
    """
    import zarr

    n_t, n_z, n_y, n_x = (int(dim) for dim in shape)
    # Each chunk spans t_chunk frames of a single z-plane.
    outer_chunk = (int(t_chunk), 1, n_y, n_x)
    inner_chunk = (1, 1, n_y, n_x) if sharded else None
    pipeline = make_codecs(sharded, level, inner_shape=inner_chunk)

    create_kwargs = dict(
        store=str(path),
        shape=(n_t, n_z, n_y, n_x),
        chunks=outer_chunk,
        dtype=dtype,
        codecs=pipeline,
        overwrite=True,
    )
    return zarr.create(**create_kwargs)

def make_planar_group(path, shape, dtype, sharded=False, level=0, t_chunk=256):
    """Create a Zarr group containing one (t, 1, y, x) array per z-plane.

    Any existing directory at ``path`` is removed first. The arrays are
    named ``z0000``, ``z0001``, ... and share the same chunking and codecs.

    Args:
        path: Destination of the group store.
        shape: Four-element (t, z, y, x) shape; each entry is cast to int.
        dtype: Element dtype for every per-plane array.
        sharded: Forwarded to ``make_codecs``; when true the inner chunk
            shape is ``(1, 1, y, x)``.
        level: Compression level forwarded to ``make_codecs``.
        t_chunk: Number of time points per chunk along the first axis.

    Returns:
        A zarr group handle opened on ``path``.
    """
    import zarr
    import shutil

    path = Path(path)
    if path.exists():
        shutil.rmtree(path)
    store_location = str(path)

    # Create the root group directory before adding any member arrays.
    zarr.group(store=store_location, overwrite=True)

    n_t, n_z, n_y, n_x = (int(dim) for dim in shape)
    outer_chunk = (int(t_chunk), 1, n_y, n_x)
    inner_chunk = (1, 1, n_y, n_x) if sharded else None
    pipeline = make_codecs(sharded, level, inner_shape=inner_chunk)

    # One dataset per z-plane inside the same store, addressed via `path=`.
    plane_shape = (n_t, 1, n_y, n_x)
    for plane in range(n_z):
        zarr.create(
            store=store_location,
            path=f"z{plane:04d}",
            shape=plane_shape,
            chunks=outer_chunk,
            dtype=dtype,
            codecs=pipeline,
            overwrite=True,
        )

    # Hand back a group handle so callers can index the planes by name.
    return zarr.group(store=store_location)


def write_volumetric_by_z(arr, data, t_chunk=256):
    """Copy a 4-D source into ``arr`` plane by plane, ``t_chunk`` frames at a time.

    For every z index the time axis is walked in blocks of ``t_chunk``
    frames; each block is materialised as a C-contiguous ``(tt, 1, y, x)``
    NumPy array and assigned to ``arr[i:j, zz:zz+1, :, :]``. The wall time of
    each assignment is printed.

    Args:
        arr: Destination supporting 4-D slice assignment (for example the
            array returned by ``make_volumetric``).
        data: Source with a four-element ``.shape`` that supports
            ``data[i:j, zz]`` indexing.
        t_chunk: Frames per write; cast with ``int()``.

    Raises:
        ValueError: If ``int(t_chunk)`` is not positive. A zero or negative
            step can never advance along the time axis, so the previous
            ``while`` loop would spin forever.
    """
    t, z, _, _ = map(int, data.shape)
    step = int(t_chunk)
    if step <= 0:
        raise ValueError(f"t_chunk must be a positive integer, got {t_chunk!r}")

    for zz in range(z):
        for i in range(0, t, step):
            j = min(i + step, t)
            block = data[i:j, zz]          # (tt, y, x)
            if not isinstance(block, np.ndarray):
                block = np.asarray(block)
            if block.ndim == 3:
                # Restore the singleton z axis so the block matches the target slice.
                block = block[:, None, :, :]   # (tt,1,y,x)
            if not block.flags["C_CONTIGUOUS"]:
                block = np.ascontiguousarray(block)
            # Time only the store assignment, not the read/reshape above.
            t0 = time.perf_counter()
            arr[i:j, zz:zz+1, :, :] = block
            dt = time.perf_counter() - t0
            print(f"vol z={zz} t[{i}:{j}) {dt:.3f}s")

def write_planar_by_z(root, data, t_chunk=256):
    """Copy a 4-D source into per-plane arrays, ``t_chunk`` frames at a time.

    For every z index the destination ``root["z%04d"]`` is looked up and the
    time axis is walked in blocks of ``t_chunk`` frames; each block is
    materialised as a C-contiguous ``(tt, 1, y, x)`` NumPy array and assigned
    to ``arr[i:j, :, :, :]``. The wall time of each assignment is printed.

    Args:
        root: Mapping-like container of per-plane destinations keyed
            ``z0000``, ``z0001``, ... (for example the group returned by
            ``make_planar_group``).
        data: Source with a four-element ``.shape`` that supports
            ``data[i:j, zz]`` indexing.
        t_chunk: Frames per write; cast with ``int()``.

    Raises:
        ValueError: If ``int(t_chunk)`` is not positive. A zero or negative
            step can never advance along the time axis, so the previous
            ``while`` loop would spin forever.
    """
    t, z, _, _ = map(int, data.shape)
    step = int(t_chunk)
    if step <= 0:
        raise ValueError(f"t_chunk must be a positive integer, got {t_chunk!r}")

    for zz in range(z):
        arr = root[f"z{zz:04d}"]
        for i in range(0, t, step):
            j = min(i + step, t)
            block = data[i:j, zz]          # (tt, y, x)
            if not isinstance(block, np.ndarray):
                block = np.asarray(block)
            if block.ndim == 3:
                # Restore the singleton z axis so the block matches the target slice.
                block = block[:, None, :, :]   # (tt,1,y,x)
            if not block.flags["C_CONTIGUOUS"]:
                block = np.ascontiguousarray(block)
            # Time only the store assignment, not the read/reshape above.
            t0 = time.perf_counter()
            arr[i:j, :, :, :] = block
            dt = time.perf_counter() - t0
            print(f"planar z={zz} t[{i}:{j}) {dt:.3f}s")

def run_benchmarks(data, outdir, t_chunk=256, levels=(0, 1)):
    """Write ``data`` under every layout/sharding/compression combination.

    Eight stores named ``{vol|planar}_{noshard|shard}_{nocomp|comp}.zarr``
    are (re)created inside ``outdir``. "vol" cases use ``make_volumetric`` +
    ``write_volumetric_by_z``; "planar" cases use ``make_planar_group`` +
    ``write_planar_by_z``. Only the write phase is timed; store creation is
    excluded. Start/done lines are printed for each case.

    Args:
        data: Source with a four-element ``.shape``; ``data[0, 0]`` is read
            to discover the dtype.
        outdir: Directory for the benchmark stores; created if missing.
        t_chunk: Frames per chunk and per write, forwarded to the helpers.
        levels: Sequence of compression levels; only the last entry is used,
            for the "comp" cases. "nocomp" cases always use level 0.

    Returns:
        ``outdir`` as a string.
    """
    # NOTE(review): the recorded output of this notebook ends with
    # "TypeError: Group.create_array() got an unexpected keyword argument
    # 'codecs'" -- TODO confirm which array-creation call raises it under the
    # installed zarr version.
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    n_t, n_z, n_y, n_x = (int(dim) for dim in data.shape)
    dtype = np.asarray(data[0, 0]).dtype

    # Full 2 x 2 x 2 matrix, ordered: all volumetric cases first, then planar;
    # within each, unsharded before sharded, uncompressed before compressed.
    cases = [
        (f"{layout}_{shard_tag}_{comp_tag}", is_vol, sharded, level)
        for layout, is_vol in (("vol", True), ("planar", False))
        for shard_tag, sharded in (("noshard", False), ("shard", True))
        for comp_tag, level in (("nocomp", 0), ("comp", levels[-1]))
    ]

    for name, is_vol, sharded, level in cases:
        dst = outdir / f"{name}.zarr"
        if dst.exists():
            shutil.rmtree(dst)

        # Pick the matching creator/writer pair for this layout.
        if is_vol:
            make_target, write_target = make_volumetric, write_volumetric_by_z
        else:
            make_target, write_target = make_planar_group, write_planar_by_z

        target = make_target(
            dst, (n_t, n_z, n_y, n_x), dtype,
            sharded=sharded, level=level, t_chunk=t_chunk,
        )
        print(f"start {name}")
        started = time.perf_counter()
        write_target(target, data, t_chunk=t_chunk)
        print(f"done  {name} {time.perf_counter() - started:.3f}s")

    return str(outdir)

In [19]:
# Run the whole benchmark matrix against the loaded data; `out` receives the
# output directory as a string.
out = run_benchmarks(
    data,
    r"D:\bench_zarr",
    t_chunk=256,
    levels=(0, 1),
)

start vol_noshard_nocomp
vol z=0 t[0:256) 0.246s
vol z=0 t[256:512) 0.095s
vol z=0 t[512:768) 0.101s
vol z=0 t[768:1024) 0.109s
vol z=0 t[1024:1280) 0.102s
vol z=0 t[1280:1536) 0.136s
vol z=0 t[1536:1574) 0.132s
vol z=1 t[0:256) 0.096s
vol z=1 t[256:512) 0.122s
vol z=1 t[512:768) 0.097s
vol z=1 t[768:1024) 0.101s
vol z=1 t[1024:1280) 0.109s
vol z=1 t[1280:1536) 0.104s
vol z=1 t[1536:1574) 0.165s
vol z=2 t[0:256) 0.105s
vol z=2 t[256:512) 0.162s
vol z=2 t[512:768) 0.094s
vol z=2 t[768:1024) 0.113s
vol z=2 t[1024:1280) 0.146s
vol z=2 t[1280:1536) 0.106s
vol z=2 t[1536:1574) 0.134s
vol z=3 t[0:256) 0.099s
vol z=3 t[256:512) 0.100s
vol z=3 t[512:768) 0.124s
vol z=3 t[768:1024) 0.164s
vol z=3 t[1024:1280) 0.116s
vol z=3 t[1280:1536) 0.148s
vol z=3 t[1536:1574) 0.145s
vol z=4 t[0:256) 0.166s
vol z=4 t[256:512) 0.109s
vol z=4 t[512:768) 0.124s
vol z=4 t[768:1024) 0.111s
vol z=4 t[1024:1280) 0.101s
vol z=4 t[1280:1536) 0.103s
vol z=4 t[1536:1574) 0.141s
vol z=5 t[0:256) 0.099s
vol z=5 t[256:51

TypeError: Group.create_array() got an unexpected keyword argument 'codecs'