In [None]:
import dask.array as da
import h5py
import numpy as np
from dask.distributed import Client, LocalCluster
import zarr
import time

zarr_group = zarr.open("local_testing/data/fused.zarr", mode="r")
output_path = "local_testing/output/output.h5"

In [2]:
cluster = LocalCluster(n_workers=2, threads_per_worker=2, memory_limit="8GB")
client = Client(cluster)

In [3]:
cluster.get_client()

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 2
Total threads: 4,Total memory: 14.90 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:59616,Workers: 0
Dashboard: http://127.0.0.1:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B

0,1
Comm: tcp://127.0.0.1:59625,Total threads: 2
Dashboard: http://127.0.0.1:59627/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:59619,
Local directory: /var/folders/nr/l531ymh174n5bw1dh4b67_z40000gn/T/dask-scratch-space/worker-ewltylmp,Local directory: /var/folders/nr/l531ymh174n5bw1dh4b67_z40000gn/T/dask-scratch-space/worker-ewltylmp

0,1
Comm: tcp://127.0.0.1:59624,Total threads: 2
Dashboard: http://127.0.0.1:59626/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:59621,
Local directory: /var/folders/nr/l531ymh174n5bw1dh4b67_z40000gn/T/dask-scratch-space/worker-n9krcbxy,Local directory: /var/folders/nr/l531ymh174n5bw1dh4b67_z40000gn/T/dask-scratch-space/worker-n9krcbxy


In [4]:
# Example of creating a single hdf5

# single_path = "local_testing/data/fused.zarr/ch0/s0"
# test_output = "local_testing/output/test_img.h5"

# test_img = da.from_zarr(single_path)

# start = time.time()
# with h5py.File(test_output, "w") as f:
#     f.create_dataset(
#         "data", data=test_img.compute(), shape=test_img.shape, dtype=test_img.dtype
#     )
# end = time.time()
# print(f"Single image conversion took {end - start:.2f} seconds")


In [5]:
start = time.time()
with h5py.File(output_path, "w") as h5f:
    for group_name in zarr_group.group_keys():
        subgroup = zarr_group[group_name]
        h5_subgroup = h5f.create_group(group_name)
        for array_name in subgroup.array_keys():
            z = subgroup[array_name]
            print(f"Converting {group_name}/{array_name}")
            dask_arr = da.from_zarr(z)
            h5_subgroup.create_dataset(
                array_name,
                data=dask_arr,
                shape=dask_arr.shape,
                dtype=dask_arr.dtype,
                chunks=True,
                compression="gzip",
                
            )

end = time.time()
print(f"Time taken to write data: {end - start} seconds")




Converting ch1/s2
Converting ch1/s1
Converting ch1/s3
Converting ch1/s0
Converting ch0/s2
Converting ch0/s1
Converting ch0/s3
Converting ch0/s0
Converting ch2/s2
Converting ch2/s3
Converting ch2/s0
Converting ch2/s1
Time taken to write data: 413.6439371109009 seconds


In [6]:
# Close the Dask client and cluster
client.close()
cluster.close()