# Large data processing

The biggest limitation of GPU processing, besides its learning curve, is memory space. It does not go higher than `32 GB` on commercial GPUs. This can easily limit the maximum image size that can be processed to `8 GB`, or even less when we want to apply more complex algorithms that require intermediate steps.

To overcome this memory bottleneck, we can tile our image and process each tile separately using `dask`.

In [6]:
import dask.array as da
import dask.distributed as dd
import timeit
import numpy as np
import pyclesperanto as cle

### Load zarr data

In [None]:
# Build a lazy dask array of random values to stand in for a large volume.
# Each chunk spans the full first axis and a 200 x 200 tile of the other two.
volume_shape = (30, 20000, 20000)
tile_chunks = (30, 200, 200)
darray = da.random.random(volume_shape, chunks=tile_chunks)
darray

### Setup Dask client

In [None]:
# Start a local Dask cluster with one single-threaded worker per GPU device
# reported by pyclesperanto.
gpu_devices = cle.list_available_devices(device_type="gpu")
nb_devices = len(gpu_devices)

cluster = dd.LocalCluster(
    n_workers=nb_devices,
    threads_per_worker=1,
    processes=False,  # thread-based workers rather than separate processes
)
client = dd.Client(cluster)
print(client)

In [None]:
# Give every worker known to the scheduler its own GPU index (in enumeration
# order), then print the resulting worker -> device assignment.
workers = client.scheduler_info()['workers']
worker_dev_map = {worker: idx for idx, worker in enumerate(workers)}
for worker, gpu_index in worker_dev_map.items():
    device_name = cle.select_device(gpu_index, device_type="gpu").name
    print(gpu_index, "-", worker, ":", device_name)

### Define mini-Pipeline

In [None]:
def gpu_operation(image):
    """Process one tile on the GPU assigned to the calling Dask worker.

    The tile is pushed to the worker's device, reduced with an extended
    depth-of-focus variance projection, background-corrected with a top-hat
    filter, and returned as a NumPy array.

    Must be called from inside a Dask worker task: it relies on
    ``dd.get_worker()`` and on the module-level ``worker_dev_map`` that maps
    worker addresses to GPU indices.

    Args:
        image: The tile to process (array-like accepted by ``cle.push``).

    Returns:
        The filtered projection converted with ``np.asarray``.
    """
    # Look up the GPU index assigned to the worker running this task.
    current_worker = dd.get_worker()
    device = cle.select_device(worker_dev_map[current_worker.address], "gpu")

    # Upload the tile to that device and chain the two operations.
    on_device = cle.push(image, device=device)
    projected = cle.extended_depth_of_focus_variance_projection(on_device, sigma=100)
    background_removed = cle.top_hat(
        projected, radius_x=30.0, radius_y=30.0, connectivity="sphere"
    )

    # Hand the result back to Dask as a NumPy array.
    return np.asarray(background_removed)

In [None]:
# Lazily apply gpu_operation to every chunk, with neighbouring chunks sharing
# a 40-element overlap along the last two axes. drop_axis=0 tells dask that
# the function removes the first axis from each block.
# NOTE(review): the depth along axis 0 equals the full axis length even though
# that axis is dropped by the function; confirm this is intended.
overlap_depth = [darray.shape[0], 40, 40]
processed_image = da.map_overlap(
    gpu_operation,
    darray,
    depth=overlap_depth,
    drop_axis=0,
    dtype=darray.dtype,
)
processed_image

In [None]:
# Render the task graph of the input array.
# NOTE(review): this visualizes `darray`, not `processed_image`; confirm that
# the input graph (rather than the processing graph) is what should be shown.
darray.visualize()

### Execute processing

In [None]:
# Trigger the lazy computation and measure how long it takes.
start = timeit.default_timer()
result = processed_image.compute()
end = timeit.default_timer()

elapsed = end - start
print(f"Time to compute: {elapsed} seconds")

## Process Spatial Data

In [None]:
import zarr
import spatialdata as sd
import spatialdata_plot as sdplt

# NOTE(review): absolute path on the author's machine; point this at a local
# copy of the dataset before running.
xenium_path = "/Users/strigaud/Desktop/xenium/Xenium_Prime_Human_Lymph_Node_Reactive_FFPE_xe_outs/transcripts.zarr"
# Open the zarr store as a SpatialData object.
sdata = sd.read_zarr(xenium_path)


SyntaxError: invalid syntax (3840541548.py, line 2)

In [5]:
# Echo the SpatialData object so the notebook displays its summary.
sdata

SpatialData object, with associated Zarr store: /Users/strigaud/Desktop/xenium/Xenium_Prime_Human_Lymph_Node_Reactive_FFPE_xe_outs/transcripts.zarr
with coordinate systems:

with the following elements in the Zarr store but not in the SpatialData object:
    ▸ 3 (Grids)
    ▸ 2 (Grids)
    ▸ 4 (Grids)
    ▸ 0 (Grids)
    ▸ 6 (Grids)
    ▸ gene (Density)
    ▸ 1 (Grids)
    ▸ 5 (Grids)

In [9]:
from numcodecs import Blosc
from skimage.io import imread

# Download the sample image.
# NOTE(review): the author's stated intent is to keep a single channel, but no
# channel is selected on this line; confirm whether a slice is missing.
image = imread("https://github.com/StRigaud/clesperanto_workshop_I2K24/raw/refs/heads/main/data/P1_H_C3H_M004_17-cropped.tif")

# Codec used for both the in-memory zarr array and the copy written to disk.
compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE)

# Convert the numpy image into a chunked, compressed zarr array.
chunk_size = (500, 500)
zarray = zarr.array(image, chunks=chunk_size, compressor=compressor)

# Save the zarr array to disk. The compressor is passed again explicitly:
# saving re-creates the array in the target store and inherits shape, chunks
# and dtype from the source, but not its compressor, so without this argument
# the on-disk copy would fall back to zarr's default codec.
zarr_filename = './P1_H_C3H_M004_17-cropped.zarr'
zarr.save_array(zarr_filename, zarray, compressor=compressor)