# Large data processing

The biggest limitation of GPU processing, besides its learning curve, is memory space. It does not go higher than `32GB` for commercial GPUs. This can easily limit the maximum image size we can process to `8GB`, or even less when we want to apply more complex algorithms requiring temporary steps.

This issue already exists outside of GPU acceleration, and the solution is to tile our image and process each tile separately to overcome the memory bottleneck. We can rely on the `dask` library to distribute our data across our list of devices, in the same way we would on an HPC.

In [None]:
import dask.array as da
import dask.distributed as dd

import timeit
import numpy as np
import matplotlib.pyplot as plt

import pyclesperanto as cle
# Prefer the GPU-backed CuPy modules and fall back to their NumPy / SciPy
# counterparts when CuPy cannot be imported.
import warnings

try:
    import cupy as xp
except ImportError:
    import numpy as xp
    # A bare `Warning(...)` expression only builds an exception object and
    # discards it; `warnings.warn` is what actually reports the fallback.
    warnings.warn("Cupy not found, using numpy instead.")
try:
    import cupyx.scipy.ndimage as xdi
except ImportError:
    import scipy.ndimage as xdi
    warnings.warn("Cupy not found, using scipy instead.")

import zarr
from skimage.io import imread

### Load zarr data

In [None]:
# Download the sample image. The whole file is loaded as-is; no channel is
# selected at this point.
image = imread("https://github.com/StRigaud/clesperanto_workshop_I2K24/raw/refs/heads/main/data/P1_H_C3H_M004_17-cropped.tif")

# Convert image into zarr array
# NOTE(review): chunk_size is not passed to zarr below, so the on-disk chunking
# is left to zarr's default -- confirm whether it was meant to be used.
chunk_size = (500, 500)
zarr_filename = './data/P1_H_C3H_M004_17-cropped.zarr'
# overwrite=True keeps this cell re-runnable: without it, creating the array a
# second time fails because the store already contains one.
zarray = zarr.create_array(store=zarr_filename, data=image, overwrite=True)

In [None]:
# Wrap the zarr store written above as a dask array with (2, 1000, 1000) chunks.
# A synthetic stand-in that needs no file on disk:
#   darray = da.random.random((2, 20000, 20000), chunks=(2, 1000, 1000))
darray = da.from_zarr(
    zarr_filename,
    name='zarray',
    chunks=(2, 1000, 1000),
)
darray

In [None]:
# Show the first two planes of the array side by side, one per subplot.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(30, 10))
for plane, ax in enumerate(axs):
    ax.imshow(darray[plane], cmap='gray')
plt.show()

In [None]:
# Start an in-process dask cluster with one single-threaded worker per GPU
# reported by pyclesperanto, then connect a client to it.
gpu_devices = cle.list_available_devices(device_type="gpu")
nb_devices = len(gpu_devices)

cluster = dd.LocalCluster(
    processes=False,
    threads_per_worker=1,
    n_workers=nb_devices,
)
client = dd.Client(cluster)
print(client)

In [None]:
# Give every dask worker a GPU index, numbered in the order the scheduler
# lists the workers, and print the resulting pairing.
workers = client.scheduler_info()['workers']
worker_dev_map = dict(zip(workers, range(len(workers))))
for worker, gpu_idx in worker_dev_map.items():
    gpu_name = cle.select_device(gpu_idx, device_type="gpu").name
    print(gpu_idx, "-", worker, ":", gpu_name)

### clesperanto mini-Pipeline

In [None]:
def cle_operation(image, block_info=None):
    """Handle one chunk on the GPU mapped to the worker running the task.

    The current dask worker's address is looked up in the module-level
    ``worker_dev_map`` to obtain a GPU index, that device is selected through
    pyclesperanto, and a progress line is printed. No processing is applied
    yet; the chunk is handed back as a NumPy array.

    Parameters
    ----------
    image : array-like
        The chunk to process.
    block_info : dict, optional
        Block metadata (presumably supplied by dask). When given, its
        ``[None]['chunk-location']`` entry is shown in the progress line;
        otherwise ``None`` is shown.

    Returns
    -------
    numpy.ndarray
        ``image`` converted with ``np.asarray``.
    """
    # resolve which GPU belongs to the worker executing this task
    gpu_index = worker_dev_map[dd.get_worker().address]
    device = cle.select_device(gpu_index, "gpu")

    if block_info is None:
        chunk_coord = None
    else:
        chunk_coord = block_info[None]['chunk-location']

    print(f"Processing chunk {chunk_coord} with {device.name} ({gpu_index})")

    # TODO: add processing here

    # hand the chunk back as a numpy array
    return np.asarray(image)

In [None]:
# Apply cle_operation chunk by chunk to the second plane of the array.
# NOTE(review): no `depth` is passed, so presumably neighbouring chunks share
# no overlap here -- confirm against the dask map_overlap documentation.
processed_image = da.map_overlap(
    cle_operation,
    darray[1],
    dtype=darray.dtype,
)
processed_image

In [None]:
# Measure the wall-clock time taken by compute() on the processed array.
start = timeit.default_timer()
result = processed_image.compute()
end = timeit.default_timer()
elapsed = end - start
print(f"Time to compute: {elapsed} seconds")

In [None]:
# Display the processed plane. Plot `result` (the in-memory array returned by
# compute() in the timing cell) rather than the lazy `processed_image`:
# handing the dask array to matplotlib forces it to be converted to NumPy,
# which executes the whole pipeline a second time.
fig, axs = plt.subplots(1, 1, figsize=(30, 10))
axs.imshow(result, cmap='viridis')
plt.show()

## Exercise 1: Extend the function to quantify this image