# Multi-gpu Tile Processing

The biggest limitation of GPU processing, besides its learning curve, is the memory space. It does not go higher than `32 GB` for commercial GPUs. This can easily limit the maximum image size we can process to `8 GB`, or even less when we want to apply more complex algorithms that require temporary intermediate steps.

To overcome this memory bottleneck, we can tile our image and process each tile separately using `dask`.

In [None]:
import dask.array as da
import dask.distributed as dd
import timeit
import numpy as np
import pyclesperanto as cle

In [None]:
# Lazy random test array of shape (30, 20000, 20000), chunked into 200 x 200
# tiles that each span the complete first axis.
darray = da.random.random(
    (30, 20000, 20000),
    chunks=(30, 200, 200),
)
darray

In [None]:
# Count the GPUs reported by pyclesperanto; one dask worker is started per GPU.
nb_devices = len(cle.list_available_devices(device_type="gpu"))
if nb_devices == 0:
    # A cluster with zero workers would accept the task graph but never run it,
    # so fail early with an explicit message instead of hanging at compute().
    raise RuntimeError(
        "No GPU device found: at least one GPU is required to start the dask cluster."
    )

# processes=False runs the workers as threads inside this process, with a
# single thread per worker.
cluster = dd.LocalCluster(n_workers=nb_devices, threads_per_worker=1, processes=False)
client = dd.Client(cluster)
print(client)

In [None]:
# Pair every worker address known to the scheduler with one GPU index,
# in the order the scheduler lists the workers.
workers = client.scheduler_info()['workers']
worker_dev_map = dict(zip(workers, range(len(workers))))
# Report the pairing together with the name of the device selected for it.
for worker, gpu_index in worker_dev_map.items():
    gpu_name = cle.select_device(gpu_index, device_type="gpu").name
    print(gpu_index, "-", worker, ":", gpu_name)

In [None]:
def gpu_operation(image):
    """Process one tile on the GPU associated with the current dask worker.

    The address of the worker running the task is looked up in the
    module-level ``worker_dev_map`` to obtain a GPU index. The tile is pushed
    to that device, reduced with an extended-depth-of-focus variance
    projection, and then filtered with a top-hat.

    Parameters
    ----------
    image : array-like
        Tile handed over by dask (presumably a NumPy block; TODO confirm).

    Returns
    -------
    numpy.ndarray
        Output of the top-hat filter converted with ``np.asarray``.
    """
    # Resolve which GPU is associated with the worker executing this task.
    address = dd.get_worker().address
    device = cle.select_device(worker_dev_map[address], "gpu")
    # Push the tile to that device, then apply the projection and the top-hat.
    on_device = cle.push(image, device=device)
    projected = cle.extended_depth_of_focus_variance_projection(on_device, sigma=100)
    filtered = cle.top_hat(
        projected,
        radius_x=30.0,
        radius_y=30.0,
        connectivity="sphere",
    )
    # Hand the result back to dask as a NumPy array.
    return np.asarray(filtered)

In [None]:
# Apply gpu_operation tile by tile with a 40-element overlap on the two tiled axes.
# depth is given as a tuple so that dask reads it per axis; a list would be
# interpreted as one depth specification per input array instead.
# Axis 0 is held whole in every chunk and is removed by the function
# (drop_axis=0), so it needs no overlap.
processed_image = da.map_overlap(
    gpu_operation,
    darray,
    dtype=darray.dtype,
    drop_axis=0,
    depth=(0, 40, 40),
)
processed_image

In [None]:
# Render the dask task graph of the input array.
# NOTE(review): this shows the graph of `darray`, not of `processed_image` —
# confirm which graph is meant to be displayed here.
darray.visualize()

In [None]:
# Measure the wall-clock time of executing the whole task graph.
start = timeit.default_timer()
# compute() triggers the execution and gathers the result into `result`.
result = processed_image.compute()
end = timeit.default_timer()
print(f"Time to compute: {end - start} seconds")