# Parallel sum benchmark

This notebook runs a simple parallel-sum benchmark on both CPUs and GPUs using Dask. In the cell below you can adjust the problem parameters (array dimensions and chunk sizes) and the number of CPU cores or GPUs to use.

In [None]:
# Problem definition for the parallel sum.
# xdim/ydim give the shape of the 2-D array that is generated; the
# *_chunk_size values give the shape of each Dask chunk it is split into.
xdim = 500_000
ydim = 500_000
x_chunk_size = 10_000
y_chunk_size = 10_000

# Worker counts: num_cpu_cores is the number of workers in the CPU cluster,
# num_gpus is the number of workers in the CUDA cluster.
num_cpu_cores = 8
num_gpus = 1

In [None]:
# Import libraries for interacting with Dask and CUDA
import time
import cupy
import dask
import dask.array as da
from dask_cuda import LocalCUDACluster
from dask.distributed import Client, LocalCluster

In [None]:
# Common functions used by both the CPU and GPU benchmarks

def create_data(rs, xdim, ydim, x_chunk_size, y_chunk_size):
    """Draw a chunked 2-D array of normally distributed values from ``rs``.

    Args:
        rs: Random-state object exposing ``normal(loc, scale, size=, chunks=)``
            (the benchmarks pass a ``dask.array.random.RandomState``).
        xdim: Extent of the array along the first axis.
        ydim: Extent of the array along the second axis.
        x_chunk_size: Chunk extent along the first axis.
        y_chunk_size: Chunk extent along the second axis.

    Returns:
        Whatever ``rs.normal`` returns for mean 10 and standard deviation 1
        with the requested shape and chunking.
    """
    shape = (xdim, ydim)
    chunk_shape = (x_chunk_size, y_chunk_size)
    return rs.normal(10, 1, size=shape, chunks=chunk_shape)

def run(data):
    """Execute the benchmark workload on ``data`` and discard the result.

    The workload adds 1 to ``data``, keeps every second index along each of
    the two indexed axes, sums what remains and forces evaluation with
    ``.compute()``.

    Args:
        data: Array-like supporting ``+``, 2-D slicing, ``.sum()`` and a
            ``.compute()`` on the reduction (the benchmarks pass a Dask array).

    Returns:
        None. Only the time spent computing matters to the callers.
    """
    shifted = data + 1
    strided = shifted[::2, ::2]
    total = strided.sum()
    total.compute()

In [None]:
# Run GPU benchmark
# The cluster and client are context-managed so the CUDA workers are shut
# down when the benchmark finishes (or fails) instead of lingering while
# later cells run.
with LocalCUDACluster(n_workers=num_gpus) as cluster, Client(cluster) as client:
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    print("Allocating and initializing arrays using GPU memory with CuPY")
    # Use CuPy's RandomState as the backend of the Dask random generator.
    rs = da.random.RandomState(RandomState=cupy.random.RandomState)
    x = create_data(rs, xdim, ydim, x_chunk_size, y_chunk_size)
    print('Array size: {:.2f} TB.  Computing parallel sum . . .'.format(x.nbytes/1e12))
    run(x)
    end = time.perf_counter()
    # Timed region covers data creation plus the sum; cluster start-up and
    # shutdown are deliberately outside it.
    delta = end - start

print("GPU parallel sum complete!")
print("Wall time create data + computation time: {:10.8f} seconds".format(delta))

In [None]:
# Run CPU benchmark
# The cluster and client are context-managed so the worker processes are shut
# down when the benchmark finishes (or fails).
with LocalCluster(n_workers=num_cpu_cores) as cluster, Client(cluster) as client:
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    print("Allocating and initializing arrays using CPU memory")
    # Default Dask random generator (no alternative RandomState backend).
    rs = da.random.RandomState()
    x = create_data(rs, xdim, ydim, x_chunk_size, y_chunk_size)
    print('Array size: {:.2f} TB.  Computing parallel sum . . .'.format(x.nbytes/1e12))
    run(x)
    end = time.perf_counter()
    # Timed region covers data creation plus the sum; cluster start-up and
    # shutdown are deliberately outside it.
    delta = end - start

# This cell benchmarks the CPU cluster, so report it as such (the message
# previously said "GPU").
print("CPU parallel sum complete!")
print('Wall time create data + computation time: {:10.8f} seconds'.format(delta))