From 8bd64ccc08af77f78652288f48cc3e5837026912 Mon Sep 17 00:00:00 2001
From: Joaquin Anton
Date: Tue, 25 Oct 2022 18:36:00 +0200
Subject: [PATCH] Limit max block size in ReduceInnerSmall kernel

---
 dali/kernels/reduce/reduce_gpu_impl.cuh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dali/kernels/reduce/reduce_gpu_impl.cuh b/dali/kernels/reduce/reduce_gpu_impl.cuh
index 7dd862a67fc..068f6fd51b3 100644
--- a/dali/kernels/reduce/reduce_gpu_impl.cuh
+++ b/dali/kernels/reduce/reduce_gpu_impl.cuh
@@ -1202,12 +1202,14 @@ class ReduceImplGPU {
     auto launch_params = [&](auto kernel, int nsamples, int shm_size) {
       int preferred_block_size = 256;
+      int max_block_size = 256; // yields better results compared to larger blocks
       int preferred_grid_size; // unused
       CUDA_CALL(cudaOccupancyMaxPotentialBlockSize(
         &preferred_grid_size,
         &preferred_block_size,
         kernel,
-        shm_size));
+        shm_size,
+        max_block_size));
       dim3 block(32, preferred_block_size / 32);
       int gridx = std::max(32, 512/nsamples);