Skip to content

Commit

Permalink
Limit max block size in ReduceInnerSmall kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
jantonguirao committed Oct 25, 2022
1 parent c4bbdc5 commit 8bd64cc
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion dali/kernels/reduce/reduce_gpu_impl.cuh
Expand Up @@ -1202,12 +1202,14 @@ class ReduceImplGPU {

auto launch_params = [&](auto kernel, int nsamples, int shm_size) {
int preferred_block_size = 256;
+ int max_block_size = 256; // yields better results compared to larger blocks
int preferred_grid_size; // unused
CUDA_CALL(cudaOccupancyMaxPotentialBlockSize(
&preferred_grid_size,
&preferred_block_size,
kernel,
- shm_size));
+ shm_size,
+ max_block_size));

dim3 block(32, preferred_block_size / 32);
int gridx = std::max(32, 512/nsamples);
Expand Down

0 comments on commit 8bd64cc

Please sign in to comment.