diff --git a/dali/kernels/normalize/normalize_gpu_impl.cuh b/dali/kernels/normalize/normalize_gpu_impl.cuh
index 99498bb4b0..7e9bb210f4 100644
--- a/dali/kernels/normalize/normalize_gpu_impl.cuh
+++ b/dali/kernels/normalize/normalize_gpu_impl.cuh
@@ -422,6 +422,7 @@ class NormalizeImplGPU {
   template <...>
   std::pair<dim3, dim3> GetLaunchParams(const TensorListShape<> &data_shape,
                                         int max_block) const {
+    assert(max_block > 0);
     int optimum_block = std::is_same<...>::value ? 1024 : 256;
     int64_t block = std::min(max_block, optimum_block);
     int64_t max_size = 0;
@@ -432,7 +433,7 @@ class NormalizeImplGPU {
     }
     if (max_size < block)
       block = max_size;
-    int max_blocks_per_sample = div_ceil(max_size, block);
+    int max_blocks_per_sample = max_size == 0 ? 0 : div_ceil(max_size, block);
     dim3 grid(std::min(max_blocks_per_sample, std::max(32, 2048 / num_samples_)), num_samples_);
     return { grid, dim3(block) };
   }
@@ -448,8 +449,10 @@ class NormalizeImplGPU {
     dim3 grid, block;
     int max_block = MaxThreadsPerBlockStatic(NormalizeKernel);
     std::tie(grid, block) = GetLaunchParams(in.shape, max_block);
-    NormalizeKernel<<<grid, block, 0, ctx.gpu.stream>>>(gpu_descs, global_scale, shift);
-    CUDA_CALL(cudaGetLastError());
+    if (grid.x > 0) {
+      NormalizeKernel<<<grid, block, 0, ctx.gpu.stream>>>(gpu_descs, global_scale, shift);
+      CUDA_CALL(cudaGetLastError());
+    }
   }
 
   template <...>
@@ -463,9 +466,11 @@ class NormalizeImplGPU {
     dim3 grid, block;
     int max_block = MaxThreadsPerBlockStatic(NormalizeInvStdDevKernel);
     std::tie(grid, block) = GetLaunchParams(in.shape, max_block);
-    NormalizeInvStdDevKernel<<<grid, block, 0, ctx.gpu.stream>>>(
-        gpu_descs, epsilon, global_scale, shift);
-    CUDA_CALL(cudaGetLastError());
+    if (grid.x > 0) {
+      NormalizeInvStdDevKernel<<<grid, block, 0, ctx.gpu.stream>>>(gpu_descs, epsilon, global_scale,
+                                                                   shift);
+      CUDA_CALL(cudaGetLastError());
+    }
   }
 
   std::string axes_str() const {
diff --git a/dali/pipeline/util/copy_with_stride.cu b/dali/pipeline/util/copy_with_stride.cu
index 51f953eab3..5d38c0fe38 100644
--- a/dali/pipeline/util/copy_with_stride.cu
+++ b/dali/pipeline/util/copy_with_stride.cu
@@ -234,7 +234,7 @@ void FillSampleAlignmentInfo(StridedCopyDesc &sample) {
   assert(0 <= sample.aligned.skip_left && sample.aligned.skip_left < vec_len);
   sample.aligned.skip_left = std::min(sample.size, sample.aligned.skip_left);
   int64_t remaining_size = sample.size - sample.aligned.skip_left;
-  assert(0 <= remaining_size && remaining_size < sample.size);
+  assert(0 <= remaining_size && remaining_size <= sample.size);
   sample.aligned.size = align_down(remaining_size, vec_len);
   sample.aligned.skip_right = remaining_size - sample.aligned.size;
   assert(0 <= sample.aligned.skip_right && sample.aligned.skip_right < vec_len);
diff --git a/dali/test/python/operator_1/test_normalize.py b/dali/test/python/operator_1/test_normalize.py
index d03da2da90..461e693411 100644
--- a/dali/test/python/operator_1/test_normalize.py
+++ b/dali/test/python/operator_1/test_normalize.py
@@ -17,6 +17,8 @@
 import nvidia.dali.ops as ops
 import numpy as np
 from test_utils import dali_type
+from nvidia.dali import fn, pipeline_def, types
+from nose2.tools import params
 
 
 def normalize(x, axes=None, mean=None, stddev=None, ddof=0, eps=0):
@@ -501,3 +503,17 @@ def test_types():
         shift,
         scale,
     )
+
+
+@params("cpu", "gpu")
+def test_batch_of_empty_samples(device):
+    @pipeline_def
+    def pipeline():
+        empty_sample = types.Constant([])
+        if device == "gpu":
+            empty_sample = empty_sample.gpu()
+        return fn.normalize(empty_sample, mean=5, stddev=1)
+
+    p = pipeline(batch_size=4, device_id=0, num_threads=4)
+    p.build()
+    p.run()
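
Why the guards matter (illustration only, not part of the patch): for a batch that contains only empty samples, max_size is 0, so the old code clamped block to 0 and div_ceil(max_size, block) divided by zero; the patched expression yields 0 blocks instead, making grid.x come out as 0. CUDA then refuses any launch in which a grid dimension is 0, so an unguarded launch would report "invalid configuration argument" even though an empty batch is valid input. Below is a minimal standalone sketch of that failure mode; the file name and NoopKernel are hypothetical, not taken from the DALI sources.

// empty_grid_demo.cu -- hypothetical standalone sketch, not part of the patch or of DALI.
// Shows that launching a kernel with a zero grid dimension fails with
// cudaErrorInvalidConfiguration, which the `if (grid.x > 0)` guards above prevent.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void NoopKernel() {}

int main() {
  dim3 grid(0, 4);  // grid.x == 0, as GetLaunchParams now yields for an all-empty batch
  dim3 block(256);

  NoopKernel<<<grid, block>>>();
  // cudaGetLastError() returns the launch error and resets the last-error state.
  printf("unguarded launch: %s\n", cudaGetErrorString(cudaGetLastError()));

  if (grid.x > 0) {  // the patched pattern: skip the launch when there is no work
    NoopKernel<<<grid, block>>>();
  }
  printf("guarded path:     %s\n", cudaGetErrorString(cudaGetLastError()));
  return 0;
}

Built with nvcc empty_grid_demo.cu, the first printf reports "invalid configuration argument" while the guarded path leaves the error state clean, which is exactly the behavior the two grid.x checks in the patch restore for empty batches.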