Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Coverity fixes: fix fn.normalize crash on batch of empty samples, fix broken assertion in copy_with_stride #5223

Merged
merged 3 commits into from
Dec 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 11 additions & 6 deletions dali/kernels/normalize/normalize_gpu_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,7 @@ class NormalizeImplGPU {

template <typename Desc>
std::pair<dim3, dim3> GetLaunchParams(const TensorListShape<> &data_shape, int max_block) const {
assert(max_block > 0);
int optimum_block = std::is_same<Desc, Op_Scalar>::value ? 1024 : 256;
int64_t block = std::min(max_block, optimum_block);
int64_t max_size = 0;
Expand All @@ -432,7 +433,7 @@ class NormalizeImplGPU {
}
if (max_size < block)
block = max_size;
int max_blocks_per_sample = div_ceil(max_size, block);
int max_blocks_per_sample = max_size == 0 ? 0 : div_ceil(max_size, block);
dim3 grid(std::min(max_blocks_per_sample, std::max(32, 2048 / num_samples_)), num_samples_);
return { grid, dim3(block) };
}
Expand All @@ -448,8 +449,10 @@ class NormalizeImplGPU {
dim3 grid, block;
int max_block = MaxThreadsPerBlockStatic(NormalizeKernel<Desc>);
std::tie(grid, block) = GetLaunchParams<Desc>(in.shape, max_block);
NormalizeKernel<<<grid, block, 0, ctx.gpu.stream>>>(gpu_descs, global_scale, shift);
CUDA_CALL(cudaGetLastError());
if (grid.x > 0) {
NormalizeKernel<<<grid, block, 0, ctx.gpu.stream>>>(gpu_descs, global_scale, shift);
CUDA_CALL(cudaGetLastError());
}
}

template <typename Desc, typename BaseParam, typename ScaleParam>
Expand All @@ -463,9 +466,11 @@ class NormalizeImplGPU {
dim3 grid, block;
int max_block = MaxThreadsPerBlockStatic(NormalizeInvStdDevKernel<Desc>);
std::tie(grid, block) = GetLaunchParams<Desc>(in.shape, max_block);
NormalizeInvStdDevKernel<<<grid, block, 0, ctx.gpu.stream>>>(
gpu_descs, epsilon, global_scale, shift);
CUDA_CALL(cudaGetLastError());
if (grid.x > 0) {
NormalizeInvStdDevKernel<<<grid, block, 0, ctx.gpu.stream>>>(gpu_descs, epsilon, global_scale,
shift);
CUDA_CALL(cudaGetLastError());
}
}

std::string axes_str() const {
Expand Down
2 changes: 1 addition & 1 deletion dali/pipeline/util/copy_with_stride.cu
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ void FillSampleAlignmentInfo(StridedCopyDesc &sample) {
assert(0 <= sample.aligned.skip_left && sample.aligned.skip_left < vec_len);
sample.aligned.skip_left = std::min<int64_t>(sample.size, sample.aligned.skip_left);
int64_t remaining_size = sample.size - sample.aligned.skip_left;
assert(0 <= remaining_size && remaining_size < sample.size);
assert(0 <= remaining_size && remaining_size <= sample.size);
sample.aligned.size = align_down(remaining_size, vec_len);
sample.aligned.skip_right = remaining_size - sample.aligned.size;
assert(0 <= sample.aligned.skip_right && sample.aligned.skip_right < vec_len);
Expand Down
16 changes: 16 additions & 0 deletions dali/test/python/operator_1/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
import nvidia.dali.ops as ops
import numpy as np
from test_utils import dali_type
from nvidia.dali import fn, pipeline_def, types
from nose2.tools import params


def normalize(x, axes=None, mean=None, stddev=None, ddof=0, eps=0):
Expand Down Expand Up @@ -501,3 +503,17 @@ def test_types():
shift,
scale,
)


@params("cpu", "gpu")
def test_batch_of_empty_samples(device):
    # Regression test: fn.normalize used to crash when every sample in the
    # batch is empty; build and run a pipeline of empty constants on the
    # requested backend and verify it completes without raising.
    @pipeline_def
    def make_pipeline():
        sample = types.Constant([])
        sample = sample.gpu() if device == "gpu" else sample
        return fn.normalize(sample, mean=5, stddev=1)

    pipe = make_pipeline(batch_size=4, device_id=0, num_threads=4)
    pipe.build()
    pipe.run()