From ea03e19b26924b3e022dcd6c09f3e2e9541d986c Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 5 Mar 2026 18:17:32 +0100 Subject: [PATCH 1/3] Handle 0 items in warpspeed scan --- cub/cub/device/dispatch/dispatch_scan.cuh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index bbe76d436e9..25cbd61a0ce 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -410,6 +410,12 @@ struct DispatchScan template CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t __invoke_lookahead_algorithm(ActivePolicyT) { + if (num_items == 0) + { + temp_storage_bytes = 1; // just fulfill the contract that CUB always requires some temporary storage + return cudaSuccess; + } + using InputT = ::cuda::std::iter_value_t; using OutputT = ::cuda::std::iter_value_t; using WarpspeedPolicy = typename ActivePolicyT::WarpspeedPolicy; From 44de595bd7087baee19a8a78bcf0d017b584f4dc Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 6 Mar 2026 19:34:39 +0100 Subject: [PATCH 2/3] Test 0 num_items --- cub/test/catch2_test_device_scan_alignment.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/test/catch2_test_device_scan_alignment.cu b/cub/test/catch2_test_device_scan_alignment.cu index a0777d34802..712ff5b691e 100644 --- a/cub/test/catch2_test_device_scan_alignment.cu +++ b/cub/test/catch2_test_device_scan_alignment.cu @@ -39,7 +39,7 @@ C2H_TEST("Device scan works with all device interfaces", "[scan][device]", value constexpr offset_t max_num_items = 8192; const auto offset = GENERATE_COPY(values({0, 1, 3, 4, 7, 8, 11, 12, 16}), take(3, random(0, max_offset))); - const auto num_items = GENERATE_COPY(values({1, max_num_items}), take(64, random(0, max_num_items))); + const auto num_items = GENERATE_COPY(values({0, 1, max_num_items}), take(64, random(2, max_num_items - 1))); CAPTURE(num_items, offset); 
From 66da4cafe1ccd2f64029d3ba7aa7dee5bdeb9c5c Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 6 Mar 2026 20:13:33 +0100 Subject: [PATCH 3/3] Fix byteMaskEnd shift amount in squadStoreBulkSync --- cub/cub/detail/warpspeed/squad/load_store.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cub/cub/detail/warpspeed/squad/load_store.cuh b/cub/cub/detail/warpspeed/squad/load_store.cuh index 2e704e7ff19..eb4ed969f0b 100644 --- a/cub/cub/detail/warpspeed/squad/load_store.cuh +++ b/cub/cub/detail/warpspeed/squad/load_store.cuh @@ -236,7 +236,7 @@ squadStoreBulkSync(Squad squad, CpAsyncOobInfo cpAsyncOobInfo, const :: constexpr ::cuda::std::uint16_t byteMask = 0xFFFF; const ::cuda::std::uint16_t byteMaskStart = byteMask << cpAsyncOobInfo.smemStartSkipBytes; - const ::cuda::std::uint16_t byteMaskEnd = byteMask >> (16 - cpAsyncOobInfo.smemEndBytesAfter16BBoundary); + const ::cuda::std::uint16_t byteMaskEnd = byteMask >> (16 - cpAsyncOobInfo.smemEndBytesAfter16BBoundary) % 16; // byteMaskStart contains zeroes at the left # if _CCCL_CUDA_COMPILER(NVCC, >=, 13, 2) const ::cuda::std::uint16_t byteMaskSmall = byteMaskStart & byteMaskEnd;