From 37dac27daacdc5d82c62b94e189546f1d7cfd196 Mon Sep 17 00:00:00 2001 From: David Bayer Date: Fri, 29 May 2026 15:45:27 +0200 Subject: [PATCH] [cudax] Implement `cudax::coop::reduce` prototype --- .../cuda/experimental/__coop/reduce.cuh | 98 +++++++++ cudax/include/cuda/experimental/coop.cuh | 26 +++ cudax/test/CMakeLists.txt | 10 + cudax/test/coop/reduce/this_block.cu | 186 ++++++++++++++++ cudax/test/coop/reduce/this_thread.cu | 203 ++++++++++++++++++ cudax/test/coop/reduce/this_warp.cu | 176 +++++++++++++++ 6 files changed, 699 insertions(+) create mode 100644 cudax/include/cuda/experimental/__coop/reduce.cuh create mode 100644 cudax/include/cuda/experimental/coop.cuh create mode 100644 cudax/test/coop/reduce/this_block.cu create mode 100644 cudax/test/coop/reduce/this_thread.cu create mode 100644 cudax/test/coop/reduce/this_warp.cu diff --git a/cudax/include/cuda/experimental/__coop/reduce.cuh b/cudax/include/cuda/experimental/__coop/reduce.cuh new file mode 100644 index 00000000000..fb71f8a071b --- /dev/null +++ b/cudax/include/cuda/experimental/__coop/reduce.cuh @@ -0,0 +1,98 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_EXPERIMENTAL___COOP_REDUCE_CUH +#define _CUDA_EXPERIMENTAL___COOP_REDUCE_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#if !defined(_CCCL_DOXYGEN_INVOKED) + +namespace cuda::experimental::coop +{ +template +[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp> +__reduce_impl(this_thread<_Hierarchy>, _Tp (&__thread_data)[_Np], _RedFn __red_fn) +{ + return ::cub::ThreadReduce(__thread_data, __red_fn); +} + +template +[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp> +__reduce_impl(this_warp<_Hierarchy> __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn) +{ + using _WarpReduce = ::cub::WarpReduce<_Tp>; + __shared__ typename _WarpReduce::TempStorage __scratch; + + const auto __result = _WarpReduce{__scratch}.Reduce(__thread_data, __red_fn); + return (gpu_thread.is_root_rank(__group)) ? ::cuda::std::optional{__result} : ::cuda::std::nullopt; +} + +template +[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp> +__reduce_impl(this_block<_Hierarchy> __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn) +{ + using _BlockExts = decltype(gpu_thread.extents(block, __group.hierarchy())); + static_assert(_BlockExts::rank_dynamic() == 0, + "cuda::coop::reduce requires the block level to have all static extents."); + + using _BlockReduce = + ::cub::BlockReduce<_Tp, + static_cast(_BlockExts::static_extent(0)), + ::cub::BLOCK_REDUCE_WARP_REDUCTIONS, + static_cast(_BlockExts::static_extent(1)), + static_cast(_BlockExts::static_extent(2))>; + __shared__ typename _BlockReduce::TempStorage __scratch; + + const auto __result = _BlockReduce{__scratch}.Reduce(__thread_data, __red_fn); + return (gpu_thread.is_root_rank(__group)) ? ::cuda::std::optional{__result} : ::cuda::std::nullopt; +} + +template +[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp> +reduce(_Group __group, _Tp (&__thread_data)[_Np], _RedFn&& __red_fn) +{ + static_assert(gpu_thread.static_count(__group) != ::cuda::std::dynamic_extent, + "cuda::coop::reduce requires the group to have statically known size"); + + if (!gpu_thread.is_part_of(__group)) + { + return ::cuda::std::nullopt; + } + + return ::cuda::experimental::coop::__reduce_impl(__group, __thread_data, __red_fn); +} +} // namespace cuda::experimental::coop + +#endif // !_CCCL_DOXYGEN_INVOKED + +#include + +#endif // _CUDA_EXPERIMENTAL___COOP_REDUCE_CUH diff --git a/cudax/include/cuda/experimental/coop.cuh b/cudax/include/cuda/experimental/coop.cuh new file mode 100644 index 00000000000..51741e1fd2a --- /dev/null +++ b/cudax/include/cuda/experimental/coop.cuh @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_EXPERIMENTAL_COOP +#define _CUDA_EXPERIMENTAL_COOP + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#endif // _CUDA_EXPERIMENTAL_COOP diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index 5a6d5a67688..53a79acdc74 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -170,6 +170,16 @@ cudax_add_catch2_test(test_target group.this_group group/this_group.cu ) +cudax_add_catch2_test(test_target coop.reduce.this_thread + coop/reduce/this_thread.cu +) +cudax_add_catch2_test(test_target coop.reduce.this_warp + coop/reduce/this_warp.cu +) +cudax_add_catch2_test(test_target coop.reduce.this_block + coop/reduce/this_block.cu +) + if (cudax_ENABLE_CUFILE) cudax_add_catch2_test(test_target cufile.driver_attributes cufile/driver_attributes.cu diff --git a/cudax/test/coop/reduce/this_block.cu b/cudax/test/coop/reduce/this_block.cu new file mode 100644 index 00000000000..5857e0c9937 --- /dev/null +++ b/cudax/test/coop/reduce/this_block.cu @@ -0,0 +1,186 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include + +/*********************************************************************************************************************** + * Thread Reduce Wrapper Kernels + **********************************************************************************************************************/ + +struct ReduceKernel +{ + template + __device__ void operator()( + Config config, + cuda::std::integral_constant, + const T* __restrict__ d_in, + T* __restrict__ d_out, + RedOp red_op) + { + cudax::this_block block{config}; + + T thread_data[NumItems]; + for (int i = 0; i < NumItems; ++i) + { + thread_data[i] = d_in[cuda::gpu_thread.rank_as(block) + i * cuda::gpu_thread.count_as(block)]; + } + const auto result = cudax::coop::reduce(block, thread_data, red_op); + + REQUIRE(result.has_value() == cuda::gpu_thread.is_root_rank(block)); + if (cuda::gpu_thread.is_root_rank(block)) + { + *d_out = result.value(); + } + } +}; + +/*********************************************************************************************************************** + * Type list definition + **********************************************************************************************************************/ + +using integral_type_list = + c2h::type_list; + +using fp_type_list = c2h::type_list; + +using operator_integral_list = + c2h::type_list, + cuda::std::multiplies<>, + cuda::std::bit_and<>, + cuda::std::bit_or<>, + cuda::std::bit_xor<>, + cuda::minimum<>, + cuda::maximum<>>; + +using operator_fp_list = c2h::type_list, cuda::std::multiplies<>, cuda::minimum<>, cuda::maximum<>>; + +using block_size_list = c2h::enum_type_list; + +/*********************************************************************************************************************** + * Verify results and kernel launch + **********************************************************************************************************************/ + +template +void verify_results(const T& expected_data, const T& test_results) +{ + if constexpr (cuda::std::is_floating_point_v) + { + REQUIRE_THAT(expected_data, Catch::Matchers::WithinRel(test_results, T{0.05})); + } + else + { + REQUIRE(expected_data == test_results); + } +} + +template +void run_reduce_kernel( + cuda::stream_ref stream, + cuda::std::integral_constant, + int num_items, + const c2h::device_vector& in, + c2h::device_vector& out, + RedOp red_op) +{ + const auto config = cuda::make_config(cuda::grid_dims<1>(), cuda::block_dims()); + const auto in_ptr = thrust::raw_pointer_cast(in.data()); + const auto out_ptr = thrust::raw_pointer_cast(out.data()); + const ReduceKernel kernel{}; + + switch (num_items) + { + case 1: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 4: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + default: + FAIL("Unsupported number of items"); + } + stream.sync(); +} + +constexpr int max_size = 4; +constexpr int num_seeds = 10; + +/*********************************************************************************************************************** + * Test cases + **********************************************************************************************************************/ + +_CCCL_DIAG_SUPPRESS_MSVC(4244) // warning C4244: '=': conversion from 'int' to '_Tp', possible loss of data + +C2H_TEST("reduce/this_block Integral Type Tests", + "[reduce][this_block]", + integral_type_list, + operator_integral_list, + block_size_list) +{ + using value_t = c2h::get<0, TestType>; + using op_t = c2h::get<1, TestType>; + using block_size_t = c2h::get<2, TestType>; + constexpr auto reduce_op = op_t{}; + constexpr auto operator_identity = cuda::identity_element(); + CAPTURE(c2h::type_name(), max_size, c2h::type_name()); + c2h::device_vector d_in(max_size * block_size_t::value); + c2h::device_vector d_out(1); + c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits::min()); + c2h::host_vector h_in = d_in; + cuda::stream stream{cuda::devices[0]}; + for (int num_items : {1, 4}) + { + auto reference_result = + cuda::std::accumulate(h_in.begin(), h_in.begin() + num_items * block_size_t::value, operator_identity, reduce_op); + run_reduce_kernel(stream, block_size_t{}, num_items, d_in, d_out, reduce_op); + verify_results(reference_result, c2h::host_vector(d_out)[0]); + } +} + +C2H_TEST("reduce/this_block Floating-Point Type Tests", + "[reduce][this_block]", + fp_type_list, + operator_fp_list, + block_size_list) +{ + using value_t = c2h::get<0, TestType>; + using op_t = c2h::get<1, TestType>; + using block_size_t = c2h::get<2, TestType>; + constexpr auto reduce_op = op_t{}; + const auto operator_identity = cuda::identity_element(); + CAPTURE(c2h::type_name(), max_size, c2h::type_name()); + c2h::device_vector d_in(max_size * block_size_t::value); + c2h::device_vector d_out(1); + c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits::min()); + c2h::host_vector h_in = d_in; + cuda::stream stream{cuda::devices[0]}; + for (int num_items : {1, 4}) + { + auto reference_result = + cuda::std::accumulate(h_in.begin(), h_in.begin() + num_items * block_size_t::value, operator_identity, reduce_op); + run_reduce_kernel(stream, block_size_t{}, num_items, d_in, d_out, reduce_op); + verify_results(reference_result, c2h::host_vector(d_out)[0]); + } +} diff --git a/cudax/test/coop/reduce/this_thread.cu b/cudax/test/coop/reduce/this_thread.cu new file mode 100644 index 00000000000..f6339dac344 --- /dev/null +++ b/cudax/test/coop/reduce/this_thread.cu @@ -0,0 +1,203 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include + +/*********************************************************************************************************************** + * Thread Reduce Wrapper Kernels + **********************************************************************************************************************/ + +struct ReduceKernel +{ + template + __device__ void operator()( + Config config, + cuda::std::integral_constant, + const T* __restrict__ d_in, + T* __restrict__ d_out, + RedOp red_op) + { + T thread_data[NumItems]; + for (int i = 0; i < NumItems; ++i) + { + thread_data[i] = d_in[i]; + } + const auto result = cudax::coop::reduce(cudax::this_thread{config}, thread_data, red_op); + REQUIRE(result.has_value()); + *d_out = result.value(); + } +}; + +/*********************************************************************************************************************** + * Type list definition + **********************************************************************************************************************/ + +using integral_type_list = + c2h::type_list; + +using fp_type_list = c2h::type_list; + +using operator_integral_list = + c2h::type_list, + cuda::std::multiplies<>, + cuda::std::bit_and<>, + cuda::std::bit_or<>, + cuda::std::bit_xor<>, + cuda::minimum<>, + cuda::maximum<>>; + +using operator_fp_list = c2h::type_list, cuda::std::multiplies<>, cuda::minimum<>, cuda::maximum<>>; + +/*********************************************************************************************************************** + * Verify results and kernel launch + **********************************************************************************************************************/ + +template +void verify_results(const T& expected_data, const T& test_results) +{ + if constexpr (cuda::std::is_floating_point_v) + { + REQUIRE_THAT(expected_data, Catch::Matchers::WithinRel(test_results, T{0.05})); + } + else + { + REQUIRE(expected_data == test_results); + } +} + +template +void run_reduce_kernel( + cuda::stream_ref stream, int num_items, const c2h::device_vector& in, c2h::device_vector& out, RedOp red_op) +{ + const auto config = cuda::make_config(cuda::grid_dims<1>(), cuda::block_dims<1>()); + const auto in_ptr = thrust::raw_pointer_cast(in.data()); + const auto out_ptr = thrust::raw_pointer_cast(out.data()); + const ReduceKernel kernel{}; + + switch (num_items) + { + case 1: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 2: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 3: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 4: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 5: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 6: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 7: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 8: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 9: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 10: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 11: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 12: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 13: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 14: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 15: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 16: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + default: + FAIL("Unsupported number of items"); + } + stream.sync(); +} + +constexpr int max_size = 16; +constexpr int num_seeds = 10; + +/*********************************************************************************************************************** + * Test cases + **********************************************************************************************************************/ + +_CCCL_DIAG_SUPPRESS_MSVC(4244) // warning C4244: '=': conversion from 'int' to '_Tp', possible loss of data + +C2H_TEST("reduce/this_thread Integral Type Tests", "[reduce][this_thread]", integral_type_list, operator_integral_list) +{ + using value_t = c2h::get<0, TestType>; + using op_t = c2h::get<1, TestType>; + constexpr auto reduce_op = op_t{}; + constexpr auto operator_identity = cuda::identity_element(); + CAPTURE(c2h::type_name(), max_size, c2h::type_name()); + c2h::device_vector d_in(max_size); + c2h::device_vector d_out(1); + c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits::min()); + c2h::host_vector h_in = d_in; + cuda::stream stream{cuda::devices[0]}; + for (int num_items = 1; num_items <= max_size; ++num_items) + { + auto reference_result = cuda::std::accumulate(h_in.begin(), h_in.begin() + num_items, operator_identity, reduce_op); + run_reduce_kernel(stream, num_items, d_in, d_out, reduce_op); + verify_results(reference_result, c2h::host_vector(d_out)[0]); + } +} + +C2H_TEST("reduce/this_thread Floating-Point Type Tests", "[reduce][this_thread]", fp_type_list, operator_fp_list) +{ + using value_t = c2h::get<0, TestType>; + using op_t = c2h::get<1, TestType>; + constexpr auto reduce_op = op_t{}; + const auto operator_identity = cuda::identity_element(); + CAPTURE(c2h::type_name(), max_size, c2h::type_name()); + c2h::device_vector d_in(max_size); + c2h::device_vector d_out(1); + c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits::min()); + c2h::host_vector h_in = d_in; + cuda::stream stream{cuda::devices[0]}; + for (int num_items = 1; num_items <= max_size; ++num_items) + { + auto reference_result = cuda::std::accumulate(h_in.begin(), h_in.begin() + num_items, operator_identity, reduce_op); + run_reduce_kernel(stream, num_items, d_in, d_out, reduce_op); + verify_results(reference_result, c2h::host_vector(d_out)[0]); + } +} diff --git a/cudax/test/coop/reduce/this_warp.cu b/cudax/test/coop/reduce/this_warp.cu new file mode 100644 index 00000000000..e6a1cae5d0d --- /dev/null +++ b/cudax/test/coop/reduce/this_warp.cu @@ -0,0 +1,176 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include + +/*********************************************************************************************************************** + * Thread Reduce Wrapper Kernels + **********************************************************************************************************************/ + +struct ReduceKernel +{ + template + __device__ void operator()( + Config config, + cuda::std::integral_constant, + const T* __restrict__ d_in, + T* __restrict__ d_out, + RedOp red_op) + { + cudax::this_warp warp{config}; + + T thread_data[NumItems]; + for (int i = 0; i < NumItems; ++i) + { + thread_data[i] = d_in[cuda::gpu_thread.rank_as(warp) + i * cuda::gpu_thread.count_as(warp)]; + } + const auto result = cudax::coop::reduce(warp, thread_data, red_op); + + REQUIRE(result.has_value() == cuda::gpu_thread.is_root_rank(warp)); + if (cuda::gpu_thread.is_root_rank(warp)) + { + *d_out = result.value(); + } + } +}; + +/*********************************************************************************************************************** + * Type list definition + **********************************************************************************************************************/ + +using integral_type_list = + c2h::type_list; + +using fp_type_list = c2h::type_list; + +using operator_integral_list = + c2h::type_list, + cuda::std::multiplies<>, + cuda::std::bit_and<>, + cuda::std::bit_or<>, + cuda::std::bit_xor<>, + cuda::minimum<>, + cuda::maximum<>>; + +using operator_fp_list = c2h::type_list, cuda::std::multiplies<>, cuda::minimum<>, cuda::maximum<>>; + +/*********************************************************************************************************************** + * Verify results and kernel launch + **********************************************************************************************************************/ + +template +void verify_results(const T& expected_data, const T& test_results) +{ + if constexpr (cuda::std::is_floating_point_v) + { + REQUIRE_THAT(expected_data, Catch::Matchers::WithinRel(test_results, T{0.05})); + } + else + { + REQUIRE(expected_data == test_results); + } +} + +template +void run_thread_reduce_kernel( + cuda::stream_ref stream, int num_items, const c2h::device_vector& in, c2h::device_vector& out, RedOp red_op) +{ + const auto config = cuda::make_config(cuda::grid_dims<1>(), cuda::block_dims<32>()); + const auto in_ptr = thrust::raw_pointer_cast(in.data()); + const auto out_ptr = thrust::raw_pointer_cast(out.data()); + const ReduceKernel kernel{}; + + switch (num_items) + { + case 1: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 2: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 3: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + case 4: + cuda::launch(stream, config, kernel, cuda::std::integral_constant{}, in_ptr, out_ptr, red_op); + break; + default: + FAIL("Unsupported number of items"); + } + stream.sync(); +} + +constexpr int warp_size = 32; +constexpr int max_size = 4; +constexpr int num_seeds = 10; + +/*********************************************************************************************************************** + * Test cases + **********************************************************************************************************************/ + +_CCCL_DIAG_SUPPRESS_MSVC(4244) // warning C4244: '=': conversion from 'int' to '_Tp', possible loss of data + +C2H_TEST("reduce/this_warp Integral Type Tests", "[reduce][this_warp]", integral_type_list, operator_integral_list) +{ + using value_t = c2h::get<0, TestType>; + using op_t = c2h::get<1, TestType>; + constexpr auto reduce_op = op_t{}; + constexpr auto operator_identity = cuda::identity_element(); + CAPTURE(c2h::type_name(), max_size, c2h::type_name()); + c2h::device_vector d_in(max_size * warp_size); + c2h::device_vector d_out(1); + c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits::min()); + c2h::host_vector h_in = d_in; + cuda::stream stream{cuda::devices[0]}; + for (int num_items = 1; num_items <= max_size; ++num_items) + { + auto reference_result = + cuda::std::accumulate(h_in.begin(), h_in.begin() + num_items * warp_size, operator_identity, reduce_op); + run_thread_reduce_kernel(stream, num_items, d_in, d_out, reduce_op); + verify_results(reference_result, c2h::host_vector(d_out)[0]); + } +} + +C2H_TEST("reduce/this_warp Floating-Point Type Tests", "[reduce][this_warp]", fp_type_list, operator_fp_list) +{ + using value_t = c2h::get<0, TestType>; + using op_t = c2h::get<1, TestType>; + constexpr auto reduce_op = op_t{}; + const auto operator_identity = cuda::identity_element(); + CAPTURE(c2h::type_name(), max_size, c2h::type_name()); + c2h::device_vector d_in(max_size * warp_size); + c2h::device_vector d_out(1); + c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits::min()); + c2h::host_vector h_in = d_in; + cuda::stream stream{cuda::devices[0]}; + for (int num_items = 1; num_items <= max_size; ++num_items) + { + auto reference_result = + cuda::std::accumulate(h_in.begin(), h_in.begin() + num_items * warp_size, operator_identity, reduce_op); + run_thread_reduce_kernel(stream, num_items, d_in, d_out, reduce_op); + verify_results(reference_result, c2h::host_vector(d_out)[0]); + } +}