-
Notifications
You must be signed in to change notification settings - Fork 397
[cudax] Initial cudax::coop::reduce prototype
#9154
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,98 @@ | ||
| //===----------------------------------------------------------------------===// | ||
| // | ||
| // Part of CUDA Experimental in CUDA C++ Core Libraries, | ||
| // under the Apache License v2.0 with LLVM Exceptions. | ||
| // See https://llvm.org/LICENSE.txt for license information. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. | ||
| // | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| #ifndef _CUDA_EXPERIMENTAL___COOP_REDUCE_CUH | ||
| #define _CUDA_EXPERIMENTAL___COOP_REDUCE_CUH | ||
|
|
||
| #include <cuda/std/detail/__config> | ||
|
|
||
| #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) | ||
| # pragma GCC system_header | ||
| #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) | ||
| # pragma clang system_header | ||
| #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) | ||
| # pragma system_header | ||
| #endif // no system header | ||
|
|
||
| #include <cub/block/block_reduce.cuh> | ||
| #include <cub/thread/thread_reduce.cuh> | ||
| #include <cub/warp/warp_reduce.cuh> | ||
|
|
||
| #include <cuda/std/__cstddef/types.h> | ||
| #include <cuda/std/__functional/operations.h> | ||
| #include <cuda/std/optional> | ||
|
|
||
| #include <cuda/experimental/group.cuh> | ||
|
|
||
| #include <cuda/std/__cccl/prologue.h> | ||
|
|
||
| #if !defined(_CCCL_DOXYGEN_INVOKED) | ||
|
|
||
| namespace cuda::experimental::coop | ||
| { | ||
| template <class _Hierarchy, class _Tp, ::cuda::std::size_t _Np, class _RedFn> | ||
| [[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp> | ||
| __reduce_impl(this_thread<_Hierarchy>, _Tp (&__thread_data)[_Np], _RedFn __red_fn) | ||
| { | ||
| return ::cub::ThreadReduce(__thread_data, __red_fn); | ||
| } | ||
|
|
||
| template <class _Hierarchy, class _Tp, ::cuda::std::size_t _Np, class _RedFn> | ||
| [[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp> | ||
| __reduce_impl(this_warp<_Hierarchy> __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn) | ||
| { | ||
| using _WarpReduce = ::cub::WarpReduce<_Tp>; | ||
| __shared__ typename _WarpReduce::TempStorage __scratch; | ||
|
|
||
| const auto __result = _WarpReduce{__scratch}.Reduce(__thread_data, __red_fn); | ||
| return (gpu_thread.is_root_rank(__group)) ? ::cuda::std::optional{__result} : ::cuda::std::nullopt; | ||
|
Comment on lines
+54
to
+55
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am confused by this line. Is this because the value is only valid in the leader thread? Should we broadcast it rather than diverging further? Otherwise, why do we even compute it if its not desired
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is because only the root rank has the correct value. CUB does the same thing, but returns garbage for non-root ranks. I think returning optional is the better way. We've already discussed this before and we agreed that we would start like this and add an API that would also broadcast the result in the future. |
||
| } | ||
|
|
||
| template <class _Hierarchy, class _Tp, cuda::std::size_t _Np, class _RedFn> | ||
| [[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp> | ||
| __reduce_impl(this_block<_Hierarchy> __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn) | ||
| { | ||
| using _BlockExts = decltype(gpu_thread.extents(block, __group.hierarchy())); | ||
| static_assert(_BlockExts::rank_dynamic() == 0, | ||
| "cuda::coop::reduce requires the block level to have all static extents."); | ||
|
|
||
| using _BlockReduce = | ||
| ::cub::BlockReduce<_Tp, | ||
| static_cast<int>(_BlockExts::static_extent(0)), | ||
| ::cub::BLOCK_REDUCE_WARP_REDUCTIONS, | ||
| static_cast<int>(_BlockExts::static_extent(1)), | ||
| static_cast<int>(_BlockExts::static_extent(2))>; | ||
| __shared__ typename _BlockReduce::TempStorage __scratch; | ||
|
|
||
| const auto __result = _BlockReduce{__scratch}.Reduce(__thread_data, __red_fn); | ||
| return (gpu_thread.is_root_rank(__group)) ? ::cuda::std::optional{__result} : ::cuda::std::nullopt; | ||
| } | ||
|
|
||
| template <class _Group, class _Tp, ::cuda::std::size_t _Np, class _RedFn> | ||
| [[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp> | ||
| reduce(_Group __group, _Tp (&__thread_data)[_Np], _RedFn&& __red_fn) | ||
| { | ||
| static_assert(gpu_thread.static_count(__group) != ::cuda::std::dynamic_extent, | ||
| "cuda::coop::reduce requires the group to have statically known size"); | ||
|
|
||
| if (!gpu_thread.is_part_of(__group)) | ||
| { | ||
| return ::cuda::std::nullopt; | ||
| } | ||
|
|
||
| return ::cuda::experimental::coop::__reduce_impl(__group, __thread_data, __red_fn); | ||
| } | ||
| } // namespace cuda::experimental::coop | ||
|
|
||
| #endif // !_CCCL_DOXYGEN_INVOKED | ||
|
|
||
| #include <cuda/std/__cccl/epilogue.h> | ||
|
|
||
| #endif // _CUDA_EXPERIMENTAL___COOP_REDUCE_CUH | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,26 @@ | ||
| //===----------------------------------------------------------------------===// | ||
| // | ||
| // Part of CUDA Experimental in CUDA C++ Core Libraries, | ||
| // under the Apache License v2.0 with LLVM Exceptions. | ||
| // See https://llvm.org/LICENSE.txt for license information. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. | ||
| // | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| #ifndef _CUDA_EXPERIMENTAL_COOP | ||
| #define _CUDA_EXPERIMENTAL_COOP | ||
|
|
||
| #include <cuda/std/detail/__config> | ||
|
|
||
| #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) | ||
| # pragma GCC system_header | ||
| #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) | ||
| # pragma clang system_header | ||
| #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) | ||
| # pragma system_header | ||
| #endif // no system header | ||
|
|
||
| #include <cuda/experimental/__coop/reduce.cuh> | ||
|
|
||
| #endif // _CUDA_EXPERIMENTAL_COOP | ||
|
davebayer marked this conversation as resolved.
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,186 @@ | ||
| //===----------------------------------------------------------------------===// | ||
| // | ||
| // Part of CUDA Experimental in CUDA C++ Core Libraries, | ||
| // under the Apache License v2.0 with LLVM Exceptions. | ||
| // See https://llvm.org/LICENSE.txt for license information. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. | ||
| // | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| #include <cuda/devices> | ||
| #include <cuda/functional> | ||
| #include <cuda/hierarchy> | ||
| #include <cuda/launch> | ||
| #include <cuda/std/algorithm> | ||
| #include <cuda/std/type_traits> | ||
| #include <cuda/stream> | ||
|
|
||
| #include <cuda/experimental/coop.cuh> | ||
| #include <cuda/experimental/group.cuh> | ||
|
|
||
| #include <testing.cuh> | ||
|
|
||
| #include <c2h/catch2_test_helper.h> | ||
| #include <c2h/extended_types.h> | ||
| #include <c2h/generators.h> | ||
| #include <catch2/matchers/catch_matchers_floating_point.hpp> | ||
|
|
||
| /*********************************************************************************************************************** | ||
| * Thread Reduce Wrapper Kernels | ||
| **********************************************************************************************************************/ | ||
|
|
||
| struct ReduceKernel | ||
| { | ||
| template <class Config, int NumItems, class T, class RedOp> | ||
| __device__ void operator()( | ||
| Config config, | ||
| cuda::std::integral_constant<int, NumItems>, | ||
| const T* __restrict__ d_in, | ||
| T* __restrict__ d_out, | ||
| RedOp red_op) | ||
| { | ||
| cudax::this_block block{config}; | ||
|
|
||
| T thread_data[NumItems]; | ||
| for (int i = 0; i < NumItems; ++i) | ||
| { | ||
| thread_data[i] = d_in[cuda::gpu_thread.rank_as<int>(block) + i * cuda::gpu_thread.count_as<int>(block)]; | ||
| } | ||
| const auto result = cudax::coop::reduce(block, thread_data, red_op); | ||
|
|
||
| REQUIRE(result.has_value() == cuda::gpu_thread.is_root_rank(block)); | ||
| if (cuda::gpu_thread.is_root_rank(block)) | ||
| { | ||
| *d_out = result.value(); | ||
| } | ||
| } | ||
| }; | ||
|
davebayer marked this conversation as resolved.
|
||
|
|
||
| /*********************************************************************************************************************** | ||
| * Type list definition | ||
| **********************************************************************************************************************/ | ||
|
|
||
| using integral_type_list = | ||
| c2h::type_list<cuda::std::int8_t, cuda::std::int16_t, cuda::std::uint16_t, cuda::std::int32_t, cuda::std::int64_t>; | ||
|
|
||
| using fp_type_list = c2h::type_list<float, double>; | ||
|
|
||
| using operator_integral_list = | ||
| c2h::type_list<cuda::std::plus<>, | ||
| cuda::std::multiplies<>, | ||
| cuda::std::bit_and<>, | ||
| cuda::std::bit_or<>, | ||
| cuda::std::bit_xor<>, | ||
| cuda::minimum<>, | ||
| cuda::maximum<>>; | ||
|
|
||
| using operator_fp_list = c2h::type_list<cuda::std::plus<>, cuda::std::multiplies<>, cuda::minimum<>, cuda::maximum<>>; | ||
|
|
||
| using block_size_list = c2h::enum_type_list<int, 3, 32, 63, 128>; | ||
|
|
||
| /*********************************************************************************************************************** | ||
| * Verify results and kernel launch | ||
| **********************************************************************************************************************/ | ||
|
|
||
| template <class T> | ||
| void verify_results(const T& expected_data, const T& test_results) | ||
| { | ||
| if constexpr (cuda::std::is_floating_point_v<T>) | ||
| { | ||
| REQUIRE_THAT(expected_data, Catch::Matchers::WithinRel(test_results, T{0.05})); | ||
| } | ||
| else | ||
| { | ||
| REQUIRE(expected_data == test_results); | ||
| } | ||
| } | ||
|
|
||
| template <int BlockSize, class T, class RedOp> | ||
| void run_reduce_kernel( | ||
| cuda::stream_ref stream, | ||
| cuda::std::integral_constant<int, BlockSize>, | ||
| int num_items, | ||
| const c2h::device_vector<T>& in, | ||
| c2h::device_vector<T>& out, | ||
| RedOp red_op) | ||
| { | ||
| const auto config = cuda::make_config(cuda::grid_dims<1>(), cuda::block_dims<BlockSize>()); | ||
| const auto in_ptr = thrust::raw_pointer_cast(in.data()); | ||
| const auto out_ptr = thrust::raw_pointer_cast(out.data()); | ||
| const ReduceKernel kernel{}; | ||
|
|
||
| switch (num_items) | ||
| { | ||
| case 1: | ||
| cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 1>{}, in_ptr, out_ptr, red_op); | ||
| break; | ||
| case 4: | ||
| cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 4>{}, in_ptr, out_ptr, red_op); | ||
| break; | ||
| default: | ||
| FAIL("Unsupported number of items"); | ||
| } | ||
| stream.sync(); | ||
| } | ||
|
|
||
| constexpr int max_size = 4; | ||
| constexpr int num_seeds = 10; | ||
|
|
||
| /*********************************************************************************************************************** | ||
| * Test cases | ||
| **********************************************************************************************************************/ | ||
|
|
||
| _CCCL_DIAG_SUPPRESS_MSVC(4244) // warning C4244: '=': conversion from 'int' to '_Tp', possible loss of data | ||
|
|
||
| C2H_TEST("reduce/this_block Integral Type Tests", | ||
| "[reduce][this_block]", | ||
| integral_type_list, | ||
| operator_integral_list, | ||
| block_size_list) | ||
| { | ||
| using value_t = c2h::get<0, TestType>; | ||
| using op_t = c2h::get<1, TestType>; | ||
| using block_size_t = c2h::get<2, TestType>; | ||
| constexpr auto reduce_op = op_t{}; | ||
| constexpr auto operator_identity = cuda::identity_element<op_t, value_t>(); | ||
| CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>()); | ||
| c2h::device_vector<value_t> d_in(max_size * block_size_t::value); | ||
| c2h::device_vector<value_t> d_out(1); | ||
| c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min()); | ||
| c2h::host_vector<value_t> h_in = d_in; | ||
| cuda::stream stream{cuda::devices[0]}; | ||
| for (int num_items : {1, 4}) | ||
| { | ||
| auto reference_result = | ||
| cuda::std::accumulate(h_in.begin(), h_in.begin() + num_items * block_size_t::value, operator_identity, reduce_op); | ||
| run_reduce_kernel(stream, block_size_t{}, num_items, d_in, d_out, reduce_op); | ||
| verify_results(reference_result, c2h::host_vector<value_t>(d_out)[0]); | ||
| } | ||
| } | ||
|
|
||
| C2H_TEST("reduce/this_block Floating-Point Type Tests", | ||
| "[reduce][this_block]", | ||
| fp_type_list, | ||
| operator_fp_list, | ||
| block_size_list) | ||
| { | ||
| using value_t = c2h::get<0, TestType>; | ||
| using op_t = c2h::get<1, TestType>; | ||
| using block_size_t = c2h::get<2, TestType>; | ||
| constexpr auto reduce_op = op_t{}; | ||
| const auto operator_identity = cuda::identity_element<op_t, value_t>(); | ||
| CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>()); | ||
| c2h::device_vector<value_t> d_in(max_size * block_size_t::value); | ||
| c2h::device_vector<value_t> d_out(1); | ||
| c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min()); | ||
| c2h::host_vector<value_t> h_in = d_in; | ||
| cuda::stream stream{cuda::devices[0]}; | ||
| for (int num_items : {1, 4}) | ||
| { | ||
| auto reference_result = | ||
| cuda::std::accumulate(h_in.begin(), h_in.begin() + num_items * block_size_t::value, operator_identity, reduce_op); | ||
| run_reduce_kernel(stream, block_size_t{}, num_items, d_in, d_out, reduce_op); | ||
| verify_results(reference_result, c2h::host_vector<value_t>(d_out)[0]); | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.