NVIDIA · davebayer · May 29, 2026 · miscco · May 29, 2026 · davebayer
@@ -0,0 +1,98 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_EXPERIMENTAL___COOP_REDUCE_CUH
+#define _CUDA_EXPERIMENTAL___COOP_REDUCE_CUH
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cub/block/block_reduce.cuh>
+#include <cub/thread/thread_reduce.cuh>
+#include <cub/warp/warp_reduce.cuh>
+
+#include <cuda/std/__cstddef/types.h>
+#include <cuda/std/__functional/operations.h>
+#include <cuda/std/__utility/forward.h>
+#include <cuda/std/optional>
+
+#include <cuda/experimental/group.cuh>
+
+#include <cuda/std/__cccl/prologue.h>
+
+#if !defined(_CCCL_DOXYGEN_INVOKED)
+
+namespace cuda::experimental::coop
+{
+//! @brief Single-thread overload: reduces the calling thread's own items with `cub::ThreadReduce`.
+//!
+//! @param __thread_data The items owned by the calling thread.
+//! @param __red_fn      Binary reduction operator.
+//! @return An optional holding the result of `cub::ThreadReduce`; this overload never returns `nullopt`.
+template <class _Hierarchy, class _Tp, ::cuda::std::size_t _Np, class _RedFn>
+[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
+__reduce_impl(this_thread<_Hierarchy>, _Tp (&__thread_data)[_Np], _RedFn __red_fn)
+{
+  // The group tag carries no information needed here, hence the unnamed parameter.
+  const auto __result = ::cub::ThreadReduce(__thread_data, __red_fn);
+  return __result;
+}
+
+//! @brief Warp overload: reduces the items of all threads of the warp with `cub::WarpReduce`.
+//!
+//! @param __group       The warp group the calling thread belongs to.
+//! @param __thread_data The items owned by the calling thread.
+//! @param __red_fn      Binary reduction operator.
+//! @return The reduction result in the root rank of `__group`, `nullopt` in every other thread.
+template <class _Hierarchy, class _Tp, ::cuda::std::size_t _Np, class _RedFn>
+[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
+__reduce_impl(this_warp<_Hierarchy> __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn)
+{
+  using _WarpReduce  = ::cub::WarpReduce<_Tp>;
+  using _TempStorage = typename _WarpReduce::TempStorage;
+
+  // NOTE(review): a single __shared__ TempStorage is declared here, i.e. one instance per block. CUB documents one
+  // TempStorage per warp; confirm this is safe when several warps of a block call this overload concurrently.
+  __shared__ _TempStorage __scratch;
+
+  _WarpReduce __reducer{__scratch};
+  const auto __result = __reducer.Reduce(__thread_data, __red_fn);
+
+  // Only the root rank hands the value back to the caller.
+  if (gpu_thread.is_root_rank(__group))
+  {
+    return ::cuda::std::optional{__result};
+  }
+  return ::cuda::std::nullopt;
+}
+
+//! @brief Block overload: reduces the items of all threads of the block with `cub::BlockReduce`.
+//!
+//! The block level of the hierarchy must have only static extents, because the three block dimensions are passed to
+//! `cub::BlockReduce` as template arguments.
+//!
+//! @param __group       The block group the calling thread belongs to.
+//! @param __thread_data The items owned by the calling thread.
+//! @param __red_fn      Binary reduction operator.
+//! @return The reduction result in the root rank of `__group`, `nullopt` in every other thread.
+template <class _Hierarchy, class _Tp, ::cuda::std::size_t _Np, class _RedFn>
+[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
+__reduce_impl(this_block<_Hierarchy> __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn)
+{
+  using _BlockExts = decltype(gpu_thread.extents(block, __group.hierarchy()));
+  static_assert(_BlockExts::rank_dynamic() == 0,
+                "cuda::coop::reduce requires the block level to have all static extents.");
+
+  // Static extents 0, 1 and 2 are forwarded as the X, Y and Z block dimensions of cub::BlockReduce.
+  using _BlockReduce =
+    ::cub::BlockReduce<_Tp,
+                       static_cast<int>(_BlockExts::static_extent(0)),
+                       ::cub::BLOCK_REDUCE_WARP_REDUCTIONS,
+                       static_cast<int>(_BlockExts::static_extent(1)),
+                       static_cast<int>(_BlockExts::static_extent(2))>;
+  __shared__ typename _BlockReduce::TempStorage __scratch;
+
+  const auto __result = _BlockReduce{__scratch}.Reduce(__thread_data, __red_fn);
+
+  // Only the root rank hands the value back to the caller.
+  return (gpu_thread.is_root_rank(__group)) ? ::cuda::std::optional{__result} : ::cuda::std::nullopt;
+}
+
+//! @brief Cooperatively reduces the items held by the threads of `__group`.
+//!
+//! Dispatches to the `__reduce_impl` overload matching the group type. The group must have a statically known thread
+//! count.
+//!
+//! @param __group       The group performing the reduction.
+//! @param __thread_data The items owned by the calling thread.
+//! @param __red_fn      Binary reduction operator; perfectly forwarded to the implementation.
+//! @return `nullopt` for threads that are not part of `__group`; otherwise whatever the selected `__reduce_impl`
+//!         overload returns.
+template <class _Group, class _Tp, ::cuda::std::size_t _Np, class _RedFn>
+[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
+reduce(_Group __group, _Tp (&__thread_data)[_Np], _RedFn&& __red_fn)
+{
+  static_assert(gpu_thread.static_count(__group) != ::cuda::std::dynamic_extent,
+                "cuda::coop::reduce requires the group to have statically known size");
+
+  // Threads outside of the group do not take part in the reduction.
+  if (!gpu_thread.is_part_of(__group))
+  {
+    return ::cuda::std::nullopt;
+  }
+
+  // Forward the operator so that an rvalue functor is moved into the by-value parameter instead of copied.
+  return ::cuda::experimental::coop::__reduce_impl(__group, __thread_data, ::cuda::std::forward<_RedFn>(__red_fn));
+}
+} // namespace cuda::experimental::coop
+
+#endif // !_CCCL_DOXYGEN_INVOKED
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_EXPERIMENTAL___COOP_REDUCE_CUH
@@ -0,0 +1,26 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_EXPERIMENTAL_COOP
+#define _CUDA_EXPERIMENTAL_COOP
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/experimental/__coop/reduce.cuh>
+
+#endif // _CUDA_EXPERIMENTAL_COOP
@@ -170,6 +170,16 @@ cudax_add_catch2_test(test_target group.this_group
     group/this_group.cu
 )
 
+cudax_add_catch2_test(test_target coop.reduce.this_thread
+    coop/reduce/this_thread.cu
+)
+cudax_add_catch2_test(test_target coop.reduce.this_warp
+    coop/reduce/this_warp.cu
+)
+cudax_add_catch2_test(test_target coop.reduce.this_block
+    coop/reduce/this_block.cu
+)
+
 if (cudax_ENABLE_CUFILE)
   cudax_add_catch2_test(test_target cufile.driver_attributes
       cufile/driver_attributes.cu

@@ -0,0 +1,186 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/devices>
+#include <cuda/functional>
+#include <cuda/hierarchy>
+#include <cuda/launch>
+#include <cuda/std/algorithm>
+#include <cuda/std/type_traits>
+#include <cuda/stream>
+
+#include <cuda/experimental/coop.cuh>
+#include <cuda/experimental/group.cuh>
+
+#include <testing.cuh>
+
+#include <c2h/catch2_test_helper.h>
+#include <c2h/extended_types.h>
+#include <c2h/generators.h>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
+
+/***********************************************************************************************************************
+ * Block Reduce Wrapper Kernels
+ **********************************************************************************************************************/
+
+// Device functor under test: every thread gathers NumItems inputs, the block reduces them with
+// cudax::coop::reduce and the root rank stores the result to *d_out.
+struct ReduceKernel
+{
+  template <class Config, int NumItems, class T, class RedOp>
+  __device__ void operator()(
+    Config config,
+    cuda::std::integral_constant<int, NumItems>,
+    const T* __restrict__ d_in,
+    T* __restrict__ d_out,
+    RedOp red_op)
+  {
+    cudax::this_block block{config};
+
+    // Neither value changes inside the load loop, so query them once.
+    const int rank   = cuda::gpu_thread.rank_as<int>(block);
+    const int stride = cuda::gpu_thread.count_as<int>(block);
+
+    // Item `item` of this thread lives at d_in[rank + item * stride].
+    T thread_data[NumItems];
+    for (int item = 0; item < NumItems; ++item)
+    {
+      thread_data[item] = d_in[rank + item * stride];
+    }
+
+    const auto result = cudax::coop::reduce(block, thread_data, red_op);
+
+    // NOTE(review): REQUIRE is evaluated in device code here - confirm testing.cuh provides a device-capable
+    // definition.
+    REQUIRE(result.has_value() == cuda::gpu_thread.is_root_rank(block));
+    if (cuda::gpu_thread.is_root_rank(block))
+    {
+      *d_out = result.value();
+    }
+  }
+};
+
+/***********************************************************************************************************************
+ * Type list definition
+ **********************************************************************************************************************/
+
+// Integral element types the reduction is instantiated with.
+using integral_type_list = c2h::type_list<
+  cuda::std::int8_t,
+  cuda::std::int16_t,
+  cuda::std::uint16_t,
+  cuda::std::int32_t,
+  cuda::std::int64_t>;
+
+// Floating-point element types the reduction is instantiated with.
+using fp_type_list = c2h::type_list<float, double>;
+
+// Reduction operators combined with the integral element types.
+using operator_integral_list = c2h::type_list<
+  cuda::std::plus<>,
+  cuda::std::multiplies<>,
+  cuda::std::bit_and<>,
+  cuda::std::bit_or<>,
+  cuda::std::bit_xor<>,
+  cuda::minimum<>,
+  cuda::maximum<>>;
+
+// Reduction operators combined with the floating-point element types (no bitwise operators).
+using operator_fp_list = c2h::type_list<
+  cuda::std::plus<>,
+  cuda::std::multiplies<>,
+  cuda::minimum<>,
+  cuda::maximum<>>;
+
+// Block sizes under test; 3 and 63 are not multiples of 32.
+using block_size_list = c2h::enum_type_list<int, 3, 32, 63, 128>;
+
+/***********************************************************************************************************************
+ * Verify results and kernel launch
+ **********************************************************************************************************************/
+
+// Compares the device result against the host reference: exact equality for non-floating-point types, a relative
+// tolerance of 0.05 for floating-point types.
+template <class T>
+void verify_results(const T& expected_data, const T& test_results)
+{
+  if constexpr (!cuda::std::is_floating_point_v<T>)
+  {
+    REQUIRE(expected_data == test_results);
+  }
+  else
+  {
+    REQUIRE_THAT(expected_data, Catch::Matchers::WithinRel(test_results, T{0.05}));
+  }
+}
+
+// Launches ReduceKernel on `stream` with one block of BlockSize threads and waits for it to finish.
+//
+// `num_items` is the per-thread item count; it is mapped to a compile-time constant, so only the values 1 and 4 are
+// supported and any other value fails the test.
+template <int BlockSize, class T, class RedOp>
+void run_reduce_kernel(
+  cuda::stream_ref stream,
+  cuda::std::integral_constant<int, BlockSize>,
+  int num_items,
+  const c2h::device_vector<T>& in,
+  c2h::device_vector<T>& out,
+  RedOp red_op)
+{
+  const auto launch_config = cuda::make_config(cuda::grid_dims<1>(), cuda::block_dims<BlockSize>());
+  const auto d_in          = thrust::raw_pointer_cast(in.data());
+  const auto d_out         = thrust::raw_pointer_cast(out.data());
+  const ReduceKernel reduce_kernel{};
+
+  if (num_items == 1)
+  {
+    cuda::launch(stream, launch_config, reduce_kernel, cuda::std::integral_constant<int, 1>{}, d_in, d_out, red_op);
+  }
+  else if (num_items == 4)
+  {
+    cuda::launch(stream, launch_config, reduce_kernel, cuda::std::integral_constant<int, 4>{}, d_in, d_out, red_op);
+  }
+  else
+  {
+    FAIL("Unsupported number of items");
+  }
+
+  // Wait for the kernel so the caller can read `out` right away.
+  stream.sync();
+}
+
+constexpr int max_size  = 4; // largest per-thread item count; input buffers hold max_size * block size elements
+constexpr int num_seeds = 10; // argument passed to C2H_SEED in every test case
+
+/***********************************************************************************************************************
+ * Test cases
+ **********************************************************************************************************************/
+
+_CCCL_DIAG_SUPPRESS_MSVC(4244) // warning C4244: '=': conversion from 'int' to '_Tp', possible loss of data
+
+// For every integral type / operator / block size combination: generate random input, reduce it on the host with
+// cuda::std::accumulate and compare against the block reduction for 1 and 4 items per thread.
+C2H_TEST("reduce/this_block Integral Type Tests",
+         "[reduce][this_block]",
+         integral_type_list,
+         operator_integral_list,
+         block_size_list)
+{
+  using value_t            = c2h::get<0, TestType>;
+  using op_t               = c2h::get<1, TestType>;
+  using block_size_t       = c2h::get<2, TestType>;
+  constexpr int block_size = block_size_t::value;
+  constexpr auto reduce_op = op_t{};
+  constexpr auto init      = cuda::identity_element<op_t, value_t>();
+  CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
+  c2h::device_vector<value_t> d_in(max_size * block_size);
+  c2h::device_vector<value_t> d_out(1);
+  c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min());
+  c2h::host_vector<value_t> h_in = d_in;
+  cuda::stream stream{cuda::devices[0]};
+  for (const int num_items : {1, 4})
+  {
+    // Host reference over exactly the elements the kernel reads.
+    const auto h_end = h_in.begin() + num_items * block_size;
+    auto expected    = cuda::std::accumulate(h_in.begin(), h_end, init, reduce_op);
+
+    run_reduce_kernel(stream, block_size_t{}, num_items, d_in, d_out, reduce_op);
+
+    c2h::host_vector<value_t> h_out(d_out);
+    verify_results(expected, h_out[0]);
+  }
+}
+
+// For every floating-point type / operator / block size combination: generate random input, reduce it on the host
+// with cuda::std::accumulate and compare against the block reduction for 1 and 4 items per thread.
+C2H_TEST("reduce/this_block Floating-Point Type Tests",
+         "[reduce][this_block]",
+         fp_type_list,
+         operator_fp_list,
+         block_size_list)
+{
+  using value_t            = c2h::get<0, TestType>;
+  using op_t               = c2h::get<1, TestType>;
+  using block_size_t       = c2h::get<2, TestType>;
+  constexpr int block_size = block_size_t::value;
+  constexpr auto reduce_op = op_t{};
+  const auto init          = cuda::identity_element<op_t, value_t>();
+  CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
+  c2h::device_vector<value_t> d_in(max_size * block_size);
+  c2h::device_vector<value_t> d_out(1);
+  c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min());
+  c2h::host_vector<value_t> h_in = d_in;
+  cuda::stream stream{cuda::devices[0]};
+  for (const int num_items : {1, 4})
+  {
+    // Host reference over exactly the elements the kernel reads.
+    const auto h_end = h_in.begin() + num_items * block_size;
+    auto expected    = cuda::std::accumulate(h_in.begin(), h_end, init, reduce_op);
+
+    run_reduce_kernel(stream, block_size_t{}, num_items, d_in, d_out, reduce_op);
+
+    c2h::host_vector<value_t> h_out(d_out);
+    verify_results(expected, h_out[0]);
+  }
+}