Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions cudax/include/cuda/experimental/__coop/reduce.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_EXPERIMENTAL___COOP_REDUCE_CUH
#define _CUDA_EXPERIMENTAL___COOP_REDUCE_CUH

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cub/block/block_reduce.cuh>
#include <cub/thread/thread_reduce.cuh>
#include <cub/warp/warp_reduce.cuh>

#include <cuda/std/__cstddef/types.h>
#include <cuda/std/__functional/operations.h>
#include <cuda/std/optional>

#include <cuda/experimental/group.cuh>

#include <cuda/std/__cccl/prologue.h>

#if !defined(_CCCL_DOXYGEN_INVOKED)

namespace cuda::experimental::coop
{
template <class _Hierarchy, class _Tp, ::cuda::std::size_t _Np, class _RedFn>
[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
__reduce_impl(this_thread<_Hierarchy>, _Tp (&__thread_data)[_Np], _RedFn __red_fn)
{
return ::cub::ThreadReduce(__thread_data, __red_fn);
}

template <class _Hierarchy, class _Tp, ::cuda::std::size_t _Np, class _RedFn>
[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
__reduce_impl(this_warp<_Hierarchy> __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn)
{
using _WarpReduce = ::cub::WarpReduce<_Tp>;
__shared__ typename _WarpReduce::TempStorage __scratch;

const auto __result = _WarpReduce{__scratch}.Reduce(__thread_data, __red_fn);
return (gpu_thread.is_root_rank(__group)) ? ::cuda::std::optional{__result} : ::cuda::std::nullopt;
Comment thread
davebayer marked this conversation as resolved.
Comment on lines +54 to +55
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am confused by this line. Is this because the value is only valid in the leader thread? Should we broadcast it rather than diverging further?

Otherwise, why do we even compute it if its not desired

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is because only the root rank has the correct value. CUB does the same thing, but returns garbage for non-root ranks. I think returning optional is the better way.

We've already discussed this before and we agreed that we would start like this and add an API that would also broadcast the result in the future.

}

template <class _Hierarchy, class _Tp, cuda::std::size_t _Np, class _RedFn>
[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
__reduce_impl(this_block<_Hierarchy> __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn)
{
using _BlockExts = decltype(gpu_thread.extents(block, __group.hierarchy()));
static_assert(_BlockExts::rank_dynamic() == 0,
"cuda::coop::reduce requires the block level to have all static extents.");

using _BlockReduce =
::cub::BlockReduce<_Tp,
static_cast<int>(_BlockExts::static_extent(0)),
::cub::BLOCK_REDUCE_WARP_REDUCTIONS,
static_cast<int>(_BlockExts::static_extent(1)),
static_cast<int>(_BlockExts::static_extent(2))>;
__shared__ typename _BlockReduce::TempStorage __scratch;

const auto __result = _BlockReduce{__scratch}.Reduce(__thread_data, __red_fn);
return (gpu_thread.is_root_rank(__group)) ? ::cuda::std::optional{__result} : ::cuda::std::nullopt;
}

template <class _Group, class _Tp, ::cuda::std::size_t _Np, class _RedFn>
[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
reduce(_Group __group, _Tp (&__thread_data)[_Np], _RedFn&& __red_fn)
{
static_assert(gpu_thread.static_count(__group) != ::cuda::std::dynamic_extent,
"cuda::coop::reduce requires the group to have statically known size");

if (!gpu_thread.is_part_of(__group))
{
return ::cuda::std::nullopt;
}

return ::cuda::experimental::coop::__reduce_impl(__group, __thread_data, __red_fn);
}
} // namespace cuda::experimental::coop

#endif // !_CCCL_DOXYGEN_INVOKED

#include <cuda/std/__cccl/epilogue.h>

#endif // _CUDA_EXPERIMENTAL___COOP_REDUCE_CUH
26 changes: 26 additions & 0 deletions cudax/include/cuda/experimental/coop.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_EXPERIMENTAL_COOP
#define _CUDA_EXPERIMENTAL_COOP

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/experimental/__coop/reduce.cuh>

#endif // _CUDA_EXPERIMENTAL_COOP
Comment thread
davebayer marked this conversation as resolved.
10 changes: 10 additions & 0 deletions cudax/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,16 @@ cudax_add_catch2_test(test_target group.this_group
group/this_group.cu
)

cudax_add_catch2_test(test_target coop.reduce.this_thread
coop/reduce/this_thread.cu
)
cudax_add_catch2_test(test_target coop.reduce.this_warp
coop/reduce/this_warp.cu
)
cudax_add_catch2_test(test_target coop.reduce.this_block
coop/reduce/this_block.cu
)

if (cudax_ENABLE_CUFILE)
cudax_add_catch2_test(test_target cufile.driver_attributes
cufile/driver_attributes.cu
Expand Down
186 changes: 186 additions & 0 deletions cudax/test/coop/reduce/this_block.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <cuda/devices>
#include <cuda/functional>
#include <cuda/hierarchy>
#include <cuda/launch>
#include <cuda/std/algorithm>
#include <cuda/std/type_traits>
#include <cuda/stream>

#include <cuda/experimental/coop.cuh>
#include <cuda/experimental/group.cuh>

#include <testing.cuh>

#include <c2h/catch2_test_helper.h>
#include <c2h/extended_types.h>
#include <c2h/generators.h>
#include <catch2/matchers/catch_matchers_floating_point.hpp>

/***********************************************************************************************************************
* Thread Reduce Wrapper Kernels
**********************************************************************************************************************/

struct ReduceKernel
{
template <class Config, int NumItems, class T, class RedOp>
__device__ void operator()(
Config config,
cuda::std::integral_constant<int, NumItems>,
const T* __restrict__ d_in,
T* __restrict__ d_out,
RedOp red_op)
{
cudax::this_block block{config};

T thread_data[NumItems];
for (int i = 0; i < NumItems; ++i)
{
thread_data[i] = d_in[cuda::gpu_thread.rank_as<int>(block) + i * cuda::gpu_thread.count_as<int>(block)];
}
const auto result = cudax::coop::reduce(block, thread_data, red_op);

REQUIRE(result.has_value() == cuda::gpu_thread.is_root_rank(block));
if (cuda::gpu_thread.is_root_rank(block))
{
*d_out = result.value();
}
}
};
Comment thread
davebayer marked this conversation as resolved.

/***********************************************************************************************************************
* Type list definition
**********************************************************************************************************************/

using integral_type_list =
c2h::type_list<cuda::std::int8_t, cuda::std::int16_t, cuda::std::uint16_t, cuda::std::int32_t, cuda::std::int64_t>;

using fp_type_list = c2h::type_list<float, double>;

using operator_integral_list =
c2h::type_list<cuda::std::plus<>,
cuda::std::multiplies<>,
cuda::std::bit_and<>,
cuda::std::bit_or<>,
cuda::std::bit_xor<>,
cuda::minimum<>,
cuda::maximum<>>;

using operator_fp_list = c2h::type_list<cuda::std::plus<>, cuda::std::multiplies<>, cuda::minimum<>, cuda::maximum<>>;

using block_size_list = c2h::enum_type_list<int, 3, 32, 63, 128>;

/***********************************************************************************************************************
* Verify results and kernel launch
**********************************************************************************************************************/

template <class T>
void verify_results(const T& expected_data, const T& test_results)
{
if constexpr (cuda::std::is_floating_point_v<T>)
{
REQUIRE_THAT(expected_data, Catch::Matchers::WithinRel(test_results, T{0.05}));
}
else
{
REQUIRE(expected_data == test_results);
}
}

template <int BlockSize, class T, class RedOp>
void run_reduce_kernel(
cuda::stream_ref stream,
cuda::std::integral_constant<int, BlockSize>,
int num_items,
const c2h::device_vector<T>& in,
c2h::device_vector<T>& out,
RedOp red_op)
{
const auto config = cuda::make_config(cuda::grid_dims<1>(), cuda::block_dims<BlockSize>());
const auto in_ptr = thrust::raw_pointer_cast(in.data());
const auto out_ptr = thrust::raw_pointer_cast(out.data());
const ReduceKernel kernel{};

switch (num_items)
{
case 1:
cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 1>{}, in_ptr, out_ptr, red_op);
break;
case 4:
cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 4>{}, in_ptr, out_ptr, red_op);
break;
default:
FAIL("Unsupported number of items");
}
stream.sync();
}

constexpr int max_size = 4;
constexpr int num_seeds = 10;

/***********************************************************************************************************************
* Test cases
**********************************************************************************************************************/

_CCCL_DIAG_SUPPRESS_MSVC(4244) // warning C4244: '=': conversion from 'int' to '_Tp', possible loss of data

C2H_TEST("reduce/this_block Integral Type Tests",
"[reduce][this_block]",
integral_type_list,
operator_integral_list,
block_size_list)
{
using value_t = c2h::get<0, TestType>;
using op_t = c2h::get<1, TestType>;
using block_size_t = c2h::get<2, TestType>;
constexpr auto reduce_op = op_t{};
constexpr auto operator_identity = cuda::identity_element<op_t, value_t>();
CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
c2h::device_vector<value_t> d_in(max_size * block_size_t::value);
c2h::device_vector<value_t> d_out(1);
c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min());
c2h::host_vector<value_t> h_in = d_in;
cuda::stream stream{cuda::devices[0]};
for (int num_items : {1, 4})
{
auto reference_result =
cuda::std::accumulate(h_in.begin(), h_in.begin() + num_items * block_size_t::value, operator_identity, reduce_op);
run_reduce_kernel(stream, block_size_t{}, num_items, d_in, d_out, reduce_op);
verify_results(reference_result, c2h::host_vector<value_t>(d_out)[0]);
}
}

C2H_TEST("reduce/this_block Floating-Point Type Tests",
"[reduce][this_block]",
fp_type_list,
operator_fp_list,
block_size_list)
{
using value_t = c2h::get<0, TestType>;
using op_t = c2h::get<1, TestType>;
using block_size_t = c2h::get<2, TestType>;
constexpr auto reduce_op = op_t{};
const auto operator_identity = cuda::identity_element<op_t, value_t>();
CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
c2h::device_vector<value_t> d_in(max_size * block_size_t::value);
c2h::device_vector<value_t> d_out(1);
c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min());
c2h::host_vector<value_t> h_in = d_in;
cuda::stream stream{cuda::devices[0]};
for (int num_items : {1, 4})
{
auto reference_result =
cuda::std::accumulate(h_in.begin(), h_in.begin() + num_items * block_size_t::value, operator_identity, reduce_op);
run_reduce_kernel(stream, block_size_t{}, num_items, d_in, d_out, reduce_op);
verify_results(reference_result, c2h::host_vector<value_t>(d_out)[0]);
}
}
Loading
Loading