Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
struct policy_selector_t
{
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto operator()(cuda::compute_capability) const
-> cub::detail::adjacent_difference::adjacent_difference_policy
-> cub::AdjacentDifferencePolicy
{
return {TUNE_THREADS_PER_BLOCK,
TUNE_ITEMS_PER_THREAD,
Expand Down
17 changes: 17 additions & 0 deletions cub/cub/device/device_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,23 @@ CUB_NAMESPACE_BEGIN
//!
//! // d_values <-- [1, 1, -1, 1, -1, 1, -1, 1]
//!
//! @par Tuning
//! All algorithms in DeviceAdjacentDifference that accept an environment can be tuned by passing a custom
//! :ref:`policy selector <cub-policy-selectors>` that returns an @ref AdjacentDifferencePolicy, as shown in the
//! example below:
//!
//! .. literalinclude:: ../../../cub/test/catch2_test_device_adjacent_difference_env_api.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin subtract-left-copy-policy-selector
//! :end-before: example-end subtract-left-copy-policy-selector
//!
//! .. literalinclude:: ../../../cub/test/catch2_test_device_adjacent_difference_env_api.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin subtract-left-copy-tuning
//! :end-before: example-end subtract-left-copy-tuning
Comment on lines +95 to +105
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

🧩 Analysis chain

🌐 Web query:

Does C++17 support designated initializers like T{ .field = value }, or is that a C++20-only feature?

💡 Result:

Designated initializers, using the syntax T{.field = value }, are a C++20-only feature [1][2][3][4]. They are not part of the C++17 standard [1]. While some compilers (such as GCC and Clang) provided designated initializers as a non-standard language extension long before C++20 [1][5][6], they were not officially supported in C++17 [1]. If you use this syntax in C++17 mode, you are relying on compiler-specific extensions rather than standard C++ [1]. Compilers often allow this code to compile by default, but it may trigger errors or warnings if you compile with strict standard compliance flags (such as -Wpedantic or -pedantic) [1][2]. In contrast, C++20 formally introduced designated initializers for aggregate types [2][3][4]. There are specific differences between the C++20 implementation and C99 designated initializers: for instance, C++20 requires that members be initialized in their declaration order, whereas C99 allows arbitrary order [3][4].

Citations:


important: The @par Tuning docs in cub/cub/device/device_adjacent_difference.cuh include the subtract-left-copy-policy-selector example from cub/test/catch2_test_device_adjacent_difference_env_api.cu, which uses C++20 designated initializers (return {.threads_per_block = ...}). Copy-pasting this into a C++17 project will fail under standard-conforming builds; update the snippet to use C++17-compatible aggregate initialization (e.g., positional initialization) or mirror the existing _CCCL_STD_VER >= 2020 fallback.

//!
//! @endrst
struct DeviceAdjacentDifference
{
Expand Down
10 changes: 5 additions & 5 deletions cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ _CCCL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel(
_CCCL_GRID_CONSTANT const OffsetT num_items)
{
static_assert(::cuda::std::is_empty_v<PolicySelector>);
static constexpr adjacent_difference_policy policy = current_policy<PolicySelector>();
static constexpr AdjacentDifferencePolicy policy = current_policy<PolicySelector>();
using AdjacentDifferencePolicyT =
AgentAdjacentDifferencePolicy<policy.threads_per_block,
policy.items_per_thread,
Expand Down Expand Up @@ -100,10 +100,10 @@ template <typename PolicyHub>
struct policy_selector_from_hub
{
// this is only called in device code, so we can ignore the cc parameter
_CCCL_DEVICE_API constexpr auto operator()(::cuda::compute_capability) const -> adjacent_difference_policy
_CCCL_DEVICE_API constexpr auto operator()(::cuda::compute_capability) const -> AdjacentDifferencePolicy
{
using p = typename PolicyHub::MaxPolicy::ActivePolicy::AdjacentDifferencePolicy;
return adjacent_difference_policy{
return AdjacentDifferencePolicy{
p::BLOCK_THREADS, p::ITEMS_PER_THREAD, p::LOAD_ALGORITHM, p::LOAD_MODIFIER, p::STORE_ALGORITHM};
}
};
Expand All @@ -123,7 +123,7 @@ template <typename InputIteratorT,
MayAlias AliasOpt,
ReadOption ReadOpt,
typename PolicyHub = detail::adjacent_difference::policy_hub<InputIteratorT, AliasOpt == MayAlias::Yes>>
struct DispatchAdjacentDifference
struct CCCL_DEPRECATED_BECAUSE("Please use DeviceAdjacentDifference") DispatchAdjacentDifference
{
using InputT = detail::it_value_t<InputIteratorT>;

Expand Down Expand Up @@ -344,7 +344,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
return error;
}

const adjacent_difference_policy active_policy = policy_selector(cc);
const AdjacentDifferencePolicy active_policy = policy_selector(cc);
#if _CCCL_HOSTED() && defined(CUB_DEBUG_LOG)
NV_IF_TARGET(NV_IS_HOST, ({
::std::stringstream ss;
Expand Down
39 changes: 20 additions & 19 deletions cub/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -22,43 +22,44 @@

CUB_NAMESPACE_BEGIN

namespace detail::adjacent_difference
{
struct adjacent_difference_policy
//! The tuning policy for all algorithms in @ref DeviceAdjacentDifference.
struct AdjacentDifferencePolicy
{
int threads_per_block;
int items_per_thread;
BlockLoadAlgorithm load_algorithm;
CacheLoadModifier load_modifier;
BlockStoreAlgorithm store_algorithm;

_CCCL_HOST_DEVICE_API constexpr friend bool
operator==(const adjacent_difference_policy& lhs, const adjacent_difference_policy& rhs)
int threads_per_block; //!< Number of threads in a CUDA block
int items_per_thread; //!< Number of items processed per thread
BlockLoadAlgorithm load_algorithm; //!< The @ref BlockLoadAlgorithm used for loading items from global memory
CacheLoadModifier load_modifier; //!< The @ref CacheLoadModifier used for loading items from global memory
BlockStoreAlgorithm store_algorithm; //!< The @ref BlockStoreAlgorithm used for storing items to global memory

[[nodiscard]] _CCCL_HOST_DEVICE_API constexpr friend bool
operator==(const AdjacentDifferencePolicy& lhs, const AdjacentDifferencePolicy& rhs)
{
return lhs.threads_per_block == rhs.threads_per_block && lhs.items_per_thread == rhs.items_per_thread
&& lhs.load_algorithm == rhs.load_algorithm && lhs.load_modifier == rhs.load_modifier
&& lhs.store_algorithm == rhs.store_algorithm;
}

_CCCL_HOST_DEVICE_API constexpr friend bool
operator!=(const adjacent_difference_policy& lhs, const adjacent_difference_policy& rhs)
[[nodiscard]] _CCCL_HOST_DEVICE_API constexpr friend bool
operator!=(const AdjacentDifferencePolicy& lhs, const AdjacentDifferencePolicy& rhs)
{
return !(lhs == rhs);
}

#if _CCCL_HOSTED()
friend ::std::ostream& operator<<(::std::ostream& os, const adjacent_difference_policy& p)
friend ::std::ostream& operator<<(::std::ostream& os, const AdjacentDifferencePolicy& p)
{
return os << "adjacent_difference_policy { .threads_per_block = " << p.threads_per_block
return os << "AdjacentDifferencePolicy { .threads_per_block = " << p.threads_per_block
<< ", .items_per_thread = " << p.items_per_thread << ", .load_algorithm = " << p.load_algorithm
<< ", .load_modifier = " << p.load_modifier << ", .store_algorithm = " << p.store_algorithm << " }";
}
#endif // _CCCL_HOSTED()
};

namespace detail::adjacent_difference
{
#if _CCCL_HAS_CONCEPTS()
template <typename T>
concept adjacent_difference_policy_selector = policy_selector<T, adjacent_difference_policy>;
concept adjacent_difference_policy_selector = policy_selector<T, AdjacentDifferencePolicy>;
#endif // _CCCL_HAS_CONCEPTS()
Comment on lines 61 to 63
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

important: Constrain the public selector concept to stateless selectors.

adjacent_difference_policy_selector accepts selectors that return AdjacentDifferencePolicy, but the device path still requires ::cuda::std::is_empty_v<PolicySelector> later in DeviceAdjacentDifferenceDifferenceKernel. With the API now public, that means a stateful selector passes the front-door constraint and then fails deep in dispatch. Fold the emptiness requirement into this concept so unsupported selectors are rejected at the API boundary instead. Based on learnings, policy selectors used at the CUB device API layer are stateless.


struct policy_selector
Expand All @@ -67,9 +68,9 @@ struct policy_selector
bool may_alias;

[[nodiscard]] _CCCL_HOST_DEVICE_API constexpr auto operator()(::cuda::compute_capability) const
-> adjacent_difference_policy
-> AdjacentDifferencePolicy
{
return adjacent_difference_policy{
return AdjacentDifferencePolicy{
128,
nominal_8B_items_to_items(7, value_type_size),
BLOCK_LOAD_WARP_TRANSPOSE,
Expand All @@ -87,7 +88,7 @@ template <typename InputIteratorT, bool MayAlias>
struct policy_selector_from_types
{
[[nodiscard]] _CCCL_HOST_DEVICE_API constexpr auto operator()(::cuda::compute_capability cc) const
-> adjacent_difference_policy
-> AdjacentDifferencePolicy
{
constexpr auto policies = policy_selector{static_cast<int>(sizeof(it_value_t<InputIteratorT>)), MayAlias};
return policies(cc);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// TODO(bgruber): drop this test with CCCL 4.0 when we drop the adjacent difference dispatcher

// disable deprecation warnings for DispatchAdjacentDifference
#define CCCL_IGNORE_DEPRECATED_API

#include "insert_nested_NVTX_range_guard.h"

#include <cub/device/device_adjacent_difference.cuh>
Expand All @@ -14,9 +19,6 @@

using namespace cub;

// TODO(bgruber): drop this test with CCCL 4.0 when we drop the adjacent difference dispatcher after publishing the
// tuning API

template <typename InputIteratorT>
struct my_policy_hub
{
Expand Down
35 changes: 33 additions & 2 deletions cub/test/catch2_test_device_adjacent_difference_env.cu
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,7 @@ struct block_size_extracting_minus_t
template <int ThreadsPerBlock>
struct adj_diff_tuning
{
_CCCL_HOST_DEVICE_API constexpr auto operator()(cuda::compute_capability) const
-> cub::detail::adjacent_difference::adjacent_difference_policy
_CCCL_HOST_DEVICE_API constexpr auto operator()(cuda::compute_capability) const -> cub::AdjacentDifferencePolicy
{
return {ThreadsPerBlock, 1, cub::BLOCK_LOAD_DIRECT, cub::LOAD_DEFAULT, cub::BLOCK_STORE_DIRECT};
}
Expand Down Expand Up @@ -241,3 +240,35 @@ C2H_TEST("DeviceAdjacentDifference::SubtractRight can be tuned", "[adjacent_diff
}

#endif // TEST_LAUNCH != 1

#if _CCCL_COMPILER(GCC, >=, 8) // gcc 7 cannot preserve constexpr-ness from p1 to p2
C2H_TEST("AdjacentDifferencePolicy", "[adjacent_difference][device]")
{
STATIC_REQUIRE(::cuda::std::semiregular<cub::AdjacentDifferencePolicy>);
STATIC_REQUIRE(::cuda::std::is_aggregate_v<cub::AdjacentDifferencePolicy>);

// aggregate init
constexpr auto p1 = cub::AdjacentDifferencePolicy{
128,
7,
cub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE,
cub::CacheLoadModifier::LOAD_LDG,
cub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE};

# if _CCCL_STD_VER >= 2020
// designated init
constexpr auto p2 = cub::AdjacentDifferencePolicy{
.threads_per_block = 128,
.items_per_thread = 7,
.load_algorithm = cub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE,
.load_modifier = cub::CacheLoadModifier::LOAD_LDG,
.store_algorithm = cub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE};
# else // _CCCL_STD_VER >= 2020
constexpr auto p2 = p1;
# endif // _CCCL_STD_VER >= 2020

// comparison
STATIC_REQUIRE(p1 == p2);
STATIC_REQUIRE_FALSE(p1 != p2);
}
#endif // _CCCL_COMPILER(GCC, >=, 8)
43 changes: 43 additions & 0 deletions cub/test/catch2_test_device_adjacent_difference_env_api.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <thrust/device_vector.h>

#include <cuda/__execution/tune.h>
#include <cuda/devices>
#include <cuda/stream>

Expand Down Expand Up @@ -104,3 +105,45 @@ C2H_TEST("cub::DeviceAdjacentDifference::SubtractRight accepts stream", "[adjace
REQUIRE(error == cudaSuccess);
REQUIRE(data == expected);
}

#if _CCCL_STD_VER >= 2020

// example-begin subtract-left-copy-policy-selector
struct AdjacentDifferencePolicySelector
{
__host__ __device__ constexpr auto operator()(cuda::compute_capability cc) const -> cub::AdjacentDifferencePolicy
{
return {.threads_per_block = 128,
.items_per_thread = cc > cuda::compute_capability{9, 0} ? 11 : 7,
.load_algorithm = cub::BLOCK_LOAD_WARP_TRANSPOSE,
.load_modifier = cub::LOAD_LDG,
.store_algorithm = cub::BLOCK_STORE_WARP_TRANSPOSE};
}
};
// example-end subtract-left-copy-policy-selector

C2H_TEST("cub::DeviceAdjacentDifference::SubtractLeftCopy env-based API with tuning", "[adjacent_difference][env]")
{
// example-begin subtract-left-copy-tuning
auto input = thrust::device_vector<int>{1, 2, 1, 2, 1, 2, 1, 2};
auto output = thrust::device_vector<int>(8, thrust::no_init);

const auto error = cub::DeviceAdjacentDifference::SubtractLeftCopy(
input.begin(),
output.begin(),
input.size(),
cuda::std::minus{},
cuda::execution::tune(AdjacentDifferencePolicySelector{}));
if (error != cudaSuccess)
{
std::cerr << "cub::DeviceAdjacentDifference::SubtractLeftCopy failed with status: " << error << '\n';
}

thrust::device_vector<int> expected{1, 1, -1, 1, -1, 1, -1, 1};
// example-end subtract-left-copy-tuning

REQUIRE(error == cudaSuccess);
REQUIRE(output == expected);
}

#endif // _CCCL_STD_VER >= 2020
Loading