Implement parallel cuda::std::replace_copy
#7410
Merged
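This PR adds parallel, execution-policy overloads of `cuda::std::replace_copy` and `cuda::std::replace_copy_if` that dispatch to the CUB-backed parallel transform. Below is a minimal usage sketch modeled on the benchmarks in this diff; using the internal `cuda::execution::__cub_par_unseq` policy directly, without an attached memory resource or stream, is an assumption on my part (the benchmarks attach both via `.with_memory_resource(...)` and `.with_stream(...)`):

```cpp
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/stream_ref>

int main()
{
  thrust::device_vector<int> in(1 << 20, 42); // every element is 42
  thrust::device_vector<int> out(in.size());

  // Copy `in` to `out`, replacing each 42 with 1337, executed by the CUDA backend.
  // The benchmarks below additionally attach a memory resource and a stream to the
  // policy; omitting them here is an assumption, not something this diff guarantees.
  cuda::std::replace_copy(
    cuda::execution::__cub_par_unseq, in.begin(), in.end(), out.begin(), 42, 1337);

  return 0;
}
```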
New file (+44 lines): NVBench benchmark for the parallel `cuda::std::replace_copy` overload.

```cpp
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/stream_ref>

#include "nvbench_helper.cuh"

template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> in = generate(elements, bit_entropy::_1_000, T{0}, T{42});
  thrust::device_vector<T> out(elements, thrust::no_init);

  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements);
  state.add_global_memory_writes<T>(elements);

  caching_allocator_t alloc{};
  auto policy = cuda::execution::__cub_par_unseq.with_memory_resource(alloc);

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               cuda::std::replace_copy(
                 policy.with_stream(launch.get_stream().get_stream()), in.begin(), in.end(), out.begin(), 42, 1337);
             });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
```
New file (+53 lines): NVBench benchmark for the parallel `cuda::std::replace_copy_if` overload.

```cpp
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/stream_ref>

#include "nvbench_helper.cuh"

struct equal_to_42
{
  template <class T>
  __device__ constexpr bool operator()(const T& val) const noexcept
  {
    return val == static_cast<T>(42);
  }
};

template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> in = generate(elements, bit_entropy::_1_000, T{0}, T{42});
  thrust::device_vector<T> out(elements, thrust::no_init);

  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements);
  state.add_global_memory_writes<T>(elements);

  caching_allocator_t alloc{};
  auto policy = cuda::execution::__cub_par_unseq.with_memory_resource(alloc);

  state.exec(
    nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
      cuda::std::replace_copy_if(
        policy.with_stream(launch.get_stream().get_stream()), in.begin(), in.end(), out.begin(), equal_to_42{}, 1337);
    });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
```
New file (+114 lines): libcu++ header implementing the parallel `cuda::std::replace_copy` overload (header guard `_CUDA_STD___PSTL_REPLACE_COPY_H`).

```cpp
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_REPLACE_COPY_H
#define _CUDA_STD___PSTL_REPLACE_COPY_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#if !_CCCL_COMPILER(NVRTC)

#  include <cuda/std/__algorithm/replace_copy.h>
#  include <cuda/std/__concepts/concept_macros.h>
#  include <cuda/std/__execution/policy.h>
#  include <cuda/std/__iterator/concepts.h>
#  include <cuda/std/__iterator/iterator_traits.h>
#  include <cuda/std/__pstl/dispatch.h>
#  include <cuda/std/__type_traits/always_false.h>
#  include <cuda/std/__type_traits/is_comparable.h>
#  include <cuda/std/__type_traits/is_execution_policy.h>
#  include <cuda/std/__type_traits/is_nothrow_copy_constructible.h>
#  include <cuda/std/__utility/move.h>

#  if _CCCL_HAS_BACKEND_CUDA()
#    include <cuda/std/__pstl/cuda/transform.h>
#  endif // _CCCL_HAS_BACKEND_CUDA()

#  include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD

template <class _Tp>
struct __replace_copy_select
{
  _Tp __old_value_;
  _Tp __new_value_;

  _CCCL_HOST_API constexpr __replace_copy_select(const _Tp& __old_value,
                                                 const _Tp& __new_value) noexcept(is_nothrow_copy_constructible_v<_Tp>)
      : __old_value_(__old_value)
      , __new_value_(__new_value)
  {}

  template <class _Up>
  [[nodiscard]] _CCCL_DEVICE_API constexpr _Tp operator()(const _Up& __val) const
    noexcept(is_nothrow_copy_constructible_v<_Tp>)
  {
    return __val == __old_value_ ? __new_value_ : static_cast<_Tp>(__val);
  }
};

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _OutputIterator, class _Tp = iter_value_t<_InputIterator>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator> _CCCL_AND
                 is_execution_policy_v<_Policy>)
_CCCL_HOST_API _OutputIterator replace_copy(
  [[maybe_unused]] const _Policy& __policy,
  _InputIterator __first,
  _InputIterator __last,
  _OutputIterator __result,
  const _Tp& __old_value,
  const _Tp& __new_value)
{
  static_assert(__is_cpp17_equality_comparable_v<_Tp, iter_reference_t<_InputIterator>>,
                "cuda::std::replace_copy requires T to be comparable with iter_reference_t<InputIterator>");

  if (__first == __last)
  {
    return __result;
  }

  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__transform, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    return __dispatch(
      __policy,
      ::cuda::std::move(__first),
      ::cuda::std::move(__last),
      ::cuda::std::move(__result),
      __replace_copy_select{__old_value, __new_value});
  }
  else
  {
    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::replace_copy requires at least one selected backend");
    return ::cuda::std::replace_copy(
      ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__result), __old_value, __new_value);
  }
}

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD

#  include <cuda/std/__cccl/epilogue.h>

#endif // !_CCCL_COMPILER(NVRTC)

#endif // _CUDA_STD___PSTL_REPLACE_COPY_H
```
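For intuition: the header above does not implement a dedicated backend for `replace_copy`; it forwards to the parallel `__transform` dispatch, with `__replace_copy_select` supplying the element-wise operation. Here is a serial sketch of that lowering, using standard library names rather than the internal `__pstl` machinery (illustrative only):

```cpp
#include <algorithm>
#include <vector>

// Serial illustration of the lowering performed by the dispatch above:
// replace_copy(first, last, result, old_v, new_v) is a unary transform whose
// element-wise functor maps old_v to new_v and passes everything else through
// (that functor is __replace_copy_select in the header).
std::vector<int> replace_copy_via_transform(const std::vector<int>& in, int old_v, int new_v)
{
  std::vector<int> out(in.size());
  std::transform(in.begin(), in.end(), out.begin(), [&](int v) { return v == old_v ? new_v : v; });
  return out;
}
```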
New file (+120 lines): libcu++ header implementing the parallel `cuda::std::replace_copy_if` overload (header guard `_CUDA_STD___PSTL_REPLACE_COPY_IF_H`).

```cpp
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_REPLACE_COPY_IF_H
#define _CUDA_STD___PSTL_REPLACE_COPY_IF_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#if !_CCCL_COMPILER(NVRTC)

#  include <cuda/std/__algorithm/replace_copy_if.h>
#  include <cuda/std/__concepts/concept_macros.h>
#  include <cuda/std/__execution/policy.h>
#  include <cuda/std/__functional/invoke.h>
#  include <cuda/std/__iterator/concepts.h>
#  include <cuda/std/__iterator/iterator_traits.h>
#  include <cuda/std/__pstl/dispatch.h>
#  include <cuda/std/__type_traits/always_false.h>
#  include <cuda/std/__type_traits/is_execution_policy.h>
#  include <cuda/std/__type_traits/is_nothrow_copy_constructible.h>
#  include <cuda/std/__type_traits/is_nothrow_move_constructible.h>
#  include <cuda/std/__utility/move.h>

#  if _CCCL_HAS_BACKEND_CUDA()
#    include <cuda/std/__pstl/cuda/transform.h>
#  endif // _CCCL_HAS_BACKEND_CUDA()

#  include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD

template <class _UnaryPred, class _Tp>
struct __replace_copy_if_select
{
  _UnaryPred __pred_;
  _Tp __new_value_;

  _CCCL_HOST_API constexpr __replace_copy_if_select(_UnaryPred __pred, const _Tp& __new_value) noexcept(
    is_nothrow_move_constructible_v<_UnaryPred> && is_nothrow_copy_constructible_v<_Tp>)
      : __pred_(__pred)
      , __new_value_(__new_value)
  {}

  template <class _Up>
  [[nodiscard]] _CCCL_DEVICE_API constexpr _Tp operator()(const _Up& __val) const
    noexcept(is_nothrow_invocable_v<const _UnaryPred&, const _Up&> && is_nothrow_copy_constructible_v<_Tp>)
  {
    return ::cuda::std::invoke(__pred_, __val) ? __new_value_ : static_cast<_Tp>(__val);
  }
};

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

_CCCL_TEMPLATE(
  class _Policy, class _InputIterator, class _OutputIterator, class _UnaryPred, class _Tp = iter_value_t<_InputIterator>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator> _CCCL_AND
                 is_execution_policy_v<_Policy>)
_CCCL_HOST_API _OutputIterator replace_copy_if(
  [[maybe_unused]] const _Policy& __policy,
  _InputIterator __first,
  _InputIterator __last,
  _OutputIterator __result,
  _UnaryPred __pred,
  const _Tp& __new_value)
{
  static_assert(indirect_unary_predicate<_UnaryPred, _InputIterator>,
                "cuda::std::replace_copy_if: UnaryPred must satisfy indirect_unary_predicate<InputIterator>");

  if (__first == __last)
  {
    return __result;
  }

  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__transform, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    return __dispatch(
      __policy,
      ::cuda::std::move(__first),
      ::cuda::std::move(__last),
      ::cuda::std::move(__result),
      __replace_copy_if_select{::cuda::std::move(__pred), __new_value});
  }
  else
  {
    static_assert(__always_false_v<_Policy>,
                  "Parallel cuda::std::replace_copy_if requires at least one selected backend");
    return ::cuda::std::replace_copy_if(
      ::cuda::std::move(__first),
      ::cuda::std::move(__last),
      ::cuda::std::move(__result),
      ::cuda::std::move(__pred),
      __new_value);
  }
}

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD

#  include <cuda/std/__cccl/epilogue.h>

#endif // !_CCCL_COMPILER(NVRTC)

#endif // _CUDA_STD___PSTL_REPLACE_COPY_IF_H
```

Review thread on the `__replace_copy_if_select` constructor (diff lines +52 to +56):
- Reviewer: Same here (see the aggregate-initialization comment below).
- Author: Ditto, does not work everywhere.
Review comment on the `__replace_copy_select` constructor:
- Reviewer: Suggestion: just use aggregate init and drop the ctor.
- Author: This does not work in all CTK versions supported.
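For context, a sketch of the two initialization styles being discussed, using a hypothetical simplified selector outside the `_CCCL` macro machinery. The PR keeps the user-declared constructor because, per the author, the aggregate form does not build with every supported CUDA Toolkit; one plausible reason (my assumption, not stated in the thread) is that the call sites rely on class template argument deduction, which works through a constructor in C++17 but needs C++20-level support for aggregates.

```cpp
// Hypothetical simplified selectors used only to illustrate the review discussion.
template <class T>
struct select_with_ctor
{
  T old_value;
  T new_value;

  // User-declared constructor: the approach the PR keeps.
  constexpr select_with_ctor(const T& o, const T& n) : old_value(o), new_value(n) {}

  constexpr T operator()(const T& v) const { return v == old_value ? new_value : v; }
};

template <class T>
struct select_aggregate
{
  T old_value;
  T new_value;

  // No constructor: the type is an aggregate, as the reviewer suggested.
  constexpr T operator()(const T& v) const { return v == old_value ? new_value : v; }
};

int main()
{
  // Deduction through the constructor works in C++17.
  auto a = select_with_ctor{42, 1337};

  // The aggregate form needs explicit template arguments here...
  auto b = select_aggregate<int>{42, 1337};
  // ...or C++20 aggregate CTAD (`select_aggregate{42, 1337}`), which older
  // toolchains may reject.

  return a(42) == 1337 && b(7) == 7 ? 0 : 1;
}
```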