44 changes: 44 additions & 0 deletions libcudacxx/benchmarks/bench/replace_copy/basic.cu
@@ -0,0 +1,44 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/stream_ref>

#include "nvbench_helper.cuh"

template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> in = generate(elements, bit_entropy::_1_000, T{0}, T{42});
  thrust::device_vector<T> out(elements, thrust::no_init);

  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements);
  state.add_global_memory_writes<T>(elements);

  caching_allocator_t alloc{};
  auto policy = cuda::execution::__cub_par_unseq.with_memory_resource(alloc);

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               cuda::std::replace_copy(
                 policy.with_stream(launch.get_stream().get_stream()), in.begin(), in.end(), out.begin(), 42, 1337);
             });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
53 changes: 53 additions & 0 deletions libcudacxx/benchmarks/bench/replace_copy_if/basic.cu
@@ -0,0 +1,53 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/stream_ref>

#include "nvbench_helper.cuh"

struct equal_to_42
{
  template <class T>
  __device__ constexpr bool operator()(const T& val) const noexcept
  {
    return val == static_cast<T>(42);
  }
};

template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> in = generate(elements, bit_entropy::_1_000, T{0}, T{42});
  thrust::device_vector<T> out(elements, thrust::no_init);

  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements);
  state.add_global_memory_writes<T>(elements);

  caching_allocator_t alloc{};
  auto policy = cuda::execution::__cub_par_unseq.with_memory_resource(alloc);

  state.exec(
    nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
      cuda::std::replace_copy_if(
        policy.with_stream(launch.get_stream().get_stream()), in.begin(), in.end(), out.begin(), equal_to_42{}, 1337);
    });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
114 changes: 114 additions & 0 deletions libcudacxx/include/cuda/std/__pstl/replace_copy.h
@@ -0,0 +1,114 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_REPLACE_COPY_H
#define _CUDA_STD___PSTL_REPLACE_COPY_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if !_CCCL_COMPILER(NVRTC)

# include <cuda/std/__algorithm/replace_copy.h>
# include <cuda/std/__concepts/concept_macros.h>
# include <cuda/std/__execution/policy.h>
# include <cuda/std/__iterator/concepts.h>
# include <cuda/std/__iterator/iterator_traits.h>
# include <cuda/std/__pstl/dispatch.h>
# include <cuda/std/__type_traits/always_false.h>
# include <cuda/std/__type_traits/is_comparable.h>
# include <cuda/std/__type_traits/is_execution_policy.h>
# include <cuda/std/__type_traits/is_nothrow_copy_constructible.h>
# include <cuda/std/__utility/move.h>

# if _CCCL_HAS_BACKEND_CUDA()
# include <cuda/std/__pstl/cuda/transform.h>
# endif // _CCCL_HAS_BACKEND_CUDA()

# include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD

template <class _Tp>
struct __replace_copy_select
{
  _Tp __old_value_;
  _Tp __new_value_;

  _CCCL_HOST_API constexpr __replace_copy_select(const _Tp& __old_value,
                                                 const _Tp& __new_value) noexcept(is_nothrow_copy_constructible_v<_Tp>)
      : __old_value_(__old_value)
      , __new_value_(__new_value)
  {}
Comment on lines +52 to +56
Contributor: Suggestion: just use aggregate init and drop the ctor.
Contributor Author: This does not work in all CTK versions supported.

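A minimal sketch of one plausible reading of this exchange (my interpretation, not stated in the PR): the call site constructs the functor as __replace_copy_select{__old_value, __new_value} and relies on class template argument deduction; with a plain aggregate, that deduction would need CTAD from aggregates, a C++20 feature that older host compilers shipped with supported CTK versions may not implement, whereas deduction through a user-provided constructor works in C++17. Names below are hypothetical.

// Hypothetical host-only illustration, not part of the PR.
template <class T>
struct select_aggregate // aggregate: no user-declared constructor
{
  T old_value;
  T new_value;
};

template <class T>
struct select_with_ctor // keeps an explicit constructor, as the PR does
{
  T old_value;
  T new_value;
  constexpr select_with_ctor(const T& o, const T& n)
      : old_value(o)
      , new_value(n)
  {}
};

int main()
{
  auto portable = select_with_ctor{1, 2}; // T deduced via the constructor: valid C++17
  // auto maybe_broken = select_aggregate{1, 2}; // needs CTAD from aggregates (C++20)
  return (portable.old_value + portable.new_value == 3) ? 0 : 1;
}
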
  template <class _Up>
  [[nodiscard]] _CCCL_DEVICE_API constexpr _Tp operator()(const _Up& __val) const
    noexcept(is_nothrow_copy_constructible_v<_Tp>)
  {
    return __val == __old_value_ ? __new_value_ : static_cast<_Tp>(__val);
  }
};

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _OutputIterator, class _Tp = iter_value_t<_InputIterator>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator> _CCCL_AND
                 is_execution_policy_v<_Policy>)
_CCCL_HOST_API _OutputIterator replace_copy(
  [[maybe_unused]] const _Policy& __policy,
  _InputIterator __first,
  _InputIterator __last,
  _OutputIterator __result,
  const _Tp& __old_value,
  const _Tp& __new_value)
{
  static_assert(__is_cpp17_equality_comparable_v<_Tp, iter_reference_t<_InputIterator>>,
                "cuda::std::replace_copy requires T to be comparable with iter_reference_t<InputIterator>");

  if (__first == __last)
  {
    return __result;
  }

  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__transform, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    return __dispatch(
      __policy,
      ::cuda::std::move(__first),
      ::cuda::std::move(__last),
      ::cuda::std::move(__result),
      __replace_copy_select{__old_value, __new_value});
  }
  else
  {
    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::replace_copy requires at least one selected backend");
    return ::cuda::std::replace_copy(
      ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__result), __old_value, __new_value);
  }
}

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD

# include <cuda/std/__cccl/epilogue.h>

#endif // !_CCCL_COMPILER(NVRTC)

#endif // _CUDA_STD___PSTL_REPLACE_COPY_H
120 changes: 120 additions & 0 deletions libcudacxx/include/cuda/std/__pstl/replace_copy_if.h
@@ -0,0 +1,120 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_REPLACE_COPY_IF_H
#define _CUDA_STD___PSTL_REPLACE_COPY_IF_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if !_CCCL_COMPILER(NVRTC)

# include <cuda/std/__algorithm/replace_copy_if.h>
# include <cuda/std/__concepts/concept_macros.h>
# include <cuda/std/__execution/policy.h>
# include <cuda/std/__functional/invoke.h>
# include <cuda/std/__iterator/concepts.h>
# include <cuda/std/__iterator/iterator_traits.h>
# include <cuda/std/__pstl/dispatch.h>
# include <cuda/std/__type_traits/always_false.h>
# include <cuda/std/__type_traits/is_execution_policy.h>
# include <cuda/std/__type_traits/is_nothrow_copy_constructible.h>
# include <cuda/std/__type_traits/is_nothrow_move_constructible.h>
# include <cuda/std/__utility/move.h>

# if _CCCL_HAS_BACKEND_CUDA()
# include <cuda/std/__pstl/cuda/transform.h>
# endif // _CCCL_HAS_BACKEND_CUDA()

# include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD

template <class _UnaryPred, class _Tp>
struct __replace_copy_if_select
{
  _UnaryPred __pred_;
  _Tp __new_value_;

  _CCCL_HOST_API constexpr __replace_copy_if_select(_UnaryPred __pred, const _Tp& __new_value) noexcept(
    is_nothrow_move_constructible_v<_UnaryPred> && is_nothrow_copy_constructible_v<_Tp>)
      : __pred_(__pred)
      , __new_value_(__new_value)
  {}
Comment on lines +52 to +56
Contributor: Same here.
Contributor Author: Ditto, does not work everywhere.

  template <class _Up>
  [[nodiscard]] _CCCL_DEVICE_API constexpr _Tp operator()(const _Up& __val) const
    noexcept(is_nothrow_invocable_v<const _UnaryPred&, const _Up&> && is_nothrow_copy_constructible_v<_Tp>)
  {
    return ::cuda::std::invoke(__pred_, __val) ? __new_value_ : static_cast<_Tp>(__val);
  }
};

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

_CCCL_TEMPLATE(
  class _Policy, class _InputIterator, class _OutputIterator, class _UnaryPred, class _Tp = iter_value_t<_InputIterator>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator> _CCCL_AND
                 is_execution_policy_v<_Policy>)
_CCCL_HOST_API _OutputIterator replace_copy_if(
  [[maybe_unused]] const _Policy& __policy,
  _InputIterator __first,
  _InputIterator __last,
  _OutputIterator __result,
  _UnaryPred __pred,
  const _Tp& __new_value)
{
  static_assert(indirect_unary_predicate<_UnaryPred, _InputIterator>,
                "cuda::std::replace_copy_if: UnaryPred must satisfy indirect_unary_predicate<InputIterator>");

  if (__first == __last)
  {
    return __result;
  }

  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__transform, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    return __dispatch(
      __policy,
      ::cuda::std::move(__first),
      ::cuda::std::move(__last),
      ::cuda::std::move(__result),
      __replace_copy_if_select{::cuda::std::move(__pred), __new_value});
  }
  else
  {
    static_assert(__always_false_v<_Policy>,
                  "Parallel cuda::std::replace_copy_if requires at least one selected backend");
    return ::cuda::std::replace_copy_if(
      ::cuda::std::move(__first),
      ::cuda::std::move(__last),
      ::cuda::std::move(__result),
      ::cuda::std::move(__pred),
      __new_value);
  }
}

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD

# include <cuda/std/__cccl/epilogue.h>

#endif // !_CCCL_COMPILER(NVRTC)

#endif // _CUDA_STD___PSTL_REPLACE_COPY_IF_H
2 changes: 2 additions & 0 deletions libcudacxx/include/cuda/std/__pstl_algorithm
@@ -25,6 +25,8 @@
#include <cuda/std/__pstl/for_each_n.h>
#include <cuda/std/__pstl/reduce.h>
#include <cuda/std/__pstl/replace.h>
#include <cuda/std/__pstl/replace_copy.h>
#include <cuda/std/__pstl/replace_copy_if.h>
#include <cuda/std/__pstl/replace_if.h>
#include <cuda/std/__pstl/transform.h>
