From 99ac5ffcbfbf1b032774007601fba90593699d39 Mon Sep 17 00:00:00 2001 From: David Bayer Date: Tue, 2 Jun 2026 12:32:12 +0200 Subject: [PATCH] [libcu++] Replace `cudaStreamPerThread` with `cudaStream_t{}` in PSTL algorithms --- libcudacxx/include/cuda/std/__pstl/cuda/shift_left.h | 2 +- libcudacxx/include/cuda/std/__pstl/cuda/shift_right.h | 2 +- libcudacxx/include/cuda/std/__pstl/cuda/sort.h | 4 ++-- .../include/cuda/std/__pstl/cuda/stable_partition.h | 2 +- .../include/cuda/std/__pstl/cuda/temporary_storage.h | 2 +- .../execution_policy/get_memory_resource.pass.cpp | 8 ++++---- .../cuda/execution/execution_policy/get_stream.pass.cpp | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/shift_left.h b/libcudacxx/include/cuda/std/__pstl/cuda/shift_left.h index 96c53ce3f0a..43673ae4a3a 100644 --- a/libcudacxx/include/cuda/std/__pstl/cuda/shift_left.h +++ b/libcudacxx/include/cuda/std/__pstl/cuda/shift_left.h @@ -84,7 +84,7 @@ struct __pstl_dispatch<__pstl_algorithm::__shift_left, __execution_backend::__cu auto __flag_iter = ::cuda::transform_iterator{ ::cuda::counting_iterator{0}, __shift_left_predicate{static_cast(__num_shifted)}}; - auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy); + auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{::cudaStream_t{}}, __policy); // Determine temporary device storage requirements for DeviceSelect::Flagged size_t __num_bytes = 0; diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/shift_right.h b/libcudacxx/include/cuda/std/__pstl/cuda/shift_right.h index 64d591dda03..87c41fec53e 100644 --- a/libcudacxx/include/cuda/std/__pstl/cuda/shift_right.h +++ b/libcudacxx/include/cuda/std/__pstl/cuda/shift_right.h @@ -78,7 +78,7 @@ struct __pstl_dispatch<__pstl_algorithm::__shift_right, __execution_backend::__c const auto __count_remaining = static_cast<_OffsetType>(__count - __num_shifted); const auto
__result = __first + __num_shifted; - auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy); + auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{::cudaStream_t{}}, __policy); if (2 * __num_shifted > __count) { // There is no overlap between the source and destination, so we can just copy diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/sort.h b/libcudacxx/include/cuda/std/__pstl/cuda/sort.h index f4c17a2d964..bd095403a76 100644 --- a/libcudacxx/include/cuda/std/__pstl/cuda/sort.h +++ b/libcudacxx/include/cuda/std/__pstl/cuda/sort.h @@ -87,7 +87,7 @@ struct __pstl_dispatch<__pstl_algorithm::__sort, __execution_backend::__cuda> _CCCL_HOST_API static void __radix_sort_impl(const _Policy& __policy, _Tp* __first, _Tp* __last, _BinaryPredicate) { const auto __count = static_cast(::cuda::std::distance(__first, __last)); - auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy); + auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{::cudaStream_t{}}, __policy); CUB_NS_QUALIFIER::DoubleBuffer<_Tp> __buffer{__first, nullptr}; @@ -145,7 +145,7 @@ struct __pstl_dispatch<__pstl_algorithm::__sort, __execution_backend::__cuda> __merge_sort_impl(const _Policy& __policy, _InputIterator __first, _InputIterator __last, _BinaryPredicate __pred) { const auto __count = ::cuda::std::distance(__first, __last); - auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy); + auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{::cudaStream_t{}}, __policy); // Run the kernel _CCCL_TRY_CUDA_API( diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/stable_partition.h b/libcudacxx/include/cuda/std/__pstl/cuda/stable_partition.h index 369d7f3fc06..31213b68c49 100644 --- a/libcudacxx/include/cuda/std/__pstl/cuda/stable_partition.h +++ 
b/libcudacxx/include/cuda/std/__pstl/cuda/stable_partition.h @@ -71,7 +71,7 @@ struct __pstl_dispatch<__pstl_algorithm::__stable_partition, __execution_backend _OffsetType __num_selected; const auto __count = static_cast<_OffsetType>(::cuda::std::distance(__first, __last)); - auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy); + auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{::cudaStream_t{}}, __policy); // Determine temporary device storage requirements for device_stable_partition size_t __num_bytes = 0; diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/temporary_storage.h b/libcudacxx/include/cuda/std/__pstl/cuda/temporary_storage.h index 4615fd6d76a..675b2a7e179 100644 --- a/libcudacxx/include/cuda/std/__pstl/cuda/temporary_storage.h +++ b/libcudacxx/include/cuda/std/__pstl/cuda/temporary_storage.h @@ -150,7 +150,7 @@ class __temporary_storage _CCCL_REQUIRES((sizeof...(_Sizes) == sizeof...(_StoredTypes))) _CCCL_HOST_API __temporary_storage(const _Policy& __policy, const size_t __num_bytes_storage, const _Sizes... 
__elements_stored) - : __stream_(::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy)) + : __stream_(::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{::cudaStream_t{}}, __policy)) , __resource_(__get_memory_resource_or(__policy)) , __total_bytes_allocated_(__get_total_bytes_allocated(__num_bytes_storage, __elements_stored...)) , __storage_(__get_storage( diff --git a/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_memory_resource.pass.cpp b/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_memory_resource.pass.cpp index 134d17553ca..c1ebc772624 100644 --- a/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_memory_resource.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_memory_resource.pass.cpp @@ -67,7 +67,7 @@ static_assert(::cuda::mr::resource); template void test(Policy pol) { - auto old_stream = cuda::__call_or(::cuda::get_stream, cuda::stream_ref{cudaStreamPerThread}, pol); + auto old_stream = cuda::__call_or(::cuda::get_stream, cuda::stream_ref{::cudaStream_t{}}, pol); auto fallback_resource = ::cuda::device_default_memory_pool(cuda::device_ref{0}); { // Ensure that the plain policy is not callable with get_memory_resource assert(cuda::__call_or(::cuda::mr::get_memory_resource, fallback_resource, pol) == fallback_resource); @@ -77,7 +77,7 @@ void test(Policy pol) test_resource resource{42}; auto pol_with_resource = pol.with(cuda::mr::get_memory_resource, resource); assert(cuda::mr::get_memory_resource(pol_with_resource) == resource); - assert(cuda::__call_or(::cuda::get_stream, cuda::stream_ref{cudaStreamPerThread}, pol_with_resource) == old_stream); + assert(cuda::__call_or(::cuda::get_stream, cuda::stream_ref{::cudaStream_t{}}, pol_with_resource) == old_stream); using policy_t = decltype(pol_with_resource); static_assert(cuda::std::is_execution_policy_v); @@ -87,7 +87,7 @@ void test(Policy pol) test_resource resource{42}; auto pol_with_resource 
= pol.with(cuda::mr::get_memory_resource, resource); assert(cuda::mr::get_memory_resource(pol_with_resource) == resource); - assert(cuda::__call_or(::cuda::get_stream, cuda::stream_ref{cudaStreamPerThread}, pol_with_resource) == old_stream); + assert(cuda::__call_or(::cuda::get_stream, cuda::stream_ref{::cudaStream_t{}}, pol_with_resource) == old_stream); test_resource other_resource{1337}; decltype(auto) pol_with_other_resource = pol_with_resource.with(cuda::mr::get_memory_resource, other_resource); @@ -95,7 +95,7 @@ void test(Policy pol) // The original resource is unchanged assert(cuda::mr::get_memory_resource(pol_with_resource) == resource); assert(cuda::mr::get_memory_resource(pol_with_other_resource) == other_resource); - assert(cuda::__call_or(::cuda::get_stream, cuda::stream_ref{cudaStreamPerThread}, pol_with_resource) == old_stream); + assert(cuda::__call_or(::cuda::get_stream, cuda::stream_ref{::cudaStream_t{}}, pol_with_resource) == old_stream); } } diff --git a/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_stream.pass.cpp b/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_stream.pass.cpp index 8df49a0e9ec..f1137b98643 100644 --- a/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_stream.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/execution/execution_policy/get_stream.pass.cpp @@ -30,7 +30,7 @@ void test(Policy pol) { namespace execution = cuda::std::execution; - cuda::stream_ref default_stream{cudaStreamPerThread}; + cuda::stream_ref default_stream{::cudaStream_t{}}; { // Ensure that the plain policy does not provide a stream assert(cuda::__call_or(::cuda::get_stream, default_stream, pol) == default_stream); }