From 1d796e4df296c207176163e116e19a86ca1610ee Mon Sep 17 00:00:00 2001 From: edenfunf Date: Wed, 15 Apr 2026 10:17:58 +0800 Subject: [PATCH 1/3] [thrust] single-pass is_partitioned via zip_iterator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous implementation required two kernel launches: find_if_not to locate the partition boundary, then find_if to check whether any true element follows it. Replace this with a single find_if over adjacent element pairs (a[i], a[i+1]) built with cuda::make_zip_iterator. The predicate detects a "false → true" transition that violates the partitioning invariant. Using zip_iterator::operator- (which returns the minimum component distance) the synthetic range has n-1 elements, so only one kernel launch is needed regardless of input. Edge-case handling: - Empty range: early return true (avoids computing first+1). - n==1: zip distance is min(1,0)=0, find_if_n returns first_zip; get<1>(first_zip) == last is true, which is correct. Pattern mirrors cuda::std::is_partitioned in libcudacxx/include/cuda/std/__pstl/is_partitioned.h. Fixes #8085 --- thrust/thrust/system/cuda/detail/partition.h | 38 ++++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/thrust/thrust/system/cuda/detail/partition.h b/thrust/thrust/system/cuda/detail/partition.h index 96e7a536b0c..5ed452f9e7d 100644 --- a/thrust/thrust/system/cuda/detail/partition.h +++ b/thrust/thrust/system/cuda/detail/partition.h @@ -30,9 +30,11 @@ # include # include +# include # include # include # include +# include THRUST_NAMESPACE_BEGIN namespace cuda_cub @@ -365,13 +367,43 @@ stable_partition(execution_policy& policy, Iterator first, Iterator las return ret; } +// Functor for the single-pass is_partitioned check. +// Returns true for an adjacent pair (a[i], a[i+1]) where pred(a[i]) is false +// and pred(a[i+1]) is true — i.e., a "false → true" transition that violates +// the partitioning invariant. +template +struct __is_partitioned_fn +{ + Predicate pred_; + + template + [[nodiscard]] _CCCL_HOST_DEVICE bool operator()(const Tuple& tuple) const + { + return !pred_(::cuda::std::get<0>(tuple)) && pred_(::cuda::std::get<1>(tuple)); + } +}; + +// Single-pass implementation: zip adjacent elements and find any "false → true" +// transition. Two-pass (find_if_not + find_if) required two kernel launches; +// this approach uses one find_if over (a[i], a[i+1]) pairs, cutting kernel +// launch overhead roughly in half for typical inputs. +// See: https://github.com/NVIDIA/cccl/issues/8085 template bool _CCCL_HOST_DEVICE is_partitioned(execution_policy& policy, ItemsIt first, ItemsIt last, Predicate predicate) { - ItemsIt boundary = cuda_cub::find_if_not(policy, first, last, predicate); - ItemsIt end = cuda_cub::find_if(policy, boundary, last, predicate); - return end == last; + if (first == last) + { + return true; + } + // Build a range of adjacent pairs: (a[0],a[1]), (a[1],a[2]), ..., (a[n-2],a[n-1]). + // The distance of this zip range is min(n, n-1) = n-1 (via zip_iterator::operator-). + const auto first_zip = ::cuda::make_zip_iterator(first, first + 1); + const auto last_zip = ::cuda::make_zip_iterator(last, last); + const auto result = cuda_cub::find_if(policy, first_zip, last_zip, __is_partitioned_fn{predicate}); + // Checking get<1>(result) == last (rather than result == last_zip) correctly + // handles the n==1 edge case where find_if_n returns first_zip (num_items==0). + return ::cuda::std::get<1>(result.__iterators()) == last; } } // namespace cuda_cub THRUST_NAMESPACE_END From ced0bbc81741215985f774b738b93f3880c67755 Mon Sep 17 00:00:00 2001 From: CCCL Fix Date: Wed, 15 Apr 2026 19:42:22 +0800 Subject: [PATCH 2/3] [thrust] fix __is_partitioned_fn for proxy-reference iterators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When iterating over thrust::device_vector, dereferencing yields device_reference rather than T. Applying a predicate to a device_reference and then applying operator! to the result fails to compile: converting device_reference to bool requires two user-defined conversions (device_reference → T → void*), which C++ forbids in a single implicit conversion sequence. Fix by wrapping each tuple element with thrust::raw_reference_cast before passing to the predicate, and assigning the result to an intermediate bool. This matches the pattern used elsewhere in Thrust (internal_functional.h, head_flags.h) for the same proxy- reference problem, and mirrors the intent of the PSTL reference implementation which stores pred results in const bool locals. Verified: all 14 tests pass on RTX 5070 / CUDA 12.9 (sm_89): thrust.test.is_partitioned 8/8 pass thrust.test.cuda.is_partitioned.cdp_0 3/3 pass thrust.test.cuda.is_partitioned.cdp_1 3/3 pass --- thrust/thrust/system/cuda/detail/partition.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/thrust/thrust/system/cuda/detail/partition.h b/thrust/thrust/system/cuda/detail/partition.h index 5ed452f9e7d..bf4e2971758 100644 --- a/thrust/thrust/system/cuda/detail/partition.h +++ b/thrust/thrust/system/cuda/detail/partition.h @@ -35,6 +35,7 @@ # include # include # include +# include THRUST_NAMESPACE_BEGIN namespace cuda_cub @@ -379,7 +380,9 @@ struct __is_partitioned_fn template [[nodiscard]] _CCCL_HOST_DEVICE bool operator()(const Tuple& tuple) const { - return !pred_(::cuda::std::get<0>(tuple)) && pred_(::cuda::std::get<1>(tuple)); + const bool lhs = pred_(thrust::raw_reference_cast(::cuda::std::get<0>(tuple))); + const bool rhs = pred_(thrust::raw_reference_cast(::cuda::std::get<1>(tuple))); + return !lhs && rhs; } }; From b95ee9e2e201f5088064daa4189cf2bad3363a4a Mon Sep 17 00:00:00 2001 From: CCCL Fix Date: Wed, 15 Apr 2026 19:47:05 +0800 Subject: [PATCH 3/3] style: clang-format partition.h --- thrust/thrust/system/cuda/detail/partition.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thrust/thrust/system/cuda/detail/partition.h b/thrust/thrust/system/cuda/detail/partition.h index bf4e2971758..7db28add558 100644 --- a/thrust/thrust/system/cuda/detail/partition.h +++ b/thrust/thrust/system/cuda/detail/partition.h @@ -21,6 +21,7 @@ # include # include +# include # include # include # include @@ -35,7 +36,6 @@ # include # include # include -# include THRUST_NAMESPACE_BEGIN namespace cuda_cub