Support fancy iterators in vectorized transform and port thrust::tabulate to it #6012

miscco · 2025-09-28T15:39:31Z

We should really pull that out into a function

I am strongly considering to refactor the entire mess, so let's postpone any small fixes for now.

miscco · 2025-09-29T05:35:56Z

We should have a trait for that, I guess it will come up more often

miscco · 2025-09-26T06:50:06Z

🙀 I believe those warrant a slightly more elaborate comment

I think this needs a bigger refactoring in general

-Original file line number
+Diff line change
@@ Expand Up / @@ -202,14 +202,18 @@ _CCCL_HOST_DEVICE _CCCL_CONSTEVAL auto load_store_type() @@
       }
     }
-    template <typename VectorizedPolicy, typename Offset, typename F, typename RandomAccessIteratorOut, typename... InputT>
+    template <typename VectorizedPolicy,
+              typename Offset,
+              typename F,
+              typename RandomAccessIteratorOut,
+              typename... RandomAccessIteratorsIn>
     _CCCL_DEVICE void transform_kernel_vectorized(
       Offset num_items,
       int num_elem_per_thread_prefetch,
       bool can_vectorize,
       F f,
       RandomAccessIteratorOut out,
-      const InputT*... ins)
+      RandomAccessIteratorsIn... ins)
     {
       constexpr int block_dim        = VectorizedPolicy::block_threads;
       constexpr int items_per_thread = VectorizedPolicy::items_per_thread_vectorized;
@@ Expand Down Expand Up / @@ -240,9 +244,12 @@ _CCCL_DEVICE void transform_kernel_vectorized( @@
       constexpr int load_store_size = VectorizedPolicy::load_store_word_size;
       using load_store_t            = decltype(load_store_type<load_store_size>());
       using output_t                = it_value_t<RandomAccessIteratorOut>;
-      using result_t                = ::cuda::std::decay_t<::cuda::std::invoke_result_t<F, const InputT&...>>;
+      using result_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<F, const it_value_t<RandomAccessIteratorsIn>&...>>;
       // picks output type size if there are no inputs
-      constexpr int element_size     = int{first_item(sizeof(InputT)..., size_of<output_t>)};
+      constexpr int element_size     = int{first_nonzero_value(
+        (sizeof(it_value_t<RandomAccessIteratorsIn>)
+         * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
+        size_of<output_t>)};
       constexpr int load_store_count = (items_per_thread * element_size) / load_store_size;
       static_assert((items_per_thread * element_size) % load_store_size == 0);
@@ Expand All / @@ -258,18 +265,35 @@ _CCCL_DEVICE void transform_kernel_vectorized( @@
       auto provide_array = [&](auto... inputs) {
         // load inputs
-        // TODO(bgruber): we could support fancy iterators for loading here as well (and only vectorize some inputs)
-        [[maybe_unused]] auto load_tile_vectorized = [&](auto* in, auto& input) {
-          auto in_vec    = reinterpret_cast<const load_store_t*>(in);
-          auto input_vec = reinterpret_cast<load_store_t*>(input.data());
-          _CCCL_PRAGMA_UNROLL_FULL()
-          for (int i = 0; i < load_store_count; ++i)
+        [[maybe_unused]] auto load_tile = [](auto in, auto& input) {
+          if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<decltype(in)>)
           {
-            input_vec[i] = in_vec[i * VectorizedPolicy::block_threads + threadIdx.x];
+            auto in_vec    = reinterpret_cast<const load_store_t*>(in) + threadIdx.x;
+            auto input_vec = reinterpret_cast<load_store_t*>(input.data());
+            _CCCL_PRAGMA_UNROLL_FULL()
+            for (int i = 0; i < load_store_count; ++i)
+            {
+              input_vec[i] = in_vec[i * VectorizedPolicy::block_threads];
+            }
+          }
+          else
+          {
+            constexpr int elems = load_store_size / element_size;
+            in += threadIdx.x * elems;
+            _CCCL_PRAGMA_UNROLL_FULL()
+            for (int i = 0; i < load_store_count; ++i)
+            {
+              _CCCL_PRAGMA_UNROLL_FULL()
+              for (int j = 0; j < elems; ++j)
+              {
+                input[i * elems + j] = in[i * elems * VectorizedPolicy::block_threads + j];
+              }
+            }
           }
         };
         _CCCL_PDL_GRID_DEPENDENCY_SYNC();
-        (load_tile_vectorized(ins, inputs), ...);
+        (load_tile(ins, inputs), ...);
         // Benchmarks showed up to 38% slowdown on H200 (some improvements as well), so omitted. See #5249 for details.
         // _CCCL_PDL_TRIGGER_NEXT_LAUNCH();
@@ Expand All / @@ -280,7 +304,7 @@ _CCCL_DEVICE void transform_kernel_vectorized( @@
           output[i] = f(inputs[i]...);
         }
       };
-      provide_array(uninitialized_array<InputT, items_per_thread>{}...);
+      provide_array(uninitialized_array<it_value_t<RandomAccessIteratorsIn>, items_per_thread>{}...);
       // write output
       if constexpr (can_vectorize_store)
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up @@
       return 12 * 1024; // V100 and below
     }
-    template <typename T, typename... Ts>
-    _CCCL_HOST_DEVICE constexpr bool all_equal([[maybe_unused]] T head, Ts... tail)
+    template <typename H, typename... Ts>
+    _CCCL_HOST_DEVICE constexpr bool all_nonzero_equal(H head, Ts... values)
     {
-      return ((head == tail) && ...);
+      size_t first = 0;
+      for (size_t v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
+      {
+        if (v == 0)
+        {
+          continue;
+        }
+        if (first == 0)
+        {
+          first = v;
+        }
+        else if (v != first)
+        {
+          return false;
+        }
+      }
+      return true;
     }
-    _CCCL_HOST_DEVICE constexpr bool all_equal()
+    _CCCL_HOST_DEVICE constexpr bool all_nonzero_equal()
     {
       return true;
     }
-    template <typename T, typename... Ts>
-    _CCCL_HOST_DEVICE constexpr auto first_item(T head, Ts...) -> T
+    template <typename H, typename... Ts>
+    _CCCL_HOST_DEVICE constexpr auto first_nonzero_value(H head, Ts... values)
     {
-      return head;
+      for (auto v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
+      {
+        if (v != 0)
+        {
+          return v;
+        }
+      }
+      // we only reach here when all input are not contiguous and the output has a void value type
+      return H{1};
     }
     template <typename T>
@@ Expand Down Expand Up / @@ -336,25 +360,36 @@ struct policy_hub<RequiresStableAddress, @@
         (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn> && ...);
       static constexpr bool all_input_values_trivially_reloc =
         (THRUST_NS_QUALIFIER::is_trivially_relocatable_v<it_value_t<RandomAccessIteratorsIn>> && ...);
-      static constexpr bool can_memcpy_inputs = all_inputs_contiguous && all_input_values_trivially_reloc;
+      static constexpr bool can_memcpy_all_inputs = all_inputs_contiguous && all_input_values_trivially_reloc;
+      // the vectorized kernel supports mixing contiguous and non-contiguous iterators
+      static constexpr bool can_memcpy_contiguous_inputs =
+        ((!THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>
+          || THRUST_NS_QUALIFIER::is_trivially_relocatable_v<it_value_t<RandomAccessIteratorsIn>>)
+         && ...);
       // for vectorized policy:
-      static constexpr bool all_input_values_same_size = all_equal(sizeof(it_value_t<RandomAccessIteratorsIn>)...);
-      static constexpr int load_store_word_size        = 8; // TODO(bgruber): make this 16, and 32 on Blackwell+
-      // if there are no inputs, we take the size of the output value
-      static constexpr int value_type_size =
-        first_item(int{sizeof(it_value_t<RandomAccessIteratorsIn>)}..., int{size_of<it_value_t<RandomAccessIteratorOut>>});
+      static constexpr bool all_contiguous_input_values_same_size = all_nonzero_equal(
+        (sizeof(it_value_t<RandomAccessIteratorsIn>)
+         * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...);
+      static constexpr int load_store_word_size = 8; // TODO(bgruber): make this 16, and 32 on Blackwell+
+      // find the value type size of the first contiguous iterator. if there are no inputs, we take the size of the output
+      // value type
+      static constexpr int contiguous_value_type_size = first_nonzero_value(
+        (int{sizeof(it_value_t<RandomAccessIteratorsIn>)}
+         * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
+        int{size_of<it_value_t<RandomAccessIteratorOut>>});
       static constexpr bool value_type_divides_load_store_size =
-        load_store_word_size % value_type_size == 0; // implicitly checks that value_type_size <= load_store_word_size
+        load_store_word_size % contiguous_value_type_size == 0; // implicitly checks that value_type_size <=
+                                                                // load_store_word_size
       static constexpr int target_bytes_per_thread =
         no_input_streams ? 16 /* by experiment on RTX 5090 */ : 32 /* guestimate by gevtushenko for loading */;
       static constexpr int items_per_thread_vec =
-        ::cuda::round_up(target_bytes_per_thread, load_store_word_size) / value_type_size;
+        ::cuda::round_up(target_bytes_per_thread, load_store_word_size) / contiguous_value_type_size;
       using default_vectorized_policy_t = vectorized_policy_t<256, items_per_thread_vec, load_store_word_size>;
       static constexpr bool fallback_to_prefetch =
-        RequiresStableAddress || !can_memcpy_inputs || !all_input_values_same_size || !value_type_divides_load_store_size
-        || !DenseOutput;
+        RequiresStableAddress || !can_memcpy_contiguous_inputs || !all_contiguous_input_values_same_size
+        || !value_type_divides_load_store_size || !DenseOutput;
       // TODO(bgruber): consider a separate kernel for just filling
@@ Expand All / @@ -380,7 +415,7 @@ struct policy_hub<RequiresStableAddress, @@
             block_threads* async_policy::min_items_per_thread,
             ldgsts_size_and_align)
           > int{max_smem_per_block};
-        static constexpr bool fallback_to_vectorized = exhaust_smem || no_input_streams;
+        static constexpr bool fallback_to_vectorized = exhaust_smem || no_input_streams || !can_memcpy_all_inputs;
       public:
         static constexpr int min_bif = arch_to_min_bytes_in_flight(800);
@@ Expand Down Expand Up / @@ -421,7 +456,8 @@ struct policy_hub<RequiresStableAddress, @@
           (((int{sizeof(it_value_t<RandomAccessIteratorsIn>)} * AsyncBlockSize) % max_alignment == 0) && ...);
         static constexpr bool enough_threads_for_peeling = AsyncBlockSize >= alignment; // head and tail bytes
         static constexpr bool fallback_to_vectorized =
-          exhaust_smem || !tile_sizes_retain_alignment || !enough_threads_for_peeling || no_input_streams;
+          exhaust_smem || !tile_sizes_retain_alignment || !enough_threads_for_peeling || no_input_streams
+          || !can_memcpy_all_inputs;
       public:
         static constexpr int min_bif = arch_to_min_bytes_in_flight(PtxVersion);
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -27,12 +27,36 @@ @@
     #include <thrust/device_vector.h>
     #include <thrust/execution_policy.h>
+    #include <thrust/sequence.h>
     #include <thrust/tabulate.h>
     #include <nvbench_helper.cuh>
     #include "thrust/detail/raw_pointer_cast.h"
+    template <typename T>
+    static void sequence(nvbench::state& state, nvbench::type_list<T>)
+    {
+      const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));
+      thrust::device_vector<T> output(elements, thrust::no_init);
+      state.add_element_count(elements);
+      state.add_global_memory_writes<T>(elements);
+      caching_allocator_t alloc;
+      state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
+                 [&](nvbench::launch& launch) {
+                   // sequence is implemented via thrust::tabulate
+                   thrust::sequence(policy(alloc, launch), output.begin(), output.end());
+                 });
+    }
+    NVBENCH_BENCH_TYPES(sequence, NVBENCH_TYPE_AXES(integral_types))
+      .set_name("sequence")
+      .set_type_axes_names({"T{ct}"})
+      .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
     template <class T>
     struct seg_size_t
     {
@@ Expand All / @@ -41,17 +65,17 @@ struct seg_size_t @@
       template <class OffsetT>
       __device__ T operator()(OffsetT i)
       {
-        return d_offsets[i + 1] - d_offsets[i];
+        return static_cast<T>(d_offsets[i + 1] - d_offsets[i]);
       }
     };
     template <typename T>
-    static void basic(nvbench::state& state, nvbench::type_list<T>)
+    static void seg_size(nvbench::state& state, nvbench::type_list<T>)
     {
       const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));
       thrust::device_vector<T> input(elements + 1);
-      thrust::device_vector<T> output(elements);
+      thrust::device_vector<T> output(elements, thrust::no_init);
       state.add_element_count(elements);
       state.add_global_memory_reads<T>(elements + 1);
@@ Expand All @@
                  });
     }
-    using types = nvbench::type_list<nvbench::uint32_t, nvbench::uint64_t>;
-    NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(types))
-      .set_name("base")
+    NVBENCH_BENCH_TYPES(seg_size, NVBENCH_TYPE_AXES(integral_types))
+      .set_name("seg_size")
       .set_type_axes_names({"T{ct}"})
       .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));

-Original file line number
+Diff line change
@@ Expand Up / @@ -39,37 +39,23 @@ @@
     #if _CCCL_HAS_CUDA_COMPILER()
     #  include <thrust/system/cuda/config.h>
-    #  include <thrust/distance.h>
-    #  include <thrust/system/cuda/detail/parallel_for.h>
+    #  include <thrust/system/cuda/detail/transform.h>
     #  include <thrust/system/cuda/execution_policy.h>
+    #  include <cuda/__functional/address_stability.h>
+    #  include <cuda/std/iterator>
     THRUST_NAMESPACE_BEGIN
     namespace cuda_cub
     {
-    namespace __tabulate
-    {
-    template <class Iterator, class TabulateOp>
-    struct functor
-    {
-      Iterator items;
-      TabulateOp op;
-      template <typename Size>
-      void _CCCL_DEVICE operator()(Size idx)
-      {
-        items[idx] = op(idx);
-      }
-    };
-    } // namespace __tabulate
     template <class Derived, class Iterator, class TabulateOp>
     void _CCCL_HOST_DEVICE tabulate(execution_policy<Derived>& policy, Iterator first, Iterator last, TabulateOp tabulate_op)
     {
-      using size_type = thrust::detail::it_difference_t<Iterator>;
-      size_type count = ::cuda::std::distance(first, last);
-      cuda_cub::parallel_for(policy, __tabulate::functor<Iterator, TabulateOp>{first, tabulate_op}, count);
+      using size_type  = ::cuda::std::iter_difference_t<Iterator>;
+      const auto count = ::cuda::std::distance(first, last);
+      cuda_cub::transform_n(
+        policy, ::cuda::counting_iterator<size_type>{}, count, first, ::cuda::proclaim_copyable_arguments(tabulate_op));
     }
     } // namespace cuda_cub
     THRUST_NAMESPACE_END
     #endif

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Support fancy iterators in vectorized transform and port thrust::tabulate to it #6012

Uh oh!

Diff view

Diff view

There are no files selected for viewing

miscco Sep 28, 2025

Uh oh!

bernhardmgruber Sep 29, 2025

Uh oh!

Uh oh!

miscco Sep 29, 2025

Uh oh!

miscco Sep 26, 2025

Uh oh!

bernhardmgruber Sep 26, 2025

Uh oh!

bernhardmgruber Sep 26, 2025

Uh oh!

Uh oh!

Uh oh!

Support fancy iterators in vectorized transform and port thrust::tabulate to it #6012

Uh oh!

Support fancy iterators in vectorized transform and port thrust::tabulate to it #6012

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

miscco Sep 28, 2025

Choose a reason for hiding this comment

Uh oh!

bernhardmgruber Sep 29, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!

miscco Sep 29, 2025

Choose a reason for hiding this comment

Uh oh!

miscco Sep 26, 2025

Choose a reason for hiding this comment

Uh oh!

bernhardmgruber Sep 26, 2025

Choose a reason for hiding this comment

Uh oh!

bernhardmgruber Sep 26, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!