NVIDIA · bernhardmgruber · Sep 30, 2025 · Sep 24, 2025 · Sep 25, 2025 · Sep 24, 2025
@@ -202,14 +202,18 @@ _CCCL_HOST_DEVICE _CCCL_CONSTEVAL auto load_store_type()
   }
 }
 
-template <typename VectorizedPolicy, typename Offset, typename F, typename RandomAccessIteratorOut, typename... InputT>
+template <typename VectorizedPolicy,
+          typename Offset,
+          typename F,
+          typename RandomAccessIteratorOut,
+          typename... RandomAccessIteratorsIn>
 _CCCL_DEVICE void transform_kernel_vectorized(
   Offset num_items,
   int num_elem_per_thread_prefetch,
   bool can_vectorize,
   F f,
   RandomAccessIteratorOut out,
-  const InputT*... ins)
+  RandomAccessIteratorsIn... ins)
 {
   constexpr int block_dim        = VectorizedPolicy::block_threads;
   constexpr int items_per_thread = VectorizedPolicy::items_per_thread_vectorized;
@@ -240,9 +244,12 @@ _CCCL_DEVICE void transform_kernel_vectorized(
   constexpr int load_store_size = VectorizedPolicy::load_store_word_size;
   using load_store_t            = decltype(load_store_type<load_store_size>());
   using output_t                = it_value_t<RandomAccessIteratorOut>;
-  using result_t                = ::cuda::std::decay_t<::cuda::std::invoke_result_t<F, const InputT&...>>;
+  using result_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<F, const it_value_t<RandomAccessIteratorsIn>&...>>;
   // picks output type size if there are no inputs
-  constexpr int element_size     = int{first_item(sizeof(InputT)..., size_of<output_t>)};
+  constexpr int element_size     = int{first_nonzero_value(
+    (sizeof(it_value_t<RandomAccessIteratorsIn>)
+     * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
+    size_of<output_t>)};
   constexpr int load_store_count = (items_per_thread * element_size) / load_store_size;
 
   static_assert((items_per_thread * element_size) % load_store_size == 0);
@@ -258,18 +265,35 @@ _CCCL_DEVICE void transform_kernel_vectorized(
 
   auto provide_array = [&](auto... inputs) {
     // load inputs
-    // TODO(bgruber): we could support fancy iterators for loading here as well (and only vectorize some inputs)
-    [[maybe_unused]] auto load_tile_vectorized = [&](auto* in, auto& input) {
-      auto in_vec    = reinterpret_cast<const load_store_t*>(in);
-      auto input_vec = reinterpret_cast<load_store_t*>(input.data());
-      _CCCL_PRAGMA_UNROLL_FULL()
-      for (int i = 0; i < load_store_count; ++i)
+    [[maybe_unused]] auto load_tile = [](auto in, auto& input) {
+      if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<decltype(in)>)
       {
-        input_vec[i] = in_vec[i * VectorizedPolicy::block_threads + threadIdx.x];
+        auto in_vec    = reinterpret_cast<const load_store_t*>(in) + threadIdx.x;
+        auto input_vec = reinterpret_cast<load_store_t*>(input.data());
+        _CCCL_PRAGMA_UNROLL_FULL()
+        for (int i = 0; i < load_store_count; ++i)
+        {
+          input_vec[i] = in_vec[i * VectorizedPolicy::block_threads];
+        }
+      }
+      else
+      {
+        constexpr int elems = load_store_size / element_size;
+        in += threadIdx.x * elems;
+        _CCCL_PRAGMA_UNROLL_FULL()
+        for (int i = 0; i < load_store_count; ++i)
+        {
+          _CCCL_PRAGMA_UNROLL_FULL()
+          for (int j = 0; j < elems; ++j)
+          {
+            input[i * elems + j] = in[i * elems * VectorizedPolicy::block_threads + j];
+          }
+        }
       }
     };
     _CCCL_PDL_GRID_DEPENDENCY_SYNC();
-    (load_tile_vectorized(ins, inputs), ...);
+    (load_tile(ins, inputs), ...);
+
     // Benchmarks showed up to 38% slowdown on H200 (some improvements as well), so omitted. See #5249 for details.
     // _CCCL_PDL_TRIGGER_NEXT_LAUNCH();
 
@@ -280,7 +304,7 @@ _CCCL_DEVICE void transform_kernel_vectorized(
       output[i] = f(inputs[i]...);
     }
   };
-  provide_array(uninitialized_array<InputT, items_per_thread>{}...);
+  provide_array(uninitialized_array<it_value_t<RandomAccessIteratorsIn>, items_per_thread>{}...);
 
   // write output
   if constexpr (can_vectorize_store)

@@ -282,21 +282,45 @@ _CCCL_HOST_DEVICE constexpr int arch_to_min_bytes_in_flight(int sm_arch)
   return 12 * 1024; // V100 and below
 }
 
-template <typename T, typename... Ts>
-_CCCL_HOST_DEVICE constexpr bool all_equal([[maybe_unused]] T head, Ts... tail)
+template <typename H, typename... Ts>
+_CCCL_HOST_DEVICE constexpr bool all_nonzero_equal(H head, Ts... values)
 {
-  return ((head == tail) && ...);
+  size_t first = 0;
+  for (size_t v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
+  {
+    if (v == 0)
+    {
+      continue;
+    }
+    if (first == 0)
+    {
+      first = v;
+    }
+    else if (v != first)
+    {
+      return false;
+    }
+  }
+  return true;
 }
 
-_CCCL_HOST_DEVICE constexpr bool all_equal()
+_CCCL_HOST_DEVICE constexpr bool all_nonzero_equal()
 {
   return true;
 }
 
-template <typename T, typename... Ts>
-_CCCL_HOST_DEVICE constexpr auto first_item(T head, Ts...) -> T
+template <typename H, typename... Ts>
+_CCCL_HOST_DEVICE constexpr auto first_nonzero_value(H head, Ts... values)
 {
-  return head;
+  for (auto v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
+  {
+    if (v != 0)
+    {
+      return v;
+    }
+  }
+  // we only reach here when all input are not contiguous and the output has a void value type
+  return H{1};
 }
 
 template <typename T>
@@ -336,25 +360,36 @@ struct policy_hub<RequiresStableAddress,
     (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn> && ...);
   static constexpr bool all_input_values_trivially_reloc =
     (THRUST_NS_QUALIFIER::is_trivially_relocatable_v<it_value_t<RandomAccessIteratorsIn>> && ...);
-  static constexpr bool can_memcpy_inputs = all_inputs_contiguous && all_input_values_trivially_reloc;
+  static constexpr bool can_memcpy_all_inputs = all_inputs_contiguous && all_input_values_trivially_reloc;
+  // the vectorized kernel supports mixing contiguous and non-contiguous iterators
+  static constexpr bool can_memcpy_contiguous_inputs =
+    ((!THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>
+      || THRUST_NS_QUALIFIER::is_trivially_relocatable_v<it_value_t<RandomAccessIteratorsIn>>)
+     && ...);
 
   // for vectorized policy:
-  static constexpr bool all_input_values_same_size = all_equal(sizeof(it_value_t<RandomAccessIteratorsIn>)...);
-  static constexpr int load_store_word_size        = 8; // TODO(bgruber): make this 16, and 32 on Blackwell+
-  // if there are no inputs, we take the size of the output value
-  static constexpr int value_type_size =
-    first_item(int{sizeof(it_value_t<RandomAccessIteratorsIn>)}..., int{size_of<it_value_t<RandomAccessIteratorOut>>});
+  static constexpr bool all_contiguous_input_values_same_size = all_nonzero_equal(
+    (sizeof(it_value_t<RandomAccessIteratorsIn>)
+     * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...);
+  static constexpr int load_store_word_size = 8; // TODO(bgruber): make this 16, and 32 on Blackwell+
+  // find the value type size of the first contiguous iterator. if there are no inputs, we take the size of the output
+  // value type
+  static constexpr int contiguous_value_type_size = first_nonzero_value(
+    (int{sizeof(it_value_t<RandomAccessIteratorsIn>)}
+     * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
+    int{size_of<it_value_t<RandomAccessIteratorOut>>});
   static constexpr bool value_type_divides_load_store_size =
-    load_store_word_size % value_type_size == 0; // implicitly checks that value_type_size <= load_store_word_size
+    load_store_word_size % contiguous_value_type_size == 0; // implicitly checks that value_type_size <=
+                                                            // load_store_word_size
   static constexpr int target_bytes_per_thread =
     no_input_streams ? 16 /* by experiment on RTX 5090 */ : 32 /* guestimate by gevtushenko for loading */;
   static constexpr int items_per_thread_vec =
-    ::cuda::round_up(target_bytes_per_thread, load_store_word_size) / value_type_size;
+    ::cuda::round_up(target_bytes_per_thread, load_store_word_size) / contiguous_value_type_size;
   using default_vectorized_policy_t = vectorized_policy_t<256, items_per_thread_vec, load_store_word_size>;
 
   static constexpr bool fallback_to_prefetch =
-    RequiresStableAddress || !can_memcpy_inputs || !all_input_values_same_size || !value_type_divides_load_store_size
-    || !DenseOutput;
+    RequiresStableAddress || !can_memcpy_contiguous_inputs || !all_contiguous_input_values_same_size
+    || !value_type_divides_load_store_size || !DenseOutput;
 
   // TODO(bgruber): consider a separate kernel for just filling
 
@@ -380,7 +415,7 @@ struct policy_hub<RequiresStableAddress,
         block_threads* async_policy::min_items_per_thread,
         ldgsts_size_and_align)
       > int{max_smem_per_block};
-    static constexpr bool fallback_to_vectorized = exhaust_smem || no_input_streams;
+    static constexpr bool fallback_to_vectorized = exhaust_smem || no_input_streams || !can_memcpy_all_inputs;
 
   public:
     static constexpr int min_bif = arch_to_min_bytes_in_flight(800);
@@ -421,7 +456,8 @@ struct policy_hub<RequiresStableAddress,
       (((int{sizeof(it_value_t<RandomAccessIteratorsIn>)} * AsyncBlockSize) % max_alignment == 0) && ...);
     static constexpr bool enough_threads_for_peeling = AsyncBlockSize >= alignment; // head and tail bytes
     static constexpr bool fallback_to_vectorized =
-      exhaust_smem || !tile_sizes_retain_alignment || !enough_threads_for_peeling || no_input_streams;
+      exhaust_smem || !tile_sizes_retain_alignment || !enough_threads_for_peeling || no_input_streams
+      || !can_memcpy_all_inputs;
 
   public:
     static constexpr int min_bif = arch_to_min_bytes_in_flight(PtxVersion);

@@ -27,12 +27,36 @@
 
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
+#include <thrust/sequence.h>
 #include <thrust/tabulate.h>
 
 #include <nvbench_helper.cuh>
 
 #include "thrust/detail/raw_pointer_cast.h"
 
+template <typename T>
+static void sequence(nvbench::state& state, nvbench::type_list<T>)
+{
+  const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));
+
+  thrust::device_vector<T> output(elements, thrust::no_init);
+
+  state.add_element_count(elements);
+  state.add_global_memory_writes<T>(elements);
+
+  caching_allocator_t alloc;
+  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) {
+               // sequence is implemented via thrust::tabulate
+               thrust::sequence(policy(alloc, launch), output.begin(), output.end());
+             });
+}
+
+NVBENCH_BENCH_TYPES(sequence, NVBENCH_TYPE_AXES(integral_types))
+  .set_name("sequence")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
+
 template <class T>
 struct seg_size_t
 {
@@ -41,17 +65,17 @@ struct seg_size_t
   template <class OffsetT>
   __device__ T operator()(OffsetT i)
   {
-    return d_offsets[i + 1] - d_offsets[i];
+    return static_cast<T>(d_offsets[i + 1] - d_offsets[i]);
   }
 };
 
 template <typename T>
-static void basic(nvbench::state& state, nvbench::type_list<T>)
+static void seg_size(nvbench::state& state, nvbench::type_list<T>)
 {
   const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));
 
   thrust::device_vector<T> input(elements + 1);
-  thrust::device_vector<T> output(elements);
+  thrust::device_vector<T> output(elements, thrust::no_init);
 
   state.add_element_count(elements);
   state.add_global_memory_reads<T>(elements + 1);
@@ -65,9 +89,7 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
              });
 }
 
-using types = nvbench::type_list<nvbench::uint32_t, nvbench::uint64_t>;
-
-NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(types))
-  .set_name("base")
+NVBENCH_BENCH_TYPES(seg_size, NVBENCH_TYPE_AXES(integral_types))
+  .set_name("seg_size")
   .set_type_axes_names({"T{ct}"})
   .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
@@ -39,37 +39,23 @@
 #if _CCCL_HAS_CUDA_COMPILER()
 #  include <thrust/system/cuda/config.h>
 
-#  include <thrust/distance.h>
-#  include <thrust/system/cuda/detail/parallel_for.h>
+#  include <thrust/system/cuda/detail/transform.h>
 #  include <thrust/system/cuda/execution_policy.h>
 
+#  include <cuda/__functional/address_stability.h>
+#  include <cuda/std/iterator>
+
 THRUST_NAMESPACE_BEGIN
 namespace cuda_cub
 {
-namespace __tabulate
-{
-template <class Iterator, class TabulateOp>
-struct functor
-{
-  Iterator items;
-  TabulateOp op;
-
-  template <typename Size>
-  void _CCCL_DEVICE operator()(Size idx)
-  {
-    items[idx] = op(idx);
-  }
-};
-} // namespace __tabulate
-
 template <class Derived, class Iterator, class TabulateOp>
 void _CCCL_HOST_DEVICE tabulate(execution_policy<Derived>& policy, Iterator first, Iterator last, TabulateOp tabulate_op)
 {
-  using size_type = thrust::detail::it_difference_t<Iterator>;
-  size_type count = ::cuda::std::distance(first, last);
-  cuda_cub::parallel_for(policy, __tabulate::functor<Iterator, TabulateOp>{first, tabulate_op}, count);
+  using size_type  = ::cuda::std::iter_difference_t<Iterator>;
+  const auto count = ::cuda::std::distance(first, last);
+  cuda_cub::transform_n(
+    policy, ::cuda::counting_iterator<size_type>{}, count, first, ::cuda::proclaim_copyable_arguments(tabulate_op));
 }
-
 } // namespace cuda_cub
 THRUST_NAMESPACE_END
 #endif