NVIDIA · bernhardmgruber · Mar 24, 2026 · Mar 24, 2026 · Mar 24, 2026 · Mar 24, 2026
@@ -29,6 +29,8 @@ typedef struct cccl_device_scan_build_result_t
   void* cubin;
   size_t cubin_size;
   CUlibrary library;
+  cccl_type_info input_type;
+  cccl_type_info output_type;
   cccl_type_info accumulator_type;
   CUkernel init_kernel;
   CUkernel scan_kernel;

@@ -166,10 +166,36 @@ struct scan_kernel_source
 {
   cccl_device_scan_build_result_t& build;
 
+  std::size_t InputSize() const
+  {
+    return build.input_type.size;
+  }
+
+  std::size_t InputAlign() const
+  {
+    return build.input_type.alignment;
+  }
+
+  std::size_t OutputSize() const
+  {
+    return build.output_type.size;
+  }
+
+  std::size_t OutputAlign() const
+  {
+    return build.output_type.alignment;
+  }
+
   std::size_t AccumSize() const
   {
     return build.accumulator_type.size;
   }
+
+  std::size_t AccumAlign() const
+  {
+    return build.accumulator_type.alignment;
+  }
+
   CUkernel InitKernel() const
   {
     return build.init_kernel;
@@ -414,6 +440,8 @@ static_assert(device_scan_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {6},
   build_ptr->cc                         = cc;
   build_ptr->cubin                      = (void*) result.data.release();
   build_ptr->cubin_size                 = result.size;
+  build_ptr->input_type                 = input_it.value_type;
+  build_ptr->output_type                = output_it.value_type;
   build_ptr->accumulator_type           = accum_t;
   build_ptr->force_inclusive            = force_inclusive;
   build_ptr->init_kind                  = init_kind;

@@ -18,42 +18,10 @@ struct policy_selector
 #  if USES_WARPSPEED()
   _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::scan::scan_policy
   {
-    static constexpr int num_reduce_and_scan_warps = TUNE_NUM_REDUCE_SCAN_WARPS;
-    static constexpr int num_look_ahead_items      = TUNE_NUM_LOOKBACK_ITEMS;
-    static constexpr int items_per_thread          = TUNE_ITEMS_PLUS_ONE - 1;
-
-    static constexpr int num_threads_per_warp = 32;
-    static constexpr int num_load_warps       = 1;
-    static constexpr int num_sched_warps      = 1;
-    static constexpr int num_look_ahead_warps = 1;
-
-    static constexpr int num_total_warps =
-      2 * num_reduce_and_scan_warps + num_load_warps + num_sched_warps + num_look_ahead_warps;
-    static constexpr int num_total_threads    = num_total_warps * num_threads_per_warp;
-    static constexpr int squad_reduce_threads = num_reduce_and_scan_warps * num_threads_per_warp;
-    static constexpr int tile_size            = items_per_thread * squad_reduce_threads;
-
-    auto warpspeed_policy = cub::detail::scan::scan_warpspeed_policy{
-      true,
-      num_reduce_and_scan_warps,
-      num_reduce_and_scan_warps,
-      num_load_warps,
-      num_sched_warps,
-      num_look_ahead_warps,
-      num_look_ahead_items,
-      num_total_threads,
-      items_per_thread,
-      tile_size};
-
-    return cub::detail::scan::scan_policy{
-      num_total_threads,
-      items_per_thread,
-      cub::BLOCK_LOAD_WARP_TRANSPOSE,
-      cub::LOAD_DEFAULT,
-      cub::BLOCK_STORE_WARP_TRANSPOSE,
-      cub::BLOCK_SCAN_WARP_SCANS,
-      cub::detail::delay_constructor_policy{cub::detail::delay_constructor_kind::fixed_delay, 350, 450},
-      warpspeed_policy};
+    cub::detail::scan::scan_policy policy{};
+    policy.warpspeed = cub::detail::scan::scan_warpspeed_policy{
+      true, TUNE_NUM_REDUCE_SCAN_WARPS, TUNE_NUM_LOOKBACK_ITEMS, TUNE_ITEMS_PLUS_ONE - 1};
+    return policy;
   }
 #  else
   _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::scan::scan_policy

@@ -88,11 +88,36 @@ struct DeviceScanKernelSource
                      AccumT,
                      EnforceInclusive == ForceInclusive::Yes>)
 
+  CUB_RUNTIME_FUNCTION static constexpr ::cuda::std::size_t InputSize()
+  {
+    return sizeof(it_value_t<UnwrappedInputIteratorT>);
+  }
+
+  CUB_RUNTIME_FUNCTION static constexpr ::cuda::std::size_t InputAlign()
+  {
+    return alignof(it_value_t<UnwrappedInputIteratorT>);
+  }
+
+  CUB_RUNTIME_FUNCTION static constexpr ::cuda::std::size_t OutputSize()
+  {
+    return sizeof(it_value_t<UnwrappedOutputIteratorT>);
+  }
+
+  CUB_RUNTIME_FUNCTION static constexpr ::cuda::std::size_t OutputAlign()
+  {
+    return alignof(it_value_t<UnwrappedOutputIteratorT>);
+  }
+
   CUB_RUNTIME_FUNCTION static constexpr ::cuda::std::size_t AccumSize()
   {
     return sizeof(AccumT);
   }
 
+  CUB_RUNTIME_FUNCTION static constexpr ::cuda::std::size_t AccumAlign()
+  {
+    return alignof(AccumT);
+  }
+
   CUB_RUNTIME_FUNCTION static ScanTileStateT TileState()
   {
     return {};
@@ -481,21 +506,19 @@ struct DispatchScan
   }
 
 #if __cccl_ptx_isa >= 860
-  template <typename PolicyGetter, typename PolicySelectorT>
-  CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t
-  __invoke_lookahead_algorithm(PolicyGetter policy_getter, const PolicySelectorT& policy_selector)
+  template <typename PolicyGetter>
+  CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t __invoke_lookahead_algorithm(PolicyGetter policy_getter)
   {
     if (num_items == 0)
     {
       temp_storage_bytes = 1; // just fulfill the contract that CUB always requires some temporary storage
       return cudaSuccess;
     }
 
-    CUB_DETAIL_CONSTEXPR_ISH auto active_policy          = policy_getter();
-    CUB_DETAIL_CONSTEXPR_ISH const auto warpspeed_policy = active_policy.warpspeed;
+    CUB_DETAIL_CONSTEXPR_ISH const detail::scan::scan_warpspeed_policy warpspeed_policy = policy_getter().warpspeed;
 
     const int grid_dim =
-      static_cast<int>(::cuda::ceil_div(num_items, static_cast<OffsetT>(warpspeed_policy.tile_size)));
+      static_cast<int>(::cuda::ceil_div(num_items, static_cast<OffsetT>(warpspeed_policy.tile_size())));
 
     if (d_temp_storage == nullptr)
     {
@@ -524,37 +547,19 @@ struct DispatchScan
     // TODO(bgruber): we probably need to ensure alignment of d_temp_storage
     _CCCL_ASSERT(::cuda::is_aligned(d_temp_storage, kernel_source.look_ahead_tile_state_alignment()), "");
 
-    using selector_smem_info_t = detail::scan::selector_smem_info<PolicySelectorT>;
-
-    auto scan_kernel      = kernel_source.ScanKernel();
-    int smem_size_1_stage = 0;
-    if constexpr (selector_smem_info_t::has_static_layout)
-    {
-      CUB_DETAIL_CONSTEXPR_ISH int static_smem_size_1_stage = detail::scan::smem_for_stages(
-        warpspeed_policy,
-        1,
-        selector_smem_info_t::input_value_size,
-        selector_smem_info_t::input_value_alignment,
-        selector_smem_info_t::output_value_size,
-        selector_smem_info_t::output_value_alignment,
-        selector_smem_info_t::accum_size,
-        selector_smem_info_t::accum_alignment);
-      CUB_DETAIL_STATIC_ISH_ASSERT(static_smem_size_1_stage <= detail::max_smem_per_block,
-                                   "Single-stage warpspeed scan exceeds architecture independent SMEM (48KiB)");
-      smem_size_1_stage = static_smem_size_1_stage;
-    }
-    else
-    {
-      smem_size_1_stage = detail::scan::smem_for_stages(
-        warpspeed_policy,
-        1,
-        policy_selector.input_value_size,
-        policy_selector.input_value_alignment,
-        policy_selector.output_value_size,
-        policy_selector.output_value_alignment,
-        policy_selector.accum_size,
-        policy_selector.accum_alignment);
-    }
+    auto scan_kernel                 = kernel_source.ScanKernel();
+    [[maybe_unused]] auto kernel_src = kernel_source; // need to pull a copy to not access `this` during const. eval.
+    CUB_DETAIL_CONSTEXPR_ISH int smem_size_1_stage = detail::scan::smem_for_stages(
+      warpspeed_policy,
+      1,
+      static_cast<int>(kernel_src.InputSize()),
+      static_cast<int>(kernel_src.InputAlign()),
+      static_cast<int>(kernel_src.OutputSize()),
+      static_cast<int>(kernel_src.OutputAlign()),
+      static_cast<int>(kernel_src.AccumSize()),
+      static_cast<int>(kernel_src.AccumAlign()));
-      static_cast<int>(kernel_src.InputSize()),
-      static_cast<int>(kernel_src.InputAlign()),
-      static_cast<int>(kernel_src.OutputSize()),
-      static_cast<int>(kernel_src.OutputAlign()),
-      static_cast<int>(kernel_src.AccumSize()),
-      static_cast<int>(kernel_src.AccumAlign()));
+      int{kernel_src.InputSize()},
+      int{kernel_src.InputAlign()},
+      int{kernel_src.OutputSize()},
+      int{kernel_src.OutputAlign()},
+      int{kernel_src.AccumSize()},
+      int{kernel_src.AccumAlign())};
-      static_cast<int>(kernel_src.InputSize()),
-      static_cast<int>(kernel_src.InputAlign()),
-      static_cast<int>(kernel_src.OutputSize()),
-      static_cast<int>(kernel_src.OutputAlign()),
-      static_cast<int>(kernel_src.AccumSize()),
-      static_cast<int>(kernel_src.AccumAlign()));
+      int{kernel_src.InputSize()},
+      int{kernel_src.InputAlign()},
+      int{kernel_src.OutputSize()},
+      int{kernel_src.OutputAlign()},
+      int{kernel_src.AccumSize()},
+      int{kernel_src.AccumAlign())};
+    CUB_DETAIL_STATIC_ISH_ASSERT(smem_size_1_stage <= detail::max_smem_per_block,
+                                 "Single-stage warpspeed scan exceeds architecture independent SMEM (48KiB)");
 
     int num_stages = 1;
     int smem_size  = smem_size_1_stage;
@@ -565,36 +570,19 @@ struct DispatchScan
                    // 1 CTA per SM +1 since it tends to improve performance
                    // TODO(bgruber): make the +1 a tuning parameter
                    const int max_stages_for_even_workload = static_cast<int>(
-                     ::cuda::ceil_div(num_items, static_cast<OffsetT>(sm_count * warpspeed_policy.tile_size)) + 1);
+                     ::cuda::ceil_div(num_items, static_cast<OffsetT>(sm_count * warpspeed_policy.tile_size())) + 1);
 
                    while (num_stages <= max_stages_for_even_workload)
                    {
-                     const auto next_smem_size = [&] {
-                       if constexpr (selector_smem_info_t::has_static_layout)
-                       {
-                         return detail::scan::smem_for_stages(
-                           warpspeed_policy,
-                           num_stages + 1,
-                           selector_smem_info_t::input_value_size,
-                           selector_smem_info_t::input_value_alignment,
-                           selector_smem_info_t::output_value_size,
-                           selector_smem_info_t::output_value_alignment,
-                           selector_smem_info_t::accum_size,
-                           selector_smem_info_t::accum_alignment);
-                       }
-                       else
-                       {
-                         return detail::scan::smem_for_stages(
-                           warpspeed_policy,
-                           num_stages + 1,
-                           policy_selector.input_value_size,
-                           policy_selector.input_value_alignment,
-                           policy_selector.output_value_size,
-                           policy_selector.output_value_alignment,
-                           policy_selector.accum_size,
-                           policy_selector.accum_alignment);
-                       }
-                     }();
+                     const int next_smem_size = detail::scan::smem_for_stages(
+                       warpspeed_policy,
+                       num_stages + 1,
+                       static_cast<int>(kernel_source.InputSize()),
+                       static_cast<int>(kernel_source.InputAlign()),
+                       static_cast<int>(kernel_source.OutputSize()),
+                       static_cast<int>(kernel_source.OutputAlign()),
+                       static_cast<int>(kernel_source.AccumSize()),
+                       static_cast<int>(kernel_source.AccumAlign()));
                      if (next_smem_size > max_dynamic_smem_size)
                      {
                        // This number of stages failed, so stay at the current settings
@@ -647,7 +635,7 @@ struct DispatchScan
 
     // Invoke scan kernel
     {
-      const int block_dim = warpspeed_policy.num_total_threads;
+      const int block_dim = detail::scan::num_total_threads(warpspeed_policy);
 
 #  ifdef CUB_DEBUG_LOG
       _CubLog("Invoking DeviceScanKernel<<<%d, %d, %d, %lld>>>()\n", grid_dim, block_dim, smem_size, (long long) stream);
@@ -689,9 +677,8 @@ struct DispatchScan
   _CCCL_DIAG_PUSH
   _CCCL_DIAG_SUPPRESS_MSVC(4702)
 
-  template <typename PolicyGetter, typename PolicySelectorT>
-  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
-  __invoke(PolicyGetter policy_getter, [[maybe_unused]] const PolicySelectorT& policy_selector)
+  template <typename PolicyGetter>
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t __invoke(PolicyGetter policy_getter)
   {
     CUB_DETAIL_CONSTEXPR_ISH auto active_policy = policy_getter();
 
@@ -702,12 +689,12 @@ struct DispatchScan
 #  if defined(CUB_DEFINE_RUNTIME_POLICIES)
     if (kernel_source.use_warpspeed(active_policy))
     {
-      return __invoke_lookahead_algorithm(policy_getter, policy_selector);
+      return __invoke_lookahead_algorithm(policy_getter);
     }
 #  else
     if CUB_DETAIL_CONSTEXPR_ISH (KernelSource::use_warpspeed(active_policy))
     {
-      return __invoke_lookahead_algorithm(policy_getter, policy_selector);
+      return __invoke_lookahead_algorithm(policy_getter);
     }
 #  endif
 #endif // __cccl_ptx_isa >= 860
@@ -851,13 +838,7 @@ struct DispatchScan
       }
     };
 
-    using policy_selector_t = detail::scan::policy_selector_from_types<
-      detail::it_value_t<InputIteratorT>,
-      detail::it_value_t<OutputIteratorT>,
-      AccumT,
-      OffsetT,
-      ScanOpT>;
-    return __invoke(policy_getter{}, policy_selector_t{});
+    return __invoke(policy_getter{});
   }
 
   /**
@@ -1027,7 +1008,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
       -1 /* ptx_version, not used actually */,
       kernel_source,
       launcher_factory}
-      .__invoke(policy_getter, policy_selector);
+      .__invoke(policy_getter);
   });
 }
 

@@ -119,7 +119,7 @@ template <typename PolicySelector, typename InputIteratorT, typename OutputItera
   if constexpr (policy.warpspeed
                 && detail::scan::use_warpspeed<InputIteratorT, OutputIteratorT, AccumT>(policy.warpspeed))
   {
-    return policy.warpspeed.num_total_threads;
+    return num_total_threads(policy.warpspeed);
   }
 #endif // _CCCL_CUDACC_AT_LEAST(12, 8)
   return policy.block_threads;