Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions c/parallel/include/cccl/c/scan.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ typedef struct cccl_device_scan_build_result_t
void* cubin;
size_t cubin_size;
CUlibrary library;
cccl_type_info input_type;
cccl_type_info output_type;
cccl_type_info accumulator_type;
CUkernel init_kernel;
CUkernel scan_kernel;
Expand Down
28 changes: 28 additions & 0 deletions c/parallel/src/scan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -166,10 +166,36 @@ struct scan_kernel_source
{
cccl_device_scan_build_result_t& build;

std::size_t InputSize() const
{
return build.input_type.size;
}

std::size_t InputAlign() const
{
return build.input_type.alignment;
}

std::size_t OutputSize() const
{
return build.output_type.size;
}

std::size_t OutputAlign() const
{
return build.output_type.alignment;
}

std::size_t AccumSize() const
{
return build.accumulator_type.size;
}

std::size_t AccumAlign() const
{
return build.accumulator_type.alignment;
}

CUkernel InitKernel() const
{
return build.init_kernel;
Expand Down Expand Up @@ -414,6 +440,8 @@ static_assert(device_scan_policy()(::cuda::arch_id{{CUB_PTX_ARCH / 10}}) == {6},
build_ptr->cc = cc;
build_ptr->cubin = (void*) result.data.release();
build_ptr->cubin_size = result.size;
build_ptr->input_type = input_it.value_type;
build_ptr->output_type = output_it.value_type;
build_ptr->accumulator_type = accum_t;
build_ptr->force_inclusive = force_inclusive;
build_ptr->init_kind = init_kind;
Expand Down
40 changes: 4 additions & 36 deletions cub/benchmarks/bench/scan/policy_selector.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,42 +18,10 @@ struct policy_selector
# if USES_WARPSPEED()
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::scan::scan_policy
{
static constexpr int num_reduce_and_scan_warps = TUNE_NUM_REDUCE_SCAN_WARPS;
static constexpr int num_look_ahead_items = TUNE_NUM_LOOKBACK_ITEMS;
static constexpr int items_per_thread = TUNE_ITEMS_PLUS_ONE - 1;

static constexpr int num_threads_per_warp = 32;
static constexpr int num_load_warps = 1;
static constexpr int num_sched_warps = 1;
static constexpr int num_look_ahead_warps = 1;

static constexpr int num_total_warps =
2 * num_reduce_and_scan_warps + num_load_warps + num_sched_warps + num_look_ahead_warps;
static constexpr int num_total_threads = num_total_warps * num_threads_per_warp;
static constexpr int squad_reduce_threads = num_reduce_and_scan_warps * num_threads_per_warp;
static constexpr int tile_size = items_per_thread * squad_reduce_threads;

auto warpspeed_policy = cub::detail::scan::scan_warpspeed_policy{
true,
num_reduce_and_scan_warps,
num_reduce_and_scan_warps,
num_load_warps,
num_sched_warps,
num_look_ahead_warps,
num_look_ahead_items,
num_total_threads,
items_per_thread,
tile_size};

return cub::detail::scan::scan_policy{
num_total_threads,
items_per_thread,
cub::BLOCK_LOAD_WARP_TRANSPOSE,
cub::LOAD_DEFAULT,
cub::BLOCK_STORE_WARP_TRANSPOSE,
cub::BLOCK_SCAN_WARP_SCANS,
cub::detail::delay_constructor_policy{cub::detail::delay_constructor_kind::fixed_delay, 350, 450},
warpspeed_policy};
cub::detail::scan::scan_policy policy{};
policy.warpspeed = cub::detail::scan::scan_warpspeed_policy{
true, TUNE_NUM_REDUCE_SCAN_WARPS, TUNE_NUM_LOOKBACK_ITEMS, TUNE_ITEMS_PLUS_ONE - 1};
return policy;
}
# else
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::scan::scan_policy
Expand Down
137 changes: 59 additions & 78 deletions cub/cub/device/dispatch/dispatch_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,36 @@ struct DeviceScanKernelSource
AccumT,
EnforceInclusive == ForceInclusive::Yes>)

CUB_RUNTIME_FUNCTION static constexpr ::cuda::std::size_t InputSize()
{
return sizeof(it_value_t<UnwrappedInputIteratorT>);
}

CUB_RUNTIME_FUNCTION static constexpr ::cuda::std::size_t InputAlign()
{
return alignof(it_value_t<UnwrappedInputIteratorT>);
}

CUB_RUNTIME_FUNCTION static constexpr ::cuda::std::size_t OutputSize()
{
return sizeof(it_value_t<UnwrappedOutputIteratorT>);
}

CUB_RUNTIME_FUNCTION static constexpr ::cuda::std::size_t OutputAlign()
{
return alignof(it_value_t<UnwrappedOutputIteratorT>);
}

CUB_RUNTIME_FUNCTION static constexpr ::cuda::std::size_t AccumSize()
{
return sizeof(AccumT);
}

CUB_RUNTIME_FUNCTION static constexpr ::cuda::std::size_t AccumAlign()
{
return alignof(AccumT);
}

CUB_RUNTIME_FUNCTION static ScanTileStateT TileState()
{
return {};
Expand Down Expand Up @@ -481,21 +506,19 @@ struct DispatchScan
}

#if __cccl_ptx_isa >= 860
template <typename PolicyGetter, typename PolicySelectorT>
CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t
__invoke_lookahead_algorithm(PolicyGetter policy_getter, const PolicySelectorT& policy_selector)
template <typename PolicyGetter>
CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t __invoke_lookahead_algorithm(PolicyGetter policy_getter)
{
if (num_items == 0)
{
temp_storage_bytes = 1; // just fulfill the contract that CUB always requires some temporary storage
return cudaSuccess;
}

CUB_DETAIL_CONSTEXPR_ISH auto active_policy = policy_getter();
CUB_DETAIL_CONSTEXPR_ISH const auto warpspeed_policy = active_policy.warpspeed;
CUB_DETAIL_CONSTEXPR_ISH const detail::scan::scan_warpspeed_policy warpspeed_policy = policy_getter().warpspeed;

const int grid_dim =
static_cast<int>(::cuda::ceil_div(num_items, static_cast<OffsetT>(warpspeed_policy.tile_size)));
static_cast<int>(::cuda::ceil_div(num_items, static_cast<OffsetT>(warpspeed_policy.tile_size())));

if (d_temp_storage == nullptr)
{
Expand Down Expand Up @@ -524,37 +547,19 @@ struct DispatchScan
// TODO(bgruber): we probably need to ensure alignment of d_temp_storage
_CCCL_ASSERT(::cuda::is_aligned(d_temp_storage, kernel_source.look_ahead_tile_state_alignment()), "");

using selector_smem_info_t = detail::scan::selector_smem_info<PolicySelectorT>;

auto scan_kernel = kernel_source.ScanKernel();
int smem_size_1_stage = 0;
if constexpr (selector_smem_info_t::has_static_layout)
{
CUB_DETAIL_CONSTEXPR_ISH int static_smem_size_1_stage = detail::scan::smem_for_stages(
warpspeed_policy,
1,
selector_smem_info_t::input_value_size,
selector_smem_info_t::input_value_alignment,
selector_smem_info_t::output_value_size,
selector_smem_info_t::output_value_alignment,
selector_smem_info_t::accum_size,
selector_smem_info_t::accum_alignment);
CUB_DETAIL_STATIC_ISH_ASSERT(static_smem_size_1_stage <= detail::max_smem_per_block,
"Single-stage warpspeed scan exceeds architecture independent SMEM (48KiB)");
smem_size_1_stage = static_smem_size_1_stage;
}
else
{
smem_size_1_stage = detail::scan::smem_for_stages(
warpspeed_policy,
1,
policy_selector.input_value_size,
policy_selector.input_value_alignment,
policy_selector.output_value_size,
policy_selector.output_value_alignment,
policy_selector.accum_size,
policy_selector.accum_alignment);
}
auto scan_kernel = kernel_source.ScanKernel();
[[maybe_unused]] auto kernel_src = kernel_source; // need to pull a copy to not access `this` during const. eval.
CUB_DETAIL_CONSTEXPR_ISH int smem_size_1_stage = detail::scan::smem_for_stages(
warpspeed_policy,
1,
static_cast<int>(kernel_src.InputSize()),
static_cast<int>(kernel_src.InputAlign()),
static_cast<int>(kernel_src.OutputSize()),
static_cast<int>(kernel_src.OutputAlign()),
static_cast<int>(kernel_src.AccumSize()),
static_cast<int>(kernel_src.AccumAlign()));
Comment on lines +555 to +560
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

my understanding is that everything here is at compile-time

Suggested change
static_cast<int>(kernel_src.InputSize()),
static_cast<int>(kernel_src.InputAlign()),
static_cast<int>(kernel_src.OutputSize()),
static_cast<int>(kernel_src.OutputAlign()),
static_cast<int>(kernel_src.AccumSize()),
static_cast<int>(kernel_src.AccumAlign()));
int{kernel_src.InputSize()},
int{kernel_src.InputAlign()},
int{kernel_src.OutputSize()},
int{kernel_src.OutputAlign()},
int{kernel_src.AccumSize()},
int{kernel_src.AccumAlign())};

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's only constexpr when called through the CUB API. It's just const when called through CCCL.C.

CUB_DETAIL_STATIC_ISH_ASSERT(smem_size_1_stage <= detail::max_smem_per_block,
"Single-stage warpspeed scan exceeds architecture independent SMEM (48KiB)");

int num_stages = 1;
int smem_size = smem_size_1_stage;
Expand All @@ -565,36 +570,19 @@ struct DispatchScan
// 1 CTA per SM +1 since it tends to improve performance
// TODO(bgruber): make the +1 a tuning parameter
const int max_stages_for_even_workload = static_cast<int>(
::cuda::ceil_div(num_items, static_cast<OffsetT>(sm_count * warpspeed_policy.tile_size)) + 1);
::cuda::ceil_div(num_items, static_cast<OffsetT>(sm_count * warpspeed_policy.tile_size())) + 1);

while (num_stages <= max_stages_for_even_workload)
{
const auto next_smem_size = [&] {
if constexpr (selector_smem_info_t::has_static_layout)
{
return detail::scan::smem_for_stages(
warpspeed_policy,
num_stages + 1,
selector_smem_info_t::input_value_size,
selector_smem_info_t::input_value_alignment,
selector_smem_info_t::output_value_size,
selector_smem_info_t::output_value_alignment,
selector_smem_info_t::accum_size,
selector_smem_info_t::accum_alignment);
}
else
{
return detail::scan::smem_for_stages(
warpspeed_policy,
num_stages + 1,
policy_selector.input_value_size,
policy_selector.input_value_alignment,
policy_selector.output_value_size,
policy_selector.output_value_alignment,
policy_selector.accum_size,
policy_selector.accum_alignment);
}
}();
const int next_smem_size = detail::scan::smem_for_stages(
warpspeed_policy,
num_stages + 1,
static_cast<int>(kernel_source.InputSize()),
static_cast<int>(kernel_source.InputAlign()),
static_cast<int>(kernel_source.OutputSize()),
static_cast<int>(kernel_source.OutputAlign()),
static_cast<int>(kernel_source.AccumSize()),
static_cast<int>(kernel_source.AccumAlign()));
if (next_smem_size > max_dynamic_smem_size)
{
// This number of stages failed, so stay at the current settings
Expand Down Expand Up @@ -647,7 +635,7 @@ struct DispatchScan

// Invoke scan kernel
{
const int block_dim = warpspeed_policy.num_total_threads;
const int block_dim = detail::scan::num_total_threads(warpspeed_policy);

# ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceScanKernel<<<%d, %d, %d, %lld>>>()\n", grid_dim, block_dim, smem_size, (long long) stream);
Expand Down Expand Up @@ -689,9 +677,8 @@ struct DispatchScan
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_MSVC(4702)

template <typename PolicyGetter, typename PolicySelectorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
__invoke(PolicyGetter policy_getter, [[maybe_unused]] const PolicySelectorT& policy_selector)
template <typename PolicyGetter>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t __invoke(PolicyGetter policy_getter)
{
CUB_DETAIL_CONSTEXPR_ISH auto active_policy = policy_getter();

Expand All @@ -702,12 +689,12 @@ struct DispatchScan
# if defined(CUB_DEFINE_RUNTIME_POLICIES)
if (kernel_source.use_warpspeed(active_policy))
{
return __invoke_lookahead_algorithm(policy_getter, policy_selector);
return __invoke_lookahead_algorithm(policy_getter);
}
# else
if CUB_DETAIL_CONSTEXPR_ISH (KernelSource::use_warpspeed(active_policy))
{
return __invoke_lookahead_algorithm(policy_getter, policy_selector);
return __invoke_lookahead_algorithm(policy_getter);
}
# endif
#endif // __cccl_ptx_isa >= 860
Expand Down Expand Up @@ -851,13 +838,7 @@ struct DispatchScan
}
};

using policy_selector_t = detail::scan::policy_selector_from_types<
detail::it_value_t<InputIteratorT>,
detail::it_value_t<OutputIteratorT>,
AccumT,
OffsetT,
ScanOpT>;
Comment on lines -854 to -859
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: This was incorrect, since it ignored the user provided policy hub.

return __invoke(policy_getter{}, policy_selector_t{});
return __invoke(policy_getter{});
}

/**
Expand Down Expand Up @@ -1027,7 +1008,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
-1 /* ptx_version, not used actually */,
kernel_source,
launcher_factory}
.__invoke(policy_getter, policy_selector);
.__invoke(policy_getter);
});
}

Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/kernels/kernel_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ template <typename PolicySelector, typename InputIteratorT, typename OutputItera
if constexpr (policy.warpspeed
&& detail::scan::use_warpspeed<InputIteratorT, OutputIteratorT, AccumT>(policy.warpspeed))
{
return policy.warpspeed.num_total_threads;
return num_total_threads(policy.warpspeed);
}
#endif // _CCCL_CUDACC_AT_LEAST(12, 8)
return policy.block_threads;
Expand Down
Loading
Loading