Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .clangd
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ CompileFlags:
Add:
- -x
- cuda
- --offload-arch=sm_90
- -Wno-unknown-cuda-version
- -Wno-pragma-system-header-outside-header
- --no-cuda-version-check
Expand Down
149 changes: 131 additions & 18 deletions cub/benchmarks/bench/segmented_topk/fixed/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,35 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <cub/detail/choose_offset.cuh>
#include <cub/device/device_topk.cuh>
#include <cub/device/dispatch/dispatch_batched_topk.cuh>
#include <cub/device/dispatch/dispatch_batched_topk_cluster.cuh>

#include <cuda/__execution/determinism.h>
#include <cuda/__execution/output_ordering.h>
#include <cuda/__execution/require.h>
#include <cuda/iterator>
#include <cuda/std/__execution/env.h>
#include <cuda/stream>

#include <algorithm>
#include <vector>

#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 1:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_BLOCK_LOAD_ALGORITHM ld 0:2:1

enum class topk_backend
{
baseline,
cluster,
device,
};

inline constexpr topk_backend selected_backend = topk_backend::baseline;

#if !TUNE_BASE
struct tuned_policy_selector
{
Expand Down Expand Up @@ -39,6 +58,109 @@ struct tuned_policy_selector
};
#endif // !TUNE_BASE

template <typename KeyInputItItT,
typename KeyOutputItItT,
typename SegmentSizeParamT,
typename KParamT,
typename SelectDirectionParamT,
typename NumSegmentsParameterT,
typename TotalNumItemsGuaranteeT,
typename HostSegSizeT>
CUB_RUNTIME_FUNCTION static cudaError_t batched_topk_keys(
void* d_temp_storage,
size_t& temp_storage_bytes,
KeyInputItItT d_keys_in,
KeyOutputItItT d_keys_out,
SegmentSizeParamT segment_sizes,
KParamT k,
SelectDirectionParamT select_directions,
NumSegmentsParameterT num_segments,
TotalNumItemsGuaranteeT total_num_items,
const HostSegSizeT* h_segment_sizes,
cudaStream_t stream = nullptr)
{
if constexpr (selected_backend == topk_backend::cluster)
{
(void) h_segment_sizes;
return cub::detail::batched_topk_cluster::dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_keys_out,
segment_sizes,
k,
select_directions,
num_segments,
total_num_items,
stream);
}
else if constexpr (selected_backend == topk_backend::device)
{
using num_segments_val_t = typename NumSegmentsParameterT::value_type;
const auto num_segs = num_segments.get_param(num_segments_val_t{0});

auto requirements = cuda::execution::require(
cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);

if (d_temp_storage == nullptr)
{
const auto max_size = *std::max_element(h_segment_sizes, h_segment_sizes + num_segs);
const auto k_value = k.get_param(num_segments_val_t{0});
return cub::DeviceTopK::MaxKeys(
nullptr,
temp_storage_bytes,
d_keys_in[num_segments_val_t{0}],
d_keys_out[num_segments_val_t{0}],
static_cast<cuda::std::int64_t>(max_size),
static_cast<cuda::std::int64_t>(k_value),
cuda::std::execution::env{requirements});
}

cuda::stream_ref stream_ref{stream};
auto env = cuda::std::execution::env{stream_ref, requirements};
for (num_segments_val_t i = 0; i < num_segs; ++i)
{
const auto k_value = k.get_param(i);
const auto seg_size = h_segment_sizes[i];
if (const auto err = cub::DeviceTopK::MaxKeys(
d_temp_storage,
temp_storage_bytes,
d_keys_in[i],
d_keys_out[i],
static_cast<cuda::std::int64_t>(seg_size),
static_cast<cuda::std::int64_t>(k_value),
env);
err != cudaSuccess)
{
return err;
}
}
return cudaSuccess;
}
else
{
(void) h_segment_sizes;
return cub::detail::batched_topk::dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_keys_out,
static_cast<cub::NullType**>(nullptr),
static_cast<cub::NullType**>(nullptr),
segment_sizes,
k,
select_directions,
num_segments,
total_num_items,
stream
#if !TUNE_BASE
,
tuned_policy_selector{}
#endif // !TUNE_BASE
);
}
}

template <typename KeyT, int MaxSegmentSize, int MaxNumSelected>
void fixed_seg_size_topk_keys(
nvbench::state& state,
Expand Down Expand Up @@ -97,50 +219,41 @@ void fixed_seg_size_topk_keys(
state.add_global_memory_reads<KeyT>(elements, "InputKeys");
state.add_global_memory_writes<KeyT>(selected_elements * num_segments, "OutputKeys");

// Host copy of segment sizes — all entries equal MaxSegmentSize for fixed-size segments.
std::vector<cuda::std::int64_t> h_segment_sizes(num_segments, static_cast<cuda::std::int64_t>(MaxSegmentSize));

// allocate temporary storage
size_t temp_size;
cub::detail::batched_topk::dispatch(
batched_topk_keys(
nullptr,
temp_size,
d_keys_in,
d_keys_out,
static_cast<cub::NullType**>(nullptr),
static_cast<cub::NullType**>(nullptr),
segment_sizes,
k,
select_directions,
num_segments_uniform_t{static_cast<::cuda::std::int64_t>(num_segments)},
total_num_items,
nullptr
#if !TUNE_BASE
,
tuned_policy_selector{}
#endif // !TUNE_BASE
);
h_segment_sizes.data(),
nullptr);

thrust::device_vector<nvbench::uint8_t> temp(temp_size, thrust::no_init);
auto* temp_storage = thrust::raw_pointer_cast(temp.data());

// run the algorithm
state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
cub::detail::batched_topk::dispatch(
batched_topk_keys(
temp_storage,
temp_size,
d_keys_in,
d_keys_out,
static_cast<cub::NullType**>(nullptr),
static_cast<cub::NullType**>(nullptr),
segment_sizes,
k,
select_directions,
num_segments_uniform_t{static_cast<::cuda::std::int64_t>(num_segments)},
total_num_items,
launch.get_stream()
#if !TUNE_BASE
,
tuned_policy_selector{}
#endif // !TUNE_BASE
);
h_segment_sizes.data(),
launch.get_stream());
});
}

Expand Down
131 changes: 125 additions & 6 deletions cub/benchmarks/bench/segmented_topk/variable/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,124 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <cub/detail/choose_offset.cuh>
#include <cub/device/device_topk.cuh>
#include <cub/device/dispatch/dispatch_batched_topk.cuh>
#include <cub/device/dispatch/dispatch_batched_topk_cluster.cuh>

#include <cuda/__execution/determinism.h>
#include <cuda/__execution/output_ordering.h>
#include <cuda/__execution/require.h>
#include <cuda/std/__execution/env.h>
#include <cuda/stream>

#include <algorithm>

enum class topk_backend
{
baseline,
cluster,
device,
};

inline constexpr topk_backend selected_backend = topk_backend::baseline;

template <typename KeyInputItItT,
typename KeyOutputItItT,
typename SegmentSizeParamT,
typename KParamT,
typename SelectDirectionParamT,
typename NumSegmentsParameterT,
typename TotalNumItemsGuaranteeT,
typename HostSegSizeT>
CUB_RUNTIME_FUNCTION static cudaError_t batched_topk_keys(
void* d_temp_storage,
size_t& temp_storage_bytes,
KeyInputItItT d_keys_in,
KeyOutputItItT d_keys_out,
SegmentSizeParamT segment_sizes,
KParamT k,
SelectDirectionParamT select_directions,
NumSegmentsParameterT num_segments,
TotalNumItemsGuaranteeT total_num_items,
const HostSegSizeT* h_segment_sizes,
cudaStream_t stream = nullptr)
{
if constexpr (selected_backend == topk_backend::cluster)
{
(void) h_segment_sizes;
return cub::detail::batched_topk_cluster::dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_keys_out,
segment_sizes,
k,
select_directions,
num_segments,
total_num_items,
stream);
}
else if constexpr (selected_backend == topk_backend::device)
{
using num_segments_val_t = typename NumSegmentsParameterT::value_type;
const auto num_segs = num_segments.get_param(num_segments_val_t{0});

auto requirements = cuda::execution::require(
cuda::execution::determinism::not_guaranteed, cuda::execution::output_ordering::unsorted);

if (d_temp_storage == nullptr)
{
const auto max_size = *std::max_element(h_segment_sizes, h_segment_sizes + num_segs);
const auto k_value = k.get_param(num_segments_val_t{0});
return cub::DeviceTopK::MaxKeys(
nullptr,
temp_storage_bytes,
d_keys_in[num_segments_val_t{0}],
d_keys_out[num_segments_val_t{0}],
static_cast<cuda::std::int64_t>(max_size),
static_cast<cuda::std::int64_t>(k_value),
cuda::std::execution::env{requirements});
}

cuda::stream_ref stream_ref{stream};
auto env = cuda::std::execution::env{stream_ref, requirements};
for (num_segments_val_t i = 0; i < num_segs; ++i)
{
const auto k_value = k.get_param(i);
const auto seg_size = h_segment_sizes[i];
if (const auto err = cub::DeviceTopK::MaxKeys(
d_temp_storage,
temp_storage_bytes,
d_keys_in[i],
d_keys_out[i],
static_cast<cuda::std::int64_t>(seg_size),
static_cast<cuda::std::int64_t>(k_value),
env);
err != cudaSuccess)
{
return err;
}
}
return cudaSuccess;
}
else
{
(void) h_segment_sizes;
return cub::detail::batched_topk::dispatch(
d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_keys_out,
static_cast<cub::NullType**>(nullptr),
static_cast<cub::NullType**>(nullptr),
segment_sizes,
k,
select_directions,
num_segments,
total_num_items,
stream);
}
}

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
Expand Down Expand Up @@ -197,37 +314,39 @@ void variable_seg_size_topk_keys(nvbench::state& state,
state.add_global_memory_reads<KeyT>(input_elements, "InputKeys");
state.add_global_memory_writes<KeyT>(output_elements, "OutputKeys");

// Host copy of segment sizes — consumed by the per-segment device backend.
std::vector<cuda::std::int64_t> h_segment_sizes(static_cast<std::size_t>(num_segments));
thrust::copy(d_segment_sizes.begin(), d_segment_sizes.end(), h_segment_sizes.begin());

size_t temp_size{};
cub::detail::batched_topk::dispatch(
batched_topk_keys(
nullptr,
temp_size,
d_keys_in,
d_keys_out,
static_cast<cub::NullType**>(nullptr),
static_cast<cub::NullType**>(nullptr),
segment_sizes_param,
k_param,
select_directions,
num_segments_uniform_param,
total_num_items,
h_segment_sizes.data(),
nullptr);

thrust::device_vector<nvbench::uint8_t> temp(temp_size, thrust::no_init);
auto* temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
cub::detail::batched_topk::dispatch(
batched_topk_keys(
temp_storage,
temp_size,
d_keys_in,
d_keys_out,
static_cast<cub::NullType**>(nullptr),
static_cast<cub::NullType**>(nullptr),
segment_sizes_param,
k_param,
select_directions,
num_segments_uniform_param,
total_num_items,
h_segment_sizes.data(),
launch.get_stream());
});
}
Expand Down
Loading
Loading