Skip to content

Commit

Permalink
Add multiset custom count APIs (#490)
Browse files Browse the repository at this point in the history
Closes #462, closes #488

This PR adds custom `count` APIs desired by libcudf hash join.

---------

Co-authored-by: Daniel Jünger <2955913+sleeepyjack@users.noreply.github.com>
  • Loading branch information
PointKernel and sleeepyjack committed May 28, 2024
1 parent 8f0f332 commit cd7b588
Show file tree
Hide file tree
Showing 9 changed files with 365 additions and 25 deletions.
37 changes: 30 additions & 7 deletions include/cuco/detail/open_addressing/kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find(InputIt first,
/**
* @brief Counts the occurrences of keys in `[first, last)` contained in the container
*
* @tparam IsOuter Flag indicating whether it's an outer count or not
* @tparam CGSize Number of threads in each CG
* @tparam BlockSize Number of threads in each block
* @tparam InputIt Device accessible input iterator
Expand All @@ -348,24 +349,46 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find(InputIt first,
* @param count Number of matches
* @param ref Non-owning container device ref used to access the slot storage
*/
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename AtomicT, typename Ref>
CUCO_KERNEL void count(InputIt first, cuco::detail::index_type n, AtomicT* count, Ref ref)
template <bool IsOuter,
int32_t CGSize,
int32_t BlockSize,
typename InputIt,
typename AtomicT,
typename Ref>
CUCO_KERNEL __launch_bounds__(BlockSize) void count(InputIt first,
cuco::detail::index_type n,
AtomicT* count,
Ref ref)
{
using BlockReduce = cub::BlockReduce<typename Ref::size_type, BlockSize>;
using size_type = typename Ref::size_type;

size_type constexpr outer_min_count = 1;

using BlockReduce = cub::BlockReduce<size_type, BlockSize>;
__shared__ typename BlockReduce::TempStorage temp_storage;
typename Ref::size_type thread_count = 0;
size_type thread_count = 0;

auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

while (idx < n) {
auto const key = *(first + idx);
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
if constexpr (CGSize == 1) {
thread_count += ref.count(key);
if constexpr (IsOuter) {
thread_count += max(ref.count(key), outer_min_count);
} else {
thread_count += ref.count(key);
}
} else {
auto const tile =
cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
thread_count += ref.count(tile, key);
if constexpr (IsOuter) {
auto temp_count = ref.count(tile, key);
if (tile.all(temp_count == 0) and tile.thread_rank() == 0) { ++temp_count; }
thread_count += temp_count;
} else {
thread_count += ref.count(tile, key);
}
}
idx += loop_stride;
}
Expand Down
78 changes: 64 additions & 14 deletions include/cuco/detail/open_addressing/open_addressing_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,7 @@ class open_addressing_impl {
* @brief Counts the occurrences of keys in `[first, last)` contained in the container
*
* @tparam Input Device accessible input iterator
* @tparam Ref Type of non-owning device container ref allowing access to storage
*
* @param first Beginning of the sequence of keys to count
* @param last End of the sequence of keys to count
Expand All @@ -542,22 +543,34 @@ class open_addressing_impl {
[[nodiscard]] size_type count(InputIt first,
InputIt last,
Ref container_ref,
cuda_stream_ref stream = {}) const noexcept
cuda_stream_ref stream) const noexcept
{
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return 0; }

auto counter =
detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
counter.reset(stream);

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::count<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, counter.data(), container_ref);
auto constexpr is_outer = false;
return this->count<is_outer>(first, last, container_ref, stream);
}

return counter.load_to_host(stream);
/**
* @brief Counts the occurrences of keys in `[first, last)` contained in the container
*
* @note If a given key has no matches, its occurrence is 1.
*
* @tparam Input Device accessible input iterator
* @tparam Ref Type of non-owning device container ref allowing access to storage
*
* @param first Beginning of the sequence of keys to count
* @param last End of the sequence of keys to count
* @param stream CUDA stream used for count
*
* @return The sum of total occurrences of all keys in `[first, last)`
*/
template <typename InputIt, typename Ref>
[[nodiscard]] size_type count_outer(InputIt first,
InputIt last,
Ref container_ref,
cuda_stream_ref stream) const noexcept
{
auto constexpr is_outer = true;
return this->count<is_outer>(first, last, container_ref, stream);
}

/**
Expand Down Expand Up @@ -812,6 +825,43 @@ class open_addressing_impl {
[[nodiscard]] constexpr storage_ref_type storage_ref() const noexcept { return storage_.ref(); }

private:
/**
* @brief Counts the occurrences of keys in `[first, last)` contained in the container
*
* @note If `IsOuter` is `true`, the occurrence of a non-match key is 1. Else, it's 0.
*
* @tparam IsOuter Flag indicating whether it's an outer count or not
* @tparam Input Device accessible input iterator
* @tparam Ref Type of non-owning device container ref allowing access to storage
*
* @param first Beginning of the sequence of keys to count
* @param last End of the sequence of keys to count
* @param stream CUDA stream used for count
*
* @return The sum of total occurrences of all keys in `[first, last)`
*/
template <bool IsOuter, typename InputIt, typename Ref>
[[nodiscard]] size_type count(InputIt first,
InputIt last,
Ref container_ref,
cuda_stream_ref stream) const noexcept
{
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return 0; }

auto counter =
detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
counter.reset(stream);

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::count<IsOuter, cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, counter.data(), container_ref);

return counter.load_to_host(stream);
}

/**
* @brief Extracts the key from a given slot.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -808,7 +808,7 @@ class open_addressing_ref_impl {
/**
* @brief Counts the occurrence of a given key contained in the container
*
* @tparam ProbeKey Input type
* @tparam ProbeKey Probe key type
*
* @param key The key to count for
*
Expand Down Expand Up @@ -843,7 +843,7 @@ class open_addressing_ref_impl {
/**
* @brief Counts the occurrence of a given key contained in the container
*
* @tparam ProbeKey Input type which is convertible to 'key_type'
* @tparam ProbeKey Probe key type
*
* @param group The Cooperative Group used to perform group count
* @param key The key to count for
Expand Down Expand Up @@ -871,7 +871,6 @@ class open_addressing_ref_impl {
}();

if (group.any(state == detail::equal_result::EMPTY)) { return count; }

++probing_iter;
}
}
Expand Down
45 changes: 45 additions & 0 deletions include/cuco/detail/static_multiset/static_multiset.inl
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,51 @@ static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>
return impl_->count(first, last, ref(op::count), stream);
}

template <class Key,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
template <typename InputIt, typename ProbeKeyEqual, typename ProbeHash>
static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::count(
InputIt first,
InputIt last,
ProbeKeyEqual const& probe_key_equal,
ProbeHash const& probe_hash,
cuda_stream_ref stream) const noexcept
{
return impl_->count(first,
last,
ref(op::count).with_key_eq(probe_key_equal).with_hash_function(probe_hash),
stream);
}

template <class Key,
class Extent,
cuda::thread_scope Scope,
class KeyEqual,
class ProbingScheme,
class Allocator,
class Storage>
template <typename InputIt, typename ProbeKeyEqual, typename ProbeHash>
static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::count_outer(
InputIt first,
InputIt last,
ProbeKeyEqual const& probe_key_equal,
ProbeHash const& probe_hash,
cuda_stream_ref stream) const noexcept
{
return impl_->count_outer(
first,
last,
ref(op::count).with_key_eq(probe_key_equal).with_hash_function(probe_hash),
stream);
}

template <class Key,
class Extent,
cuda::thread_scope Scope,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,7 @@ class operator_impl<
/**
* @brief Counts the occurrence of a given key contained in multiset
*
* @tparam ProbeKey Input type
* @tparam ProbeKey Probe key type
*
* @param group The Cooperative Group used to perform group count
* @param key The key to count for
Expand Down
29 changes: 29 additions & 0 deletions include/cuco/detail/utility/cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

#pragma once

#include <cuco/detail/error.hpp>
#include <cuco/detail/utility/math.hpp>

namespace cuco {
Expand Down Expand Up @@ -45,5 +46,33 @@ constexpr auto grid_size(index_type num,
return int_div_ceil(cg_size * num, stride * block_size);
}

/**
* @brief Computes the ideal 1D grid size with the given parameters
*
* @tparam Kernel Kernel type
*
* @param block_size Number of threads in each thread block
* @param kernel CUDA kernel to launch
* @param dynamic_shm_size Dynamic shared memory size
*
* @return The grid size that delivers the highest occupancy
*/
template <typename Kernel>
constexpr auto max_occupancy_grid_size(int32_t block_size,
Kernel kernel,
std::size_t dynamic_shm_size = 0)
{
int32_t device = 0;
CUCO_CUDA_TRY(cudaGetDevice(&device));
cudaDeviceProp device_props;
CUCO_CUDA_TRY(cudaGetDeviceProperties(&device_props, device));
int32_t num_multiprocessors = device_props.multiProcessorCount;
int32_t max_active_blocks_per_multiprocessor;
CUCO_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_multiprocessor, kernel, block_size, dynamic_shm_size));

return max_active_blocks_per_multiprocessor * num_multiprocessors;
}

} // namespace detail
} // namespace cuco
52 changes: 52 additions & 0 deletions include/cuco/static_multiset.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,8 @@ class static_multiset {
/**
* @brief Counts the occurrences of keys in `[first, last)` contained in the multiset
*
* @note This function synchronizes the given stream.
*
* @tparam Input Device accessible input iterator
*
* @param first Beginning of the sequence of keys to count
Expand All @@ -487,6 +489,56 @@ class static_multiset {
template <typename InputIt>
size_type count(InputIt first, InputIt last, cuda_stream_ref stream = {}) const noexcept;

/**
* @brief Counts the occurrences of keys in `[first, last)` contained in the multiset
*
* @note This function synchronizes the given stream.
*
* @tparam Input Device accessible input iterator
* @tparam ProbeKeyEqual Binary callable
* @tparam ProbeHash Unary hash callable
*
* @param first Beginning of the sequence of keys to count
* @param last End of the sequence of keys to count
* @param probe_key_equal Binary callable to compare two keys for equality
* @param probe_hash Unary callable to hash a given key
* @param stream CUDA stream used for count
*
* @return The sum of total occurrences of all keys in `[first, last)`
*/
template <typename InputIt, typename ProbeKeyEqual, typename ProbeHash>
size_type count(InputIt first,
InputIt last,
ProbeKeyEqual const& probe_key_equal,
ProbeHash const& probe_hash,
cuda_stream_ref stream = {}) const noexcept;

/**
* @brief Counts the occurrences of keys in `[first, last)` contained in the multiset
*
* @note This function synchronizes the given stream.
* @note If a given key has no matches, its occurrence is 1.
*
* @tparam Input Device accessible input iterator
* @tparam ProbeKeyEqual Binary callable
* @tparam ProbeHash Unary hash callable
*
* @param first Beginning of the sequence of keys to count
* @param last End of the sequence of keys to count
* @param probe_key_equal Binary callable to compare two keys for equality
* @param probe_hash Unary callable to hash a given key
* @param stream CUDA stream used for count
*
* @return The sum of total occurrences of all keys in `[first, last)` where keys have no matches
* are considered to have a single occurrence.
*/
template <typename InputIt, typename ProbeKeyEqual, typename ProbeHash>
size_type count_outer(InputIt first,
InputIt last,
ProbeKeyEqual const& probe_key_equal,
ProbeHash const& probe_hash,
cuda_stream_ref stream = {}) const noexcept;

/**
* @brief Gets the number of elements in the container.
*
Expand Down
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ ConfigureTest(DYNAMIC_MAP_TEST
ConfigureTest(STATIC_MULTISET_TEST
static_multiset/contains_test.cu
static_multiset/count_test.cu
static_multiset/custom_count_test.cu
static_multiset/find_test.cu
static_multiset/insert_test.cu)

Expand Down
Loading

0 comments on commit cd7b588

Please sign in to comment.