Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 37 additions & 13 deletions cub/cub/device/dispatch/kernels/transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -202,14 +202,18 @@ _CCCL_HOST_DEVICE _CCCL_CONSTEVAL auto load_store_type()
}
}

template <typename VectorizedPolicy, typename Offset, typename F, typename RandomAccessIteratorOut, typename... InputT>
template <typename VectorizedPolicy,
typename Offset,
typename F,
typename RandomAccessIteratorOut,
typename... RandomAccessIteratorsIn>
_CCCL_DEVICE void transform_kernel_vectorized(
Offset num_items,
int num_elem_per_thread_prefetch,
bool can_vectorize,
F f,
RandomAccessIteratorOut out,
const InputT*... ins)
RandomAccessIteratorsIn... ins)
{
constexpr int block_dim = VectorizedPolicy::block_threads;
constexpr int items_per_thread = VectorizedPolicy::items_per_thread_vectorized;
Expand Down Expand Up @@ -240,9 +244,12 @@ _CCCL_DEVICE void transform_kernel_vectorized(
constexpr int load_store_size = VectorizedPolicy::load_store_word_size;
using load_store_t = decltype(load_store_type<load_store_size>());
using output_t = it_value_t<RandomAccessIteratorOut>;
using result_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<F, const InputT&...>>;
using result_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<F, const it_value_t<RandomAccessIteratorsIn>&...>>;
// picks output type size if there are no inputs
constexpr int element_size = int{first_item(sizeof(InputT)..., size_of<output_t>)};
constexpr int element_size = int{first_nonzero_value(
(sizeof(it_value_t<RandomAccessIteratorsIn>)
* THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
size_of<output_t>)};
Comment on lines +249 to +252
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should really pull that out into a function

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am strongly considering to refactor the entire mess, so let's postpone any small fixes for now.

constexpr int load_store_count = (items_per_thread * element_size) / load_store_size;

static_assert((items_per_thread * element_size) % load_store_size == 0);
Expand All @@ -258,18 +265,35 @@ _CCCL_DEVICE void transform_kernel_vectorized(

auto provide_array = [&](auto... inputs) {
// load inputs
// TODO(bgruber): we could support fancy iterators for loading here as well (and only vectorize some inputs)
[[maybe_unused]] auto load_tile_vectorized = [&](auto* in, auto& input) {
auto in_vec = reinterpret_cast<const load_store_t*>(in);
auto input_vec = reinterpret_cast<load_store_t*>(input.data());
_CCCL_PRAGMA_UNROLL_FULL()
for (int i = 0; i < load_store_count; ++i)
[[maybe_unused]] auto load_tile = [](auto in, auto& input) {
if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<decltype(in)>)
{
input_vec[i] = in_vec[i * VectorizedPolicy::block_threads + threadIdx.x];
auto in_vec = reinterpret_cast<const load_store_t*>(in) + threadIdx.x;
auto input_vec = reinterpret_cast<load_store_t*>(input.data());
_CCCL_PRAGMA_UNROLL_FULL()
for (int i = 0; i < load_store_count; ++i)
{
input_vec[i] = in_vec[i * VectorizedPolicy::block_threads];
}
}
else
{
constexpr int elems = load_store_size / element_size;
in += threadIdx.x * elems;
_CCCL_PRAGMA_UNROLL_FULL()
for (int i = 0; i < load_store_count; ++i)
{
_CCCL_PRAGMA_UNROLL_FULL()
for (int j = 0; j < elems; ++j)
{
input[i * elems + j] = in[i * elems * VectorizedPolicy::block_threads + j];
}
}
}
};
_CCCL_PDL_GRID_DEPENDENCY_SYNC();
(load_tile_vectorized(ins, inputs), ...);
(load_tile(ins, inputs), ...);

// Benchmarks showed up to 38% slowdown on H200 (some improvements as well), so omitted. See #5249 for details.
// _CCCL_PDL_TRIGGER_NEXT_LAUNCH();

Expand All @@ -280,7 +304,7 @@ _CCCL_DEVICE void transform_kernel_vectorized(
output[i] = f(inputs[i]...);
}
};
provide_array(uninitialized_array<InputT, items_per_thread>{}...);
provide_array(uninitialized_array<it_value_t<RandomAccessIteratorsIn>, items_per_thread>{}...);

// write output
if constexpr (can_vectorize_store)
Expand Down
74 changes: 55 additions & 19 deletions cub/cub/device/dispatch/tuning/tuning_transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -282,21 +282,45 @@ _CCCL_HOST_DEVICE constexpr int arch_to_min_bytes_in_flight(int sm_arch)
return 12 * 1024; // V100 and below
}

template <typename T, typename... Ts>
_CCCL_HOST_DEVICE constexpr bool all_equal([[maybe_unused]] T head, Ts... tail)
template <typename H, typename... Ts>
_CCCL_HOST_DEVICE constexpr bool all_nonzero_equal(H head, Ts... values)
{
return ((head == tail) && ...);
size_t first = 0;
for (size_t v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
{
if (v == 0)
{
continue;
}
if (first == 0)
{
first = v;
}
else if (v != first)
{
return false;
}
}
return true;
}

_CCCL_HOST_DEVICE constexpr bool all_equal()
_CCCL_HOST_DEVICE constexpr bool all_nonzero_equal()
{
return true;
}

template <typename T, typename... Ts>
_CCCL_HOST_DEVICE constexpr auto first_item(T head, Ts...) -> T
template <typename H, typename... Ts>
_CCCL_HOST_DEVICE constexpr auto first_nonzero_value(H head, Ts... values)
{
return head;
for (auto v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
{
if (v != 0)
{
return v;
}
}
// we only reach here when all input are not contiguous and the output has a void value type
return H{1};
}

template <typename T>
Expand Down Expand Up @@ -336,25 +360,36 @@ struct policy_hub<RequiresStableAddress,
(THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn> && ...);
static constexpr bool all_input_values_trivially_reloc =
(THRUST_NS_QUALIFIER::is_trivially_relocatable_v<it_value_t<RandomAccessIteratorsIn>> && ...);
static constexpr bool can_memcpy_inputs = all_inputs_contiguous && all_input_values_trivially_reloc;
static constexpr bool can_memcpy_all_inputs = all_inputs_contiguous && all_input_values_trivially_reloc;
// the vectorized kernel supports mixing contiguous and non-contiguous iterators
static constexpr bool can_memcpy_contiguous_inputs =
((!THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>
|| THRUST_NS_QUALIFIER::is_trivially_relocatable_v<it_value_t<RandomAccessIteratorsIn>>)
&& ...);
Comment on lines +365 to +368
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should have a trait for that, I guess it will come up more often


// for vectorized policy:
static constexpr bool all_input_values_same_size = all_equal(sizeof(it_value_t<RandomAccessIteratorsIn>)...);
static constexpr int load_store_word_size = 8; // TODO(bgruber): make this 16, and 32 on Blackwell+
// if there are no inputs, we take the size of the output value
static constexpr int value_type_size =
first_item(int{sizeof(it_value_t<RandomAccessIteratorsIn>)}..., int{size_of<it_value_t<RandomAccessIteratorOut>>});
static constexpr bool all_contiguous_input_values_same_size = all_nonzero_equal(
(sizeof(it_value_t<RandomAccessIteratorsIn>)
* THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...);
static constexpr int load_store_word_size = 8; // TODO(bgruber): make this 16, and 32 on Blackwell+
// find the value type size of the first contiguous iterator. if there are no inputs, we take the size of the output
// value type
static constexpr int contiguous_value_type_size = first_nonzero_value(
(int{sizeof(it_value_t<RandomAccessIteratorsIn>)}
* THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
int{size_of<it_value_t<RandomAccessIteratorOut>>});
Comment on lines +378 to +380
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🙀 I believe those warrant a slightly more elaborate comment

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this needs a bigger refactoring in general

static constexpr bool value_type_divides_load_store_size =
load_store_word_size % value_type_size == 0; // implicitly checks that value_type_size <= load_store_word_size
load_store_word_size % contiguous_value_type_size == 0; // implicitly checks that value_type_size <=
// load_store_word_size
static constexpr int target_bytes_per_thread =
no_input_streams ? 16 /* by experiment on RTX 5090 */ : 32 /* guestimate by gevtushenko for loading */;
static constexpr int items_per_thread_vec =
::cuda::round_up(target_bytes_per_thread, load_store_word_size) / value_type_size;
::cuda::round_up(target_bytes_per_thread, load_store_word_size) / contiguous_value_type_size;
using default_vectorized_policy_t = vectorized_policy_t<256, items_per_thread_vec, load_store_word_size>;

static constexpr bool fallback_to_prefetch =
RequiresStableAddress || !can_memcpy_inputs || !all_input_values_same_size || !value_type_divides_load_store_size
|| !DenseOutput;
RequiresStableAddress || !can_memcpy_contiguous_inputs || !all_contiguous_input_values_same_size
|| !value_type_divides_load_store_size || !DenseOutput;

// TODO(bgruber): consider a separate kernel for just filling

Expand All @@ -380,7 +415,7 @@ struct policy_hub<RequiresStableAddress,
block_threads* async_policy::min_items_per_thread,
ldgsts_size_and_align)
> int{max_smem_per_block};
static constexpr bool fallback_to_vectorized = exhaust_smem || no_input_streams;
static constexpr bool fallback_to_vectorized = exhaust_smem || no_input_streams || !can_memcpy_all_inputs;

public:
static constexpr int min_bif = arch_to_min_bytes_in_flight(800);
Expand Down Expand Up @@ -421,7 +456,8 @@ struct policy_hub<RequiresStableAddress,
(((int{sizeof(it_value_t<RandomAccessIteratorsIn>)} * AsyncBlockSize) % max_alignment == 0) && ...);
static constexpr bool enough_threads_for_peeling = AsyncBlockSize >= alignment; // head and tail bytes
static constexpr bool fallback_to_vectorized =
exhaust_smem || !tile_sizes_retain_alignment || !enough_threads_for_peeling || no_input_streams;
exhaust_smem || !tile_sizes_retain_alignment || !enough_threads_for_peeling || no_input_streams
|| !can_memcpy_all_inputs;

public:
static constexpr int min_bif = arch_to_min_bytes_in_flight(PtxVersion);
Expand Down
36 changes: 29 additions & 7 deletions thrust/benchmarks/bench/tabulate/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,36 @@

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sequence.h>
#include <thrust/tabulate.h>

#include <nvbench_helper.cuh>

#include "thrust/detail/raw_pointer_cast.h"

template <typename T>
static void sequence(nvbench::state& state, nvbench::type_list<T>)
{
const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));

thrust::device_vector<T> output(elements, thrust::no_init);

state.add_element_count(elements);
state.add_global_memory_writes<T>(elements);

caching_allocator_t alloc;
state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
[&](nvbench::launch& launch) {
// sequence is implemented via thrust::tabulate
thrust::sequence(policy(alloc, launch), output.begin(), output.end());
});
}

NVBENCH_BENCH_TYPES(sequence, NVBENCH_TYPE_AXES(integral_types))
.set_name("sequence")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));

template <class T>
struct seg_size_t
{
Expand All @@ -41,17 +65,17 @@ struct seg_size_t
template <class OffsetT>
__device__ T operator()(OffsetT i)
{
return d_offsets[i + 1] - d_offsets[i];
return static_cast<T>(d_offsets[i + 1] - d_offsets[i]);
}
};

template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
static void seg_size(nvbench::state& state, nvbench::type_list<T>)
{
const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));

thrust::device_vector<T> input(elements + 1);
thrust::device_vector<T> output(elements);
thrust::device_vector<T> output(elements, thrust::no_init);

state.add_element_count(elements);
state.add_global_memory_reads<T>(elements + 1);
Expand All @@ -65,9 +89,7 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
});
}

using types = nvbench::type_list<nvbench::uint32_t, nvbench::uint64_t>;

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(types))
.set_name("base")
NVBENCH_BENCH_TYPES(seg_size, NVBENCH_TYPE_AXES(integral_types))
.set_name("seg_size")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
30 changes: 8 additions & 22 deletions thrust/thrust/system/cuda/detail/tabulate.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,37 +39,23 @@
#if _CCCL_HAS_CUDA_COMPILER()
# include <thrust/system/cuda/config.h>

# include <thrust/distance.h>
# include <thrust/system/cuda/detail/parallel_for.h>
# include <thrust/system/cuda/detail/transform.h>
# include <thrust/system/cuda/execution_policy.h>

# include <cuda/__functional/address_stability.h>
# include <cuda/std/iterator>

THRUST_NAMESPACE_BEGIN
namespace cuda_cub
{
namespace __tabulate
{
template <class Iterator, class TabulateOp>
struct functor
{
Iterator items;
TabulateOp op;

template <typename Size>
void _CCCL_DEVICE operator()(Size idx)
{
items[idx] = op(idx);
}
};
} // namespace __tabulate

template <class Derived, class Iterator, class TabulateOp>
void _CCCL_HOST_DEVICE tabulate(execution_policy<Derived>& policy, Iterator first, Iterator last, TabulateOp tabulate_op)
{
using size_type = thrust::detail::it_difference_t<Iterator>;
size_type count = ::cuda::std::distance(first, last);
cuda_cub::parallel_for(policy, __tabulate::functor<Iterator, TabulateOp>{first, tabulate_op}, count);
using size_type = ::cuda::std::iter_difference_t<Iterator>;
const auto count = ::cuda::std::distance(first, last);
cuda_cub::transform_n(
policy, ::cuda::counting_iterator<size_type>{}, count, first, ::cuda::proclaim_copyable_arguments(tabulate_op));
}

} // namespace cuda_cub
THRUST_NAMESPACE_END
#endif
Loading