Merge pull request #694 from senior-zero/enh-main/github/histogram_tu…

…ning Add policy parameter to allow tuning
NVIDIA · May 27, 2023 · e6eb419 · e6eb419
2 parents 562cf94 + 3567ba0
commit e6eb419
Show file tree

Hide file tree

Showing 38 changed files with 5,202 additions and 2,180 deletions.
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -11,29 +11,28 @@
 - reduce
   - sum
   - max
+  - by_key
 - scan
   - sum
   - max
+  - by key
 - select
   - flagged
   - if
 - partition
   - flagged
   - if
-- scan 
-  - by key
-
-### TODO
-
-- adjacent difference 
-  - left
-  - right
-- histogram : needs policy
+- histogram 
   - even
   - range
-- rle : needs policy
+  - multi even
+  - multi range
+- rle 
   - encode
   - non trivial runs
-- reduce : needs policy
-  - by key
+- adjacent difference 
+  - left
+
+### TODO
+
 - segmented
diff --git a/benchmarks/bench/adjacent_difference/subtract_left.cu b/benchmarks/bench/adjacent_difference/subtract_left.cu
@@ -0,0 +1,121 @@
+/******************************************************************************
+ * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/device/device_adjacent_difference.cuh>
+
+#include <nvbench_helper.cuh>
+
+// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
+// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
+
+#if !TUNE_BASE
+struct policy_hub_t
+{
+  struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>
+  {
+    using AdjacentDifferencePolicy =
+      cub::AgentAdjacentDifferencePolicy<TUNE_THREADS_PER_BLOCK,
+                                         TUNE_ITEMS_PER_THREAD,
+                                         cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                                         cub::LOAD_CA,
+                                         cub::BLOCK_STORE_WARP_TRANSPOSE>;
+  };
+
+  using MaxPolicy = Policy350;
+};
+#endif // !TUNE_BASE
+
+template <class T, class OffsetT>
+void adjacent_difference(nvbench::state& state, nvbench::type_list<T, OffsetT>)
+{
+  constexpr bool may_alias = false;
+  constexpr bool read_left = true;
+
+  using input_it_t = const T*;
+  using output_it_t = T*;
+  using difference_op_t = cub::Difference;
+  using offset_t = typename cub::detail::ChooseOffsetT<OffsetT>::Type;
+
+#if !TUNE_BASE
+  using dispatch_t = cub::DispatchAdjacentDifference<input_it_t,
+                                                     output_it_t,
+                                                     difference_op_t,
+                                                     offset_t,
+                                                     may_alias,
+                                                     read_left,
+                                                     policy_hub_t>;
+#else
+  using dispatch_t = cub::DispatchAdjacentDifference<input_it_t,
+                                                     output_it_t,
+                                                     difference_op_t,
+                                                     offset_t,
+                                                     may_alias,
+                                                     read_left>;
+#endif // TUNE_BASE
+
+  const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
+  thrust::device_vector<T> in(elements);
+  thrust::device_vector<T> out(elements);
+  gen(seed_t{}, in);
+
+  input_it_t d_in   = thrust::raw_pointer_cast(in.data());
+  output_it_t d_out = thrust::raw_pointer_cast(out.data());
+
+  state.add_element_count(elements);
+  state.add_global_memory_reads<T>(elements);
+  state.add_global_memory_writes<T>(elements);
+
+  std::size_t temp_storage_bytes{};
+  dispatch_t::Dispatch(nullptr,
+                       temp_storage_bytes,
+                       d_in,
+                       d_out,
+                       static_cast<offset_t>(elements),
+                       difference_op_t{},
+                       0);
+
+  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
+  std::uint8_t* d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
+
+  state.exec([&](nvbench::launch &launch) {
+    dispatch_t::Dispatch(d_temp_storage,
+                         temp_storage_bytes,
+                         d_in,
+                         d_out,
+                         static_cast<offset_t>(elements),
+                         difference_op_t{},
+                         launch.get_stream());
+  });
+}
+
+
+using types = nvbench::type_list<int32_t>;
+
+NVBENCH_BENCH_TYPES(adjacent_difference, NVBENCH_TYPE_AXES(types, offset_types))
+  .set_name("cub::DeviceAdjacentDifference::SubtractLeftCopy")
+  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
diff --git a/benchmarks/bench/histogram/even.cu b/benchmarks/bench/histogram/even.cu
@@ -0,0 +1,142 @@
+/******************************************************************************
+ * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include "histogram_common.cuh"
+#include <nvbench_helper.cuh>
+
+// %RANGE% TUNE_ITEMS ipt 7:24:1
+// %RANGE% TUNE_THREADS tpb 128:1024:32
+// %RANGE% TUNE_RLE_COMPRESS rle 0:1:1
+// %RANGE% TUNE_WORK_STEALING ws 0:1:1
+// %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1
+// %RANGE% TUNE_LOAD ld 0:2:1
+
+template <typename SampleT, typename CounterT, typename OffsetT>
+static void histogram(nvbench::state &state, nvbench::type_list<SampleT, CounterT, OffsetT>)
+{
+  constexpr int num_channels        = 1;
+  constexpr int num_active_channels = 1;
+
+  using sample_iterator_t = SampleT *;
+
+#if !TUNE_BASE 
+  using policy_t   = policy_hub_t<key_t, num_active_channels>;
+  using dispatch_t = cub::DispatchHistogram<num_channels, //
+                                            num_active_channels,
+                                            sample_iterator_t,
+                                            CounterT,
+                                            SampleT,
+                                            OffsetT,
+                                            policy_t>;
+#else // TUNE_BASE
+  using dispatch_t = cub::DispatchHistogram<num_channels, //
+                                            num_active_channels,
+                                            sample_iterator_t,
+                                            CounterT,
+                                            SampleT,
+                                            OffsetT>;
+#endif // TUNE_BASE
+
+  const auto entropy   = str_to_entropy(state.get_string("Entropy"));
+  const auto elements  = state.get_int64("Elements{io}");
+  const auto num_bins  = state.get_int64("Bins");
+  const int num_levels = static_cast<int>(num_bins) + 1;
+
+  const SampleT lower_level = 0;
+  const SampleT upper_level = get_upper_level<SampleT>(num_bins, elements);
+
+  thrust::device_vector<SampleT> input(elements);
+  thrust::device_vector<CounterT> hist(num_bins);
+  gen(seed_t{}, input, entropy, lower_level, upper_level);
+
+  SampleT *d_input      = thrust::raw_pointer_cast(input.data());
+  CounterT *d_histogram = thrust::raw_pointer_cast(hist.data());
+
+  CounterT *d_histogram1[1] = {d_histogram};
+  int num_levels1[1]        = {num_levels};
+  SampleT lower_level1[1]   = {lower_level};
+  SampleT upper_level1[1]   = {upper_level};
+
+  std::uint8_t *d_temp_storage = nullptr;
+  std::size_t temp_storage_bytes{};
+
+  cub::Int2Type<sizeof(SampleT) == 1> is_byte_sample;
+  OffsetT num_row_pixels     = static_cast<OffsetT>(elements);
+  OffsetT num_rows           = 1;
+  OffsetT row_stride_samples = num_row_pixels;
+
+  state.add_element_count(elements);
+  state.add_global_memory_reads<SampleT>(elements);
+  state.add_global_memory_writes<CounterT>(num_bins);
+
+  dispatch_t::DispatchEven(d_temp_storage,
+                           temp_storage_bytes,
+                           d_input,
+                           d_histogram1,
+                           num_levels1,
+                           lower_level1,
+                           upper_level1,
+                           num_row_pixels,
+                           num_rows,
+                           row_stride_samples,
+                           0,
+                           is_byte_sample);
+
+  thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
+  d_temp_storage = thrust::raw_pointer_cast(tmp.data());
+
+  state.exec([&](nvbench::launch &launch) {
+    dispatch_t::DispatchEven(d_temp_storage,
+                             temp_storage_bytes,
+                             d_input,
+                             d_histogram1,
+                             num_levels1,
+                             lower_level1,
+                             upper_level1,
+                             num_row_pixels,
+                             num_rows,
+                             row_stride_samples,
+                             launch.get_stream(),
+                             is_byte_sample);
+  });
+}
+
+using bin_types         = nvbench::type_list<int32_t>;
+using some_offset_types = nvbench::type_list<int32_t>;
+
+#ifdef TUNE_SampleT
+using sample_types = nvbench::type_list<TUNE_SampleT>;
+#else // !defined(TUNE_SampleT)
+using sample_types = nvbench::type_list<int8_t, int16_t, int32_t, int64_t, float, double>;
+#endif // TUNE_SampleT
+
+NVBENCH_BENCH_TYPES(histogram, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset_types))
+  .set_name("cub::cub::DeviceHistogram::HistogramEven")
+  .set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
+  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
+  .add_int64_axis("Bins", {128, 2048, 2097152})
+  .add_string_axis("Entropy", {"1.000", "0.544", "0.000"});