diff --git a/CHANGE_LOG.TXT b/CHANGE_LOG.TXT
index 82573be757..5ca79b4598 100644
--- a/CHANGE_LOG.TXT
+++ b/CHANGE_LOG.TXT
@@ -1,14 +1,15 @@
//-----------------------------------------------------------------------------
-0.9.3 04/30/2013
+0.9.4 05/07/2013
+ - Fixed compilation errors for SM10-SM13
+ - Fixed compilation errors for some WarpScan entrypoints on SM30+
+ - Added block-wide histogram (BlockHisto256)
+ - Added device-wide histogram (DeviceHisto256)
- Added new BlockScan algorithm variant BLOCK_SCAN_RAKING_MEMOIZE, which
trades more register consumption for less shared memory I/O)
- - Added block-wide histogram (BlockHisto256)
- Updates to BlockRadixRank to use BlockScan (which improves performance
on Kepler due to SHFL instruction)
- - Added device-wide histogram (DeviceHisto256)
- - Fixed compilation errors for some WarpScan entrypoints on SM30+
- Allow types other than C++ primitives to be used in WarpScan::*Sum methods
if they only have operator + overloaded. (Previously they also required
to support assignment from int(0).)
diff --git a/cub/block/block_histo_256.cuh b/cub/block/block_histo_256.cuh
index 46d18d338f..9c33921c36 100644
--- a/cub/block/block_histo_256.cuh
+++ b/cub/block/block_histo_256.cuh
@@ -45,9 +45,12 @@ CUB_NS_PREFIX
namespace cub {
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
/**
- * BlockHisto256Algorithm enumerates alternative algorithms for the parallel
- * construction of 8b histograms.
+ * \brief BlockHisto256Algorithm enumerates alternative algorithms for the parallel construction of 8b histograms.
*/
enum BlockHisto256Algorithm
{
@@ -57,21 +60,33 @@ enum BlockHisto256Algorithm
* Sorting followed by differentiation. Execution is comprised of two phases:
* -# Sort the 8b data using efficient radix sort
* -# Look for "runs" of same-valued 8b keys by detecting discontinuities; the run-lengths are histogram bin counts.
+ *
+ * \par Performance Considerations
+ * Delivers consistent throughput regardless of sample bin distribution.
*/
- BLOCK_BYTE_HISTO_SORT,
+ BLOCK_HISTO_256_SORT,
/**
* \par Overview
* Use atomic addition to update byte counts directly
*
- * \par Usage Considerations
- * BLOCK_BYTE_HISTO_ATOMIC can only be used on version SM120 or later. Otherwise BLOCK_BYTE_HISTO_SORT is used regardless.
+ * \par Performance Considerations
+ * Performance is strongly tied to the hardware implementation of atomic
+ * addition, and may be significantly degraded for non-uniformly-random
+ * input distributions where many concurrent updates are likely to be
+ * made to the same bin counter.
*/
- BLOCK_BYTE_HISTO_ATOMIC,
+ BLOCK_HISTO_256_ATOMIC,
};
+
+/******************************************************************************
+ * Block histogram
+ ******************************************************************************/
+
+
/**
* \addtogroup BlockModule
* @{
@@ -90,12 +105,12 @@ enum BlockHisto256Algorithm
*
* \tparam BLOCK_THREADS The threadblock size in threads
* \tparam ITEMS_PER_THREAD The number of items per thread
- * \tparam ALGORITHM [optional] cub::BlockHisto256Algorithm enumerator specifying the underlying algorithm to use (default = cub::BLOCK_BYTE_HISTO_SORT)
+ * \tparam ALGORITHM [optional] cub::BlockHisto256Algorithm enumerator specifying the underlying algorithm to use (default = cub::BLOCK_HISTO_256_SORT)
*
* \par Algorithm
* BlockHisto256 can be (optionally) configured to use different algorithms:
- * -# cub::BLOCK_BYTE_HISTO_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHisto256Algorithm)
- * -# cub::BLOCK_BYTE_HISTO_ATOMIC. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHisto256Algorithm)
+ * -# cub::BLOCK_HISTO_256_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHisto256Algorithm)
+ * -# cub::BLOCK_HISTO_256_ATOMIC. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHisto256Algorithm)
*
* \par Usage Considerations
* - The histogram output can be constructed in shared or global memory
@@ -167,7 +182,7 @@ enum BlockHisto256Algorithm
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
- BlockHisto256Algorithm ALGORITHM = BLOCK_BYTE_HISTO_SORT>
+ BlockHisto256Algorithm ALGORITHM = BLOCK_HISTO_256_SORT>
class BlockHisto256
{
private:
@@ -178,13 +193,13 @@ private:
/**
* Ensure the template parameterization meets the requirements of the
- * targeted device architecture. BLOCK_BYTE_HISTO_ATOMIC can only be used
- * on version SM120 or later. Otherwise BLOCK_BYTE_HISTO_SORT is used
+ * targeted device architecture. BLOCK_HISTO_256_ATOMIC can only be used
+ * on version SM120 or later. Otherwise BLOCK_HISTO_256_SORT is used
* regardless.
*/
static const BlockHisto256Algorithm SAFE_ALGORITHM =
- ((ALGORITHM == BLOCK_BYTE_HISTO_ATOMIC) && (CUB_PTX_ARCH < 120)) ?
- BLOCK_BYTE_HISTO_SORT :
+ ((ALGORITHM == BLOCK_HISTO_256_ATOMIC) && (CUB_PTX_ARCH < 120)) ?
+ BLOCK_HISTO_256_SORT :
ALGORITHM;
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
@@ -195,7 +210,7 @@ private:
******************************************************************************/
/**
- * BLOCK_BYTE_HISTO_SORT algorithmic variant
+ * BLOCK_HISTO_256_SORT algorithmic variant
*/
template
struct BlockHisto256Internal
@@ -319,10 +334,10 @@ private:
/**
- * BLOCK_BYTE_HISTO_ATOMIC algorithmic variant
+ * BLOCK_HISTO_256_ATOMIC algorithmic variant
*/
template
- struct BlockHisto256Internal
+ struct BlockHisto256Internal
{
/// Shared memory storage layout type
struct SmemStorage {};
diff --git a/cub/block/block_load.cuh b/cub/block/block_load.cuh
index d07424f393..1d37601075 100644
--- a/cub/block/block_load.cuh
+++ b/cub/block/block_load.cuh
@@ -77,8 +77,8 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
- InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
- T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+ InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
// Load directly in thread-blocked order
#pragma unroll
@@ -106,8 +106,8 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
- InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
- T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+ InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
BlockLoadDirect(block_itr, items);
}
@@ -132,9 +132,9 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
- InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
- const int &guarded_items, ///< [in] Number of valid items in the tile
- T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+ InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
+ const int &guarded_items, ///< [in] Number of valid items in the tile
+ T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
int bounds = guarded_items - (threadIdx.x * ITEMS_PER_THREAD);
@@ -165,9 +165,9 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
- InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
- const int &guarded_items, ///< [in] Number of valid items in the tile
- T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+ InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
+ const int &guarded_items, ///< [in] Number of valid items in the tile
+ T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
BlockLoadDirect(block_itr, guarded_items, items);
}
@@ -191,10 +191,10 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
- InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
- const int &guarded_items, ///< [in] Number of valid items in the tile
- T oob_default, ///< [in] Default value to assign out-of-bound items
- T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+ InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
+ const int &guarded_items, ///< [in] Number of valid items in the tile
+ T oob_default, ///< [in] Default value to assign out-of-bound items
+ T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
int bounds = guarded_items - (threadIdx.x * ITEMS_PER_THREAD);
@@ -224,10 +224,10 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
- InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
- const int &guarded_items, ///< [in] Number of valid items in the tile
- T oob_default, ///< [in] Default value to assign out-of-bound items
- T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+ InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
+ const int &guarded_items, ///< [in] Number of valid items in the tile
+ T oob_default, ///< [in] Default value to assign out-of-bound items
+ T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
BlockLoadDirect(block_itr, guarded_items, oob_default, items);
}
@@ -348,10 +348,10 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirectStriped(
- InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
- const int &guarded_items, ///< [in] Number of valid items in the tile
- T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
- int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
+ InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
+ const int &guarded_items, ///< [in] Number of valid items in the tile
+ T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
{
BlockLoadDirectStriped(block_itr, guarded_items, items, stride);
}
@@ -409,11 +409,11 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirectStriped(
- InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
- const int &guarded_items, ///< [in] Number of valid items in the tile
- T oob_default, ///< [in] Default value to assign out-of-bound items
- T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
- int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
+ InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
+ const int &guarded_items, ///< [in] Number of valid items in the tile
+ T oob_default, ///< [in] Default value to assign out-of-bound items
+ T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
{
BlockLoadDirectStriped(block_itr, guarded_items, oob_default, items, stride);
}
@@ -446,8 +446,8 @@ template <
typename T,
int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockLoadVectorized(
- T *block_ptr, ///< [in] Input pointer for loading from
- T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+ T *block_ptr, ///< [in] Input pointer for loading from
+ T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
enum
{
@@ -503,8 +503,8 @@ template <
typename T,
int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockLoadVectorized(
- T *block_ptr, ///< [in] Input pointer for loading from
- T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+ T *block_ptr, ///< [in] Input pointer for loading from
+ T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
BlockLoadVectorized(block_ptr, items);
}
diff --git a/cub/block/block_reduce.cuh b/cub/block/block_reduce.cuh
index 0749d858a3..f47f33ac55 100644
--- a/cub/block/block_reduce.cuh
+++ b/cub/block/block_reduce.cuh
@@ -50,6 +50,11 @@ CUB_NS_PREFIX
namespace cub {
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
/**
* BlockReduceAlgorithm enumerates alternative algorithms for parallel
* reduction across a CUDA threadblock.
@@ -59,9 +64,13 @@ enum BlockReduceAlgorithm
/**
* \par Overview
- * An efficient "raking" reduction algorithm. Execution is comprised of three phases:
- * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory.
- * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions.
+ * An efficient "raking" reduction algorithm. Execution is comprised of
+ * three phases:
+ * -# Upsweep sequential reduction in registers (if threads contribute more
+ * than one input each). Each thread then places the partial reduction
+ * of its item(s) into shared memory.
+ * -# Upsweep sequential reduction in shared memory. Threads within a
+ * single warp rake across segments of shared partial reductions.
* -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
*
* \par
@@ -78,10 +87,15 @@ enum BlockReduceAlgorithm
/**
* \par Overview
- * A quick "tiled warp-reductions" reduction algorithm. Execution is comprised of four phases:
- * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory.
- * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style reduction within each warp.
- * -# A propagation phase where the warp reduction outputs in each warp are updated with the aggregate from each preceding warp.
+ * A quick "tiled warp-reductions" reduction algorithm. Execution is
+ * comprised of four phases:
+ * -# Upsweep sequential reduction in registers (if threads contribute more
+ * than one input each). Each thread then places the partial reduction
+ * of its item(s) into shared memory.
+ * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
+ * reduction within each warp.
+ * -# A propagation phase where the warp reduction outputs in each warp are
+ * updated with the aggregate from each preceding warp.
*
* \par
* \image html block_scan_warpscans.png
@@ -89,13 +103,18 @@ enum BlockReduceAlgorithm
*
* \par Performance Considerations
* - Although this variant may suffer lower overall throughput across the
- * GPU because due to a heavy reliance on inefficient warp-reductions, it can
- * often provide lower turnaround latencies when the GPU is under-occupied.
+ * GPU due to a heavy reliance on inefficient warp-reductions, it
+ * can often provide lower turnaround latencies when the GPU is
+ * under-occupied.
*/
BLOCK_REDUCE_WARP_REDUCTIONS,
};
+/******************************************************************************
+ * Block reduce
+ ******************************************************************************/
+
/**
* \addtogroup BlockModule
* @{
diff --git a/cub/block/block_scan.cuh b/cub/block/block_scan.cuh
index bf9ef662dc..e8017594ef 100644
--- a/cub/block/block_scan.cuh
+++ b/cub/block/block_scan.cuh
@@ -49,6 +49,10 @@ CUB_NS_PREFIX
namespace cub {
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
/**
* BlockScanAlgorithm enumerates alternative algorithms for parallel prefix
* scan across a CUDA threadblock.
@@ -108,6 +112,10 @@ enum BlockScanAlgorithm
};
+/******************************************************************************
+ * Block scan
+ ******************************************************************************/
+
/**
* \addtogroup BlockModule
* @{
diff --git a/cub/block/block_store.cuh b/cub/block/block_store.cuh
index da625a5218..90f3dcae7e 100644
--- a/cub/block/block_store.cuh
+++ b/cub/block/block_store.cuh
@@ -77,8 +77,8 @@ template <
int ITEMS_PER_THREAD,
typename OutputIteratorRA>
__device__ __forceinline__ void BlockStoreDirect(
- OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
// Store directly in thread-blocked order
#pragma unroll
@@ -106,8 +106,8 @@ template <
int ITEMS_PER_THREAD,
typename OutputIteratorRA>
__device__ __forceinline__ void BlockStoreDirect(
- OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
BlockStoreDirect(block_itr, items);
}
@@ -124,27 +124,26 @@ __device__ __forceinline__ void BlockStoreDirect(
* \tparam T [inferred] The data type to store.
* \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
* \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type).
- * \tparam SizeT [inferred] Integer type for offsets
*/
template <
PtxStoreModifier MODIFIER,
typename T,
int ITEMS_PER_THREAD,
- typename OutputIteratorRA,
- typename SizeT>
+ typename OutputIteratorRA>
__device__ __forceinline__ void BlockStoreDirect(
- OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- const SizeT &guarded_items, ///< [in] Number of valid items in the tile
- T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
+ const int &guarded_items, ///< [in] Number of valid items in the tile
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
+ int bounds = guarded_items - (threadIdx.x * ITEMS_PER_THREAD);
+
// Store directly in thread-blocked order
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
- int item_offset = (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
- if (item_offset < guarded_items)
+ if (ITEM < bounds)
{
- ThreadStore(block_itr + item_offset, items[ITEM]);
+ ThreadStore(block_itr + (threadIdx.x * ITEMS_PER_THREAD) + ITEM, items[ITEM]);
}
}
}
@@ -160,17 +159,15 @@ __device__ __forceinline__ void BlockStoreDirect(
* \tparam T [inferred] The data type to store.
* \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
* \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type).
- * \tparam SizeT [inferred] Integer type for offsets
*/
template <
typename T,
int ITEMS_PER_THREAD,
- typename OutputIteratorRA,
- typename SizeT>
+ typename OutputIteratorRA>
__device__ __forceinline__ void BlockStoreDirect(
- OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- const SizeT &guarded_items, ///< [in] Number of valid items in the tile
- T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
+ const int &guarded_items, ///< [in] Number of valid items in the tile
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
BlockStoreDirect(block_itr, guarded_items, items);
}
@@ -202,9 +199,9 @@ template <
int ITEMS_PER_THREAD,
typename OutputIteratorRA>
__device__ __forceinline__ void BlockStoreDirectStriped(
- OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
- int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
+ OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+ int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
{
// Store directly in striped order
#pragma unroll
@@ -233,9 +230,9 @@ template <
int ITEMS_PER_THREAD,
typename OutputIteratorRA>
__device__ __forceinline__ void BlockStoreDirectStriped(
- OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
- int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
+ OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+ int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
{
BlockStoreDirectStriped(block_itr, items, stride);
}
@@ -252,28 +249,27 @@ __device__ __forceinline__ void BlockStoreDirectStriped(
* \tparam T [inferred] The data type to store.
* \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
* \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type).
- * \tparam SizeT [inferred] Integer type for offsets
*/
template <
PtxStoreModifier MODIFIER,
typename T,
int ITEMS_PER_THREAD,
- typename OutputIteratorRA,
- typename SizeT>
+ typename OutputIteratorRA>
__device__ __forceinline__ void BlockStoreDirectStriped(
OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- const SizeT &guarded_items, ///< [in] Number of valid items in the tile
- T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
- int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
+ const int &guarded_items, ///< [in] Number of valid items in the tile
+ T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+ int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
{
+ int bounds = guarded_items - threadIdx.x;
+
// Store directly in striped order
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
- int item_offset = (ITEM * stride) + threadIdx.x;
- if (item_offset < guarded_items)
+ if (ITEM * stride < bounds)
{
- ThreadStore(block_itr + item_offset, items[ITEM]);
+ ThreadStore(block_itr + (ITEM * stride) + threadIdx.x, items[ITEM]);
}
}
}
@@ -289,18 +285,16 @@ __device__ __forceinline__ void BlockStoreDirectStriped(
* \tparam T [inferred] The data type to store.
* \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
* \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type).
- * \tparam SizeT [inferred] Integer type for offsets
*/
template <
typename T,
int ITEMS_PER_THREAD,
- typename OutputIteratorRA,
- typename SizeT>
+ typename OutputIteratorRA>
__device__ __forceinline__ void BlockStoreDirectStriped(
OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- const SizeT &guarded_items, ///< [in] Number of valid items in the tile
- T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
- int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
+ const int &guarded_items, ///< [in] Number of valid items in the tile
+ T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+ int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
{
BlockStoreDirectStriped(block_itr, guarded_items, items, stride);
}
@@ -337,8 +331,8 @@ template <
typename T,
int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockStoreVectorized(
- T *block_ptr, ///< [in] Input pointer for storing from
- T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ T *block_ptr, ///< [in] Output pointer for storing to
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
enum
{
@@ -398,8 +392,8 @@ template <
typename T,
int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockStoreVectorized(
- T *block_ptr, ///< [in] Input pointer for storing from
- T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ T *block_ptr, ///< [in] Output pointer for storing to
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
BlockStoreVectorized(block_ptr, items);
}
@@ -606,20 +600,19 @@ private:
/// Store a tile of items across a threadblock
static __device__ __forceinline__ void Store(
- SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
- OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
+ OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
BlockStoreDirect(block_itr, items);
}
/// Store a tile of items across a threadblock, guarded by range
- template
static __device__ __forceinline__ void Store(
- SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
- OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- const SizeT &guarded_items, ///< [in] Number of valid items in the tile
- T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
+ OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
+ const int &guarded_items, ///< [in] Number of valid items in the tile
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
BlockStoreDirect(block_itr, guarded_items, items);
}
@@ -636,11 +629,10 @@ private:
typedef NullType SmemStorage;
/// Store a tile of items across a threadblock, specialized for native pointer types (attempts vectorization)
- template
static __device__ __forceinline__ void Store(
- SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
- T *block_ptr, ///< [in] The threadblock's base output iterator for storing to
- T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
+ T *block_ptr, ///< [in] The threadblock's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
BlockStoreVectorized(block_ptr, items);
}
@@ -648,20 +640,19 @@ private:
/// Store a tile of items across a threadblock, specialized for opaque input iterators (skips vectorization)
template
static __device__ __forceinline__ void Store(
- SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
- _OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
+ _OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
BlockStoreDirect(block_itr, items);
}
/// Store a tile of items across a threadblock, guarded by range
- template
static __device__ __forceinline__ void Store(
- SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
- OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- const SizeT &guarded_items, ///< [in] Number of valid items in the tile
- T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
+ OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
+ const int &guarded_items, ///< [in] Number of valid items in the tile
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
BlockStoreDirect(block_itr, guarded_items, items);
}
@@ -682,9 +673,9 @@ private:
/// Store a tile of items across a threadblock
static __device__ __forceinline__ void Store(
- SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
- OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
+ OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
// Transpose to striped order
BlockExchange::BlockedToStriped(smem_storage, items);
@@ -693,12 +684,11 @@ private:
}
/// Store a tile of items across a threadblock, guarded by range
- template
static __device__ __forceinline__ void Store(
- SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
- OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- const SizeT &guarded_items, ///< [in] Number of valid items in the tile
- T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
+ OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
+ const int &guarded_items, ///< [in] Number of valid items in the tile
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
// Transpose to striped order
BlockExchange::BlockedToStriped(smem_storage, items);
@@ -719,20 +709,19 @@ private:
/// Store a tile of items across a threadblock
static __device__ __forceinline__ void Store(
- SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
- OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
+ OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
BlockStoreDirectStriped(block_itr, items);
}
/// Store a tile of items across a threadblock, guarded by range
- template
static __device__ __forceinline__ void Store(
- SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
- OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- const SizeT &guarded_items, ///< [in] Number of valid items in the tile
- T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
+ OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
+ const int &guarded_items, ///< [in] Number of valid items in the tile
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
BlockStoreDirectStriped(block_itr, guarded_items, items);
}
@@ -756,24 +745,21 @@ public:
* \brief Store a tile of items across a threadblock.
*/
static __device__ __forceinline__ void Store(
- SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
- OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
+ OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
StoreInternal::Store(smem_storage, block_itr, items);
}
/**
* \brief Store a tile of items across a threadblock, guarded by range.
- *
- * \tparam SizeT [inferred] Integer type for offsets
*/
- template
static __device__ __forceinline__ void Store(
- SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
- OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
- const SizeT &guarded_items, ///< [in] Number of valid items in the tile
- T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+ SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage
+ OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to
+ const int &guarded_items, ///< [in] Number of valid items in the tile
+ T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
{
StoreInternal::Store(smem_storage, block_itr, guarded_items, items);
}
diff --git a/cub/device/device_histo_256.cuh b/cub/device/device_histo_256.cuh
index 48a52ef2f6..d14d30f4dd 100644
--- a/cub/device/device_histo_256.cuh
+++ b/cub/device/device_histo_256.cuh
@@ -37,7 +37,7 @@
#include
#include
-#include "tiles/tiles_histo_256.cuh"
+#include "persistent_block/persistent_block_histo_256.cuh"
#include "../block/block_load.cuh"
#include "../thread/thread_reduce.cuh"
#include "../util_allocator.cuh"
@@ -60,84 +60,74 @@ namespace cub {
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+/**
+ * Initialization kernel for queue descriptor preparation and for zeroing global counters
+ */
+template <
+ int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+ typename SizeT, ///< Integral type used for global array indexing
+ typename HistoCounter> ///< Integral type for counting sample occurrences per histogram bin
+__launch_bounds__ (256, 1)
+__global__ void InitHisto256Kernel(
+ GridQueue grid_queue, ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks
+ ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][256]
+ SizeT num_samples) ///< [in] Total number of samples \p d_samples for all channels
+{
+ d_out_histograms.array[blockIdx.x][threadIdx.x] = 0;
+ if (threadIdx.x == 0) grid_queue.ResetDrain(num_samples);
+}
+
+
/**
* Multi-block histogram kernel entry point. Computes privatized histograms, one per thread block.
*/
template <
- typename TilesHisto256Policy, ///< Tuning policy for cub::TilesHisto256 abstraction
- int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
- int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
- typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char
- typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin
- typename SizeT> ///< Integral type used for global array indexing
-__launch_bounds__ (TilesHisto256Policy::BLOCK_THREADS)
+ typename PersistentBlockHisto256Policy, ///< Tuning policy for cub::PersistentBlockHisto256 abstraction
+ int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+ int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+ typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char
+ typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin
+ typename SizeT> ///< Integral type used for global array indexing
+__launch_bounds__ (int(PersistentBlockHisto256Policy::BLOCK_THREADS), PersistentBlockHisto256Policy::SM_OCCUPANCY)
__global__ void MultiBlockHisto256Kernel(
- InputIteratorRA d_samples, ///< [in] Array of sample data. (Channels, if any, are interleaved in "AOS" format)
- ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][gridDim.x][256]
- SizeT num_samples, ///< [in] Total number of samples \p d_samples for all channels
- GridEvenShare even_share, ///< [in] Descriptor for how to map an even-share of tiles across thread blocks
- GridQueue queue) ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks
+ InputIteratorRA d_samples, ///< [in] Array of sample data. (Channels, if any, are interleaved in "AOS" format)
+ ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][gridDim.x][256]
+ SizeT num_samples, ///< [in] Total number of samples \p d_samples for all channels
+ GridEvenShare even_share, ///< [in] Descriptor for how to map an even-share of tiles across thread blocks
+ GridQueue queue) ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks
{
// Constants
- enum {
- BLOCK_THREADS = TilesHisto256Policy::BLOCK_THREADS,
- ITEMS_PER_THREAD = TilesHisto256Policy::ITEMS_PER_THREAD,
+ enum
+ {
+ BLOCK_THREADS = PersistentBlockHisto256Policy::BLOCK_THREADS,
+ ITEMS_PER_THREAD = PersistentBlockHisto256Policy::ITEMS_PER_THREAD,
TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD,
};
- // Parameterize TilesHisto256 for the parallel execution context
- typedef TilesHisto256 TilesHisto256T;
-
- // Parameterize which mapping of tiles -> thread blocks we will use
- typedef typename TilesHisto256T::template Mapping Mapping;
-
- // Declare shared memory
- __shared__ typename TilesHisto256T::SmemStorage block_histo; // Shared memory for TilesHisto256
- __shared__ HistoCounter histograms[ACTIVE_CHANNELS][256]; // Shared memory histograms
-
- // Composite samples into histogram(s)
- Mapping::ProcessTiles(
- block_histo,
- d_samples,
- num_samples,
- even_share,
- queue,
- histograms);
+ // Thread block type for compositing input tiles
+ typedef PersistentBlockHisto256 PersistentBlockHisto256T;
- // Barrier to ensure histograms are coherent
- __syncthreads();
+ // Shared memory for PersistentBlockHisto256
+ __shared__ typename PersistentBlockHisto256T::SmemStorage smem_storage;
- // Output histogram for each active channel
-
- #pragma unroll
- for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
- {
- int channel_offset = (blockIdx.x * 256);
- int histo_offset = 0;
-
- #pragma unroll
- for(; histo_offset + BLOCK_THREADS <= 256; histo_offset += BLOCK_THREADS)
- {
- d_out_histograms.array[CHANNEL][channel_offset + histo_offset + threadIdx.x] = histograms[CHANNEL][histo_offset + threadIdx.x];
- }
- // Finish up with guarded initialization if necessary
- if ((histo_offset < BLOCK_THREADS) && (histo_offset + threadIdx.x < 256))
- {
- d_out_histograms.array[CHANNEL][channel_offset + histo_offset + threadIdx.x] = histograms[CHANNEL][histo_offset + threadIdx.x];
- }
- }
+ // Thread block instance
+ PersistentBlockHisto256T tiles(smem_storage, d_samples, d_out_histograms.array);
+ // Consume tiles using thread block instance
+ int dummy_result;
+ GridMapping::ConsumeTiles(
+ tiles, num_samples, even_share, queue, dummy_result);
}
/**
- * Single-block finalization kernel for aggregating privatized threadblock histograms from a previous kernel invocation.
+ * Aggregation kernel for aggregating privatized threadblock histograms from a previous kernel invocation.
*/
template <
- int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
- typename HistoCounter> ///< Integral type for counting sample occurrences per histogram bin
+ int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+ typename HistoCounter> ///< Integral type for counting sample occurrences per histogram bin
__launch_bounds__ (256, 1)
-__global__ void FinalizeHisto256Kernel(
+__global__ void AggregateHisto256Kernel(
HistoCounter* d_block_histograms_linear, ///< [in] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][num_threadblocks][256]
ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][256]
int num_threadblocks) ///< [in] Number of threadblock histograms per channel in \p d_block_histograms
@@ -148,11 +138,12 @@ __global__ void FinalizeHisto256Kernel(
int block_offset = blockIdx.x * (num_threadblocks * 256);
int block_oob = block_offset + (num_threadblocks * 256);
+#if CUB_PTX_ARCH >= 200
#pragma unroll 32
+#endif
while (block_offset < block_oob)
{
bin_aggregate += d_block_histograms_linear[block_offset + threadIdx.x];
-
block_offset += 256;
}
@@ -181,30 +172,30 @@ struct DeviceHisto256
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
- /// Generic structure for encapsulating dispatch properties. Mirrors the constants within TilesHisto256Policy.
+ /// Generic structure for encapsulating dispatch properties. Mirrors the constants within PersistentBlockHisto256Policy.
struct KernelDispachParams
{
// Policy fields
- int block_threads;
- int items_per_thread;
- BlockHisto256Algorithm block_algorithm;
- GridMappingStrategy grid_mapping;
- int subscription_factor;
+ int block_threads;
+ int items_per_thread;
+ PersistentBlockHisto256Algorithm block_algorithm;
+ GridMappingStrategy grid_mapping;
+ int subscription_factor;
// Derived fields
- int tile_size;
+ int tile_size;
- template
+ template
__host__ __device__ __forceinline__
void Init(int subscription_factor = 1)
{
- block_threads = TilesHisto256Policy::BLOCK_THREADS;
- items_per_thread = TilesHisto256Policy::ITEMS_PER_THREAD;
- block_algorithm = TilesHisto256Policy::BLOCK_ALGORITHM;
- grid_mapping = TilesHisto256Policy::GRID_MAPPING;
+ block_threads = PersistentBlockHisto256Policy::BLOCK_THREADS;
+ items_per_thread = PersistentBlockHisto256Policy::ITEMS_PER_THREAD;
+ block_algorithm = PersistentBlockHisto256Policy::GRID_ALGORITHM;
+ grid_mapping = PersistentBlockHisto256Policy::GRID_MAPPING;
this->subscription_factor = subscription_factor;
- tile_size = block_threads * items_per_thread;
+ tile_size = block_threads * items_per_thread;
}
__host__ __device__ __forceinline__
@@ -225,44 +216,60 @@ struct DeviceHisto256
template <
int CHANNELS,
int ACTIVE_CHANNELS,
- BlockHisto256Algorithm BLOCK_ALGORITHM,
+ PersistentBlockHisto256Algorithm GRID_ALGORITHM,
int ARCH>
struct TunedPolicies;
/// SM35 tune
- template
- struct TunedPolicies
+ template
+ struct TunedPolicies
{
- typedef TilesHisto256Policy<
- 128,
- (BLOCK_ALGORITHM == BLOCK_BYTE_HISTO_SORT) ? 23 : (30 / ACTIVE_CHANNELS),
- BLOCK_ALGORITHM,
- (BLOCK_ALGORITHM == BLOCK_BYTE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy;
+ typedef PersistentBlockHisto256Policy<
+ (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? 128 : 256,
+ (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? 12 : (30 / ACTIVE_CHANNELS),
+ GRID_ALGORITHM,
+ (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE,
+ (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? 8 : 1> MultiBlockPolicy;
enum { SUBSCRIPTION_FACTOR = 7 };
};
+ /// SM30 tune
+ template
+ struct TunedPolicies
+ {
+ typedef PersistentBlockHisto256Policy<
+ 128,
+ (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? 20 : (22 / ACTIVE_CHANNELS),
+ GRID_ALGORITHM,
+ (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE,
+ 1> MultiBlockPolicy;
+ enum { SUBSCRIPTION_FACTOR = 1 };
+ };
+
/// SM20 tune
- template
- struct TunedPolicies
+ template
+ struct TunedPolicies
{
- typedef TilesHisto256Policy<
- 128,
- (BLOCK_ALGORITHM == BLOCK_BYTE_HISTO_SORT) ? 17 : (21 / ACTIVE_CHANNELS),
- BLOCK_ALGORITHM,
- (BLOCK_ALGORITHM == BLOCK_BYTE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy;
- enum { SUBSCRIPTION_FACTOR = 3 };
+ typedef PersistentBlockHisto256Policy<
+ 128,
+ (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? 21 : (23 / ACTIVE_CHANNELS),
+ GRID_ALGORITHM,
+ GRID_MAPPING_DYNAMIC,
+ 1> MultiBlockPolicy;
+ enum { SUBSCRIPTION_FACTOR = 1 };
};
/// SM10 tune
- template
- struct TunedPolicies
+ template
+ struct TunedPolicies
{
- typedef TilesHisto256Policy<
+ typedef PersistentBlockHisto256Policy<
128,
7,
- BLOCK_ALGORITHM,
- (BLOCK_ALGORITHM == BLOCK_BYTE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy;
- enum { SUBSCRIPTION_FACTOR = 2 };
+ GRID_HISTO_256_SORT, // (use sort regardless because atomics are perf-useless)
+ GRID_MAPPING_EVEN_SHARE,
+ 1> MultiBlockPolicy;
+ enum { SUBSCRIPTION_FACTOR = 1 };
};
@@ -270,17 +277,19 @@ struct DeviceHisto256
template <
int CHANNELS,
int ACTIVE_CHANNELS,
- BlockHisto256Algorithm BLOCK_ALGORITHM>
+ PersistentBlockHisto256Algorithm GRID_ALGORITHM>
struct PtxDefaultPolicies
{
static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ?
350 :
- (CUB_PTX_ARCH >= 200) ?
- 200 :
- 100;
+ (CUB_PTX_ARCH >= 300) ?
+ 300 :
+ (CUB_PTX_ARCH >= 200) ?
+ 200 :
+ 100;
// Tuned policy set for the current PTX compiler pass
- typedef TunedPolicies PtxPassTunedPolicies;
+ typedef TunedPolicies PtxPassTunedPolicies;
// Subscription factor for the current PTX compiler pass
static const int SUBSCRIPTION_FACTOR = PtxPassTunedPolicies::SUBSCRIPTION_FACTOR;
@@ -295,17 +304,22 @@ struct DeviceHisto256
{
if (ptx_version >= 350)
{
- typedef TunedPolicies TunedPolicies;
+ typedef TunedPolicies TunedPolicies;
+ multi_block_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR);
+ }
+ else if (ptx_version >= 300)
+ {
+ typedef TunedPolicies TunedPolicies;
multi_block_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR);
}
else if (ptx_version >= 200)
{
- typedef TunedPolicies TunedPolicies;
+ typedef TunedPolicies TunedPolicies;
multi_block_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR);
}
else
{
- typedef TunedPolicies TunedPolicies;
+ typedef TunedPolicies TunedPolicies;
multi_block_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR);
}
}
@@ -321,17 +335,17 @@ struct DeviceHisto256
template <
int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+ typename InitHisto256KernelPtr, ///< Function type of cub::InitHisto256Kernel
typename MultiBlockHisto256KernelPtr, ///< Function type of cub::MultiBlockHisto256Kernel
- typename FinalizeHisto256KernelPtr, ///< Function type of cub::FinalizeHisto256Kernel
- typename ResetDrainKernelPtr, ///< Function type of cub::ResetDrainKernel
+ typename AggregateHisto256KernelPtr, ///< Function type of cub::AggregateHisto256Kernel
typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char
typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin
typename SizeT> ///< Integral type used for global array indexing
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
+ InitHisto256KernelPtr init_kernel_ptr, ///< [in] Kernel function pointer to parameterization of cub::InitHisto256Kernel
MultiBlockHisto256KernelPtr multi_block_kernel_ptr, ///< [in] Kernel function pointer to parameterization of cub::MultiBlockHisto256Kernel
- FinalizeHisto256KernelPtr finalize_kernel_ptr, ///< [in] Kernel function pointer to parameterization of cub::FinalizeHisto256Kernel
- ResetDrainKernelPtr prepare_drain_kernel_ptr, ///< [in] Kernel function pointer to parameterization of cub::ResetDrainKernel
+ AggregateHisto256KernelPtr aggregate_kernel_ptr, ///< [in] Kernel function pointer to parameterization of cub::AggregateHisto256Kernel
KernelDispachParams &multi_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p multi_block_kernel_ptr was compiled for
InputIteratorRA d_samples, ///< [in] Input samples to histogram
HistoCounter *(&d_histograms)[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having 256 counters of integral type \p HistoCounter.
@@ -340,7 +354,7 @@ struct DeviceHisto256
bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
DeviceAllocator *device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator.
{
- #if !CUB_CNP_ENABLED
+ #ifndef CUB_RUNTIME_ENABLED
// Kernel launch not supported from this device
return CubDebug(cudaErrorInvalidConfiguration);
@@ -354,11 +368,37 @@ struct DeviceHisto256
cudaError error = cudaSuccess;
do
{
- // Get GPU ordinal
+ // Setup array wrapper for histogram channel output because we can't pass static arrays as kernel parameters
+ ArrayWrapper d_histo_wrapper;
+ for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+ {
+ d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];
+ }
+
+ // Initialize counters and queue descriptor if necessary
+ if ((multi_block_dispatch_params.grid_mapping == GRID_MAPPING_DYNAMIC) ||
+ (multi_block_dispatch_params.block_algorithm == GRID_HISTO_256_GLOBAL_ATOMIC))
+ {
+ queue.Allocate(device_allocator);
+
+ if (stream_synchronous) CubLog("Invoking init_kernel_ptr<<<%d, 256, 0, %d>>>()\n", ACTIVE_CHANNELS, (int) stream);
+
+ init_kernel_ptr<<>>(queue, d_histo_wrapper, num_samples);
+
+ #ifndef __CUDA_ARCH__
+ // Sync the stream on the host
+ if (stream_synchronous && CubDebug(error = cudaStreamSynchronize(stream))) break;
+ #else
+ // Sync the entire device on the device (cudaStreamSynchronize doesn't exist on device)
+ if (stream_synchronous && CubDebug(error = cudaDeviceSynchronize())) break;
+ #endif
+ }
+
+ // Determine grid size for the multi-block kernel
+
int device_ordinal;
if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
- // Get SM count
int sm_count;
if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
@@ -380,7 +420,6 @@ struct DeviceHisto256
#endif
- // Determine grid size for the multi-block kernel
int multi_occupancy = multi_sm_occupancy * sm_count;
int multi_tile_size = multi_block_dispatch_params.block_threads * multi_block_dispatch_params.items_per_thread;
int multi_grid_size;
@@ -402,42 +441,18 @@ struct DeviceHisto256
case GRID_MAPPING_DYNAMIC:
// Prepare queue to distribute work dynamically
- queue.Allocate(device_allocator);
int num_tiles = (num_samples + multi_tile_size - 1) / multi_tile_size;
- #ifndef __CUDA_ARCH__
-
- // We're on the host, so prepare queue on device (because its faster than if we prepare it here)
- if (stream_synchronous) CubLog("Invoking prepare_drain_kernel_ptr<<<1, 1, 0, %d>>>()\n", (int) stream);
- prepare_drain_kernel_ptr<<<1, 1, 0, stream>>>(queue, num_samples);
-
- // Sync the stream on the host
- if (stream_synchronous && CubDebug(error = cudaStreamSynchronize(stream))) break;
-
- #else
-
- // Prepare the queue here
- queue.ResetDrain(num_samples);
-
- #endif
-
// Set MultiBlock grid size
multi_grid_size = (num_tiles < multi_occupancy) ?
num_tiles : // Not enough to fill the device with threadblocks
multi_occupancy; // Fill the device with threadblocks
- break;
+ break;
};
- // Setup array wrapper for histogram channel output because we can't pass static arrays as kernel parameters
- ArrayWrapper d_histo_wrapper;
- for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
- {
- d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];
- }
-
+ // Bind textures if the iterator supports it
#ifndef __CUDA_ARCH__
- // Host can bind texture if the iterator supports it
if (CubDebug(error = BindIteratorTexture(d_samples))) break;
#endif // __CUDA_ARCH__
@@ -445,7 +460,7 @@ struct DeviceHisto256
if (stream_synchronous) CubLog("Invoking multi_block_kernel_ptr<<<%d, %d, 0, %d>>>(), %d items per thread, %d SM occupancy\n",
multi_grid_size, multi_block_dispatch_params.block_threads, (int) stream, multi_block_dispatch_params.items_per_thread, multi_sm_occupancy);
- if (multi_grid_size == 1)
+ if ((multi_grid_size == 1) || (multi_block_dispatch_params.block_algorithm == GRID_HISTO_256_GLOBAL_ATOMIC))
{
// A single pass will do
multi_block_kernel_ptr<<>>(
@@ -487,10 +502,10 @@ struct DeviceHisto256
if (stream_synchronous && CubDebug(error = cudaDeviceSynchronize())) break;
#endif
- if (stream_synchronous) CubLog("Invoking finalize_kernel_ptr<<<%d, %d, 0, %d>>>()\n",
+ if (stream_synchronous) CubLog("Invoking aggregate_kernel_ptr<<<%d, %d, 0, %d>>>()\n",
ACTIVE_CHANNELS, 256, (int) stream);
- finalize_kernel_ptr<<>>(
+ aggregate_kernel_ptr<<>>(
d_block_histograms_linear,
d_histo_wrapper,
multi_grid_size);
@@ -507,10 +522,15 @@ struct DeviceHisto256
while (0);
// Free temporary storage allocation
- if (d_block_histograms_linear) error = CubDebug(DeviceFree(d_block_histograms_linear, device_allocator));
+ if (d_block_histograms_linear)
+ error = CubDebug(DeviceFree(d_block_histograms_linear, device_allocator));
// Free queue allocation
- if (multi_block_dispatch_params.grid_mapping == GRID_MAPPING_DYNAMIC) error = CubDebug(queue.Free(device_allocator));
+ if ((multi_block_dispatch_params.grid_mapping == GRID_MAPPING_DYNAMIC) ||
+ (multi_block_dispatch_params.block_algorithm == GRID_HISTO_256_GLOBAL_ATOMIC))
+ {
+ error = CubDebug(queue.Free(device_allocator));
+ }
// Unbind texture
#ifndef __CUDA_ARCH__
@@ -526,18 +546,18 @@ struct DeviceHisto256
/**
* \brief Computes a 256-bin device-wide histogram
*
- * \tparam BLOCK_ALGORITHM cub::BlockHisto256Algorithm enumerator specifying the underlying algorithm to use
+ * \tparam GRID_ALGORITHM cub::PersistentBlockHisto256Algorithm enumerator specifying the underlying algorithm to use
* \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
* \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed
* \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). Must have a value type that is assignable to unsigned char
* \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin
*/
template <
- BlockHisto256Algorithm BLOCK_ALGORITHM,
- int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
- int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
- typename InputIteratorRA,
- typename HistoCounter>
+ PersistentBlockHisto256Algorithm GRID_ALGORITHM,
+ int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+ int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+ typename InputIteratorRA,
+ typename HistoCounter>
__host__ __device__ __forceinline__
static cudaError_t Dispatch(
InputIteratorRA d_samples, ///< [in] Input samples to histogram
@@ -551,7 +571,7 @@ struct DeviceHisto256
typedef int SizeT;
// Tuning polices for the PTX architecture that will get dispatched to
- typedef PtxDefaultPolicies PtxDefaultPolicies;
+ typedef PtxDefaultPolicies PtxDefaultPolicies;
typedef typename PtxDefaultPolicies::MultiBlockPolicy MultiBlockPolicy;
cudaError error = cudaSuccess;
@@ -575,9 +595,9 @@ struct DeviceHisto256
#endif
Dispatch(
+ InitHisto256Kernel,
MultiBlockHisto256Kernel,
- FinalizeHisto256Kernel,
- ResetDrainKernel,
+ AggregateHisto256Kernel,
multi_block_dispatch_params,
d_samples,
d_histograms,
@@ -601,7 +621,9 @@ struct DeviceHisto256
//---------------------------------------------------------------------
/**
- * \brief Computes a 256-bin device-wide histogram
+ * \brief Computes a 256-bin device-wide histogram. Uses fast block-sorting to compute the histogram.
+ *
+ * Delivers consistent throughput regardless of sample diversity.
*
* \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). Must have a value type that is assignable to unsigned char
* \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin
@@ -618,12 +640,12 @@ struct DeviceHisto256
bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator.
{
- return Dispatch(
+ return Dispatch(
d_samples, &d_histogram, num_samples, stream, stream_synchronous, device_allocator);
}
/**
- * \brief Computes a 256-bin device-wide histogram. Uses atomic read-modify-write operations to compute the histogram.
+ * \brief Computes a 256-bin device-wide histogram. Uses shared-memory atomic read-modify-write operations to compute the histogram.
*
* Sample input having lower diversity cause performance to be degraded.
*
@@ -642,13 +664,40 @@ struct DeviceHisto256
bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator.
{
- return Dispatch(
+ return Dispatch(
d_samples, &d_histogram, num_samples, stream, stream_synchronous, device_allocator);
}
/**
- * \brief Computes a 256-bin device-wide histogram from multi-channel data.
+ * \brief Computes a 256-bin device-wide histogram. Uses global-memory atomic read-modify-write operations to compute the histogram.
+ *
+ * Sample input having lower diversity causes performance to be degraded.
+ *
+ * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). Must have a value type that is assignable to unsigned char
+ * \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin
+ */
+ template <
+ typename InputIteratorRA,
+ typename HistoCounter>
+ __host__ __device__ __forceinline__
+ static cudaError_t SingleChannelGlobalAtomic(
+ InputIteratorRA d_samples, ///< [in] Input samples
+ HistoCounter* d_histogram, ///< [out] Array of 256 counters of integral type \p HistoCounter.
+ int num_samples, ///< [in] Number of samples to process
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream-0.
+ bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
+ DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator.
+ {
+ return Dispatch(
+ d_samples, &d_histogram, num_samples, stream, stream_synchronous, device_allocator);
+ }
+
+
+ /**
+ * \brief Computes a 256-bin device-wide histogram from multi-channel data. Uses fast block-sorting to compute the histogram.
+ *
+ * Delivers consistent throughput regardless of sample diversity.
*
* \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
* \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed
@@ -669,12 +718,13 @@ struct DeviceHisto256
bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator.
{
- return Dispatch(
+ return Dispatch(
d_samples, d_histograms, num_samples, stream, stream_synchronous, device_allocator);
}
+
/**
- * \brief Computes a 256-bin device-wide histogram from multi-channel data. Uses atomic read-modify-write operations to compute the histogram.
+ * \brief Computes a 256-bin device-wide histogram from multi-channel data. Uses shared-memory atomic read-modify-write operations to compute the histogram.
*
* Sample input having lower diversity cause performance to be degraded.
*
@@ -697,7 +747,36 @@ struct DeviceHisto256
bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator.
{
- return Dispatch(
+ return Dispatch(
+ d_samples, d_histograms, num_samples, stream, stream_synchronous, device_allocator);
+ }
+
+
+ /**
+ * \brief Computes a 256-bin device-wide histogram from multi-channel data. Uses global-memory atomic read-modify-write operations to compute the histogram.
+ *
+ * Sample input having lower diversity causes performance to be degraded.
+ *
+ * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+ * \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed
+ * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). Must have a value type that is assignable to unsigned char
+ * \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin
+ */
+ template <
+ int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+ int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+ typename InputIteratorRA,
+ typename HistoCounter>
+ __host__ __device__ __forceinline__
+ static cudaError_t MultiChannelGlobalAtomic(
+ InputIteratorRA d_samples, ///< [in] Input samples. (Channels, if any, are interleaved in "AOS" format)
+ HistoCounter *(&d_histograms)[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having 256 counters of integral type \p HistoCounter.
+ int num_samples, ///< [in] Number of samples to process
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream-0.
+ bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
+ DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator.
+ {
+ return Dispatch(
d_samples, d_histograms, num_samples, stream, stream_synchronous, device_allocator);
}
diff --git a/cub/device/device_reduce.cuh b/cub/device/device_reduce.cuh
index 31df6bce53..6e62879e16 100644
--- a/cub/device/device_reduce.cuh
+++ b/cub/device/device_reduce.cuh
@@ -37,8 +37,9 @@
#include
#include
-#include "tiles/tiles_reduce.cuh"
+#include "persistent_block/persistent_block_reduce.cuh"
#include "../util_allocator.cuh"
+#include "../grid/grid_mapping.cuh"
#include "../grid/grid_even_share.cuh"
#include "../grid/grid_queue.cuh"
@@ -60,12 +61,12 @@ namespace cub {
* Multi-block reduction kernel entry point. Computes privatized reductions, one per thread block.
*/
template <
- typename TilesReducePolicy, ///< Tuning policy for cub::TilesReduce abstraction
+ typename PersistentBlockReducePolicy, ///< Tuning policy for cub::PersistentBlockReduce abstraction
typename InputIteratorRA, ///< The random-access iterator type for input (may be a simple pointer type).
typename OutputIteratorRA, ///< The random-access iterator type for output (may be a simple pointer type).
typename SizeT, ///< Integral type used for global array indexing
typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b)
-__launch_bounds__ (TilesReducePolicy::BLOCK_THREADS, 1)
+__launch_bounds__ (int(PersistentBlockReducePolicy::BLOCK_THREADS), 1)
__global__ void MultiBlockReduceKernel(
InputIteratorRA d_in, ///< [in] Input data to reduce
OutputIteratorRA d_out, ///< [out] Output location for result
@@ -74,26 +75,24 @@ __global__ void MultiBlockReduceKernel(
GridQueue queue, ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
- // Data type of input iterator
+ // Data type
typedef typename std::iterator_traits::value_type T;
- // Parameterize TilesReduce for the parallel execution context
- typedef TilesReduce TilesReduceT;
+ // Thread block type for reducing input tiles
+ typedef PersistentBlockReduce PersistentBlockReduceT;
- // Parameterize which mapping of tiles -> thread blocks we will use
- typedef typename TilesReduceT::template Mapping Mapping;
+ // Block-wide aggregate
+ T block_aggregate;
- // Declare shared memory for TilesReduce
- __shared__ typename TilesReduceT::SmemStorage smem_storage;
+ // Shared memory storage
+ __shared__ typename PersistentBlockReduceT::SmemStorage smem_storage;
- // Reduce tiles
- T block_aggregate = Mapping::ProcessTiles(
- smem_storage,
- d_in,
- num_items,
- even_share,
- queue,
- reduction_op);
+ // Thread block instance
+ PersistentBlockReduceT tiles(smem_storage, d_in, reduction_op);
+
+ // Consume tiles using thread block instance
+ GridMapping::ConsumeTilesFlagFirst(
+ tiles, num_items, even_share, queue, block_aggregate);
// Output result
if (threadIdx.x == 0)
@@ -107,41 +106,41 @@ __global__ void MultiBlockReduceKernel(
* Single-block reduction kernel entry point.
*/
template <
- typename TilesReducePolicy, ///< Tuning policy for cub::TilesReduce abstraction
+ typename PersistentBlockReducePolicy, ///< Tuning policy for cub::PersistentBlockReduce abstraction
typename InputIteratorRA, ///< The random-access iterator type for input (may be a simple pointer type).
typename OutputIteratorRA, ///< The random-access iterator type for output (may be a simple pointer type).
typename SizeT, ///< Integral type used for global array indexing
typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b)
-__launch_bounds__ (TilesReducePolicy::BLOCK_THREADS, 1)
+__launch_bounds__ (int(PersistentBlockReducePolicy::BLOCK_THREADS), 1)
__global__ void SingleBlockReduceKernel(
InputIteratorRA d_in, ///< [in] Input data to reduce
OutputIteratorRA d_out, ///< [out] Output location for result
SizeT num_items, ///< [in] Total number of input data items
ReductionOp reduction_op) ///< [in] Binary reduction operator
{
- // Data type of input iterator
+ // Data type
typedef typename std::iterator_traits::value_type T;
- // Parameterize TilesReduce for the parallel execution context
- typedef TilesReduce TilesReduceT;
+ // Thread block type for reducing input tiles
+ typedef PersistentBlockReduce PersistentBlockReduceT;
+
+ // Block-wide aggregate
+ T block_aggregate;
+
+ // Shared memory storage
+ __shared__ typename PersistentBlockReduceT::SmemStorage smem_storage;
- // Declare shared memory for TilesReduce
- __shared__ typename TilesReduceT::SmemStorage smem_storage;
+ // Block abstraction for reducing tiles
+ PersistentBlockReduceT tiles(smem_storage, d_in, reduction_op);
- // Reduce tiles
- T block_aggregate = TilesReduceT::ProcessTilesEvenShare(
- smem_storage,
- d_in,
- SizeT(0),
- num_items,
- reduction_op);
+ // Reduce input tiles
+ ConsumeTilesFlagFirst(tiles, 0, num_items, block_aggregate);
// Output result
if (threadIdx.x == 0)
{
d_out[blockIdx.x] = block_aggregate;
}
-
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
@@ -163,7 +162,7 @@ struct DeviceReduce
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
- /// Generic structure for encapsulating dispatch properties. Mirrors the constants within TilesReducePolicy.
+ /// Generic structure for encapsulating dispatch properties. Mirrors the constants within PersistentBlockReducePolicy.
struct KernelDispachParams
{
// Policy fields
@@ -178,16 +177,16 @@ struct DeviceReduce
// Derived fields
int tile_size;
- template
+ template
__host__ __device__ __forceinline__
void Init(int subscription_factor = 1)
{
- block_threads = TilesReducePolicy::BLOCK_THREADS;
- items_per_thread = TilesReducePolicy::ITEMS_PER_THREAD;
- vector_load_length = TilesReducePolicy::VECTOR_LOAD_LENGTH;
- block_algorithm = TilesReducePolicy::BLOCK_ALGORITHM;
- load_modifier = TilesReducePolicy::LOAD_MODIFIER;
- grid_mapping = TilesReducePolicy::GRID_MAPPING;
+ block_threads = PersistentBlockReducePolicy::BLOCK_THREADS;
+ items_per_thread = PersistentBlockReducePolicy::ITEMS_PER_THREAD;
+ vector_load_length = PersistentBlockReducePolicy::VECTOR_LOAD_LENGTH;
+ block_algorithm = PersistentBlockReducePolicy::BLOCK_ALGORITHM;
+ load_modifier = PersistentBlockReducePolicy::LOAD_MODIFIER;
+ grid_mapping = PersistentBlockReducePolicy::GRID_MAPPING;
this->subscription_factor = subscription_factor;
tile_size = block_threads * items_per_thread;
@@ -221,8 +220,8 @@ struct DeviceReduce
struct TunedPolicies
{
// K20C: 182.1 @ 48M 32-bit T
- typedef TilesReducePolicy<256, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy;
- typedef TilesReducePolicy<256, 16, 2, BLOCK_REDUCE_WARP_REDUCTIONS, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy;
+ typedef PersistentBlockReducePolicy<256, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy;
+ typedef PersistentBlockReducePolicy<256, 16, 2, BLOCK_REDUCE_WARP_REDUCTIONS, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy;
enum { SUBSCRIPTION_FACTOR = 4 };
};
@@ -231,8 +230,8 @@ struct DeviceReduce
struct TunedPolicies
{
// GTX670: 154.0 @ 48M 32-bit T
- typedef TilesReducePolicy<256, 2, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy;
- typedef TilesReducePolicy<256, 24, 4, BLOCK_REDUCE_WARP_REDUCTIONS, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy;
+ typedef PersistentBlockReducePolicy<256, 2, 1, BLOCK_REDUCE_WARP_REDUCTIONS, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy;
+ typedef PersistentBlockReducePolicy<256, 24, 4, BLOCK_REDUCE_WARP_REDUCTIONS, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy;
enum { SUBSCRIPTION_FACTOR = 1 };
};
@@ -241,8 +240,8 @@ struct DeviceReduce
struct TunedPolicies
{
// GTX 580: 178.9 @ 48M 32-bit T
- typedef TilesReducePolicy<128, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_DYNAMIC> MultiBlockPolicy;
- typedef TilesReducePolicy<128, 4, 1, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy;
+ typedef PersistentBlockReducePolicy<128, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_DYNAMIC> MultiBlockPolicy;
+ typedef PersistentBlockReducePolicy<128, 4, 1, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy;
enum { SUBSCRIPTION_FACTOR = 1 };
};
@@ -250,8 +249,8 @@ struct DeviceReduce
template
struct TunedPolicies
{
- typedef TilesReducePolicy<128, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy;
- typedef TilesReducePolicy<32, 4, 4, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy;
+ typedef PersistentBlockReducePolicy<128, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy;
+ typedef PersistentBlockReducePolicy<32, 4, 4, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy;
enum { SUBSCRIPTION_FACTOR = 1 };
};
@@ -259,8 +258,8 @@ struct DeviceReduce
template
struct TunedPolicies
{
- typedef TilesReducePolicy<128, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy;
- typedef TilesReducePolicy<32, 4, 4, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy;
+ typedef PersistentBlockReducePolicy<128, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy;
+ typedef PersistentBlockReducePolicy<32, 4, 4, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy;
enum { SUBSCRIPTION_FACTOR = 1 };
};
@@ -354,7 +353,7 @@ struct DeviceReduce
cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream-0.
bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
{
- #if !CUB_CNP_ENABLED
+ #ifndef CUB_RUNTIME_ENABLED
// Kernel launch not supported from this device
return CubDebug(cudaErrorInvalidConfiguration);
@@ -396,7 +395,7 @@ struct DeviceReduce
template <
typename MultiBlockReduceKernelPtr, ///< Function type of cub::MultiBlockReduceKernel
typename ReduceSingleKernelPtr, ///< Function type of cub::SingleBlockReduceKernel
- typename ResetDrainKernelPtr, ///< Function type of cub::ResetDrainKernel
+ typename ResetDrainKernelPtr, ///< Function type of cub::ResetDrainKernel
typename InputIteratorRA, ///< The random-access iterator type for input (may be a simple pointer type).
typename OutputIteratorRA, ///< The random-access iterator type for output (may be a simple pointer type).
typename SizeT, ///< Integral type used for global array indexing
@@ -405,7 +404,7 @@ struct DeviceReduce
static cudaError_t DispatchIterative(
MultiBlockReduceKernelPtr multi_block_kernel, ///< [in] Kernel function pointer to parameterization of cub::MultiBlockReduceKernel
ReduceSingleKernelPtr single_block_kernel, ///< [in] Kernel function pointer to parameterization of cub::SingleBlockReduceKernel
- ResetDrainKernelPtr prepare_drain_kernel, ///< [in] Kernel function pointer to parameterization of cub::ResetDrainKernel
+ ResetDrainKernelPtr prepare_drain_kernel, ///< [in] Kernel function pointer to parameterization of cub::ResetDrainKernel
KernelDispachParams &multi_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p multi_block_kernel_ptr was compiled for
KernelDispachParams &single_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p single_block_kernel was compiled for
InputIteratorRA d_in, ///< [in] Input data to reduce
@@ -416,7 +415,7 @@ struct DeviceReduce
bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
DeviceAllocator *device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator.
{
- #if !CUB_CNP_ENABLED
+ #ifndef CUB_RUNTIME_ENABLED
// Kernel launch not supported from this device
return CubDebug(cudaErrorInvalidConfiguration);
@@ -585,7 +584,7 @@ struct DeviceReduce
KernelDispachParams &single_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p single_block_kernel was compiled for
InputIteratorRA d_in, ///< [in] Input data to reduce
OutputIteratorRA d_out, ///< [out] Output location for result
- SizeT num_items, ///< [in] Number of items to reduce
+ SizeT num_items, ///< [in] Number of items to reduce
ReductionOp reduction_op, ///< [in] Binary reduction operator
cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream-0.
bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
diff --git a/cub/device/persistent_block/persistent_block_histo_256.cuh b/cub/device/persistent_block/persistent_block_histo_256.cuh
new file mode 100644
index 0000000000..8c4d93a1d1
--- /dev/null
+++ b/cub/device/persistent_block/persistent_block_histo_256.cuh
@@ -0,0 +1,810 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::PersistentBlockHisto256 implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide 256-bin histogram.
+ */
+
+#pragma once
+
+#include
+
+#include "../../util_arch.cuh"
+#include "../../block/block_load.cuh"
+#include "../../block/block_histo_256.cuh"
+#include "../../block/block_radix_sort.cuh"
+#include "../../block/block_discontinuity.cuh"
+#include "../../grid/grid_mapping.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_vector.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+
+/**
+ * \brief PersistentBlockHisto256Algorithm enumerates alternative algorithms for the parallel construction of 8b histograms.
+ */
+enum PersistentBlockHisto256Algorithm
+{
+
+ /**
+ * \par Overview
+ * A two-kernel approach in which:
+ * -# Thread blocks in the first kernel aggregate their own privatized
+ * histograms using block-wide sorting (see BlockHisto256Algorithm::BLOCK_HISTO_256_SORT).
+ * -# A single thread block in the second kernel reduces them into the output histogram(s).
+ *
+ * \par Performance Considerations
+ * Delivers consistent throughput regardless of sample bin distribution.
+ */
+ GRID_HISTO_256_SORT,
+
+
+ /**
+ * \par Overview
+ * A two-kernel approach in which:
+ * -# Thread blocks in the first kernel aggregate their own privatized
+ * histograms using shared-memory \p atomicAdd().
+ * -# A single thread block in the second kernel reduces them into the
+ * output histogram(s).
+ *
+ * \par Performance Considerations
+ * Performance is strongly tied to the hardware implementation of atomic
+ * addition, and may be significantly degraded for non uniformly-random
+ * input distributions where many concurrent updates are likely to be
+ * made to the same bin counter.
+ */
+ GRID_HISTO_256_SHARED_ATOMIC,
+
+
+ /**
+ * \par Overview
+ * A single-kernel approach in which thread blocks update the output histogram(s) directly
+ * using global-memory \p atomicAdd().
+ *
+ * \par Performance Considerations
+ * Performance is strongly tied to the hardware implementation of atomic
+ * addition, and may be significantly degraded for non uniformly-random
+ * input distributions where many concurrent updates are likely to be
+ * made to the same bin counter.
+ */
+ GRID_HISTO_256_GLOBAL_ATOMIC,
+
+};
+
+
+/******************************************************************************
+ * Tuning policy
+ ******************************************************************************/
+
+/**
+ * Tuning policy for PersistentBlockHisto256
+ */
+template <
+ int _BLOCK_THREADS,
+ int _ITEMS_PER_THREAD,
+ PersistentBlockHisto256Algorithm _GRID_ALGORITHM,
+ GridMappingStrategy _GRID_MAPPING,
+ int _SM_OCCUPANCY>
+struct PersistentBlockHisto256Policy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS,
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
+ SM_OCCUPANCY = _SM_OCCUPANCY,
+ };
+
+ static const PersistentBlockHisto256Algorithm GRID_ALGORITHM = _GRID_ALGORITHM;
+ static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING;
+};
+
+
+
+/******************************************************************************
+ * PersistentBlockHisto256
+ ******************************************************************************/
+
+/**
+ * \brief PersistentBlockHisto256 implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide 256-bin histogram.
+ */
+template <
+ typename PersistentBlockHisto256Policy, ///< Tuning policy
+ int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+ int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+ typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char
+ typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin
+ typename SizeT, ///< Integer type for offsets
+ PersistentBlockHisto256Algorithm GRID_ALGORITHM = PersistentBlockHisto256Policy::GRID_ALGORITHM>
+struct PersistentBlockHisto256;
+
+
+/**
+ * Specialized for GRID_HISTO_256_GLOBAL_ATOMIC
+ */
+template <
+ typename PersistentBlockHisto256Policy, ///< Tuning policy
+ int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+ int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+ typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char
+ typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin
+ typename SizeT> ///< Integer type for offsets
+struct PersistentBlockHisto256
+{
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ // Constants
+ enum
+ {
+ BLOCK_THREADS = PersistentBlockHisto256Policy::BLOCK_THREADS,
+ ITEMS_PER_THREAD = PersistentBlockHisto256Policy::ITEMS_PER_THREAD,
+ TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+ TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS,
+ };
+
+ // Shared memory type required by this thread block
+ struct SmemStorage {};
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ /// Reference to smem_storage
+ SmemStorage &smem_storage;
+
+ /// Reference to output histograms
+ HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
+
+ /// Input data to reduce
+ InputIteratorRA d_in;
+
+
+ //---------------------------------------------------------------------
+ // Interface
+ //---------------------------------------------------------------------
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ PersistentBlockHisto256(
+ SmemStorage &smem_storage, ///< Reference to smem_storage
+ InputIteratorRA d_in, ///< Input data to reduce
+ HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) : ///< Reference to output histograms
+ smem_storage(smem_storage),
+ d_in(d_in),
+ d_out_histograms(d_out_histograms)
+ {}
+
+
+ /**
+ * The number of items processed per "tile"
+ */
+ __device__ __forceinline__ int TileItems()
+ {
+ return TILE_ITEMS;
+ }
+
+
+ /**
+ * Process a single tile.
+ */
+ __device__ __forceinline__ void ConsumeTile(
+ bool &sync_after,
+ SizeT block_offset,
+ int num_valid)
+ {
+ if (num_valid < TILE_ITEMS)
+ {
+ // Only a partially-full tile of samples to read and composite
+ int bounds = num_valid - (threadIdx.x * CHANNELS);
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+ {
+ if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds))
+ {
+ unsigned char item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
+ atomicAdd(d_out_histograms[CHANNEL] + item, 1);
+ }
+ }
+ }
+
+ }
+ else
+ {
+ // Full tile of samples to read and composite
+ unsigned char items[ITEMS_PER_THREAD][CHANNELS];
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+ {
+ if (CHANNEL < ACTIVE_CHANNELS)
+ {
+ items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
+ }
+ }
+ }
+
+ __threadfence_block();
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+ {
+ if (CHANNEL < ACTIVE_CHANNELS)
+ {
+ atomicAdd(d_out_histograms[CHANNEL] + items[ITEM][CHANNEL], 1);
+ }
+ }
+ }
+ }
+
+ // No need to sync after processing this tile to ensure smem coherence
+ sync_after = false;
+ }
+
+
+ /**
+ * Finalize the computation.
+ */
+ __device__ __forceinline__ void Finalize(
+ int dummy_result)
+ {}
+};
+
+
+
+
+/**
+ * Specialized for GRID_HISTO_256_SHARED_ATOMIC
+ */
+template <
+ typename PersistentBlockHisto256Policy, ///< Tuning policy
+ int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+ int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+ typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char
+ typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin
+ typename SizeT> ///< Integer type for offsets
+struct PersistentBlockHisto256
+{
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ // Constants
+ enum
+ {
+ BLOCK_THREADS = PersistentBlockHisto256Policy::BLOCK_THREADS,
+ ITEMS_PER_THREAD = PersistentBlockHisto256Policy::ITEMS_PER_THREAD,
+ TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+ TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS,
+ };
+
+ // Shared memory type required by this thread block
+ struct SmemStorage
+ {
+ HistoCounter histograms[ACTIVE_CHANNELS][256];
+ };
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ /// Reference to smem_storage
+ SmemStorage &smem_storage;
+
+ /// Reference to output histograms
+ HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
+
+ /// Input data to reduce
+ InputIteratorRA d_in;
+
+
+ //---------------------------------------------------------------------
+ // Interface
+ //---------------------------------------------------------------------
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ PersistentBlockHisto256(
+ SmemStorage &smem_storage, ///< Reference to smem_storage
+ InputIteratorRA d_in, ///< Input data to reduce
+ HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) : ///< Reference to output histograms
+ smem_storage(smem_storage),
+ d_in(d_in),
+ d_out_histograms(d_out_histograms)
+ {
+ // Initialize histogram bin counts to zeros
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+ {
+ int histo_offset = 0;
+
+ #pragma unroll
+ for(; histo_offset + BLOCK_THREADS <= 256; histo_offset += BLOCK_THREADS)
+ {
+ smem_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0;
+ }
+ // Finish up with guarded initialization if necessary
+ if ((histo_offset < BLOCK_THREADS) && (histo_offset + threadIdx.x < 256))
+ {
+ smem_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0;
+ }
+ }
+ }
+
+
+ /**
+ * The number of items processed per "tile"
+ */
+ __device__ __forceinline__ int TileItems()
+ {
+ return TILE_ITEMS;
+ }
+
+
+ /**
+ * Process a single tile.
+ */
+ __device__ __forceinline__ void ConsumeTile(
+ bool &sync_after,
+ SizeT block_offset,
+ int num_valid)
+ {
+ if (num_valid < TILE_ITEMS)
+ {
+ // Only a partially-full tile of samples to read and composite
+ int bounds = num_valid - (threadIdx.x * CHANNELS);
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+ {
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+ {
+ if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds))
+ {
+ unsigned char item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
+ atomicAdd(smem_storage.histograms[CHANNEL] + item, 1);
+ }
+ }
+ }
+
+ }
+ else
+ {
+ // Full tile of samples to read and composite
+ unsigned char items[ITEMS_PER_THREAD][CHANNELS];
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+ {
+ if (CHANNEL < ACTIVE_CHANNELS)
+ {
+ items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
+ }
+ }
+ }
+
+ __threadfence_block();
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
+ {
+ if (CHANNEL < ACTIVE_CHANNELS)
+ {
+ atomicAdd(smem_storage.histograms[CHANNEL] + items[ITEM][CHANNEL], 1);
+ }
+ }
+ }
+ }
+
+ // No need to sync after processing this tile to ensure smem coherence
+ sync_after = false;
+ }
+
+
+ /**
+ * Finalize the computation.
+ */
+ __device__ __forceinline__ void Finalize(
+ int dummy_result)
+ {
+ // Barrier to ensure shared memory histograms are coherent
+ __syncthreads();
+
+ // Copy shared memory histograms to output
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+ {
+ int channel_offset = (blockIdx.x * 256);
+ int histo_offset = 0;
+
+ #pragma unroll
+ for(; histo_offset + BLOCK_THREADS <= 256; histo_offset += BLOCK_THREADS)
+ {
+ d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = smem_storage.histograms[CHANNEL][histo_offset + threadIdx.x];
+ }
+ // Finish up with a guarded copy if necessary
+ if ((histo_offset < BLOCK_THREADS) && (histo_offset + threadIdx.x < 256))
+ {
+ d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = smem_storage.histograms[CHANNEL][histo_offset + threadIdx.x];
+ }
+ }
+ }
+};
+
+
+/**
+ * Specialized for GRID_HISTO_256_SORT
+ */
+template <
+ typename PersistentBlockHisto256Policy, ///< Tuning policy
+ int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
+ int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
+ typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char
+ typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin
+ typename SizeT> ///< Integer type for offsets
+struct PersistentBlockHisto256
+{
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ // Constants
+ enum
+ {
+ BLOCK_THREADS = PersistentBlockHisto256Policy::BLOCK_THREADS,
+ ITEMS_PER_THREAD = PersistentBlockHisto256Policy::ITEMS_PER_THREAD,
+ TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+ TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS,
+
+ STRIPED_COUNTERS_PER_THREAD = (256 + BLOCK_THREADS - 1) / BLOCK_THREADS,
+ };
+
+ // Parameterize BlockRadixSort type for our thread block
+ typedef BlockRadixSort BlockRadixSortT;
+
+ // Parameterize BlockDiscontinuity type for our thread block
+ typedef BlockDiscontinuity BlockDiscontinuityT;
+
+ // Shared memory type required by this thread block
+ union SmemStorage
+ {
+ // Storage for sorting bin values
+ typename BlockRadixSortT::SmemStorage sort_storage;
+
+ struct
+ {
+ // Storage for detecting discontinuities in the tile of sorted bin values
+ typename BlockDiscontinuityT::SmemStorage discont_storage;
+
+ // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
+ unsigned int run_begin[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD];
+ unsigned int run_end[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD];
+ };
+ };
+
+
+ // Discontinuity functor
+ struct DiscontinuityOp
+ {
+ // Reference to smem_storage
+ SmemStorage &smem_storage;
+
+ // Constructor
+ __device__ __forceinline__ DiscontinuityOp(SmemStorage &smem_storage) : smem_storage(smem_storage) {}
+
+ // Discontinuity predicate
+ __device__ __forceinline__ bool operator()(const unsigned char &a, const unsigned char &b, unsigned int b_index)
+ {
+ if (a != b)
+ {
+ // Note the begin/end offsets in shared storage
+ smem_storage.run_begin[b] = b_index;
+ smem_storage.run_end[a] = b_index;
+
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+ };
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ /// Reference to smem_storage
+ SmemStorage &smem_storage;
+
+ /// Histogram counters striped across threads
+ HistoCounter thread_counters[ACTIVE_CHANNELS][STRIPED_COUNTERS_PER_THREAD];
+
+ /// Reference to output histograms
+ HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
+
+ /// Input data to reduce
+ InputIteratorRA d_in;
+
+
+ //---------------------------------------------------------------------
+ // Interface
+ //---------------------------------------------------------------------
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ PersistentBlockHisto256(
+ SmemStorage &smem_storage, ///< Reference to smem_storage
+ InputIteratorRA d_in, ///< Input data to reduce
+ HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) : ///< Reference to output histograms
+ smem_storage(smem_storage),
+ d_in(d_in),
+ d_out_histograms(d_out_histograms)
+ {
+ // Initialize histogram counters striped across threads
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+ {
+ #pragma unroll
+ for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+ {
+ thread_counters[CHANNEL][COUNTER] = 0;
+ }
+ }
+ }
+
+
+ /**
+ * The number of items processed per "tile"
+ */
+ __device__ __forceinline__ int TileItems()
+ {
+ return TILE_ITEMS;
+ }
+
+
+ /**
+ * Composite a tile of input items
+ */
+ __device__ __forceinline__ void Composite(
+ unsigned char (&items)[ITEMS_PER_THREAD], ///< Tile of samples
+ HistoCounter thread_counters[STRIPED_COUNTERS_PER_THREAD]) ///< Histogram counters striped across threads
+ {
+ // Sort bytes in blocked arrangement
+ BlockRadixSortT::SortBlocked(smem_storage.sort_storage, items);
+
+ __syncthreads();
+
+ // Initialize the shared memory's run_begin and run_end for each bin
+ #pragma unroll
+ for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+ {
+ smem_storage.run_begin[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS;
+ smem_storage.run_end[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS;
+ }
+
+ __syncthreads();
+
+ // Note the begin/end run offsets of bin runs in the sorted tile
+ int flags[ITEMS_PER_THREAD]; // unused
+ DiscontinuityOp flag_op(smem_storage);
+ BlockDiscontinuityT::Flag(smem_storage.discont_storage, items, flag_op, flags);
+
+ // Update begin for first item
+ if (threadIdx.x == 0) smem_storage.run_begin[items[0]] = 0;
+
+ __syncthreads();
+
+ // Composite into histogram
+            // Accumulate each bin's run length (run_end - run_begin) into its striped per-thread counter
+ #pragma unroll
+ for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+ {
+ int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x;
+ thread_counters[COUNTER] += smem_storage.run_end[bin] - smem_storage.run_begin[bin];
+ }
+ }
+
+
+ /**
+ * Process one channel within a tile.
+ */
+ __device__ __forceinline__ void ConsumeTileChannel(
+ int channel,
+ SizeT block_offset,
+ int num_valid)
+ {
+ // Load items in striped fashion
+ if (num_valid < TILE_ITEMS)
+ {
+ // Only a partially-full tile of samples to read and composite
+ unsigned char items[ITEMS_PER_THREAD];
+
+        // Assign our tid as the bin for out-of-bounds items (to give an even distribution), and keep track of how many oob items to subtract out later
+ int bounds = (num_valid - (threadIdx.x * CHANNELS));
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ items[ITEM] = ((ITEM * BLOCK_THREADS * CHANNELS) < bounds) ?
+ d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)] :
+ 0;
+ }
+
+ // Composite our histogram data
+ Composite(items, thread_counters[channel]);
+
+ __syncthreads();
+
+ // Correct the overcounting in the zero-bin from invalid (out-of-bounds) items
+ if (threadIdx.x == 0)
+ {
+ int extra = (TILE_ITEMS - num_valid) / CHANNELS;
+ thread_counters[channel][0] -= extra;
+ }
+ }
+ else
+ {
+ // Full tile of samples to read and composite
+ unsigned char items[ITEMS_PER_THREAD];
+
+ // Unguarded loads
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ items[ITEM] = d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)];
+ }
+
+ // Composite our histogram data
+ Composite(items, thread_counters[channel]);
+ }
+ }
+
+
+ /**
+ * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Inductive step.
+ */
+ template
+ struct IterateChannels
+ {
+ /**
+ * Process one channel within a tile.
+ */
+ static __device__ __forceinline__ void ConsumeTileChannel(
+ PersistentBlockHisto256 *persistent_block_histo,
+ SizeT block_offset,
+ int num_valid)
+ {
+ __syncthreads();
+
+ persistent_block_histo->ConsumeTileChannel(CHANNEL, block_offset, num_valid);
+
+ IterateChannels::ConsumeTileChannel(persistent_block_histo, block_offset, num_valid);
+ }
+ };
+
+
+ /**
+ * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Base step.
+ */
+ template
+ struct IterateChannels
+ {
+ static __device__ __forceinline__ void ConsumeTileChannel(PersistentBlockHisto256 *persistent_block_histo, SizeT block_offset, int num_valid) {}
+ };
+
+
+ /**
+ * Process a single tile.
+ *
+ * We take several passes through the tile in this variant, extracting the samples for one channel at a time
+ */
+ __device__ __forceinline__ void ConsumeTile(
+ bool &sync_after,
+ SizeT block_offset,
+ int num_valid)
+ {
+ // First channel
+ ConsumeTileChannel(0, block_offset, num_valid);
+
+ // Iterate through remaining channels
+ IterateChannels<1, ACTIVE_CHANNELS>::ConsumeTileChannel(this, block_offset, num_valid);
+
+ // Need to sync after processing this tile to ensure smem coherence
+ sync_after = true;
+ }
+
+
+ /**
+ * Finalize the computation.
+ */
+ __device__ __forceinline__ void Finalize(
+ int dummy_result)
+ {
+ // Copy counters striped across threads into the histogram output
+ #pragma unroll
+ for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+ {
+ int channel_offset = (blockIdx.x * 256);
+
+ #pragma unroll
+ for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
+ {
+ int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x;
+
+ if ((STRIPED_COUNTERS_PER_THREAD * BLOCK_THREADS == 256) || (bin < 256))
+ {
+ d_out_histograms[CHANNEL][channel_offset + bin] = thread_counters[CHANNEL][COUNTER];
+ }
+ }
+ }
+ }
+};
+
+
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/cub/device/persistent_block/persistent_block_reduce.cuh b/cub/device/persistent_block/persistent_block_reduce.cuh
new file mode 100644
index 0000000000..9dbc7a3f15
--- /dev/null
+++ b/cub/device/persistent_block/persistent_block_reduce.cuh
@@ -0,0 +1,247 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::PersistentBlockReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
+
+ */
+
+#pragma once
+
+#include
+
+#include "../../grid/grid_mapping.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../block/block_load.cuh"
+#include "../../block/block_reduce.cuh"
+#include "../../util_vector.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * Tuning policy for PersistentBlockReduce
+ */
+template <
+ int _BLOCK_THREADS,
+ int _ITEMS_PER_THREAD,
+ int _VECTOR_LOAD_LENGTH,
+ BlockReduceAlgorithm _BLOCK_ALGORITHM,
+ PtxLoadModifier _LOAD_MODIFIER,
+ GridMappingStrategy _GRID_MAPPING>
+struct PersistentBlockReducePolicy
+{
+ enum
+ {
+ BLOCK_THREADS = _BLOCK_THREADS,
+ ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
+ VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH,
+ };
+
+ static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM;
+ static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING;
+ static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
+};
+
+
+/**
+ * \brief PersistentBlockReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
+ */
+template <
+ typename PersistentBlockReducePolicy,
+ typename InputIteratorRA,
+ typename SizeT,
+ typename ReductionOp>
+struct PersistentBlockReduce
+{
+
+ //---------------------------------------------------------------------
+ // Types and constants
+ //---------------------------------------------------------------------
+
+ typedef typename std::iterator_traits::value_type T; // Type of input iterator
+ typedef VectorHelper VecHelper; // Helper type for vectorizing loads of T
+ typedef typename VecHelper::Type VectorT; // Vector of T
+
+ // Constants
+ enum
+ {
+ BLOCK_THREADS = PersistentBlockReducePolicy::BLOCK_THREADS,
+ ITEMS_PER_THREAD = PersistentBlockReducePolicy::ITEMS_PER_THREAD,
+ TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
+ VECTOR_LOAD_LENGTH = PersistentBlockReducePolicy::VECTOR_LOAD_LENGTH,
+
+ // Can vectorize according to the policy if the input iterator is a native pointer to a built-in primitive
+ CAN_VECTORIZE = (PersistentBlockReducePolicy::VECTOR_LOAD_LENGTH > 1) &&
+ (IsPointer::VALUE) &&
+ (VecHelper::BUILT_IN),
+
+ };
+
+ static const BlockReduceAlgorithm BLOCK_ALGORITHM = PersistentBlockReducePolicy::BLOCK_ALGORITHM;
+
+ // Parameterized BlockReduce primitive
+ typedef BlockReduce BlockReduceT;
+
+ // Shared memory type required by this thread block
+ typedef typename BlockReduceT::SmemStorage SmemStorage;
+
+
+ //---------------------------------------------------------------------
+ // Per-thread fields
+ //---------------------------------------------------------------------
+
+ T thread_aggregate; ///< Each thread's partial reduction
+ SmemStorage& smem_storage; ///< Reference to smem_storage
+ InputIteratorRA d_in; ///< Input data to reduce
+ ReductionOp reduction_op; ///< Binary reduction operator
+ int first_tile_size; ///< Size of first tile consumed
+ bool input_aligned; ///< Whether or not input is vector-aligned
+
+
+ //---------------------------------------------------------------------
+ // Interface
+ //---------------------------------------------------------------------
+
+ /**
+ * Constructor
+ */
+ __device__ __forceinline__ PersistentBlockReduce(
+ SmemStorage& smem_storage, ///< Reference to smem_storage
+ InputIteratorRA d_in, ///< Input data to reduce
+ ReductionOp reduction_op) : ///< Binary reduction operator
+ smem_storage(smem_storage),
+ d_in(d_in),
+ reduction_op(reduction_op),
+ first_tile_size(TILE_ITEMS),
+ input_aligned(CAN_VECTORIZE && ((size_t(d_in) & (sizeof(VectorT) - 1)) == 0)){}
+
+
+ /**
+ * The number of items processed per "tile"
+ */
+ __device__ __forceinline__ int TileItems()
+ {
+ return TILE_ITEMS;
+ }
+
+
+ /**
+ * Process a single tile.
+ *
+     * Each thread reduces only the values it loads. If \p first_tile, this
+ * partial reduction is stored into \p thread_aggregate. Otherwise it is
+ * accumulated into \p thread_aggregate.
+ */
+ __device__ __forceinline__ void ConsumeTile(
+ bool &sync_after,
+ SizeT block_offset,
+ int num_valid,
+ bool first_tile)
+ {
+ if (num_valid < TILE_ITEMS)
+ {
+ // Our first tile is a partial tile size
+ if (first_tile) first_tile_size = num_valid;
+
+ // Partial tile
+ int thread_offset = threadIdx.x;
+
+ if ((first_tile) && (thread_offset < num_valid))
+ {
+ thread_aggregate = ThreadLoad(d_in + block_offset + thread_offset);
+ thread_offset += BLOCK_THREADS;
+ }
+
+ while (thread_offset < num_valid)
+ {
+ T item = ThreadLoad(d_in + block_offset + thread_offset);
+ thread_aggregate = reduction_op(thread_aggregate, item);
+ thread_offset += BLOCK_THREADS;
+ }
+ }
+ else
+ {
+ T items[ITEMS_PER_THREAD];
+
+ // Load full tile
+ if (input_aligned)
+ {
+ // Alias items as an array of VectorT and load it in striped fashion
+ BlockLoadDirectStriped(
+ reinterpret_cast(d_in + block_offset),
+ reinterpret_cast(items));
+ }
+ else
+ {
+ // Load items in striped fashion
+ BlockLoadDirectStriped(d_in + block_offset, items);
+ }
+
+ // Prevent hoisting
+ __threadfence_block();
+
+ // Reduce items within each thread
+ T partial = ThreadReduce(items, reduction_op);
+
+ // Update|assign the thread's running aggregate
+ thread_aggregate = (first_tile) ?
+ partial :
+ reduction_op(thread_aggregate, partial);
+ }
+
+ // No synchronization needed after tile processing
+ sync_after = false;
+ }
+
+
+ /**
+ * Finalize the computation.
+ */
+ __device__ __forceinline__ void Finalize(
+ T& block_aggregate)
+ {
+ // Cooperative reduction across the thread block (guarded reduction if our first tile was a partial tile)
+ block_aggregate = (first_tile_size < TILE_ITEMS) ?
+ BlockReduceT::Reduce(smem_storage, thread_aggregate, reduction_op, first_tile_size) :
+ BlockReduceT::Reduce(smem_storage, thread_aggregate, reduction_op);
+ }
+
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/cub/device/tiles/tiles_scan.cuh b/cub/device/persistent_block/persistent_block_scan.cuh
similarity index 88%
rename from cub/device/tiles/tiles_scan.cuh
rename to cub/device/persistent_block/persistent_block_scan.cuh
index 9b92a7363d..a6ff677983 100644
--- a/cub/device/tiles/tiles_scan.cuh
+++ b/cub/device/persistent_block/persistent_block_scan.cuh
@@ -28,7 +28,7 @@
/**
* \file
- * cub::TilesScan implements an abstraction of CUDA thread blocks for
+ * cub::PersistentBlockScan implements an abstraction of CUDA thread blocks for
* participating in device-wide prefix scan.
*/
@@ -61,7 +61,7 @@ enum
/**
- * Tuning policy for TilesScan
+ * Tuning policy for PersistentBlockScan
*/
template <
int _BLOCK_THREADS,
@@ -69,7 +69,7 @@ template <
BlockLoadPolicy _LOAD_POLICY,
BlockStorePolicy _STORE_POLICY,
BlockScanAlgorithm _SCAN_ALGORITHM>
-struct TilesScanPolicy
+struct PersistentBlockScanPolicy
{
enum
{
@@ -87,15 +87,15 @@ struct TilesScanPolicy
/**
- * \brief TilesScan implements an abstraction of CUDA thread blocks for
+ * \brief PersistentBlockScan implements an abstraction of CUDA thread blocks for
* participating in device-wide reduction.
*/
template <
- typename TilesScanPolicy,
+ typename PersistentBlockScanPolicy,
typename InputIteratorRA,
typename OutputIteratorRA,
typename SizeT>
-class TilesScan
+class PersistentBlockScan
{
public:
@@ -107,12 +107,12 @@ public:
typedef typename std::iterator_traits::value_type T;
// Data type of block-signaling flag
- typedef typename TilesScanPolicy::BlockFlag BlockFlag;
+ typedef typename PersistentBlockScanPolicy::BlockFlag BlockFlag;
// Constants
enum
{
- TILE_ITEMS = TilesScanPolicy::BLOCK_THREADS * TilesScanPolicy::ITEMS_PER_THREAD,
+ TILE_ITEMS = PersistentBlockScanPolicy::BLOCK_THREADS * PersistentBlockScanPolicy::ITEMS_PER_THREAD,
};
struct Signal
@@ -146,22 +146,22 @@ public:
// Parameterized block load
typedef BlockLoad<
InputIteratorRA,
- TilesScanPolicy::BLOCK_THREADS,
- TilesScanPolicy::ITEMS_PER_THREAD,
- TilesScanPolicy::LOAD_POLICY> BlockLoadT;
+ PersistentBlockScanPolicy::BLOCK_THREADS,
+ PersistentBlockScanPolicy::ITEMS_PER_THREAD,
+ PersistentBlockScanPolicy::LOAD_POLICY> BlockLoadT;
// Parameterized block store
typedef BlockStore<
OutputIteratorRA,
- TilesScanPolicy::BLOCK_THREADS,
- TilesScanPolicy::ITEMS_PER_THREAD,
- TilesScanPolicy::STORE_POLICY> BlockStoreT;
+ PersistentBlockScanPolicy::BLOCK_THREADS,
+ PersistentBlockScanPolicy::ITEMS_PER_THREAD,
+ PersistentBlockScanPolicy::STORE_POLICY> BlockStoreT;
// Parameterized block scan
typedef BlockScan<
T,
- TilesScanPolicy::BLOCK_THREADS,
- TilesScanPolicy::SCAN_ALGORITHM> BlockScanT;
+ PersistentBlockScanPolicy::BLOCK_THREADS,
+ PersistentBlockScanPolicy::SCAN_ALGORITHM> BlockScanT;
// Parameterized warp reduce
typedef WarpReduce WarpReduceT;
@@ -270,7 +270,7 @@ public:
ScanOp &scan_op,
T &thread_aggregate)
{
- T items[TilesScanPolicy::ITEMS_PER_THREAD];
+ T items[PersistentBlockScanPolicy::ITEMS_PER_THREAD];
BlockLoadT::Load(smem_storage.load, d_in + block_offset, items);
@@ -392,7 +392,7 @@ public:
* The return value is undefined in threads other than thread0.
*/
template
- static __device__ __forceinline__ T ProcessTilesEvenShare(
+ static __device__ __forceinline__ T ProcessPersistentBlockEvenShare(
SmemStorage &smem_storage,
InputIteratorRA d_in,
SizeT block_offset,
@@ -438,7 +438,7 @@ public:
* The return value is undefined in threads other than thread0.
*/
template
- static __device__ __forceinline__ T ProcessTilesDynamic(
+ static __device__ __forceinline__ T ProcessPersistentBlockDynamic(
SmemStorage &smem_storage,
InputIteratorRA d_in,
SizeT num_items,
@@ -505,45 +505,6 @@ public:
}
}
-
- /**
- * \brief Consumes input tiles according to TilesScanPolicy::GRID_MAPPING, computing a threadblock-wide reduction for thread0 using the specified binary reduction functor.
- *
- * The return value is undefined in threads other than thread0.
- */
- template
- static __device__ __forceinline__ T ProcessTiles(
- SmemStorage &smem_storage,
- InputIteratorRA d_in,
- SizeT num_items,
- GridEvenShare &even_share,
- GridQueue &queue,
- ScanOp &scan_op)
- {
- if (TilesScanPolicy::GRID_MAPPING == GRID_MAPPING_EVEN_SHARE)
- {
- // Even share
- even_share.BlockInit();
-
- return ProcessTilesEvenShare(
- smem_storage,
- d_in,
- even_share.block_offset,
- even_share.block_oob,
- scan_op);
- }
- else
- {
- // Dynamically dequeue
- return ProcessTilesDynamic(
- smem_storage,
- d_in,
- num_items,
- queue,
- scan_op);
- }
- }
-
};
diff --git a/cub/device/tiles/tiles_histo_256.cuh b/cub/device/tiles/tiles_histo_256.cuh
deleted file mode 100644
index 94623db89f..0000000000
--- a/cub/device/tiles/tiles_histo_256.cuh
+++ /dev/null
@@ -1,499 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill. All rights reserved.
- * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of the NVIDIA CORPORATION nor the
- * names of its contributors may be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::TilesHisto256 implements an abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide 256-bin histogram.
- */
-
-#pragma once
-
-#include
-
-#include "../../grid/grid_mapping.cuh"
-#include "../../grid/grid_even_share.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../block/block_load.cuh"
-#include "../../block/block_histo_256.cuh"
-#include "../../util_vector.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-
-/******************************************************************************
- * Tuning policy
- ******************************************************************************/
-
-/**
- * Tuning policy for TilesHisto256
- */
-template <
- int _BLOCK_THREADS,
- int _ITEMS_PER_THREAD,
- BlockHisto256Algorithm _BLOCK_ALGORITHM,
- GridMappingStrategy _GRID_MAPPING>
-struct TilesHisto256Policy
-{
- enum
- {
- BLOCK_THREADS = _BLOCK_THREADS,
- ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
- };
-
- static const BlockHisto256Algorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM;
- static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING;
-};
-
-
-
-/******************************************************************************
- * TilesHisto256
- ******************************************************************************/
-
-/**
- * \brief TilesHisto256 implements an abstraction of CUDA thread blocks for participating in device-wide histogram.
- */
-template <
- typename TilesHisto256Policy, ///< Tuning policy
- int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
- typename SizeT> ///< Integer type for offsets
-class TilesHisto256
-{
-private:
-
- //---------------------------------------------------------------------
- // Types and constants
- //---------------------------------------------------------------------
-
- // Constants
- enum
- {
- BLOCK_THREADS = TilesHisto256Policy::BLOCK_THREADS,
- ITEMS_PER_THREAD = TilesHisto256Policy::ITEMS_PER_THREAD,
- TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
- TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS,
- };
-
- static const BlockHisto256Algorithm BLOCK_ALGORITHM = TilesHisto256Policy::BLOCK_ALGORITHM;
-
- // Parameterized BlockHisto256 primitive
- typedef BlockHisto256 BlockHisto256T;
-
- // Shared memory type for this threadblock
- struct _SmemStorage
- {
- SizeT block_offset; // Location where to dequeue input for dynamic operation
- typename BlockHisto256T::SmemStorage block_histo; // Smem needed for cooperative histogramming
- };
-
-public:
-
- /// \smemstorage{TilesHisto256}
- typedef _SmemStorage SmemStorage;
-
-private:
-
- //---------------------------------------------------------------------
- // Utility operations
- //---------------------------------------------------------------------
-
- /**
- * Channel-oriented (one channel at a time)
- */
- template <
- BlockHisto256Algorithm _BLOCK_ALGORITHM,
- bool CHANNEL_ORIENTED = (_BLOCK_ALGORITHM == BLOCK_BYTE_HISTO_SORT) >
- struct TilesHisto256Internal
- {
- /**
- * Process one channel within a tile.
- */
- template <
- typename InputIteratorRA,
- typename HistoCounter,
- int ACTIVE_CHANNELS>
- static __device__ __forceinline__ void ConsumeTileChannel(
- SmemStorage &smem_storage,
- int channel,
- InputIteratorRA d_in,
- SizeT block_offset,
- HistoCounter (&histograms)[ACTIVE_CHANNELS][256],
- const int &guarded_items = TILE_ITEMS)
- {
- // Load items in striped fashion
- if (guarded_items < TILE_ITEMS)
- {
- unsigned char items[ITEMS_PER_THREAD];
-
- // Assign our tid as the bin for out-of-bounds items (to give an even distribution), and keep track of how oob items to subtract out later
- int bounds = (guarded_items - (threadIdx.x * CHANNELS));
-
- #pragma unroll
- for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
- {
- items[ITEM] = ((ITEM * BLOCK_THREADS * CHANNELS) < bounds) ?
- d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)] :
- 0;
- }
-
- // Composite our histogram data
- BlockHisto256T::Composite(smem_storage.block_histo, items, histograms[channel]);
-
- __syncthreads();
-
- if (threadIdx.x == 0)
- {
- int extra = (TILE_ITEMS - guarded_items) / CHANNELS;
- histograms[channel][0] -= extra;
- }
- }
- else
- {
- unsigned char items[ITEMS_PER_THREAD];
-
- // Unguarded loads
- #pragma unroll
- for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
- {
- items[ITEM] = d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)];
- }
-
- // Composite our histogram data
- BlockHisto256T::Composite(smem_storage.block_histo, items, histograms[channel]);
- }
- }
-
-
- /**
- * Process one tile.
- */
- template <
- typename InputIteratorRA,
- typename HistoCounter,
- int ACTIVE_CHANNELS>
- static __device__ __forceinline__ void ConsumeTile(
- SmemStorage &smem_storage,
- InputIteratorRA d_in,
- SizeT block_offset,
- HistoCounter (&histograms)[ACTIVE_CHANNELS][256],
- const int &guarded_items = TILE_ITEMS)
- {
- // We take several passes through the tile in this variant, extracting the samples for one channel at a time
-
- // First channel
- ConsumeTileChannel(smem_storage, 0, d_in, block_offset, histograms, guarded_items);
-
- // Iterate through remaining channels
- #pragma unroll
- for (int CHANNEL = 1; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
- {
- __syncthreads();
-
- ConsumeTileChannel(smem_storage, CHANNEL, d_in, block_offset, histograms, guarded_items);
- }
- }
- };
-
-
-
- /**
- * BLOCK_BYTE_HISTO_ATOMIC algorithmic variant
- */
- template
- struct TilesHisto256Internal<_BLOCK_ALGORITHM, false>
- {
- /**
- * Process one tile.
- */
- template <
- typename InputIteratorRA,
- typename HistoCounter,
- int ACTIVE_CHANNELS>
- static __device__ __forceinline__ void ConsumeTile(
- SmemStorage &smem_storage,
- InputIteratorRA d_in,
- SizeT block_offset,
- HistoCounter (&histograms)[ACTIVE_CHANNELS][256],
- const int &guarded_items = TILE_ITEMS)
- {
-
- if (guarded_items < TILE_ITEMS)
- {
- int bounds = guarded_items - (threadIdx.x * CHANNELS);
-
- #pragma unroll
- for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
- {
- #pragma unroll
- for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
- {
- if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds))
- {
- unsigned char item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
- atomicAdd(histograms[CHANNEL] + item, 1);
- }
- }
- }
-
- }
- else
- {
- unsigned char items[ITEMS_PER_THREAD][CHANNELS];
-
- #pragma unroll
- for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
- {
- #pragma unroll
- for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
- {
- if (CHANNEL < ACTIVE_CHANNELS)
- {
- items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
- }
- }
- }
-
- __threadfence_block();
-
- #pragma unroll
- for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
- {
- #pragma unroll
- for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
- {
- if (CHANNEL < ACTIVE_CHANNELS)
- {
- atomicAdd(histograms[CHANNEL] + items[ITEM][CHANNEL], 1);
- }
- }
- }
-
-
-/*
- #pragma unroll
- for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
- {
- unsigned char items[ITEMS_PER_THREAD][CHANNELS];
-
- int tile_offset = (CHANNEL * TILE_CHANNEL_ITEMS);
-
- #pragma unroll
- for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
- {
- items[ITEM] = d_in[block_offset + tile_offset + (ITEM * BLOCK_THREADS) + threadIdx.x];
- }
-
- __threadfence_block();
-
- // Update histogram
-
- if ((ACTIVE_CHANNELS == CHANNELS) || (my_channel < ACTIVE_CHANNELS))
- {
- #pragma unroll
- for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
- {
- atomicAdd(histograms[my_channel] + items[ITEM], 1);
- }
- }
- }
-*/
- }
- }
- };
-
-
-
-
-
-
-public:
-
- //---------------------------------------------------------------------
- // Interface
- //---------------------------------------------------------------------
-
- /**
- * \brief Consumes input tiles using an even-share policy
- */
- template <
- typename InputIteratorRA,
- typename HistoCounter,
- int ACTIVE_CHANNELS>
- static __device__ __forceinline__ void ProcessTilesEvenShare(
- SmemStorage &smem_storage,
- InputIteratorRA d_in,
- SizeT block_offset,
- const SizeT &block_oob,
- HistoCounter (&histograms)[ACTIVE_CHANNELS][256])
- {
- // Initialize histograms
- #pragma unroll
- for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
- {
- BlockHisto256T::InitHistogram(histograms[CHANNEL]);
- }
-
- __syncthreads();
-
- // Consume full tiles
- while (block_offset + TILE_ITEMS <= block_oob)
- {
- TilesHisto256Internal::ConsumeTile(smem_storage, d_in, block_offset, histograms);
-
- block_offset += TILE_ITEMS;
-
- // Skip synchro for atomic version since we know it doesn't use any smem
- if (BLOCK_ALGORITHM != BLOCK_BYTE_HISTO_ATOMIC)
- {
- __syncthreads();
- }
- }
-
- // Consume any remaining partial-tile
- if (block_offset < block_oob)
- {
- TilesHisto256Internal::ConsumeTile(smem_storage, d_in, block_offset, histograms, block_oob - block_offset);
- }
- }
-
-
- /**
- * \brief Consumes input tiles using a dynamic queue policy
- */
- template <
- typename InputIteratorRA,
- typename HistoCounter,
- int ACTIVE_CHANNELS>
- static __device__ __forceinline__ void ProcessTilesDynamic(
- SmemStorage &smem_storage,
- InputIteratorRA d_in,
- SizeT num_items,
- GridQueue &queue,
- HistoCounter (&histograms)[ACTIVE_CHANNELS][256])
- {
-
- // Initialize histograms
- #pragma unroll
- for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
- {
- BlockHisto256T::InitHistogram(histograms[CHANNEL]);
- }
-
- // Dynamically consume tiles
- while (true)
- {
- // Dequeue up to TILE_ITEMS
- if (threadIdx.x == 0)
- {
- smem_storage.block_offset = queue.Drain(TILE_ITEMS);
- }
-
- __syncthreads();
-
- SizeT block_offset = smem_storage.block_offset;
-
- __syncthreads();
-
- if (block_offset + TILE_ITEMS > num_items)
- {
- if (block_offset < num_items)
- {
- // We have less than a full tile to consume
- TilesHisto256Internal::ConsumeTile(smem_storage, d_in, block_offset, histograms, num_items - block_offset);
- }
-
- // No more work to do
- break;
- }
-
- // We have a full tile to consume
- TilesHisto256Internal::ConsumeTile(smem_storage, d_in, block_offset, histograms);
- }
- }
-
-
- /**
- * Specialized for GRID_MAPPING_EVEN_SHARE
- */
- template
- struct Mapping
- {
- template <
- typename InputIteratorRA,
- typename HistoCounter,
- int ACTIVE_CHANNELS>
- static __device__ __forceinline__ void ProcessTiles(
- SmemStorage &smem_storage,
- InputIteratorRA d_in,
- SizeT num_items,
- GridEvenShare &even_share,
- GridQueue &queue,
- HistoCounter (&histograms)[ACTIVE_CHANNELS][256])
- {
- even_share.BlockInit();
- return ProcessTilesEvenShare(smem_storage, d_in, even_share.block_offset, even_share.block_oob, histograms);
- }
-
- };
-
-
- /**
- * Specialized for GRID_MAPPING_DYNAMIC
- */
- template
- struct Mapping
- {
- template <
- typename InputIteratorRA,
- typename HistoCounter,
- int ACTIVE_CHANNELS>
- static __device__ __forceinline__ void ProcessTiles(
- SmemStorage &smem_storage,
- InputIteratorRA d_in,
- SizeT num_items,
- GridEvenShare &even_share,
- GridQueue &queue,
- HistoCounter (&histograms)[ACTIVE_CHANNELS][256])
- {
- ProcessTilesDynamic(smem_storage, d_in, num_items, queue, histograms);
- }
-
- };
-
-};
-
-
-} // CUB namespace
-CUB_NS_POSTFIX // Optional outer namespace(s)
-
diff --git a/cub/device/tiles/tiles_reduce.cuh b/cub/device/tiles/tiles_reduce.cuh
deleted file mode 100644
index 54b33d1ae9..0000000000
--- a/cub/device/tiles/tiles_reduce.cuh
+++ /dev/null
@@ -1,457 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill. All rights reserved.
- * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of the NVIDIA CORPORATION nor the
- * names of its contributors may be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::TilesReduce implements an abstraction of CUDA thread blocks for
- * reducing multiple tiles as part of device-wide reduction.
-
- */
-
-#pragma once
-
-#include
-
-#include "../../grid/grid_mapping.cuh"
-#include "../../grid/grid_even_share.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../block/block_load.cuh"
-#include "../../block/block_reduce.cuh"
-#include "../../util_vector.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * Tuning policy for TilesReduce
- */
-template <
- int _BLOCK_THREADS,
- int _ITEMS_PER_THREAD,
- int _VECTOR_LOAD_LENGTH,
- BlockReduceAlgorithm _BLOCK_ALGORITHM,
- PtxLoadModifier _LOAD_MODIFIER,
- GridMappingStrategy _GRID_MAPPING>
-struct TilesReducePolicy
-{
- enum
- {
- BLOCK_THREADS = _BLOCK_THREADS,
- ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
- VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH,
- };
-
- static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM;
- static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING;
- static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
-};
-
-
-/**
- * \brief TilesReduce implements an abstraction of CUDA thread blocks for
- * participating in device-wide reduction.
- */
-template <
- typename TilesReducePolicy,
- typename InputIteratorRA,
- typename SizeT>
-class TilesReduce
-{
-private:
-
- //---------------------------------------------------------------------
- // Types and constants
- //---------------------------------------------------------------------
-
- // Data type of input iterator
- typedef typename std::iterator_traits::value_type T;
-
- // Constants
- enum
- {
- // Number of items to be be processed to completion before the thread block terminates or obtains more work
- TILE_ITEMS = TilesReducePolicy::BLOCK_THREADS * TilesReducePolicy::ITEMS_PER_THREAD,
- };
-
- // Parameterized BlockReduce primitive
- typedef BlockReduce BlockReduceT;
-
- // Shared memory type for this threadblock
- struct _SmemStorage
- {
- SizeT block_offset; // Location where to dequeue input for dynamic operation
- typename BlockReduceT::SmemStorage reduce; // Smem needed for cooperative reduction
- };
-
-public:
-
- /// \smemstorage{TilesReduce}
- typedef _SmemStorage SmemStorage;
-
-private:
-
- //---------------------------------------------------------------------
- // Utility operations
- //---------------------------------------------------------------------
-
- /**
- * Process a single, full tile. Specialized for native pointers
- *
- * Each thread reduces only the values it loads. If \p FIRST_TILE,
- * this partial reduction is stored into \p thread_aggregate. Otherwise
- * it is accumulated into \p thread_aggregate.
- *
- * Performs a block-wide barrier synchronization
- */
- template <
- bool VECTORIZE_INPUT,
- bool FIRST_TILE,
- typename ReductionOp>
- static __device__ __forceinline__ void ConsumeFullTile(
- SmemStorage &smem_storage,
- InputIteratorRA d_in,
- SizeT block_offset,
- ReductionOp &reduction_op,
- T &thread_aggregate)
- {
- T items[TilesReducePolicy::ITEMS_PER_THREAD];
-
- if (VECTORIZE_INPUT)
- {
- typedef VectorHelper VecHelper;
- typedef typename VecHelper::Type VectorT;
-
- // Alias items as an array of VectorT and load it in striped fashion
- BlockLoadDirectStriped(
- reinterpret_cast(d_in + block_offset),
- reinterpret_cast(items));
- }
- else
- {
- // Load items in striped fashion
- BlockLoadDirectStriped(
- d_in + block_offset,
- items);
- }
-
- // Prevent hoisting
- __threadfence_block();
-
- // Reduce items within each thread
- T partial = ThreadReduce(items, reduction_op);
-
- // Update|assign the thread's running aggregate
- thread_aggregate = (FIRST_TILE) ?
- partial :
- reduction_op(thread_aggregate, partial);
- }
-
-
- /**
- * Process a single, partial tile.
- *
- * Each thread reduces only the values it loads. If \p FIRST_TILE,
- * this partial reduction is stored into \p thread_aggregate. Otherwise
- * it is accumulated into \p thread_aggregate.
- */
- template <
- bool FIRST_TILE,
- typename ReductionOp>
- static __device__ __forceinline__ void ConsumePartialTile(
- SmemStorage &smem_storage,
- InputIteratorRA d_in,
- SizeT block_offset,
- const SizeT &block_oob,
- ReductionOp &reduction_op,
- T &thread_aggregate)
- {
- SizeT thread_offset = block_offset + threadIdx.x;
-
- if ((FIRST_TILE) && (thread_offset < block_oob))
- {
- thread_aggregate = ThreadLoad(d_in + thread_offset);
- thread_offset += TilesReducePolicy::BLOCK_THREADS;
- }
-
- while (thread_offset < block_oob)
- {
- T item = ThreadLoad(d_in + thread_offset);
- thread_aggregate = reduction_op(thread_aggregate, item);
- thread_offset += TilesReducePolicy::BLOCK_THREADS;
- }
- }
-
-
- /**
- * \brief Consumes input tiles using an even-share policy, computing a threadblock-wide reduction for thread0 using the specified binary reduction functor.
- *
- * The return value is undefined in threads other than thread0.
- */
- template <
- bool VECTORIZE_INPUT,
- typename ReductionOp>
- static __device__ __forceinline__ T ProcessTilesEvenShare(
- SmemStorage &smem_storage,
- InputIteratorRA d_in,
- SizeT block_offset,
- const SizeT &block_oob,
- ReductionOp &reduction_op)
- {
- if (block_offset + TILE_ITEMS <= block_oob)
- {
- // We have at least one full tile to consume
- T thread_aggregate;
- ConsumeFullTile(smem_storage, d_in, block_offset, reduction_op, thread_aggregate);
- block_offset += TILE_ITEMS;
-
- // Consume any other full tiles
- while (block_offset + TILE_ITEMS <= block_oob)
- {
- ConsumeFullTile(smem_storage, d_in, block_offset, reduction_op, thread_aggregate);
- block_offset += TILE_ITEMS;
- }
-
- // Consume any remaining input
- ConsumePartialTile(smem_storage, d_in, block_offset, block_oob, reduction_op, thread_aggregate);
-
- // Compute the block-wide reduction (every thread has a valid input)
- return BlockReduceT::Reduce(smem_storage.reduce, thread_aggregate, reduction_op);
- }
- else
- {
- // We have less than a full tile to consume
- T thread_aggregate;
- ConsumePartialTile(smem_storage, d_in, block_offset, block_oob, reduction_op, thread_aggregate);
-
- // Compute the block-wide reduction (up to block_items threads have valid inputs)
- SizeT block_items = block_oob - block_offset;
- return BlockReduceT::Reduce(smem_storage.reduce, thread_aggregate, reduction_op, block_items);
- }
- }
-
-
- /**
- * \brief Consumes input tiles using a dynamic queue policy, computing a threadblock-wide reduction for thread0 using the specified binary reduction functor.
- *
- * The return value is undefined in threads other than thread0.
- */
- template <
- bool VECTORIZE_INPUT,
- typename ReductionOp>
- static __device__ __forceinline__ T ProcessTilesDynamic(
- SmemStorage &smem_storage,
- InputIteratorRA d_in,
- SizeT num_items,
- GridQueue &queue,
- ReductionOp &reduction_op)
- {
- // Each thread block is statically assigned at some input, otherwise its
- // block_aggregate will be undefined.
- SizeT block_offset = blockIdx.x * TILE_ITEMS;
-
- if (block_offset + TILE_ITEMS <= num_items)
- {
- // We have a full tile to consume
- T thread_aggregate;
- ConsumeFullTile(smem_storage, d_in, block_offset, reduction_op, thread_aggregate);
-
- // Dynamically consume other tiles
- SizeT even_share_base = gridDim.x * TILE_ITEMS;
-
- if (even_share_base < num_items)
- {
- // There are tiles left to consume
- while (true)
- {
- // Dequeue up to TILE_ITEMS
- if (threadIdx.x == 0)
- {
- smem_storage.block_offset = queue.Drain(TILE_ITEMS) + even_share_base;
- }
-
- __syncthreads();
-
- block_offset = smem_storage.block_offset;
-
- __syncthreads();
-
- if (block_offset + TILE_ITEMS > num_items)
- {
- if (block_offset < num_items)
- {
- // We have less than a full tile to consume
- ConsumePartialTile(smem_storage, d_in, block_offset, num_items, reduction_op, thread_aggregate);
- }
-
- // No more work to do
- break;
- }
-
- // We have a full tile to consume
- ConsumeFullTile(smem_storage, d_in, block_offset, reduction_op, thread_aggregate);
- }
- }
-
- // Compute the block-wide reduction (every thread has a valid input)
- return BlockReduceT::Reduce(smem_storage.reduce, thread_aggregate, reduction_op);
- }
- else
- {
- // We have less than a full tile to consume
- T thread_aggregate;
- SizeT block_items = num_items - block_offset;
- ConsumePartialTile(smem_storage, d_in, block_offset, num_items, reduction_op, thread_aggregate);
-
- // Compute the block-wide reduction (up to block_items threads have valid inputs)
- return BlockReduceT::Reduce(smem_storage.reduce, thread_aggregate, reduction_op, block_items);
- }
- }
-
-
-public:
-
- //---------------------------------------------------------------------
- // Interface
- //---------------------------------------------------------------------
-
- /**
- * \brief Consumes input tiles using an even-share policy, computing a threadblock-wide reduction for thread0 using the specified binary reduction functor.
- *
- * The return value is undefined in threads other than thread0.
- */
- template
- static __device__ __forceinline__ T ProcessTilesEvenShare(
- SmemStorage &smem_storage,
- InputIteratorRA d_in,
- SizeT block_offset,
- const SizeT &block_oob,
- ReductionOp &reduction_op)
- {
- typedef VectorHelper VecHelper;
- typedef typename VecHelper::Type VectorT;
-
- if ((IsPointer::VALUE) &&
- (TilesReducePolicy::VECTOR_LOAD_LENGTH > 1) &&
- (VecHelper::BUILT_IN) &&
- ((size_t(d_in) & (sizeof(VectorT) - 1)) == 0))
- {
- return ProcessTilesEvenShare(smem_storage, d_in, block_offset, block_oob, reduction_op);
- }
- else
- {
- return ProcessTilesEvenShare(smem_storage, d_in, block_offset, block_oob, reduction_op);
- }
- }
-
-
- /**
- * \brief Consumes input tiles using a dynamic queue policy, computing a threadblock-wide reduction for thread0 using the specified binary reduction functor.
- *
- * The return value is undefined in threads other than thread0.
- */
- template
- static __device__ __forceinline__ T ProcessTilesDynamic(
- SmemStorage &smem_storage,
- InputIteratorRA d_in,
- SizeT num_items,
- GridQueue &queue,
- ReductionOp &reduction_op)
- {
- typedef VectorHelper VecHelper;
- typedef typename VecHelper::Type VectorT;
-
- if ((IsPointer::VALUE) &&
- (TilesReducePolicy::VECTOR_LOAD_LENGTH > 1) &&
- (VecHelper::BUILT_IN) &&
- ((size_t(d_in) & (sizeof(VectorT) - 1)) == 0))
- {
- return ProcessTilesDynamic(smem_storage, d_in, num_items, queue, reduction_op);
- }
- else
- {
- return ProcessTilesDynamic(smem_storage, d_in, num_items, queue, reduction_op);
- }
- }
-
-
- /**
- * Specialized for GRID_MAPPING_EVEN_SHARE
- */
- template
- struct Mapping
- {
- template
- static __device__ __forceinline__ T ProcessTiles(
- SmemStorage &smem_storage,
- InputIteratorRA d_in,
- SizeT num_items,
- GridEvenShare &even_share,
- GridQueue &queue,
- ReductionOp &reduction_op)
- {
- // Even share
- even_share.BlockInit();
-
- return ProcessTilesEvenShare(smem_storage, d_in, even_share.block_offset, even_share.block_oob, reduction_op);
- }
-
- };
-
-
- /**
- * Specialized for GRID_MAPPING_DYNAMIC
- */
- template
- struct Mapping
- {
- template
- static __device__ __forceinline__ T ProcessTiles(
- SmemStorage &smem_storage,
- InputIteratorRA d_in,
- SizeT num_items,
- GridEvenShare &even_share,
- GridQueue &queue,
- ReductionOp &reduction_op)
- {
- // Dynamically dequeue
- return ProcessTilesDynamic(smem_storage, d_in, num_items, queue, reduction_op);
- }
-
- };
-
-};
-
-
-} // CUB namespace
-CUB_NS_POSTFIX // Optional outer namespace(s)
-
diff --git a/cub/grid/grid_even_share.cuh b/cub/grid/grid_even_share.cuh
index d3013285a8..136f4bed83 100644
--- a/cub/grid/grid_even_share.cuh
+++ b/cub/grid/grid_even_share.cuh
@@ -26,15 +26,11 @@
*
******************************************************************************/
-/******************************************************************************
- * Threadblock Work management.
- *
- * A given threadblock may receive one of three different amounts of
- * work: "big", "normal", and "last". The big workloads are one
- * grain greater than the normal, and the last workload
- * does the extra work.
- *
- ******************************************************************************/
+/**
+ * \file
+ * cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains).
+ */
+
#pragma once
@@ -56,7 +52,7 @@ namespace cub {
/**
- * \brief A descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains).
+ * \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains).
*
* \par Overview
* GridEvenShare indicates which sections of input are to be mapped onto which threadblocks.
@@ -87,17 +83,36 @@ private:
public:
/// Total number of input items
- SizeT num_items;
+ SizeT num_items;
/// Grid size in threadblocks
- int grid_size;
+ int grid_size;
/// Offset into input marking the beginning of the owning thread block's segment of input tiles
- SizeT block_offset;
+ SizeT block_offset;
/// Offset into input of marking the end (one-past) of the owning thread block's segment of input tiles
SizeT block_oob;
+ /**
+ * \brief Block-based constructor for single-block grids.
+ */
+ __device__ __forceinline__ GridEvenShare(SizeT num_items) :
+ num_items(num_items),
+ grid_size(1),
+ block_offset(0),
+ block_oob(num_items) {}
+
+
+ /**
+ * \brief Default constructor. Zero-initializes block-specific fields.
+ */
+ __host__ __device__ __forceinline__ GridEvenShare() :
+ num_items(0),
+ grid_size(0),
+ block_offset(0),
+ block_oob(0) {}
+
/**
* \brief Initializes the grid-specific members \p num_items and \p grid_size. To be called prior prior to kernel launch)
diff --git a/cub/grid/grid_mapping.cuh b/cub/grid/grid_mapping.cuh
index ea4c6def89..a3e0b6d8b7 100644
--- a/cub/grid/grid_mapping.cuh
+++ b/cub/grid/grid_mapping.cuh
@@ -28,13 +28,13 @@
/**
* \file
- * cub::GridMappingStrategy enumerates alternative strategies for mapping
- * constant-sized tiles of device-wide data onto a grid of CUDA thread
- * blocks.
+ * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
*/
#pragma once
+#include "grid_even_share.cuh"
+#include "grid_queue.cuh"
#include "../util_namespace.cuh"
/// Optional outer namespace(s)
@@ -50,15 +50,18 @@ namespace cub {
*/
+/******************************************************************************
+ * Mapping policies
+ *****************************************************************************/
+
+
/**
- * GridMappingStrategy enumerates alternative strategies for mapping
- * constant-sized tiles of device-wide data onto a grid of CUDA thread
- * blocks.
+ * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
*/
enum GridMappingStrategy
{
/**
- * \brief An "even-share" strategy.
+ * \brief An "even-share" strategy for assigning input tiles to thread blocks.
*
* \par Overview
* The input is evenly partitioned into \p p segments, where \p p is
@@ -73,7 +76,7 @@ enum GridMappingStrategy
GRID_MAPPING_EVEN_SHARE,
/**
- * \brief A dynamic "queue-based" strategy for commutative reduction operators.
+ * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
*
* \par Overview
* The input is treated as a queue to be dynamically consumed by a grid of
@@ -87,6 +90,453 @@ enum GridMappingStrategy
};
+
+/******************************************************************************
+ * Mapping engines
+ *****************************************************************************/
+
+/**
+ * \brief Dispatches tiles of work from the given input range to the specified thread block abstraction.
+ *
+ * \par
+ * Expects the \p PersistentBlock type to have the following callback member functions:
+ * - Tile processing:
+ * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items);
+ * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile:
+ * - int TileItems()
+ * - Finalization:
+ * - void Finalize(Result &result);
+ *
+ * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing
+ * \tparam SizeT [inferred] Integral type used for global array indexing
+ * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance
+ */
+
+template <
+ typename PersistentBlock,
+ typename SizeT,
+ typename Result>
+__device__ __forceinline__ void ConsumeTiles(
+ PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing
+ SizeT block_offset, ///< [in] Threadblock begin offset (inclusive)
+ SizeT block_oob, ///< [in] Threadblock end offset (exclusive)
+ Result &result) ///< [out] Result returned by tiles::Finalize()
+{
+ bool sync_after = true;
+
+ // Number of items per tile that can be processed by tiles
+ int tile_items = persistent_block.TileItems();
+
+ // Consume any full tiles
+ while (block_offset + tile_items <= block_oob)
+ {
+ persistent_block.ConsumeTile(sync_after, block_offset, tile_items);
+ if (sync_after) __syncthreads();
+
+ block_offset += tile_items;
+ }
+
+ // Consume any remaining input
+ if (block_offset < block_oob)
+ {
+ persistent_block.ConsumeTile(sync_after, block_offset, block_oob - block_offset);
+ if (sync_after) __syncthreads();
+ }
+
+ // Let the thread block abstraction finalize and emit its result
+ persistent_block.Finalize(result);
+}
+
+
+/**
+ * \brief Uses a GridEvenShare descriptor to dispatch tiles of work to the specified thread block abstraction. (See GridMappingStrategy::GRID_MAPPING_EVEN_SHARE.)
+ *
+ * \par
+ * Expects the \p PersistentBlock type to have the following callback member functions:
+ * - Tile processing:
+ * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items);
+ * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile:
+ * - int TileItems()
+ * - Finalization:
+ * - void Finalize(Result &result);
+ *
+ * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing
+ * \tparam SizeT [inferred] Integral type used for global array indexing
+ * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance
+ */
+
+template <
+ typename PersistentBlock,
+ typename SizeT,
+ typename Result>
+__device__ __forceinline__ void ConsumeTiles(
+ PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing
+ SizeT num_items, ///< [in] Total number of global input items
+ GridEvenShare &even_share, ///< [in] GridEvenShare descriptor
+ Result &result) ///< [out] Result returned by tiles::Finalize()
+{
+ even_share.BlockInit();
+ ConsumeTiles(persistent_block, even_share.block_offset, even_share.block_oob, result);
+}
+
+
+
+/**
+ * \brief Dispatches tiles of work from the given input range to the specified thread block abstraction. The first tile given to each thread block is flagged as such.
+ *
+ * \par
+ * Expects the \p PersistentBlock type to have the following callback member functions:
+ * - Tile processing:
+ * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items, bool is_first_tile);
+ * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile:
+ * - int TileItems()
+ * - Finalization:
+ * - void Finalize(Result &result);
+ *
+ * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing
+ * \tparam SizeT [inferred] Integral type used for global array indexing
+ * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance
+ */
+template <
+ typename PersistentBlock,
+ typename SizeT,
+ typename Result>
+__device__ __forceinline__ void ConsumeTilesFlagFirst(
+ PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing
+ SizeT block_offset, ///< [in] Threadblock begin offset (inclusive)
+ SizeT block_oob, ///< [in] Threadblock end offset (exclusive)
+ Result &result) ///< [out] Result returned by tiles::Finalize()
+{
+ bool sync_after = true;
+
+ // Number of items per tile that can be processed by tiles
+ int tile_items = persistent_block.TileItems();
+
+ if (block_offset + tile_items <= block_oob)
+ {
+ // We have at least one full tile to consume
+ persistent_block.ConsumeTile(sync_after, block_offset, tile_items, true);
+ if (sync_after) __syncthreads();
+
+ block_offset += tile_items;
+
+ // Consume any other full tiles
+ while (block_offset + tile_items <= block_oob)
+ {
+ persistent_block.ConsumeTile(sync_after, block_offset, tile_items, false);
+ if (sync_after) __syncthreads();
+
+ block_offset += tile_items;
+ }
+
+ // Consume any remaining input
+ if (block_offset < block_oob)
+ {
+ persistent_block.ConsumeTile(sync_after, block_offset, block_oob - block_offset, false);
+ if (sync_after) __syncthreads();
+ }
+ }
+ else
+ {
+ // We have less than a full tile to consume
+ SizeT block_items = block_oob - block_offset;
+
+ persistent_block.ConsumeTile(sync_after, block_offset, block_items, true);
+ if (sync_after) __syncthreads();
+ }
+
+ // Let the thread block abstraction finalize and emit its result
+ persistent_block.Finalize(result);
+}
+
+
+/**
+ * \brief Uses a GridEvenShare descriptor to dispatch tiles of work to the specified thread block abstraction. The first tile given to each thread block is flagged as such. (See GridMappingStrategy::GRID_MAPPING_EVEN_SHARE.)
+ *
+ * \par
+ * Expects the \p PersistentBlock type to have the following callback member functions:
+ * - Tile processing:
+ * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items, bool is_first_tile);
+ * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile:
+ * - int TileItems()
+ * - Finalization:
+ * - void Finalize(Result &result);
+ *
+ * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing
+ * \tparam SizeT [inferred] Integral type used for global array indexing
+ * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance
+ */
+template <
+ typename PersistentBlock,
+ typename SizeT,
+ typename Result>
+__device__ __forceinline__ void ConsumeTilesFlagFirst(
+ PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing
+ SizeT num_items, ///< [in] Total number of global input items
+ GridEvenShare &even_share, ///< [in] GridEvenShare descriptor
+ Result &result) ///< [out] Result returned by tiles::Finalize()
+{
+ even_share.BlockInit();
+ ConsumeTilesFlagFirst(persistent_block, even_share.block_offset, even_share.block_oob, result);
+}
+
+
+
+/**
+ * \brief Uses a GridQueue descriptor to dispatch tiles of work to the specified thread block abstraction. (See GridMappingStrategy::GRID_MAPPING_DYNAMIC.)
+ *
+ * \par
+ * Expects the \p PersistentBlock type to have the following callback member functions:
+ * - Tile processing:
+ * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items);
+ * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile:
+ * - int TileItems()
+ * - Finalization:
+ * - void Finalize(Result &result);
+ *
+ * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing
+ * \tparam SizeT [inferred] Integral type used for global array indexing
+ * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance
+ */
+template <
+ typename PersistentBlock,
+ typename SizeT,
+ typename Result>
+__device__ __forceinline__ void ConsumeTiles(
+ PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing
+ SizeT num_items, ///< [in] Total number of global input items
+ GridQueue &queue, ///< [in,out] GridQueue descriptor
+ Result &result) ///< [out] Result returned by tiles::Finalize()
+{
+ // Shared tile-processing offset obtained dynamically from queue
+ __shared__ SizeT dynamic_block_offset;
+
+ bool sync_after = true;
+
+ // Number of items per tile that can be processed by tiles
+ int tile_items = persistent_block.TileItems();
+
+ // There are tiles left to consume
+ while (true)
+ {
+ // Dequeue up to tile_items
+ if (threadIdx.x == 0)
+ {
+ dynamic_block_offset = queue.Drain(tile_items);
+ }
+
+ __syncthreads();
+
+ SizeT block_offset = dynamic_block_offset;
+
+ __syncthreads();
+
+ if (block_offset + tile_items > num_items)
+ {
+ if (block_offset < num_items)
+ {
+ // We have less than a full tile to consume
+ persistent_block.ConsumeTile(sync_after, block_offset, num_items - block_offset);
+ if (sync_after) __syncthreads();
+ }
+
+ // No more work to do
+ break;
+ }
+
+ // We have a full tile to consume
+ persistent_block.ConsumeTile(sync_after, block_offset, tile_items);
+ }
+
+ persistent_block.Finalize(result);
+}
+
+
+
+/**
+ * \brief Uses a GridQueue descriptor to dispatch tiles of work to the specified thread block abstraction. The first tile given to each thread block is flagged as such. (See GridMappingStrategy::GRID_MAPPING_DYNAMIC.)
+ *
+ * \par
+ * Expects the \p PersistentBlock type to have the following callback member functions:
+ * - Tile processing:
+ * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items, bool is_first_tile);
+ * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile:
+ * - int TileItems()
+ * - Finalization:
+ * - void Finalize(Result &result);
+ *
+ * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing
+ * \tparam SizeT [inferred] Integral type used for global array indexing
+ * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance
+ */
+template <
+ typename PersistentBlock,
+ typename SizeT,
+ typename Result>
+__device__ __forceinline__ void ConsumeTilesFlagFirst(
+ PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing
+ SizeT num_items, ///< [in] Total number of global input items
+ GridQueue &queue, ///< [in,out] GridQueue descriptor
+ Result &result) ///< [out] Result returned by tiles::Finalize()
+{
+ // Shared tile-processing offset obtained dynamically from queue
+ __shared__ SizeT dynamic_block_offset;
+
+ bool sync_after = true;
+
+ // Number of items per tile that can be processed by tiles
+ int tile_items = persistent_block.TileItems();
+
+ // We give each thread block at least one tile of input.
+ SizeT block_offset = blockIdx.x * tile_items;
+
+ // Check if we have a full tile to consume
+ if (block_offset + tile_items <= num_items)
+ {
+ persistent_block.ConsumeTile(sync_after, block_offset, tile_items, true);
+ if (sync_after) __syncthreads();
+
+ // Now that every block in the kernel has gotten a tile, attempt to dynamically consume any remaining
+ SizeT even_share_base = gridDim.x * tile_items;
+ if (even_share_base < num_items)
+ {
+ // There are tiles left to consume
+ while (true)
+ {
+ // Dequeue up to tile_items
+ if (threadIdx.x == 0)
+ {
+ dynamic_block_offset = queue.Drain(tile_items) + even_share_base;
+ }
+
+ __syncthreads();
+
+ block_offset = dynamic_block_offset;
+
+ __syncthreads();
+
+ if (block_offset + tile_items > num_items)
+ {
+ if (block_offset < num_items)
+ {
+ // We have less than a full tile to consume
+ persistent_block.ConsumeTile(sync_after, block_offset, num_items - block_offset, false);
+ if (sync_after) __syncthreads();
+ }
+
+ // No more work to do
+ break;
+ }
+
+ // We have a full tile to consume
+ persistent_block.ConsumeTile(sync_after, block_offset, tile_items, false);
+ }
+ }
+ }
+ else
+ {
+ // We have less than a full tile to consume
+ persistent_block.ConsumeTile(sync_after, block_offset, num_items - block_offset, true);
+ if (sync_after) __syncthreads();
+ }
+
+ // Let the thread block abstraction finalize and emit its result
+ persistent_block.Finalize(result);
+
+}
+
+
+
+/******************************************************************************
+ * Type-directed dispatch to mapping engines
+ *****************************************************************************/
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
+/**
+ * \brief Dispatch helper for statically selecting between mapping strategies (e.g., to avoid compiling an alternative that is invalid for a given architecture)
+ */
+template
+struct GridMapping;
+
+/**
+ * Even-share specialization of GridMapping
+ */
+template<>
+struct GridMapping
+{
+ template <
+ typename PersistentBlock,
+ typename SizeT,
+ typename Result>
+ static __device__ __forceinline__ void ConsumeTiles(
+ PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing
+ SizeT num_items, ///< [in] Total number of global input items
+ GridEvenShare &even_share, ///< [in] GridEvenShare descriptor
+ GridQueue &queue, ///< [in,out] GridQueue descriptor
+ Result &result) ///< [out] Result returned by tiles::Finalize()
+ {
+ cub::ConsumeTiles(persistent_block, num_items, even_share, result);
+ }
+
+ template <
+ typename PersistentBlock,
+ typename SizeT,
+ typename Result>
+ static __device__ __forceinline__ void ConsumeTilesFlagFirst(
+ PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing
+ SizeT num_items, ///< [in] Total number of global input items
+ GridEvenShare &even_share, ///< [in] GridEvenShare descriptor
+ GridQueue &queue, ///< [in,out] GridQueue descriptor
+ Result &result) ///< [out] Result returned by tiles::Finalize()
+ {
+ cub::ConsumeTilesFlagFirst(persistent_block, num_items, even_share, result);
+ }
+};
+
+
+/**
+ * Dynamic (queue-based) specialization of GridMapping
+ */
+template<>
+struct GridMapping
+{
+ template <
+ typename PersistentBlock,
+ typename SizeT,
+ typename Result>
+ static __device__ __forceinline__ void ConsumeTiles(
+ PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing
+ SizeT num_items, ///< [in] Total number of global input items
+ GridEvenShare &even_share, ///< [in] GridEvenShare descriptor
+ GridQueue &queue, ///< [in,out] GridQueue descriptor
+ Result &result) ///< [out] Result returned by tiles::Finalize()
+ {
+ cub::ConsumeTiles(persistent_block, num_items, queue, result);
+ }
+
+ template <
+ typename PersistentBlock,
+ typename SizeT,
+ typename Result>
+ static __device__ __forceinline__ void ConsumeTilesFlagFirst(
+ PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing
+ SizeT num_items, ///< [in] Total number of global input items
+ GridEvenShare &even_share, ///< [in] GridEvenShare descriptor
+ GridQueue &queue, ///< [in,out] GridQueue descriptor
+ Result &result) ///< [out] Result returned by tiles::Finalize()
+ {
+ cub::ConsumeTilesFlagFirst(persistent_block, num_items, queue, result);
+ }
+};
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
/** @} */ // end group GridModule
} // CUB namespace
diff --git a/cub/grid/grid_queue.cuh b/cub/grid/grid_queue.cuh
index 3ca31b5a1a..daec348ea5 100644
--- a/cub/grid/grid_queue.cuh
+++ b/cub/grid/grid_queue.cuh
@@ -26,9 +26,10 @@
*
******************************************************************************/
-/******************************************************************************
- * Abstraction for grid-wide queue management
- ******************************************************************************/
+/**
+ * \file
+ * cub::GridQueue is a descriptor utility for dynamic queue management.
+ */
#pragma once
@@ -52,7 +53,7 @@ namespace cub {
/**
- * \brief Abstraction for grid-wide queue management.
+ * \brief GridQueue is a descriptor utility for dynamic queue management.
*
* \par Overview
* GridQueue descriptors provides abstractions for "filling" or
@@ -77,7 +78,8 @@ namespace cub {
* \par
* Iterative work management can be implemented simply with a pair of flip-flopping
* work buffers, each with an associated set of fill and drain GridQueue descriptors.
-
+ *
+ * \tparam SizeT Integer type for array indexing
*/
template <typename SizeT>
class GridQueue
diff --git a/cub/thread/thread_load.cuh b/cub/thread/thread_load.cuh
index e36fbe9341..a07c758c27 100644
--- a/cub/thread/thread_load.cuh
+++ b/cub/thread/thread_load.cuh
@@ -161,6 +161,26 @@ struct ThreadLoadDispatch
};
+
+/**
+ * Generic PTX_LOAD_CG specialization for SM10-SM13 architectures
+ */
+#if CUB_PTX_ARCH < 200
+template <>
+struct ThreadLoadDispatch<PTX_LOAD_CG>
+{
+ // Iterator
+ template <typename InputIteratorRA>
+ static __device__ __forceinline__ typename std::iterator_traits<InputIteratorRA>::value_type ThreadLoad(InputIteratorRA itr)
+ {
+ // Straightforward dereference
+ return *itr;
+ }
+};
+#endif // CUB_PTX_ARCH < 200
+
+
+
#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
@@ -456,6 +476,7 @@ __device__ __forceinline__ typename std::iterator_traits::value
/**
* Expand ThreadLoad() implementations for primitive types.
*/
+#if CUB_PTX_ARCH >= 200
// Signed
CUB_LOADS_0124(char, char, short, s8, h)
@@ -491,6 +512,7 @@ CUB_LOADS_4L(double4, double2);
CUB_LOADS_0124(unsigned long, ulong, unsigned long, u32, r)
#endif
+#endif // CUB_PTX_ARCH >= 200
/**
* Undefine macros
diff --git a/cub/thread/thread_store.cuh b/cub/thread/thread_store.cuh
index 0e524893ca..83de8d789c 100644
--- a/cub/thread/thread_store.cuh
+++ b/cub/thread/thread_store.cuh
@@ -214,54 +214,74 @@ __device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, const T& val)
/**
* Define a global ThreadStore() specialization for type
*/
-#define CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \
- template<> \
- void ThreadStore(type* ptr, const type& val) \
- { \
- const asm_type raw = reinterpret_cast(val); \
- asm volatile ("st.global."#ptx_modifier"."#ptx_type" [%0], %1;" : : \
- _CUB_ASM_PTR_(ptr), \
- #reg_mod(raw)); \
+#define CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \
+ template<> \
+ void ThreadStore<cub_modifier, type*>(type* ptr, const type& val) \
+ { \
+ const asm_type raw = reinterpret_cast<const asm_type&>(val); \
+ asm volatile ("st.global."#ptx_modifier"."#ptx_type" [%0], %1;" : : \
+ _CUB_ASM_PTR_(ptr), \
+ #reg_mod(raw)); \
}
/**
* Define a global ThreadStore() specialization for the vector-1 type
*/
#define CUB_G_STORE_1(type, component_type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \
- template<> \
- void ThreadStore(type* ptr, const type& val) \
- { \
- const asm_type raw_x = reinterpret_cast(val.x); \
- asm volatile ("st.global."#ptx_modifier"."#ptx_type" [%0], %1;" : : \
- _CUB_ASM_PTR_(ptr), \
- #reg_mod(raw_x)); \
+ template<> \
+ void ThreadStore<cub_modifier, type*>(type* ptr, const type& val) \
+ { \
+ const asm_type raw_x = reinterpret_cast<const asm_type&>(val.x); \
+ asm volatile ("st.global."#ptx_modifier"."#ptx_type" [%0], %1;" : : \
+ _CUB_ASM_PTR_(ptr), \
+ #reg_mod(raw_x)); \
}
/**
- * Define a volatile-shared ThreadStore() specialization for the vector-1 type
+ * Define a global ThreadStore() specialization for the vector-2 type
*/
-#define CUB_VS_STORE_1(type, component_type, asm_type, ptx_type, reg_mod) \
- template<> \
- void ThreadStore(type* ptr, const type& val) \
- { \
- ThreadStore( \
- (asm_type*) ptr, \
- reinterpret_cast(val.x)); \
+#define CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \
+ template<> \
+ void ThreadStore<cub_modifier, type*>(type* ptr, const type& val) \
+ { \
+ const asm_type raw_x = reinterpret_cast<const asm_type&>(val.x); \
+ const asm_type raw_y = reinterpret_cast<const asm_type&>(val.y); \
+ asm volatile ("st.global."#ptx_modifier".v2."#ptx_type" [%0], {%1, %2};" : : \
+ _CUB_ASM_PTR_(ptr), \
+ #reg_mod(raw_x), \
+ #reg_mod(raw_y)); \
}
/**
- * Define a global ThreadStore() specialization for the vector-2 type
+ * Define a global ThreadStore() specialization for the vector-4 type
*/
-#define CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \
- template<> \
- void ThreadStore(type* ptr, const type& val) \
- { \
- const asm_type raw_x = reinterpret_cast(val.x); \
- const asm_type raw_y = reinterpret_cast(val.y); \
- asm volatile ("st.global."#ptx_modifier".v2."#ptx_type" [%0], {%1, %2};" : : \
- _CUB_ASM_PTR_(ptr), \
- #reg_mod(raw_x), \
- #reg_mod(raw_y)); \
+#define CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \
+ template<> \
+ void ThreadStore<cub_modifier, type*>(type* ptr, const type& val) \
+ { \
+ const asm_type raw_x = reinterpret_cast<const asm_type&>(val.x); \
+ const asm_type raw_y = reinterpret_cast<const asm_type&>(val.y); \
+ const asm_type raw_z = reinterpret_cast<const asm_type&>(val.z); \
+ const asm_type raw_w = reinterpret_cast<const asm_type&>(val.w); \
+ asm volatile ("st.global."#ptx_modifier".v4."#ptx_type" [%0], {%1, %2, %3, %4};" : : \
+ _CUB_ASM_PTR_(ptr), \
+ #reg_mod(raw_x), \
+ #reg_mod(raw_y), \
+ #reg_mod(raw_z), \
+ #reg_mod(raw_w)); \
+ }
+
+
+/**
+ * Define a volatile-shared ThreadStore() specialization for the vector-1 type
+ */
+#define CUB_VS_STORE_1(type, component_type, asm_type, ptx_type, reg_mod) \
+ template<> \
+ void ThreadStore<PTX_STORE_VS, type*>(type* ptr, const type& val) \
+ { \
+ ThreadStore<PTX_STORE_VS>( \
+ (asm_type*) ptr, \
+ reinterpret_cast<const asm_type&>(val.x)); \
}
/**
@@ -269,48 +289,29 @@ __device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, const T& val)
* Performs separate references if the component_type is only 1 byte (otherwise we lose
* performance due to the bitfield ops to disassemble the value)
*/
-#define CUB_VS_STORE_2(type, component_type, asm_type, ptx_type, reg_mod) \
- template<> \
- void ThreadStore(type* ptr, const type& val) \
- { \
- if ((sizeof(component_type) == 1) || (CUDA_VERSION < 4100)) \
- { \
- component_type *base_ptr = (component_type*) ptr; \
+#define CUB_VS_STORE_2(type, component_type, asm_type, ptx_type, reg_mod) \
+ template<> \
+ void ThreadStore<PTX_STORE_VS, type*>(type* ptr, const type& val) \
+ { \
+ if ((sizeof(component_type) == 1) || (CUDA_VERSION < 4100)) \
+ { \
+ component_type *base_ptr = (component_type*) ptr; \
ThreadStore(base_ptr, (component_type) val.x); \
ThreadStore(base_ptr + 1, (component_type) val.y); \
- } \
- else \
- { \
- const asm_type raw_x = reinterpret_cast(val.x); \
- const asm_type raw_y = reinterpret_cast(val.y); \
- asm volatile ("{" \
+ } \
+ else \
+ { \
+ const asm_type raw_x = reinterpret_cast<const asm_type&>(val.x); \
+ const asm_type raw_y = reinterpret_cast