diff --git a/CHANGE_LOG.TXT b/CHANGE_LOG.TXT index 82573be757..5ca79b4598 100644 --- a/CHANGE_LOG.TXT +++ b/CHANGE_LOG.TXT @@ -1,14 +1,15 @@ //----------------------------------------------------------------------------- -0.9.3 04/30/2013 +0.9.4 05/07/2013 + - Fixed compilation errors for SM10-SM13 + - Fixed compilation errors for some WarpScan entrypoints on SM30+ + - Added block-wide histogram (BlockHisto256) + - Added device-wide histogram (DeviceHisto256) - Added new BlockScan algorithm variant BLOCK_SCAN_RAKING_MEMOIZE, which trades more register consumption for less shared memory I/O) - - Added block-wide histogram (BlockHisto256) - Updates to BlockRadixRank to use BlockScan (which improves performance on Kepler due to SHFL instruction) - - Added device-wide histogram (DeviceHisto256) - - Fixed compilation errors for some WarpScan entrypoints on SM30+ - Allow types other than C++ primitives to be used in WarpScan::*Sum methods if they only have operator + overloaded. (Previously they also required to support assignment from int(0).) diff --git a/cub/block/block_histo_256.cuh b/cub/block/block_histo_256.cuh index 46d18d338f..9c33921c36 100644 --- a/cub/block/block_histo_256.cuh +++ b/cub/block/block_histo_256.cuh @@ -45,9 +45,12 @@ CUB_NS_PREFIX namespace cub { +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + /** - * BlockHisto256Algorithm enumerates alternative algorithms for the parallel - * construction of 8b histograms. + * \brief BlockHisto256Algorithm enumerates alternative algorithms for the parallel construction of 8b histograms. */ enum BlockHisto256Algorithm { @@ -57,21 +60,33 @@ enum BlockHisto256Algorithm * Sorting followed by differentiation. 
Execution is comprised of two phases: * -# Sort the 8b data using efficient radix sort * -# Look for "runs" of same-valued 8b keys by detecting discontinuities; the run-lengths are histogram bin counts. + * + * \par Performance Considerations + * Delivers consistent throughput regardless of sample bin distribution. */ - BLOCK_BYTE_HISTO_SORT, + BLOCK_HISTO_256_SORT, /** * \par Overview * Use atomic addition to update byte counts directly * - * \par Usage Considerations - * BLOCK_BYTE_HISTO_ATOMIC can only be used on version SM120 or later. Otherwise BLOCK_BYTE_HISTO_SORT is used regardless. + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. */ - BLOCK_BYTE_HISTO_ATOMIC, + BLOCK_HISTO_256_ATOMIC, }; + +/****************************************************************************** + * Block histogram + ******************************************************************************/ + + /** * \addtogroup BlockModule * @{ @@ -90,12 +105,12 @@ enum BlockHisto256Algorithm * * \tparam BLOCK_THREADS The threadblock size in threads * \tparam ITEMS_PER_THREAD The number of items per thread - * \tparam ALGORITHM [optional] cub::BlockHisto256Algorithm enumerator specifying the underlying algorithm to use (default = cub::BLOCK_BYTE_HISTO_SORT) + * \tparam ALGORITHM [optional] cub::BlockHisto256Algorithm enumerator specifying the underlying algorithm to use (default = cub::BLOCK_HISTO_256_SORT) * * \par Algorithm * BlockHisto256 can be (optionally) configured to use different algorithms: - * -# cub::BLOCK_BYTE_HISTO_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHisto256Algorithm) - * -# cub::BLOCK_BYTE_HISTO_ATOMIC. Use atomic addition to update byte counts directly. 
[More...](\ref cub::BlockHisto256Algorithm) + * -# cub::BLOCK_HISTO_256_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHisto256Algorithm) + * -# cub::BLOCK_HISTO_256_ATOMIC. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHisto256Algorithm) * * \par Usage Considerations * - The histogram output can be constructed in shared or global memory @@ -167,7 +182,7 @@ enum BlockHisto256Algorithm template < int BLOCK_THREADS, int ITEMS_PER_THREAD, - BlockHisto256Algorithm ALGORITHM = BLOCK_BYTE_HISTO_SORT> + BlockHisto256Algorithm ALGORITHM = BLOCK_HISTO_256_SORT> class BlockHisto256 { private: @@ -178,13 +193,13 @@ private: /** * Ensure the template parameterization meets the requirements of the - * targeted device architecture. BLOCK_BYTE_HISTO_ATOMIC can only be used - * on version SM120 or later. Otherwise BLOCK_BYTE_HISTO_SORT is used + * targeted device architecture. BLOCK_HISTO_256_ATOMIC can only be used + * on version SM120 or later. Otherwise BLOCK_HISTO_256_SORT is used * regardless. */ static const BlockHisto256Algorithm SAFE_ALGORITHM = - ((ALGORITHM == BLOCK_BYTE_HISTO_ATOMIC) && (CUB_PTX_ARCH < 120)) ? - BLOCK_BYTE_HISTO_SORT : + ((ALGORITHM == BLOCK_HISTO_256_ATOMIC) && (CUB_PTX_ARCH < 120)) ? 
+ BLOCK_HISTO_256_SORT : ALGORITHM; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document @@ -195,7 +210,7 @@ private: ******************************************************************************/ /** - * BLOCK_BYTE_HISTO_SORT algorithmic variant + * BLOCK_HISTO_256_SORT algorithmic variant */ template struct BlockHisto256Internal @@ -319,10 +334,10 @@ private: /** - * BLOCK_BYTE_HISTO_ATOMIC algorithmic variant + * BLOCK_HISTO_256_ATOMIC algorithmic variant */ template - struct BlockHisto256Internal + struct BlockHisto256Internal { /// Shared memory storage layout type struct SmemStorage {}; diff --git a/cub/block/block_load.cuh b/cub/block/block_load.cuh index d07424f393..1d37601075 100644 --- a/cub/block/block_load.cuh +++ b/cub/block/block_load.cuh @@ -77,8 +77,8 @@ template < int ITEMS_PER_THREAD, typename InputIteratorRA> __device__ __forceinline__ void BlockLoadDirect( - InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { // Load directly in thread-blocked order #pragma unroll @@ -106,8 +106,8 @@ template < int ITEMS_PER_THREAD, typename InputIteratorRA> __device__ __forceinline__ void BlockLoadDirect( - InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { BlockLoadDirect(block_itr, items); } @@ -132,9 +132,9 @@ template < int ITEMS_PER_THREAD, typename InputIteratorRA> __device__ __forceinline__ void BlockLoadDirect( - InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from - const int &guarded_items, ///< [in] Number of valid items in 
the tile - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { int bounds = guarded_items - (threadIdx.x * ITEMS_PER_THREAD); @@ -165,9 +165,9 @@ template < int ITEMS_PER_THREAD, typename InputIteratorRA> __device__ __forceinline__ void BlockLoadDirect( - InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from - const int &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { BlockLoadDirect(block_itr, guarded_items, items); } @@ -191,10 +191,10 @@ template < int ITEMS_PER_THREAD, typename InputIteratorRA> __device__ __forceinline__ void BlockLoadDirect( - InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from - const int &guarded_items, ///< [in] Number of valid items in the tile - T oob_default, ///< [in] Default value to assign out-of-bound items - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from + const int &guarded_items, ///< [in] Number of valid items in the tile + T oob_default, ///< [in] Default value to assign out-of-bound items + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { int bounds = guarded_items - (threadIdx.x * ITEMS_PER_THREAD); @@ -224,10 +224,10 @@ template < int ITEMS_PER_THREAD, typename InputIteratorRA> __device__ __forceinline__ void BlockLoadDirect( - InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from - const int 
&guarded_items, ///< [in] Number of valid items in the tile - T oob_default, ///< [in] Default value to assign out-of-bound items - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from + const int &guarded_items, ///< [in] Number of valid items in the tile + T oob_default, ///< [in] Default value to assign out-of-bound items + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { BlockLoadDirect(block_itr, guarded_items, oob_default, items); } @@ -348,10 +348,10 @@ template < int ITEMS_PER_THREAD, typename InputIteratorRA> __device__ __forceinline__ void BlockLoadDirectStriped( - InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from - const int &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. + InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. 
{ BlockLoadDirectStriped(block_itr, guarded_items, items, stride); } @@ -409,11 +409,11 @@ template < int ITEMS_PER_THREAD, typename InputIteratorRA> __device__ __forceinline__ void BlockLoadDirectStriped( - InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from - const int &guarded_items, ///< [in] Number of valid items in the tile - T oob_default, ///< [in] Default value to assign out-of-bound items - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. + InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from + const int &guarded_items, ///< [in] Number of valid items in the tile + T oob_default, ///< [in] Default value to assign out-of-bound items + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. 
{ BlockLoadDirectStriped(block_itr, guarded_items, oob_default, items, stride); } @@ -446,8 +446,8 @@ template < typename T, int ITEMS_PER_THREAD> __device__ __forceinline__ void BlockLoadVectorized( - T *block_ptr, ///< [in] Input pointer for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + T *block_ptr, ///< [in] Input pointer for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { enum { @@ -503,8 +503,8 @@ template < typename T, int ITEMS_PER_THREAD> __device__ __forceinline__ void BlockLoadVectorized( - T *block_ptr, ///< [in] Input pointer for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + T *block_ptr, ///< [in] Input pointer for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { BlockLoadVectorized(block_ptr, items); } diff --git a/cub/block/block_reduce.cuh b/cub/block/block_reduce.cuh index 0749d858a3..f47f33ac55 100644 --- a/cub/block/block_reduce.cuh +++ b/cub/block/block_reduce.cuh @@ -50,6 +50,11 @@ CUB_NS_PREFIX namespace cub { + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + /** * BlockReduceAlgorithm enumerates alternative algorithms for parallel * reduction across a CUDA threadblock. @@ -59,9 +64,13 @@ enum BlockReduceAlgorithm /** * \par Overview - * An efficient "raking" reduction algorithm. Execution is comprised of three phases: - * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. - * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions. + * An efficient "raking" reduction algorithm. 
Execution is comprised of + * three phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Each thread then places the partial reduction + * of its item(s) into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within a + * single warp rake across segments of shared partial reductions. * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. * * \par @@ -78,10 +87,15 @@ enum BlockReduceAlgorithm /** * \par Overview - * A quick "tiled warp-reductions" reduction algorithm. Execution is comprised of four phases: - * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. - * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style reduction within each warp. - * -# A propagation phase where the warp reduction outputs in each warp are updated with the aggregate from each preceding warp. + * A quick "tiled warp-reductions" reduction algorithm. Execution is + * comprised of four phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Each thread then places the partial reduction + * of its item(s) into shared memory. + * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style + * reduction within each warp. + * -# A propagation phase where the warp reduction outputs in each warp are + * updated with the aggregate from each preceding warp. * * \par * \image html block_scan_warpscans.png @@ -89,13 +103,18 @@ enum BlockReduceAlgorithm * * \par Performance Considerations * - Although this variant may suffer lower overall throughput across the - * GPU because due to a heavy reliance on inefficient warp-reductions, it can - * often provide lower turnaround latencies when the GPU is under-occupied. 
+ * GPU because due to a heavy reliance on inefficient warp-reductions, it + * can often provide lower turnaround latencies when the GPU is + * under-occupied. */ BLOCK_REDUCE_WARP_REDUCTIONS, }; +/****************************************************************************** + * Block reduce + ******************************************************************************/ + /** * \addtogroup BlockModule * @{ diff --git a/cub/block/block_scan.cuh b/cub/block/block_scan.cuh index bf9ef662dc..e8017594ef 100644 --- a/cub/block/block_scan.cuh +++ b/cub/block/block_scan.cuh @@ -49,6 +49,10 @@ CUB_NS_PREFIX namespace cub { +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + /** * BlockScanAlgorithm enumerates alternative algorithms for parallel prefix * scan across a CUDA threadblock. @@ -108,6 +112,10 @@ enum BlockScanAlgorithm }; +/****************************************************************************** + * Block scan + ******************************************************************************/ + /** * \addtogroup BlockModule * @{ diff --git a/cub/block/block_store.cuh b/cub/block/block_store.cuh index da625a5218..90f3dcae7e 100644 --- a/cub/block/block_store.cuh +++ b/cub/block/block_store.cuh @@ -77,8 +77,8 @@ template < int ITEMS_PER_THREAD, typename OutputIteratorRA> __device__ __forceinline__ void BlockStoreDirect( - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { // Store directly in thread-blocked order #pragma unroll @@ -106,8 +106,8 @@ template < int ITEMS_PER_THREAD, typename OutputIteratorRA> __device__ __forceinline__ void BlockStoreDirect( - OutputIteratorRA 
block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreDirect(block_itr, items); } @@ -124,27 +124,26 @@ __device__ __forceinline__ void BlockStoreDirect( * \tparam T [inferred] The data type to store. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type). - * \tparam SizeT [inferred] Integer type for offsets */ template < PtxStoreModifier MODIFIER, typename T, int ITEMS_PER_THREAD, - typename OutputIteratorRA, - typename SizeT> + typename OutputIteratorRA> __device__ __forceinline__ void BlockStoreDirect( - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { + int bounds = guarded_items - (threadIdx.x * ITEMS_PER_THREAD); + // Store directly in thread-blocked order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - int item_offset = (threadIdx.x * ITEMS_PER_THREAD) + ITEM; - if (item_offset < guarded_items) + if (ITEM < bounds) { - ThreadStore(block_itr + item_offset, items[ITEM]); + ThreadStore(block_itr + (threadIdx.x * ITEMS_PER_THREAD) + ITEM, items[ITEM]); } } } @@ -160,17 +159,15 @@ __device__ __forceinline__ void BlockStoreDirect( * \tparam T [inferred] The data type to store. 
* \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type). - * \tparam SizeT [inferred] Integer type for offsets */ template < typename T, int ITEMS_PER_THREAD, - typename OutputIteratorRA, - typename SizeT> + typename OutputIteratorRA> __device__ __forceinline__ void BlockStoreDirect( - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreDirect(block_itr, guarded_items, items); } @@ -202,9 +199,9 @@ template < int ITEMS_PER_THREAD, typename OutputIteratorRA> __device__ __forceinline__ void BlockStoreDirectStriped( - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. 
{ // Store directly in striped order #pragma unroll @@ -233,9 +230,9 @@ template < int ITEMS_PER_THREAD, typename OutputIteratorRA> __device__ __forceinline__ void BlockStoreDirectStriped( - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. { BlockStoreDirectStriped(block_itr, items, stride); } @@ -252,28 +249,27 @@ __device__ __forceinline__ void BlockStoreDirectStriped( * \tparam T [inferred] The data type to store. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type). - * \tparam SizeT [inferred] Integer type for offsets */ template < PtxStoreModifier MODIFIER, typename T, int ITEMS_PER_THREAD, - typename OutputIteratorRA, - typename SizeT> + typename OutputIteratorRA> __device__ __forceinline__ void BlockStoreDirectStriped( OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. 
+ const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. { + int bounds = guarded_items - threadIdx.x; + // Store directly in striped order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - int item_offset = (ITEM * stride) + threadIdx.x; - if (item_offset < guarded_items) + if (ITEM * stride < bounds) { - ThreadStore(block_itr + item_offset, items[ITEM]); + ThreadStore(block_itr + (ITEM * stride) + threadIdx.x, items[ITEM]); } } } @@ -289,18 +285,16 @@ __device__ __forceinline__ void BlockStoreDirectStriped( * \tparam T [inferred] The data type to store. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type). - * \tparam SizeT [inferred] Integer type for offsets */ template < typename T, int ITEMS_PER_THREAD, - typename OutputIteratorRA, - typename SizeT> + typename OutputIteratorRA> __device__ __forceinline__ void BlockStoreDirectStriped( OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. 
More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. { BlockStoreDirectStriped(block_itr, guarded_items, items, stride); } @@ -337,8 +331,8 @@ template < typename T, int ITEMS_PER_THREAD> __device__ __forceinline__ void BlockStoreVectorized( - T *block_ptr, ///< [in] Input pointer for storing from - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + T *block_ptr, ///< [in] Input pointer for storing from + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { enum { @@ -398,8 +392,8 @@ template < typename T, int ITEMS_PER_THREAD> __device__ __forceinline__ void BlockStoreVectorized( - T *block_ptr, ///< [in] Input pointer for storing from - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + T *block_ptr, ///< [in] Input pointer for storing from + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreVectorized(block_ptr, items); } @@ -606,20 +600,19 @@ private: /// Store a tile of items across a threadblock static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreDirect(block_itr, items); } /// Store a tile of items across a threadblock, guarded by range - template static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T 
(&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreDirect(block_itr, guarded_items, items); } @@ -636,11 +629,10 @@ private: typedef NullType SmemStorage; /// Store a tile of items across a threadblock, specialized for native pointer types (attempts vectorization) - template static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - T *block_ptr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + T *block_ptr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreVectorized(block_ptr, items); } @@ -648,20 +640,19 @@ private: /// Store a tile of items across a threadblock, specialized for opaque input iterators (skips vectorization) template static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - _OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + _OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreDirect(block_itr, items); } /// Store a tile of items across a threadblock, 
guarded by range - template static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreDirect(block_itr, guarded_items, items); } @@ -682,9 +673,9 @@ private: /// Store a tile of items across a threadblock static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { // Transpose to striped order BlockExchange::BlockedToStriped(smem_storage, items); @@ -693,12 +684,11 @@ private: } /// Store a tile of items across a threadblock, guarded by range - template static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to 
store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { // Transpose to striped order BlockExchange::BlockedToStriped(smem_storage, items); @@ -719,20 +709,19 @@ private: /// Store a tile of items across a threadblock static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreDirectStriped(block_itr, items); } /// Store a tile of items across a threadblock, guarded by range - template static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreDirectStriped(block_itr, guarded_items, items); } @@ -756,24 +745,21 @@ public: * \brief Store 
a tile of items across a threadblock. */ static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { StoreInternal::Store(smem_storage, block_itr, items); } /** * \brief Store a tile of items across a threadblock, guarded by range. - * - * \tparam SizeT [inferred] Integer type for offsets */ - template static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { StoreInternal::Store(smem_storage, block_itr, guarded_items, items); } diff --git a/cub/device/device_histo_256.cuh b/cub/device/device_histo_256.cuh index 48a52ef2f6..d14d30f4dd 100644 --- a/cub/device/device_histo_256.cuh +++ b/cub/device/device_histo_256.cuh @@ -37,7 +37,7 @@ #include #include -#include "tiles/tiles_histo_256.cuh" +#include "persistent_block/persistent_block_histo_256.cuh" #include "../block/block_load.cuh" #include "../thread/thread_reduce.cuh" #include 
"../util_allocator.cuh" @@ -60,84 +60,74 @@ namespace cub { #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +/** + * Initialization kernel for queue descriptor preparation and for zeroing global counters + */ +template < + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SizeT, ///< Integral type used for global array indexing + typename HistoCounter> ///< Integral type for counting sample occurrences per histogram bin +__launch_bounds__ (256, 1) +__global__ void InitHisto256Kernel( + GridQueue grid_queue, ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks + ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][256] + SizeT num_samples) ///< [in] Total number of samples \p d_samples for all channels +{ + d_out_histograms.array[blockIdx.x][threadIdx.x] = 0; + if (threadIdx.x == 0) grid_queue.ResetDrain(num_samples); +} + + /** * Multi-block histogram kernel entry point. Computes privatized histograms, one per thread block. */ template < - typename TilesHisto256Policy, ///< Tuning policy for cub::TilesHisto256 abstraction - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). 
Must have a value type that is assignable to unsigned char - typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin - typename SizeT> ///< Integral type used for global array indexing -__launch_bounds__ (TilesHisto256Policy::BLOCK_THREADS) + typename PersistentBlockHisto256Policy, ///< Tuning policy for cub::PersistentBlockHisto256 abstraction + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char + typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin + typename SizeT> ///< Integral type used for global array indexing +__launch_bounds__ (int(PersistentBlockHisto256Policy::BLOCK_THREADS), PersistentBlockHisto256Policy::SM_OCCUPANCY) __global__ void MultiBlockHisto256Kernel( - InputIteratorRA d_samples, ///< [in] Array of sample data. (Channels, if any, are interleaved in "AOS" format) - ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][gridDim.x][256] - SizeT num_samples, ///< [in] Total number of samples \p d_samples for all channels - GridEvenShare even_share, ///< [in] Descriptor for how to map an even-share of tiles across thread blocks - GridQueue queue) ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks + InputIteratorRA d_samples, ///< [in] Array of sample data. 
(Channels, if any, are interleaved in "AOS" format) + ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][gridDim.x][256] + SizeT num_samples, ///< [in] Total number of samples \p d_samples for all channels + GridEvenShare even_share, ///< [in] Descriptor for how to map an even-share of tiles across thread blocks + GridQueue queue) ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks { // Constants - enum { - BLOCK_THREADS = TilesHisto256Policy::BLOCK_THREADS, - ITEMS_PER_THREAD = TilesHisto256Policy::ITEMS_PER_THREAD, + enum + { + BLOCK_THREADS = PersistentBlockHisto256Policy::BLOCK_THREADS, + ITEMS_PER_THREAD = PersistentBlockHisto256Policy::ITEMS_PER_THREAD, TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD, }; - // Parameterize TilesHisto256 for the parallel execution context - typedef TilesHisto256 TilesHisto256T; - - // Parameterize which mapping of tiles -> thread blocks we will use - typedef typename TilesHisto256T::template Mapping Mapping; - - // Declare shared memory - __shared__ typename TilesHisto256T::SmemStorage block_histo; // Shared memory for TilesHisto256 - __shared__ HistoCounter histograms[ACTIVE_CHANNELS][256]; // Shared memory histograms - - // Composite samples into histogram(s) - Mapping::ProcessTiles( - block_histo, - d_samples, - num_samples, - even_share, - queue, - histograms); + // Thread block type for compositing input tiles + typedef PersistentBlockHisto256 PersistentBlockHisto256T; - // Barrier to ensure histograms are coherent - __syncthreads(); + // Shared memory for PersistentBlockHisto256 + __shared__ typename PersistentBlockHisto256T::SmemStorage smem_storage; - // Output histogram for each active channel - - #pragma unroll - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - int channel_offset = (blockIdx.x * 256); - int histo_offset = 0; - - #pragma unroll - for(; histo_offset + BLOCK_THREADS <= 256; histo_offset += 
BLOCK_THREADS) - { - d_out_histograms.array[CHANNEL][channel_offset + histo_offset + threadIdx.x] = histograms[CHANNEL][histo_offset + threadIdx.x]; - } - // Finish up with guarded initialization if necessary - if ((histo_offset < BLOCK_THREADS) && (histo_offset + threadIdx.x < 256)) - { - d_out_histograms.array[CHANNEL][channel_offset + histo_offset + threadIdx.x] = histograms[CHANNEL][histo_offset + threadIdx.x]; - } - } + // Thread block instance + PersistentBlockHisto256T tiles(smem_storage, d_samples, d_out_histograms.array); + // Consume tiles using thread block instance + int dummy_result; + GridMapping::ConsumeTiles( + tiles, num_samples, even_share, queue, dummy_result); } /** - * Single-block finalization kernel for aggregating privatized threadblock histograms from a previous kernel invocation. + * Aggregation kernel for aggregating privatized threadblock histograms from a previous kernel invocation. */ template < - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename HistoCounter> ///< Integral type for counting sample occurrences per histogram bin + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename HistoCounter> ///< Integral type for counting sample occurrences per histogram bin __launch_bounds__ (256, 1) -__global__ void FinalizeHisto256Kernel( +__global__ void AggregateHisto256Kernel( HistoCounter* d_block_histograms_linear, ///< [in] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][num_threadblocks][256] ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][256] int num_threadblocks) ///< [in] Number of threadblock histograms per channel in \p d_block_histograms @@ -148,11 +138,12 @@ __global__ void FinalizeHisto256Kernel( int block_offset = blockIdx.x * (num_threadblocks * 256); int block_oob = block_offset + (num_threadblocks * 256); +#if CUB_PTX_ARCH >= 200 #pragma unroll 32 +#endif 
while (block_offset < block_oob) { bin_aggregate += d_block_histograms_linear[block_offset + threadIdx.x]; - block_offset += 256; } @@ -181,30 +172,30 @@ struct DeviceHisto256 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - /// Generic structure for encapsulating dispatch properties. Mirrors the constants within TilesHisto256Policy. + /// Generic structure for encapsulating dispatch properties. Mirrors the constants within PersistentBlockHisto256Policy. struct KernelDispachParams { // Policy fields - int block_threads; - int items_per_thread; - BlockHisto256Algorithm block_algorithm; - GridMappingStrategy grid_mapping; - int subscription_factor; + int block_threads; + int items_per_thread; + PersistentBlockHisto256Algorithm block_algorithm; + GridMappingStrategy grid_mapping; + int subscription_factor; // Derived fields - int tile_size; + int tile_size; - template + template __host__ __device__ __forceinline__ void Init(int subscription_factor = 1) { - block_threads = TilesHisto256Policy::BLOCK_THREADS; - items_per_thread = TilesHisto256Policy::ITEMS_PER_THREAD; - block_algorithm = TilesHisto256Policy::BLOCK_ALGORITHM; - grid_mapping = TilesHisto256Policy::GRID_MAPPING; + block_threads = PersistentBlockHisto256Policy::BLOCK_THREADS; + items_per_thread = PersistentBlockHisto256Policy::ITEMS_PER_THREAD; + block_algorithm = PersistentBlockHisto256Policy::GRID_ALGORITHM; + grid_mapping = PersistentBlockHisto256Policy::GRID_MAPPING; this->subscription_factor = subscription_factor; - tile_size = block_threads * items_per_thread; + tile_size = block_threads * items_per_thread; } __host__ __device__ __forceinline__ @@ -225,44 +216,60 @@ struct DeviceHisto256 template < int CHANNELS, int ACTIVE_CHANNELS, - BlockHisto256Algorithm BLOCK_ALGORITHM, + PersistentBlockHisto256Algorithm GRID_ALGORITHM, int ARCH> struct TunedPolicies; /// SM35 tune - template - struct TunedPolicies + template + struct TunedPolicies { - typedef TilesHisto256Policy< - 128, - (BLOCK_ALGORITHM == 
BLOCK_BYTE_HISTO_SORT) ? 23 : (30 / ACTIVE_CHANNELS), - BLOCK_ALGORITHM, - (BLOCK_ALGORITHM == BLOCK_BYTE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; + typedef PersistentBlockHisto256Policy< + (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? 128 : 256, + (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? 12 : (30 / ACTIVE_CHANNELS), + GRID_ALGORITHM, + (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE, + (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? 8 : 1> MultiBlockPolicy; enum { SUBSCRIPTION_FACTOR = 7 }; }; + /// SM30 tune + template + struct TunedPolicies + { + typedef PersistentBlockHisto256Policy< + 128, + (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? 20 : (22 / ACTIVE_CHANNELS), + GRID_ALGORITHM, + (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE, + 1> MultiBlockPolicy; + enum { SUBSCRIPTION_FACTOR = 1 }; + }; + /// SM20 tune - template - struct TunedPolicies + template + struct TunedPolicies { - typedef TilesHisto256Policy< - 128, - (BLOCK_ALGORITHM == BLOCK_BYTE_HISTO_SORT) ? 17 : (21 / ACTIVE_CHANNELS), - BLOCK_ALGORITHM, - (BLOCK_ALGORITHM == BLOCK_BYTE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; - enum { SUBSCRIPTION_FACTOR = 3 }; + typedef PersistentBlockHisto256Policy< + 128, + (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? 21 : (23 / ACTIVE_CHANNELS), + GRID_ALGORITHM, + GRID_MAPPING_DYNAMIC, + 1> MultiBlockPolicy; + enum { SUBSCRIPTION_FACTOR = 1 }; }; /// SM10 tune - template - struct TunedPolicies + template + struct TunedPolicies { - typedef TilesHisto256Policy< + typedef PersistentBlockHisto256Policy< 128, 7, - BLOCK_ALGORITHM, - (BLOCK_ALGORITHM == BLOCK_BYTE_HISTO_SORT) ? 
GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; - enum { SUBSCRIPTION_FACTOR = 2 }; + GRID_HISTO_256_SORT, // (use sort regardless because atomics are perf-useless) + GRID_MAPPING_EVEN_SHARE, + 1> MultiBlockPolicy; + enum { SUBSCRIPTION_FACTOR = 1 }; }; @@ -270,17 +277,19 @@ struct DeviceHisto256 template < int CHANNELS, int ACTIVE_CHANNELS, - BlockHisto256Algorithm BLOCK_ALGORITHM> + PersistentBlockHisto256Algorithm GRID_ALGORITHM> struct PtxDefaultPolicies { static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ? 350 : - (CUB_PTX_ARCH >= 200) ? - 200 : - 100; + (CUB_PTX_ARCH >= 300) ? + 300 : + (CUB_PTX_ARCH >= 200) ? + 200 : + 100; // Tuned policy set for the current PTX compiler pass - typedef TunedPolicies PtxPassTunedPolicies; + typedef TunedPolicies PtxPassTunedPolicies; // Subscription factor for the current PTX compiler pass static const int SUBSCRIPTION_FACTOR = PtxPassTunedPolicies::SUBSCRIPTION_FACTOR; @@ -295,17 +304,22 @@ struct DeviceHisto256 { if (ptx_version >= 350) { - typedef TunedPolicies TunedPolicies; + typedef TunedPolicies TunedPolicies; + multi_block_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR); + } + else if (ptx_version >= 300) + { + typedef TunedPolicies TunedPolicies; multi_block_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR); } else if (ptx_version >= 200) { - typedef TunedPolicies TunedPolicies; + typedef TunedPolicies TunedPolicies; multi_block_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR); } else { - typedef TunedPolicies TunedPolicies; + typedef TunedPolicies TunedPolicies; multi_block_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR); } } @@ -321,17 +335,17 @@ struct DeviceHisto256 template < int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InitHisto256KernelPtr, ///< Function type of 
cub::InitHisto256Kernel typename MultiBlockHisto256KernelPtr, ///< Function type of cub::MultiBlockHisto256Kernel - typename FinalizeHisto256KernelPtr, ///< Function type of cub::FinalizeHisto256Kernel - typename ResetDrainKernelPtr, ///< Function type of cub::ResetDrainKernel + typename AggregateHisto256KernelPtr, ///< Function type of cub::AggregateHisto256Kernel typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin typename SizeT> ///< Integral type used for global array indexing __host__ __device__ __forceinline__ static cudaError_t Dispatch( + InitHisto256KernelPtr init_kernel_ptr, ///< [in] Kernel function pointer to parameterization of cub::InitHisto256Kernel MultiBlockHisto256KernelPtr multi_block_kernel_ptr, ///< [in] Kernel function pointer to parameterization of cub::MultiBlockHisto256Kernel - FinalizeHisto256KernelPtr finalize_kernel_ptr, ///< [in] Kernel function pointer to parameterization of cub::FinalizeHisto256Kernel - ResetDrainKernelPtr prepare_drain_kernel_ptr, ///< [in] Kernel function pointer to parameterization of cub::ResetDrainKernel + AggregateHisto256KernelPtr aggregate_kernel_ptr, ///< [in] Kernel function pointer to parameterization of cub::AggregateHisto256Kernel KernelDispachParams &multi_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p multi_block_kernel_ptr was compiled for InputIteratorRA d_samples, ///< [in] Input samples to histogram HistoCounter *(&d_histograms)[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having 256 counters of integral type \p HistoCounter. @@ -340,7 +354,7 @@ struct DeviceHisto256 bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. 
DeviceAllocator *device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator. { - #if !CUB_CNP_ENABLED + #ifndef CUB_RUNTIME_ENABLED // Kernel launch not supported from this device return CubDebug(cudaErrorInvalidConfiguration); @@ -354,11 +368,37 @@ struct DeviceHisto256 cudaError error = cudaSuccess; do { - // Get GPU ordinal + // Setup array wrapper for histogram channel output because we can't pass static arrays as kernel parameters + ArrayWrapper d_histo_wrapper; + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL]; + } + + // Initialize counters and queue descriptor if necessary + if ((multi_block_dispatch_params.grid_mapping == GRID_MAPPING_DYNAMIC) || + (multi_block_dispatch_params.block_algorithm == GRID_HISTO_256_GLOBAL_ATOMIC)) + { + queue.Allocate(device_allocator); + + if (stream_synchronous) CubLog("Invoking init_kernel_ptr<<<%d, 256, 0, %d>>>()\n", ACTIVE_CHANNELS, (int) stream); + + init_kernel_ptr<<>>(queue, d_histo_wrapper, num_samples); + + #ifndef __CUDA_ARCH__ + // Sync the stream on the host + if (stream_synchronous && CubDebug(error = cudaStreamSynchronize(stream))) break; + #else + // Sync the entire device on the device (cudaStreamSynchronize doesn't exist on device) + if (stream_synchronous && CubDebug(error = cudaDeviceSynchronize())) break; + #endif + } + + // Determine grid size for the multi-block kernel + int device_ordinal; if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - // Get SM count int sm_count; if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; @@ -380,7 +420,6 @@ struct DeviceHisto256 #endif - // Determine grid size for the multi-block kernel int multi_occupancy = multi_sm_occupancy * sm_count; int multi_tile_size = multi_block_dispatch_params.block_threads * 
multi_block_dispatch_params.items_per_thread; int multi_grid_size; @@ -402,42 +441,18 @@ struct DeviceHisto256 case GRID_MAPPING_DYNAMIC: // Prepare queue to distribute work dynamically - queue.Allocate(device_allocator); int num_tiles = (num_samples + multi_tile_size - 1) / multi_tile_size; - #ifndef __CUDA_ARCH__ - - // We're on the host, so prepare queue on device (because its faster than if we prepare it here) - if (stream_synchronous) CubLog("Invoking prepare_drain_kernel_ptr<<<1, 1, 0, %d>>>()\n", (int) stream); - prepare_drain_kernel_ptr<<<1, 1, 0, stream>>>(queue, num_samples); - - // Sync the stream on the host - if (stream_synchronous && CubDebug(error = cudaStreamSynchronize(stream))) break; - - #else - - // Prepare the queue here - queue.ResetDrain(num_samples); - - #endif - // Set MultiBlock grid size multi_grid_size = (num_tiles < multi_occupancy) ? num_tiles : // Not enough to fill the device with threadblocks multi_occupancy; // Fill the device with threadblocks - break; + break; }; - // Setup array wrapper for histogram channel output because we can't pass static arrays as kernel parameters - ArrayWrapper d_histo_wrapper; - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL]; - } - + // Bind textures if the iterator supports it #ifndef __CUDA_ARCH__ - // Host can bind texture if the iterator supports it if (CubDebug(error = BindIteratorTexture(d_samples))) break; #endif // __CUDA_ARCH__ @@ -445,7 +460,7 @@ struct DeviceHisto256 if (stream_synchronous) CubLog("Invoking multi_block_kernel_ptr<<<%d, %d, 0, %d>>>(), %d items per thread, %d SM occupancy\n", multi_grid_size, multi_block_dispatch_params.block_threads, (int) stream, multi_block_dispatch_params.items_per_thread, multi_sm_occupancy); - if (multi_grid_size == 1) + if ((multi_grid_size == 1) || (multi_block_dispatch_params.block_algorithm == GRID_HISTO_256_GLOBAL_ATOMIC)) { // A single pass will do multi_block_kernel_ptr<<>>( 
@@ -487,10 +502,10 @@ struct DeviceHisto256 if (stream_synchronous && CubDebug(error = cudaDeviceSynchronize())) break; #endif - if (stream_synchronous) CubLog("Invoking finalize_kernel_ptr<<<%d, %d, 0, %d>>>()\n", + if (stream_synchronous) CubLog("Invoking aggregate_kernel_ptr<<<%d, %d, 0, %d>>>()\n", ACTIVE_CHANNELS, 256, (int) stream); - finalize_kernel_ptr<<>>( + aggregate_kernel_ptr<<>>( d_block_histograms_linear, d_histo_wrapper, multi_grid_size); @@ -507,10 +522,15 @@ struct DeviceHisto256 while (0); // Free temporary storage allocation - if (d_block_histograms_linear) error = CubDebug(DeviceFree(d_block_histograms_linear, device_allocator)); + if (d_block_histograms_linear) + error = CubDebug(DeviceFree(d_block_histograms_linear, device_allocator)); // Free queue allocation - if (multi_block_dispatch_params.grid_mapping == GRID_MAPPING_DYNAMIC) error = CubDebug(queue.Free(device_allocator)); + if ((multi_block_dispatch_params.grid_mapping == GRID_MAPPING_DYNAMIC) || + (multi_block_dispatch_params.block_algorithm == GRID_HISTO_256_GLOBAL_ATOMIC)) + { + error = CubDebug(queue.Free(device_allocator)); + } // Unbind texture #ifndef __CUDA_ARCH__ @@ -526,18 +546,18 @@ struct DeviceHisto256 /** * \brief Computes a 256-bin device-wide histogram * - * \tparam BLOCK_ALGORITHM cub::BlockHisto256Algorithm enumerator specifying the underlying algorithm to use + * \tparam GRID_ALGORITHM cub::PersistentBlockHisto256Algorithm enumerator specifying the underlying algorithm to use * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) * \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). 
Must have a value type that is assignable to unsigned char * \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin */ template < - BlockHisto256Algorithm BLOCK_ALGORITHM, - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename InputIteratorRA, - typename HistoCounter> + PersistentBlockHisto256Algorithm GRID_ALGORITHM, + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, + typename HistoCounter> __host__ __device__ __forceinline__ static cudaError_t Dispatch( InputIteratorRA d_samples, ///< [in] Input samples to histogram @@ -551,7 +571,7 @@ struct DeviceHisto256 typedef int SizeT; // Tuning polices for the PTX architecture that will get dispatched to - typedef PtxDefaultPolicies PtxDefaultPolicies; + typedef PtxDefaultPolicies PtxDefaultPolicies; typedef typename PtxDefaultPolicies::MultiBlockPolicy MultiBlockPolicy; cudaError error = cudaSuccess; @@ -575,9 +595,9 @@ struct DeviceHisto256 #endif Dispatch( + InitHisto256Kernel, MultiBlockHisto256Kernel, - FinalizeHisto256Kernel, - ResetDrainKernel, + AggregateHisto256Kernel, multi_block_dispatch_params, d_samples, d_histograms, @@ -601,7 +621,9 @@ struct DeviceHisto256 //--------------------------------------------------------------------- /** - * \brief Computes a 256-bin device-wide histogram + * \brief Computes a 256-bin device-wide histogram. Uses fast block-sorting to compute the histogram. + * + * Delivers consistent throughput regardless of sample diversity. * * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). 
Must have a value type that is assignable to unsigned char * \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin @@ -618,12 +640,12 @@ struct DeviceHisto256 bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator. { - return Dispatch( + return Dispatch( d_samples, &d_histogram, num_samples, stream, stream_synchronous, device_allocator); } /** - * \brief Computes a 256-bin device-wide histogram. Uses atomic read-modify-write operations to compute the histogram. + * \brief Computes a 256-bin device-wide histogram. Uses shared-memory atomic read-modify-write operations to compute the histogram. * * Sample input having lower diversity cause performance to be degraded. * @@ -642,13 +664,40 @@ struct DeviceHisto256 bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator. { - return Dispatch( + return Dispatch( d_samples, &d_histogram, num_samples, stream, stream_synchronous, device_allocator); } /** - * \brief Computes a 256-bin device-wide histogram from multi-channel data. + * \brief Computes a 256-bin device-wide histogram. Uses global-memory atomic read-modify-write operations to compute the histogram. + * + * Sample input having lower diversity cause performance to be degraded. + * + * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). 
Must have a value type that is assignable to unsigned char + * \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin + */ + template < + typename InputIteratorRA, + typename HistoCounter> + __host__ __device__ __forceinline__ + static cudaError_t SingleChannelGlobalAtomic( + InputIteratorRA d_samples, ///< [in] Input samples + HistoCounter* d_histogram, ///< [out] Array of 256 counters of integral type \p HistoCounter. + int num_samples, ///< [in] Number of samples to process + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream-0. + bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. + DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator. + { + return Dispatch( + d_samples, &d_histogram, num_samples, stream, stream_synchronous, device_allocator); + } + + + /** + * \brief Computes a 256-bin device-wide histogram from multi-channel data. Uses fast block-sorting to compute the histogram. + * + * Delivers consistent throughput regardless of sample diversity. * * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) * \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed @@ -669,12 +718,13 @@ struct DeviceHisto256 bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator. 
{ - return Dispatch( + return Dispatch( d_samples, d_histograms, num_samples, stream, stream_synchronous, device_allocator); } + /** - * \brief Computes a 256-bin device-wide histogram from multi-channel data. Uses atomic read-modify-write operations to compute the histogram. + * \brief Computes a 256-bin device-wide histogram from multi-channel data. Uses shared-memory atomic read-modify-write operations to compute the histogram. * * Sample input having lower diversity cause performance to be degraded. * @@ -697,7 +747,36 @@ struct DeviceHisto256 bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator. { - return Dispatch( + return Dispatch( + d_samples, d_histograms, num_samples, stream, stream_synchronous, device_allocator); + } + + + /** + * \brief Computes a 256-bin device-wide histogram from multi-channel data. Uses global-memory atomic read-modify-write operations to compute the histogram. + * + * Sample input having lower diversity cause performance to be degraded. + * + * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). 
Must have a value type that is assignable to unsigned char + * \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin + */ + template < + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, + typename HistoCounter> + __host__ __device__ __forceinline__ + static cudaError_t MultiChannelGlobalAtomic( + InputIteratorRA d_samples, ///< [in] Input samples. (Channels, if any, are interleaved in "AOS" format) + HistoCounter *(&d_histograms)[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having 256 counters of integral type \p HistoCounter. + int num_samples, ///< [in] Number of samples to process + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream-0. + bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. + DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator. + { + return Dispatch( d_samples, d_histograms, num_samples, stream, stream_synchronous, device_allocator); } diff --git a/cub/device/device_reduce.cuh b/cub/device/device_reduce.cuh index 31df6bce53..6e62879e16 100644 --- a/cub/device/device_reduce.cuh +++ b/cub/device/device_reduce.cuh @@ -37,8 +37,9 @@ #include #include -#include "tiles/tiles_reduce.cuh" +#include "persistent_block/persistent_block_reduce.cuh" #include "../util_allocator.cuh" +#include "../grid/grid_mapping.cuh" #include "../grid/grid_even_share.cuh" #include "../grid/grid_queue.cuh" @@ -60,12 +61,12 @@ namespace cub { * Multi-block reduction kernel entry point. Computes privatized reductions, one per thread block. 
*/ template < - typename TilesReducePolicy, ///< Tuning policy for cub::TilesReduce abstraction + typename PersistentBlockReducePolicy, ///< Tuning policy for cub::PersistentBlockReduce abstraction typename InputIteratorRA, ///< The random-access iterator type for input (may be a simple pointer type). typename OutputIteratorRA, ///< The random-access iterator type for output (may be a simple pointer type). typename SizeT, ///< Integral type used for global array indexing typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b) -__launch_bounds__ (TilesReducePolicy::BLOCK_THREADS, 1) +__launch_bounds__ (int(PersistentBlockReducePolicy::BLOCK_THREADS), 1) __global__ void MultiBlockReduceKernel( InputIteratorRA d_in, ///< [in] Input data to reduce OutputIteratorRA d_out, ///< [out] Output location for result @@ -74,26 +75,24 @@ __global__ void MultiBlockReduceKernel( GridQueue queue, ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks ReductionOp reduction_op) ///< [in] Binary reduction operator { - // Data type of input iterator + // Data type typedef typename std::iterator_traits::value_type T; - // Parameterize TilesReduce for the parallel execution context - typedef TilesReduce TilesReduceT; + // Thread block type for reducing input tiles + typedef PersistentBlockReduce PersistentBlockReduceT; - // Parameterize which mapping of tiles -> thread blocks we will use - typedef typename TilesReduceT::template Mapping Mapping; + // Block-wide aggregate + T block_aggregate; - // Declare shared memory for TilesReduce - __shared__ typename TilesReduceT::SmemStorage smem_storage; + // Shared memory storage + __shared__ typename PersistentBlockReduceT::SmemStorage smem_storage; - // Reduce tiles - T block_aggregate = Mapping::ProcessTiles( - smem_storage, - d_in, - num_items, - even_share, - queue, - reduction_op); + // Thread block instance + PersistentBlockReduceT tiles(smem_storage, d_in, 
reduction_op); + + // Consume tiles using thread block instance + GridMapping::ConsumeTilesFlagFirst( + tiles, num_items, even_share, queue, block_aggregate); // Output result if (threadIdx.x == 0) @@ -107,41 +106,41 @@ __global__ void MultiBlockReduceKernel( * Single-block reduction kernel entry point. */ template < - typename TilesReducePolicy, ///< Tuning policy for cub::TilesReduce abstraction + typename PersistentBlockReducePolicy, ///< Tuning policy for cub::PersistentBlockReduce abstraction typename InputIteratorRA, ///< The random-access iterator type for input (may be a simple pointer type). typename OutputIteratorRA, ///< The random-access iterator type for output (may be a simple pointer type). typename SizeT, ///< Integral type used for global array indexing typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b) -__launch_bounds__ (TilesReducePolicy::BLOCK_THREADS, 1) +__launch_bounds__ (int(PersistentBlockReducePolicy::BLOCK_THREADS), 1) __global__ void SingleBlockReduceKernel( InputIteratorRA d_in, ///< [in] Input data to reduce OutputIteratorRA d_out, ///< [out] Output location for result SizeT num_items, ///< [in] Total number of input data items ReductionOp reduction_op) ///< [in] Binary reduction operator { - // Data type of input iterator + // Data type typedef typename std::iterator_traits::value_type T; - // Parameterize TilesReduce for the parallel execution context - typedef TilesReduce TilesReduceT; + // Thread block type for reducing input tiles + typedef PersistentBlockReduce PersistentBlockReduceT; + + // Block-wide aggregate + T block_aggregate; + + // Shared memory storage + __shared__ typename PersistentBlockReduceT::SmemStorage smem_storage; - // Declare shared memory for TilesReduce - __shared__ typename TilesReduceT::SmemStorage smem_storage; + // Block abstraction for reducing tiles + PersistentBlockReduceT tiles(smem_storage, d_in, reduction_op); - // Reduce tiles - T 
block_aggregate = TilesReduceT::ProcessTilesEvenShare( - smem_storage, - d_in, - SizeT(0), - num_items, - reduction_op); + // Reduce input tiles + ConsumeTilesFlagFirst(tiles, 0, num_items, block_aggregate); // Output result if (threadIdx.x == 0) { d_out[blockIdx.x] = block_aggregate; } - } #endif // DOXYGEN_SHOULD_SKIP_THIS @@ -163,7 +162,7 @@ struct DeviceReduce { #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - /// Generic structure for encapsulating dispatch properties. Mirrors the constants within TilesReducePolicy. + /// Generic structure for encapsulating dispatch properties. Mirrors the constants within PersistentBlockReducePolicy. struct KernelDispachParams { // Policy fields @@ -178,16 +177,16 @@ struct DeviceReduce // Derived fields int tile_size; - template + template __host__ __device__ __forceinline__ void Init(int subscription_factor = 1) { - block_threads = TilesReducePolicy::BLOCK_THREADS; - items_per_thread = TilesReducePolicy::ITEMS_PER_THREAD; - vector_load_length = TilesReducePolicy::VECTOR_LOAD_LENGTH; - block_algorithm = TilesReducePolicy::BLOCK_ALGORITHM; - load_modifier = TilesReducePolicy::LOAD_MODIFIER; - grid_mapping = TilesReducePolicy::GRID_MAPPING; + block_threads = PersistentBlockReducePolicy::BLOCK_THREADS; + items_per_thread = PersistentBlockReducePolicy::ITEMS_PER_THREAD; + vector_load_length = PersistentBlockReducePolicy::VECTOR_LOAD_LENGTH; + block_algorithm = PersistentBlockReducePolicy::BLOCK_ALGORITHM; + load_modifier = PersistentBlockReducePolicy::LOAD_MODIFIER; + grid_mapping = PersistentBlockReducePolicy::GRID_MAPPING; this->subscription_factor = subscription_factor; tile_size = block_threads * items_per_thread; @@ -221,8 +220,8 @@ struct DeviceReduce struct TunedPolicies { // K20C: 182.1 @ 48M 32-bit T - typedef TilesReducePolicy<256, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; - typedef TilesReducePolicy<256, 16, 2, BLOCK_REDUCE_WARP_REDUCTIONS, PTX_LOAD_NONE, 
GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; + typedef PersistentBlockReducePolicy<256, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; + typedef PersistentBlockReducePolicy<256, 16, 2, BLOCK_REDUCE_WARP_REDUCTIONS, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; enum { SUBSCRIPTION_FACTOR = 4 }; }; @@ -231,8 +230,8 @@ struct DeviceReduce struct TunedPolicies { // GTX670: 154.0 @ 48M 32-bit T - typedef TilesReducePolicy<256, 2, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; - typedef TilesReducePolicy<256, 24, 4, BLOCK_REDUCE_WARP_REDUCTIONS, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; + typedef PersistentBlockReducePolicy<256, 2, 1, BLOCK_REDUCE_WARP_REDUCTIONS, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; + typedef PersistentBlockReducePolicy<256, 24, 4, BLOCK_REDUCE_WARP_REDUCTIONS, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; enum { SUBSCRIPTION_FACTOR = 1 }; }; @@ -241,8 +240,8 @@ struct DeviceReduce struct TunedPolicies { // GTX 580: 178.9 @ 48M 32-bit T - typedef TilesReducePolicy<128, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_DYNAMIC> MultiBlockPolicy; - typedef TilesReducePolicy<128, 4, 1, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; + typedef PersistentBlockReducePolicy<128, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_DYNAMIC> MultiBlockPolicy; + typedef PersistentBlockReducePolicy<128, 4, 1, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; enum { SUBSCRIPTION_FACTOR = 1 }; }; @@ -250,8 +249,8 @@ struct DeviceReduce template struct TunedPolicies { - typedef TilesReducePolicy<128, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; - typedef TilesReducePolicy<32, 4, 4, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; + typedef PersistentBlockReducePolicy<128, 8, 2, 
BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; + typedef PersistentBlockReducePolicy<32, 4, 4, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; enum { SUBSCRIPTION_FACTOR = 1 }; }; @@ -259,8 +258,8 @@ struct DeviceReduce template struct TunedPolicies { - typedef TilesReducePolicy<128, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; - typedef TilesReducePolicy<32, 4, 4, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; + typedef PersistentBlockReducePolicy<128, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; + typedef PersistentBlockReducePolicy<32, 4, 4, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; enum { SUBSCRIPTION_FACTOR = 1 }; }; @@ -354,7 +353,7 @@ struct DeviceReduce cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream-0. bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. { - #if !CUB_CNP_ENABLED + #ifndef CUB_RUNTIME_ENABLED // Kernel launch not supported from this device return CubDebug(cudaErrorInvalidConfiguration); @@ -396,7 +395,7 @@ struct DeviceReduce template < typename MultiBlockReduceKernelPtr, ///< Function type of cub::MultiBlockReduceKernel typename ReduceSingleKernelPtr, ///< Function type of cub::SingleBlockReduceKernel - typename ResetDrainKernelPtr, ///< Function type of cub::ResetDrainKernel + typename ResetDrainKernelPtr, ///< Function type of cub::ResetDrainKernel typename InputIteratorRA, ///< The random-access iterator type for input (may be a simple pointer type). typename OutputIteratorRA, ///< The random-access iterator type for output (may be a simple pointer type). 
typename SizeT, ///< Integral type used for global array indexing @@ -405,7 +404,7 @@ struct DeviceReduce static cudaError_t DispatchIterative( MultiBlockReduceKernelPtr multi_block_kernel, ///< [in] Kernel function pointer to parameterization of cub::MultiBlockReduceKernel ReduceSingleKernelPtr single_block_kernel, ///< [in] Kernel function pointer to parameterization of cub::SingleBlockReduceKernel - ResetDrainKernelPtr prepare_drain_kernel, ///< [in] Kernel function pointer to parameterization of cub::ResetDrainKernel + ResetDrainKernelPtr prepare_drain_kernel, ///< [in] Kernel function pointer to parameterization of cub::ResetDrainKernel KernelDispachParams &multi_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p multi_block_kernel_ptr was compiled for KernelDispachParams &single_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p single_block_kernel was compiled for InputIteratorRA d_in, ///< [in] Input data to reduce @@ -416,7 +415,7 @@ struct DeviceReduce bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. DeviceAllocator *device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator. 
{ - #if !CUB_CNP_ENABLED + #ifndef CUB_RUNTIME_ENABLED // Kernel launch not supported from this device return CubDebug(cudaErrorInvalidConfiguration); @@ -585,7 +584,7 @@ struct DeviceReduce KernelDispachParams &single_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p single_block_kernel was compiled for InputIteratorRA d_in, ///< [in] Input data to reduce OutputIteratorRA d_out, ///< [out] Output location for result - SizeT num_items, ///< [in] Number of items to reduce + SizeT num_items, ///< [in] Number of items to reduce ReductionOp reduction_op, ///< [in] Binary reduction operator cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream-0. bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. diff --git a/cub/device/persistent_block/persistent_block_histo_256.cuh b/cub/device/persistent_block/persistent_block_histo_256.cuh new file mode 100644 index 0000000000..8c4d93a1d1 --- /dev/null +++ b/cub/device/persistent_block/persistent_block_histo_256.cuh @@ -0,0 +1,810 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::PersistentBlockHisto256 implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide 256-bin histogram. 
+ */ + +#pragma once + +#include + +#include "../../util_arch.cuh" +#include "../../block/block_load.cuh" +#include "../../block/block_histo_256.cuh" +#include "../../block/block_radix_sort.cuh" +#include "../../block/block_discontinuity.cuh" +#include "../../grid/grid_mapping.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_vector.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + +/** + * \brief PersistentBlockHisto256Algorithm enumerates alternative algorithms for the parallel construction of 8b histograms. + */ +enum PersistentBlockHisto256Algorithm +{ + + /** + * \par Overview + * A two-kernel approach in which: + * -# Thread blocks in the first kernel aggregate their own privatized + * histograms using block-wide sorting (see BlockHisto256Algorithm::BLOCK_HISTO_256_SORT). + * -# A single thread block in the second kernel reduces them into the output histogram(s). + * + * \par Performance Considerations + * Delivers consistent throughput regardless of sample bin distribution. + */ + GRID_HISTO_256_SORT, + + + /** + * \par Overview + * A two-kernel approach in which: + * -# Thread blocks in the first kernel aggregate their own privatized + * histograms using shared-memory \p atomicAdd(). + * -# A single thread block in the second kernel reduces them into the + * output histogram(s). + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. 
+ */ + GRID_HISTO_256_SHARED_ATOMIC, + + + /** + * \par Overview + * A single-kernel approach in which thread blocks update the output histogram(s) directly + * using global-memory \p atomicAdd(). + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. + */ + GRID_HISTO_256_GLOBAL_ATOMIC, + +}; + + +/****************************************************************************** + * Tuning policy + ******************************************************************************/ + +/** + * Tuning policy for PersistentBlockHisto256 + */ +template < + int _BLOCK_THREADS, + int _ITEMS_PER_THREAD, + PersistentBlockHisto256Algorithm _GRID_ALGORITHM, + GridMappingStrategy _GRID_MAPPING, + int _SM_OCCUPANCY> +struct PersistentBlockHisto256Policy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, + SM_OCCUPANCY = _SM_OCCUPANCY, + }; + + static const PersistentBlockHisto256Algorithm GRID_ALGORITHM = _GRID_ALGORITHM; + static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; +}; + + + +/****************************************************************************** + * PersistentBlockHisto256 + ******************************************************************************/ + +/** + * \brief implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide 256-bin histogram. + */ +template < + typename PersistentBlockHisto256Policy, ///< Tuning policy + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). 
Must have a value type that is assignable to unsigned char + typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin + typename SizeT, ///< Integer type for offsets + PersistentBlockHisto256Algorithm GRID_ALGORITHM = PersistentBlockHisto256Policy::GRID_ALGORITHM> +struct PersistentBlockHisto256; + + +/** + * Specialized for GRID_HISTO_256_GLOBAL_ATOMIC + */ +template < + typename PersistentBlockHisto256Policy, ///< Tuning policy + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char + typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin + typename SizeT> ///< Integer type for offsets +struct PersistentBlockHisto256 +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Constants + enum + { + BLOCK_THREADS = PersistentBlockHisto256Policy::BLOCK_THREADS, + ITEMS_PER_THREAD = PersistentBlockHisto256Policy::ITEMS_PER_THREAD, + TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, + }; + + // Shared memory type required by this thread block + struct SmemStorage {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + /// Reference to smem_storage + SmemStorage &smem_storage; + + /// Reference to output histograms + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; + + /// Input data to reduce + InputIteratorRA d_in; + + + //--------------------------------------------------------------------- + 
// Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ PersistentBlockHisto256( + SmemStorage &smem_storage, ///< Reference to smem_storage + InputIteratorRA d_in, ///< Input data to reduce + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) : ///< Reference to output histograms + smem_storage(smem_storage), + d_in(d_in), + d_out_histograms(d_out_histograms) + {} + + + /** + * The number of items processed per "tile" + */ + __device__ __forceinline__ int TileItems() + { + return TILE_ITEMS; + } + + + /** + * Process a single tile. + */ + __device__ __forceinline__ void ConsumeTile( + bool &sync_after, + SizeT block_offset, + int num_valid) + { + if (num_valid < TILE_ITEMS) + { + // Only a partially-full tile of samples to read and composite + int bounds = num_valid - (threadIdx.x * CHANNELS); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) + { + unsigned char item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + atomicAdd(d_out_histograms[CHANNEL] + item, 1); + } + } + } + + } + else + { + // Full tile of samples to read and composite + unsigned char items[ITEMS_PER_THREAD][CHANNELS]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + } + } + } + + __threadfence_block(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < 
ACTIVE_CHANNELS) + { + atomicAdd(d_out_histograms[CHANNEL] + items[ITEM][CHANNEL], 1); + } + } + } + } + + // No need to sync after processing this tile to ensure smem coherence + sync_after = false; + } + + + /** + * Finalize the computation. + */ + __device__ __forceinline__ void Finalize( + int dummy_result) + {} +}; + + + + +/** + * Specialized for GRID_HISTO_256_SHARED_ATOMIC + */ +template < + typename PersistentBlockHisto256Policy, ///< Tuning policy + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char + typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin + typename SizeT> ///< Integer type for offsets +struct PersistentBlockHisto256 +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Constants + enum + { + BLOCK_THREADS = PersistentBlockHisto256Policy::BLOCK_THREADS, + ITEMS_PER_THREAD = PersistentBlockHisto256Policy::ITEMS_PER_THREAD, + TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, + }; + + // Shared memory type required by this thread block + struct SmemStorage + { + HistoCounter histograms[ACTIVE_CHANNELS][256]; + }; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + /// Reference to smem_storage + SmemStorage &smem_storage; + + /// Reference to output histograms + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; + + /// Input data to reduce + InputIteratorRA d_in; + + + 
//--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ PersistentBlockHisto256( + SmemStorage &smem_storage, ///< Reference to smem_storage + InputIteratorRA d_in, ///< Input data to reduce + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) : ///< Reference to output histograms + smem_storage(smem_storage), + d_in(d_in), + d_out_histograms(d_out_histograms) + { + // Initialize histogram bin counts to zeros + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= 256; histo_offset += BLOCK_THREADS) + { + smem_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0; + } + // Finish up with guarded initialization if necessary + if ((histo_offset < BLOCK_THREADS) && (histo_offset + threadIdx.x < 256)) + { + smem_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0; + } + } + } + + + /** + * The number of items processed per "tile" + */ + __device__ __forceinline__ int TileItems() + { + return TILE_ITEMS; + } + + + /** + * Process a single tile. 
+ */ + __device__ __forceinline__ void ConsumeTile( + bool &sync_after, + SizeT block_offset, + int num_valid) + { + if (num_valid < TILE_ITEMS) + { + // Only a partially-full tile of samples to read and composite + int bounds = num_valid - (threadIdx.x * CHANNELS); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) + { + unsigned char item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + atomicAdd(smem_storage.histograms[CHANNEL] + item, 1); + } + } + } + + } + else + { + // Full tile of samples to read and composite + unsigned char items[ITEMS_PER_THREAD][CHANNELS]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + } + } + } + + __threadfence_block(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + atomicAdd(smem_storage.histograms[CHANNEL] + items[ITEM][CHANNEL], 1); + } + } + } + } + + // No need to sync after processing this tile to ensure smem coherence + sync_after = false; + } + + + /** + * Finalize the computation. 
+ */ + __device__ __forceinline__ void Finalize( + int dummy_result) + { + // Barrier to ensure shared memory histograms are coherent + __syncthreads(); + + // Copy shared memory histograms to output + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + int channel_offset = (blockIdx.x * 256); + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= 256; histo_offset += BLOCK_THREADS) + { + d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = smem_storage.histograms[CHANNEL][histo_offset + threadIdx.x]; + } + // Finish up with guarded initialization if necessary + if ((histo_offset < BLOCK_THREADS) && (histo_offset + threadIdx.x < 256)) + { + d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = smem_storage.histograms[CHANNEL][histo_offset + threadIdx.x]; + } + } + } +}; + + +/** + * Specialized for GRID_HISTO_256_SORT + */ +template < + typename PersistentBlockHisto256Policy, ///< Tuning policy + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). 
Must have a value type that is assignable to unsigned char + typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin + typename SizeT> ///< Integer type for offsets +struct PersistentBlockHisto256 +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Constants + enum + { + BLOCK_THREADS = PersistentBlockHisto256Policy::BLOCK_THREADS, + ITEMS_PER_THREAD = PersistentBlockHisto256Policy::ITEMS_PER_THREAD, + TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, + + STRIPED_COUNTERS_PER_THREAD = (256 + BLOCK_THREADS - 1) / BLOCK_THREADS, + }; + + // Parameterize BlockRadixSort type for our thread block + typedef BlockRadixSort BlockRadixSortT; + + // Parameterize BlockDiscontinuity type for our thread block + typedef BlockDiscontinuity BlockDiscontinuityT; + + // Shared memory type required by this thread block + union SmemStorage + { + // Storage for sorting bin values + typename BlockRadixSortT::SmemStorage sort_storage; + + struct + { + // Storage for detecting discontinuities in the tile of sorted bin values + typename BlockDiscontinuityT::SmemStorage discont_storage; + + // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values + unsigned int run_begin[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD]; + unsigned int run_end[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD]; + }; + }; + + + // Discontinuity functor + struct DiscontinuityOp + { + // Reference to smem_storage + SmemStorage &smem_storage; + + // Constructor + __device__ __forceinline__ DiscontinuityOp(SmemStorage &smem_storage) : smem_storage(smem_storage) {} + + // Discontinuity predicate + __device__ __forceinline__ bool operator()(const unsigned char &a, const unsigned char &b, unsigned int b_index) + { + if (a != b) + { + // Note the begin/end offsets in shared storage + 
smem_storage.run_begin[b] = b_index; + smem_storage.run_end[a] = b_index; + + return true; + } + else + { + return false; + } + } + }; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + /// Reference to smem_storage + SmemStorage &smem_storage; + + /// Histogram counters striped across threads + HistoCounter thread_counters[ACTIVE_CHANNELS][STRIPED_COUNTERS_PER_THREAD]; + + /// Reference to output histograms + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; + + /// Input data to reduce + InputIteratorRA d_in; + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ PersistentBlockHisto256( + SmemStorage &smem_storage, ///< Reference to smem_storage + InputIteratorRA d_in, ///< Input data to reduce + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) : ///< Reference to output histograms + smem_storage(smem_storage), + d_in(d_in), + d_out_histograms(d_out_histograms) + { + // Initialize histogram counters striped across threads + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + #pragma unroll + for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) + { + thread_counters[CHANNEL][COUNTER] = 0; + } + } + } + + + /** + * The number of items processed per "tile" + */ + __device__ __forceinline__ int TileItems() + { + return TILE_ITEMS; + } + + + /** + * Composite a tile of input items + */ + __device__ __forceinline__ void Composite( + unsigned char (&items)[ITEMS_PER_THREAD], ///< Tile of samples + HistoCounter thread_counters[STRIPED_COUNTERS_PER_THREAD]) ///< Histogram counters striped across threads + { + // Sort bytes in blocked arrangement + BlockRadixSortT::SortBlocked(smem_storage.sort_storage, items); + + 
__syncthreads(); + + // Initialize the shared memory's run_begin and run_end for each bin + #pragma unroll + for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) + { + smem_storage.run_begin[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS; + smem_storage.run_end[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS; + } + + __syncthreads(); + + // Note the begin/end run offsets of bin runs in the sorted tile + int flags[ITEMS_PER_THREAD]; // unused + DiscontinuityOp flag_op(smem_storage); + BlockDiscontinuityT::Flag(smem_storage.discont_storage, items, flag_op, flags); + + // Update begin for first item + if (threadIdx.x == 0) smem_storage.run_begin[items[0]] = 0; + + __syncthreads(); + + // Composite into histogram + // Accumulate each bin's run-length into our striped counters + #pragma unroll + for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) + { + int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x; + thread_counters[COUNTER] += smem_storage.run_end[bin] - smem_storage.run_begin[bin]; + } + } + + + /** + * Process one channel within a tile. + */ + __device__ __forceinline__ void ConsumeTileChannel( + int channel, + SizeT block_offset, + int num_valid) + { + // Load items in striped fashion + if (num_valid < TILE_ITEMS) + { + // Only a partially-full tile of samples to read and composite + unsigned char items[ITEMS_PER_THREAD]; + + // Assign our tid as the bin for out-of-bounds items (to give an even distribution), and keep track of how many oob items to subtract out later + int bounds = (num_valid - (threadIdx.x * CHANNELS)); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = ((ITEM * BLOCK_THREADS * CHANNELS) < bounds) ?
+ d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)] : + 0; + } + + // Composite our histogram data + Composite(items, thread_counters[channel]); + + __syncthreads(); + + // Correct the overcounting in the zero-bin from invalid (out-of-bounds) items + if (threadIdx.x == 0) + { + int extra = (TILE_ITEMS - num_valid) / CHANNELS; + thread_counters[channel][0] -= extra; + } + } + else + { + // Full tile of samples to read and composite + unsigned char items[ITEMS_PER_THREAD]; + + // Unguarded loads + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)]; + } + + // Composite our histogram data + Composite(items, thread_counters[channel]); + } + } + + + /** + * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Inductive step. + */ + template + struct IterateChannels + { + /** + * Process one channel within a tile. + */ + static __device__ __forceinline__ void ConsumeTileChannel( + PersistentBlockHisto256 *persistent_block_histo, + SizeT block_offset, + int num_valid) + { + __syncthreads(); + + persistent_block_histo->ConsumeTileChannel(CHANNEL, block_offset, num_valid); + + IterateChannels::ConsumeTileChannel(persistent_block_histo, block_offset, num_valid); + } + }; + + + /** + * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Base step. + */ + template + struct IterateChannels + { + static __device__ __forceinline__ void ConsumeTileChannel(PersistentBlockHisto256 *persistent_block_histo, SizeT block_offset, int num_valid) {} + }; + + + /** + * Process a single tile. 
+ * + * We take several passes through the tile in this variant, extracting the samples for one channel at a time + */ + __device__ __forceinline__ void ConsumeTile( + bool &sync_after, + SizeT block_offset, + int num_valid) + { + // First channel + ConsumeTileChannel(0, block_offset, num_valid); + + // Iterate through remaining channels + IterateChannels<1, ACTIVE_CHANNELS>::ConsumeTileChannel(this, block_offset, num_valid); + + // Need to sync after processing this tile to ensure smem coherence + sync_after = true; + } + + + /** + * Finalize the computation. + */ + __device__ __forceinline__ void Finalize( + int dummy_result) + { + // Copy counters striped across threads into the histogram output + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + int channel_offset = (blockIdx.x * 256); + + #pragma unroll + for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) + { + int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x; + + if ((STRIPED_COUNTERS_PER_THREAD * BLOCK_THREADS == 256) || (bin < 256)) + { + d_out_histograms[CHANNEL][channel_offset + bin] = thread_counters[CHANNEL][COUNTER]; + } + } + } + } +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cub/device/persistent_block/persistent_block_reduce.cuh b/cub/device/persistent_block/persistent_block_reduce.cuh new file mode 100644 index 0000000000..9dbc7a3f15 --- /dev/null +++ b/cub/device/persistent_block/persistent_block_reduce.cuh @@ -0,0 +1,247 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::PersistentBlockReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction. 
+ + */ + +#pragma once + +#include <iterator> + +#include "../../grid/grid_mapping.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../block/block_load.cuh" +#include "../../block/block_reduce.cuh" +#include "../../util_vector.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * Tuning policy for PersistentBlockReduce + */ +template < + int _BLOCK_THREADS, + int _ITEMS_PER_THREAD, + int _VECTOR_LOAD_LENGTH, + BlockReduceAlgorithm _BLOCK_ALGORITHM, + PtxLoadModifier _LOAD_MODIFIER, + GridMappingStrategy _GRID_MAPPING> +struct PersistentBlockReducePolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, + VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, + }; + + static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; + static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; + static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; +}; + + +/** + * \brief PersistentBlockReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
+ */ +template < + typename PersistentBlockReducePolicy, + typename InputIteratorRA, + typename SizeT, + typename ReductionOp> +struct PersistentBlockReduce +{ + + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; // Type of input iterator + typedef VectorHelper<T, PersistentBlockReducePolicy::VECTOR_LOAD_LENGTH> VecHelper; // Helper type for vectorizing loads of T + typedef typename VecHelper::Type VectorT; // Vector of T + + // Constants + enum + { + BLOCK_THREADS = PersistentBlockReducePolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = PersistentBlockReducePolicy::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + VECTOR_LOAD_LENGTH = PersistentBlockReducePolicy::VECTOR_LOAD_LENGTH, + + // Can vectorize according to the policy if the input iterator is a native pointer to a built-in primitive + CAN_VECTORIZE = (PersistentBlockReducePolicy::VECTOR_LOAD_LENGTH > 1) && + (IsPointer<InputIteratorRA>::VALUE) && + (VecHelper::BUILT_IN), + + }; + + static const BlockReduceAlgorithm BLOCK_ALGORITHM = PersistentBlockReducePolicy::BLOCK_ALGORITHM; + + // Parameterized BlockReduce primitive + typedef BlockReduce<T, BLOCK_THREADS, BLOCK_ALGORITHM> BlockReduceT; + + // Shared memory type required by this thread block + typedef typename BlockReduceT::SmemStorage SmemStorage; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + T thread_aggregate; ///< Each thread's partial reduction + SmemStorage& smem_storage; ///< Reference to smem_storage + InputIteratorRA d_in; ///< Input data to reduce + ReductionOp reduction_op; ///< Binary reduction operator + int first_tile_size; ///< Size of first tile consumed + bool input_aligned; ///< Whether or not input is vector-aligned + + + //--------------------------------------------------------------------- + // Interface +
//--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ PersistentBlockReduce( + SmemStorage& smem_storage, ///< Reference to smem_storage + InputIteratorRA d_in, ///< Input data to reduce + ReductionOp reduction_op) : ///< Binary reduction operator + smem_storage(smem_storage), + d_in(d_in), + reduction_op(reduction_op), + first_tile_size(TILE_ITEMS), + input_aligned(CAN_VECTORIZE && ((size_t(d_in) & (sizeof(VectorT) - 1)) == 0)){} + + + /** + * The number of items processed per "tile" + */ + __device__ __forceinline__ int TileItems() + { + return TILE_ITEMS; + } + + + /** + * Process a single tile. + * + * Each thread reduces only the values it loads. If \p FIRST_TILE, this + * partial reduction is stored into \p thread_aggregate. Otherwise it is + * accumulated into \p thread_aggregate. + */ + __device__ __forceinline__ void ConsumeTile( + bool &sync_after, + SizeT block_offset, + int num_valid, + bool first_tile) + { + if (num_valid < TILE_ITEMS) + { + // Our first tile is a partial tile size + if (first_tile) first_tile_size = num_valid; + + // Partial tile + int thread_offset = threadIdx.x; + + if ((first_tile) && (thread_offset < num_valid)) + { + thread_aggregate = ThreadLoad(d_in + block_offset + thread_offset); + thread_offset += BLOCK_THREADS; + } + + while (thread_offset < num_valid) + { + T item = ThreadLoad(d_in + block_offset + thread_offset); + thread_aggregate = reduction_op(thread_aggregate, item); + thread_offset += BLOCK_THREADS; + } + } + else + { + T items[ITEMS_PER_THREAD]; + + // Load full tile + if (input_aligned) + { + // Alias items as an array of VectorT and load it in striped fashion + BlockLoadDirectStriped( + reinterpret_cast(d_in + block_offset), + reinterpret_cast(items)); + } + else + { + // Load items in striped fashion + BlockLoadDirectStriped(d_in + block_offset, items); + } + + // Prevent hoisting + __threadfence_block(); + + // Reduce items within each 
thread + T partial = ThreadReduce(items, reduction_op); + + // Update|assign the thread's running aggregate + thread_aggregate = (first_tile) ? + partial : + reduction_op(thread_aggregate, partial); + } + + // No synchronization needed after tile processing + sync_after = false; + } + + + /** + * Finalize the computation. + */ + __device__ __forceinline__ void Finalize( + T& block_aggregate) + { + // Cooperative reduction across the thread block (guarded reduction if our first tile was a partial tile) + block_aggregate = (first_tile_size < TILE_ITEMS) ? + BlockReduceT::Reduce(smem_storage, thread_aggregate, reduction_op, first_tile_size) : + BlockReduceT::Reduce(smem_storage, thread_aggregate, reduction_op); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cub/device/tiles/tiles_scan.cuh b/cub/device/persistent_block/persistent_block_scan.cuh similarity index 88% rename from cub/device/tiles/tiles_scan.cuh rename to cub/device/persistent_block/persistent_block_scan.cuh index 9b92a7363d..a6ff677983 100644 --- a/cub/device/tiles/tiles_scan.cuh +++ b/cub/device/persistent_block/persistent_block_scan.cuh @@ -28,7 +28,7 @@ /** * \file - * cub::TilesScan implements an abstraction of CUDA thread blocks for + * cub::PersistentBlockScan implements an abstraction of CUDA thread blocks for * participating in device-wide prefix scan. */ @@ -61,7 +61,7 @@ enum /** - * Tuning policy for TilesScan + * Tuning policy for PersistentBlockScan */ template < int _BLOCK_THREADS, @@ -69,7 +69,7 @@ template < BlockLoadPolicy _LOAD_POLICY, BlockStorePolicy _STORE_POLICY, BlockScanAlgorithm _SCAN_ALGORITHM> -struct TilesScanPolicy +struct PersistentBlockScanPolicy { enum { @@ -87,15 +87,15 @@ struct TilesScanPolicy /** - * \brief TilesScan implements an abstraction of CUDA thread blocks for + * \brief PersistentBlockScan implements an abstraction of CUDA thread blocks for * participating in device-wide reduction. 
*/ template < - typename TilesScanPolicy, + typename PersistentBlockScanPolicy, typename InputIteratorRA, typename OutputIteratorRA, typename SizeT> -class TilesScan +class PersistentBlockScan { public: @@ -107,12 +107,12 @@ public: typedef typename std::iterator_traits::value_type T; // Data type of block-signaling flag - typedef typename TilesScanPolicy::BlockFlag BlockFlag; + typedef typename PersistentBlockScanPolicy::BlockFlag BlockFlag; // Constants enum { - TILE_ITEMS = TilesScanPolicy::BLOCK_THREADS * TilesScanPolicy::ITEMS_PER_THREAD, + TILE_ITEMS = PersistentBlockScanPolicy::BLOCK_THREADS * PersistentBlockScanPolicy::ITEMS_PER_THREAD, }; struct Signal @@ -146,22 +146,22 @@ public: // Parameterized block load typedef BlockLoad< InputIteratorRA, - TilesScanPolicy::BLOCK_THREADS, - TilesScanPolicy::ITEMS_PER_THREAD, - TilesScanPolicy::LOAD_POLICY> BlockLoadT; + PersistentBlockScanPolicy::BLOCK_THREADS, + PersistentBlockScanPolicy::ITEMS_PER_THREAD, + PersistentBlockScanPolicy::LOAD_POLICY> BlockLoadT; // Parameterized block store typedef BlockStore< OutputIteratorRA, - TilesScanPolicy::BLOCK_THREADS, - TilesScanPolicy::ITEMS_PER_THREAD, - TilesScanPolicy::STORE_POLICY> BlockStoreT; + PersistentBlockScanPolicy::BLOCK_THREADS, + PersistentBlockScanPolicy::ITEMS_PER_THREAD, + PersistentBlockScanPolicy::STORE_POLICY> BlockStoreT; // Parameterized block scan typedef BlockScan< T, - TilesScanPolicy::BLOCK_THREADS, - TilesScanPolicy::SCAN_ALGORITHM> BlockScanT; + PersistentBlockScanPolicy::BLOCK_THREADS, + PersistentBlockScanPolicy::SCAN_ALGORITHM> BlockScanT; // Parameterized warp reduce typedef WarpReduce WarpReduceT; @@ -270,7 +270,7 @@ public: ScanOp &scan_op, T &thread_aggregate) { - T items[TilesScanPolicy::ITEMS_PER_THREAD]; + T items[PersistentBlockScanPolicy::ITEMS_PER_THREAD]; BlockLoadT::Load(smem_storage.load, d_in + block_offset, items); @@ -392,7 +392,7 @@ public: * The return value is undefined in threads other than thread0. 
*/ template - static __device__ __forceinline__ T ProcessTilesEvenShare( + static __device__ __forceinline__ T ProcessPersistentBlockEvenShare( SmemStorage &smem_storage, InputIteratorRA d_in, SizeT block_offset, @@ -438,7 +438,7 @@ public: * The return value is undefined in threads other than thread0. */ template - static __device__ __forceinline__ T ProcessTilesDynamic( + static __device__ __forceinline__ T ProcessPersistentBlockDynamic( SmemStorage &smem_storage, InputIteratorRA d_in, SizeT num_items, @@ -505,45 +505,6 @@ public: } } - - /** - * \brief Consumes input tiles according to TilesScanPolicy::GRID_MAPPING, computing a threadblock-wide reduction for thread0 using the specified binary reduction functor. - * - * The return value is undefined in threads other than thread0. - */ - template - static __device__ __forceinline__ T ProcessTiles( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT num_items, - GridEvenShare &even_share, - GridQueue &queue, - ScanOp &scan_op) - { - if (TilesScanPolicy::GRID_MAPPING == GRID_MAPPING_EVEN_SHARE) - { - // Even share - even_share.BlockInit(); - - return ProcessTilesEvenShare( - smem_storage, - d_in, - even_share.block_offset, - even_share.block_oob, - scan_op); - } - else - { - // Dynamically dequeue - return ProcessTilesDynamic( - smem_storage, - d_in, - num_items, - queue, - scan_op); - } - } - }; diff --git a/cub/device/tiles/tiles_histo_256.cuh b/cub/device/tiles/tiles_histo_256.cuh deleted file mode 100644 index 94623db89f..0000000000 --- a/cub/device/tiles/tiles_histo_256.cuh +++ /dev/null @@ -1,499 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::TilesHisto256 implements an abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide 256-bin histogram. 
- */ - -#pragma once - -#include - -#include "../../grid/grid_mapping.cuh" -#include "../../grid/grid_even_share.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../block/block_load.cuh" -#include "../../block/block_histo_256.cuh" -#include "../../util_vector.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - - -/****************************************************************************** - * Tuning policy - ******************************************************************************/ - -/** - * Tuning policy for TilesHisto256 - */ -template < - int _BLOCK_THREADS, - int _ITEMS_PER_THREAD, - BlockHisto256Algorithm _BLOCK_ALGORITHM, - GridMappingStrategy _GRID_MAPPING> -struct TilesHisto256Policy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - }; - - static const BlockHisto256Algorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; - static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; -}; - - - -/****************************************************************************** - * TilesHisto256 - ******************************************************************************/ - -/** - * \brief TilesHisto256 implements an abstraction of CUDA thread blocks for participating in device-wide histogram. 
- */ -template < - typename TilesHisto256Policy, ///< Tuning policy - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) - typename SizeT> ///< Integer type for offsets -class TilesHisto256 -{ -private: - - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Constants - enum - { - BLOCK_THREADS = TilesHisto256Policy::BLOCK_THREADS, - ITEMS_PER_THREAD = TilesHisto256Policy::ITEMS_PER_THREAD, - TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, - }; - - static const BlockHisto256Algorithm BLOCK_ALGORITHM = TilesHisto256Policy::BLOCK_ALGORITHM; - - // Parameterized BlockHisto256 primitive - typedef BlockHisto256 BlockHisto256T; - - // Shared memory type for this threadblock - struct _SmemStorage - { - SizeT block_offset; // Location where to dequeue input for dynamic operation - typename BlockHisto256T::SmemStorage block_histo; // Smem needed for cooperative histogramming - }; - -public: - - /// \smemstorage{TilesHisto256} - typedef _SmemStorage SmemStorage; - -private: - - //--------------------------------------------------------------------- - // Utility operations - //--------------------------------------------------------------------- - - /** - * Channel-oriented (one channel at a time) - */ - template < - BlockHisto256Algorithm _BLOCK_ALGORITHM, - bool CHANNEL_ORIENTED = (_BLOCK_ALGORITHM == BLOCK_BYTE_HISTO_SORT) > - struct TilesHisto256Internal - { - /** - * Process one channel within a tile. 
- */ - template < - typename InputIteratorRA, - typename HistoCounter, - int ACTIVE_CHANNELS> - static __device__ __forceinline__ void ConsumeTileChannel( - SmemStorage &smem_storage, - int channel, - InputIteratorRA d_in, - SizeT block_offset, - HistoCounter (&histograms)[ACTIVE_CHANNELS][256], - const int &guarded_items = TILE_ITEMS) - { - // Load items in striped fashion - if (guarded_items < TILE_ITEMS) - { - unsigned char items[ITEMS_PER_THREAD]; - - // Assign our tid as the bin for out-of-bounds items (to give an even distribution), and keep track of how oob items to subtract out later - int bounds = (guarded_items - (threadIdx.x * CHANNELS)); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = ((ITEM * BLOCK_THREADS * CHANNELS) < bounds) ? - d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)] : - 0; - } - - // Composite our histogram data - BlockHisto256T::Composite(smem_storage.block_histo, items, histograms[channel]); - - __syncthreads(); - - if (threadIdx.x == 0) - { - int extra = (TILE_ITEMS - guarded_items) / CHANNELS; - histograms[channel][0] -= extra; - } - } - else - { - unsigned char items[ITEMS_PER_THREAD]; - - // Unguarded loads - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)]; - } - - // Composite our histogram data - BlockHisto256T::Composite(smem_storage.block_histo, items, histograms[channel]); - } - } - - - /** - * Process one tile. 
- */ - template < - typename InputIteratorRA, - typename HistoCounter, - int ACTIVE_CHANNELS> - static __device__ __forceinline__ void ConsumeTile( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT block_offset, - HistoCounter (&histograms)[ACTIVE_CHANNELS][256], - const int &guarded_items = TILE_ITEMS) - { - // We take several passes through the tile in this variant, extracting the samples for one channel at a time - - // First channel - ConsumeTileChannel(smem_storage, 0, d_in, block_offset, histograms, guarded_items); - - // Iterate through remaining channels - #pragma unroll - for (int CHANNEL = 1; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - __syncthreads(); - - ConsumeTileChannel(smem_storage, CHANNEL, d_in, block_offset, histograms, guarded_items); - } - } - }; - - - - /** - * BLOCK_BYTE_HISTO_ATOMIC algorithmic variant - */ - template - struct TilesHisto256Internal<_BLOCK_ALGORITHM, false> - { - /** - * Process one tile. - */ - template < - typename InputIteratorRA, - typename HistoCounter, - int ACTIVE_CHANNELS> - static __device__ __forceinline__ void ConsumeTile( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT block_offset, - HistoCounter (&histograms)[ACTIVE_CHANNELS][256], - const int &guarded_items = TILE_ITEMS) - { - - if (guarded_items < TILE_ITEMS) - { - int bounds = guarded_items - (threadIdx.x * CHANNELS); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) - { - unsigned char item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; - atomicAdd(histograms[CHANNEL] + item, 1); - } - } - } - - } - else - { - unsigned char items[ITEMS_PER_THREAD][CHANNELS]; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - #pragma unroll - for (int 
CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - if (CHANNEL < ACTIVE_CHANNELS) - { - items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; - } - } - } - - __threadfence_block(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - if (CHANNEL < ACTIVE_CHANNELS) - { - atomicAdd(histograms[CHANNEL] + items[ITEM][CHANNEL], 1); - } - } - } - - -/* - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - unsigned char items[ITEMS_PER_THREAD][CHANNELS]; - - int tile_offset = (CHANNEL * TILE_CHANNEL_ITEMS); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = d_in[block_offset + tile_offset + (ITEM * BLOCK_THREADS) + threadIdx.x]; - } - - __threadfence_block(); - - // Update histogram - - if ((ACTIVE_CHANNELS == CHANNELS) || (my_channel < ACTIVE_CHANNELS)) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - atomicAdd(histograms[my_channel] + items[ITEM], 1); - } - } - } -*/ - } - } - }; - - - - - - -public: - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * \brief Consumes input tiles using an even-share policy - */ - template < - typename InputIteratorRA, - typename HistoCounter, - int ACTIVE_CHANNELS> - static __device__ __forceinline__ void ProcessTilesEvenShare( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT block_offset, - const SizeT &block_oob, - HistoCounter (&histograms)[ACTIVE_CHANNELS][256]) - { - // Initialize histograms - #pragma unroll - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - BlockHisto256T::InitHistogram(histograms[CHANNEL]); - } - - __syncthreads(); - - // Consume full tiles - while (block_offset + TILE_ITEMS <= block_oob) - { - 
TilesHisto256Internal::ConsumeTile(smem_storage, d_in, block_offset, histograms); - - block_offset += TILE_ITEMS; - - // Skip synchro for atomic version since we know it doesn't use any smem - if (BLOCK_ALGORITHM != BLOCK_BYTE_HISTO_ATOMIC) - { - __syncthreads(); - } - } - - // Consume any remaining partial-tile - if (block_offset < block_oob) - { - TilesHisto256Internal::ConsumeTile(smem_storage, d_in, block_offset, histograms, block_oob - block_offset); - } - } - - - /** - * \brief Consumes input tiles using a dynamic queue policy - */ - template < - typename InputIteratorRA, - typename HistoCounter, - int ACTIVE_CHANNELS> - static __device__ __forceinline__ void ProcessTilesDynamic( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT num_items, - GridQueue &queue, - HistoCounter (&histograms)[ACTIVE_CHANNELS][256]) - { - - // Initialize histograms - #pragma unroll - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - BlockHisto256T::InitHistogram(histograms[CHANNEL]); - } - - // Dynamically consume tiles - while (true) - { - // Dequeue up to TILE_ITEMS - if (threadIdx.x == 0) - { - smem_storage.block_offset = queue.Drain(TILE_ITEMS); - } - - __syncthreads(); - - SizeT block_offset = smem_storage.block_offset; - - __syncthreads(); - - if (block_offset + TILE_ITEMS > num_items) - { - if (block_offset < num_items) - { - // We have less than a full tile to consume - TilesHisto256Internal::ConsumeTile(smem_storage, d_in, block_offset, histograms, num_items - block_offset); - } - - // No more work to do - break; - } - - // We have a full tile to consume - TilesHisto256Internal::ConsumeTile(smem_storage, d_in, block_offset, histograms); - } - } - - - /** - * Specialized for GRID_MAPPING_EVEN_SHARE - */ - template - struct Mapping - { - template < - typename InputIteratorRA, - typename HistoCounter, - int ACTIVE_CHANNELS> - static __device__ __forceinline__ void ProcessTiles( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT num_items, 
- GridEvenShare &even_share, - GridQueue &queue, - HistoCounter (&histograms)[ACTIVE_CHANNELS][256]) - { - even_share.BlockInit(); - return ProcessTilesEvenShare(smem_storage, d_in, even_share.block_offset, even_share.block_oob, histograms); - } - - }; - - - /** - * Specialized for GRID_MAPPING_DYNAMIC - */ - template - struct Mapping - { - template < - typename InputIteratorRA, - typename HistoCounter, - int ACTIVE_CHANNELS> - static __device__ __forceinline__ void ProcessTiles( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT num_items, - GridEvenShare &even_share, - GridQueue &queue, - HistoCounter (&histograms)[ACTIVE_CHANNELS][256]) - { - ProcessTilesDynamic(smem_storage, d_in, num_items, queue, histograms); - } - - }; - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/cub/device/tiles/tiles_reduce.cuh b/cub/device/tiles/tiles_reduce.cuh deleted file mode 100644 index 54b33d1ae9..0000000000 --- a/cub/device/tiles/tiles_reduce.cuh +++ /dev/null @@ -1,457 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::TilesReduce implements an abstraction of CUDA thread blocks for - * reducing multiple tiles as part of device-wide reduction. 
- - */ - -#pragma once - -#include - -#include "../../grid/grid_mapping.cuh" -#include "../../grid/grid_even_share.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../block/block_load.cuh" -#include "../../block/block_reduce.cuh" -#include "../../util_vector.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * Tuning policy for TilesReduce - */ -template < - int _BLOCK_THREADS, - int _ITEMS_PER_THREAD, - int _VECTOR_LOAD_LENGTH, - BlockReduceAlgorithm _BLOCK_ALGORITHM, - PtxLoadModifier _LOAD_MODIFIER, - GridMappingStrategy _GRID_MAPPING> -struct TilesReducePolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, - }; - - static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; - static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; - static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; -}; - - -/** - * \brief TilesReduce implements an abstraction of CUDA thread blocks for - * participating in device-wide reduction. 
- */ -template < - typename TilesReducePolicy, - typename InputIteratorRA, - typename SizeT> -class TilesReduce -{ -private: - - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Data type of input iterator - typedef typename std::iterator_traits::value_type T; - - // Constants - enum - { - // Number of items to be be processed to completion before the thread block terminates or obtains more work - TILE_ITEMS = TilesReducePolicy::BLOCK_THREADS * TilesReducePolicy::ITEMS_PER_THREAD, - }; - - // Parameterized BlockReduce primitive - typedef BlockReduce BlockReduceT; - - // Shared memory type for this threadblock - struct _SmemStorage - { - SizeT block_offset; // Location where to dequeue input for dynamic operation - typename BlockReduceT::SmemStorage reduce; // Smem needed for cooperative reduction - }; - -public: - - /// \smemstorage{TilesReduce} - typedef _SmemStorage SmemStorage; - -private: - - //--------------------------------------------------------------------- - // Utility operations - //--------------------------------------------------------------------- - - /** - * Process a single, full tile. Specialized for native pointers - * - * Each thread reduces only the values it loads. If \p FIRST_TILE, - * this partial reduction is stored into \p thread_aggregate. Otherwise - * it is accumulated into \p thread_aggregate. 
- * - * Performs a block-wide barrier synchronization - */ - template < - bool VECTORIZE_INPUT, - bool FIRST_TILE, - typename ReductionOp> - static __device__ __forceinline__ void ConsumeFullTile( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT block_offset, - ReductionOp &reduction_op, - T &thread_aggregate) - { - T items[TilesReducePolicy::ITEMS_PER_THREAD]; - - if (VECTORIZE_INPUT) - { - typedef VectorHelper VecHelper; - typedef typename VecHelper::Type VectorT; - - // Alias items as an array of VectorT and load it in striped fashion - BlockLoadDirectStriped( - reinterpret_cast(d_in + block_offset), - reinterpret_cast(items)); - } - else - { - // Load items in striped fashion - BlockLoadDirectStriped( - d_in + block_offset, - items); - } - - // Prevent hoisting - __threadfence_block(); - - // Reduce items within each thread - T partial = ThreadReduce(items, reduction_op); - - // Update|assign the thread's running aggregate - thread_aggregate = (FIRST_TILE) ? - partial : - reduction_op(thread_aggregate, partial); - } - - - /** - * Process a single, partial tile. - * - * Each thread reduces only the values it loads. If \p FIRST_TILE, - * this partial reduction is stored into \p thread_aggregate. Otherwise - * it is accumulated into \p thread_aggregate. 
- */ - template < - bool FIRST_TILE, - typename ReductionOp> - static __device__ __forceinline__ void ConsumePartialTile( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT block_offset, - const SizeT &block_oob, - ReductionOp &reduction_op, - T &thread_aggregate) - { - SizeT thread_offset = block_offset + threadIdx.x; - - if ((FIRST_TILE) && (thread_offset < block_oob)) - { - thread_aggregate = ThreadLoad(d_in + thread_offset); - thread_offset += TilesReducePolicy::BLOCK_THREADS; - } - - while (thread_offset < block_oob) - { - T item = ThreadLoad(d_in + thread_offset); - thread_aggregate = reduction_op(thread_aggregate, item); - thread_offset += TilesReducePolicy::BLOCK_THREADS; - } - } - - - /** - * \brief Consumes input tiles using an even-share policy, computing a threadblock-wide reduction for thread0 using the specified binary reduction functor. - * - * The return value is undefined in threads other than thread0. - */ - template < - bool VECTORIZE_INPUT, - typename ReductionOp> - static __device__ __forceinline__ T ProcessTilesEvenShare( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT block_offset, - const SizeT &block_oob, - ReductionOp &reduction_op) - { - if (block_offset + TILE_ITEMS <= block_oob) - { - // We have at least one full tile to consume - T thread_aggregate; - ConsumeFullTile(smem_storage, d_in, block_offset, reduction_op, thread_aggregate); - block_offset += TILE_ITEMS; - - // Consume any other full tiles - while (block_offset + TILE_ITEMS <= block_oob) - { - ConsumeFullTile(smem_storage, d_in, block_offset, reduction_op, thread_aggregate); - block_offset += TILE_ITEMS; - } - - // Consume any remaining input - ConsumePartialTile(smem_storage, d_in, block_offset, block_oob, reduction_op, thread_aggregate); - - // Compute the block-wide reduction (every thread has a valid input) - return BlockReduceT::Reduce(smem_storage.reduce, thread_aggregate, reduction_op); - } - else - { - // We have less than a full tile to consume - 
T thread_aggregate; - ConsumePartialTile(smem_storage, d_in, block_offset, block_oob, reduction_op, thread_aggregate); - - // Compute the block-wide reduction (up to block_items threads have valid inputs) - SizeT block_items = block_oob - block_offset; - return BlockReduceT::Reduce(smem_storage.reduce, thread_aggregate, reduction_op, block_items); - } - } - - - /** - * \brief Consumes input tiles using a dynamic queue policy, computing a threadblock-wide reduction for thread0 using the specified binary reduction functor. - * - * The return value is undefined in threads other than thread0. - */ - template < - bool VECTORIZE_INPUT, - typename ReductionOp> - static __device__ __forceinline__ T ProcessTilesDynamic( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT num_items, - GridQueue &queue, - ReductionOp &reduction_op) - { - // Each thread block is statically assigned at some input, otherwise its - // block_aggregate will be undefined. - SizeT block_offset = blockIdx.x * TILE_ITEMS; - - if (block_offset + TILE_ITEMS <= num_items) - { - // We have a full tile to consume - T thread_aggregate; - ConsumeFullTile(smem_storage, d_in, block_offset, reduction_op, thread_aggregate); - - // Dynamically consume other tiles - SizeT even_share_base = gridDim.x * TILE_ITEMS; - - if (even_share_base < num_items) - { - // There are tiles left to consume - while (true) - { - // Dequeue up to TILE_ITEMS - if (threadIdx.x == 0) - { - smem_storage.block_offset = queue.Drain(TILE_ITEMS) + even_share_base; - } - - __syncthreads(); - - block_offset = smem_storage.block_offset; - - __syncthreads(); - - if (block_offset + TILE_ITEMS > num_items) - { - if (block_offset < num_items) - { - // We have less than a full tile to consume - ConsumePartialTile(smem_storage, d_in, block_offset, num_items, reduction_op, thread_aggregate); - } - - // No more work to do - break; - } - - // We have a full tile to consume - ConsumeFullTile(smem_storage, d_in, block_offset, reduction_op, 
thread_aggregate); - } - } - - // Compute the block-wide reduction (every thread has a valid input) - return BlockReduceT::Reduce(smem_storage.reduce, thread_aggregate, reduction_op); - } - else - { - // We have less than a full tile to consume - T thread_aggregate; - SizeT block_items = num_items - block_offset; - ConsumePartialTile(smem_storage, d_in, block_offset, num_items, reduction_op, thread_aggregate); - - // Compute the block-wide reduction (up to block_items threads have valid inputs) - return BlockReduceT::Reduce(smem_storage.reduce, thread_aggregate, reduction_op, block_items); - } - } - - -public: - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * \brief Consumes input tiles using an even-share policy, computing a threadblock-wide reduction for thread0 using the specified binary reduction functor. - * - * The return value is undefined in threads other than thread0. - */ - template - static __device__ __forceinline__ T ProcessTilesEvenShare( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT block_offset, - const SizeT &block_oob, - ReductionOp &reduction_op) - { - typedef VectorHelper VecHelper; - typedef typename VecHelper::Type VectorT; - - if ((IsPointer::VALUE) && - (TilesReducePolicy::VECTOR_LOAD_LENGTH > 1) && - (VecHelper::BUILT_IN) && - ((size_t(d_in) & (sizeof(VectorT) - 1)) == 0)) - { - return ProcessTilesEvenShare(smem_storage, d_in, block_offset, block_oob, reduction_op); - } - else - { - return ProcessTilesEvenShare(smem_storage, d_in, block_offset, block_oob, reduction_op); - } - } - - - /** - * \brief Consumes input tiles using a dynamic queue policy, computing a threadblock-wide reduction for thread0 using the specified binary reduction functor. - * - * The return value is undefined in threads other than thread0. 
- */ - template - static __device__ __forceinline__ T ProcessTilesDynamic( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT num_items, - GridQueue &queue, - ReductionOp &reduction_op) - { - typedef VectorHelper VecHelper; - typedef typename VecHelper::Type VectorT; - - if ((IsPointer::VALUE) && - (TilesReducePolicy::VECTOR_LOAD_LENGTH > 1) && - (VecHelper::BUILT_IN) && - ((size_t(d_in) & (sizeof(VectorT) - 1)) == 0)) - { - return ProcessTilesDynamic(smem_storage, d_in, num_items, queue, reduction_op); - } - else - { - return ProcessTilesDynamic(smem_storage, d_in, num_items, queue, reduction_op); - } - } - - - /** - * Specialized for GRID_MAPPING_EVEN_SHARE - */ - template - struct Mapping - { - template - static __device__ __forceinline__ T ProcessTiles( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT num_items, - GridEvenShare &even_share, - GridQueue &queue, - ReductionOp &reduction_op) - { - // Even share - even_share.BlockInit(); - - return ProcessTilesEvenShare(smem_storage, d_in, even_share.block_offset, even_share.block_oob, reduction_op); - } - - }; - - - /** - * Specialized for GRID_MAPPING_DYNAMIC - */ - template - struct Mapping - { - template - static __device__ __forceinline__ T ProcessTiles( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT num_items, - GridEvenShare &even_share, - GridQueue &queue, - ReductionOp &reduction_op) - { - // Dynamically dequeue - return ProcessTilesDynamic(smem_storage, d_in, num_items, queue, reduction_op); - } - - }; - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/cub/grid/grid_even_share.cuh b/cub/grid/grid_even_share.cuh index d3013285a8..136f4bed83 100644 --- a/cub/grid/grid_even_share.cuh +++ b/cub/grid/grid_even_share.cuh @@ -26,15 +26,11 @@ * ******************************************************************************/ -/****************************************************************************** - * Threadblock Work management. 
- * - * A given threadblock may receive one of three different amounts of - * work: "big", "normal", and "last". The big workloads are one - * grain greater than the normal, and the last workload - * does the extra work. - * - ******************************************************************************/ +/** + * \file + * cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). + */ + #pragma once @@ -56,7 +52,7 @@ namespace cub { /** - * \brief A descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). + * \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). * * \par Overview * GridEvenShare indicates which sections of input are to be mapped onto which threadblocks. @@ -87,17 +83,36 @@ private: public: /// Total number of input items - SizeT num_items; + SizeT num_items; /// Grid size in threadblocks - int grid_size; + int grid_size; /// Offset into input marking the beginning of the owning thread block's segment of input tiles - SizeT block_offset; + SizeT block_offset; /// Offset into input of marking the end (one-past) of the owning thread block's segment of input tiles SizeT block_oob; + /** + * \brief Block-based constructor for single-block grids. + */ + __device__ __forceinline__ GridEvenShare(SizeT num_items) : + num_items(num_items), + grid_size(1), + block_offset(0), + block_oob(num_items) {} + + + /** + * \brief Default constructor. Zero-initializes block-specific fields. 
+ */ + __host__ __device__ __forceinline__ GridEvenShare() : + num_items(0), + grid_size(0), + block_offset(0), + block_oob(0) {} + /** * \brief Initializes the grid-specific members \p num_items and \p grid_size. To be called prior prior to kernel launch) diff --git a/cub/grid/grid_mapping.cuh b/cub/grid/grid_mapping.cuh index ea4c6def89..a3e0b6d8b7 100644 --- a/cub/grid/grid_mapping.cuh +++ b/cub/grid/grid_mapping.cuh @@ -28,13 +28,13 @@ /** * \file - * cub::GridMappingStrategy enumerates alternative strategies for mapping - * constant-sized tiles of device-wide data onto a grid of CUDA thread - * blocks. + * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. */ #pragma once +#include "grid_even_share.cuh" +#include "grid_queue.cuh" #include "../util_namespace.cuh" /// Optional outer namespace(s) @@ -50,15 +50,18 @@ namespace cub { */ +/****************************************************************************** + * Mapping policies + *****************************************************************************/ + + /** - * GridMappingStrategy enumerates alternative strategies for mapping - * constant-sized tiles of device-wide data onto a grid of CUDA thread - * blocks. + * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. */ enum GridMappingStrategy { /** - * \brief An "even-share" strategy. + * \brief An "even-share" strategy for assigning input tiles to thread blocks. * * \par Overview * The input is evenly partitioned into \p p segments, where \p p is @@ -73,7 +76,7 @@ enum GridMappingStrategy GRID_MAPPING_EVEN_SHARE, /** - * \brief A dynamic "queue-based" strategy for commutative reduction operators. + * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. 
* * \par Overview * The input is treated as a queue to be dynamically consumed by a grid of @@ -87,6 +90,453 @@ enum GridMappingStrategy }; + +/****************************************************************************** + * Mapping engines + *****************************************************************************/ + +/** + * \brief Dispatches tiles of work from the given input range to the specified thread block abstraction. + * + * \par + * Expects the \p PersistentBlock type to have the following callback member functions: + * - Tile processing: + * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items); + * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile: + * - int TileItems() + * - Finalization: + * - void Finalize(Result &result); + * + * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing + * \tparam SizeT [inferred] Integral type used for global array indexing + * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance + */ + +template < + typename PersistentBlock, + typename SizeT, + typename Result> +__device__ __forceinline__ void ConsumeTiles( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT block_offset, ///< [in] Threadblock begin offset (inclusive) + SizeT block_oob, ///< [in] Threadblock end offset (exclusive) + Result &result) ///< [out] Result returned by tiles::Finalize() +{ + bool sync_after = true; + + // Number of items per tile that can be processed by tiles + int tile_items = persistent_block.TileItems(); + + // Consume any full tiles + while (block_offset + tile_items <= block_oob) + { + persistent_block.ConsumeTile(sync_after, block_offset, tile_items); + if (sync_after) __syncthreads(); + + block_offset += tile_items; + } + + // Consume any remaining input + if (block_offset < block_oob) + { + persistent_block.ConsumeTile(sync_after, block_offset, 
block_oob - block_offset); + if (sync_after) __syncthreads(); + } + + // Compute the block-wide reduction (every thread has a valid input) + persistent_block.Finalize(result); +} + + +/** + * \brief Uses a GridEvenShare descriptor to dispatch tiles of work to the specified thread block abstraction. (See GridMappingStrategy::GRID_MAPPING_EVEN_SHARE.) + * + * \par + * Expects the \p PersistentBlock type to have the following callback member functions: + * - Tile processing: + * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items); + * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile: + * - int TileItems() + * - Finalization: + * - void Finalize(Result &result); + * + * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing + * \tparam SizeT [inferred] Integral type used for global array indexing + * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance + */ + +template < + typename PersistentBlock, + typename SizeT, + typename Result> +__device__ __forceinline__ void ConsumeTiles( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT num_items, ///< [in] Total number of global input items + GridEvenShare &even_share, ///< [in] GridEvenShare descriptor + Result &result) ///< [out] Result returned by tiles::Finalize() +{ + even_share.BlockInit(); + ConsumeTiles(persistent_block, even_share.block_offset, even_share.block_oob, result); +} + + + +/** + * \brief Dispatches tiles of work from the given input range to the specified thread block abstraction. The first tile given to each thread block is flagged as such. 
+ * + * \par + * Expects the \p PersistentBlock type to have the following callback member functions: + * - Tile processing: + * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items, is_first_tile); + * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile: + * - int TileItems() + * - Finalization: + * - void Finalize(Result &result); + * + * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing + * \tparam SizeT [inferred] Integral type used for global array indexing + * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance + */ +template < + typename PersistentBlock, + typename SizeT, + typename Result> +__device__ __forceinline__ void ConsumeTilesFlagFirst( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT block_offset, ///< [in] Threadblock begin offset (inclusive) + SizeT block_oob, ///< [in] Threadblock end offset (exclusive) + Result &result) ///< [out] Result returned by tiles::Finalize() +{ + bool sync_after = true; + + // Number of items per tile that can be processed by tiles + int tile_items = persistent_block.TileItems(); + + if (block_offset + tile_items <= block_oob) + { + // We have at least one full tile to consume + persistent_block.ConsumeTile(sync_after, block_offset, tile_items, true); + if (sync_after) __syncthreads(); + + block_offset += tile_items; + + // Consume any other full tiles + while (block_offset + tile_items <= block_oob) + { + persistent_block.ConsumeTile(sync_after, block_offset, tile_items, false); + if (sync_after) __syncthreads(); + + block_offset += tile_items; + } + + // Consume any remaining input + if (block_offset < block_oob) + { + persistent_block.ConsumeTile(sync_after, block_offset, block_oob - block_offset, false); + if (sync_after) __syncthreads(); + } + } + else + { + // We have less than a full tile to consume + SizeT block_items = block_oob 
- block_offset; + + persistent_block.ConsumeTile(sync_after, block_offset, block_items, true); + if (sync_after) __syncthreads(); + } + + // Compute the block-wide reduction (every thread has a valid input) + persistent_block.Finalize(result); +} + + +/** + * \brief Uses a GridEvenShare descriptor to dispatch tiles of work to the specified thread block abstraction. The first tile given to each thread block is flagged as such. (See GridMappingStrategy::GRID_MAPPING_EVEN_SHARE.) + * + * \par + * Expects the \p PersistentBlock type to have the following callback member functions: + * - Tile processing: + * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items, is_first_tile); + * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile: + * - int TileItems() + * - Finalization: + * - void Finalize(Result &result); + * + * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing + * \tparam SizeT [inferred] Integral type used for global array indexing + * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance + */ +template < + typename PersistentBlock, + typename SizeT, + typename Result> +__device__ __forceinline__ void ConsumeTilesFlagFirst( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT num_items, ///< [in] Total number of global input items + GridEvenShare &even_share, ///< [in] GridEvenShare descriptor + Result &result) ///< [out] Result returned by tiles::Finalize() +{ + even_share.BlockInit(); + ConsumeTilesFlagFirst(persistent_block, even_share.block_offset, even_share.block_oob, result); +} + + + +/** + * \brief Uses a GridQueue descriptor to dispatch tiles of work to the specified thread block abstraction. (See GridMappingStrategy::GRID_MAPPING_DYNAMIC.) 
+ * + * \par + * Expects the \p PersistentBlock type to have the following callback member functions: + * - Tile processing: + * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items); + * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile: + * - int TileItems() + * - Finalization: + * - void Finalize(Result &result); + * + * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing + * \tparam SizeT [inferred] Integral type used for global array indexing + * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance + */ +template < + typename PersistentBlock, + typename SizeT, + typename Result> +__device__ __forceinline__ void ConsumeTiles( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT num_items, ///< [in] Total number of global input items + GridQueue &queue, ///< [in,out] GridQueue descriptor + Result &result) ///< [out] Result returned by tiles::Finalize() +{ + // Shared tile-processing offset obtained dynamically from queue + __shared__ SizeT dynamic_block_offset; + + bool sync_after = true; + + // Number of items per tile that can be processed by tiles + int tile_items = persistent_block.TileItems(); + + // There are tiles left to consume + while (true) + { + // Dequeue up to tile_items + if (threadIdx.x == 0) + { + dynamic_block_offset = queue.Drain(tile_items); + } + + __syncthreads(); + + SizeT block_offset = dynamic_block_offset; + + __syncthreads(); + + if (block_offset + tile_items > num_items) + { + if (block_offset < num_items) + { + // We have less than a full tile to consume + persistent_block.ConsumeTile(sync_after, block_offset, num_items - block_offset); + if (sync_after) __syncthreads(); + } + + // No more work to do + break; + } + + // We have a full tile to consume + persistent_block.ConsumeTile(sync_after, block_offset, tile_items); + } + + 
persistent_block.Finalize(result); +} + + + +/** + * \brief Uses a GridQueue descriptor to dispatch tiles of work to the specified thread block abstraction. The first tile given to each thread block is flagged as such. (See GridMappingStrategy::GRID_MAPPING_DYNAMIC.) + * + * \par + * Expects the \p PersistentBlock type to have the following callback member functions: + * - Tile processing: + * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items, is_first_tile); + * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile: + * - int TileItems() + * - Finalization: + * - void Finalize(Result &result); + * + * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing + * \tparam SizeT [inferred] Integral type used for global array indexing + * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance + */ +template < + typename PersistentBlock, + typename SizeT, + typename Result> +__device__ __forceinline__ void ConsumeTilesFlagFirst( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT num_items, ///< [in] Total number of global input items + GridQueue &queue, ///< [in,out] GridQueue descriptor + Result &result) ///< [out] Result returned by tiles::Finalize() +{ + // Shared tile-processing offset obtained dynamically from queue + __shared__ SizeT dynamic_block_offset; + + bool sync_after = true; + + // Number of items per tile that can be processed by tiles + int tile_items = persistent_block.TileItems(); + + // We give each thread block at least one tile of input. 
+ SizeT block_offset = blockIdx.x * tile_items; + + // Check if we have a full tile to consume + if (block_offset + tile_items <= num_items) + { + persistent_block.ConsumeTile(sync_after, block_offset, tile_items, true); + if (sync_after) __syncthreads(); + + // Now that every block in the kernel has gotten a tile, attempt to dynamically consume any remaining + SizeT even_share_base = gridDim.x * tile_items; + if (even_share_base < num_items) + { + // There are tiles left to consume + while (true) + { + // Dequeue up to tile_items + if (threadIdx.x == 0) + { + dynamic_block_offset = queue.Drain(tile_items) + even_share_base; + } + + __syncthreads(); + + block_offset = dynamic_block_offset; + + __syncthreads(); + + if (block_offset + tile_items > num_items) + { + if (block_offset < num_items) + { + // We have less than a full tile to consume + persistent_block.ConsumeTile(sync_after, block_offset, num_items - block_offset, false); + if (sync_after) __syncthreads(); + } + + // No more work to do + break; + } + + // We have a full tile to consume + persistent_block.ConsumeTile(sync_after, block_offset, tile_items, false); + } + } + } + else + { + // We have less than a full tile to consume + persistent_block.ConsumeTile(sync_after, block_offset, num_items - block_offset, true); + if (sync_after) __syncthreads(); + } + + // Compute the block-wide reduction (every thread has a valid input) + persistent_block.Finalize(result); + +} + + + +/****************************************************************************** + * Type-directed dispatch to mapping engines + *****************************************************************************/ + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * \brief Dispatch helper for statically selecting between mapping strategies (e.g., to avoid compiling an alternative that is invaild for a given architecture) + */ +template +struct GridMapping; + +/** + * Even-share specialization of GridMapping + */ +template<> 
+struct GridMapping +{ + template < + typename PersistentBlock, + typename SizeT, + typename Result> + static __device__ __forceinline__ void ConsumeTiles( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT num_items, ///< [in] Total number of global input items + GridEvenShare &even_share, ///< [in] GridEvenShare descriptor + GridQueue &queue, ///< [in,out] GridQueue descriptor + Result &result) ///< [out] Result returned by tiles::Finalize() + { + cub::ConsumeTiles(persistent_block, num_items, even_share, result); + } + + template < + typename PersistentBlock, + typename SizeT, + typename Result> + static __device__ __forceinline__ void ConsumeTilesFlagFirst( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT num_items, ///< [in] Total number of global input items + GridEvenShare &even_share, ///< [in] GridEvenShare descriptor + GridQueue &queue, ///< [in,out] GridQueue descriptor + Result &result) ///< [out] Result returned by tiles::Finalize() + { + cub::ConsumeTilesFlagFirst(persistent_block, num_items, even_share, result); + } +}; + + +/** + * Even-share specialization of GridMapping + */ +template<> +struct GridMapping +{ + template < + typename PersistentBlock, + typename SizeT, + typename Result> + static __device__ __forceinline__ void ConsumeTiles( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT num_items, ///< [in] Total number of global input items + GridEvenShare &even_share, ///< [in] GridEvenShare descriptor + GridQueue &queue, ///< [in,out] GridQueue descriptor + Result &result) ///< [out] Result returned by tiles::Finalize() + { + cub::ConsumeTiles(persistent_block, num_items, queue, result); + } + + template < + typename PersistentBlock, + typename SizeT, + typename Result> + static __device__ __forceinline__ void ConsumeTilesFlagFirst( + PersistentBlock &persistent_block, ///< [in,out] 
Threadblock abstraction for tile processing + SizeT num_items, ///< [in] Total number of global input items + GridEvenShare &even_share, ///< [in] GridEvenShare descriptor + GridQueue &queue, ///< [in,out] GridQueue descriptor + Result &result) ///< [out] Result returned by tiles::Finalize() + { + cub::ConsumeTilesFlagFirst(persistent_block, num_items, queue, result); + } +}; + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + /** @} */ // end group GridModule } // CUB namespace diff --git a/cub/grid/grid_queue.cuh b/cub/grid/grid_queue.cuh index 3ca31b5a1a..daec348ea5 100644 --- a/cub/grid/grid_queue.cuh +++ b/cub/grid/grid_queue.cuh @@ -26,9 +26,10 @@ * ******************************************************************************/ -/****************************************************************************** - * Abstraction for grid-wide queue management - ******************************************************************************/ +/** + * \file + * cub::GridQueue is a descriptor utility for dynamic queue management. + */ #pragma once @@ -52,7 +53,7 @@ namespace cub { /** - * \brief Abstraction for grid-wide queue management. + * \brief GridQueue is a descriptor utility for dynamic queue management. * * \par Overview * GridQueue descriptors provides abstractions for "filling" or @@ -77,7 +78,8 @@ namespace cub { * \par * Iterative work management can be implemented simply with a pair of flip-flopping * work buffers, each with an associated set of fill and drain GridQueue descriptors. 
- + * + * \tparam SizeT Integer type for array indexing */ template class GridQueue diff --git a/cub/thread/thread_load.cuh b/cub/thread/thread_load.cuh index e36fbe9341..a07c758c27 100644 --- a/cub/thread/thread_load.cuh +++ b/cub/thread/thread_load.cuh @@ -161,6 +161,26 @@ struct ThreadLoadDispatch }; + +/** + * Generic PTX_LOAD_CG specialization for SM10-SM13 architectures + */ +#if CUB_PTX_ARCH < 200 +template <> +struct ThreadLoadDispatch +{ + // Iterator + template + static __device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorRA itr) + { + // Straightforward dereference + return *itr; + } +}; +#endif // CUB_PTX_ARCH < 200 + + + #endif // DOXYGEN_SHOULD_SKIP_THIS /** @@ -456,6 +476,7 @@ __device__ __forceinline__ typename std::iterator_traits::value /** * Expand ThreadLoad() implementations for primitive types. */ +#if CUB_PTX_ARCH >= 200 // Signed CUB_LOADS_0124(char, char, short, s8, h) @@ -491,6 +512,7 @@ CUB_LOADS_4L(double4, double2); CUB_LOADS_0124(unsigned long, ulong, unsigned long, u32, r) #endif +#endif // CUB_PTX_ARCH >= 200 /** * Undefine macros diff --git a/cub/thread/thread_store.cuh b/cub/thread/thread_store.cuh index 0e524893ca..83de8d789c 100644 --- a/cub/thread/thread_store.cuh +++ b/cub/thread/thread_store.cuh @@ -214,54 +214,74 @@ __device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, const T& val) /** * Define a global ThreadStore() specialization for type */ -#define CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \ - template<> \ - void ThreadStore(type* ptr, const type& val) \ - { \ - const asm_type raw = reinterpret_cast(val); \ - asm volatile ("st.global."#ptx_modifier"."#ptx_type" [%0], %1;" : : \ - _CUB_ASM_PTR_(ptr), \ - #reg_mod(raw)); \ +#define CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \ + template<> \ + void ThreadStore(type* ptr, const type& val) \ + { \ + const asm_type raw = reinterpret_cast(val); \ + asm 
volatile ("st.global."#ptx_modifier"."#ptx_type" [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + #reg_mod(raw)); \ } /** * Define a global ThreadStore() specialization for the vector-1 type */ #define CUB_G_STORE_1(type, component_type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \ - template<> \ - void ThreadStore(type* ptr, const type& val) \ - { \ - const asm_type raw_x = reinterpret_cast(val.x); \ - asm volatile ("st.global."#ptx_modifier"."#ptx_type" [%0], %1;" : : \ - _CUB_ASM_PTR_(ptr), \ - #reg_mod(raw_x)); \ + template<> \ + void ThreadStore(type* ptr, const type& val) \ + { \ + const asm_type raw_x = reinterpret_cast(val.x); \ + asm volatile ("st.global."#ptx_modifier"."#ptx_type" [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + #reg_mod(raw_x)); \ } /** - * Define a volatile-shared ThreadStore() specialization for the vector-1 type + * Define a global ThreadStore() specialization for the vector-2 type */ -#define CUB_VS_STORE_1(type, component_type, asm_type, ptx_type, reg_mod) \ - template<> \ - void ThreadStore(type* ptr, const type& val) \ - { \ - ThreadStore( \ - (asm_type*) ptr, \ - reinterpret_cast(val.x)); \ +#define CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \ + template<> \ + void ThreadStore(type* ptr, const type& val) \ + { \ + const asm_type raw_x = reinterpret_cast(val.x); \ + const asm_type raw_y = reinterpret_cast(val.y); \ + asm volatile ("st.global."#ptx_modifier".v2."#ptx_type" [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + #reg_mod(raw_x), \ + #reg_mod(raw_y)); \ } /** - * Define a global ThreadStore() specialization for the vector-2 type + * Define a global ThreadStore() specialization for the vector-4 type */ -#define CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \ - template<> \ - void ThreadStore(type* ptr, const type& val) \ - { \ - const asm_type raw_x = reinterpret_cast(val.x); \ - const asm_type raw_y = reinterpret_cast(val.y); 
\ - asm volatile ("st.global."#ptx_modifier".v2."#ptx_type" [%0], {%1, %2};" : : \ - _CUB_ASM_PTR_(ptr), \ - #reg_mod(raw_x), \ - #reg_mod(raw_y)); \ +#define CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \ + template<> \ + void ThreadStore(type* ptr, const type& val) \ + { \ + const asm_type raw_x = reinterpret_cast(val.x); \ + const asm_type raw_y = reinterpret_cast(val.y); \ + const asm_type raw_z = reinterpret_cast(val.z); \ + const asm_type raw_w = reinterpret_cast(val.w); \ + asm volatile ("st.global."#ptx_modifier".v4."#ptx_type" [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + #reg_mod(raw_x), \ + #reg_mod(raw_y), \ + #reg_mod(raw_z), \ + #reg_mod(raw_w)); \ + } + + +/** + * Define a volatile-shared ThreadStore() specialization for the vector-1 type + */ +#define CUB_VS_STORE_1(type, component_type, asm_type, ptx_type, reg_mod) \ + template<> \ + void ThreadStore(type* ptr, const type& val) \ + { \ + ThreadStore( \ + (asm_type*) ptr, \ + reinterpret_cast(val.x)); \ } /** @@ -269,48 +289,29 @@ __device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, const T& val) * Performs separate references if the component_type is only 1 byte (otherwise we lose * performance due to the bitfield ops to disassemble the value) */ -#define CUB_VS_STORE_2(type, component_type, asm_type, ptx_type, reg_mod) \ - template<> \ - void ThreadStore(type* ptr, const type& val) \ - { \ - if ((sizeof(component_type) == 1) || (CUDA_VERSION < 4100)) \ - { \ - component_type *base_ptr = (component_type*) ptr; \ +#define CUB_VS_STORE_2(type, component_type, asm_type, ptx_type, reg_mod) \ + template<> \ + void ThreadStore(type* ptr, const type& val) \ + { \ + if ((sizeof(component_type) == 1) || (CUDA_VERSION < 4100)) \ + { \ + component_type *base_ptr = (component_type*) ptr; \ ThreadStore(base_ptr, (component_type) val.x); \ ThreadStore(base_ptr + 1, (component_type) val.y); \ - } \ - else \ - { \ - const asm_type raw_x = 
reinterpret_cast(val.x); \ - const asm_type raw_y = reinterpret_cast(val.y); \ - asm volatile ("{" \ + } \ + else \ + { \ + const asm_type raw_x = reinterpret_cast(val.x); \ + const asm_type raw_y = reinterpret_cast(val.y); \ + asm volatile ("{" \ " .reg ."_CUB_ASM_PTR_SIZE_" t1;" \ - " cvta.to.shared."_CUB_ASM_PTR_SIZE_" t1, %0;" \ - " st.shared.volatile.v2."#ptx_type" [t1], {%1, %2};" \ - "}" : : \ - _CUB_ASM_PTR_(ptr), \ - #reg_mod(raw_x), \ - #reg_mod(raw_y)); \ - } \ - } - -/** - * Define a global ThreadStore() specialization for the vector-4 type - */ -#define CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \ - template<> \ - void ThreadStore(type* ptr, const type& val) \ - { \ - const asm_type raw_x = reinterpret_cast(val.x); \ - const asm_type raw_y = reinterpret_cast(val.y); \ - const asm_type raw_z = reinterpret_cast(val.z); \ - const asm_type raw_w = reinterpret_cast(val.w); \ - asm volatile ("st.global."#ptx_modifier".v4."#ptx_type" [%0], {%1, %2, %3, %4};" : : \ - _CUB_ASM_PTR_(ptr), \ - #reg_mod(raw_x), \ - #reg_mod(raw_y), \ - #reg_mod(raw_z), \ - #reg_mod(raw_w)); \ + " cvta.to.shared."_CUB_ASM_PTR_SIZE_" t1, %0;" \ + " st.shared.volatile.v2."#ptx_type" [t1], {%1, %2};" \ + "}" : : \ + _CUB_ASM_PTR_(ptr), \ + #reg_mod(raw_x), \ + #reg_mod(raw_y)); \ + } \ } /** @@ -318,105 +319,105 @@ __device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, const T& val) * Performs separate references if the component_type is only 1 byte (otherwise we lose * performance due to the bitfield ops to disassemble the value) */ -#define CUB_VS_STORE_4(type, component_type, asm_type, ptx_type, reg_mod) \ - template<> \ - void ThreadStore(type* ptr, const type& val) \ - { \ - if ((sizeof(component_type) == 1) || (CUDA_VERSION < 4100)) \ - { \ - component_type *base_ptr = (component_type*) ptr; \ +#define CUB_VS_STORE_4(type, component_type, asm_type, ptx_type, reg_mod) \ + template<> \ + void ThreadStore(type* ptr, const 
type& val) \ + { \ + if ((sizeof(component_type) == 1) || (CUDA_VERSION < 4100)) \ + { \ + component_type *base_ptr = (component_type*) ptr; \ ThreadStore(base_ptr, (component_type) val.x); \ ThreadStore(base_ptr + 1, (component_type) val.y); \ ThreadStore(base_ptr + 2, (component_type) val.z); \ ThreadStore(base_ptr + 3, (component_type) val.w); \ - } \ - else \ - { \ - const asm_type raw_x = reinterpret_cast(val.x); \ - const asm_type raw_y = reinterpret_cast(val.y); \ - const asm_type raw_z = reinterpret_cast(val.z); \ - const asm_type raw_w = reinterpret_cast(val.w); \ - asm volatile ("{" \ + } \ + else \ + { \ + const asm_type raw_x = reinterpret_cast(val.x); \ + const asm_type raw_y = reinterpret_cast(val.y); \ + const asm_type raw_z = reinterpret_cast(val.z); \ + const asm_type raw_w = reinterpret_cast(val.w); \ + asm volatile ("{" \ " .reg ."_CUB_ASM_PTR_SIZE_" t1;" \ - " cvta.to.shared."_CUB_ASM_PTR_SIZE_" t1, %0;" \ - " st.volatile.shared.v4."#ptx_type" [t1], {%1, %2, %3, %4};" \ - "}" : : \ - _CUB_ASM_PTR_(ptr), \ - #reg_mod(raw_x), \ - #reg_mod(raw_y), \ - #reg_mod(raw_z), \ - #reg_mod(raw_w)); \ - } \ + " cvta.to.shared."_CUB_ASM_PTR_SIZE_" t1, %0;" \ + " st.volatile.shared.v4."#ptx_type" [t1], {%1, %2, %3, %4};" \ + "}" : : \ + _CUB_ASM_PTR_(ptr), \ + #reg_mod(raw_x), \ + #reg_mod(raw_y), \ + #reg_mod(raw_z), \ + #reg_mod(raw_w)); \ + } \ } /** * Define a ThreadStore() specialization for the 64-bit vector-4 type. * Uses two vector-2 Stores. 
*/ -#define CUB_STORE_4L(type, half_type, cub_modifier) \ - template<> \ - void ThreadStore(type* ptr, const type& val) \ - { \ - const half_type* half_val = reinterpret_cast(&val); \ - half_type* half_ptr = reinterpret_cast(ptr); \ - ThreadStore(half_ptr, half_val[0]); \ - ThreadStore(half_ptr + 1, half_val[1]); \ +#define CUB_STORE_4L(type, half_type, cub_modifier) \ + template<> \ + void ThreadStore(type* ptr, const type& val) \ + { \ + const half_type* half_val = reinterpret_cast(&val); \ + half_type* half_ptr = reinterpret_cast(ptr); \ + ThreadStore(half_ptr, half_val[0]); \ + ThreadStore(half_ptr + 1, half_val[1]); \ } /** * Define ThreadStore() specializations for the (non-vector) type */ -#define CUB_STORES_0(type, asm_type, ptx_type, reg_mod) \ - CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, PTX_STORE_WB, wb) \ - CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, PTX_STORE_CG, cg) \ - CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, PTX_STORE_CS, cs) \ +#define CUB_STORES_0(type, asm_type, ptx_type, reg_mod) \ + CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, PTX_STORE_WB, wb) \ + CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, PTX_STORE_CG, cg) \ + CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, PTX_STORE_CS, cs) \ CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, PTX_STORE_WT, wt) /** * Define ThreadStore() specializations for the vector-1 component_type */ -#define CUB_STORES_1(type, component_type, asm_type, ptx_type, reg_mod) \ - CUB_VS_STORE_1(type, component_type, asm_type, ptx_type, reg_mod) \ - CUB_G_STORE_1(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_WB, wb) \ - CUB_G_STORE_1(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CG, cg) \ - CUB_G_STORE_1(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CS, cs) \ +#define CUB_STORES_1(type, component_type, asm_type, ptx_type, reg_mod) \ + CUB_VS_STORE_1(type, component_type, asm_type, ptx_type, reg_mod) \ + CUB_G_STORE_1(type, component_type, asm_type, 
ptx_type, reg_mod, PTX_STORE_WB, wb) \ + CUB_G_STORE_1(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CG, cg) \ + CUB_G_STORE_1(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CS, cs) \ CUB_G_STORE_1(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_WT, wt) /** * Define ThreadStore() specializations for the vector-2 component_type */ -#define CUB_STORES_2(type, component_type, asm_type, ptx_type, reg_mod) \ - CUB_VS_STORE_2(type, component_type, asm_type, ptx_type, reg_mod) \ - CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_WB, wb) \ - CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CG, cg) \ - CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CS, cs) \ +#define CUB_STORES_2(type, component_type, asm_type, ptx_type, reg_mod) \ + CUB_VS_STORE_2(type, component_type, asm_type, ptx_type, reg_mod) \ + CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_WB, wb) \ + CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CG, cg) \ + CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CS, cs) \ CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_WT, wt) /** * Define ThreadStore() specializations for the vector-4 component_type */ -#define CUB_STORES_4(type, component_type, asm_type, ptx_type, reg_mod) \ - CUB_VS_STORE_4(type, component_type, asm_type, ptx_type, reg_mod) \ - CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_WB, wb) \ - CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CG, cg) \ - CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CS, cs) \ +#define CUB_STORES_4(type, component_type, asm_type, ptx_type, reg_mod) \ + CUB_VS_STORE_4(type, component_type, asm_type, ptx_type, reg_mod) \ + CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_WB, wb) \ + 
CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CG, cg) \ + CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CS, cs) \ CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_WT, wt) /** * Define ThreadStore() specializations for the 256-bit vector-4 component_type */ -#define CUB_STORES_4L(type, half_type) \ - CUB_STORE_4L(type, half_type, PTX_STORE_VS) \ - CUB_STORE_4L(type, half_type, PTX_STORE_WB) \ - CUB_STORE_4L(type, half_type, PTX_STORE_CG) \ - CUB_STORE_4L(type, half_type, PTX_STORE_CS) \ +#define CUB_STORES_4L(type, half_type) \ + CUB_STORE_4L(type, half_type, PTX_STORE_VS) \ + CUB_STORE_4L(type, half_type, PTX_STORE_WB) \ + CUB_STORE_4L(type, half_type, PTX_STORE_CG) \ + CUB_STORE_4L(type, half_type, PTX_STORE_CS) \ CUB_STORE_4L(type, half_type, PTX_STORE_WT) /** * Define vector-0/1/2 ThreadStore() specializations for the component type */ -#define CUB_STORES_012(component_type, vec_prefix, asm_type, ptx_type, reg_mod) \ - CUB_STORES_0(component_type, asm_type, ptx_type, reg_mod) \ +#define CUB_STORES_012(component_type, vec_prefix, asm_type, ptx_type, reg_mod) \ + CUB_STORES_0(component_type, asm_type, ptx_type, reg_mod) \ CUB_STORES_1(vec_prefix##1, component_type, asm_type, ptx_type, reg_mod) \ CUB_STORES_2(vec_prefix##2, component_type, asm_type, ptx_type, reg_mod) @@ -424,12 +425,15 @@ __device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, const T& val) * Define vector-0/1/2/4 ThreadStore() specializations for the component type */ #define CUB_STORES_0124(component_type, vec_prefix, asm_type, ptx_type, reg_mod) \ - CUB_STORES_012(component_type, vec_prefix, asm_type, ptx_type, reg_mod) \ + CUB_STORES_012(component_type, vec_prefix, asm_type, ptx_type, reg_mod) \ CUB_STORES_4(vec_prefix##4, component_type, asm_type, ptx_type, reg_mod) /** * Expand ThreadStore() implementations for primitive types. 
*/ + +#if CUB_PTX_ARCH >= 200 + // Signed CUB_STORES_0124(char, char, short, s8, h) CUB_STORES_0(signed char, short, s8, h) @@ -464,6 +468,8 @@ CUB_STORES_4L(double4, double2); CUB_STORES_0124(unsigned long, ulong, unsigned long, u32, r) #endif +#endif // CUB_PTX_ARCH >= 200 + /** * Undefine macros diff --git a/cub/util_allocator.cuh b/cub/util_allocator.cuh index c30e9ac78a..2fa664dbc7 100644 --- a/cub/util_allocator.cuh +++ b/cub/util_allocator.cuh @@ -180,20 +180,20 @@ struct CachingDeviceAllocator : DeviceAllocator */ struct BlockDescriptor { - DeviceOrdinal device; // device ordinal + int device; // device ordinal void* d_ptr; // Device pointer size_t bytes; // Size of allocation in bytes unsigned int bin; // Bin enumeration // Constructor - BlockDescriptor(void *d_ptr, DeviceOrdinal device) : + BlockDescriptor(void *d_ptr, int device) : d_ptr(d_ptr), bytes(0), bin(0), device(device) {} // Constructor - BlockDescriptor(size_t bytes, unsigned int bin, DeviceOrdinal device) : + BlockDescriptor(size_t bytes, unsigned int bin, int device) : d_ptr(NULL), bytes(bytes), bin(bin), @@ -234,7 +234,7 @@ struct CachingDeviceAllocator : DeviceAllocator typedef std::multiset BusyBlocks; /// Map type of device ordinals to the number of cached bytes cached by each device - typedef std::map GpuCachedBytes; + typedef std::map GpuCachedBytes; //--------------------------------------------------------------------- @@ -355,7 +355,7 @@ struct CachingDeviceAllocator : DeviceAllocator __host__ __device__ __forceinline__ cudaError_t DeviceAllocate( void** d_ptr, size_t bytes, - DeviceOrdinal device) + int device) { #ifdef __CUDA_ARCH__ // Caching functionality only defined on host @@ -363,7 +363,7 @@ struct CachingDeviceAllocator : DeviceAllocator #else bool locked = false; - DeviceOrdinal entrypoint_device = INVALID_DEVICE_ORDINAL; + int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; // Round up to nearest bin size @@ -472,7 +472,7 @@ struct 
CachingDeviceAllocator : DeviceAllocator #else cudaError_t error = cudaSuccess; do { - DeviceOrdinal current_device; + int current_device; if (CubDebug(error = cudaGetDevice(&current_device))) break; if (CubDebug(error = DeviceAllocate(d_ptr, bytes, current_device))) break; } while(0); @@ -488,7 +488,7 @@ struct CachingDeviceAllocator : DeviceAllocator */ __host__ __device__ __forceinline__ cudaError_t DeviceFree( void* d_ptr, - DeviceOrdinal device) + int device) { #ifdef __CUDA_ARCH__ // Caching functionality only defined on host @@ -496,7 +496,7 @@ struct CachingDeviceAllocator : DeviceAllocator #else bool locked = false; - DeviceOrdinal entrypoint_device = INVALID_DEVICE_ORDINAL; + int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; BlockDescriptor search_key(d_ptr, device); @@ -581,7 +581,7 @@ struct CachingDeviceAllocator : DeviceAllocator return CubDebug(cudaErrorInvalidConfiguration); #else - DeviceOrdinal current_device; + int current_device; cudaError_t error = cudaSuccess; do { @@ -607,8 +607,8 @@ struct CachingDeviceAllocator : DeviceAllocator cudaError_t error = cudaSuccess; bool locked = false; - DeviceOrdinal entrypoint_device = INVALID_DEVICE_ORDINAL; - DeviceOrdinal current_device = INVALID_DEVICE_ORDINAL; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + int current_device = INVALID_DEVICE_ORDINAL; // Lock if (!locked) { @@ -723,7 +723,7 @@ __host__ __device__ __forceinline__ cudaError_t DeviceAllocate( { if (device_allocator == NULL) { - #if !CUB_CNP_ENABLED + #ifndef CUB_RUNTIME_ENABLED // CUDA API not supported from this device return CubDebug(cudaErrorInvalidConfiguration); #else @@ -744,7 +744,7 @@ __host__ __device__ __forceinline__ cudaError_t DeviceFree( { if (device_allocator == NULL) { - #if !CUB_CNP_ENABLED + #ifndef CUB_RUNTIME_ENABLED // CUDA API not supported from this device return CubDebug(cudaErrorInvalidConfiguration); #else diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh index ec89a00d0c..2582c80cfa 
100644 --- a/cub/util_arch.cuh +++ b/cub/util_arch.cuh @@ -70,10 +70,8 @@ namespace cub { /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. -#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)) -#define CUB_CNP_ENABLED 1 -#else -#define CUB_CNP_ENABLED 0 +#if !defined(CUB_RUNTIME_ENABLED) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)) +#define CUB_RUNTIME_ENABLED #endif diff --git a/cub/util_debug.cuh b/cub/util_debug.cuh index f8ae3ae46f..df3077723b 100644 --- a/cub/util_debug.cuh +++ b/cub/util_debug.cuh @@ -75,7 +75,7 @@ __host__ __device__ __forceinline__ cudaError_t Debug( if (error && !silent) { #if (CUB_PTX_ARCH == 0) - printf("CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); + fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); fflush(stderr); #elif (CUB_PTX_ARCH >= 200) printf("CUDA error %d [block %d, thread %d, %s, %d]\n", error, blockIdx.x, threadIdx.x, filename, line); diff --git a/cub/util_device.cuh b/cub/util_device.cuh index 5e9a5f5179..97cbedd550 100644 --- a/cub/util_device.cuh +++ b/cub/util_device.cuh @@ -58,11 +58,6 @@ template __global__ void EmptyKernel(void) { } -/** - * \brief Type for representing GPU device ordinals - */ -typedef int DeviceOrdinal; - /// Invalid device ordinal enum { @@ -75,7 +70,7 @@ enum */ __host__ __device__ __forceinline__ cudaError_t PtxVersion(int &ptx_version) { -#if !CUB_CNP_ENABLED +#ifndef CUB_RUNTIME_ENABLED // CUDA API calls not supported from this device return cudaErrorInvalidConfiguration; @@ -166,7 +161,7 @@ public: __host__ __device__ __forceinline__ cudaError_t Init(int device_ordinal) { - #if !CUB_CNP_ENABLED + #ifndef CUB_RUNTIME_ENABLED // CUDA API calls not supported from this device return CubDebug(cudaErrorInvalidConfiguration); @@ -210,7 +205,7 @@ public: __host__ __device__ __forceinline__ cudaError_t Init() { - #if 
!CUB_CNP_ENABLED + #ifndef CUB_RUNTIME_ENABLED // CUDA API calls not supported from this device return CubDebug(cudaErrorInvalidConfiguration); @@ -240,7 +235,7 @@ public: KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy int block_threads) ///< [in] Number of threads per thread block { - #if !CUB_CNP_ENABLED + #ifndef CUB_RUNTIME_ENABLED // CUDA API calls not supported from this device return CubDebug(cudaErrorInvalidConfiguration); @@ -254,30 +249,49 @@ public: cudaFuncAttributes kernel_attrs; if (CubDebug(error = cudaFuncGetAttributes(&kernel_attrs, kernel_ptr))) break; + // Number of warps per threadblock int block_warps = (block_threads + warp_threads - 1) / warp_threads; - int block_allocated_warps = CUB_ROUND_UP_NEAREST(block_warps, warp_alloc_unit); - - int block_allocated_regs = (regs_by_block) ? - CUB_ROUND_UP_NEAREST( - block_allocated_warps * kernel_attrs.numRegs * warp_threads, - reg_alloc_unit) : - block_allocated_warps * CUB_ROUND_UP_NEAREST( - kernel_attrs.numRegs * warp_threads, - reg_alloc_unit); - + // Max warp occupancy + int max_warp_occupancy = (block_warps > 0) ? 
+ max_sm_warps / block_warps : + max_sm_blocks; + + // Maximum register occupancy + int max_reg_occupancy; + if ((block_threads == 0) || (kernel_attrs.numRegs == 0)) + { + // Prevent divide-by-zero + max_reg_occupancy = max_sm_blocks; + } + else if (regs_by_block) + { + // Allocates registers by threadblock + int block_regs = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads * block_warps, reg_alloc_unit); + max_reg_occupancy = max_sm_registers / block_regs; + } + else + { + // Allocates registers by warp + int sm_sides = warp_alloc_unit; + int sm_registers_per_side = max_sm_registers / sm_sides; + int regs_per_warp = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads, reg_alloc_unit); + int warps_per_side = sm_registers_per_side / regs_per_warp; + int warps = warps_per_side * sm_sides; + max_reg_occupancy = warps / block_warps; + } + + // Shared memory per threadblock int block_allocated_smem = CUB_ROUND_UP_NEAREST( kernel_attrs.sharedSizeBytes, smem_alloc_unit); - int max_warp_occupancy = max_sm_warps / block_warps; - + // Max shared memory occupancy int max_smem_occupancy = (block_allocated_smem > 0) ? - (smem_bytes / block_allocated_smem) : - max_sm_blocks; - - int max_reg_occupancy = max_sm_registers / block_allocated_regs; + (smem_bytes / block_allocated_smem) : + max_sm_blocks; + // Max occupancy max_sm_occupancy = CUB_MIN( CUB_MIN(max_sm_blocks, max_warp_occupancy), CUB_MIN(max_smem_occupancy, max_reg_occupancy)); diff --git a/cub/util_type.cuh b/cub/util_type.cuh index c97aa2c248..fff020cef8 100644 --- a/cub/util_type.cuh +++ b/cub/util_type.cuh @@ -34,6 +34,7 @@ #pragma once #include +#include #include "util_namespace.cuh" @@ -499,7 +500,7 @@ template struct NumericTraits : BaseTraits struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits<(std::numeric_limits::is_signed) ? 
SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char> {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; diff --git a/cub/warp/warp_scan.cuh b/cub/warp/warp_scan.cuh index 89cd6107d7..fbbe980258 100644 --- a/cub/warp/warp_scan.cuh +++ b/cub/warp/warp_scan.cuh @@ -205,24 +205,21 @@ private: * Constants and typedefs ******************************************************************************/ - /// WarpScan algorithmic variants - enum WarpScanPolicy - { - SHFL_SCAN, // Warp-synchronous SHFL-based scan - SMEM_SCAN, // Warp-synchronous smem-based scan - }; - /// Constants enum { POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), }; + /// WarpScan algorithmic variants (would use an enum, but it causes GCC crash as of CUDA5) + static const int SHFL_SCAN = 0; // Warp-synchronous SHFL-based scan + static const int SMEM_SCAN = 1; // Warp-synchronous smem-based scan + /// Use SHFL_SCAN if (architecture is >= SM30) and (T is a primitive) and (T is 4-bytes or smaller) and (LOGICAL_WARP_THREADS is a power-of-two) - static const WarpScanPolicy POLICY = ((CUB_PTX_ARCH >= 300) && Traits::PRIMITIVE && (sizeof(T) <= 4) && POW_OF_TWO) ? - SHFL_SCAN : - SMEM_SCAN; + static const int POLICY = ((CUB_PTX_ARCH >= 300) && Traits::PRIMITIVE && (sizeof(T) <= 4) && POW_OF_TWO) ? + SHFL_SCAN : + SMEM_SCAN; @@ -512,9 +509,6 @@ private: { /// Warpscan layout: 1.5 warps-worth of elements for each warp. T warp_scan[WARPS][WARP_SMEM_ELEMENTS]; - - /// Single variable for broadcasting aggregate, etc. 
- T broadcast; }; @@ -522,33 +516,33 @@ private: static __device__ __forceinline__ T Broadcast( SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage T input, ///< [in] The value to broadcast - unsigned int src_lane) ///< [in] Which warp lane is to do the broacasting + unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting { unsigned int lane_id = (WARPS == 1) ? threadIdx.x : (threadIdx.x & (LOGICAL_WARP_THREADS - 1)); + unsigned int warp_id = (WARPS == 1) ? 0 : (threadIdx.x / LOGICAL_WARP_THREADS); if (lane_id == src_lane) { - ThreadStore(&smem_storage.broadcast, input); + ThreadStore(smem_storage.warp_scan[warp_id], input); } - return ThreadLoad(&smem_storage.broadcast); +#if (CUB_PTX_ARCH <= 110) + __threadfence_block(); +#endif + return ThreadLoad(smem_storage.warp_scan[warp_id]); } - /// Basic inclusive scan + + + /// Basic inclusive scan iteration (template unrolled, inductive-case specialization) template < bool HAS_IDENTITY, bool SHARE_FINAL, - typename ScanOp> - static __device__ __forceinline__ T BasicScan( - SmemStorage &smem_storage, ///< Reference to shared memory allocation having layout type SmemStorage - unsigned int warp_id, ///< Warp id - unsigned int lane_id, ///< thread-lane id - T partial, ///< Calling thread's input partial reduction - ScanOp scan_op) ///< Binary associative scan functor + int STEP> + struct Iteration { - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) + template + static __device__ __forceinline__ void ScanStep(SmemStorage &smem_storage, unsigned int warp_id, unsigned int lane_id, T &partial, ScanOp scan_op) { const int OFFSET = 1 << STEP; @@ -561,8 +555,37 @@ private: T addend = ThreadLoad(&smem_storage.warp_scan[warp_id][HALF_WARP_THREADS + lane_id - OFFSET]); partial = scan_op(addend, partial); } + + Iteration::ScanStep(smem_storage, warp_id, lane_id, partial, scan_op); } + }; + + /// Basic inclusive scan 
iteration (template unrolled, base-case specialization) + template < + bool HAS_IDENTITY, + bool SHARE_FINAL> + struct Iteration + { + template + static __device__ __forceinline__ void ScanStep(SmemStorage &smem_storage, unsigned int warp_id, unsigned int lane_id, T &partial, ScanOp scan_op) {} + }; + + + /// Basic inclusive scan + template < + bool HAS_IDENTITY, + bool SHARE_FINAL, + typename ScanOp> + static __device__ __forceinline__ T BasicScan( + SmemStorage &smem_storage, ///< Reference to shared memory allocation having layout type SmemStorage + unsigned int warp_id, ///< Warp id + unsigned int lane_id, ///< thread-lane id + T partial, ///< Calling thread's input partial reduction + ScanOp scan_op) ///< Binary associative scan functor + { + // Iterate scan steps + Iteration::ScanStep(smem_storage, warp_id, lane_id, partial, scan_op); if (SHARE_FINAL) { diff --git a/docs/download_cub.html b/docs/download_cub.html index e67862f65f..495332b463 100644 --- a/docs/download_cub.html +++ b/docs/download_cub.html @@ -37,14 +37,14 @@
If your download doesn't start in 3s:

- -Download CUB! + +Download CUB!
diff --git a/docs/html/annotated.html b/docs/html/annotated.html index bddba3a726..593cabb05b 100644 --- a/docs/html/annotated.html +++ b/docs/html/annotated.html @@ -165,8 +165,8 @@  oCEnableIfSimple enable-if (similar to Boost)  oCEqualityDefault equality functor  oCEqualsType equality test - oCGridEvenShareA descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains) - oCGridQueueAbstraction for grid-wide queue management + oCGridEvenShareGridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains) + oCGridQueueGridQueue is a descriptor utility for dynamic queue management  oCIfType selection (IF ? ThenType : ElseType)  oCInt2TypeAllows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values)  oCIsPointerPointer vs. iterator @@ -198,7 +198,7 @@