Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions CHANGE_LOG.TXT
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
//-----------------------------------------------------------------------------

0.9.3 04/30/2013
0.9.4 05/07/2013

- Fixed compilation errors for SM10-SM13
- Fixed compilation errors for some WarpScan entrypoints on SM30+
- Added block-wide histogram (BlockHisto256)
- Added device-wide histogram (DeviceHisto256)
- Added new BlockScan algorithm variant BLOCK_SCAN_RAKING_MEMOIZE, which
trades more register consumption for less shared memory I/O
- Added block-wide histogram (BlockHisto256)
- Updates to BlockRadixRank to use BlockScan (which improves performance
on Kepler due to SHFL instruction)
- Added device-wide histogram (DeviceHisto256)
- Fixed compilation errors for some WarpScan entrypoints on SM30+
- Allow types other than C++ primitives to be used in WarpScan::*Sum methods
if they only have operator + overloaded. (Previously they were also required
to support assignment from int(0).)
Expand Down
49 changes: 32 additions & 17 deletions cub/block/block_histo_256.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,12 @@ CUB_NS_PREFIX
namespace cub {


/******************************************************************************
* Algorithmic variants
******************************************************************************/

/**
* BlockHisto256Algorithm enumerates alternative algorithms for the parallel
* construction of 8b histograms.
* \brief BlockHisto256Algorithm enumerates alternative algorithms for the parallel construction of 8b histograms.
*/
enum BlockHisto256Algorithm
{
Expand All @@ -57,21 +60,33 @@ enum BlockHisto256Algorithm
* Sorting followed by differentiation. Execution is comprised of two phases:
* -# Sort the 8b data using efficient radix sort
* -# Look for "runs" of same-valued 8b keys by detecting discontinuities; the run-lengths are histogram bin counts.
*
* \par Performance Considerations
* Delivers consistent throughput regardless of sample bin distribution.
*/
BLOCK_BYTE_HISTO_SORT,
BLOCK_HISTO_256_SORT,


/**
* \par Overview
* Use atomic addition to update byte counts directly
*
* \par Usage Considerations
* BLOCK_BYTE_HISTO_ATOMIC can only be used on version SM120 or later. Otherwise BLOCK_BYTE_HISTO_SORT is used regardless.
* \par Performance Considerations
* Performance is strongly tied to the hardware implementation of atomic
* addition, and may be significantly degraded for non-uniformly-random
* input distributions where many concurrent updates are likely to be
* made to the same bin counter.
*/
BLOCK_BYTE_HISTO_ATOMIC,
BLOCK_HISTO_256_ATOMIC,
};



/******************************************************************************
* Block histogram
******************************************************************************/


/**
* \addtogroup BlockModule
* @{
Expand All @@ -90,12 +105,12 @@ enum BlockHisto256Algorithm
*
* \tparam BLOCK_THREADS The threadblock size in threads
* \tparam ITEMS_PER_THREAD The number of items per thread
* \tparam ALGORITHM <b>[optional]</b> cub::BlockHisto256Algorithm enumerator specifying the underlying algorithm to use (default = cub::BLOCK_BYTE_HISTO_SORT)
* \tparam ALGORITHM <b>[optional]</b> cub::BlockHisto256Algorithm enumerator specifying the underlying algorithm to use (default = cub::BLOCK_HISTO_256_SORT)
*
* \par Algorithm
* BlockHisto256 can be (optionally) configured to use different algorithms:
* -# <b>cub::BLOCK_BYTE_HISTO_SORT</b>. Sorting followed by differentiation. [More...](\ref cub::BlockHisto256Algorithm)
* -# <b>cub::BLOCK_BYTE_HISTO_ATOMIC</b>. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHisto256Algorithm)
* -# <b>cub::BLOCK_HISTO_256_SORT</b>. Sorting followed by differentiation. [More...](\ref cub::BlockHisto256Algorithm)
* -# <b>cub::BLOCK_HISTO_256_ATOMIC</b>. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHisto256Algorithm)
*
* \par Usage Considerations
* - The histogram output can be constructed in shared or global memory
Expand Down Expand Up @@ -167,7 +182,7 @@ enum BlockHisto256Algorithm
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockHisto256Algorithm ALGORITHM = BLOCK_BYTE_HISTO_SORT>
BlockHisto256Algorithm ALGORITHM = BLOCK_HISTO_256_SORT>
class BlockHisto256
{
private:
Expand All @@ -178,13 +193,13 @@ private:

/**
* Ensure the template parameterization meets the requirements of the
* targeted device architecture. BLOCK_BYTE_HISTO_ATOMIC can only be used
* on version SM120 or later. Otherwise BLOCK_BYTE_HISTO_SORT is used
* targeted device architecture. BLOCK_HISTO_256_ATOMIC can only be used
* on version SM120 or later. Otherwise BLOCK_HISTO_256_SORT is used
* regardless.
*/
static const BlockHisto256Algorithm SAFE_ALGORITHM =
((ALGORITHM == BLOCK_BYTE_HISTO_ATOMIC) && (CUB_PTX_ARCH < 120)) ?
BLOCK_BYTE_HISTO_SORT :
((ALGORITHM == BLOCK_HISTO_256_ATOMIC) && (CUB_PTX_ARCH < 120)) ?
BLOCK_HISTO_256_SORT :
ALGORITHM;

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
Expand All @@ -195,7 +210,7 @@ private:
******************************************************************************/

/**
* BLOCK_BYTE_HISTO_SORT algorithmic variant
* BLOCK_HISTO_256_SORT algorithmic variant
*/
template <BlockHisto256Algorithm _ALGORITHM, int DUMMY = 0>
struct BlockHisto256Internal
Expand Down Expand Up @@ -319,10 +334,10 @@ private:


/**
* BLOCK_BYTE_HISTO_ATOMIC algorithmic variant
* BLOCK_HISTO_256_ATOMIC algorithmic variant
*/
template <int DUMMY>
struct BlockHisto256Internal<BLOCK_BYTE_HISTO_ATOMIC, DUMMY>
struct BlockHisto256Internal<BLOCK_HISTO_256_ATOMIC, DUMMY>
{
/// Shared memory storage layout type
struct SmemStorage {};
Expand Down
62 changes: 31 additions & 31 deletions cub/block/block_load.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
// Load directly in thread-blocked order
#pragma unroll
Expand Down Expand Up @@ -106,8 +106,8 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
BlockLoadDirect<PTX_LOAD_NONE>(block_itr, items);
}
Expand All @@ -132,9 +132,9 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
int bounds = guarded_items - (threadIdx.x * ITEMS_PER_THREAD);

Expand Down Expand Up @@ -165,9 +165,9 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
BlockLoadDirect<PTX_LOAD_NONE>(block_itr, guarded_items, items);
}
Expand All @@ -191,10 +191,10 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T oob_default, ///< [in] Default value to assign out-of-bound items
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T oob_default, ///< [in] Default value to assign out-of-bound items
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
int bounds = guarded_items - (threadIdx.x * ITEMS_PER_THREAD);

Expand Down Expand Up @@ -224,10 +224,10 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T oob_default, ///< [in] Default value to assign out-of-bound items
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T oob_default, ///< [in] Default value to assign out-of-bound items
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
BlockLoadDirect<PTX_LOAD_NONE>(block_itr, guarded_items, oob_default, items);
}
Expand Down Expand Up @@ -348,10 +348,10 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirectStriped(
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
int stride = blockDim.x) ///< [in] <b>[optional]</b> Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
int stride = blockDim.x) ///< [in] <b>[optional]</b> Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
{
BlockLoadDirectStriped<PTX_LOAD_NONE>(block_itr, guarded_items, items, stride);
}
Expand Down Expand Up @@ -409,11 +409,11 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirectStriped(
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T oob_default, ///< [in] Default value to assign out-of-bound items
T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
int stride = blockDim.x) ///< [in] <b>[optional]</b> Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T oob_default, ///< [in] Default value to assign out-of-bound items
T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
int stride = blockDim.x) ///< [in] <b>[optional]</b> Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
{
BlockLoadDirectStriped<PTX_LOAD_NONE>(block_itr, guarded_items, oob_default, items, stride);
}
Expand Down Expand Up @@ -446,8 +446,8 @@ template <
typename T,
int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockLoadVectorized(
T *block_ptr, ///< [in] Input pointer for loading from
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
T *block_ptr, ///< [in] Input pointer for loading from
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
enum
{
Expand Down Expand Up @@ -503,8 +503,8 @@ template <
typename T,
int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockLoadVectorized(
T *block_ptr, ///< [in] Input pointer for loading from
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
T *block_ptr, ///< [in] Input pointer for loading from
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
BlockLoadVectorized<PTX_LOAD_NONE>(block_ptr, items);
}
Expand Down
37 changes: 28 additions & 9 deletions cub/block/block_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ CUB_NS_PREFIX
namespace cub {



/******************************************************************************
* Algorithmic variants
******************************************************************************/

/**
* BlockReduceAlgorithm enumerates alternative algorithms for parallel
* reduction across a CUDA threadblock.
Expand All @@ -59,9 +64,13 @@ enum BlockReduceAlgorithm

/**
* \par Overview
* An efficient "raking" reduction algorithm. Execution is comprised of three phases:
* -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory.
* -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions.
* An efficient "raking" reduction algorithm. Execution is comprised of
* three phases:
* -# Upsweep sequential reduction in registers (if threads contribute more
* than one input each). Each thread then places the partial reduction
* of its item(s) into shared memory.
* -# Upsweep sequential reduction in shared memory. Threads within a
* single warp rake across segments of shared partial reductions.
* -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
*
* \par
Expand All @@ -78,24 +87,34 @@ enum BlockReduceAlgorithm

/**
* \par Overview
* A quick "tiled warp-reductions" reduction algorithm. Execution is comprised of four phases:
* -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory.
* -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style reduction within each warp.
* -# A propagation phase where the warp reduction outputs in each warp are updated with the aggregate from each preceding warp.
* A quick "tiled warp-reductions" reduction algorithm. Execution is
* comprised of four phases:
* -# Upsweep sequential reduction in registers (if threads contribute more
* than one input each). Each thread then places the partial reduction
* of its item(s) into shared memory.
* -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
* reduction within each warp.
* -# A propagation phase where the warp reduction outputs in each warp are
* updated with the aggregate from each preceding warp.
*
* \par
* \image html block_scan_warpscans.png
* <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
*
* \par Performance Considerations
* - Although this variant may suffer lower overall throughput across the
* GPU because due to a heavy reliance on inefficient warp-reductions, it can
* often provide lower turnaround latencies when the GPU is under-occupied.
* GPU due to its heavy reliance on inefficient warp-reductions, it
* can often provide lower turnaround latencies when the GPU is
* under-occupied.
*/
BLOCK_REDUCE_WARP_REDUCTIONS,
};


/******************************************************************************
* Block reduce
******************************************************************************/

/**
* \addtogroup BlockModule
* @{
Expand Down
8 changes: 8 additions & 0 deletions cub/block/block_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ CUB_NS_PREFIX
namespace cub {


/******************************************************************************
* Algorithmic variants
******************************************************************************/

/**
* BlockScanAlgorithm enumerates alternative algorithms for parallel prefix
* scan across a CUDA threadblock.
Expand Down Expand Up @@ -108,6 +112,10 @@ enum BlockScanAlgorithm
};


/******************************************************************************
* Block scan
******************************************************************************/

/**
* \addtogroup BlockModule
* @{
Expand Down
Loading