Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions CHANGE_LOG.TXT
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
//-----------------------------------------------------------------------------

0.9.3 04/30/2013
0.9.4 05/07/2013

- Fixed compilation errors for SM10-SM13
- Fixed compilation errors for some WarpScan entrypoints on SM30+
- Added block-wide histogram (BlockHisto256)
- Added device-wide histogram (DeviceHisto256)
- Added new BlockScan algorithm variant BLOCK_SCAN_RAKING_MEMOIZE, which
trades more register consumption for less shared memory I/O
- Added block-wide histogram (BlockHisto256)
- Updates to BlockRadixRank to use BlockScan (which improves performance
on Kepler due to SHFL instruction)
- Added device-wide histogram (DeviceHisto256)
- Fixed compilation errors for some WarpScan entrypoints on SM30+
- Allow types other than C++ primitives to be used in WarpScan::*Sum methods
if they only have operator + overloaded. (Previously they were also required
to support assignment from int(0).)
Expand Down
49 changes: 32 additions & 17 deletions cub/block/block_histo_256.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,12 @@ CUB_NS_PREFIX
namespace cub {


/******************************************************************************
* Algorithmic variants
******************************************************************************/

/**
* BlockHisto256Algorithm enumerates alternative algorithms for the parallel
* construction of 8b histograms.
* \brief BlockHisto256Algorithm enumerates alternative algorithms for the parallel construction of 8b histograms.
*/
enum BlockHisto256Algorithm
{
Expand All @@ -57,21 +60,33 @@ enum BlockHisto256Algorithm
* Sorting followed by differentiation. Execution is comprised of two phases:
* -# Sort the 8b data using efficient radix sort
* -# Look for "runs" of same-valued 8b keys by detecting discontinuities; the run-lengths are histogram bin counts.
*
* \par Performance Considerations
* Delivers consistent throughput regardless of sample bin distribution.
*/
BLOCK_BYTE_HISTO_SORT,
BLOCK_HISTO_256_SORT,


/**
* \par Overview
* Use atomic addition to update byte counts directly
*
* \par Usage Considerations
* BLOCK_BYTE_HISTO_ATOMIC can only be used on version SM120 or later. Otherwise BLOCK_BYTE_HISTO_SORT is used regardless.
* \par Performance Considerations
* Performance is strongly tied to the hardware implementation of atomic
* addition, and may be significantly degraded for non-uniformly-random
* input distributions where many concurrent updates are likely to be
* made to the same bin counter.
*/
BLOCK_BYTE_HISTO_ATOMIC,
BLOCK_HISTO_256_ATOMIC,
};



/******************************************************************************
* Block histogram
******************************************************************************/


/**
* \addtogroup BlockModule
* @{
Expand All @@ -90,12 +105,12 @@ enum BlockHisto256Algorithm
*
* \tparam BLOCK_THREADS The threadblock size in threads
* \tparam ITEMS_PER_THREAD The number of items per thread
* \tparam ALGORITHM <b>[optional]</b> cub::BlockHisto256Algorithm enumerator specifying the underlying algorithm to use (default = cub::BLOCK_BYTE_HISTO_SORT)
* \tparam ALGORITHM <b>[optional]</b> cub::BlockHisto256Algorithm enumerator specifying the underlying algorithm to use (default = cub::BLOCK_HISTO_256_SORT)
*
* \par Algorithm
* BlockHisto256 can be (optionally) configured to use different algorithms:
* -# <b>cub::BLOCK_BYTE_HISTO_SORT</b>. Sorting followed by differentiation. [More...](\ref cub::BlockHisto256Algorithm)
* -# <b>cub::BLOCK_BYTE_HISTO_ATOMIC</b>. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHisto256Algorithm)
* -# <b>cub::BLOCK_HISTO_256_SORT</b>. Sorting followed by differentiation. [More...](\ref cub::BlockHisto256Algorithm)
* -# <b>cub::BLOCK_HISTO_256_ATOMIC</b>. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHisto256Algorithm)
*
* \par Usage Considerations
* - The histogram output can be constructed in shared or global memory
Expand Down Expand Up @@ -167,7 +182,7 @@ enum BlockHisto256Algorithm
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockHisto256Algorithm ALGORITHM = BLOCK_BYTE_HISTO_SORT>
BlockHisto256Algorithm ALGORITHM = BLOCK_HISTO_256_SORT>
class BlockHisto256
{
private:
Expand All @@ -178,13 +193,13 @@ private:

/**
* Ensure the template parameterization meets the requirements of the
* targeted device architecture. BLOCK_BYTE_HISTO_ATOMIC can only be used
* on version SM120 or later. Otherwise BLOCK_BYTE_HISTO_SORT is used
* targeted device architecture. BLOCK_HISTO_256_ATOMIC can only be used
* on version SM120 or later. Otherwise BLOCK_HISTO_256_SORT is used
* regardless.
*/
static const BlockHisto256Algorithm SAFE_ALGORITHM =
((ALGORITHM == BLOCK_BYTE_HISTO_ATOMIC) && (CUB_PTX_ARCH < 120)) ?
BLOCK_BYTE_HISTO_SORT :
((ALGORITHM == BLOCK_HISTO_256_ATOMIC) && (CUB_PTX_ARCH < 120)) ?
BLOCK_HISTO_256_SORT :
ALGORITHM;

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
Expand All @@ -195,7 +210,7 @@ private:
******************************************************************************/

/**
* BLOCK_BYTE_HISTO_SORT algorithmic variant
* BLOCK_HISTO_256_SORT algorithmic variant
*/
template <BlockHisto256Algorithm _ALGORITHM, int DUMMY = 0>
struct BlockHisto256Internal
Expand Down Expand Up @@ -319,10 +334,10 @@ private:


/**
* BLOCK_BYTE_HISTO_ATOMIC algorithmic variant
* BLOCK_HISTO_256_ATOMIC algorithmic variant
*/
template <int DUMMY>
struct BlockHisto256Internal<BLOCK_BYTE_HISTO_ATOMIC, DUMMY>
struct BlockHisto256Internal<BLOCK_HISTO_256_ATOMIC, DUMMY>
{
/// Shared memory storage layout type
struct SmemStorage {};
Expand Down
62 changes: 31 additions & 31 deletions cub/block/block_load.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
// Load directly in thread-blocked order
#pragma unroll
Expand Down Expand Up @@ -106,8 +106,8 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
BlockLoadDirect<PTX_LOAD_NONE>(block_itr, items);
}
Expand All @@ -132,9 +132,9 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
int bounds = guarded_items - (threadIdx.x * ITEMS_PER_THREAD);

Expand Down Expand Up @@ -165,9 +165,9 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
BlockLoadDirect<PTX_LOAD_NONE>(block_itr, guarded_items, items);
}
Expand All @@ -191,10 +191,10 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T oob_default, ///< [in] Default value to assign out-of-bound items
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T oob_default, ///< [in] Default value to assign out-of-bound items
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
int bounds = guarded_items - (threadIdx.x * ITEMS_PER_THREAD);

Expand Down Expand Up @@ -224,10 +224,10 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirect(
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T oob_default, ///< [in] Default value to assign out-of-bound items
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T oob_default, ///< [in] Default value to assign out-of-bound items
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
BlockLoadDirect<PTX_LOAD_NONE>(block_itr, guarded_items, oob_default, items);
}
Expand Down Expand Up @@ -348,10 +348,10 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirectStriped(
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
int stride = blockDim.x) ///< [in] <b>[optional]</b> Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
int stride = blockDim.x) ///< [in] <b>[optional]</b> Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
{
BlockLoadDirectStriped<PTX_LOAD_NONE>(block_itr, guarded_items, items, stride);
}
Expand Down Expand Up @@ -409,11 +409,11 @@ template <
int ITEMS_PER_THREAD,
typename InputIteratorRA>
__device__ __forceinline__ void BlockLoadDirectStriped(
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T oob_default, ///< [in] Default value to assign out-of-bound items
T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
int stride = blockDim.x) ///< [in] <b>[optional]</b> Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from
const int &guarded_items, ///< [in] Number of valid items in the tile
T oob_default, ///< [in] Default value to assign out-of-bound items
T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
int stride = blockDim.x) ///< [in] <b>[optional]</b> Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied.
{
BlockLoadDirectStriped<PTX_LOAD_NONE>(block_itr, guarded_items, oob_default, items, stride);
}
Expand Down Expand Up @@ -446,8 +446,8 @@ template <
typename T,
int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockLoadVectorized(
T *block_ptr, ///< [in] Input pointer for loading from
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
T *block_ptr, ///< [in] Input pointer for loading from
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
enum
{
Expand Down Expand Up @@ -503,8 +503,8 @@ template <
typename T,
int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockLoadVectorized(
T *block_ptr, ///< [in] Input pointer for loading from
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
T *block_ptr, ///< [in] Input pointer for loading from
T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
{
BlockLoadVectorized<PTX_LOAD_NONE>(block_ptr, items);
}
Expand Down
37 changes: 28 additions & 9 deletions cub/block/block_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ CUB_NS_PREFIX
namespace cub {



/******************************************************************************
* Algorithmic variants
******************************************************************************/

/**
* BlockReduceAlgorithm enumerates alternative algorithms for parallel
* reduction across a CUDA threadblock.
Expand All @@ -59,9 +64,13 @@ enum BlockReduceAlgorithm

/**
* \par Overview
* An efficient "raking" reduction algorithm. Execution is comprised of three phases:
* -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory.
* -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions.
* An efficient "raking" reduction algorithm. Execution is comprised of
* three phases:
* -# Upsweep sequential reduction in registers (if threads contribute more
* than one input each). Each thread then places the partial reduction
* of its item(s) into shared memory.
* -# Upsweep sequential reduction in shared memory. Threads within a
* single warp rake across segments of shared partial reductions.
* -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
*
* \par
Expand All @@ -78,24 +87,34 @@ enum BlockReduceAlgorithm

/**
* \par Overview
* A quick "tiled warp-reductions" reduction algorithm. Execution is comprised of four phases:
* -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory.
* -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style reduction within each warp.
* -# A propagation phase where the warp reduction outputs in each warp are updated with the aggregate from each preceding warp.
* A quick "tiled warp-reductions" reduction algorithm. Execution is
* comprised of four phases:
* -# Upsweep sequential reduction in registers (if threads contribute more
* than one input each). Each thread then places the partial reduction
* of its item(s) into shared memory.
* -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
* reduction within each warp.
* -# A propagation phase where the warp reduction outputs in each warp are
* updated with the aggregate from each preceding warp.
*
* \par
* \image html block_scan_warpscans.png
* <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
*
* \par Performance Considerations
* - Although this variant may suffer lower overall throughput across the
* GPU because due to a heavy reliance on inefficient warp-reductions, it can
* often provide lower turnaround latencies when the GPU is under-occupied.
* GPU due to its heavy reliance on inefficient warp-reductions, it
* can often provide lower turnaround latencies when the GPU is
* under-occupied.
*/
BLOCK_REDUCE_WARP_REDUCTIONS,
};


/******************************************************************************
* Block reduce
******************************************************************************/

/**
* \addtogroup BlockModule
* @{
Expand Down
8 changes: 8 additions & 0 deletions cub/block/block_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ CUB_NS_PREFIX
namespace cub {


/******************************************************************************
* Algorithmic variants
******************************************************************************/

/**
* BlockScanAlgorithm enumerates alternative algorithms for parallel prefix
* scan across a CUDA threadblock.
Expand Down Expand Up @@ -108,6 +112,10 @@ enum BlockScanAlgorithm
};


/******************************************************************************
* Block scan
******************************************************************************/

/**
* \addtogroup BlockModule
* @{
Expand Down
Loading