diff --git a/CHANGE_LOG.TXT b/CHANGE_LOG.TXT index 82573be757..5ca79b4598 100644 --- a/CHANGE_LOG.TXT +++ b/CHANGE_LOG.TXT @@ -1,14 +1,15 @@ //----------------------------------------------------------------------------- -0.9.3 04/30/2013 +0.9.4 05/07/2013 + - Fixed compilation errors for SM10-SM13 + - Fixed compilation errors for some WarpScan entrypoints on SM30+ + - Added block-wide histogram (BlockHisto256) + - Added device-wide histogram (DeviceHisto256) - Added new BlockScan algorithm variant BLOCK_SCAN_RAKING_MEMOIZE, which trades more register consumption for less shared memory I/O) - - Added block-wide histogram (BlockHisto256) - Updates to BlockRadixRank to use BlockScan (which improves performance on Kepler due to SHFL instruction) - - Added device-wide histogram (DeviceHisto256) - - Fixed compilation errors for some WarpScan entrypoints on SM30+ - Allow types other than C++ primitives to be used in WarpScan::*Sum methods if they only have operator + overloaded. (Previously they also required to support assignment from int(0).) diff --git a/cub/block/block_histo_256.cuh b/cub/block/block_histo_256.cuh index 46d18d338f..9c33921c36 100644 --- a/cub/block/block_histo_256.cuh +++ b/cub/block/block_histo_256.cuh @@ -45,9 +45,12 @@ CUB_NS_PREFIX namespace cub { +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + /** - * BlockHisto256Algorithm enumerates alternative algorithms for the parallel - * construction of 8b histograms. + * \brief BlockHisto256Algorithm enumerates alternative algorithms for the parallel construction of 8b histograms. */ enum BlockHisto256Algorithm { @@ -57,21 +60,33 @@ enum BlockHisto256Algorithm * Sorting followed by differentiation. 
Execution is comprised of two phases: * -# Sort the 8b data using efficient radix sort * -# Look for "runs" of same-valued 8b keys by detecting discontinuities; the run-lengths are histogram bin counts. + * + * \par Performance Considerations + * Delivers consistent throughput regardless of sample bin distribution. */ - BLOCK_BYTE_HISTO_SORT, + BLOCK_HISTO_256_SORT, /** * \par Overview * Use atomic addition to update byte counts directly * - * \par Usage Considerations - * BLOCK_BYTE_HISTO_ATOMIC can only be used on version SM120 or later. Otherwise BLOCK_BYTE_HISTO_SORT is used regardless. + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. */ - BLOCK_BYTE_HISTO_ATOMIC, + BLOCK_HISTO_256_ATOMIC, }; + +/****************************************************************************** + * Block histogram + ******************************************************************************/ + + /** * \addtogroup BlockModule * @{ @@ -90,12 +105,12 @@ enum BlockHisto256Algorithm * * \tparam BLOCK_THREADS The threadblock size in threads * \tparam ITEMS_PER_THREAD The number of items per thread - * \tparam ALGORITHM [optional] cub::BlockHisto256Algorithm enumerator specifying the underlying algorithm to use (default = cub::BLOCK_BYTE_HISTO_SORT) + * \tparam ALGORITHM [optional] cub::BlockHisto256Algorithm enumerator specifying the underlying algorithm to use (default = cub::BLOCK_HISTO_256_SORT) * * \par Algorithm * BlockHisto256 can be (optionally) configured to use different algorithms: - * -# cub::BLOCK_BYTE_HISTO_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHisto256Algorithm) - * -# cub::BLOCK_BYTE_HISTO_ATOMIC. Use atomic addition to update byte counts directly. 
[More...](\ref cub::BlockHisto256Algorithm) + * -# cub::BLOCK_HISTO_256_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHisto256Algorithm) + * -# cub::BLOCK_HISTO_256_ATOMIC. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHisto256Algorithm) * * \par Usage Considerations * - The histogram output can be constructed in shared or global memory @@ -167,7 +182,7 @@ enum BlockHisto256Algorithm template < int BLOCK_THREADS, int ITEMS_PER_THREAD, - BlockHisto256Algorithm ALGORITHM = BLOCK_BYTE_HISTO_SORT> + BlockHisto256Algorithm ALGORITHM = BLOCK_HISTO_256_SORT> class BlockHisto256 { private: @@ -178,13 +193,13 @@ private: /** * Ensure the template parameterization meets the requirements of the - * targeted device architecture. BLOCK_BYTE_HISTO_ATOMIC can only be used - * on version SM120 or later. Otherwise BLOCK_BYTE_HISTO_SORT is used + * targeted device architecture. BLOCK_HISTO_256_ATOMIC can only be used + * on version SM120 or later. Otherwise BLOCK_HISTO_256_SORT is used * regardless. */ static const BlockHisto256Algorithm SAFE_ALGORITHM = - ((ALGORITHM == BLOCK_BYTE_HISTO_ATOMIC) && (CUB_PTX_ARCH < 120)) ? - BLOCK_BYTE_HISTO_SORT : + ((ALGORITHM == BLOCK_HISTO_256_ATOMIC) && (CUB_PTX_ARCH < 120)) ? 
+ BLOCK_HISTO_256_SORT : ALGORITHM; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document @@ -195,7 +210,7 @@ private: ******************************************************************************/ /** - * BLOCK_BYTE_HISTO_SORT algorithmic variant + * BLOCK_HISTO_256_SORT algorithmic variant */ template struct BlockHisto256Internal @@ -319,10 +334,10 @@ private: /** - * BLOCK_BYTE_HISTO_ATOMIC algorithmic variant + * BLOCK_HISTO_256_ATOMIC algorithmic variant */ template - struct BlockHisto256Internal + struct BlockHisto256Internal { /// Shared memory storage layout type struct SmemStorage {}; diff --git a/cub/block/block_load.cuh b/cub/block/block_load.cuh index d07424f393..1d37601075 100644 --- a/cub/block/block_load.cuh +++ b/cub/block/block_load.cuh @@ -77,8 +77,8 @@ template < int ITEMS_PER_THREAD, typename InputIteratorRA> __device__ __forceinline__ void BlockLoadDirect( - InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { // Load directly in thread-blocked order #pragma unroll @@ -106,8 +106,8 @@ template < int ITEMS_PER_THREAD, typename InputIteratorRA> __device__ __forceinline__ void BlockLoadDirect( - InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { BlockLoadDirect(block_itr, items); } @@ -132,9 +132,9 @@ template < int ITEMS_PER_THREAD, typename InputIteratorRA> __device__ __forceinline__ void BlockLoadDirect( - InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from - const int &guarded_items, ///< [in] Number of valid items in 
the tile - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { int bounds = guarded_items - (threadIdx.x * ITEMS_PER_THREAD); @@ -165,9 +165,9 @@ template < int ITEMS_PER_THREAD, typename InputIteratorRA> __device__ __forceinline__ void BlockLoadDirect( - InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from - const int &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { BlockLoadDirect(block_itr, guarded_items, items); } @@ -191,10 +191,10 @@ template < int ITEMS_PER_THREAD, typename InputIteratorRA> __device__ __forceinline__ void BlockLoadDirect( - InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from - const int &guarded_items, ///< [in] Number of valid items in the tile - T oob_default, ///< [in] Default value to assign out-of-bound items - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from + const int &guarded_items, ///< [in] Number of valid items in the tile + T oob_default, ///< [in] Default value to assign out-of-bound items + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { int bounds = guarded_items - (threadIdx.x * ITEMS_PER_THREAD); @@ -224,10 +224,10 @@ template < int ITEMS_PER_THREAD, typename InputIteratorRA> __device__ __forceinline__ void BlockLoadDirect( - InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from - const int 
&guarded_items, ///< [in] Number of valid items in the tile - T oob_default, ///< [in] Default value to assign out-of-bound items - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from + const int &guarded_items, ///< [in] Number of valid items in the tile + T oob_default, ///< [in] Default value to assign out-of-bound items + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { BlockLoadDirect(block_itr, guarded_items, oob_default, items); } @@ -348,10 +348,10 @@ template < int ITEMS_PER_THREAD, typename InputIteratorRA> __device__ __forceinline__ void BlockLoadDirectStriped( - InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from - const int &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. + InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. 
{ BlockLoadDirectStriped(block_itr, guarded_items, items, stride); } @@ -409,11 +409,11 @@ template < int ITEMS_PER_THREAD, typename InputIteratorRA> __device__ __forceinline__ void BlockLoadDirectStriped( - InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from - const int &guarded_items, ///< [in] Number of valid items in the tile - T oob_default, ///< [in] Default value to assign out-of-bound items - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. + InputIteratorRA block_itr, ///< [in] The threadblock's base input iterator for loading from + const int &guarded_items, ///< [in] Number of valid items in the tile + T oob_default, ///< [in] Default value to assign out-of-bound items + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. 
{ BlockLoadDirectStriped(block_itr, guarded_items, oob_default, items, stride); } @@ -446,8 +446,8 @@ template < typename T, int ITEMS_PER_THREAD> __device__ __forceinline__ void BlockLoadVectorized( - T *block_ptr, ///< [in] Input pointer for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + T *block_ptr, ///< [in] Input pointer for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { enum { @@ -503,8 +503,8 @@ template < typename T, int ITEMS_PER_THREAD> __device__ __forceinline__ void BlockLoadVectorized( - T *block_ptr, ///< [in] Input pointer for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + T *block_ptr, ///< [in] Input pointer for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load { BlockLoadVectorized(block_ptr, items); } diff --git a/cub/block/block_reduce.cuh b/cub/block/block_reduce.cuh index 0749d858a3..f47f33ac55 100644 --- a/cub/block/block_reduce.cuh +++ b/cub/block/block_reduce.cuh @@ -50,6 +50,11 @@ CUB_NS_PREFIX namespace cub { + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + /** * BlockReduceAlgorithm enumerates alternative algorithms for parallel * reduction across a CUDA threadblock. @@ -59,9 +64,13 @@ enum BlockReduceAlgorithm /** * \par Overview - * An efficient "raking" reduction algorithm. Execution is comprised of three phases: - * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. - * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions. + * An efficient "raking" reduction algorithm. 
Execution is comprised of + * three phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Each thread then places the partial reduction + * of its item(s) into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within a + * single warp rake across segments of shared partial reductions. * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. * * \par @@ -78,10 +87,15 @@ enum BlockReduceAlgorithm /** * \par Overview - * A quick "tiled warp-reductions" reduction algorithm. Execution is comprised of four phases: - * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. - * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style reduction within each warp. - * -# A propagation phase where the warp reduction outputs in each warp are updated with the aggregate from each preceding warp. + * A quick "tiled warp-reductions" reduction algorithm. Execution is + * comprised of four phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Each thread then places the partial reduction + * of its item(s) into shared memory. + * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style + * reduction within each warp. + * -# A propagation phase where the warp reduction outputs in each warp are + * updated with the aggregate from each preceding warp. * * \par * \image html block_scan_warpscans.png @@ -89,13 +103,18 @@ enum BlockReduceAlgorithm * * \par Performance Considerations * - Although this variant may suffer lower overall throughput across the - * GPU because due to a heavy reliance on inefficient warp-reductions, it can - * often provide lower turnaround latencies when the GPU is under-occupied. 
+ * GPU because due to a heavy reliance on inefficient warp-reductions, it + * can often provide lower turnaround latencies when the GPU is + * under-occupied. */ BLOCK_REDUCE_WARP_REDUCTIONS, }; +/****************************************************************************** + * Block reduce + ******************************************************************************/ + /** * \addtogroup BlockModule * @{ diff --git a/cub/block/block_scan.cuh b/cub/block/block_scan.cuh index bf9ef662dc..e8017594ef 100644 --- a/cub/block/block_scan.cuh +++ b/cub/block/block_scan.cuh @@ -49,6 +49,10 @@ CUB_NS_PREFIX namespace cub { +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + /** * BlockScanAlgorithm enumerates alternative algorithms for parallel prefix * scan across a CUDA threadblock. @@ -108,6 +112,10 @@ enum BlockScanAlgorithm }; +/****************************************************************************** + * Block scan + ******************************************************************************/ + /** * \addtogroup BlockModule * @{ diff --git a/cub/block/block_store.cuh b/cub/block/block_store.cuh index da625a5218..90f3dcae7e 100644 --- a/cub/block/block_store.cuh +++ b/cub/block/block_store.cuh @@ -77,8 +77,8 @@ template < int ITEMS_PER_THREAD, typename OutputIteratorRA> __device__ __forceinline__ void BlockStoreDirect( - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { // Store directly in thread-blocked order #pragma unroll @@ -106,8 +106,8 @@ template < int ITEMS_PER_THREAD, typename OutputIteratorRA> __device__ __forceinline__ void BlockStoreDirect( - OutputIteratorRA 
block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreDirect(block_itr, items); } @@ -124,27 +124,26 @@ __device__ __forceinline__ void BlockStoreDirect( * \tparam T [inferred] The data type to store. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type). - * \tparam SizeT [inferred] Integer type for offsets */ template < PtxStoreModifier MODIFIER, typename T, int ITEMS_PER_THREAD, - typename OutputIteratorRA, - typename SizeT> + typename OutputIteratorRA> __device__ __forceinline__ void BlockStoreDirect( - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { + int bounds = guarded_items - (threadIdx.x * ITEMS_PER_THREAD); + // Store directly in thread-blocked order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - int item_offset = (threadIdx.x * ITEMS_PER_THREAD) + ITEM; - if (item_offset < guarded_items) + if (ITEM < bounds) { - ThreadStore(block_itr + item_offset, items[ITEM]); + ThreadStore(block_itr + (threadIdx.x * ITEMS_PER_THREAD) + ITEM, items[ITEM]); } } } @@ -160,17 +159,15 @@ __device__ __forceinline__ void BlockStoreDirect( * \tparam T [inferred] The data type to store. 
* \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type). - * \tparam SizeT [inferred] Integer type for offsets */ template < typename T, int ITEMS_PER_THREAD, - typename OutputIteratorRA, - typename SizeT> + typename OutputIteratorRA> __device__ __forceinline__ void BlockStoreDirect( - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreDirect(block_itr, guarded_items, items); } @@ -202,9 +199,9 @@ template < int ITEMS_PER_THREAD, typename OutputIteratorRA> __device__ __forceinline__ void BlockStoreDirectStriped( - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. 
{ // Store directly in striped order #pragma unroll @@ -233,9 +230,9 @@ template < int ITEMS_PER_THREAD, typename OutputIteratorRA> __device__ __forceinline__ void BlockStoreDirectStriped( - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. { BlockStoreDirectStriped(block_itr, items, stride); } @@ -252,28 +249,27 @@ __device__ __forceinline__ void BlockStoreDirectStriped( * \tparam T [inferred] The data type to store. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type). - * \tparam SizeT [inferred] Integer type for offsets */ template < PtxStoreModifier MODIFIER, typename T, int ITEMS_PER_THREAD, - typename OutputIteratorRA, - typename SizeT> + typename OutputIteratorRA> __device__ __forceinline__ void BlockStoreDirectStriped( OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. 
+ const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. { + int bounds = guarded_items - threadIdx.x; + // Store directly in striped order #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - int item_offset = (ITEM * stride) + threadIdx.x; - if (item_offset < guarded_items) + if (ITEM * stride < bounds) { - ThreadStore(block_itr + item_offset, items[ITEM]); + ThreadStore(block_itr + (ITEM * stride) + threadIdx.x, items[ITEM]); } } } @@ -289,18 +285,16 @@ __device__ __forceinline__ void BlockStoreDirectStriped( * \tparam T [inferred] The data type to store. * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type). - * \tparam SizeT [inferred] Integer type for offsets */ template < typename T, int ITEMS_PER_THREAD, - typename OutputIteratorRA, - typename SizeT> + typename OutputIteratorRA> __device__ __forceinline__ void BlockStoreDirectStriped( OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int stride = blockDim.x) ///< [in] [optional] Stripe stride. Default is the width of the threadblock. 
More efficient code can be generated if a compile-time-constant (e.g., BLOCK_THREADS) is supplied. { BlockStoreDirectStriped(block_itr, guarded_items, items, stride); } @@ -337,8 +331,8 @@ template < typename T, int ITEMS_PER_THREAD> __device__ __forceinline__ void BlockStoreVectorized( - T *block_ptr, ///< [in] Input pointer for storing from - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + T *block_ptr, ///< [in] Input pointer for storing from + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { enum { @@ -398,8 +392,8 @@ template < typename T, int ITEMS_PER_THREAD> __device__ __forceinline__ void BlockStoreVectorized( - T *block_ptr, ///< [in] Input pointer for storing from - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + T *block_ptr, ///< [in] Input pointer for storing from + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreVectorized(block_ptr, items); } @@ -606,20 +600,19 @@ private: /// Store a tile of items across a threadblock static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreDirect(block_itr, items); } /// Store a tile of items across a threadblock, guarded by range - template static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T 
(&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreDirect(block_itr, guarded_items, items); } @@ -636,11 +629,10 @@ private: typedef NullType SmemStorage; /// Store a tile of items across a threadblock, specialized for native pointer types (attempts vectorization) - template static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - T *block_ptr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + T *block_ptr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreVectorized(block_ptr, items); } @@ -648,20 +640,19 @@ private: /// Store a tile of items across a threadblock, specialized for opaque input iterators (skips vectorization) template static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - _OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + _OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreDirect(block_itr, items); } /// Store a tile of items across a threadblock, 
guarded by range - template static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreDirect(block_itr, guarded_items, items); } @@ -682,9 +673,9 @@ private: /// Store a tile of items across a threadblock static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { // Transpose to striped order BlockExchange::BlockedToStriped(smem_storage, items); @@ -693,12 +684,11 @@ private: } /// Store a tile of items across a threadblock, guarded by range - template static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to 
store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { // Transpose to striped order BlockExchange::BlockedToStriped(smem_storage, items); @@ -719,20 +709,19 @@ private: /// Store a tile of items across a threadblock static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreDirectStriped(block_itr, items); } /// Store a tile of items across a threadblock, guarded by range - template static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockStoreDirectStriped(block_itr, guarded_items, items); } @@ -756,24 +745,21 @@ public: * \brief Store 
a tile of items across a threadblock. */ static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { StoreInternal::Store(smem_storage, block_itr, items); } /** * \brief Store a tile of items across a threadblock, guarded by range. - * - * \tparam SizeT [inferred] Integer type for offsets */ - template static __device__ __forceinline__ void Store( - SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage - OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to - const SizeT &guarded_items, ///< [in] Number of valid items in the tile - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage + OutputIteratorRA block_itr, ///< [in] The threadblock's base output iterator for storing to + const int &guarded_items, ///< [in] Number of valid items in the tile + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { StoreInternal::Store(smem_storage, block_itr, guarded_items, items); } diff --git a/cub/device/device_histo_256.cuh b/cub/device/device_histo_256.cuh index 48a52ef2f6..d14d30f4dd 100644 --- a/cub/device/device_histo_256.cuh +++ b/cub/device/device_histo_256.cuh @@ -37,7 +37,7 @@ #include #include -#include "tiles/tiles_histo_256.cuh" +#include "persistent_block/persistent_block_histo_256.cuh" #include "../block/block_load.cuh" #include "../thread/thread_reduce.cuh" #include 
"../util_allocator.cuh" @@ -60,84 +60,74 @@ namespace cub { #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +/** + * Initialization kernel for queue descriptor preparation and for zeroing global counters + */ +template < + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SizeT, ///< Integral type used for global array indexing + typename HistoCounter> ///< Integral type for counting sample occurrences per histogram bin +__launch_bounds__ (256, 1) +__global__ void InitHisto256Kernel( + GridQueue grid_queue, ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks + ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][256] + SizeT num_samples) ///< [in] Total number of samples \p d_samples for all channels +{ + d_out_histograms.array[blockIdx.x][threadIdx.x] = 0; + if (threadIdx.x == 0) grid_queue.ResetDrain(num_samples); +} + + /** * Multi-block histogram kernel entry point. Computes privatized histograms, one per thread block. */ template < - typename TilesHisto256Policy, ///< Tuning policy for cub::TilesHisto256 abstraction - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). 
Must have a value type that is assignable to unsigned char - typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin - typename SizeT> ///< Integral type used for global array indexing -__launch_bounds__ (TilesHisto256Policy::BLOCK_THREADS) + typename PersistentBlockHisto256Policy, ///< Tuning policy for cub::PersistentBlockHisto256 abstraction + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char + typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin + typename SizeT> ///< Integral type used for global array indexing +__launch_bounds__ (int(PersistentBlockHisto256Policy::BLOCK_THREADS), PersistentBlockHisto256Policy::SM_OCCUPANCY) __global__ void MultiBlockHisto256Kernel( - InputIteratorRA d_samples, ///< [in] Array of sample data. (Channels, if any, are interleaved in "AOS" format) - ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][gridDim.x][256] - SizeT num_samples, ///< [in] Total number of samples \p d_samples for all channels - GridEvenShare even_share, ///< [in] Descriptor for how to map an even-share of tiles across thread blocks - GridQueue queue) ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks + InputIteratorRA d_samples, ///< [in] Array of sample data. 
(Channels, if any, are interleaved in "AOS" format) + ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][gridDim.x][256] + SizeT num_samples, ///< [in] Total number of samples \p d_samples for all channels + GridEvenShare even_share, ///< [in] Descriptor for how to map an even-share of tiles across thread blocks + GridQueue queue) ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks { // Constants - enum { - BLOCK_THREADS = TilesHisto256Policy::BLOCK_THREADS, - ITEMS_PER_THREAD = TilesHisto256Policy::ITEMS_PER_THREAD, + enum + { + BLOCK_THREADS = PersistentBlockHisto256Policy::BLOCK_THREADS, + ITEMS_PER_THREAD = PersistentBlockHisto256Policy::ITEMS_PER_THREAD, TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD, }; - // Parameterize TilesHisto256 for the parallel execution context - typedef TilesHisto256 TilesHisto256T; - - // Parameterize which mapping of tiles -> thread blocks we will use - typedef typename TilesHisto256T::template Mapping Mapping; - - // Declare shared memory - __shared__ typename TilesHisto256T::SmemStorage block_histo; // Shared memory for TilesHisto256 - __shared__ HistoCounter histograms[ACTIVE_CHANNELS][256]; // Shared memory histograms - - // Composite samples into histogram(s) - Mapping::ProcessTiles( - block_histo, - d_samples, - num_samples, - even_share, - queue, - histograms); + // Thread block type for compositing input tiles + typedef PersistentBlockHisto256 PersistentBlockHisto256T; - // Barrier to ensure histograms are coherent - __syncthreads(); + // Shared memory for PersistentBlockHisto256 + __shared__ typename PersistentBlockHisto256T::SmemStorage smem_storage; - // Output histogram for each active channel - - #pragma unroll - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - int channel_offset = (blockIdx.x * 256); - int histo_offset = 0; - - #pragma unroll - for(; histo_offset + BLOCK_THREADS <= 256; histo_offset += 
BLOCK_THREADS) - { - d_out_histograms.array[CHANNEL][channel_offset + histo_offset + threadIdx.x] = histograms[CHANNEL][histo_offset + threadIdx.x]; - } - // Finish up with guarded initialization if necessary - if ((histo_offset < BLOCK_THREADS) && (histo_offset + threadIdx.x < 256)) - { - d_out_histograms.array[CHANNEL][channel_offset + histo_offset + threadIdx.x] = histograms[CHANNEL][histo_offset + threadIdx.x]; - } - } + // Thread block instance + PersistentBlockHisto256T tiles(smem_storage, d_samples, d_out_histograms.array); + // Consume tiles using thread block instance + int dummy_result; + GridMapping::ConsumeTiles( + tiles, num_samples, even_share, queue, dummy_result); } /** - * Single-block finalization kernel for aggregating privatized threadblock histograms from a previous kernel invocation. + * Aggregation kernel for aggregating privatized threadblock histograms from a previous kernel invocation. */ template < - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename HistoCounter> ///< Integral type for counting sample occurrences per histogram bin + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename HistoCounter> ///< Integral type for counting sample occurrences per histogram bin __launch_bounds__ (256, 1) -__global__ void FinalizeHisto256Kernel( +__global__ void AggregateHisto256Kernel( HistoCounter* d_block_histograms_linear, ///< [in] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][num_threadblocks][256] ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][256] int num_threadblocks) ///< [in] Number of threadblock histograms per channel in \p d_block_histograms @@ -148,11 +138,12 @@ __global__ void FinalizeHisto256Kernel( int block_offset = blockIdx.x * (num_threadblocks * 256); int block_oob = block_offset + (num_threadblocks * 256); +#if CUB_PTX_ARCH >= 200 #pragma unroll 32 +#endif 
while (block_offset < block_oob) { bin_aggregate += d_block_histograms_linear[block_offset + threadIdx.x]; - block_offset += 256; } @@ -181,30 +172,30 @@ struct DeviceHisto256 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - /// Generic structure for encapsulating dispatch properties. Mirrors the constants within TilesHisto256Policy. + /// Generic structure for encapsulating dispatch properties. Mirrors the constants within PersistentBlockHisto256Policy. struct KernelDispachParams { // Policy fields - int block_threads; - int items_per_thread; - BlockHisto256Algorithm block_algorithm; - GridMappingStrategy grid_mapping; - int subscription_factor; + int block_threads; + int items_per_thread; + PersistentBlockHisto256Algorithm block_algorithm; + GridMappingStrategy grid_mapping; + int subscription_factor; // Derived fields - int tile_size; + int tile_size; - template + template __host__ __device__ __forceinline__ void Init(int subscription_factor = 1) { - block_threads = TilesHisto256Policy::BLOCK_THREADS; - items_per_thread = TilesHisto256Policy::ITEMS_PER_THREAD; - block_algorithm = TilesHisto256Policy::BLOCK_ALGORITHM; - grid_mapping = TilesHisto256Policy::GRID_MAPPING; + block_threads = PersistentBlockHisto256Policy::BLOCK_THREADS; + items_per_thread = PersistentBlockHisto256Policy::ITEMS_PER_THREAD; + block_algorithm = PersistentBlockHisto256Policy::GRID_ALGORITHM; + grid_mapping = PersistentBlockHisto256Policy::GRID_MAPPING; this->subscription_factor = subscription_factor; - tile_size = block_threads * items_per_thread; + tile_size = block_threads * items_per_thread; } __host__ __device__ __forceinline__ @@ -225,44 +216,60 @@ struct DeviceHisto256 template < int CHANNELS, int ACTIVE_CHANNELS, - BlockHisto256Algorithm BLOCK_ALGORITHM, + PersistentBlockHisto256Algorithm GRID_ALGORITHM, int ARCH> struct TunedPolicies; /// SM35 tune - template - struct TunedPolicies + template + struct TunedPolicies { - typedef TilesHisto256Policy< - 128, - (BLOCK_ALGORITHM == 
BLOCK_BYTE_HISTO_SORT) ? 23 : (30 / ACTIVE_CHANNELS), - BLOCK_ALGORITHM, - (BLOCK_ALGORITHM == BLOCK_BYTE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; + typedef PersistentBlockHisto256Policy< + (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? 128 : 256, + (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? 12 : (30 / ACTIVE_CHANNELS), + GRID_ALGORITHM, + (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE, + (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? 8 : 1> MultiBlockPolicy; enum { SUBSCRIPTION_FACTOR = 7 }; }; + /// SM30 tune + template + struct TunedPolicies + { + typedef PersistentBlockHisto256Policy< + 128, + (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? 20 : (22 / ACTIVE_CHANNELS), + GRID_ALGORITHM, + (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE, + 1> MultiBlockPolicy; + enum { SUBSCRIPTION_FACTOR = 1 }; + }; + /// SM20 tune - template - struct TunedPolicies + template + struct TunedPolicies { - typedef TilesHisto256Policy< - 128, - (BLOCK_ALGORITHM == BLOCK_BYTE_HISTO_SORT) ? 17 : (21 / ACTIVE_CHANNELS), - BLOCK_ALGORITHM, - (BLOCK_ALGORITHM == BLOCK_BYTE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; - enum { SUBSCRIPTION_FACTOR = 3 }; + typedef PersistentBlockHisto256Policy< + 128, + (GRID_ALGORITHM == GRID_HISTO_256_SORT) ? 21 : (23 / ACTIVE_CHANNELS), + GRID_ALGORITHM, + GRID_MAPPING_DYNAMIC, + 1> MultiBlockPolicy; + enum { SUBSCRIPTION_FACTOR = 1 }; }; /// SM10 tune - template - struct TunedPolicies + template + struct TunedPolicies { - typedef TilesHisto256Policy< + typedef PersistentBlockHisto256Policy< 128, 7, - BLOCK_ALGORITHM, - (BLOCK_ALGORITHM == BLOCK_BYTE_HISTO_SORT) ? 
GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; - enum { SUBSCRIPTION_FACTOR = 2 }; + GRID_HISTO_256_SORT, // (use sort regardless because atomics are perf-useless) + GRID_MAPPING_EVEN_SHARE, + 1> MultiBlockPolicy; + enum { SUBSCRIPTION_FACTOR = 1 }; }; @@ -270,17 +277,19 @@ struct DeviceHisto256 template < int CHANNELS, int ACTIVE_CHANNELS, - BlockHisto256Algorithm BLOCK_ALGORITHM> + PersistentBlockHisto256Algorithm GRID_ALGORITHM> struct PtxDefaultPolicies { static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ? 350 : - (CUB_PTX_ARCH >= 200) ? - 200 : - 100; + (CUB_PTX_ARCH >= 300) ? + 300 : + (CUB_PTX_ARCH >= 200) ? + 200 : + 100; // Tuned policy set for the current PTX compiler pass - typedef TunedPolicies PtxPassTunedPolicies; + typedef TunedPolicies PtxPassTunedPolicies; // Subscription factor for the current PTX compiler pass static const int SUBSCRIPTION_FACTOR = PtxPassTunedPolicies::SUBSCRIPTION_FACTOR; @@ -295,17 +304,22 @@ struct DeviceHisto256 { if (ptx_version >= 350) { - typedef TunedPolicies TunedPolicies; + typedef TunedPolicies TunedPolicies; + multi_block_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR); + } + else if (ptx_version >= 300) + { + typedef TunedPolicies TunedPolicies; multi_block_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR); } else if (ptx_version >= 200) { - typedef TunedPolicies TunedPolicies; + typedef TunedPolicies TunedPolicies; multi_block_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR); } else { - typedef TunedPolicies TunedPolicies; + typedef TunedPolicies TunedPolicies; multi_block_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR); } } @@ -321,17 +335,17 @@ struct DeviceHisto256 template < int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InitHisto256KernelPtr, ///< Function type of 
cub::InitHisto256Kernel typename MultiBlockHisto256KernelPtr, ///< Function type of cub::MultiBlockHisto256Kernel - typename FinalizeHisto256KernelPtr, ///< Function type of cub::FinalizeHisto256Kernel - typename ResetDrainKernelPtr, ///< Function type of cub::ResetDrainKernel + typename AggregateHisto256KernelPtr, ///< Function type of cub::AggregateHisto256Kernel typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin typename SizeT> ///< Integral type used for global array indexing __host__ __device__ __forceinline__ static cudaError_t Dispatch( + InitHisto256KernelPtr init_kernel_ptr, ///< [in] Kernel function pointer to parameterization of cub::InitHisto256Kernel MultiBlockHisto256KernelPtr multi_block_kernel_ptr, ///< [in] Kernel function pointer to parameterization of cub::MultiBlockHisto256Kernel - FinalizeHisto256KernelPtr finalize_kernel_ptr, ///< [in] Kernel function pointer to parameterization of cub::FinalizeHisto256Kernel - ResetDrainKernelPtr prepare_drain_kernel_ptr, ///< [in] Kernel function pointer to parameterization of cub::ResetDrainKernel + AggregateHisto256KernelPtr aggregate_kernel_ptr, ///< [in] Kernel function pointer to parameterization of cub::AggregateHisto256Kernel KernelDispachParams &multi_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p multi_block_kernel_ptr was compiled for InputIteratorRA d_samples, ///< [in] Input samples to histogram HistoCounter *(&d_histograms)[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having 256 counters of integral type \p HistoCounter. @@ -340,7 +354,7 @@ struct DeviceHisto256 bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. 
DeviceAllocator *device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator. { - #if !CUB_CNP_ENABLED + #ifndef CUB_RUNTIME_ENABLED // Kernel launch not supported from this device return CubDebug(cudaErrorInvalidConfiguration); @@ -354,11 +368,37 @@ struct DeviceHisto256 cudaError error = cudaSuccess; do { - // Get GPU ordinal + // Setup array wrapper for histogram channel output because we can't pass static arrays as kernel parameters + ArrayWrapper d_histo_wrapper; + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL]; + } + + // Initialize counters and queue descriptor if necessary + if ((multi_block_dispatch_params.grid_mapping == GRID_MAPPING_DYNAMIC) || + (multi_block_dispatch_params.block_algorithm == GRID_HISTO_256_GLOBAL_ATOMIC)) + { + queue.Allocate(device_allocator); + + if (stream_synchronous) CubLog("Invoking init_kernel_ptr<<<%d, 256, 0, %d>>>()\n", ACTIVE_CHANNELS, (int) stream); + + init_kernel_ptr<<>>(queue, d_histo_wrapper, num_samples); + + #ifndef __CUDA_ARCH__ + // Sync the stream on the host + if (stream_synchronous && CubDebug(error = cudaStreamSynchronize(stream))) break; + #else + // Sync the entire device on the device (cudaStreamSynchronize doesn't exist on device) + if (stream_synchronous && CubDebug(error = cudaDeviceSynchronize())) break; + #endif + } + + // Determine grid size for the multi-block kernel + int device_ordinal; if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - // Get SM count int sm_count; if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; @@ -380,7 +420,6 @@ struct DeviceHisto256 #endif - // Determine grid size for the multi-block kernel int multi_occupancy = multi_sm_occupancy * sm_count; int multi_tile_size = multi_block_dispatch_params.block_threads * 
multi_block_dispatch_params.items_per_thread; int multi_grid_size; @@ -402,42 +441,18 @@ struct DeviceHisto256 case GRID_MAPPING_DYNAMIC: // Prepare queue to distribute work dynamically - queue.Allocate(device_allocator); int num_tiles = (num_samples + multi_tile_size - 1) / multi_tile_size; - #ifndef __CUDA_ARCH__ - - // We're on the host, so prepare queue on device (because its faster than if we prepare it here) - if (stream_synchronous) CubLog("Invoking prepare_drain_kernel_ptr<<<1, 1, 0, %d>>>()\n", (int) stream); - prepare_drain_kernel_ptr<<<1, 1, 0, stream>>>(queue, num_samples); - - // Sync the stream on the host - if (stream_synchronous && CubDebug(error = cudaStreamSynchronize(stream))) break; - - #else - - // Prepare the queue here - queue.ResetDrain(num_samples); - - #endif - // Set MultiBlock grid size multi_grid_size = (num_tiles < multi_occupancy) ? num_tiles : // Not enough to fill the device with threadblocks multi_occupancy; // Fill the device with threadblocks - break; + break; }; - // Setup array wrapper for histogram channel output because we can't pass static arrays as kernel parameters - ArrayWrapper d_histo_wrapper; - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL]; - } - + // Bind textures if the iterator supports it #ifndef __CUDA_ARCH__ - // Host can bind texture if the iterator supports it if (CubDebug(error = BindIteratorTexture(d_samples))) break; #endif // __CUDA_ARCH__ @@ -445,7 +460,7 @@ struct DeviceHisto256 if (stream_synchronous) CubLog("Invoking multi_block_kernel_ptr<<<%d, %d, 0, %d>>>(), %d items per thread, %d SM occupancy\n", multi_grid_size, multi_block_dispatch_params.block_threads, (int) stream, multi_block_dispatch_params.items_per_thread, multi_sm_occupancy); - if (multi_grid_size == 1) + if ((multi_grid_size == 1) || (multi_block_dispatch_params.block_algorithm == GRID_HISTO_256_GLOBAL_ATOMIC)) { // A single pass will do multi_block_kernel_ptr<<>>( 
@@ -487,10 +502,10 @@ struct DeviceHisto256 if (stream_synchronous && CubDebug(error = cudaDeviceSynchronize())) break; #endif - if (stream_synchronous) CubLog("Invoking finalize_kernel_ptr<<<%d, %d, 0, %d>>>()\n", + if (stream_synchronous) CubLog("Invoking aggregate_kernel_ptr<<<%d, %d, 0, %d>>>()\n", ACTIVE_CHANNELS, 256, (int) stream); - finalize_kernel_ptr<<>>( + aggregate_kernel_ptr<<>>( d_block_histograms_linear, d_histo_wrapper, multi_grid_size); @@ -507,10 +522,15 @@ struct DeviceHisto256 while (0); // Free temporary storage allocation - if (d_block_histograms_linear) error = CubDebug(DeviceFree(d_block_histograms_linear, device_allocator)); + if (d_block_histograms_linear) + error = CubDebug(DeviceFree(d_block_histograms_linear, device_allocator)); // Free queue allocation - if (multi_block_dispatch_params.grid_mapping == GRID_MAPPING_DYNAMIC) error = CubDebug(queue.Free(device_allocator)); + if ((multi_block_dispatch_params.grid_mapping == GRID_MAPPING_DYNAMIC) || + (multi_block_dispatch_params.block_algorithm == GRID_HISTO_256_GLOBAL_ATOMIC)) + { + error = CubDebug(queue.Free(device_allocator)); + } // Unbind texture #ifndef __CUDA_ARCH__ @@ -526,18 +546,18 @@ struct DeviceHisto256 /** * \brief Computes a 256-bin device-wide histogram * - * \tparam BLOCK_ALGORITHM cub::BlockHisto256Algorithm enumerator specifying the underlying algorithm to use + * \tparam GRID_ALGORITHM cub::PersistentBlockHisto256Algorithm enumerator specifying the underlying algorithm to use * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) * \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). 
Must have a value type that is assignable to unsigned char * \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin */ template < - BlockHisto256Algorithm BLOCK_ALGORITHM, - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename InputIteratorRA, - typename HistoCounter> + PersistentBlockHisto256Algorithm GRID_ALGORITHM, + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, + typename HistoCounter> __host__ __device__ __forceinline__ static cudaError_t Dispatch( InputIteratorRA d_samples, ///< [in] Input samples to histogram @@ -551,7 +571,7 @@ struct DeviceHisto256 typedef int SizeT; // Tuning polices for the PTX architecture that will get dispatched to - typedef PtxDefaultPolicies PtxDefaultPolicies; + typedef PtxDefaultPolicies PtxDefaultPolicies; typedef typename PtxDefaultPolicies::MultiBlockPolicy MultiBlockPolicy; cudaError error = cudaSuccess; @@ -575,9 +595,9 @@ struct DeviceHisto256 #endif Dispatch( + InitHisto256Kernel, MultiBlockHisto256Kernel, - FinalizeHisto256Kernel, - ResetDrainKernel, + AggregateHisto256Kernel, multi_block_dispatch_params, d_samples, d_histograms, @@ -601,7 +621,9 @@ struct DeviceHisto256 //--------------------------------------------------------------------- /** - * \brief Computes a 256-bin device-wide histogram + * \brief Computes a 256-bin device-wide histogram. Uses fast block-sorting to compute the histogram. + * + * Delivers consistent throughput regardless of sample diversity. * * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). 
Must have a value type that is assignable to unsigned char * \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin @@ -618,12 +640,12 @@ struct DeviceHisto256 bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator. { - return Dispatch( + return Dispatch( d_samples, &d_histogram, num_samples, stream, stream_synchronous, device_allocator); } /** - * \brief Computes a 256-bin device-wide histogram. Uses atomic read-modify-write operations to compute the histogram. + * \brief Computes a 256-bin device-wide histogram. Uses shared-memory atomic read-modify-write operations to compute the histogram. * * Sample input having lower diversity cause performance to be degraded. * @@ -642,13 +664,40 @@ struct DeviceHisto256 bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator. { - return Dispatch( + return Dispatch( d_samples, &d_histogram, num_samples, stream, stream_synchronous, device_allocator); } /** - * \brief Computes a 256-bin device-wide histogram from multi-channel data. + * \brief Computes a 256-bin device-wide histogram. Uses global-memory atomic read-modify-write operations to compute the histogram. + * + * Sample input having lower diversity cause performance to be degraded. + * + * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). 
Must have a value type that is assignable to unsigned char + * \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin + */ + template < + typename InputIteratorRA, + typename HistoCounter> + __host__ __device__ __forceinline__ + static cudaError_t SingleChannelGlobalAtomic( + InputIteratorRA d_samples, ///< [in] Input samples + HistoCounter* d_histogram, ///< [out] Array of 256 counters of integral type \p HistoCounter. + int num_samples, ///< [in] Number of samples to process + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream-0. + bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. + DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator. + { + return Dispatch( + d_samples, &d_histogram, num_samples, stream, stream_synchronous, device_allocator); + } + + + /** + * \brief Computes a 256-bin device-wide histogram from multi-channel data. Uses fast block-sorting to compute the histogram. + * + * Delivers consistent throughput regardless of sample diversity. * * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) * \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed @@ -669,12 +718,13 @@ struct DeviceHisto256 bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator. 
{ - return Dispatch( + return Dispatch( d_samples, d_histograms, num_samples, stream, stream_synchronous, device_allocator); } + /** - * \brief Computes a 256-bin device-wide histogram from multi-channel data. Uses atomic read-modify-write operations to compute the histogram. + * \brief Computes a 256-bin device-wide histogram from multi-channel data. Uses shared-memory atomic read-modify-write operations to compute the histogram. * * Sample input having lower diversity cause performance to be degraded. * @@ -697,7 +747,36 @@ struct DeviceHisto256 bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator. { - return Dispatch( + return Dispatch( + d_samples, d_histograms, num_samples, stream, stream_synchronous, device_allocator); + } + + + /** + * \brief Computes a 256-bin device-wide histogram from multi-channel data. Uses global-memory atomic read-modify-write operations to compute the histogram. + * + * Sample input having lower diversity cause performance to be degraded. + * + * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). 
Must have a value type that is assignable to unsigned char + * \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin + */ + template < + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, + typename HistoCounter> + __host__ __device__ __forceinline__ + static cudaError_t MultiChannelGlobalAtomic( + InputIteratorRA d_samples, ///< [in] Input samples. (Channels, if any, are interleaved in "AOS" format) + HistoCounter *(&d_histograms)[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having 256 counters of integral type \p HistoCounter. + int num_samples, ///< [in] Number of samples to process + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream-0. + bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. + DeviceAllocator* device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator. + { + return Dispatch( d_samples, d_histograms, num_samples, stream, stream_synchronous, device_allocator); } diff --git a/cub/device/device_reduce.cuh b/cub/device/device_reduce.cuh index 31df6bce53..6e62879e16 100644 --- a/cub/device/device_reduce.cuh +++ b/cub/device/device_reduce.cuh @@ -37,8 +37,9 @@ #include #include -#include "tiles/tiles_reduce.cuh" +#include "persistent_block/persistent_block_reduce.cuh" #include "../util_allocator.cuh" +#include "../grid/grid_mapping.cuh" #include "../grid/grid_even_share.cuh" #include "../grid/grid_queue.cuh" @@ -60,12 +61,12 @@ namespace cub { * Multi-block reduction kernel entry point. Computes privatized reductions, one per thread block. 
*/ template < - typename TilesReducePolicy, ///< Tuning policy for cub::TilesReduce abstraction + typename PersistentBlockReducePolicy, ///< Tuning policy for cub::PersistentBlockReduce abstraction typename InputIteratorRA, ///< The random-access iterator type for input (may be a simple pointer type). typename OutputIteratorRA, ///< The random-access iterator type for output (may be a simple pointer type). typename SizeT, ///< Integral type used for global array indexing typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b) -__launch_bounds__ (TilesReducePolicy::BLOCK_THREADS, 1) +__launch_bounds__ (int(PersistentBlockReducePolicy::BLOCK_THREADS), 1) __global__ void MultiBlockReduceKernel( InputIteratorRA d_in, ///< [in] Input data to reduce OutputIteratorRA d_out, ///< [out] Output location for result @@ -74,26 +75,24 @@ __global__ void MultiBlockReduceKernel( GridQueue queue, ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks ReductionOp reduction_op) ///< [in] Binary reduction operator { - // Data type of input iterator + // Data type typedef typename std::iterator_traits::value_type T; - // Parameterize TilesReduce for the parallel execution context - typedef TilesReduce TilesReduceT; + // Thread block type for reducing input tiles + typedef PersistentBlockReduce PersistentBlockReduceT; - // Parameterize which mapping of tiles -> thread blocks we will use - typedef typename TilesReduceT::template Mapping Mapping; + // Block-wide aggregate + T block_aggregate; - // Declare shared memory for TilesReduce - __shared__ typename TilesReduceT::SmemStorage smem_storage; + // Shared memory storage + __shared__ typename PersistentBlockReduceT::SmemStorage smem_storage; - // Reduce tiles - T block_aggregate = Mapping::ProcessTiles( - smem_storage, - d_in, - num_items, - even_share, - queue, - reduction_op); + // Thread block instance + PersistentBlockReduceT tiles(smem_storage, d_in, 
reduction_op); + + // Consume tiles using thread block instance + GridMapping::ConsumeTilesFlagFirst( + tiles, num_items, even_share, queue, block_aggregate); // Output result if (threadIdx.x == 0) @@ -107,41 +106,41 @@ __global__ void MultiBlockReduceKernel( * Single-block reduction kernel entry point. */ template < - typename TilesReducePolicy, ///< Tuning policy for cub::TilesReduce abstraction + typename PersistentBlockReducePolicy, ///< Tuning policy for cub::PersistentBlockReduce abstraction typename InputIteratorRA, ///< The random-access iterator type for input (may be a simple pointer type). typename OutputIteratorRA, ///< The random-access iterator type for output (may be a simple pointer type). typename SizeT, ///< Integral type used for global array indexing typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b) -__launch_bounds__ (TilesReducePolicy::BLOCK_THREADS, 1) +__launch_bounds__ (int(PersistentBlockReducePolicy::BLOCK_THREADS), 1) __global__ void SingleBlockReduceKernel( InputIteratorRA d_in, ///< [in] Input data to reduce OutputIteratorRA d_out, ///< [out] Output location for result SizeT num_items, ///< [in] Total number of input data items ReductionOp reduction_op) ///< [in] Binary reduction operator { - // Data type of input iterator + // Data type typedef typename std::iterator_traits::value_type T; - // Parameterize TilesReduce for the parallel execution context - typedef TilesReduce TilesReduceT; + // Thread block type for reducing input tiles + typedef PersistentBlockReduce PersistentBlockReduceT; + + // Block-wide aggregate + T block_aggregate; + + // Shared memory storage + __shared__ typename PersistentBlockReduceT::SmemStorage smem_storage; - // Declare shared memory for TilesReduce - __shared__ typename TilesReduceT::SmemStorage smem_storage; + // Block abstraction for reducing tiles + PersistentBlockReduceT tiles(smem_storage, d_in, reduction_op); - // Reduce tiles - T 
block_aggregate = TilesReduceT::ProcessTilesEvenShare( - smem_storage, - d_in, - SizeT(0), - num_items, - reduction_op); + // Reduce input tiles + ConsumeTilesFlagFirst(tiles, 0, num_items, block_aggregate); // Output result if (threadIdx.x == 0) { d_out[blockIdx.x] = block_aggregate; } - } #endif // DOXYGEN_SHOULD_SKIP_THIS @@ -163,7 +162,7 @@ struct DeviceReduce { #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - /// Generic structure for encapsulating dispatch properties. Mirrors the constants within TilesReducePolicy. + /// Generic structure for encapsulating dispatch properties. Mirrors the constants within PersistentBlockReducePolicy. struct KernelDispachParams { // Policy fields @@ -178,16 +177,16 @@ struct DeviceReduce // Derived fields int tile_size; - template + template __host__ __device__ __forceinline__ void Init(int subscription_factor = 1) { - block_threads = TilesReducePolicy::BLOCK_THREADS; - items_per_thread = TilesReducePolicy::ITEMS_PER_THREAD; - vector_load_length = TilesReducePolicy::VECTOR_LOAD_LENGTH; - block_algorithm = TilesReducePolicy::BLOCK_ALGORITHM; - load_modifier = TilesReducePolicy::LOAD_MODIFIER; - grid_mapping = TilesReducePolicy::GRID_MAPPING; + block_threads = PersistentBlockReducePolicy::BLOCK_THREADS; + items_per_thread = PersistentBlockReducePolicy::ITEMS_PER_THREAD; + vector_load_length = PersistentBlockReducePolicy::VECTOR_LOAD_LENGTH; + block_algorithm = PersistentBlockReducePolicy::BLOCK_ALGORITHM; + load_modifier = PersistentBlockReducePolicy::LOAD_MODIFIER; + grid_mapping = PersistentBlockReducePolicy::GRID_MAPPING; this->subscription_factor = subscription_factor; tile_size = block_threads * items_per_thread; @@ -221,8 +220,8 @@ struct DeviceReduce struct TunedPolicies { // K20C: 182.1 @ 48M 32-bit T - typedef TilesReducePolicy<256, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; - typedef TilesReducePolicy<256, 16, 2, BLOCK_REDUCE_WARP_REDUCTIONS, PTX_LOAD_NONE, 
GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; + typedef PersistentBlockReducePolicy<256, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; + typedef PersistentBlockReducePolicy<256, 16, 2, BLOCK_REDUCE_WARP_REDUCTIONS, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; enum { SUBSCRIPTION_FACTOR = 4 }; }; @@ -231,8 +230,8 @@ struct DeviceReduce struct TunedPolicies { // GTX670: 154.0 @ 48M 32-bit T - typedef TilesReducePolicy<256, 2, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; - typedef TilesReducePolicy<256, 24, 4, BLOCK_REDUCE_WARP_REDUCTIONS, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; + typedef PersistentBlockReducePolicy<256, 2, 1, BLOCK_REDUCE_WARP_REDUCTIONS, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; + typedef PersistentBlockReducePolicy<256, 24, 4, BLOCK_REDUCE_WARP_REDUCTIONS, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; enum { SUBSCRIPTION_FACTOR = 1 }; }; @@ -241,8 +240,8 @@ struct DeviceReduce struct TunedPolicies { // GTX 580: 178.9 @ 48M 32-bit T - typedef TilesReducePolicy<128, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_DYNAMIC> MultiBlockPolicy; - typedef TilesReducePolicy<128, 4, 1, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; + typedef PersistentBlockReducePolicy<128, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_DYNAMIC> MultiBlockPolicy; + typedef PersistentBlockReducePolicy<128, 4, 1, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; enum { SUBSCRIPTION_FACTOR = 1 }; }; @@ -250,8 +249,8 @@ struct DeviceReduce template struct TunedPolicies { - typedef TilesReducePolicy<128, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; - typedef TilesReducePolicy<32, 4, 4, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; + typedef PersistentBlockReducePolicy<128, 8, 2, 
BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; + typedef PersistentBlockReducePolicy<32, 4, 4, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; enum { SUBSCRIPTION_FACTOR = 1 }; }; @@ -259,8 +258,8 @@ struct DeviceReduce template struct TunedPolicies { - typedef TilesReducePolicy<128, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; - typedef TilesReducePolicy<32, 4, 4, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; + typedef PersistentBlockReducePolicy<128, 8, 2, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> MultiBlockPolicy; + typedef PersistentBlockReducePolicy<32, 4, 4, BLOCK_REDUCE_RAKING, PTX_LOAD_NONE, GRID_MAPPING_EVEN_SHARE> SingleBlockPolicy; enum { SUBSCRIPTION_FACTOR = 1 }; }; @@ -354,7 +353,7 @@ struct DeviceReduce cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream-0. bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. { - #if !CUB_CNP_ENABLED + #ifndef CUB_RUNTIME_ENABLED // Kernel launch not supported from this device return CubDebug(cudaErrorInvalidConfiguration); @@ -396,7 +395,7 @@ struct DeviceReduce template < typename MultiBlockReduceKernelPtr, ///< Function type of cub::MultiBlockReduceKernel typename ReduceSingleKernelPtr, ///< Function type of cub::SingleBlockReduceKernel - typename ResetDrainKernelPtr, ///< Function type of cub::ResetDrainKernel + typename ResetDrainKernelPtr, ///< Function type of cub::ResetDrainKernel typename InputIteratorRA, ///< The random-access iterator type for input (may be a simple pointer type). typename OutputIteratorRA, ///< The random-access iterator type for output (may be a simple pointer type). 
typename SizeT, ///< Integral type used for global array indexing @@ -405,7 +404,7 @@ struct DeviceReduce static cudaError_t DispatchIterative( MultiBlockReduceKernelPtr multi_block_kernel, ///< [in] Kernel function pointer to parameterization of cub::MultiBlockReduceKernel ReduceSingleKernelPtr single_block_kernel, ///< [in] Kernel function pointer to parameterization of cub::SingleBlockReduceKernel - ResetDrainKernelPtr prepare_drain_kernel, ///< [in] Kernel function pointer to parameterization of cub::ResetDrainKernel + ResetDrainKernelPtr prepare_drain_kernel, ///< [in] Kernel function pointer to parameterization of cub::ResetDrainKernel KernelDispachParams &multi_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p multi_block_kernel_ptr was compiled for KernelDispachParams &single_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p single_block_kernel was compiled for InputIteratorRA d_in, ///< [in] Input data to reduce @@ -416,7 +415,7 @@ struct DeviceReduce bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. DeviceAllocator *device_allocator = DefaultDeviceAllocator()) ///< [in] [optional] Allocator for allocating and freeing device memory. Default is provided by DefaultDeviceAllocator. 
{ - #if !CUB_CNP_ENABLED + #ifndef CUB_RUNTIME_ENABLED // Kernel launch not supported from this device return CubDebug(cudaErrorInvalidConfiguration); @@ -585,7 +584,7 @@ struct DeviceReduce KernelDispachParams &single_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p single_block_kernel was compiled for InputIteratorRA d_in, ///< [in] Input data to reduce OutputIteratorRA d_out, ///< [out] Output location for result - SizeT num_items, ///< [in] Number of items to reduce + SizeT num_items, ///< [in] Number of items to reduce ReductionOp reduction_op, ///< [in] Binary reduction operator cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream-0. bool stream_synchronous = false, ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. diff --git a/cub/device/persistent_block/persistent_block_histo_256.cuh b/cub/device/persistent_block/persistent_block_histo_256.cuh new file mode 100644 index 0000000000..8c4d93a1d1 --- /dev/null +++ b/cub/device/persistent_block/persistent_block_histo_256.cuh @@ -0,0 +1,810 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::PersistentBlockHisto256 implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide 256-bin histogram. 
+ */ + +#pragma once + +#include + +#include "../../util_arch.cuh" +#include "../../block/block_load.cuh" +#include "../../block/block_histo_256.cuh" +#include "../../block/block_radix_sort.cuh" +#include "../../block/block_discontinuity.cuh" +#include "../../grid/grid_mapping.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_vector.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + +/** + * \brief PersistentBlockHisto256Algorithm enumerates alternative algorithms for the parallel construction of 8b histograms. + */ +enum PersistentBlockHisto256Algorithm +{ + + /** + * \par Overview + * A two-kernel approach in which: + * -# Thread blocks in the first kernel aggregate their own privatized + * histograms using block-wide sorting (see BlockHisto256Algorithm::BLOCK_HISTO_256_SORT). + * -# A single thread block in the second kernel reduces them into the output histogram(s). + * + * \par Performance Considerations + * Delivers consistent throughput regardless of sample bin distribution. + */ + GRID_HISTO_256_SORT, + + + /** + * \par Overview + * A two-kernel approach in which: + * -# Thread blocks in the first kernel aggregate their own privatized + * histograms using shared-memory \p atomicAdd(). + * -# A single thread block in the second kernel reduces them into the + * output histogram(s). + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. 
+ */ + GRID_HISTO_256_SHARED_ATOMIC, + + + /** + * \par Overview + * A single-kernel approach in which thread blocks update the output histogram(s) directly + * using global-memory \p atomicAdd(). + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. + */ + GRID_HISTO_256_GLOBAL_ATOMIC, + +}; + + +/****************************************************************************** + * Tuning policy + ******************************************************************************/ + +/** + * Tuning policy for PersistentBlockHisto256 + */ +template < + int _BLOCK_THREADS, + int _ITEMS_PER_THREAD, + PersistentBlockHisto256Algorithm _GRID_ALGORITHM, + GridMappingStrategy _GRID_MAPPING, + int _SM_OCCUPANCY> +struct PersistentBlockHisto256Policy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, + SM_OCCUPANCY = _SM_OCCUPANCY, + }; + + static const PersistentBlockHisto256Algorithm GRID_ALGORITHM = _GRID_ALGORITHM; + static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; +}; + + + +/****************************************************************************** + * PersistentBlockHisto256 + ******************************************************************************/ + +/** + * \brief implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide 256-bin histogram. + */ +template < + typename PersistentBlockHisto256Policy, ///< Tuning policy + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). 
Must have a value type that is assignable to unsigned char + typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin + typename SizeT, ///< Integer type for offsets + PersistentBlockHisto256Algorithm GRID_ALGORITHM = PersistentBlockHisto256Policy::GRID_ALGORITHM> +struct PersistentBlockHisto256; + + +/** + * Specialized for GRID_HISTO_256_GLOBAL_ATOMIC + */ +template < + typename PersistentBlockHisto256Policy, ///< Tuning policy + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char + typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin + typename SizeT> ///< Integer type for offsets +struct PersistentBlockHisto256 +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Constants + enum + { + BLOCK_THREADS = PersistentBlockHisto256Policy::BLOCK_THREADS, + ITEMS_PER_THREAD = PersistentBlockHisto256Policy::ITEMS_PER_THREAD, + TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, + }; + + // Shared memory type required by this thread block + struct SmemStorage {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + /// Reference to smem_storage + SmemStorage &smem_storage; + + /// Reference to output histograms + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; + + /// Input data to reduce + InputIteratorRA d_in; + + + //--------------------------------------------------------------------- + 
// Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ PersistentBlockHisto256( + SmemStorage &smem_storage, ///< Reference to smem_storage + InputIteratorRA d_in, ///< Input data to reduce + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) : ///< Reference to output histograms + smem_storage(smem_storage), + d_in(d_in), + d_out_histograms(d_out_histograms) + {} + + + /** + * The number of items processed per "tile" + */ + __device__ __forceinline__ int TileItems() + { + return TILE_ITEMS; + } + + + /** + * Process a single tile. + */ + __device__ __forceinline__ void ConsumeTile( + bool &sync_after, + SizeT block_offset, + int num_valid) + { + if (num_valid < TILE_ITEMS) + { + // Only a partially-full tile of samples to read and composite + int bounds = num_valid - (threadIdx.x * CHANNELS); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) + { + unsigned char item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + atomicAdd(d_out_histograms[CHANNEL] + item, 1); + } + } + } + + } + else + { + // Full tile of samples to read and composite + unsigned char items[ITEMS_PER_THREAD][CHANNELS]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + } + } + } + + __threadfence_block(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < 
ACTIVE_CHANNELS) + { + atomicAdd(d_out_histograms[CHANNEL] + items[ITEM][CHANNEL], 1); + } + } + } + } + + // No need to sync after processing this tile to ensure smem coherence + sync_after = false; + } + + + /** + * Finalize the computation. + */ + __device__ __forceinline__ void Finalize( + int dummy_result) + {} +}; + + + + +/** + * Specialized for GRID_HISTO_256_SHARED_ATOMIC + */ +template < + typename PersistentBlockHisto256Policy, ///< Tuning policy + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char + typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin + typename SizeT> ///< Integer type for offsets +struct PersistentBlockHisto256 +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Constants + enum + { + BLOCK_THREADS = PersistentBlockHisto256Policy::BLOCK_THREADS, + ITEMS_PER_THREAD = PersistentBlockHisto256Policy::ITEMS_PER_THREAD, + TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, + }; + + // Shared memory type required by this thread block + struct SmemStorage + { + HistoCounter histograms[ACTIVE_CHANNELS][256]; + }; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + /// Reference to smem_storage + SmemStorage &smem_storage; + + /// Reference to output histograms + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; + + /// Input data to reduce + InputIteratorRA d_in; + + + 
//--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ PersistentBlockHisto256( + SmemStorage &smem_storage, ///< Reference to smem_storage + InputIteratorRA d_in, ///< Input data to reduce + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) : ///< Reference to output histograms + smem_storage(smem_storage), + d_in(d_in), + d_out_histograms(d_out_histograms) + { + // Initialize histogram bin counts to zeros + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= 256; histo_offset += BLOCK_THREADS) + { + smem_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0; + } + // Finish up with guarded initialization if necessary + if ((histo_offset < BLOCK_THREADS) && (histo_offset + threadIdx.x < 256)) + { + smem_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0; + } + } + } + + + /** + * The number of items processed per "tile" + */ + __device__ __forceinline__ int TileItems() + { + return TILE_ITEMS; + } + + + /** + * Process a single tile. 
+ */ + __device__ __forceinline__ void ConsumeTile( + bool &sync_after, + SizeT block_offset, + int num_valid) + { + if (num_valid < TILE_ITEMS) + { + // Only a partially-full tile of samples to read and composite + int bounds = num_valid - (threadIdx.x * CHANNELS); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) + { + unsigned char item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + atomicAdd(smem_storage.histograms[CHANNEL] + item, 1); + } + } + } + + } + else + { + // Full tile of samples to read and composite + unsigned char items[ITEMS_PER_THREAD][CHANNELS]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + } + } + } + + __threadfence_block(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + atomicAdd(smem_storage.histograms[CHANNEL] + items[ITEM][CHANNEL], 1); + } + } + } + } + + // No need to sync after processing this tile to ensure smem coherence + sync_after = false; + } + + + /** + * Finalize the computation. 
+ */ + __device__ __forceinline__ void Finalize( + int dummy_result) + { + // Barrier to ensure shared memory histograms are coherent + __syncthreads(); + + // Copy shared memory histograms to output + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + int channel_offset = (blockIdx.x * 256); + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= 256; histo_offset += BLOCK_THREADS) + { + d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = smem_storage.histograms[CHANNEL][histo_offset + threadIdx.x]; + } + // Finish up with guarded initialization if necessary + if ((histo_offset < BLOCK_THREADS) && (histo_offset + threadIdx.x < 256)) + { + d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = smem_storage.histograms[CHANNEL][histo_offset + threadIdx.x]; + } + } + } +}; + + +/** + * Specialized for GRID_HISTO_256_SORT + */ +template < + typename PersistentBlockHisto256Policy, ///< Tuning policy + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). 
Must have a value type that is assignable to unsigned char + typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin + typename SizeT> ///< Integer type for offsets +struct PersistentBlockHisto256 +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Constants + enum + { + BLOCK_THREADS = PersistentBlockHisto256Policy::BLOCK_THREADS, + ITEMS_PER_THREAD = PersistentBlockHisto256Policy::ITEMS_PER_THREAD, + TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, + + STRIPED_COUNTERS_PER_THREAD = (256 + BLOCK_THREADS - 1) / BLOCK_THREADS, + }; + + // Parameterize BlockRadixSort type for our thread block + typedef BlockRadixSort BlockRadixSortT; + + // Parameterize BlockDiscontinuity type for our thread block + typedef BlockDiscontinuity BlockDiscontinuityT; + + // Shared memory type required by this thread block + union SmemStorage + { + // Storage for sorting bin values + typename BlockRadixSortT::SmemStorage sort_storage; + + struct + { + // Storage for detecting discontinuities in the tile of sorted bin values + typename BlockDiscontinuityT::SmemStorage discont_storage; + + // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values + unsigned int run_begin[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD]; + unsigned int run_end[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD]; + }; + }; + + + // Discontinuity functor + struct DiscontinuityOp + { + // Reference to smem_storage + SmemStorage &smem_storage; + + // Constructor + __device__ __forceinline__ DiscontinuityOp(SmemStorage &smem_storage) : smem_storage(smem_storage) {} + + // Discontinuity predicate + __device__ __forceinline__ bool operator()(const unsigned char &a, const unsigned char &b, unsigned int b_index) + { + if (a != b) + { + // Note the begin/end offsets in shared storage + 
smem_storage.run_begin[b] = b_index; + smem_storage.run_end[a] = b_index; + + return true; + } + else + { + return false; + } + } + }; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + /// Reference to smem_storage + SmemStorage &smem_storage; + + /// Histogram counters striped across threads + HistoCounter thread_counters[ACTIVE_CHANNELS][STRIPED_COUNTERS_PER_THREAD]; + + /// Reference to output histograms + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; + + /// Input data to reduce + InputIteratorRA d_in; + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ PersistentBlockHisto256( + SmemStorage &smem_storage, ///< Reference to smem_storage + InputIteratorRA d_in, ///< Input data to reduce + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) : ///< Reference to output histograms + smem_storage(smem_storage), + d_in(d_in), + d_out_histograms(d_out_histograms) + { + // Initialize histogram counters striped across threads + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + #pragma unroll + for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) + { + thread_counters[CHANNEL][COUNTER] = 0; + } + } + } + + + /** + * The number of items processed per "tile" + */ + __device__ __forceinline__ int TileItems() + { + return TILE_ITEMS; + } + + + /** + * Composite a tile of input items + */ + __device__ __forceinline__ void Composite( + unsigned char (&items)[ITEMS_PER_THREAD], ///< Tile of samples + HistoCounter thread_counters[STRIPED_COUNTERS_PER_THREAD]) ///< Histogram counters striped across threads + { + // Sort bytes in blocked arrangement + BlockRadixSortT::SortBlocked(smem_storage.sort_storage, items); + + 
__syncthreads(); + + // Initialize the shared memory's run_begin and run_end for each bin + #pragma unroll + for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) + { + smem_storage.run_begin[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS; + smem_storage.run_end[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS; + } + + __syncthreads(); + + // Note the begin/end run offsets of bin runs in the sorted tile + int flags[ITEMS_PER_THREAD]; // unused + DiscontinuityOp flag_op(smem_storage); + BlockDiscontinuityT::Flag(smem_storage.discont_storage, items, flag_op, flags); + + // Update begin for first item + if (threadIdx.x == 0) smem_storage.run_begin[items[0]] = 0; + + __syncthreads(); + + // Composite into histogram + // Accumulate each bin's run-length into our striped counters + #pragma unroll + for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) + { + int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x; + thread_counters[COUNTER] += smem_storage.run_end[bin] - smem_storage.run_begin[bin]; + } + } + + + /** + * Process one channel within a tile. + */ + __device__ __forceinline__ void ConsumeTileChannel( + int channel, + SizeT block_offset, + int num_valid) + { + // Load items in striped fashion + if (num_valid < TILE_ITEMS) + { + // Only a partially-full tile of samples to read and composite + unsigned char items[ITEMS_PER_THREAD]; + + // Assign our tid as the bin for out-of-bounds items (to give an even distribution), and keep track of how many oob items to subtract out later + int bounds = (num_valid - (threadIdx.x * CHANNELS)); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = ((ITEM * BLOCK_THREADS * CHANNELS) < bounds) ?
+ d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)] : + 0; + } + + // Composite our histogram data + Composite(items, thread_counters[channel]); + + __syncthreads(); + + // Correct the overcounting in the zero-bin from invalid (out-of-bounds) items + if (threadIdx.x == 0) + { + int extra = (TILE_ITEMS - num_valid) / CHANNELS; + thread_counters[channel][0] -= extra; + } + } + else + { + // Full tile of samples to read and composite + unsigned char items[ITEMS_PER_THREAD]; + + // Unguarded loads + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)]; + } + + // Composite our histogram data + Composite(items, thread_counters[channel]); + } + } + + + /** + * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Inductive step. + */ + template + struct IterateChannels + { + /** + * Process one channel within a tile. + */ + static __device__ __forceinline__ void ConsumeTileChannel( + PersistentBlockHisto256 *persistent_block_histo, + SizeT block_offset, + int num_valid) + { + __syncthreads(); + + persistent_block_histo->ConsumeTileChannel(CHANNEL, block_offset, num_valid); + + IterateChannels::ConsumeTileChannel(persistent_block_histo, block_offset, num_valid); + } + }; + + + /** + * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Base step. + */ + template + struct IterateChannels + { + static __device__ __forceinline__ void ConsumeTileChannel(PersistentBlockHisto256 *persistent_block_histo, SizeT block_offset, int num_valid) {} + }; + + + /** + * Process a single tile. 
+ * + * We take several passes through the tile in this variant, extracting the samples for one channel at a time + */ + __device__ __forceinline__ void ConsumeTile( + bool &sync_after, + SizeT block_offset, + int num_valid) + { + // First channel + ConsumeTileChannel(0, block_offset, num_valid); + + // Iterate through remaining channels + IterateChannels<1, ACTIVE_CHANNELS>::ConsumeTileChannel(this, block_offset, num_valid); + + // Need to sync after processing this tile to ensure smem coherence + sync_after = true; + } + + + /** + * Finalize the computation. + */ + __device__ __forceinline__ void Finalize( + int dummy_result) + { + // Copy counters striped across threads into the histogram output + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + int channel_offset = (blockIdx.x * 256); + + #pragma unroll + for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) + { + int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x; + + if ((STRIPED_COUNTERS_PER_THREAD * BLOCK_THREADS == 256) || (bin < 256)) + { + d_out_histograms[CHANNEL][channel_offset + bin] = thread_counters[CHANNEL][COUNTER]; + } + } + } + } +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cub/device/persistent_block/persistent_block_reduce.cuh b/cub/device/persistent_block/persistent_block_reduce.cuh new file mode 100644 index 0000000000..9dbc7a3f15 --- /dev/null +++ b/cub/device/persistent_block/persistent_block_reduce.cuh @@ -0,0 +1,247 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::PersistentBlockReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction. 
+ + */ + +#pragma once + +#include <iterator> + +#include "../../grid/grid_mapping.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../block/block_load.cuh" +#include "../../block/block_reduce.cuh" +#include "../../util_vector.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * Tuning policy for PersistentBlockReduce + */ +template < + int _BLOCK_THREADS, + int _ITEMS_PER_THREAD, + int _VECTOR_LOAD_LENGTH, + BlockReduceAlgorithm _BLOCK_ALGORITHM, + PtxLoadModifier _LOAD_MODIFIER, + GridMappingStrategy _GRID_MAPPING> +struct PersistentBlockReducePolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, + VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, + }; + + static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; + static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; + static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; +}; + + +/** + * \brief PersistentBlockReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
+ */ +template < + typename PersistentBlockReducePolicy, + typename InputIteratorRA, + typename SizeT, + typename ReductionOp> +struct PersistentBlockReduce +{ + + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; // Type of input iterator + typedef VectorHelper<T, PersistentBlockReducePolicy::VECTOR_LOAD_LENGTH> VecHelper; // Helper type for vectorizing loads of T + typedef typename VecHelper::Type VectorT; // Vector of T + + // Constants + enum + { + BLOCK_THREADS = PersistentBlockReducePolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = PersistentBlockReducePolicy::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + VECTOR_LOAD_LENGTH = PersistentBlockReducePolicy::VECTOR_LOAD_LENGTH, + + // Can vectorize according to the policy if the input iterator is a native pointer to a built-in primitive + CAN_VECTORIZE = (PersistentBlockReducePolicy::VECTOR_LOAD_LENGTH > 1) && + (IsPointer<InputIteratorRA>::VALUE) && + (VecHelper::BUILT_IN), + + }; + + static const BlockReduceAlgorithm BLOCK_ALGORITHM = PersistentBlockReducePolicy::BLOCK_ALGORITHM; + + // Parameterized BlockReduce primitive + typedef BlockReduce<T, BLOCK_THREADS, BLOCK_ALGORITHM> BlockReduceT; + + // Shared memory type required by this thread block + typedef typename BlockReduceT::SmemStorage SmemStorage; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + T thread_aggregate; ///< Each thread's partial reduction + SmemStorage& smem_storage; ///< Reference to smem_storage + InputIteratorRA d_in; ///< Input data to reduce + ReductionOp reduction_op; ///< Binary reduction operator + int first_tile_size; ///< Size of first tile consumed + bool input_aligned; ///< Whether or not input is vector-aligned + + + //--------------------------------------------------------------------- + // Interface +
//--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ PersistentBlockReduce( + SmemStorage& smem_storage, ///< Reference to smem_storage + InputIteratorRA d_in, ///< Input data to reduce + ReductionOp reduction_op) : ///< Binary reduction operator + smem_storage(smem_storage), + d_in(d_in), + reduction_op(reduction_op), + first_tile_size(TILE_ITEMS), + input_aligned(CAN_VECTORIZE && ((size_t(d_in) & (sizeof(VectorT) - 1)) == 0)){} + + + /** + * The number of items processed per "tile" + */ + __device__ __forceinline__ int TileItems() + { + return TILE_ITEMS; + } + + + /** + * Process a single tile. + * + * Each thread reduces only the values it loads. If \p FIRST_TILE, this + * partial reduction is stored into \p thread_aggregate. Otherwise it is + * accumulated into \p thread_aggregate. + */ + __device__ __forceinline__ void ConsumeTile( + bool &sync_after, + SizeT block_offset, + int num_valid, + bool first_tile) + { + if (num_valid < TILE_ITEMS) + { + // Our first tile is a partial tile size + if (first_tile) first_tile_size = num_valid; + + // Partial tile + int thread_offset = threadIdx.x; + + if ((first_tile) && (thread_offset < num_valid)) + { + thread_aggregate = ThreadLoad(d_in + block_offset + thread_offset); + thread_offset += BLOCK_THREADS; + } + + while (thread_offset < num_valid) + { + T item = ThreadLoad(d_in + block_offset + thread_offset); + thread_aggregate = reduction_op(thread_aggregate, item); + thread_offset += BLOCK_THREADS; + } + } + else + { + T items[ITEMS_PER_THREAD]; + + // Load full tile + if (input_aligned) + { + // Alias items as an array of VectorT and load it in striped fashion + BlockLoadDirectStriped( + reinterpret_cast(d_in + block_offset), + reinterpret_cast(items)); + } + else + { + // Load items in striped fashion + BlockLoadDirectStriped(d_in + block_offset, items); + } + + // Prevent hoisting + __threadfence_block(); + + // Reduce items within each 
thread + T partial = ThreadReduce(items, reduction_op); + + // Update|assign the thread's running aggregate + thread_aggregate = (first_tile) ? + partial : + reduction_op(thread_aggregate, partial); + } + + // No synchronization needed after tile processing + sync_after = false; + } + + + /** + * Finalize the computation. + */ + __device__ __forceinline__ void Finalize( + T& block_aggregate) + { + // Cooperative reduction across the thread block (guarded reduction if our first tile was a partial tile) + block_aggregate = (first_tile_size < TILE_ITEMS) ? + BlockReduceT::Reduce(smem_storage, thread_aggregate, reduction_op, first_tile_size) : + BlockReduceT::Reduce(smem_storage, thread_aggregate, reduction_op); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/cub/device/tiles/tiles_scan.cuh b/cub/device/persistent_block/persistent_block_scan.cuh similarity index 88% rename from cub/device/tiles/tiles_scan.cuh rename to cub/device/persistent_block/persistent_block_scan.cuh index 9b92a7363d..a6ff677983 100644 --- a/cub/device/tiles/tiles_scan.cuh +++ b/cub/device/persistent_block/persistent_block_scan.cuh @@ -28,7 +28,7 @@ /** * \file - * cub::TilesScan implements an abstraction of CUDA thread blocks for + * cub::PersistentBlockScan implements an abstraction of CUDA thread blocks for * participating in device-wide prefix scan. */ @@ -61,7 +61,7 @@ enum /** - * Tuning policy for TilesScan + * Tuning policy for PersistentBlockScan */ template < int _BLOCK_THREADS, @@ -69,7 +69,7 @@ template < BlockLoadPolicy _LOAD_POLICY, BlockStorePolicy _STORE_POLICY, BlockScanAlgorithm _SCAN_ALGORITHM> -struct TilesScanPolicy +struct PersistentBlockScanPolicy { enum { @@ -87,15 +87,15 @@ struct TilesScanPolicy /** - * \brief TilesScan implements an abstraction of CUDA thread blocks for + * \brief PersistentBlockScan implements an abstraction of CUDA thread blocks for * participating in device-wide reduction. 
*/ template < - typename TilesScanPolicy, + typename PersistentBlockScanPolicy, typename InputIteratorRA, typename OutputIteratorRA, typename SizeT> -class TilesScan +class PersistentBlockScan { public: @@ -107,12 +107,12 @@ public: typedef typename std::iterator_traits::value_type T; // Data type of block-signaling flag - typedef typename TilesScanPolicy::BlockFlag BlockFlag; + typedef typename PersistentBlockScanPolicy::BlockFlag BlockFlag; // Constants enum { - TILE_ITEMS = TilesScanPolicy::BLOCK_THREADS * TilesScanPolicy::ITEMS_PER_THREAD, + TILE_ITEMS = PersistentBlockScanPolicy::BLOCK_THREADS * PersistentBlockScanPolicy::ITEMS_PER_THREAD, }; struct Signal @@ -146,22 +146,22 @@ public: // Parameterized block load typedef BlockLoad< InputIteratorRA, - TilesScanPolicy::BLOCK_THREADS, - TilesScanPolicy::ITEMS_PER_THREAD, - TilesScanPolicy::LOAD_POLICY> BlockLoadT; + PersistentBlockScanPolicy::BLOCK_THREADS, + PersistentBlockScanPolicy::ITEMS_PER_THREAD, + PersistentBlockScanPolicy::LOAD_POLICY> BlockLoadT; // Parameterized block store typedef BlockStore< OutputIteratorRA, - TilesScanPolicy::BLOCK_THREADS, - TilesScanPolicy::ITEMS_PER_THREAD, - TilesScanPolicy::STORE_POLICY> BlockStoreT; + PersistentBlockScanPolicy::BLOCK_THREADS, + PersistentBlockScanPolicy::ITEMS_PER_THREAD, + PersistentBlockScanPolicy::STORE_POLICY> BlockStoreT; // Parameterized block scan typedef BlockScan< T, - TilesScanPolicy::BLOCK_THREADS, - TilesScanPolicy::SCAN_ALGORITHM> BlockScanT; + PersistentBlockScanPolicy::BLOCK_THREADS, + PersistentBlockScanPolicy::SCAN_ALGORITHM> BlockScanT; // Parameterized warp reduce typedef WarpReduce WarpReduceT; @@ -270,7 +270,7 @@ public: ScanOp &scan_op, T &thread_aggregate) { - T items[TilesScanPolicy::ITEMS_PER_THREAD]; + T items[PersistentBlockScanPolicy::ITEMS_PER_THREAD]; BlockLoadT::Load(smem_storage.load, d_in + block_offset, items); @@ -392,7 +392,7 @@ public: * The return value is undefined in threads other than thread0. 
*/ template - static __device__ __forceinline__ T ProcessTilesEvenShare( + static __device__ __forceinline__ T ProcessPersistentBlockEvenShare( SmemStorage &smem_storage, InputIteratorRA d_in, SizeT block_offset, @@ -438,7 +438,7 @@ public: * The return value is undefined in threads other than thread0. */ template - static __device__ __forceinline__ T ProcessTilesDynamic( + static __device__ __forceinline__ T ProcessPersistentBlockDynamic( SmemStorage &smem_storage, InputIteratorRA d_in, SizeT num_items, @@ -505,45 +505,6 @@ public: } } - - /** - * \brief Consumes input tiles according to TilesScanPolicy::GRID_MAPPING, computing a threadblock-wide reduction for thread0 using the specified binary reduction functor. - * - * The return value is undefined in threads other than thread0. - */ - template - static __device__ __forceinline__ T ProcessTiles( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT num_items, - GridEvenShare &even_share, - GridQueue &queue, - ScanOp &scan_op) - { - if (TilesScanPolicy::GRID_MAPPING == GRID_MAPPING_EVEN_SHARE) - { - // Even share - even_share.BlockInit(); - - return ProcessTilesEvenShare( - smem_storage, - d_in, - even_share.block_offset, - even_share.block_oob, - scan_op); - } - else - { - // Dynamically dequeue - return ProcessTilesDynamic( - smem_storage, - d_in, - num_items, - queue, - scan_op); - } - } - }; diff --git a/cub/device/tiles/tiles_histo_256.cuh b/cub/device/tiles/tiles_histo_256.cuh deleted file mode 100644 index 94623db89f..0000000000 --- a/cub/device/tiles/tiles_histo_256.cuh +++ /dev/null @@ -1,499 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::TilesHisto256 implements an abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide 256-bin histogram. 
- */ - -#pragma once - -#include - -#include "../../grid/grid_mapping.cuh" -#include "../../grid/grid_even_share.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../block/block_load.cuh" -#include "../../block/block_histo_256.cuh" -#include "../../util_vector.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - - -/****************************************************************************** - * Tuning policy - ******************************************************************************/ - -/** - * Tuning policy for TilesHisto256 - */ -template < - int _BLOCK_THREADS, - int _ITEMS_PER_THREAD, - BlockHisto256Algorithm _BLOCK_ALGORITHM, - GridMappingStrategy _GRID_MAPPING> -struct TilesHisto256Policy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - }; - - static const BlockHisto256Algorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; - static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; -}; - - - -/****************************************************************************** - * TilesHisto256 - ******************************************************************************/ - -/** - * \brief TilesHisto256 implements an abstraction of CUDA thread blocks for participating in device-wide histogram. 
- */ -template < - typename TilesHisto256Policy, ///< Tuning policy - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) - typename SizeT> ///< Integer type for offsets -class TilesHisto256 -{ -private: - - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Constants - enum - { - BLOCK_THREADS = TilesHisto256Policy::BLOCK_THREADS, - ITEMS_PER_THREAD = TilesHisto256Policy::ITEMS_PER_THREAD, - TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, - }; - - static const BlockHisto256Algorithm BLOCK_ALGORITHM = TilesHisto256Policy::BLOCK_ALGORITHM; - - // Parameterized BlockHisto256 primitive - typedef BlockHisto256 BlockHisto256T; - - // Shared memory type for this threadblock - struct _SmemStorage - { - SizeT block_offset; // Location where to dequeue input for dynamic operation - typename BlockHisto256T::SmemStorage block_histo; // Smem needed for cooperative histogramming - }; - -public: - - /// \smemstorage{TilesHisto256} - typedef _SmemStorage SmemStorage; - -private: - - //--------------------------------------------------------------------- - // Utility operations - //--------------------------------------------------------------------- - - /** - * Channel-oriented (one channel at a time) - */ - template < - BlockHisto256Algorithm _BLOCK_ALGORITHM, - bool CHANNEL_ORIENTED = (_BLOCK_ALGORITHM == BLOCK_BYTE_HISTO_SORT) > - struct TilesHisto256Internal - { - /** - * Process one channel within a tile. 
- */ - template < - typename InputIteratorRA, - typename HistoCounter, - int ACTIVE_CHANNELS> - static __device__ __forceinline__ void ConsumeTileChannel( - SmemStorage &smem_storage, - int channel, - InputIteratorRA d_in, - SizeT block_offset, - HistoCounter (&histograms)[ACTIVE_CHANNELS][256], - const int &guarded_items = TILE_ITEMS) - { - // Load items in striped fashion - if (guarded_items < TILE_ITEMS) - { - unsigned char items[ITEMS_PER_THREAD]; - - // Assign our tid as the bin for out-of-bounds items (to give an even distribution), and keep track of how oob items to subtract out later - int bounds = (guarded_items - (threadIdx.x * CHANNELS)); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = ((ITEM * BLOCK_THREADS * CHANNELS) < bounds) ? - d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)] : - 0; - } - - // Composite our histogram data - BlockHisto256T::Composite(smem_storage.block_histo, items, histograms[channel]); - - __syncthreads(); - - if (threadIdx.x == 0) - { - int extra = (TILE_ITEMS - guarded_items) / CHANNELS; - histograms[channel][0] -= extra; - } - } - else - { - unsigned char items[ITEMS_PER_THREAD]; - - // Unguarded loads - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)]; - } - - // Composite our histogram data - BlockHisto256T::Composite(smem_storage.block_histo, items, histograms[channel]); - } - } - - - /** - * Process one tile. 
- */ - template < - typename InputIteratorRA, - typename HistoCounter, - int ACTIVE_CHANNELS> - static __device__ __forceinline__ void ConsumeTile( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT block_offset, - HistoCounter (&histograms)[ACTIVE_CHANNELS][256], - const int &guarded_items = TILE_ITEMS) - { - // We take several passes through the tile in this variant, extracting the samples for one channel at a time - - // First channel - ConsumeTileChannel(smem_storage, 0, d_in, block_offset, histograms, guarded_items); - - // Iterate through remaining channels - #pragma unroll - for (int CHANNEL = 1; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - __syncthreads(); - - ConsumeTileChannel(smem_storage, CHANNEL, d_in, block_offset, histograms, guarded_items); - } - } - }; - - - - /** - * BLOCK_BYTE_HISTO_ATOMIC algorithmic variant - */ - template - struct TilesHisto256Internal<_BLOCK_ALGORITHM, false> - { - /** - * Process one tile. - */ - template < - typename InputIteratorRA, - typename HistoCounter, - int ACTIVE_CHANNELS> - static __device__ __forceinline__ void ConsumeTile( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT block_offset, - HistoCounter (&histograms)[ACTIVE_CHANNELS][256], - const int &guarded_items = TILE_ITEMS) - { - - if (guarded_items < TILE_ITEMS) - { - int bounds = guarded_items - (threadIdx.x * CHANNELS); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) - { - unsigned char item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; - atomicAdd(histograms[CHANNEL] + item, 1); - } - } - } - - } - else - { - unsigned char items[ITEMS_PER_THREAD][CHANNELS]; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - #pragma unroll - for (int 
CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - if (CHANNEL < ACTIVE_CHANNELS) - { - items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; - } - } - } - - __threadfence_block(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - if (CHANNEL < ACTIVE_CHANNELS) - { - atomicAdd(histograms[CHANNEL] + items[ITEM][CHANNEL], 1); - } - } - } - - -/* - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - unsigned char items[ITEMS_PER_THREAD][CHANNELS]; - - int tile_offset = (CHANNEL * TILE_CHANNEL_ITEMS); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = d_in[block_offset + tile_offset + (ITEM * BLOCK_THREADS) + threadIdx.x]; - } - - __threadfence_block(); - - // Update histogram - - if ((ACTIVE_CHANNELS == CHANNELS) || (my_channel < ACTIVE_CHANNELS)) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - atomicAdd(histograms[my_channel] + items[ITEM], 1); - } - } - } -*/ - } - } - }; - - - - - - -public: - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * \brief Consumes input tiles using an even-share policy - */ - template < - typename InputIteratorRA, - typename HistoCounter, - int ACTIVE_CHANNELS> - static __device__ __forceinline__ void ProcessTilesEvenShare( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT block_offset, - const SizeT &block_oob, - HistoCounter (&histograms)[ACTIVE_CHANNELS][256]) - { - // Initialize histograms - #pragma unroll - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - BlockHisto256T::InitHistogram(histograms[CHANNEL]); - } - - __syncthreads(); - - // Consume full tiles - while (block_offset + TILE_ITEMS <= block_oob) - { - 
TilesHisto256Internal::ConsumeTile(smem_storage, d_in, block_offset, histograms); - - block_offset += TILE_ITEMS; - - // Skip synchro for atomic version since we know it doesn't use any smem - if (BLOCK_ALGORITHM != BLOCK_BYTE_HISTO_ATOMIC) - { - __syncthreads(); - } - } - - // Consume any remaining partial-tile - if (block_offset < block_oob) - { - TilesHisto256Internal::ConsumeTile(smem_storage, d_in, block_offset, histograms, block_oob - block_offset); - } - } - - - /** - * \brief Consumes input tiles using a dynamic queue policy - */ - template < - typename InputIteratorRA, - typename HistoCounter, - int ACTIVE_CHANNELS> - static __device__ __forceinline__ void ProcessTilesDynamic( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT num_items, - GridQueue &queue, - HistoCounter (&histograms)[ACTIVE_CHANNELS][256]) - { - - // Initialize histograms - #pragma unroll - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - BlockHisto256T::InitHistogram(histograms[CHANNEL]); - } - - // Dynamically consume tiles - while (true) - { - // Dequeue up to TILE_ITEMS - if (threadIdx.x == 0) - { - smem_storage.block_offset = queue.Drain(TILE_ITEMS); - } - - __syncthreads(); - - SizeT block_offset = smem_storage.block_offset; - - __syncthreads(); - - if (block_offset + TILE_ITEMS > num_items) - { - if (block_offset < num_items) - { - // We have less than a full tile to consume - TilesHisto256Internal::ConsumeTile(smem_storage, d_in, block_offset, histograms, num_items - block_offset); - } - - // No more work to do - break; - } - - // We have a full tile to consume - TilesHisto256Internal::ConsumeTile(smem_storage, d_in, block_offset, histograms); - } - } - - - /** - * Specialized for GRID_MAPPING_EVEN_SHARE - */ - template - struct Mapping - { - template < - typename InputIteratorRA, - typename HistoCounter, - int ACTIVE_CHANNELS> - static __device__ __forceinline__ void ProcessTiles( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT num_items, 
- GridEvenShare &even_share, - GridQueue &queue, - HistoCounter (&histograms)[ACTIVE_CHANNELS][256]) - { - even_share.BlockInit(); - return ProcessTilesEvenShare(smem_storage, d_in, even_share.block_offset, even_share.block_oob, histograms); - } - - }; - - - /** - * Specialized for GRID_MAPPING_DYNAMIC - */ - template - struct Mapping - { - template < - typename InputIteratorRA, - typename HistoCounter, - int ACTIVE_CHANNELS> - static __device__ __forceinline__ void ProcessTiles( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT num_items, - GridEvenShare &even_share, - GridQueue &queue, - HistoCounter (&histograms)[ACTIVE_CHANNELS][256]) - { - ProcessTilesDynamic(smem_storage, d_in, num_items, queue, histograms); - } - - }; - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/cub/device/tiles/tiles_reduce.cuh b/cub/device/tiles/tiles_reduce.cuh deleted file mode 100644 index 54b33d1ae9..0000000000 --- a/cub/device/tiles/tiles_reduce.cuh +++ /dev/null @@ -1,457 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::TilesReduce implements an abstraction of CUDA thread blocks for - * reducing multiple tiles as part of device-wide reduction. 
- - */ - -#pragma once - -#include - -#include "../../grid/grid_mapping.cuh" -#include "../../grid/grid_even_share.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../block/block_load.cuh" -#include "../../block/block_reduce.cuh" -#include "../../util_vector.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * Tuning policy for TilesReduce - */ -template < - int _BLOCK_THREADS, - int _ITEMS_PER_THREAD, - int _VECTOR_LOAD_LENGTH, - BlockReduceAlgorithm _BLOCK_ALGORITHM, - PtxLoadModifier _LOAD_MODIFIER, - GridMappingStrategy _GRID_MAPPING> -struct TilesReducePolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, - }; - - static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; - static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; - static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; -}; - - -/** - * \brief TilesReduce implements an abstraction of CUDA thread blocks for - * participating in device-wide reduction. 
- */ -template < - typename TilesReducePolicy, - typename InputIteratorRA, - typename SizeT> -class TilesReduce -{ -private: - - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Data type of input iterator - typedef typename std::iterator_traits::value_type T; - - // Constants - enum - { - // Number of items to be be processed to completion before the thread block terminates or obtains more work - TILE_ITEMS = TilesReducePolicy::BLOCK_THREADS * TilesReducePolicy::ITEMS_PER_THREAD, - }; - - // Parameterized BlockReduce primitive - typedef BlockReduce BlockReduceT; - - // Shared memory type for this threadblock - struct _SmemStorage - { - SizeT block_offset; // Location where to dequeue input for dynamic operation - typename BlockReduceT::SmemStorage reduce; // Smem needed for cooperative reduction - }; - -public: - - /// \smemstorage{TilesReduce} - typedef _SmemStorage SmemStorage; - -private: - - //--------------------------------------------------------------------- - // Utility operations - //--------------------------------------------------------------------- - - /** - * Process a single, full tile. Specialized for native pointers - * - * Each thread reduces only the values it loads. If \p FIRST_TILE, - * this partial reduction is stored into \p thread_aggregate. Otherwise - * it is accumulated into \p thread_aggregate. 
- * - * Performs a block-wide barrier synchronization - */ - template < - bool VECTORIZE_INPUT, - bool FIRST_TILE, - typename ReductionOp> - static __device__ __forceinline__ void ConsumeFullTile( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT block_offset, - ReductionOp &reduction_op, - T &thread_aggregate) - { - T items[TilesReducePolicy::ITEMS_PER_THREAD]; - - if (VECTORIZE_INPUT) - { - typedef VectorHelper VecHelper; - typedef typename VecHelper::Type VectorT; - - // Alias items as an array of VectorT and load it in striped fashion - BlockLoadDirectStriped( - reinterpret_cast(d_in + block_offset), - reinterpret_cast(items)); - } - else - { - // Load items in striped fashion - BlockLoadDirectStriped( - d_in + block_offset, - items); - } - - // Prevent hoisting - __threadfence_block(); - - // Reduce items within each thread - T partial = ThreadReduce(items, reduction_op); - - // Update|assign the thread's running aggregate - thread_aggregate = (FIRST_TILE) ? - partial : - reduction_op(thread_aggregate, partial); - } - - - /** - * Process a single, partial tile. - * - * Each thread reduces only the values it loads. If \p FIRST_TILE, - * this partial reduction is stored into \p thread_aggregate. Otherwise - * it is accumulated into \p thread_aggregate. 
- */ - template < - bool FIRST_TILE, - typename ReductionOp> - static __device__ __forceinline__ void ConsumePartialTile( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT block_offset, - const SizeT &block_oob, - ReductionOp &reduction_op, - T &thread_aggregate) - { - SizeT thread_offset = block_offset + threadIdx.x; - - if ((FIRST_TILE) && (thread_offset < block_oob)) - { - thread_aggregate = ThreadLoad(d_in + thread_offset); - thread_offset += TilesReducePolicy::BLOCK_THREADS; - } - - while (thread_offset < block_oob) - { - T item = ThreadLoad(d_in + thread_offset); - thread_aggregate = reduction_op(thread_aggregate, item); - thread_offset += TilesReducePolicy::BLOCK_THREADS; - } - } - - - /** - * \brief Consumes input tiles using an even-share policy, computing a threadblock-wide reduction for thread0 using the specified binary reduction functor. - * - * The return value is undefined in threads other than thread0. - */ - template < - bool VECTORIZE_INPUT, - typename ReductionOp> - static __device__ __forceinline__ T ProcessTilesEvenShare( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT block_offset, - const SizeT &block_oob, - ReductionOp &reduction_op) - { - if (block_offset + TILE_ITEMS <= block_oob) - { - // We have at least one full tile to consume - T thread_aggregate; - ConsumeFullTile(smem_storage, d_in, block_offset, reduction_op, thread_aggregate); - block_offset += TILE_ITEMS; - - // Consume any other full tiles - while (block_offset + TILE_ITEMS <= block_oob) - { - ConsumeFullTile(smem_storage, d_in, block_offset, reduction_op, thread_aggregate); - block_offset += TILE_ITEMS; - } - - // Consume any remaining input - ConsumePartialTile(smem_storage, d_in, block_offset, block_oob, reduction_op, thread_aggregate); - - // Compute the block-wide reduction (every thread has a valid input) - return BlockReduceT::Reduce(smem_storage.reduce, thread_aggregate, reduction_op); - } - else - { - // We have less than a full tile to consume - 
T thread_aggregate; - ConsumePartialTile(smem_storage, d_in, block_offset, block_oob, reduction_op, thread_aggregate); - - // Compute the block-wide reduction (up to block_items threads have valid inputs) - SizeT block_items = block_oob - block_offset; - return BlockReduceT::Reduce(smem_storage.reduce, thread_aggregate, reduction_op, block_items); - } - } - - - /** - * \brief Consumes input tiles using a dynamic queue policy, computing a threadblock-wide reduction for thread0 using the specified binary reduction functor. - * - * The return value is undefined in threads other than thread0. - */ - template < - bool VECTORIZE_INPUT, - typename ReductionOp> - static __device__ __forceinline__ T ProcessTilesDynamic( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT num_items, - GridQueue &queue, - ReductionOp &reduction_op) - { - // Each thread block is statically assigned at some input, otherwise its - // block_aggregate will be undefined. - SizeT block_offset = blockIdx.x * TILE_ITEMS; - - if (block_offset + TILE_ITEMS <= num_items) - { - // We have a full tile to consume - T thread_aggregate; - ConsumeFullTile(smem_storage, d_in, block_offset, reduction_op, thread_aggregate); - - // Dynamically consume other tiles - SizeT even_share_base = gridDim.x * TILE_ITEMS; - - if (even_share_base < num_items) - { - // There are tiles left to consume - while (true) - { - // Dequeue up to TILE_ITEMS - if (threadIdx.x == 0) - { - smem_storage.block_offset = queue.Drain(TILE_ITEMS) + even_share_base; - } - - __syncthreads(); - - block_offset = smem_storage.block_offset; - - __syncthreads(); - - if (block_offset + TILE_ITEMS > num_items) - { - if (block_offset < num_items) - { - // We have less than a full tile to consume - ConsumePartialTile(smem_storage, d_in, block_offset, num_items, reduction_op, thread_aggregate); - } - - // No more work to do - break; - } - - // We have a full tile to consume - ConsumeFullTile(smem_storage, d_in, block_offset, reduction_op, 
thread_aggregate); - } - } - - // Compute the block-wide reduction (every thread has a valid input) - return BlockReduceT::Reduce(smem_storage.reduce, thread_aggregate, reduction_op); - } - else - { - // We have less than a full tile to consume - T thread_aggregate; - SizeT block_items = num_items - block_offset; - ConsumePartialTile(smem_storage, d_in, block_offset, num_items, reduction_op, thread_aggregate); - - // Compute the block-wide reduction (up to block_items threads have valid inputs) - return BlockReduceT::Reduce(smem_storage.reduce, thread_aggregate, reduction_op, block_items); - } - } - - -public: - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * \brief Consumes input tiles using an even-share policy, computing a threadblock-wide reduction for thread0 using the specified binary reduction functor. - * - * The return value is undefined in threads other than thread0. - */ - template - static __device__ __forceinline__ T ProcessTilesEvenShare( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT block_offset, - const SizeT &block_oob, - ReductionOp &reduction_op) - { - typedef VectorHelper VecHelper; - typedef typename VecHelper::Type VectorT; - - if ((IsPointer::VALUE) && - (TilesReducePolicy::VECTOR_LOAD_LENGTH > 1) && - (VecHelper::BUILT_IN) && - ((size_t(d_in) & (sizeof(VectorT) - 1)) == 0)) - { - return ProcessTilesEvenShare(smem_storage, d_in, block_offset, block_oob, reduction_op); - } - else - { - return ProcessTilesEvenShare(smem_storage, d_in, block_offset, block_oob, reduction_op); - } - } - - - /** - * \brief Consumes input tiles using a dynamic queue policy, computing a threadblock-wide reduction for thread0 using the specified binary reduction functor. - * - * The return value is undefined in threads other than thread0. 
- */ - template - static __device__ __forceinline__ T ProcessTilesDynamic( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT num_items, - GridQueue &queue, - ReductionOp &reduction_op) - { - typedef VectorHelper VecHelper; - typedef typename VecHelper::Type VectorT; - - if ((IsPointer::VALUE) && - (TilesReducePolicy::VECTOR_LOAD_LENGTH > 1) && - (VecHelper::BUILT_IN) && - ((size_t(d_in) & (sizeof(VectorT) - 1)) == 0)) - { - return ProcessTilesDynamic(smem_storage, d_in, num_items, queue, reduction_op); - } - else - { - return ProcessTilesDynamic(smem_storage, d_in, num_items, queue, reduction_op); - } - } - - - /** - * Specialized for GRID_MAPPING_EVEN_SHARE - */ - template - struct Mapping - { - template - static __device__ __forceinline__ T ProcessTiles( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT num_items, - GridEvenShare &even_share, - GridQueue &queue, - ReductionOp &reduction_op) - { - // Even share - even_share.BlockInit(); - - return ProcessTilesEvenShare(smem_storage, d_in, even_share.block_offset, even_share.block_oob, reduction_op); - } - - }; - - - /** - * Specialized for GRID_MAPPING_DYNAMIC - */ - template - struct Mapping - { - template - static __device__ __forceinline__ T ProcessTiles( - SmemStorage &smem_storage, - InputIteratorRA d_in, - SizeT num_items, - GridEvenShare &even_share, - GridQueue &queue, - ReductionOp &reduction_op) - { - // Dynamically dequeue - return ProcessTilesDynamic(smem_storage, d_in, num_items, queue, reduction_op); - } - - }; - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/cub/grid/grid_even_share.cuh b/cub/grid/grid_even_share.cuh index d3013285a8..136f4bed83 100644 --- a/cub/grid/grid_even_share.cuh +++ b/cub/grid/grid_even_share.cuh @@ -26,15 +26,11 @@ * ******************************************************************************/ -/****************************************************************************** - * Threadblock Work management. 
- * - * A given threadblock may receive one of three different amounts of - * work: "big", "normal", and "last". The big workloads are one - * grain greater than the normal, and the last workload - * does the extra work. - * - ******************************************************************************/ +/** + * \file + * cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). + */ + #pragma once @@ -56,7 +52,7 @@ namespace cub { /** - * \brief A descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). + * \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). * * \par Overview * GridEvenShare indicates which sections of input are to be mapped onto which threadblocks. @@ -87,17 +83,36 @@ private: public: /// Total number of input items - SizeT num_items; + SizeT num_items; /// Grid size in threadblocks - int grid_size; + int grid_size; /// Offset into input marking the beginning of the owning thread block's segment of input tiles - SizeT block_offset; + SizeT block_offset; /// Offset into input of marking the end (one-past) of the owning thread block's segment of input tiles SizeT block_oob; + /** + * \brief Block-based constructor for single-block grids. + */ + __device__ __forceinline__ GridEvenShare(SizeT num_items) : + num_items(num_items), + grid_size(1), + block_offset(0), + block_oob(num_items) {} + + + /** + * \brief Default constructor. Zero-initializes block-specific fields. 
+ */ + __host__ __device__ __forceinline__ GridEvenShare() : + num_items(0), + grid_size(0), + block_offset(0), + block_oob(0) {} + /** * \brief Initializes the grid-specific members \p num_items and \p grid_size. To be called prior prior to kernel launch) diff --git a/cub/grid/grid_mapping.cuh b/cub/grid/grid_mapping.cuh index ea4c6def89..a3e0b6d8b7 100644 --- a/cub/grid/grid_mapping.cuh +++ b/cub/grid/grid_mapping.cuh @@ -28,13 +28,13 @@ /** * \file - * cub::GridMappingStrategy enumerates alternative strategies for mapping - * constant-sized tiles of device-wide data onto a grid of CUDA thread - * blocks. + * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. */ #pragma once +#include "grid_even_share.cuh" +#include "grid_queue.cuh" #include "../util_namespace.cuh" /// Optional outer namespace(s) @@ -50,15 +50,18 @@ namespace cub { */ +/****************************************************************************** + * Mapping policies + *****************************************************************************/ + + /** - * GridMappingStrategy enumerates alternative strategies for mapping - * constant-sized tiles of device-wide data onto a grid of CUDA thread - * blocks. + * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. */ enum GridMappingStrategy { /** - * \brief An "even-share" strategy. + * \brief An "even-share" strategy for assigning input tiles to thread blocks. * * \par Overview * The input is evenly partitioned into \p p segments, where \p p is @@ -73,7 +76,7 @@ enum GridMappingStrategy GRID_MAPPING_EVEN_SHARE, /** - * \brief A dynamic "queue-based" strategy for commutative reduction operators. + * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. 
* * \par Overview * The input is treated as a queue to be dynamically consumed by a grid of @@ -87,6 +90,453 @@ enum GridMappingStrategy }; + +/****************************************************************************** + * Mapping engines + *****************************************************************************/ + +/** + * \brief Dispatches tiles of work from the given input range to the specified thread block abstraction. + * + * \par + * Expects the \p PersistentBlock type to have the following callback member functions: + * - Tile processing: + * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items); + * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile: + * - int TileItems() + * - Finalization: + * - void Finalize(Result &result); + * + * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing + * \tparam SizeT [inferred] Integral type used for global array indexing + * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance + */ + +template < + typename PersistentBlock, + typename SizeT, + typename Result> +__device__ __forceinline__ void ConsumeTiles( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT block_offset, ///< [in] Threadblock begin offset (inclusive) + SizeT block_oob, ///< [in] Threadblock end offset (exclusive) + Result &result) ///< [out] Result returned by tiles::Finalize() +{ + bool sync_after = true; + + // Number of items per tile that can be processed by tiles + int tile_items = persistent_block.TileItems(); + + // Consume any full tiles + while (block_offset + tile_items <= block_oob) + { + persistent_block.ConsumeTile(sync_after, block_offset, tile_items); + if (sync_after) __syncthreads(); + + block_offset += tile_items; + } + + // Consume any remaining input + if (block_offset < block_oob) + { + persistent_block.ConsumeTile(sync_after, block_offset, 
block_oob - block_offset); + if (sync_after) __syncthreads(); + } + + // Compute the block-wide reduction (every thread has a valid input) + persistent_block.Finalize(result); +} + + +/** + * \brief Uses a GridEvenShare descriptor to dispatch tiles of work to the specified thread block abstraction. (See GridMappingStrategy::GRID_MAPPING_EVEN_SHARE.) + * + * \par + * Expects the \p PersistentBlock type to have the following callback member functions: + * - Tile processing: + * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items); + * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile: + * - int TileItems() + * - Finalization: + * - void Finalize(Result &result); + * + * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing + * \tparam SizeT [inferred] Integral type used for global array indexing + * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance + */ + +template < + typename PersistentBlock, + typename SizeT, + typename Result> +__device__ __forceinline__ void ConsumeTiles( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT num_items, ///< [in] Total number of global input items + GridEvenShare &even_share, ///< [in] GridEvenShare descriptor + Result &result) ///< [out] Result returned by tiles::Finalize() +{ + even_share.BlockInit(); + ConsumeTiles(persistent_block, even_share.block_offset, even_share.block_oob, result); +} + + + +/** + * \brief Dispatches tiles of work from the given input range to the specified thread block abstraction. The first tile given to each thread block is flagged as such. 
+ * + * \par + * Expects the \p PersistentBlock type to have the following callback member functions: + * - Tile processing: + * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items, is_first_tile); + * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile: + * - int TileItems() + * - Finalization: + * - void Finalize(Result &result); + * + * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing + * \tparam SizeT [inferred] Integral type used for global array indexing + * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance + */ +template < + typename PersistentBlock, + typename SizeT, + typename Result> +__device__ __forceinline__ void ConsumeTilesFlagFirst( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT block_offset, ///< [in] Threadblock begin offset (inclusive) + SizeT block_oob, ///< [in] Threadblock end offset (exclusive) + Result &result) ///< [out] Result returned by tiles::Finalize() +{ + bool sync_after = true; + + // Number of items per tile that can be processed by tiles + int tile_items = persistent_block.TileItems(); + + if (block_offset + tile_items <= block_oob) + { + // We have at least one full tile to consume + persistent_block.ConsumeTile(sync_after, block_offset, tile_items, true); + if (sync_after) __syncthreads(); + + block_offset += tile_items; + + // Consume any other full tiles + while (block_offset + tile_items <= block_oob) + { + persistent_block.ConsumeTile(sync_after, block_offset, tile_items, false); + if (sync_after) __syncthreads(); + + block_offset += tile_items; + } + + // Consume any remaining input + if (block_offset < block_oob) + { + persistent_block.ConsumeTile(sync_after, block_offset, block_oob - block_offset, false); + if (sync_after) __syncthreads(); + } + } + else + { + // We have less than a full tile to consume + SizeT block_items = block_oob 
- block_offset; + + persistent_block.ConsumeTile(sync_after, block_offset, block_items, true); + if (sync_after) __syncthreads(); + } + + // Compute the block-wide reduction (every thread has a valid input) + persistent_block.Finalize(result); +} + + +/** + * \brief Uses a GridEvenShare descriptor to dispatch tiles of work to the specified thread block abstraction. The first tile given to each thread block is flagged as such. (See GridMappingStrategy::GRID_MAPPING_EVEN_SHARE.) + * + * \par + * Expects the \p PersistentBlock type to have the following callback member functions: + * - Tile processing: + * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items, is_first_tile); + * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile: + * - int TileItems() + * - Finalization: + * - void Finalize(Result &result); + * + * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing + * \tparam SizeT [inferred] Integral type used for global array indexing + * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance + */ +template < + typename PersistentBlock, + typename SizeT, + typename Result> +__device__ __forceinline__ void ConsumeTilesFlagFirst( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT num_items, ///< [in] Total number of global input items + GridEvenShare &even_share, ///< [in] GridEvenShare descriptor + Result &result) ///< [out] Result returned by tiles::Finalize() +{ + even_share.BlockInit(); + ConsumeTilesFlagFirst(persistent_block, even_share.block_offset, even_share.block_oob, result); +} + + + +/** + * \brief Uses a GridQueue descriptor to dispatch tiles of work to the specified thread block abstraction. (See GridMappingStrategy::GRID_MAPPING_DYNAMIC.) 
+ * + * \par + * Expects the \p PersistentBlock type to have the following callback member functions: + * - Tile processing: + * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items); + * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile: + * - int TileItems() + * - Finalization: + * - void Finalize(Result &result); + * + * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing + * \tparam SizeT [inferred] Integral type used for global array indexing + * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance + */ +template < + typename PersistentBlock, + typename SizeT, + typename Result> +__device__ __forceinline__ void ConsumeTiles( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT num_items, ///< [in] Total number of global input items + GridQueue &queue, ///< [in,out] GridQueue descriptor + Result &result) ///< [out] Result returned by tiles::Finalize() +{ + // Shared tile-processing offset obtained dynamically from queue + __shared__ SizeT dynamic_block_offset; + + bool sync_after = true; + + // Number of items per tile that can be processed by tiles + int tile_items = persistent_block.TileItems(); + + // There are tiles left to consume + while (true) + { + // Dequeue up to tile_items + if (threadIdx.x == 0) + { + dynamic_block_offset = queue.Drain(tile_items); + } + + __syncthreads(); + + SizeT block_offset = dynamic_block_offset; + + __syncthreads(); + + if (block_offset + tile_items > num_items) + { + if (block_offset < num_items) + { + // We have less than a full tile to consume + persistent_block.ConsumeTile(sync_after, block_offset, num_items - block_offset); + if (sync_after) __syncthreads(); + } + + // No more work to do + break; + } + + // We have a full tile to consume + persistent_block.ConsumeTile(sync_after, block_offset, tile_items); + } + + 
persistent_block.Finalize(result); +} + + + +/** + * \brief Uses a GridQueue descriptor to dispatch tiles of work to the specified thread block abstraction. The first tile given to each thread block is flagged as such. (See GridMappingStrategy::GRID_MAPPING_DYNAMIC.) + * + * \par + * Expects the \p PersistentBlock type to have the following callback member functions: + * - Tile processing: + * - void ConsumeTile(bool sync_after, SizeT block_offset, SizeT valid_tile_items, is_first_tile); + * - Getting the maximum number of items processed per call to PersistentBlock::ConsumeTile: + * - int TileItems() + * - Finalization: + * - void Finalize(Result &result); + * + * \tparam PersistentBlock [inferred] Thread block abstraction type for tile processing + * \tparam SizeT [inferred] Integral type used for global array indexing + * \tparam Result [inferred] Result type to be returned by the PersistentBlock instance + */ +template < + typename PersistentBlock, + typename SizeT, + typename Result> +__device__ __forceinline__ void ConsumeTilesFlagFirst( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT num_items, ///< [in] Total number of global input items + GridQueue &queue, ///< [in,out] GridQueue descriptor + Result &result) ///< [out] Result returned by tiles::Finalize() +{ + // Shared tile-processing offset obtained dynamically from queue + __shared__ SizeT dynamic_block_offset; + + bool sync_after = true; + + // Number of items per tile that can be processed by tiles + int tile_items = persistent_block.TileItems(); + + // We give each thread block at least one tile of input. 
+ SizeT block_offset = blockIdx.x * tile_items; + + // Check if we have a full tile to consume + if (block_offset + tile_items <= num_items) + { + persistent_block.ConsumeTile(sync_after, block_offset, tile_items, true); + if (sync_after) __syncthreads(); + + // Now that every block in the kernel has gotten a tile, attempt to dynamically consume any remaining + SizeT even_share_base = gridDim.x * tile_items; + if (even_share_base < num_items) + { + // There are tiles left to consume + while (true) + { + // Dequeue up to tile_items + if (threadIdx.x == 0) + { + dynamic_block_offset = queue.Drain(tile_items) + even_share_base; + } + + __syncthreads(); + + block_offset = dynamic_block_offset; + + __syncthreads(); + + if (block_offset + tile_items > num_items) + { + if (block_offset < num_items) + { + // We have less than a full tile to consume + persistent_block.ConsumeTile(sync_after, block_offset, num_items - block_offset, false); + if (sync_after) __syncthreads(); + } + + // No more work to do + break; + } + + // We have a full tile to consume + persistent_block.ConsumeTile(sync_after, block_offset, tile_items, false); + } + } + } + else + { + // We have less than a full tile to consume + persistent_block.ConsumeTile(sync_after, block_offset, num_items - block_offset, true); + if (sync_after) __syncthreads(); + } + + // Compute the block-wide reduction (every thread has a valid input) + persistent_block.Finalize(result); + +} + + + +/****************************************************************************** + * Type-directed dispatch to mapping engines + *****************************************************************************/ + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * \brief Dispatch helper for statically selecting between mapping strategies (e.g., to avoid compiling an alternative that is invaild for a given architecture) + */ +template +struct GridMapping; + +/** + * Even-share specialization of GridMapping + */ +template<> 
+struct GridMapping +{ + template < + typename PersistentBlock, + typename SizeT, + typename Result> + static __device__ __forceinline__ void ConsumeTiles( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT num_items, ///< [in] Total number of global input items + GridEvenShare &even_share, ///< [in] GridEvenShare descriptor + GridQueue &queue, ///< [in,out] GridQueue descriptor + Result &result) ///< [out] Result returned by tiles::Finalize() + { + cub::ConsumeTiles(persistent_block, num_items, even_share, result); + } + + template < + typename PersistentBlock, + typename SizeT, + typename Result> + static __device__ __forceinline__ void ConsumeTilesFlagFirst( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT num_items, ///< [in] Total number of global input items + GridEvenShare &even_share, ///< [in] GridEvenShare descriptor + GridQueue &queue, ///< [in,out] GridQueue descriptor + Result &result) ///< [out] Result returned by tiles::Finalize() + { + cub::ConsumeTilesFlagFirst(persistent_block, num_items, even_share, result); + } +}; + + +/** + * Even-share specialization of GridMapping + */ +template<> +struct GridMapping +{ + template < + typename PersistentBlock, + typename SizeT, + typename Result> + static __device__ __forceinline__ void ConsumeTiles( + PersistentBlock &persistent_block, ///< [in,out] Threadblock abstraction for tile processing + SizeT num_items, ///< [in] Total number of global input items + GridEvenShare &even_share, ///< [in] GridEvenShare descriptor + GridQueue &queue, ///< [in,out] GridQueue descriptor + Result &result) ///< [out] Result returned by tiles::Finalize() + { + cub::ConsumeTiles(persistent_block, num_items, queue, result); + } + + template < + typename PersistentBlock, + typename SizeT, + typename Result> + static __device__ __forceinline__ void ConsumeTilesFlagFirst( + PersistentBlock &persistent_block, ///< [in,out] 
Threadblock abstraction for tile processing + SizeT num_items, ///< [in] Total number of global input items + GridEvenShare &even_share, ///< [in] GridEvenShare descriptor + GridQueue &queue, ///< [in,out] GridQueue descriptor + Result &result) ///< [out] Result returned by tiles::Finalize() + { + cub::ConsumeTilesFlagFirst(persistent_block, num_items, queue, result); + } +}; + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + /** @} */ // end group GridModule } // CUB namespace diff --git a/cub/grid/grid_queue.cuh b/cub/grid/grid_queue.cuh index 3ca31b5a1a..daec348ea5 100644 --- a/cub/grid/grid_queue.cuh +++ b/cub/grid/grid_queue.cuh @@ -26,9 +26,10 @@ * ******************************************************************************/ -/****************************************************************************** - * Abstraction for grid-wide queue management - ******************************************************************************/ +/** + * \file + * cub::GridQueue is a descriptor utility for dynamic queue management. + */ #pragma once @@ -52,7 +53,7 @@ namespace cub { /** - * \brief Abstraction for grid-wide queue management. + * \brief GridQueue is a descriptor utility for dynamic queue management. * * \par Overview * GridQueue descriptors provides abstractions for "filling" or @@ -77,7 +78,8 @@ namespace cub { * \par * Iterative work management can be implemented simply with a pair of flip-flopping * work buffers, each with an associated set of fill and drain GridQueue descriptors. 
- + * + * \tparam SizeT Integer type for array indexing */ template class GridQueue diff --git a/cub/thread/thread_load.cuh b/cub/thread/thread_load.cuh index e36fbe9341..a07c758c27 100644 --- a/cub/thread/thread_load.cuh +++ b/cub/thread/thread_load.cuh @@ -161,6 +161,26 @@ struct ThreadLoadDispatch }; + +/** + * Generic PTX_LOAD_CG specialization for SM10-SM13 architectures + */ +#if CUB_PTX_ARCH < 200 +template <> +struct ThreadLoadDispatch +{ + // Iterator + template + static __device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorRA itr) + { + // Straightforward dereference + return *itr; + } +}; +#endif // CUB_PTX_ARCH < 200 + + + #endif // DOXYGEN_SHOULD_SKIP_THIS /** @@ -456,6 +476,7 @@ __device__ __forceinline__ typename std::iterator_traits::value /** * Expand ThreadLoad() implementations for primitive types. */ +#if CUB_PTX_ARCH >= 200 // Signed CUB_LOADS_0124(char, char, short, s8, h) @@ -491,6 +512,7 @@ CUB_LOADS_4L(double4, double2); CUB_LOADS_0124(unsigned long, ulong, unsigned long, u32, r) #endif +#endif // CUB_PTX_ARCH >= 200 /** * Undefine macros diff --git a/cub/thread/thread_store.cuh b/cub/thread/thread_store.cuh index 0e524893ca..83de8d789c 100644 --- a/cub/thread/thread_store.cuh +++ b/cub/thread/thread_store.cuh @@ -214,54 +214,74 @@ __device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, const T& val) /** * Define a global ThreadStore() specialization for type */ -#define CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \ - template<> \ - void ThreadStore(type* ptr, const type& val) \ - { \ - const asm_type raw = reinterpret_cast(val); \ - asm volatile ("st.global."#ptx_modifier"."#ptx_type" [%0], %1;" : : \ - _CUB_ASM_PTR_(ptr), \ - #reg_mod(raw)); \ +#define CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \ + template<> \ + void ThreadStore(type* ptr, const type& val) \ + { \ + const asm_type raw = reinterpret_cast(val); \ + asm 
volatile ("st.global."#ptx_modifier"."#ptx_type" [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + #reg_mod(raw)); \ } /** * Define a global ThreadStore() specialization for the vector-1 type */ #define CUB_G_STORE_1(type, component_type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \ - template<> \ - void ThreadStore(type* ptr, const type& val) \ - { \ - const asm_type raw_x = reinterpret_cast(val.x); \ - asm volatile ("st.global."#ptx_modifier"."#ptx_type" [%0], %1;" : : \ - _CUB_ASM_PTR_(ptr), \ - #reg_mod(raw_x)); \ + template<> \ + void ThreadStore(type* ptr, const type& val) \ + { \ + const asm_type raw_x = reinterpret_cast(val.x); \ + asm volatile ("st.global."#ptx_modifier"."#ptx_type" [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + #reg_mod(raw_x)); \ } /** - * Define a volatile-shared ThreadStore() specialization for the vector-1 type + * Define a global ThreadStore() specialization for the vector-2 type */ -#define CUB_VS_STORE_1(type, component_type, asm_type, ptx_type, reg_mod) \ - template<> \ - void ThreadStore(type* ptr, const type& val) \ - { \ - ThreadStore( \ - (asm_type*) ptr, \ - reinterpret_cast(val.x)); \ +#define CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \ + template<> \ + void ThreadStore(type* ptr, const type& val) \ + { \ + const asm_type raw_x = reinterpret_cast(val.x); \ + const asm_type raw_y = reinterpret_cast(val.y); \ + asm volatile ("st.global."#ptx_modifier".v2."#ptx_type" [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + #reg_mod(raw_x), \ + #reg_mod(raw_y)); \ } /** - * Define a global ThreadStore() specialization for the vector-2 type + * Define a global ThreadStore() specialization for the vector-4 type */ -#define CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \ - template<> \ - void ThreadStore(type* ptr, const type& val) \ - { \ - const asm_type raw_x = reinterpret_cast(val.x); \ - const asm_type raw_y = reinterpret_cast(val.y); 
\ - asm volatile ("st.global."#ptx_modifier".v2."#ptx_type" [%0], {%1, %2};" : : \ - _CUB_ASM_PTR_(ptr), \ - #reg_mod(raw_x), \ - #reg_mod(raw_y)); \ +#define CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \ + template<> \ + void ThreadStore(type* ptr, const type& val) \ + { \ + const asm_type raw_x = reinterpret_cast(val.x); \ + const asm_type raw_y = reinterpret_cast(val.y); \ + const asm_type raw_z = reinterpret_cast(val.z); \ + const asm_type raw_w = reinterpret_cast(val.w); \ + asm volatile ("st.global."#ptx_modifier".v4."#ptx_type" [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + #reg_mod(raw_x), \ + #reg_mod(raw_y), \ + #reg_mod(raw_z), \ + #reg_mod(raw_w)); \ + } + + +/** + * Define a volatile-shared ThreadStore() specialization for the vector-1 type + */ +#define CUB_VS_STORE_1(type, component_type, asm_type, ptx_type, reg_mod) \ + template<> \ + void ThreadStore(type* ptr, const type& val) \ + { \ + ThreadStore( \ + (asm_type*) ptr, \ + reinterpret_cast(val.x)); \ } /** @@ -269,48 +289,29 @@ __device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, const T& val) * Performs separate references if the component_type is only 1 byte (otherwise we lose * performance due to the bitfield ops to disassemble the value) */ -#define CUB_VS_STORE_2(type, component_type, asm_type, ptx_type, reg_mod) \ - template<> \ - void ThreadStore(type* ptr, const type& val) \ - { \ - if ((sizeof(component_type) == 1) || (CUDA_VERSION < 4100)) \ - { \ - component_type *base_ptr = (component_type*) ptr; \ +#define CUB_VS_STORE_2(type, component_type, asm_type, ptx_type, reg_mod) \ + template<> \ + void ThreadStore(type* ptr, const type& val) \ + { \ + if ((sizeof(component_type) == 1) || (CUDA_VERSION < 4100)) \ + { \ + component_type *base_ptr = (component_type*) ptr; \ ThreadStore(base_ptr, (component_type) val.x); \ ThreadStore(base_ptr + 1, (component_type) val.y); \ - } \ - else \ - { \ - const asm_type raw_x = 
reinterpret_cast(val.x); \ - const asm_type raw_y = reinterpret_cast(val.y); \ - asm volatile ("{" \ + } \ + else \ + { \ + const asm_type raw_x = reinterpret_cast(val.x); \ + const asm_type raw_y = reinterpret_cast(val.y); \ + asm volatile ("{" \ " .reg ."_CUB_ASM_PTR_SIZE_" t1;" \ - " cvta.to.shared."_CUB_ASM_PTR_SIZE_" t1, %0;" \ - " st.shared.volatile.v2."#ptx_type" [t1], {%1, %2};" \ - "}" : : \ - _CUB_ASM_PTR_(ptr), \ - #reg_mod(raw_x), \ - #reg_mod(raw_y)); \ - } \ - } - -/** - * Define a global ThreadStore() specialization for the vector-4 type - */ -#define CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, cub_modifier, ptx_modifier) \ - template<> \ - void ThreadStore(type* ptr, const type& val) \ - { \ - const asm_type raw_x = reinterpret_cast(val.x); \ - const asm_type raw_y = reinterpret_cast(val.y); \ - const asm_type raw_z = reinterpret_cast(val.z); \ - const asm_type raw_w = reinterpret_cast(val.w); \ - asm volatile ("st.global."#ptx_modifier".v4."#ptx_type" [%0], {%1, %2, %3, %4};" : : \ - _CUB_ASM_PTR_(ptr), \ - #reg_mod(raw_x), \ - #reg_mod(raw_y), \ - #reg_mod(raw_z), \ - #reg_mod(raw_w)); \ + " cvta.to.shared."_CUB_ASM_PTR_SIZE_" t1, %0;" \ + " st.shared.volatile.v2."#ptx_type" [t1], {%1, %2};" \ + "}" : : \ + _CUB_ASM_PTR_(ptr), \ + #reg_mod(raw_x), \ + #reg_mod(raw_y)); \ + } \ } /** @@ -318,105 +319,105 @@ __device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, const T& val) * Performs separate references if the component_type is only 1 byte (otherwise we lose * performance due to the bitfield ops to disassemble the value) */ -#define CUB_VS_STORE_4(type, component_type, asm_type, ptx_type, reg_mod) \ - template<> \ - void ThreadStore(type* ptr, const type& val) \ - { \ - if ((sizeof(component_type) == 1) || (CUDA_VERSION < 4100)) \ - { \ - component_type *base_ptr = (component_type*) ptr; \ +#define CUB_VS_STORE_4(type, component_type, asm_type, ptx_type, reg_mod) \ + template<> \ + void ThreadStore(type* ptr, const 
type& val) \ + { \ + if ((sizeof(component_type) == 1) || (CUDA_VERSION < 4100)) \ + { \ + component_type *base_ptr = (component_type*) ptr; \ ThreadStore(base_ptr, (component_type) val.x); \ ThreadStore(base_ptr + 1, (component_type) val.y); \ ThreadStore(base_ptr + 2, (component_type) val.z); \ ThreadStore(base_ptr + 3, (component_type) val.w); \ - } \ - else \ - { \ - const asm_type raw_x = reinterpret_cast(val.x); \ - const asm_type raw_y = reinterpret_cast(val.y); \ - const asm_type raw_z = reinterpret_cast(val.z); \ - const asm_type raw_w = reinterpret_cast(val.w); \ - asm volatile ("{" \ + } \ + else \ + { \ + const asm_type raw_x = reinterpret_cast(val.x); \ + const asm_type raw_y = reinterpret_cast(val.y); \ + const asm_type raw_z = reinterpret_cast(val.z); \ + const asm_type raw_w = reinterpret_cast(val.w); \ + asm volatile ("{" \ " .reg ."_CUB_ASM_PTR_SIZE_" t1;" \ - " cvta.to.shared."_CUB_ASM_PTR_SIZE_" t1, %0;" \ - " st.volatile.shared.v4."#ptx_type" [t1], {%1, %2, %3, %4};" \ - "}" : : \ - _CUB_ASM_PTR_(ptr), \ - #reg_mod(raw_x), \ - #reg_mod(raw_y), \ - #reg_mod(raw_z), \ - #reg_mod(raw_w)); \ - } \ + " cvta.to.shared."_CUB_ASM_PTR_SIZE_" t1, %0;" \ + " st.volatile.shared.v4."#ptx_type" [t1], {%1, %2, %3, %4};" \ + "}" : : \ + _CUB_ASM_PTR_(ptr), \ + #reg_mod(raw_x), \ + #reg_mod(raw_y), \ + #reg_mod(raw_z), \ + #reg_mod(raw_w)); \ + } \ } /** * Define a ThreadStore() specialization for the 64-bit vector-4 type. * Uses two vector-2 Stores. 
*/ -#define CUB_STORE_4L(type, half_type, cub_modifier) \ - template<> \ - void ThreadStore(type* ptr, const type& val) \ - { \ - const half_type* half_val = reinterpret_cast(&val); \ - half_type* half_ptr = reinterpret_cast(ptr); \ - ThreadStore(half_ptr, half_val[0]); \ - ThreadStore(half_ptr + 1, half_val[1]); \ +#define CUB_STORE_4L(type, half_type, cub_modifier) \ + template<> \ + void ThreadStore(type* ptr, const type& val) \ + { \ + const half_type* half_val = reinterpret_cast(&val); \ + half_type* half_ptr = reinterpret_cast(ptr); \ + ThreadStore(half_ptr, half_val[0]); \ + ThreadStore(half_ptr + 1, half_val[1]); \ } /** * Define ThreadStore() specializations for the (non-vector) type */ -#define CUB_STORES_0(type, asm_type, ptx_type, reg_mod) \ - CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, PTX_STORE_WB, wb) \ - CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, PTX_STORE_CG, cg) \ - CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, PTX_STORE_CS, cs) \ +#define CUB_STORES_0(type, asm_type, ptx_type, reg_mod) \ + CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, PTX_STORE_WB, wb) \ + CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, PTX_STORE_CG, cg) \ + CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, PTX_STORE_CS, cs) \ CUB_G_STORE_0(type, asm_type, ptx_type, reg_mod, PTX_STORE_WT, wt) /** * Define ThreadStore() specializations for the vector-1 component_type */ -#define CUB_STORES_1(type, component_type, asm_type, ptx_type, reg_mod) \ - CUB_VS_STORE_1(type, component_type, asm_type, ptx_type, reg_mod) \ - CUB_G_STORE_1(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_WB, wb) \ - CUB_G_STORE_1(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CG, cg) \ - CUB_G_STORE_1(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CS, cs) \ +#define CUB_STORES_1(type, component_type, asm_type, ptx_type, reg_mod) \ + CUB_VS_STORE_1(type, component_type, asm_type, ptx_type, reg_mod) \ + CUB_G_STORE_1(type, component_type, asm_type, 
ptx_type, reg_mod, PTX_STORE_WB, wb) \ + CUB_G_STORE_1(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CG, cg) \ + CUB_G_STORE_1(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CS, cs) \ CUB_G_STORE_1(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_WT, wt) /** * Define ThreadStore() specializations for the vector-2 component_type */ -#define CUB_STORES_2(type, component_type, asm_type, ptx_type, reg_mod) \ - CUB_VS_STORE_2(type, component_type, asm_type, ptx_type, reg_mod) \ - CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_WB, wb) \ - CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CG, cg) \ - CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CS, cs) \ +#define CUB_STORES_2(type, component_type, asm_type, ptx_type, reg_mod) \ + CUB_VS_STORE_2(type, component_type, asm_type, ptx_type, reg_mod) \ + CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_WB, wb) \ + CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CG, cg) \ + CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CS, cs) \ CUB_G_STORE_2(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_WT, wt) /** * Define ThreadStore() specializations for the vector-4 component_type */ -#define CUB_STORES_4(type, component_type, asm_type, ptx_type, reg_mod) \ - CUB_VS_STORE_4(type, component_type, asm_type, ptx_type, reg_mod) \ - CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_WB, wb) \ - CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CG, cg) \ - CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CS, cs) \ +#define CUB_STORES_4(type, component_type, asm_type, ptx_type, reg_mod) \ + CUB_VS_STORE_4(type, component_type, asm_type, ptx_type, reg_mod) \ + CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_WB, wb) \ + 
CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CG, cg) \ + CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_CS, cs) \ CUB_G_STORE_4(type, component_type, asm_type, ptx_type, reg_mod, PTX_STORE_WT, wt) /** * Define ThreadStore() specializations for the 256-bit vector-4 component_type */ -#define CUB_STORES_4L(type, half_type) \ - CUB_STORE_4L(type, half_type, PTX_STORE_VS) \ - CUB_STORE_4L(type, half_type, PTX_STORE_WB) \ - CUB_STORE_4L(type, half_type, PTX_STORE_CG) \ - CUB_STORE_4L(type, half_type, PTX_STORE_CS) \ +#define CUB_STORES_4L(type, half_type) \ + CUB_STORE_4L(type, half_type, PTX_STORE_VS) \ + CUB_STORE_4L(type, half_type, PTX_STORE_WB) \ + CUB_STORE_4L(type, half_type, PTX_STORE_CG) \ + CUB_STORE_4L(type, half_type, PTX_STORE_CS) \ CUB_STORE_4L(type, half_type, PTX_STORE_WT) /** * Define vector-0/1/2 ThreadStore() specializations for the component type */ -#define CUB_STORES_012(component_type, vec_prefix, asm_type, ptx_type, reg_mod) \ - CUB_STORES_0(component_type, asm_type, ptx_type, reg_mod) \ +#define CUB_STORES_012(component_type, vec_prefix, asm_type, ptx_type, reg_mod) \ + CUB_STORES_0(component_type, asm_type, ptx_type, reg_mod) \ CUB_STORES_1(vec_prefix##1, component_type, asm_type, ptx_type, reg_mod) \ CUB_STORES_2(vec_prefix##2, component_type, asm_type, ptx_type, reg_mod) @@ -424,12 +425,15 @@ __device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, const T& val) * Define vector-0/1/2/4 ThreadStore() specializations for the component type */ #define CUB_STORES_0124(component_type, vec_prefix, asm_type, ptx_type, reg_mod) \ - CUB_STORES_012(component_type, vec_prefix, asm_type, ptx_type, reg_mod) \ + CUB_STORES_012(component_type, vec_prefix, asm_type, ptx_type, reg_mod) \ CUB_STORES_4(vec_prefix##4, component_type, asm_type, ptx_type, reg_mod) /** * Expand ThreadStore() implementations for primitive types. 
*/ + +#if CUB_PTX_ARCH >= 200 + // Signed CUB_STORES_0124(char, char, short, s8, h) CUB_STORES_0(signed char, short, s8, h) @@ -464,6 +468,8 @@ CUB_STORES_4L(double4, double2); CUB_STORES_0124(unsigned long, ulong, unsigned long, u32, r) #endif +#endif // CUB_PTX_ARCH >= 200 + /** * Undefine macros diff --git a/cub/util_allocator.cuh b/cub/util_allocator.cuh index c30e9ac78a..2fa664dbc7 100644 --- a/cub/util_allocator.cuh +++ b/cub/util_allocator.cuh @@ -180,20 +180,20 @@ struct CachingDeviceAllocator : DeviceAllocator */ struct BlockDescriptor { - DeviceOrdinal device; // device ordinal + int device; // device ordinal void* d_ptr; // Device pointer size_t bytes; // Size of allocation in bytes unsigned int bin; // Bin enumeration // Constructor - BlockDescriptor(void *d_ptr, DeviceOrdinal device) : + BlockDescriptor(void *d_ptr, int device) : d_ptr(d_ptr), bytes(0), bin(0), device(device) {} // Constructor - BlockDescriptor(size_t bytes, unsigned int bin, DeviceOrdinal device) : + BlockDescriptor(size_t bytes, unsigned int bin, int device) : d_ptr(NULL), bytes(bytes), bin(bin), @@ -234,7 +234,7 @@ struct CachingDeviceAllocator : DeviceAllocator typedef std::multiset BusyBlocks; /// Map type of device ordinals to the number of cached bytes cached by each device - typedef std::map GpuCachedBytes; + typedef std::map GpuCachedBytes; //--------------------------------------------------------------------- @@ -355,7 +355,7 @@ struct CachingDeviceAllocator : DeviceAllocator __host__ __device__ __forceinline__ cudaError_t DeviceAllocate( void** d_ptr, size_t bytes, - DeviceOrdinal device) + int device) { #ifdef __CUDA_ARCH__ // Caching functionality only defined on host @@ -363,7 +363,7 @@ struct CachingDeviceAllocator : DeviceAllocator #else bool locked = false; - DeviceOrdinal entrypoint_device = INVALID_DEVICE_ORDINAL; + int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; // Round up to nearest bin size @@ -472,7 +472,7 @@ struct 
CachingDeviceAllocator : DeviceAllocator #else cudaError_t error = cudaSuccess; do { - DeviceOrdinal current_device; + int current_device; if (CubDebug(error = cudaGetDevice(&current_device))) break; if (CubDebug(error = DeviceAllocate(d_ptr, bytes, current_device))) break; } while(0); @@ -488,7 +488,7 @@ struct CachingDeviceAllocator : DeviceAllocator */ __host__ __device__ __forceinline__ cudaError_t DeviceFree( void* d_ptr, - DeviceOrdinal device) + int device) { #ifdef __CUDA_ARCH__ // Caching functionality only defined on host @@ -496,7 +496,7 @@ struct CachingDeviceAllocator : DeviceAllocator #else bool locked = false; - DeviceOrdinal entrypoint_device = INVALID_DEVICE_ORDINAL; + int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; BlockDescriptor search_key(d_ptr, device); @@ -581,7 +581,7 @@ struct CachingDeviceAllocator : DeviceAllocator return CubDebug(cudaErrorInvalidConfiguration); #else - DeviceOrdinal current_device; + int current_device; cudaError_t error = cudaSuccess; do { @@ -607,8 +607,8 @@ struct CachingDeviceAllocator : DeviceAllocator cudaError_t error = cudaSuccess; bool locked = false; - DeviceOrdinal entrypoint_device = INVALID_DEVICE_ORDINAL; - DeviceOrdinal current_device = INVALID_DEVICE_ORDINAL; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + int current_device = INVALID_DEVICE_ORDINAL; // Lock if (!locked) { @@ -723,7 +723,7 @@ __host__ __device__ __forceinline__ cudaError_t DeviceAllocate( { if (device_allocator == NULL) { - #if !CUB_CNP_ENABLED + #ifndef CUB_RUNTIME_ENABLED // CUDA API not supported from this device return CubDebug(cudaErrorInvalidConfiguration); #else @@ -744,7 +744,7 @@ __host__ __device__ __forceinline__ cudaError_t DeviceFree( { if (device_allocator == NULL) { - #if !CUB_CNP_ENABLED + #ifndef CUB_RUNTIME_ENABLED // CUDA API not supported from this device return CubDebug(cudaErrorInvalidConfiguration); #else diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh index ec89a00d0c..2582c80cfa 
100644 --- a/cub/util_arch.cuh +++ b/cub/util_arch.cuh @@ -70,10 +70,8 @@ namespace cub { /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. -#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)) -#define CUB_CNP_ENABLED 1 -#else -#define CUB_CNP_ENABLED 0 +#if !defined(CUB_RUNTIME_ENABLED) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)) +#define CUB_RUNTIME_ENABLED #endif diff --git a/cub/util_debug.cuh b/cub/util_debug.cuh index f8ae3ae46f..df3077723b 100644 --- a/cub/util_debug.cuh +++ b/cub/util_debug.cuh @@ -75,7 +75,7 @@ __host__ __device__ __forceinline__ cudaError_t Debug( if (error && !silent) { #if (CUB_PTX_ARCH == 0) - printf("CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); + fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); fflush(stderr); #elif (CUB_PTX_ARCH >= 200) printf("CUDA error %d [block %d, thread %d, %s, %d]\n", error, blockIdx.x, threadIdx.x, filename, line); diff --git a/cub/util_device.cuh b/cub/util_device.cuh index 5e9a5f5179..97cbedd550 100644 --- a/cub/util_device.cuh +++ b/cub/util_device.cuh @@ -58,11 +58,6 @@ template __global__ void EmptyKernel(void) { } -/** - * \brief Type for representing GPU device ordinals - */ -typedef int DeviceOrdinal; - /// Invalid device ordinal enum { @@ -75,7 +70,7 @@ enum */ __host__ __device__ __forceinline__ cudaError_t PtxVersion(int &ptx_version) { -#if !CUB_CNP_ENABLED +#ifndef CUB_RUNTIME_ENABLED // CUDA API calls not supported from this device return cudaErrorInvalidConfiguration; @@ -166,7 +161,7 @@ public: __host__ __device__ __forceinline__ cudaError_t Init(int device_ordinal) { - #if !CUB_CNP_ENABLED + #ifndef CUB_RUNTIME_ENABLED // CUDA API calls not supported from this device return CubDebug(cudaErrorInvalidConfiguration); @@ -210,7 +205,7 @@ public: __host__ __device__ __forceinline__ cudaError_t Init() { - #if 
!CUB_CNP_ENABLED + #ifndef CUB_RUNTIME_ENABLED // CUDA API calls not supported from this device return CubDebug(cudaErrorInvalidConfiguration); @@ -240,7 +235,7 @@ public: KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy int block_threads) ///< [in] Number of threads per thread block { - #if !CUB_CNP_ENABLED + #ifndef CUB_RUNTIME_ENABLED // CUDA API calls not supported from this device return CubDebug(cudaErrorInvalidConfiguration); @@ -254,30 +249,49 @@ public: cudaFuncAttributes kernel_attrs; if (CubDebug(error = cudaFuncGetAttributes(&kernel_attrs, kernel_ptr))) break; + // Number of warps per threadblock int block_warps = (block_threads + warp_threads - 1) / warp_threads; - int block_allocated_warps = CUB_ROUND_UP_NEAREST(block_warps, warp_alloc_unit); - - int block_allocated_regs = (regs_by_block) ? - CUB_ROUND_UP_NEAREST( - block_allocated_warps * kernel_attrs.numRegs * warp_threads, - reg_alloc_unit) : - block_allocated_warps * CUB_ROUND_UP_NEAREST( - kernel_attrs.numRegs * warp_threads, - reg_alloc_unit); - + // Max warp occupancy + int max_warp_occupancy = (block_warps > 0) ? 
+ max_sm_warps / block_warps : + max_sm_blocks; + + // Maximum register occupancy + int max_reg_occupancy; + if ((block_threads == 0) || (kernel_attrs.numRegs == 0)) + { + // Prevent divide-by-zero + max_reg_occupancy = max_sm_blocks; + } + else if (regs_by_block) + { + // Allocates registers by threadblock + int block_regs = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads * block_warps, reg_alloc_unit); + max_reg_occupancy = max_sm_registers / block_regs; + } + else + { + // Allocates registers by warp + int sm_sides = warp_alloc_unit; + int sm_registers_per_side = max_sm_registers / sm_sides; + int regs_per_warp = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads, reg_alloc_unit); + int warps_per_side = sm_registers_per_side / regs_per_warp; + int warps = warps_per_side * sm_sides; + max_reg_occupancy = warps / block_warps; + } + + // Shared memory per threadblock int block_allocated_smem = CUB_ROUND_UP_NEAREST( kernel_attrs.sharedSizeBytes, smem_alloc_unit); - int max_warp_occupancy = max_sm_warps / block_warps; - + // Max shared memory occupancy int max_smem_occupancy = (block_allocated_smem > 0) ? - (smem_bytes / block_allocated_smem) : - max_sm_blocks; - - int max_reg_occupancy = max_sm_registers / block_allocated_regs; + (smem_bytes / block_allocated_smem) : + max_sm_blocks; + // Max occupancy max_sm_occupancy = CUB_MIN( CUB_MIN(max_sm_blocks, max_warp_occupancy), CUB_MIN(max_smem_occupancy, max_reg_occupancy)); diff --git a/cub/util_type.cuh b/cub/util_type.cuh index c97aa2c248..fff020cef8 100644 --- a/cub/util_type.cuh +++ b/cub/util_type.cuh @@ -34,6 +34,7 @@ #pragma once #include +#include #include "util_namespace.cuh" @@ -499,7 +500,7 @@ template struct NumericTraits : BaseTraits struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits<(std::numeric_limits::is_signed) ? 
SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char> {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; diff --git a/cub/warp/warp_scan.cuh b/cub/warp/warp_scan.cuh index 89cd6107d7..fbbe980258 100644 --- a/cub/warp/warp_scan.cuh +++ b/cub/warp/warp_scan.cuh @@ -205,24 +205,21 @@ private: * Constants and typedefs ******************************************************************************/ - /// WarpScan algorithmic variants - enum WarpScanPolicy - { - SHFL_SCAN, // Warp-synchronous SHFL-based scan - SMEM_SCAN, // Warp-synchronous smem-based scan - }; - /// Constants enum { POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), }; + /// WarpScan algorithmic variants (would use an enum, but it causes GCC crash as of CUDA5) + static const int SHFL_SCAN = 0; // Warp-synchronous SHFL-based scan + static const int SMEM_SCAN = 1; // Warp-synchronous smem-based scan + /// Use SHFL_SCAN if (architecture is >= SM30) and (T is a primitive) and (T is 4-bytes or smaller) and (LOGICAL_WARP_THREADS is a power-of-two) - static const WarpScanPolicy POLICY = ((CUB_PTX_ARCH >= 300) && Traits::PRIMITIVE && (sizeof(T) <= 4) && POW_OF_TWO) ? - SHFL_SCAN : - SMEM_SCAN; + static const int POLICY = ((CUB_PTX_ARCH >= 300) && Traits::PRIMITIVE && (sizeof(T) <= 4) && POW_OF_TWO) ? + SHFL_SCAN : + SMEM_SCAN; @@ -512,9 +509,6 @@ private: { /// Warpscan layout: 1.5 warps-worth of elements for each warp. T warp_scan[WARPS][WARP_SMEM_ELEMENTS]; - - /// Single variable for broadcasting aggregate, etc. 
- T broadcast; }; @@ -522,33 +516,33 @@ private: static __device__ __forceinline__ T Broadcast( SmemStorage &smem_storage, ///< [in] Reference to shared memory allocation having layout type SmemStorage T input, ///< [in] The value to broadcast - unsigned int src_lane) ///< [in] Which warp lane is to do the broacasting + unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting { unsigned int lane_id = (WARPS == 1) ? threadIdx.x : (threadIdx.x & (LOGICAL_WARP_THREADS - 1)); + unsigned int warp_id = (WARPS == 1) ? 0 : (threadIdx.x / LOGICAL_WARP_THREADS); if (lane_id == src_lane) { - ThreadStore(&smem_storage.broadcast, input); + ThreadStore(smem_storage.warp_scan[warp_id], input); } - return ThreadLoad(&smem_storage.broadcast); +#if (CUB_PTX_ARCH <= 110) + __threadfence_block(); +#endif + return ThreadLoad(smem_storage.warp_scan[warp_id]); } - /// Basic inclusive scan + + + /// Basic inclusive scan iteration (template unrolled, inductive-case specialization) template < bool HAS_IDENTITY, bool SHARE_FINAL, - typename ScanOp> - static __device__ __forceinline__ T BasicScan( - SmemStorage &smem_storage, ///< Reference to shared memory allocation having layout type SmemStorage - unsigned int warp_id, ///< Warp id - unsigned int lane_id, ///< thread-lane id - T partial, ///< Calling thread's input partial reduction - ScanOp scan_op) ///< Binary associative scan functor + int STEP> + struct Iteration { - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) + template + static __device__ __forceinline__ void ScanStep(SmemStorage &smem_storage, unsigned int warp_id, unsigned int lane_id, T &partial, ScanOp scan_op) { const int OFFSET = 1 << STEP; @@ -561,8 +555,37 @@ private: T addend = ThreadLoad(&smem_storage.warp_scan[warp_id][HALF_WARP_THREADS + lane_id - OFFSET]); partial = scan_op(addend, partial); } + + Iteration::ScanStep(smem_storage, warp_id, lane_id, partial, scan_op); } + }; + + /// Basic inclusive scan 
iteration (template unrolled, base-case specialization) + template < + bool HAS_IDENTITY, + bool SHARE_FINAL> + struct Iteration + { + template + static __device__ __forceinline__ void ScanStep(SmemStorage &smem_storage, unsigned int warp_id, unsigned int lane_id, T &partial, ScanOp scan_op) {} + }; + + + /// Basic inclusive scan + template < + bool HAS_IDENTITY, + bool SHARE_FINAL, + typename ScanOp> + static __device__ __forceinline__ T BasicScan( + SmemStorage &smem_storage, ///< Reference to shared memory allocation having layout type SmemStorage + unsigned int warp_id, ///< Warp id + unsigned int lane_id, ///< thread-lane id + T partial, ///< Calling thread's input partial reduction + ScanOp scan_op) ///< Binary associative scan functor + { + // Iterate scan steps + Iteration::ScanStep(smem_storage, warp_id, lane_id, partial, scan_op); if (SHARE_FINAL) { diff --git a/docs/download_cub.html b/docs/download_cub.html index e67862f65f..495332b463 100644 --- a/docs/download_cub.html +++ b/docs/download_cub.html @@ -37,14 +37,14 @@
If your download doesn't start in 3s:

- -Download CUB! + +Download CUB!
diff --git a/docs/html/annotated.html b/docs/html/annotated.html index bddba3a726..593cabb05b 100644 --- a/docs/html/annotated.html +++ b/docs/html/annotated.html @@ -165,8 +165,8 @@  oCEnableIfSimple enable-if (similar to Boost)  oCEqualityDefault equality functor  oCEqualsType equality test - oCGridEvenShareA descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains) - oCGridQueueAbstraction for grid-wide queue management + oCGridEvenShareGridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains) + oCGridQueueGridQueue is a descriptor utility for dynamic queue management  oCIfType selection (IF ? ThenType : ElseType)  oCInt2TypeAllows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values)  oCIsPointerPointer vs. iterator @@ -198,7 +198,7 @@