NVIDIA · gevtushenko · Jun 23, 2022 · Jun 9, 2022 · Jun 21, 2022
diff --git a/cub/agent/agent_adjacent_difference.cuh b/cub/agent/agent_adjacent_difference.cuh
@@ -143,16 +143,40 @@ struct AgentDifference
     {
       if (IS_FIRST_TILE)
       {
-        BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
-          .SubtractLeft(input, output, difference_op);
+        if (IS_LAST_TILE)
+        {
+          BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
+            .SubtractLeftPartialTile(input,
+                                     output,
+                                     difference_op,
+                                     num_remaining);
+        }
+        else
+        {
+          BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
+            .SubtractLeft(input, output, difference_op);
+        }
       }
       else
       {
-        InputT tile_prev_input = MayAlias ? first_tile_previous[tile_idx]
-                                          : *(input_it + tile_base - 1);
-
-        BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
-          .SubtractLeft(input, output, difference_op, tile_prev_input);
+        InputT tile_prev_input = MayAlias 
+                               ? first_tile_previous[tile_idx]
+                               : *(input_it + tile_base - 1);
+
+        if (IS_LAST_TILE)
+        {
+          BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
+            .SubtractLeftPartialTile(input,
+                                     output,
+                                     difference_op,
+                                     num_remaining,
+                                     tile_prev_input);
+        }
+        else
+        {
+          BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
+            .SubtractLeft(input, output, difference_op, tile_prev_input);
+        }
       }
     }
     else

diff --git a/cub/block/block_adjacent_difference.cuh b/cub/block/block_adjacent_difference.cuh
@@ -489,15 +489,16 @@ public:
     }
 
     /**
-     * @brief Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
+     * @brief Subtracts the left element of each adjacent pair of elements 
+     *        partitioned across a CUDA thread block.
      *
      * @par
      * - \rowmajor
      * - \smemreuse
      *
      * @par Snippet
-     * The code snippet below illustrates how to use @p BlockAdjacentDifference to
-     * compute the left difference between adjacent elements.
+     * The code snippet below illustrates how to use @p BlockAdjacentDifference 
+     * to compute the left difference between adjacent elements.
      *
      * @par
      * @code
@@ -515,30 +516,152 @@ public:
      *
      * __global__ void ExampleKernel(...)
      * {
-     *     // Specialize BlockAdjacentDifference for a 1D block of 
-     *     // 128 threads of type int
-     *     using BlockAdjacentDifferenceT =
-     *        cub::BlockAdjacentDifference<int, 128>;
+     *   // Specialize BlockAdjacentDifference for a 1D block of 
+     *   // 128 threads of type int
+     *   using BlockAdjacentDifferenceT =
+     *      cub::BlockAdjacentDifference<int, 128>;
+     *
+     *   // Allocate shared memory for BlockDiscontinuity
+     *   __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+     *
+     *   // Obtain a segment of consecutive items that are blocked across threads
+     *   int thread_data[4];
+     *   ...
+     *   int valid_items = 9;
+     *
+     *   // Collectively compute adjacent_difference
+     *   BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
+     *       thread_data,
+     *       thread_data,
+     *       CustomDifference(),
+     *       valid_items);
      *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+     * @endcode
+     * @par
+     * Suppose the set of input `thread_data` across the block of threads is
+     * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
+     * The corresponding output `result` in those threads will be
+     * `{ [4,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.
      *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
+     * @param[out] output
+     *   Calling thread's adjacent difference result
      *
-     *     // Collectively compute adjacent_difference
-     *     BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
-     *         thread_data,
-     *         thread_data,
-     *         CustomDifference());
+     * @param[in] input
+     *   Calling thread's input items (may be aliased to \p output)
+     *
+     * @param[in] difference_op
+     *   Binary difference operator
+     *
+     * @param[in] valid_items
+     *   Number of valid items in thread block
+     */
+    template <int ITEMS_PER_THREAD,
+              typename OutputType,
+              typename DifferenceOpT>
+    __device__ __forceinline__ void
+    SubtractLeftPartialTile(T (&input)[ITEMS_PER_THREAD],
+                            OutputType (&output)[ITEMS_PER_THREAD],
+                            DifferenceOpT difference_op,
+                            int valid_items)
+    {
+      // Share last item
+      temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+      CTA_SYNC();
+
+      if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items)
+      {
+        #pragma unroll
+        for (int item = ITEMS_PER_THREAD - 1; item > 0; item--)
+        {
+          output[item] = difference_op(input[item], input[item - 1]);
+        }
+      }
+      else
+      {
+        #pragma unroll
+        for (int item = ITEMS_PER_THREAD - 1; item > 0; item--)
+        {
+          const int idx = linear_tid * ITEMS_PER_THREAD + item;
+
+          if (idx < valid_items)
+          {
+            output[item] = difference_op(input[item], input[item - 1]);
+          }
+          else
+          {
+            output[item] = input[item];
+          }
+        }
+      }
+
+      if (linear_tid == 0 || valid_items <= linear_tid * ITEMS_PER_THREAD)
+      {
+        output[0] = input[0];
+      }
+      else
+      {
+        output[0] = difference_op(input[0],
+                                  temp_storage.last_items[linear_tid - 1]);
+      }
+    }
+
+    /**
+     * @brief Subtracts the left element of each adjacent pair of elements 
+     *        partitioned across a CUDA thread block.
+     *
+     * @par
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * @par Snippet
+     * The code snippet below illustrates how to use @p BlockAdjacentDifference 
+     * to compute the left difference between adjacent elements.
+     *
+     * @par
+     * @code
+     * #include <cub/cub.cuh>
+     * // or equivalently <cub/block/block_adjacent_difference.cuh>
+     *
+     * struct CustomDifference
+     * {
+     *   template <typename DataType>
+     *   __device__ DataType operator()(DataType &lhs, DataType &rhs)
+     *   {
+     *     return lhs - rhs;
+     *   }
+     * };
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *   // Specialize BlockAdjacentDifference for a 1D block of 
+     *   // 128 threads of type int
+     *   using BlockAdjacentDifferenceT =
+     *      cub::BlockAdjacentDifference<int, 128>;
+     *
+     *   // Allocate shared memory for BlockDiscontinuity
+     *   __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+     *
+     *   // Obtain a segment of consecutive items that are blocked across threads
+     *   int thread_data[4];
+     *   ...
+     *   int valid_items = 9;
+     *   int tile_predecessor_item = 4;
+     *
+     *   // Collectively compute adjacent_difference
+     *   BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
+     *       thread_data,
+     *       thread_data,
+     *       CustomDifference(),
+     *       valid_items,
+     *       tile_predecessor_item);
      *
      * @endcode
      * @par
      * Suppose the set of input `thread_data` across the block of threads is
      * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
      * The corresponding output `result` in those threads will be
-     * `{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`.
+     * `{ [0,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.
      *
      * @param[out] output
      *   Calling thread's adjacent difference result
@@ -551,6 +674,11 @@ public:
      *
      * @param[in] valid_items
      *   Number of valid items in thread block
+     *
+     * @param[in] tile_predecessor_item
+     *   **[<em>thread</em><sub>0</sub> only]** item which is going to be
+     *   subtracted from the first tile item (<tt>input<sub>0</sub></tt> from
+     *   <em>thread</em><sub>0</sub>).
      */
     template <int ITEMS_PER_THREAD,
               typename OutputType,
@@ -559,7 +687,8 @@ public:
     SubtractLeftPartialTile(T (&input)[ITEMS_PER_THREAD],
                             OutputType (&output)[ITEMS_PER_THREAD],
                             DifferenceOpT difference_op,
-                            int valid_items)
+                            int valid_items,
+                            T tile_predecessor_item)
     {
       // Share last item
       temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
@@ -592,10 +721,15 @@ public:
         }
       }
 
-      if (linear_tid == 0 || valid_items <= linear_tid * ITEMS_PER_THREAD)
+      if (valid_items <= linear_tid * ITEMS_PER_THREAD)
       {
         output[0] = input[0];
       }
+      else if (linear_tid == 0) 
+      {
+        output[0] = difference_op(input[0], 
+                                  tile_predecessor_item);
+      }
       else
       {
         output[0] = difference_op(input[0],

diff --git a/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -199,30 +199,27 @@ struct DispatchAdjacentDifference : public SelectedPolicy
                                              sizeof(InputT);
 
       void *allocations[1]            = {nullptr};
-      std::size_t allocation_sizes[1] = {first_tile_previous_size};
+      std::size_t allocation_sizes[1] = {MayAlias * first_tile_previous_size};
 
-      if (MayAlias)
+      if (CubDebug(error = AliasTemporaries(d_temp_storage,
+                                            temp_storage_bytes,
+                                            allocations,
+                                            allocation_sizes)))
       {
-        if (CubDebug(error = AliasTemporaries(d_temp_storage,
-                                              temp_storage_bytes,
-                                              allocations,
-                                              allocation_sizes)))
-        {
-          break;
-        }
-
-        if (d_temp_storage == nullptr)
-        {
-          // Return if the caller is simply requesting the size of the storage
-          // allocation
+        break;
+      }
 
-          if (temp_storage_bytes == 0)
-          {
-            temp_storage_bytes = 1;
-          }
+      if (d_temp_storage == nullptr)
+      {
+        // Return if the caller is simply requesting the size of the storage
+        // allocation
 
-          break;
+        if (temp_storage_bytes == 0)
+        {
+          temp_storage_bytes = 1;
         }
+
+        break;
       }
 
       if (num_items == OffsetT{})