diff --git a/cub/agent/agent_adjacent_difference.cuh b/cub/agent/agent_adjacent_difference.cuh
index fd07731639..b135fbbf53 100644
--- a/cub/agent/agent_adjacent_difference.cuh
+++ b/cub/agent/agent_adjacent_difference.cuh
@@ -143,16 +143,40 @@ struct AgentDifference
     {
       if (IS_FIRST_TILE)
       {
-        BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
-          .SubtractLeft(input, output, difference_op);
+        if (IS_LAST_TILE)
+        {
+          BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
+            .SubtractLeftPartialTile(input,
+                                     output,
+                                     difference_op,
+                                     num_remaining);
+        }
+        else
+        {
+          BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
+            .SubtractLeft(input, output, difference_op);
+        }
       }
       else
       {
-        InputT tile_prev_input = MayAlias ? first_tile_previous[tile_idx]
-                                          : *(input_it + tile_base - 1);
-
-        BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
-          .SubtractLeft(input, output, difference_op, tile_prev_input);
+        InputT tile_prev_input = MayAlias 
+                               ? first_tile_previous[tile_idx]
+                               : *(input_it + tile_base - 1);
+
+        if (IS_LAST_TILE)
+        {
+          BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
+            .SubtractLeftPartialTile(input,
+                                     output,
+                                     difference_op,
+                                     num_remaining,
+                                     tile_prev_input);
+        }
+        else
+        {
+          BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
+            .SubtractLeft(input, output, difference_op, tile_prev_input);
+        }
       }
     }
     else
diff --git a/cub/block/block_adjacent_difference.cuh b/cub/block/block_adjacent_difference.cuh
index 5c7be8a729..524ffbebfa 100644
--- a/cub/block/block_adjacent_difference.cuh
+++ b/cub/block/block_adjacent_difference.cuh
@@ -489,15 +489,16 @@ public:
     }
 
     /**
-     * @brief Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
+     * @brief Subtracts the left element of each adjacent pair of elements 
+     *        partitioned across a CUDA thread block.
      *
      * @par
      * - \rowmajor
      * - \smemreuse
      *
      * @par Snippet
-     * The code snippet below illustrates how to use @p BlockAdjacentDifference to
-     * compute the left difference between adjacent elements.
+     * The code snippet below illustrates how to use @p BlockAdjacentDifference 
+     * to compute the left difference between adjacent elements.
      *
      * @par
      * @code
@@ -515,30 +516,152 @@ public:
      *
      * __global__ void ExampleKernel(...)
      * {
-     *     // Specialize BlockAdjacentDifference for a 1D block of 
-     *     // 128 threads of type int
-     *     using BlockAdjacentDifferenceT =
-     *        cub::BlockAdjacentDifference<int, 128>;
+     *   // Specialize BlockAdjacentDifference for a 1D block of 
+     *   // 128 threads of type int
+     *   using BlockAdjacentDifferenceT =
+     *      cub::BlockAdjacentDifference<int, 128>;
+     *
+     *   // Allocate shared memory for BlockDiscontinuity
+     *   __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+     *
+     *   // Obtain a segment of consecutive items that are blocked across threads
+     *   int thread_data[4];
+     *   ...
+     *   int valid_items = 9;
+     *
+     *   // Collectively compute adjacent_difference
+     *   BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
+     *       thread_data,
+     *       thread_data,
+     *       CustomDifference(),
+     *       valid_items);
      *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+     * @endcode
+     * @par
+     * Suppose the set of input `thread_data` across the block of threads is
+     * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
+     * The corresponding output `result` in those threads will be
+     * `{ [4,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.
      *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
+     * @param[out] output
+     *   Calling thread's adjacent difference result
      *
-     *     // Collectively compute adjacent_difference
-     *     BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
-     *         thread_data,
-     *         thread_data,
-     *         CustomDifference());
+     * @param[in] input
+     *   Calling thread's input items (may be aliased to \p output)
+     *
+     * @param[in] difference_op
+     *   Binary difference operator
+     *
+     * @param[in] valid_items
+     *   Number of valid items in thread block
+     */
+    template <int ITEMS_PER_THREAD,
+              typename OutputType,
+              typename DifferenceOpT>
+    __device__ __forceinline__ void
+    SubtractLeftPartialTile(T (&input)[ITEMS_PER_THREAD],
+                            OutputType (&output)[ITEMS_PER_THREAD],
+                            DifferenceOpT difference_op,
+                            int valid_items)
+    {
+      // Share last item
+      temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+      CTA_SYNC();
+
+      if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items)
+      {
+        #pragma unroll
+        for (int item = ITEMS_PER_THREAD - 1; item > 0; item--)
+        {
+          output[item] = difference_op(input[item], input[item - 1]);
+        }
+      }
+      else
+      {
+        #pragma unroll
+        for (int item = ITEMS_PER_THREAD - 1; item > 0; item--)
+        {
+          const int idx = linear_tid * ITEMS_PER_THREAD + item;
+
+          if (idx < valid_items)
+          {
+            output[item] = difference_op(input[item], input[item - 1]);
+          }
+          else
+          {
+            output[item] = input[item];
+          }
+        }
+      }
+
+      if (linear_tid == 0 || valid_items <= linear_tid * ITEMS_PER_THREAD)
+      {
+        output[0] = input[0];
+      }
+      else
+      {
+        output[0] = difference_op(input[0],
+                                  temp_storage.last_items[linear_tid - 1]);
+      }
+    }
+
+    /**
+     * @brief Subtracts the left element of each adjacent pair of elements 
+     *        partitioned across a CUDA thread block.
+     *
+     * @par
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * @par Snippet
+     * The code snippet below illustrates how to use @p BlockAdjacentDifference 
+     * to compute the left difference between adjacent elements.
+     *
+     * @par
+     * @code
+     * #include <cub/cub.cuh>
+     * // or equivalently <cub/block/block_adjacent_difference.cuh>
+     *
+     * struct CustomDifference
+     * {
+     *   template <typename DataType>
+     *   __device__ DataType operator()(DataType &lhs, DataType &rhs)
+     *   {
+     *     return lhs - rhs;
+     *   }
+     * };
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *   // Specialize BlockAdjacentDifference for a 1D block of 
+     *   // 128 threads of type int
+     *   using BlockAdjacentDifferenceT =
+     *      cub::BlockAdjacentDifference<int, 128>;
+     *
+     *   // Allocate shared memory for BlockDiscontinuity
+     *   __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
+     *
+     *   // Obtain a segment of consecutive items that are blocked across threads
+     *   int thread_data[4];
+     *   ...
+     *   int valid_items = 9;
+     *   int tile_predecessor_item = 4;
+     *
+     *   // Collectively compute adjacent_difference
+     *   BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
+     *       thread_data,
+     *       thread_data,
+     *       CustomDifference(),
+     *       valid_items,
+     *       tile_predecessor_item);
      *
      * @endcode
      * @par
      * Suppose the set of input `thread_data` across the block of threads is
      * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
      * The corresponding output `result` in those threads will be
-     * `{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`.
+     * `{ [0,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.
      *
      * @param[out] output
      *   Calling thread's adjacent difference result
@@ -551,6 +674,11 @@ public:
      *
      * @param[in] valid_items
      *   Number of valid items in thread block
+     *
+     * @param[in] tile_predecessor_item
+     *   **[<em>thread</em><sub>0</sub> only]** item which is going to be
+     *   subtracted from the first tile item (<tt>input<sub>0</sub></tt> from
+     *   <em>thread</em><sub>0</sub>).
      */
     template <int ITEMS_PER_THREAD,
               typename OutputType,
@@ -559,7 +687,8 @@ public:
     SubtractLeftPartialTile(T (&input)[ITEMS_PER_THREAD],
                             OutputType (&output)[ITEMS_PER_THREAD],
                             DifferenceOpT difference_op,
-                            int valid_items)
+                            int valid_items,
+                            T tile_predecessor_item)
     {
       // Share last item
       temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
@@ -592,10 +721,15 @@ public:
         }
       }
 
-      if (linear_tid == 0 || valid_items <= linear_tid * ITEMS_PER_THREAD)
+      if (valid_items <= linear_tid * ITEMS_PER_THREAD)
       {
         output[0] = input[0];
       }
+      else if (linear_tid == 0) 
+      {
+        output[0] = difference_op(input[0], 
+                                  tile_predecessor_item);
+      }
       else
       {
         output[0] = difference_op(input[0],
diff --git a/test/test_block_adjacent_difference.cu b/test/test_block_adjacent_difference.cu
index 26b61b1e5a..cf78d72c57 100644
--- a/test/test_block_adjacent_difference.cu
+++ b/test/test_block_adjacent_difference.cu
@@ -55,6 +55,12 @@
 
 using namespace cub;
 
+template <typename DataType>
+using CountingIteratorT =
+  typename thrust::counting_iterator<DataType,
+    thrust::use_default,
+    std::size_t,
+    std::size_t>;
 
 /**
  * \brief Generates integer sequence \f$S_n=i(i-1)/2\f$.
@@ -133,14 +139,14 @@ struct CustomDifference
   }
 };
 
-
 template <typename DataType,
           unsigned int ThreadsInBlock,
           unsigned int ItemsPerThread,
-          bool ReadLeft = false>
-__global__ void LastTileTestKernel(const DataType *input,
-                                   DataType *output,
-                                   unsigned int valid_items)
+          typename ActionT>
+__global__ void AdjDiffKernel(const DataType *input,
+                              DataType *output,
+                              ActionT action,
+                              bool in_place)
 {
   using BlockAdjacentDifferenceT =
     cub::BlockAdjacentDifference<DataType, ThreadsInBlock>;
@@ -158,252 +164,132 @@ __global__ void LastTileTestKernel(const DataType *input,
   }
   __syncthreads();
 
-  if (ReadLeft)
-  {
-    BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
-      thread_data,
-      thread_result,
-      CustomDifference(),
-      valid_items);
-  }
-  else
-  {
-    BlockAdjacentDifferenceT(temp_storage).SubtractRightPartialTile(
-      thread_data,
-      thread_result,
-      CustomDifference(),
-      valid_items);
-  }
+  BlockAdjacentDifferenceT block_adj_diff(temp_storage);
 
-  for (unsigned int item = 0; item < ItemsPerThread; item++)
+  if (in_place)
   {
-    output[thread_offset + item] = thread_result[item];
-  }
-}
-
+    action(thread_data, thread_data, block_adj_diff);
 
-template <typename DataType,
-          unsigned int ThreadsInBlock,
-          unsigned int ItemsPerThread,
-          bool ReadLeft = false>
-__global__ void MiddleTileTestKernel(const DataType *input,
-                                     DataType *output,
-                                     DataType neighbour_tile_value)
-{
-  using BlockAdjacentDifferenceT =
-    cub::BlockAdjacentDifference<DataType, ThreadsInBlock>;
-
-  __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
-
-  DataType thread_data[ItemsPerThread];
-  DataType thread_result[ItemsPerThread];
-
-  const unsigned int thread_offset = threadIdx.x * ItemsPerThread;
-
-  for (unsigned int item = 0; item < ItemsPerThread; item++)
-  {
-    thread_data[item] = input[thread_offset + item];
+    for (unsigned int item = 0; item < ItemsPerThread; item++)
+    {
+      output[thread_offset + item] = thread_data[item];
+    }
   }
-  __syncthreads();
-
-  if (ReadLeft)
+  else 
   {
-    BlockAdjacentDifferenceT(temp_storage)
-      .SubtractLeft(thread_data,
-                    thread_result,
-                    CustomDifference(),
-                    neighbour_tile_value);
-  }
-  else
-  {
-    BlockAdjacentDifferenceT(temp_storage)
-      .SubtractRight(thread_data,
-                     thread_result,
-                     CustomDifference(),
-                     neighbour_tile_value);
-  }
+    action(thread_data, thread_result, block_adj_diff);
 
-  for (unsigned int item = 0; item < ItemsPerThread; item++)
-  {
-    output[thread_offset + item] = thread_result[item];
+    for (unsigned int item = 0; item < ItemsPerThread; item++)
+    {
+      output[thread_offset + item] = thread_result[item];
+    }
   }
 }
 
-
 template <typename DataType,
-          unsigned int ThreadsInBlock,
           unsigned int ItemsPerThread,
-          bool ReadLeft = false>
-__global__ void MiddleTileInplaceTestKernel(const DataType *input,
-                                            DataType *output,
-                                            DataType neighbour_tile_value)
-{
-  using BlockAdjacentDifferenceT =
-    cub::BlockAdjacentDifference<DataType, ThreadsInBlock>;
-
-  __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
-
-  DataType thread_data[ItemsPerThread];
-
-  const unsigned int thread_offset = threadIdx.x * ItemsPerThread;
-
-  for (unsigned int item = 0; item < ItemsPerThread; item++)
-  {
-    thread_data[item] = input[thread_offset + item];
-  }
-  __syncthreads();
-
-  if (ReadLeft)
-  {
-    BlockAdjacentDifferenceT(temp_storage)
-      .SubtractLeft(thread_data,
-                    thread_data,
-                    CustomDifference(),
-                    neighbour_tile_value);
-  }
-  else
-  {
-    BlockAdjacentDifferenceT(temp_storage)
-      .SubtractRight(thread_data,
-                     thread_data,
-                     CustomDifference(),
-                     neighbour_tile_value);
-  }
-
-  for (unsigned int item = 0; item < ItemsPerThread; item++)
-  {
-    output[thread_offset + item] = thread_data[item];
-  }
-}
-
-
-template <typename DataType,
           unsigned int ThreadsInBlock,
-          unsigned int ItemsPerThread,
-          bool ReadLeft = false>
-__global__ void TestKernel(DataType *data)
+          typename ActionT>
+void AdjDiffTest(const DataType *input,
+                 DataType *output,
+                 ActionT action,
+                 bool in_place = false)
 {
-  using BlockAdjacentDifferenceT =
-    cub::BlockAdjacentDifference<DataType, ThreadsInBlock>;
-
-  __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
-
-  DataType thread_data[ItemsPerThread];
-  DataType thread_result[ItemsPerThread];
-
-  const unsigned int thread_offset = threadIdx.x * ItemsPerThread;
-
-  for (unsigned int item = 0; item < ItemsPerThread; item++)
-  {
-    thread_data[item] = data[thread_offset + item];
-  }
-  __syncthreads();
+  AdjDiffKernel<DataType, ThreadsInBlock, ItemsPerThread, ActionT>
+    <<<1, ThreadsInBlock>>>(input, output, action, in_place);
 
-  if (ReadLeft)
-  {
-    BlockAdjacentDifferenceT(temp_storage)
-      .SubtractLeft(thread_data, thread_result, CustomDifference());
-  }
-  else
-  {
-    BlockAdjacentDifferenceT(temp_storage)
-      .SubtractRight(thread_data, thread_result, CustomDifference());
-  }
-
-  for (unsigned int item = 0; item < ItemsPerThread; item++)
-  {
-    data[thread_offset + item] = thread_result[item];
-  }
+  CubDebugExit(cudaPeekAtLastError());
+  CubDebugExit(cudaDeviceSynchronize());
 }
 
-
-template <typename DataType,
-          unsigned int ThreadsInBlock,
-          unsigned int ItemsPerThread,
-          bool ReadLeft = false>
-__global__ void LastTileTestInplaceKernel(const DataType *input,
-                                          DataType *output,
-                                          unsigned int valid_items)
+template <bool ReadLeft>
+struct LastTileOpT
 {
-  using BlockAdjacentDifferenceT =
-    cub::BlockAdjacentDifference<DataType, ThreadsInBlock>;
-
-  __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
-
-  DataType thread_data[ItemsPerThread];
-
-  const unsigned int thread_offset = threadIdx.x * ItemsPerThread;
-
-  for (unsigned int item = 0; item < ItemsPerThread; item++)
-  {
-    thread_data[item] = input[thread_offset + item];
-  }
-  __syncthreads();
+  unsigned int m_valid_items{};
 
-  if (ReadLeft)
-  {
-    BlockAdjacentDifferenceT(temp_storage)
-      .SubtractLeftPartialTile(thread_data,
-                               thread_data,
-                               CustomDifference(),
-                               valid_items);
-  }
-  else
-  {
-    BlockAdjacentDifferenceT(temp_storage)
-      .SubtractRightPartialTile(thread_data,
-                                thread_data,
-                                CustomDifference(),
-                                valid_items);
-  }
+  __host__ LastTileOpT(unsigned int valid_items)
+    : m_valid_items(valid_items)
+  {}
 
-  for (unsigned int item = 0; item < ItemsPerThread; item++)
-  {
-    output[thread_offset + item] = thread_data[item];
+  template <int ITEMS_PER_THREAD,
+            typename T,
+            typename BlockAdjDiff>
+  __device__ void operator()(T (&input)[ITEMS_PER_THREAD],
+                             T (&output)[ITEMS_PER_THREAD],
+                             BlockAdjDiff &block_adj_diff) const
+  {
+    if (ReadLeft)
+    {
+      block_adj_diff.SubtractLeftPartialTile(input,
+                                             output,
+                                             CustomDifference(),
+                                             m_valid_items);
+    }
+    else
+    {
+      block_adj_diff.SubtractRightPartialTile(input,
+                                              output,
+                                              CustomDifference(),
+                                              m_valid_items);
+    }
   }
-}
+};
 
-template <typename DataType,
-          unsigned int ThreadsInBlock,
-          unsigned int ItemsPerThread,
-          bool ReadLeft = false>
-__global__ void TestInplaceKernel(DataType *data)
+template <typename DataType, 
+          bool ReadLeft>
+struct MiddleTileOpT
 {
-  using BlockAdjacentDifferenceT =
-    cub::BlockAdjacentDifference<DataType, ThreadsInBlock>;
-
-  __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
-
-  DataType thread_data[ItemsPerThread];
+  DataType m_neighbour_tile_value;
 
-  const unsigned int thread_offset = threadIdx.x * ItemsPerThread;
+  __host__ MiddleTileOpT(DataType neighbour_tile_value)
+    : m_neighbour_tile_value(neighbour_tile_value)
+  {}
 
-  for (unsigned int item = 0; item < ItemsPerThread; item++)
-  {
-    thread_data[item] = data[thread_offset + item];
+  template <int ITEMS_PER_THREAD, typename T, typename BlockAdjDiff>
+  __device__ void operator()(T (&input)[ITEMS_PER_THREAD],
+                             T (&output)[ITEMS_PER_THREAD],
+                             BlockAdjDiff &block_adj_diff) const
+  {
+    if (ReadLeft)
+    {
+      block_adj_diff.SubtractLeft(input,
+                                  output,
+                                  CustomDifference(),
+                                  m_neighbour_tile_value);
+    }
+    else
+    {
+      block_adj_diff.SubtractRight(input,
+                                   output,
+                                   CustomDifference(),
+                                   m_neighbour_tile_value);
+    }
   }
-  __syncthreads();
+};
 
-  if (ReadLeft)
-  {
-    BlockAdjacentDifferenceT(temp_storage)
-      .SubtractLeft(thread_data,
-                    thread_data,
-                    CustomDifference());
-  }
-  else
-  {
-    BlockAdjacentDifferenceT(temp_storage)
-      .SubtractRight(thread_data,
-                     thread_data,
-                     CustomDifference());
+template <bool ReadLeft>
+struct BaseOpT
+{
+  template <int ITEMS_PER_THREAD, typename T, typename BlockAdjDiff>
+  __device__ void operator()(T (&input)[ITEMS_PER_THREAD],
+                             T (&output)[ITEMS_PER_THREAD],
+                             BlockAdjDiff &block_adj_diff) const
+  {
+    if (ReadLeft)
+    {
+      block_adj_diff.SubtractLeft(input,
+                                  output,
+                                  CustomDifference());
+    }
+    else
+    {
+      block_adj_diff.SubtractRight(input,
+                                   output,
+                                   CustomDifference());
+    }
   }
+};
 
-  for (unsigned int item = 0; item < ItemsPerThread; item++)
-  {
-    data[thread_offset + item] = thread_data[item];
-  }
-}
 
 template <typename DataType,
           unsigned int ItemsPerThread,
@@ -411,59 +297,71 @@ template <typename DataType,
           bool ReadLeft = false>
 void LastTileTest(const DataType *input,
                   DataType *output,
-                  unsigned int valid_items)
+                  unsigned int valid_items,
+                  bool in_place)
 {
-  LastTileTestKernel<DataType, ThreadsInBlock, ItemsPerThread, ReadLeft>
-    <<<1, ThreadsInBlock>>>(input, output, valid_items);
-
-  CubDebugExit(cudaPeekAtLastError());
-  CubDebugExit(cudaDeviceSynchronize());
+  AdjDiffTest<DataType, ItemsPerThread, ThreadsInBlock>(input,
+                                                        output,
+                                                        LastTileOpT<ReadLeft>{
+                                                          valid_items},
+                                                        in_place);
 }
 
-
-template <typename DataType,
-          unsigned int ItemsPerThread,
-          unsigned int ThreadsInBlock,
-          bool ReadLeft = false>
-void Test(DataType *data)
+template <typename DataType>
+struct LastTileWithPredOpT
 {
-  TestKernel<DataType, ThreadsInBlock, ItemsPerThread, ReadLeft>
-    <<<1, ThreadsInBlock>>>(data);
+  unsigned int m_valid_items;
+  DataType m_neighbour_tile_value;
 
-  CubDebugExit(cudaPeekAtLastError());
-  CubDebugExit(cudaDeviceSynchronize());
-}
+  __host__ LastTileWithPredOpT(
+      unsigned int valid_items,
+      DataType neighbour_tile_value)
+    : m_valid_items(valid_items)
+    , m_neighbour_tile_value(neighbour_tile_value)
+  {
+  }
 
+  template <int ITEMS_PER_THREAD, typename T, typename BlockAdjDiff>
+  __device__ void operator()(T (&input)[ITEMS_PER_THREAD],
+                             T (&output)[ITEMS_PER_THREAD],
+                             BlockAdjDiff &block_adj_diff) const
+  {
+    block_adj_diff.SubtractLeftPartialTile(input,
+                                           output,
+                                           CustomDifference(),
+                                           m_valid_items,
+                                           m_neighbour_tile_value);
+  }
+};
 
 template <typename DataType,
           unsigned int ItemsPerThread,
-          unsigned int ThreadsInBlock,
-          bool ReadLeft = false>
-void MiddleTileTest(const DataType *input,
-                    DataType *output,
-                    DataType neighbour_tile_value)
+          unsigned int ThreadsInBlock>
+void LastTileWithPredTest(const DataType *input,
+                          DataType *output,
+                          unsigned int valid_items,
+                          DataType neighbour_tile_value,
+                          bool in_place)
 {
-  MiddleTileTestKernel<DataType, ThreadsInBlock, ItemsPerThread, ReadLeft>
-    <<<1, ThreadsInBlock>>>(input, output, neighbour_tile_value);
-
-  CubDebugExit(cudaPeekAtLastError());
-  CubDebugExit(cudaDeviceSynchronize());
+  AdjDiffTest<DataType, ItemsPerThread, ThreadsInBlock>(
+    input,
+    output,
+    LastTileWithPredOpT<DataType>{valid_items, neighbour_tile_value},
+    in_place);
 }
 
-
 template <typename DataType,
           unsigned int ItemsPerThread,
           unsigned int ThreadsInBlock,
           bool ReadLeft = false>
-void LastTileInplaceTest(const DataType *input,
-                         DataType *output,
-                         unsigned int valid_items)
+void Test(DataType *data,
+          bool in_place)
 {
-  LastTileTestInplaceKernel<DataType, ThreadsInBlock, ItemsPerThread, ReadLeft>
-    <<<1, ThreadsInBlock>>>(input, output, valid_items);
-
-  CubDebugExit(cudaPeekAtLastError());
-  CubDebugExit(cudaDeviceSynchronize());
+  AdjDiffTest<DataType, ItemsPerThread, ThreadsInBlock>(
+    data,
+    data,
+    BaseOpT<ReadLeft>{},
+    in_place);
 }
 
 
@@ -471,29 +369,16 @@ template <typename DataType,
           unsigned int ItemsPerThread,
           unsigned int ThreadsInBlock,
           bool ReadLeft = false>
-void InplaceTest(DataType *data)
-{
-  TestInplaceKernel<DataType, ThreadsInBlock, ItemsPerThread, ReadLeft>
-    <<<1, ThreadsInBlock>>>(data);
-
-  CubDebugExit(cudaPeekAtLastError());
-  CubDebugExit(cudaDeviceSynchronize());
-}
-
-
-template <typename DataType,
-          unsigned int ItemsPerThread,
-          unsigned int ThreadsInBlock,
-          bool ReadLeft = false>
-void MiddleTileInplaceTest(const DataType *input,
-                           DataType *output,
-                           DataType neighbour_tile_value)
+void MiddleTileTest(const DataType *input,
+                    DataType *output,
+                    DataType neighbour_tile_value,
+                    bool in_place)
 {
-  MiddleTileInplaceTestKernel<DataType, ThreadsInBlock, ItemsPerThread, ReadLeft>
-    <<<1, ThreadsInBlock>>>(input, output, neighbour_tile_value);
-
-  CubDebugExit(cudaPeekAtLastError());
-  CubDebugExit(cudaDeviceSynchronize());
+  AdjDiffTest<DataType, ItemsPerThread, ThreadsInBlock>(
+    input,
+    output,
+    MiddleTileOpT<DataType, ReadLeft>{neighbour_tile_value},
+    in_place);
 }
 
 
@@ -532,56 +417,48 @@ void TestLastTile(bool inplace,
   DataType *d_input_ptr = thrust::raw_pointer_cast(d_input.data());
   DataType *d_output_ptr = thrust::raw_pointer_cast(d_output.data());
 
-  if (inplace)
-  {
-    LastTileInplaceTest<DataType, ItemsPerThread, ThreadsInBlock, read_left>(
-      d_input_ptr,
-      d_output_ptr,
-      num_items);
-  }
-  else
-  {
-    LastTileTest<DataType, ItemsPerThread, ThreadsInBlock, read_left>(
-      d_input_ptr,
-      d_output_ptr,
-      num_items);
-  }
+  LastTileTest<DataType, ItemsPerThread, ThreadsInBlock, read_left>(
+    d_input_ptr,
+    d_output_ptr,
+    num_items,
+    inplace);
 
   {
-    using CountingIteratorT =
-      typename thrust::counting_iterator<DataType,
-        thrust::use_default,
-        std::size_t,
-        std::size_t>;
-
     AssertEquals(d_output.front(), d_input.front());
     AssertTrue(CheckResult(d_output.begin() + 1,
                            d_output.begin() + num_items,
-                           CountingIteratorT(DataType{0})));
+                           CountingIteratorT<DataType>(DataType{0})));
     AssertTrue(CheckResult(d_output.begin() + num_items,
                            d_output.end(),
                            d_input.begin() + num_items));
   }
 
+  if (num_items > 0)
+  {
+    LastTileWithPredTest<DataType, ItemsPerThread, ThreadsInBlock>(
+      d_input_ptr +  1,
+      d_output_ptr,
+      num_items - 1,
+      TestSequenceGenerator<DataType>{}(0),
+      inplace);
+
+    AssertTrue(CheckResult(d_output.begin(),
+                           d_output.begin() + num_items - 1,
+                           CountingIteratorT<DataType>(DataType{0})));
+    AssertTrue(CheckResult(d_output.begin() + num_items - 1,
+                           d_output.end() - 1,
+                           d_input.begin() + num_items));
+  }
 
   thrust::tabulate(d_input.begin(),
                    d_input.end(),
                    TestSequenceGenerator<DataType>{});
 
-  if (inplace)
-  {
-    LastTileInplaceTest<DataType, ItemsPerThread, ThreadsInBlock, read_right>(
-      d_input_ptr,
-      d_output_ptr,
-      num_items);
-  }
-  else
-  {
-    LastTileTest<DataType, ItemsPerThread, ThreadsInBlock, read_right>(
-      d_input_ptr,
-      d_output_ptr,
-      num_items);
-  }
+  LastTileTest<DataType, ItemsPerThread, ThreadsInBlock, read_right>(
+    d_input_ptr,
+    d_output_ptr,
+    num_items,
+    inplace);
 
   {
     thrust::device_vector<DataType> reference(num_items);
@@ -622,51 +499,27 @@ void TestMiddleTile(bool inplace,
     TestSequenceGenerator<DataType>{}(d_input.size())
   };
 
-  if (inplace)
-  {
-    MiddleTileInplaceTest<DataType, ItemsPerThread, ThreadsInBlock, read_left>(
-      d_input_ptr,
-      d_output_ptr,
-      left_tile_last_value);
-  }
-  else
-  {
-    MiddleTileTest<DataType, ItemsPerThread, ThreadsInBlock, read_left>(
-      d_input_ptr,
-      d_output_ptr,
-      left_tile_last_value);
-  }
+  MiddleTileTest<DataType, ItemsPerThread, ThreadsInBlock, read_left>(
+    d_input_ptr,
+    d_output_ptr,
+    left_tile_last_value,
+    inplace);
 
   {
-    using CountingIteratorT =
-      typename thrust::counting_iterator<DataType,
-                                         thrust::use_default,
-                                         std::size_t,
-                                         std::size_t>;
-
     AssertTrue(CheckResult(d_output.begin(),
                            d_output.end(),
-                           CountingIteratorT(DataType{0})));
+                           CountingIteratorT<DataType>(DataType{0})));
   }
 
   thrust::tabulate(d_input.begin(),
                    d_input.end(),
                    TestSequenceGenerator<DataType>{});
 
-  if (inplace)
-  {
-    MiddleTileInplaceTest<DataType, ItemsPerThread, ThreadsInBlock, read_right>(
-      d_input_ptr,
-      d_output_ptr,
-      right_tile_first_value);
-  }
-  else
-  {
-    MiddleTileTest<DataType, ItemsPerThread, ThreadsInBlock, read_right>(
-      d_input_ptr,
-      d_output_ptr,
-      right_tile_first_value);
-  }
+  MiddleTileTest<DataType, ItemsPerThread, ThreadsInBlock, read_right>(
+    d_input_ptr,
+    d_output_ptr,
+    right_tile_first_value,
+    inplace);
 
   {
     thrust::device_vector<DataType> reference(d_input.size());
@@ -716,42 +569,21 @@ void TestFullTile(bool inplace,
 
   DataType *d_data_ptr = thrust::raw_pointer_cast(d_data.data());
 
-  if (inplace)
-  {
-    InplaceTest<DataType, ItemsPerThread, ThreadsInBlock, read_left>(
-      d_data_ptr);
-  }
-  else
-  {
-    Test<DataType, ItemsPerThread, ThreadsInBlock, read_left>(d_data_ptr);
-  }
-
+  Test<DataType, ItemsPerThread, ThreadsInBlock, read_left>(d_data_ptr,
+                                                            inplace);
   {
-    using CountingIteratorT =
-    typename thrust::counting_iterator<DataType,
-      thrust::use_default,
-      std::size_t,
-      std::size_t>;
-
     AssertEquals(d_data.front(), TestSequenceGenerator<DataType>{}(0));
     AssertTrue(CheckResult(d_data.begin() + 1,
                            d_data.end(),
-                           CountingIteratorT(DataType{0})));
+                           CountingIteratorT<DataType>(DataType{0})));
   }
 
   thrust::tabulate(d_data.begin(),
                    d_data.end(),
                    TestSequenceGenerator<DataType>{});
 
-  if (inplace)
-  {
-    InplaceTest<DataType, ItemsPerThread, ThreadsInBlock, read_right>(
-      d_data_ptr);
-  }
-  else
-  {
-    Test<DataType, ItemsPerThread, ThreadsInBlock, read_right>(d_data_ptr);
-  }
+  Test<DataType, ItemsPerThread, ThreadsInBlock, read_right>(d_data_ptr,
+                                                             inplace);
 
   {
     thrust::device_vector<DataType> reference(d_data.size());
@@ -780,15 +612,8 @@ void TestCustomType(bool inplace,
   constexpr bool read_left  = true;
   constexpr bool read_right = false;
 
-  if (inplace)
-  {
-    InplaceTest<CustomType, ItemsPerThread, ThreadsInBlock, read_left>(
-      d_data_ptr);
-  }
-  else
-  {
-    Test<CustomType, ItemsPerThread, ThreadsInBlock, read_left>(d_data_ptr);
-  }
+  Test<CustomType, ItemsPerThread, ThreadsInBlock, read_left>(d_data_ptr,
+                                                              inplace);
 
   {
     const std::size_t expected_count = d_data.size();
@@ -800,15 +625,8 @@ void TestCustomType(bool inplace,
 
   thrust::tabulate(d_data.begin(), d_data.end(), IntToCustomType{});
 
-  if (inplace)
-  {
-    InplaceTest<CustomType, ItemsPerThread, ThreadsInBlock, read_right>(
-      d_data_ptr);
-  }
-  else
-  {
-    Test<CustomType, ItemsPerThread, ThreadsInBlock, read_right>(d_data_ptr);
-  }
+  Test<CustomType, ItemsPerThread, ThreadsInBlock, read_right>(d_data_ptr,
+                                                               inplace);
 
   {
     const auto unsigned_minus_one = static_cast<unsigned int>(-1);
@@ -898,3 +716,4 @@ int main(int argc, char** argv)
 
   return 0;
 }
+
diff --git a/test/test_device_adjacent_difference.cu b/test/test_device_adjacent_difference.cu
index ee26bc9ed8..8c1a8a0476 100644
--- a/test/test_device_adjacent_difference.cu
+++ b/test/test_device_adjacent_difference.cu
@@ -34,6 +34,7 @@
 
 #include <thrust/count.h>
 #include <thrust/device_vector.h>
+#include <thrust/fill.h>
 #include <thrust/host_vector.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -646,6 +647,47 @@ void TestAdjacentDifferenceWithBigIndexes()
   TestAdjacentDifferenceWithBigIndexesHelper(33);
 }
 
+struct InvocationsCounter
+{
+  int *m_d_counts{};
+
+  explicit InvocationsCounter(int *d_counts) : m_d_counts(d_counts) {}
+
+  __device__ int operator()(int l, int /* r */) const 
+  {
+    atomicAdd(m_d_counts + l, 1);
+  }
+};
+
+void TestAdjacentDifferenceOpInvocationsNum(int num_items)
+{
+  auto in = thrust::make_counting_iterator(0);
+  auto out = thrust::make_discard_iterator();
+
+  thrust::device_vector<int> num_of_invocations(num_items, 0);
+  InvocationsCounter op{thrust::raw_pointer_cast(num_of_invocations.data())};
+
+  AdjacentDifferenceCopy<READ_LEFT>(in, out, op, num_items);
+  AssertEquals(
+    num_items - 1,
+    thrust::count(num_of_invocations.begin() + 1, num_of_invocations.end(), 1));
+  AssertEquals(0, num_of_invocations[0]);
+
+  thrust::fill_n(num_of_invocations.begin(), num_items, 0);
+  AdjacentDifferenceCopy<READ_RIGHT>(in, out, op, num_items);
+  AssertEquals(
+    num_items - 1,
+    thrust::count(num_of_invocations.begin(), num_of_invocations.end() - 1, 1));
+  AssertEquals(0, num_of_invocations[num_items - 1]);
+}
+
+void TestAdjacentDifferenceOpInvocationsNum()
+{
+  for (int num_items = 1; num_items < 4096; num_items *= 2)
+  {
+    TestAdjacentDifferenceOpInvocationsNum(num_items);
+  }
+}
 
 int main(int argc, char** argv)
 {
@@ -660,6 +702,7 @@ int main(int argc, char** argv)
     TestSize(1ull << power_of_two);
   }
   TestAdjacentDifferenceWithBigIndexes();
+  TestAdjacentDifferenceOpInvocationsNum();
 
   return 0;
 }