diff --git a/cub/agent/agent_adjacent_difference.cuh b/cub/agent/agent_adjacent_difference.cuh index fd07731639..b135fbbf53 100644 --- a/cub/agent/agent_adjacent_difference.cuh +++ b/cub/agent/agent_adjacent_difference.cuh @@ -143,16 +143,40 @@ struct AgentDifference { if (IS_FIRST_TILE) { - BlockAdjacentDifferenceT(temp_storage.adjacent_difference) - .SubtractLeft(input, output, difference_op); + if (IS_LAST_TILE) + { + BlockAdjacentDifferenceT(temp_storage.adjacent_difference) + .SubtractLeftPartialTile(input, + output, + difference_op, + num_remaining); + } + else + { + BlockAdjacentDifferenceT(temp_storage.adjacent_difference) + .SubtractLeft(input, output, difference_op); + } } else { - InputT tile_prev_input = MayAlias ? first_tile_previous[tile_idx] - : *(input_it + tile_base - 1); - - BlockAdjacentDifferenceT(temp_storage.adjacent_difference) - .SubtractLeft(input, output, difference_op, tile_prev_input); + InputT tile_prev_input = MayAlias + ? first_tile_previous[tile_idx] + : *(input_it + tile_base - 1); + + if (IS_LAST_TILE) + { + BlockAdjacentDifferenceT(temp_storage.adjacent_difference) + .SubtractLeftPartialTile(input, + output, + difference_op, + num_remaining, + tile_prev_input); + } + else + { + BlockAdjacentDifferenceT(temp_storage.adjacent_difference) + .SubtractLeft(input, output, difference_op, tile_prev_input); + } } } else diff --git a/cub/block/block_adjacent_difference.cuh b/cub/block/block_adjacent_difference.cuh index 5c7be8a729..524ffbebfa 100644 --- a/cub/block/block_adjacent_difference.cuh +++ b/cub/block/block_adjacent_difference.cuh @@ -489,15 +489,16 @@ public: } /** - * @brief Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block. + * @brief Subtracts the left element of each adjacent pair of elements + * partitioned across a CUDA thread block. * * @par * - \rowmajor * - \smemreuse * * @par Snippet - * The code snippet below illustrates how to use @p BlockAdjacentDifference to - * compute the left difference between adjacent elements. + * The code snippet below illustrates how to use @p BlockAdjacentDifference + * to compute the left difference between adjacent elements. * * @par * @code @@ -515,30 +516,152 @@ public: * * __global__ void ExampleKernel(...) * { - * // Specialize BlockAdjacentDifference for a 1D block of - * // 128 threads of type int - * using BlockAdjacentDifferenceT = - * cub::BlockAdjacentDifference; + * // Specialize BlockAdjacentDifference for a 1D block of + * // 128 threads of type int + * using BlockAdjacentDifferenceT = + * cub::BlockAdjacentDifference; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * int valid_items = 9; + * + * // Collectively compute adjacent_difference + * BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile( + * thread_data, + * thread_data, + * CustomDifference(), + * valid_items); * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; + * @endcode + * @par + * Suppose the set of input `thread_data` across the block of threads is + * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`. + * The corresponding output `result` in those threads will be + * `{ [4,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`. * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... + * @param[out] output + * Calling thread's adjacent difference result * - * // Collectively compute adjacent_difference - * BlockAdjacentDifferenceT(temp_storage).SubtractLeft( - * thread_data, - * thread_data, - * CustomDifference()); + * @param[in] input + * Calling thread's input items (may be aliased to \p output) + * + * @param[in] difference_op + * Binary difference operator + * + * @param[in] valid_items + * Number of valid items in thread block + */ + template + __device__ __forceinline__ void + SubtractLeftPartialTile(T (&input)[ITEMS_PER_THREAD], + OutputType (&output)[ITEMS_PER_THREAD], + DifferenceOpT difference_op, + int valid_items) + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items) + { + #pragma unroll + for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) + { + output[item] = difference_op(input[item], input[item - 1]); + } + } + else + { + #pragma unroll + for (int item = ITEMS_PER_THREAD - 1; item > 0; item--) + { + const int idx = linear_tid * ITEMS_PER_THREAD + item; + + if (idx < valid_items) + { + output[item] = difference_op(input[item], input[item - 1]); + } + else + { + output[item] = input[item]; + } + } + } + + if (linear_tid == 0 || valid_items <= linear_tid * ITEMS_PER_THREAD) + { + output[0] = input[0]; + } + else + { + output[0] = difference_op(input[0], + temp_storage.last_items[linear_tid - 1]); + } + } + + /** + * @brief Subtracts the left element of each adjacent pair of elements + * partitioned across a CUDA thread block. + * + * @par + * - \rowmajor + * - \smemreuse + * + * @par Snippet + * The code snippet below illustrates how to use @p BlockAdjacentDifference + * to compute the left difference between adjacent elements. + * + * @par + * @code + * #include + * // or equivalently + * + * struct CustomDifference + * { + * template + * __device__ DataType operator()(DataType &lhs, DataType &rhs) + * { + * return lhs - rhs; + * } + * }; + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockAdjacentDifference for a 1D block of + * // 128 threads of type int + * using BlockAdjacentDifferenceT = + * cub::BlockAdjacentDifference; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * int valid_items = 9; + * int tile_predecessor_item = 4; + * + * // Collectively compute adjacent_difference + * BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile( + * thread_data, + * thread_data, + * CustomDifference(), + * valid_items, + * tile_predecessor_item); * * @endcode * @par * Suppose the set of input `thread_data` across the block of threads is * `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`. * The corresponding output `result` in those threads will be - * `{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`. + * `{ [0,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`. * * @param[out] output * Calling thread's adjacent difference result @@ -551,6 +674,11 @@ public: * * @param[in] valid_items * Number of valid items in thread block + * + * @param[in] tile_predecessor_item + * **[thread0 only]** item which is going to be + * subtracted from the first tile item (input0 from + * thread0). */ template +using CountingIteratorT = + typename thrust::counting_iterator; /** * \brief Generates integer sequence \f$S_n=i(i-1)/2\f$. @@ -133,14 +139,14 @@ struct CustomDifference } }; - template -__global__ void LastTileTestKernel(const DataType *input, - DataType *output, - unsigned int valid_items) + typename ActionT> +__global__ void AdjDiffKernel(const DataType *input, + DataType *output, + ActionT action, + bool in_place) { using BlockAdjacentDifferenceT = cub::BlockAdjacentDifference; @@ -158,252 +164,132 @@ __global__ void LastTileTestKernel(const DataType *input, } __syncthreads(); - if (ReadLeft) - { - BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile( - thread_data, - thread_result, - CustomDifference(), - valid_items); - } - else - { - BlockAdjacentDifferenceT(temp_storage).SubtractRightPartialTile( - thread_data, - thread_result, - CustomDifference(), - valid_items); - } + BlockAdjacentDifferenceT block_adj_diff(temp_storage); - for (unsigned int item = 0; item < ItemsPerThread; item++) + if (in_place) { - output[thread_offset + item] = thread_result[item]; - } -} - + action(thread_data, thread_data, block_adj_diff); -template -__global__ void MiddleTileTestKernel(const DataType *input, - DataType *output, - DataType neighbour_tile_value) -{ - using BlockAdjacentDifferenceT = - cub::BlockAdjacentDifference; - - __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; - - DataType thread_data[ItemsPerThread]; - DataType thread_result[ItemsPerThread]; - - const unsigned int thread_offset = threadIdx.x * ItemsPerThread; - - for (unsigned int item = 0; item < ItemsPerThread; item++) - { - thread_data[item] = input[thread_offset + item]; + for (unsigned int item = 0; item < ItemsPerThread; item++) + { + output[thread_offset + item] = thread_data[item]; + } } - __syncthreads(); - - if (ReadLeft) + else { - BlockAdjacentDifferenceT(temp_storage) - .SubtractLeft(thread_data, - thread_result, - CustomDifference(), - neighbour_tile_value); - } - else - { - BlockAdjacentDifferenceT(temp_storage) - .SubtractRight(thread_data, - thread_result, - CustomDifference(), - neighbour_tile_value); - } + action(thread_data, thread_result, block_adj_diff); - for (unsigned int item = 0; item < ItemsPerThread; item++) - { - output[thread_offset + item] = thread_result[item]; + for (unsigned int item = 0; item < ItemsPerThread; item++) + { + output[thread_offset + item] = thread_result[item]; + } } } - template -__global__ void MiddleTileInplaceTestKernel(const DataType *input, - DataType *output, - DataType neighbour_tile_value) -{ - using BlockAdjacentDifferenceT = - cub::BlockAdjacentDifference; - - __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; - - DataType thread_data[ItemsPerThread]; - - const unsigned int thread_offset = threadIdx.x * ItemsPerThread; - - for (unsigned int item = 0; item < ItemsPerThread; item++) - { - thread_data[item] = input[thread_offset + item]; - } - __syncthreads(); - - if (ReadLeft) - { - BlockAdjacentDifferenceT(temp_storage) - .SubtractLeft(thread_data, - thread_data, - CustomDifference(), - neighbour_tile_value); - } - else - { - BlockAdjacentDifferenceT(temp_storage) - .SubtractRight(thread_data, - thread_data, - CustomDifference(), - neighbour_tile_value); - } - - for (unsigned int item = 0; item < ItemsPerThread; item++) - { - output[thread_offset + item] = thread_data[item]; - } -} - - -template -__global__ void TestKernel(DataType *data) + typename ActionT> +void AdjDiffTest(const DataType *input, + DataType *output, + ActionT action, + bool in_place = false) { - using BlockAdjacentDifferenceT = - cub::BlockAdjacentDifference; - - __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; - - DataType thread_data[ItemsPerThread]; - DataType thread_result[ItemsPerThread]; - - const unsigned int thread_offset = threadIdx.x * ItemsPerThread; - - for (unsigned int item = 0; item < ItemsPerThread; item++) - { - thread_data[item] = data[thread_offset + item]; - } - __syncthreads(); + AdjDiffKernel + <<<1, ThreadsInBlock>>>(input, output, action, in_place); - if (ReadLeft) - { - BlockAdjacentDifferenceT(temp_storage) - .SubtractLeft(thread_data, thread_result, CustomDifference()); - } - else - { - BlockAdjacentDifferenceT(temp_storage) - .SubtractRight(thread_data, thread_result, CustomDifference()); - } - - for (unsigned int item = 0; item < ItemsPerThread; item++) - { - data[thread_offset + item] = thread_result[item]; - } + CubDebugExit(cudaPeekAtLastError()); + CubDebugExit(cudaDeviceSynchronize()); } - -template -__global__ void LastTileTestInplaceKernel(const DataType *input, - DataType *output, - unsigned int valid_items) +template +struct LastTileOpT { - using BlockAdjacentDifferenceT = - cub::BlockAdjacentDifference; - - __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; - - DataType thread_data[ItemsPerThread]; - - const unsigned int thread_offset = threadIdx.x * ItemsPerThread; - - for (unsigned int item = 0; item < ItemsPerThread; item++) - { - thread_data[item] = input[thread_offset + item]; - } - __syncthreads(); + unsigned int m_valid_items{}; - if (ReadLeft) - { - BlockAdjacentDifferenceT(temp_storage) - .SubtractLeftPartialTile(thread_data, - thread_data, - CustomDifference(), - valid_items); - } - else - { - BlockAdjacentDifferenceT(temp_storage) - .SubtractRightPartialTile(thread_data, - thread_data, - CustomDifference(), - valid_items); - } + __host__ LastTileOpT(unsigned int valid_items) + : m_valid_items(valid_items) + {} - for (unsigned int item = 0; item < ItemsPerThread; item++) - { - output[thread_offset + item] = thread_data[item]; + template + __device__ void operator()(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + BlockAdjDiff &block_adj_diff) const + { + if (ReadLeft) + { + block_adj_diff.SubtractLeftPartialTile(input, + output, + CustomDifference(), + m_valid_items); + } + else + { + block_adj_diff.SubtractRightPartialTile(input, + output, + CustomDifference(), + m_valid_items); + } } -} +}; -template -__global__ void TestInplaceKernel(DataType *data) +template +struct MiddleTileOpT { - using BlockAdjacentDifferenceT = - cub::BlockAdjacentDifference; - - __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage; - - DataType thread_data[ItemsPerThread]; + DataType m_neighbour_tile_value; - const unsigned int thread_offset = threadIdx.x * ItemsPerThread; + __host__ MiddleTileOpT(DataType neighbour_tile_value) + : m_neighbour_tile_value(neighbour_tile_value) + {} - for (unsigned int item = 0; item < ItemsPerThread; item++) - { - thread_data[item] = data[thread_offset + item]; + template + __device__ void operator()(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + BlockAdjDiff &block_adj_diff) const + { + if (ReadLeft) + { + block_adj_diff.SubtractLeft(input, + output, + CustomDifference(), + m_neighbour_tile_value); + } + else + { + block_adj_diff.SubtractRight(input, + output, + CustomDifference(), + m_neighbour_tile_value); + } } - __syncthreads(); +}; - if (ReadLeft) - { - BlockAdjacentDifferenceT(temp_storage) - .SubtractLeft(thread_data, - thread_data, - CustomDifference()); - } - else - { - BlockAdjacentDifferenceT(temp_storage) - .SubtractRight(thread_data, - thread_data, - CustomDifference()); +template +struct BaseOpT +{ + template + __device__ void operator()(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + BlockAdjDiff &block_adj_diff) const + { + if (ReadLeft) + { + block_adj_diff.SubtractLeft(input, + output, + CustomDifference()); + } + else + { + block_adj_diff.SubtractRight(input, + output, + CustomDifference()); + } } +}; - for (unsigned int item = 0; item < ItemsPerThread; item++) - { - data[thread_offset + item] = thread_data[item]; - } -} template void LastTileTest(const DataType *input, DataType *output, - unsigned int valid_items) + unsigned int valid_items, + bool in_place) { - LastTileTestKernel - <<<1, ThreadsInBlock>>>(input, output, valid_items); - - CubDebugExit(cudaPeekAtLastError()); - CubDebugExit(cudaDeviceSynchronize()); + AdjDiffTest(input, + output, + LastTileOpT{ + valid_items}, + in_place); } - -template -void Test(DataType *data) +template +struct LastTileWithPredOpT { - TestKernel - <<<1, ThreadsInBlock>>>(data); + unsigned int m_valid_items; + DataType m_neighbour_tile_value; - CubDebugExit(cudaPeekAtLastError()); - CubDebugExit(cudaDeviceSynchronize()); -} + __host__ LastTileWithPredOpT( + unsigned int valid_items, + DataType neighbour_tile_value) + : m_valid_items(valid_items) + , m_neighbour_tile_value(neighbour_tile_value) + { + } + template + __device__ void operator()(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + BlockAdjDiff &block_adj_diff) const + { + block_adj_diff.SubtractLeftPartialTile(input, + output, + CustomDifference(), + m_valid_items, + m_neighbour_tile_value); + } +}; template -void MiddleTileTest(const DataType *input, - DataType *output, - DataType neighbour_tile_value) + unsigned int ThreadsInBlock> +void LastTileWithPredTest(const DataType *input, + DataType *output, + unsigned int valid_items, + DataType neighbour_tile_value, + bool in_place) { - MiddleTileTestKernel - <<<1, ThreadsInBlock>>>(input, output, neighbour_tile_value); - - CubDebugExit(cudaPeekAtLastError()); - CubDebugExit(cudaDeviceSynchronize()); + AdjDiffTest( + input, + output, + LastTileWithPredOpT{valid_items, neighbour_tile_value}, + in_place); } - template -void LastTileInplaceTest(const DataType *input, - DataType *output, - unsigned int valid_items) +void Test(DataType *data, + bool in_place) { - LastTileTestInplaceKernel - <<<1, ThreadsInBlock>>>(input, output, valid_items); - - CubDebugExit(cudaPeekAtLastError()); - CubDebugExit(cudaDeviceSynchronize()); + AdjDiffTest( + data, + data, + BaseOpT{}, + in_place); } @@ -471,29 +369,16 @@ template -void InplaceTest(DataType *data) -{ - TestInplaceKernel - <<<1, ThreadsInBlock>>>(data); - - CubDebugExit(cudaPeekAtLastError()); - CubDebugExit(cudaDeviceSynchronize()); -} - - -template -void MiddleTileInplaceTest(const DataType *input, - DataType *output, - DataType neighbour_tile_value) +void MiddleTileTest(const DataType *input, + DataType *output, + DataType neighbour_tile_value, + bool in_place) { - MiddleTileInplaceTestKernel - <<<1, ThreadsInBlock>>>(input, output, neighbour_tile_value); - - CubDebugExit(cudaPeekAtLastError()); - CubDebugExit(cudaDeviceSynchronize()); + AdjDiffTest( + input, + output, + MiddleTileOpT{neighbour_tile_value}, + in_place); } @@ -532,56 +417,48 @@ void TestLastTile(bool inplace, DataType *d_input_ptr = thrust::raw_pointer_cast(d_input.data()); DataType *d_output_ptr = thrust::raw_pointer_cast(d_output.data()); - if (inplace) - { - LastTileInplaceTest( - d_input_ptr, - d_output_ptr, - num_items); - } - else - { - LastTileTest( - d_input_ptr, - d_output_ptr, - num_items); - } + LastTileTest( + d_input_ptr, + d_output_ptr, + num_items, + inplace); { - using CountingIteratorT = - typename thrust::counting_iterator; - AssertEquals(d_output.front(), d_input.front()); AssertTrue(CheckResult(d_output.begin() + 1, d_output.begin() + num_items, - CountingIteratorT(DataType{0}))); + CountingIteratorT(DataType{0}))); AssertTrue(CheckResult(d_output.begin() + num_items, d_output.end(), d_input.begin() + num_items)); } + if (num_items > 0) + { + LastTileWithPredTest( + d_input_ptr + 1, + d_output_ptr, + num_items - 1, + TestSequenceGenerator{}(0), + inplace); + + AssertTrue(CheckResult(d_output.begin(), + d_output.begin() + num_items - 1, + CountingIteratorT(DataType{0}))); + AssertTrue(CheckResult(d_output.begin() + num_items - 1, + d_output.end() - 1, + d_input.begin() + num_items)); + } thrust::tabulate(d_input.begin(), d_input.end(), TestSequenceGenerator{}); - if (inplace) - { - LastTileInplaceTest( - d_input_ptr, - d_output_ptr, - num_items); - } - else - { - LastTileTest( - d_input_ptr, - d_output_ptr, - num_items); - } + LastTileTest( + d_input_ptr, + d_output_ptr, + num_items, + inplace); { thrust::device_vector reference(num_items); @@ -622,51 +499,27 @@ void TestMiddleTile(bool inplace, TestSequenceGenerator{}(d_input.size()) }; - if (inplace) - { - MiddleTileInplaceTest( - d_input_ptr, - d_output_ptr, - left_tile_last_value); - } - else - { - MiddleTileTest( - d_input_ptr, - d_output_ptr, - left_tile_last_value); - } + MiddleTileTest( + d_input_ptr, + d_output_ptr, + left_tile_last_value, + inplace); { - using CountingIteratorT = - typename thrust::counting_iterator; - AssertTrue(CheckResult(d_output.begin(), d_output.end(), - CountingIteratorT(DataType{0}))); + CountingIteratorT(DataType{0}))); } thrust::tabulate(d_input.begin(), d_input.end(), TestSequenceGenerator{}); - if (inplace) - { - MiddleTileInplaceTest( - d_input_ptr, - d_output_ptr, - right_tile_first_value); - } - else - { - MiddleTileTest( - d_input_ptr, - d_output_ptr, - right_tile_first_value); - } + MiddleTileTest( + d_input_ptr, + d_output_ptr, + right_tile_first_value, + inplace); { thrust::device_vector reference(d_input.size()); @@ -716,42 +569,21 @@ void TestFullTile(bool inplace, DataType *d_data_ptr = thrust::raw_pointer_cast(d_data.data()); - if (inplace) - { - InplaceTest( - d_data_ptr); - } - else - { - Test(d_data_ptr); - } - + Test(d_data_ptr, + inplace); { - using CountingIteratorT = - typename thrust::counting_iterator; - AssertEquals(d_data.front(), TestSequenceGenerator{}(0)); AssertTrue(CheckResult(d_data.begin() + 1, d_data.end(), - CountingIteratorT(DataType{0}))); + CountingIteratorT(DataType{0}))); } thrust::tabulate(d_data.begin(), d_data.end(), TestSequenceGenerator{}); - if (inplace) - { - InplaceTest( - d_data_ptr); - } - else - { - Test(d_data_ptr); - } + Test(d_data_ptr, + inplace); { thrust::device_vector reference(d_data.size()); @@ -780,15 +612,8 @@ void TestCustomType(bool inplace, constexpr bool read_left = true; constexpr bool read_right = false; - if (inplace) - { - InplaceTest( - d_data_ptr); - } - else - { - Test(d_data_ptr); - } + Test(d_data_ptr, + inplace); { const std::size_t expected_count = d_data.size(); @@ -800,15 +625,8 @@ void TestCustomType(bool inplace, thrust::tabulate(d_data.begin(), d_data.end(), IntToCustomType{}); - if (inplace) - { - InplaceTest( - d_data_ptr); - } - else - { - Test(d_data_ptr); - } + Test(d_data_ptr, + inplace); { const auto unsigned_minus_one = static_cast(-1); @@ -898,3 +716,4 @@ int main(int argc, char** argv) return 0; } + diff --git a/test/test_device_adjacent_difference.cu b/test/test_device_adjacent_difference.cu index ee26bc9ed8..8c1a8a0476 100644 --- a/test/test_device_adjacent_difference.cu +++ b/test/test_device_adjacent_difference.cu @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -646,6 +647,47 @@ void TestAdjacentDifferenceWithBigIndexes() TestAdjacentDifferenceWithBigIndexesHelper(33); } +struct InvocationsCounter +{ + int *m_d_counts{}; + + explicit InvocationsCounter(int *d_counts) : m_d_counts(d_counts) {} + + __device__ int operator()(int l, int /* r */) const + { + atomicAdd(m_d_counts + l, 1); + } +}; + +void TestAdjacentDifferenceOpInvocationsNum(int num_items) +{ + auto in = thrust::make_counting_iterator(0); + auto out = thrust::make_discard_iterator(); + + thrust::device_vector num_of_invocations(num_items, 0); + InvocationsCounter op{thrust::raw_pointer_cast(num_of_invocations.data())}; + + AdjacentDifferenceCopy(in, out, op, num_items); + AssertEquals( + num_items - 1, + thrust::count(num_of_invocations.begin() + 1, num_of_invocations.end(), 1)); + AssertEquals(0, num_of_invocations[0]); + + thrust::fill_n(num_of_invocations.begin(), num_items, 0); + AdjacentDifferenceCopy(in, out, op, num_items); + AssertEquals( + num_items - 1, + thrust::count(num_of_invocations.begin(), num_of_invocations.end() - 1, 1)); + AssertEquals(0, num_of_invocations[num_items - 1]); +} + +void TestAdjacentDifferenceOpInvocationsNum() +{ + for (int num_items = 1; num_items < 4096; num_items *= 2) + { + TestAdjacentDifferenceOpInvocationsNum(num_items); + } +} int main(int argc, char** argv) { @@ -660,6 +702,7 @@ int main(int argc, char** argv) TestSize(1ull << power_of_two); } TestAdjacentDifferenceWithBigIndexes(); + TestAdjacentDifferenceOpInvocationsNum(); return 0; }