Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Fix temp storage allocation in adjacent difference copy #508

Merged
merged 2 commits into from
Jun 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 31 additions & 7 deletions cub/agent/agent_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -143,16 +143,40 @@ struct AgentDifference
{
if (IS_FIRST_TILE)
{
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractLeft(input, output, difference_op);
if (IS_LAST_TILE)
{
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractLeftPartialTile(input,
output,
difference_op,
num_remaining);
}
else
{
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractLeft(input, output, difference_op);
}
}
else
{
InputT tile_prev_input = MayAlias ? first_tile_previous[tile_idx]
: *(input_it + tile_base - 1);

BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractLeft(input, output, difference_op, tile_prev_input);
InputT tile_prev_input = MayAlias
? first_tile_previous[tile_idx]
: *(input_it + tile_base - 1);

if (IS_LAST_TILE)
{
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractLeftPartialTile(input,
output,
difference_op,
num_remaining,
tile_prev_input);
}
else
{
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractLeft(input, output, difference_op, tile_prev_input);
}
}
}
else
Expand Down
174 changes: 154 additions & 20 deletions cub/block/block_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -489,15 +489,16 @@ public:
}

/**
* @brief Subtracts the left element of each adjacent pair of elements partitioned across a CUDA thread block.
* @brief Subtracts the left element of each adjacent pair of elements
* partitioned across a CUDA thread block.
*
* @par
* - \rowmajor
* - \smemreuse
*
* @par Snippet
* The code snippet below illustrates how to use @p BlockAdjacentDifference to
* compute the left difference between adjacent elements.
* The code snippet below illustrates how to use @p BlockAdjacentDifference
* to compute the left difference between adjacent elements.
*
* @par
* @code
Expand All @@ -515,30 +516,152 @@ public:
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockAdjacentDifference for a 1D block of
* // 128 threads of type int
* using BlockAdjacentDifferenceT =
* cub::BlockAdjacentDifference<int, 128>;
* // Specialize BlockAdjacentDifference for a 1D block of
* // 128 threads of type int
* using BlockAdjacentDifferenceT =
* cub::BlockAdjacentDifference<int, 128>;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
* int valid_items = 9;
*
* // Collectively compute adjacent_difference
* BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
* thread_data,
* thread_data,
* CustomDifference(),
* valid_items);
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
* @endcode
* @par
* Suppose the set of input `thread_data` across the block of threads is
* `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
* The corresponding output `result` in those threads will be
* `{ [4,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
* @param[out] output
* Calling thread's adjacent difference result
*
* // Collectively compute adjacent_difference
* BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
* thread_data,
* thread_data,
* CustomDifference());
* @param[in] input
* Calling thread's input items (may be aliased to \p output)
*
* @param[in] difference_op
* Binary difference operator
*
* @param[in] valid_items
* Number of valid items in thread block
*/
template <int ITEMS_PER_THREAD,
typename OutputType,
typename DifferenceOpT>
__device__ __forceinline__ void
SubtractLeftPartialTile(T (&input)[ITEMS_PER_THREAD],
OutputType (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op,
int valid_items)
{
// Share last item
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];

CTA_SYNC();

if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items)
{
#pragma unroll
for (int item = ITEMS_PER_THREAD - 1; item > 0; item--)
{
output[item] = difference_op(input[item], input[item - 1]);
}
}
else
{
#pragma unroll
for (int item = ITEMS_PER_THREAD - 1; item > 0; item--)
{
const int idx = linear_tid * ITEMS_PER_THREAD + item;

if (idx < valid_items)
{
output[item] = difference_op(input[item], input[item - 1]);
}
else
{
output[item] = input[item];
}
}
}

if (linear_tid == 0 || valid_items <= linear_tid * ITEMS_PER_THREAD)
{
output[0] = input[0];
}
else
{
output[0] = difference_op(input[0],
temp_storage.last_items[linear_tid - 1]);
}
}

/**
* @brief Subtracts the left element of each adjacent pair of elements
* partitioned across a CUDA thread block.
*
* @par
* - \rowmajor
* - \smemreuse
*
* @par Snippet
* The code snippet below illustrates how to use @p BlockAdjacentDifference
* to compute the left difference between adjacent elements.
*
* @par
* @code
* #include <cub/cub.cuh>
* // or equivalently <cub/block/block_adjacent_difference.cuh>
*
* struct CustomDifference
* {
* template <typename DataType>
* __device__ DataType operator()(DataType &lhs, DataType &rhs)
* {
* return lhs - rhs;
* }
* };
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockAdjacentDifference for a 1D block of
* // 128 threads of type int
* using BlockAdjacentDifferenceT =
* cub::BlockAdjacentDifference<int, 128>;
*
* // Allocate shared memory for BlockDiscontinuity
* __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
* int valid_items = 9;
* int tile_predecessor_item = 4;
*
* // Collectively compute adjacent_difference
* BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
* thread_data,
* thread_data,
* CustomDifference(),
* valid_items,
* tile_predecessor_item);
*
* @endcode
* @par
* Suppose the set of input `thread_data` across the block of threads is
* `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
* The corresponding output `result` in those threads will be
* `{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`.
* `{ [0,-2,-1,0], [0,0,0,0], [1,3,3,3], [3,4,1,4], ... }`.
*
* @param[out] output
* Calling thread's adjacent difference result
Expand All @@ -551,6 +674,11 @@ public:
*
* @param[in] valid_items
* Number of valid items in thread block
*
* @param[in] tile_predecessor_item
* **[<em>thread</em><sub>0</sub> only]** item which is going to be
* subtracted from the first tile item (<tt>input<sub>0</sub></tt> from
* <em>thread</em><sub>0</sub>).
*/
template <int ITEMS_PER_THREAD,
typename OutputType,
Expand All @@ -559,7 +687,8 @@ public:
SubtractLeftPartialTile(T (&input)[ITEMS_PER_THREAD],
OutputType (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op,
int valid_items)
int valid_items,
T tile_predecessor_item)
{
// Share last item
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
Expand Down Expand Up @@ -592,10 +721,15 @@ public:
}
}

if (linear_tid == 0 || valid_items <= linear_tid * ITEMS_PER_THREAD)
if (valid_items <= linear_tid * ITEMS_PER_THREAD)
{
output[0] = input[0];
}
else if (linear_tid == 0)
{
output[0] = difference_op(input[0],
tile_predecessor_item);
}
else
{
output[0] = difference_op(input[0],
Expand Down
35 changes: 16 additions & 19 deletions cub/device/dispatch/dispatch_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -199,30 +199,27 @@ struct DispatchAdjacentDifference : public SelectedPolicy
sizeof(InputT);

void *allocations[1] = {nullptr};
std::size_t allocation_sizes[1] = {first_tile_previous_size};
std::size_t allocation_sizes[1] = {MayAlias * first_tile_previous_size};

if (MayAlias)
if (CubDebug(error = AliasTemporaries(d_temp_storage,
temp_storage_bytes,
allocations,
allocation_sizes)))
{
if (CubDebug(error = AliasTemporaries(d_temp_storage,
temp_storage_bytes,
allocations,
allocation_sizes)))
{
break;
}

if (d_temp_storage == nullptr)
{
// Return if the caller is simply requesting the size of the storage
// allocation
break;
}

if (temp_storage_bytes == 0)
{
temp_storage_bytes = 1;
}
if (d_temp_storage == nullptr)
{
// Return if the caller is simply requesting the size of the storage
// allocation

break;
if (temp_storage_bytes == 0)
{
temp_storage_bytes = 1;
}

break;
}

if (num_items == OffsetT{})
Expand Down
Loading