Added Stride to Subscript and Slice Kernel #5007

Merged · 25 commits · Aug 24, 2023
Changes shown from 22 commits.
Commits
0152190
add docker-build folder to gitignore, clang-format slice_cpu.h, level…
5had3z Jul 15, 2023
a9f7387
clang-format + black format, removed slice notimpl errors
5had3z Jul 15, 2023
898ba89
add step to slice args, multiply in stride by step, clang-format
5had3z Jul 15, 2023
d2cc8e6
update default pyver and add new runtime images
5had3z Jul 29, 2023
a936d16
fix build script
5had3z Jul 29, 2023
c84adfb
added .devcontainer and dockerfile
5had3z Aug 4, 2023
53fce55
remove deps post-compile, move pre-commit install
5had3z Aug 4, 2023
5eec656
added more devcontainer components, add step arg (can't easily handle…
5had3z Aug 5, 2023
734f5cd
step > 1 works (-ve not), add nsight to devctr
5had3z Aug 5, 2023
170df2f
Add nvjpeg2k and nvcomp to image
5had3z Aug 6, 2023
996d88d
remove dimension inlining and anchor embedding to enable stepping to …
5had3z Aug 19, 2023
aeec1f2
fix default values for step to be 1, clang-format
5had3z Aug 20, 2023
32f8471
add more tests for hi/lo, fix last element logic for reverse stride
5had3z Aug 20, 2023
831c572
added more tests, updated docs
5had3z Aug 20, 2023
51168dc
remove devcontainer and revert docker/build.sh
5had3z Aug 21, 2023
d9c9553
re-added dimension flattening with fixed logic + conditions
5had3z Aug 21, 2023
55cc4a7
re-added slicenopad flatten w/ step + anchor cond
5had3z Aug 21, 2023
b00063c
preapply anchor and step if no padding
5had3z Aug 21, 2023
e30c269
Update dali/kernels/slice/slice_kernel_utils.h
5had3z Aug 22, 2023
3c0f993
Add UnitCubeShape utility.
mzient Aug 22, 2023
948639d
added helper function to TensorShape to create filled tensor
5had3z Aug 22, 2023
fe0ac97
fix assertions, fix missing template param for ndim, removed unnessec…
5had3z Aug 23, 2023
c9ac7a8
Simplify step alongside anchor and shape.
mzient Aug 23, 2023
8e372b2
Add a targeted test for collapsing untouched dims.
mzient Aug 23, 2023
6d0fcc0
Restore formatting and comments.
mzient Aug 23, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -26,3 +26,4 @@ docs/op_autodoc
docs/fn_autodoc
docs/nvidia.ico
.DS_Store
build-docker-*
249 changes: 111 additions & 138 deletions dali/kernels/slice/slice_cpu.h

Large diffs are not rendered by default.
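The CPU kernel's diff is collapsed above. As a rough sketch of what a step parameter adds (illustrative C++ only, not the actual slice_cpu.h code; `StridedCopy` is a hypothetical name), a strided slice offsets each dimension by its anchor and then walks it with its step:

```cpp
// Hypothetical sketch (not DALI's slice_cpu.h): recursive N-D copy that honours a
// per-dimension anchor (start) and step (stride between sampled elements).
#include <cstdint>
#include <iostream>
#include <vector>

void StridedCopy(float *out, const float *in,
                 const std::vector<int64_t> &out_shape,
                 const std::vector<int64_t> &in_strides,
                 const std::vector<int64_t> &anchor,
                 const std::vector<int64_t> &step,
                 int d = 0) {
  if (d == static_cast<int>(out_shape.size())) {
    *out = *in;  // all dimensions resolved: copy one element
    return;
  }
  // Move to this dimension's anchor, then advance by `step` elements per output index.
  const float *in_d = in + anchor[d] * in_strides[d];
  int64_t out_stride = 1;  // dense row-major output stride for dimension d
  for (size_t k = d + 1; k < out_shape.size(); k++) out_stride *= out_shape[k];
  for (int64_t i = 0; i < out_shape[d]; i++) {
    StridedCopy(out + i * out_stride, in_d + i * step[d] * in_strides[d],
                out_shape, in_strides, anchor, step, d + 1);
  }
}

int main() {
  // 1D example: take elements 1, 3, 5 of an 8-element row (anchor=1, step=2, extent 3).
  std::vector<float> in = {0, 1, 2, 3, 4, 5, 6, 7};
  std::vector<float> out(3);
  StridedCopy(out.data(), in.data(), {3}, {1}, {1}, {2});
  for (float v : out) std::cout << v << " ";  // prints: 1 3 5
  std::cout << "\n";
}
```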

159 changes: 85 additions & 74 deletions dali/kernels/slice/slice_gpu.cuh
@@ -32,7 +32,7 @@
#include "dali/kernels/kernel.h"
#include "dali/kernels/slice/slice_kernel_utils.h"

__device__ DALI_FORCEINLINE bool __ldg(const bool* ptr) {
__device__ DALI_FORCEINLINE bool __ldg(const bool *ptr) {
return __ldg(reinterpret_cast<const dali::kernels::type_of_size<sizeof(bool)> *>(ptr));
}

@@ -59,6 +59,7 @@ struct SliceSampleDesc {
TensorShape<Dims> out_shape;
TensorShape<Dims> in_shape;
TensorShape<Dims> anchor;
TensorShape<Dims> step;

const void *__restrict__ fill_values;
int channel_dim;
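The sample descriptor now carries a per-dimension `step` next to the anchor and shapes. For reference, the output extent implied by an (anchor, end, step) triple follows the usual slicing rule; the helper below is an illustrative host-side sketch (`SliceExtent` is a made-up name, not a DALI function):

```cpp
// Sketch: output extent of a slice mirroring Python slicing semantics,
// including a negative (reverse) step.
#include <cassert>
#include <cstdint>
#include <cstdlib>

int64_t SliceExtent(int64_t anchor, int64_t end, int64_t step) {
  assert(step != 0);
  int64_t span = end - anchor;             // signed distance to cover
  if ((span > 0) != (step > 0)) return 0;  // empty slice if directions disagree
  // ceil(|span| / |step|) elements are sampled
  return (std::abs(span) + std::abs(step) - 1) / std::abs(step);
}

int main() {
  assert(SliceExtent(1, 8, 2) == 4);    // indices 1, 3, 5, 7
  assert(SliceExtent(7, -1, -2) == 4);  // indices 7, 5, 3, 1 (reverse stride)
  assert(SliceExtent(0, 5, 3) == 2);    // indices 0, 3
}
```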
@@ -74,22 +75,22 @@ struct SliceBlockDesc {
uint64_t size;
};

template<typename T>
template <typename T>
union PackedBuffer {
using PackedType = uint32_t;
static constexpr size_t kCapacity = sizeof(T) >= sizeof(PackedType) ?
1 : sizeof(PackedType) / sizeof(T);
static constexpr size_t kCapacity =
sizeof(T) >= sizeof(PackedType) ? 1 : sizeof(PackedType) / sizeof(T);

T values[kCapacity];
PackedType raw;

__device__ inline void store(T* mem, size_t count) {
__device__ inline void store(T *mem, size_t count) {
if (kCapacity == 1) {
*mem = *values;
} else if (count == kCapacity && reinterpret_cast<uintptr_t>(mem) % sizeof(PackedType) == 0) {
*reinterpret_cast<PackedType*>(mem) = raw;
*reinterpret_cast<PackedType *>(mem) = raw;
} else {
#pragma unroll
#pragma unroll
for (size_t i = 0; i < count; i++) {
mem[i] = values[i];
}
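PackedBuffer groups up to 4 bytes of output elements so a complete, aligned group can be written with a single 32-bit store. A host-side analogue is sketched below; it uses `memcpy` instead of the kernel's union type-punning, and `PackedBufferSketch` is an illustrative name:

```cpp
// Host analogue of the packed-store idea: one 4-byte write for a full, aligned
// group; element-wise stores for a partial or misaligned tail.
#include <cstdint>
#include <cstdio>
#include <cstring>

template <typename T>
struct PackedBufferSketch {
  using PackedType = uint32_t;
  static constexpr size_t kCapacity =
      sizeof(T) >= sizeof(PackedType) ? 1 : sizeof(PackedType) / sizeof(T);

  T values[kCapacity];

  void store(T *mem, size_t count) const {
    if (kCapacity == 1) {
      *mem = values[0];  // T is at least 4 bytes: a plain store is already wide
    } else if (count == kCapacity &&
               reinterpret_cast<uintptr_t>(mem) % sizeof(PackedType) == 0) {
      std::memcpy(mem, values, sizeof(PackedType));  // full, aligned group: single write
    } else {
      for (size_t i = 0; i < count; i++)  // partial or misaligned tail
        mem[i] = values[i];
    }
  }
};

int main() {
  PackedBufferSketch<uint8_t> buf{{1, 2, 3, 4}};
  uint8_t dst[4] = {};
  buf.store(dst, 4);  // takes the single packed-store path
  std::printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);  // prints 1 2 3 4
}
```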
@@ -99,35 +100,38 @@ union PackedBuffer {

/**
* @brief Simplified algorithm when no padding is necessary
* @remarks `in` already refers to the slice anchor start
* @remark "in" should have "anchor" pre-applied and "stride" should have "step" pre-applied
*/
template <int Dims, typename OutputType, typename InputType>
__device__ void SliceFuncNoPad(OutputType *__restrict__ out, const InputType *__restrict__ in,
const fast_div<uint64_t> *out_strides, const int64_t *in_strides,
uint64_t offset, uint64_t block_end) {
if (Dims > 1 && out_strides[Dims - 1] == static_cast<uint32_t>(in_strides[Dims - 1])) {
const int64_t *anchor, const int64_t *step, uint64_t offset,
uint64_t block_end) {
if (Dims > 1 && step[Dims - 1] == 1 && step[Dims - 2] == 1 && anchor[Dims - 1] == 0 &&
out_strides[Dims - 1] == static_cast<uint32_t>(in_strides[Dims - 1])) {
const int NextDims = Dims > 1 ? Dims - 1 : 1;
SliceFuncNoPad<NextDims, OutputType, InputType>(out, in, out_strides, in_strides, offset,
block_end);
SliceFuncNoPad<NextDims, OutputType, InputType>(out, in, out_strides, in_strides, anchor, step,
offset, block_end);
return;
}

for (; offset < block_end; offset += blockDim.x * PackedBuffer<OutputType>::kCapacity) {
PackedBuffer<OutputType> result;

uint64_t i;
#pragma unroll
for (i = 0; i < PackedBuffer<OutputType>::kCapacity; i++) {
uint64_t i = 0;
#pragma unroll
for (; i < PackedBuffer<OutputType>::kCapacity; i++) {
uint64_t idx = offset + i;
if (idx >= block_end) break;
if (idx >= block_end)
break;
uint64_t in_idx = 0;

#pragma unroll
#pragma unroll
for (int d = 0; d < Dims; d++) {
int i_d = div_mod(idx, idx, out_strides[d]);
in_idx += i_d * in_strides[d];
}
in_idx += idx; // remaining dims have equal strides
in_idx += idx * step[Dims - 1];
result.values[i] = clamp<OutputType>(in[in_idx]);
}
result.store(&out[offset], i);
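The no-pad path decomposes the flat output offset with the output strides, maps each coordinate through the input strides (which already have the step folded in for this path), and scales the remainder by the innermost step. A CPU reference of just that index arithmetic, with the collapsing of equal-stride trailing dimensions omitted (`OutToInIndex` is an illustrative name, and the innermost input stride is assumed to be 1):

```cpp
// Reference for the SliceFuncNoPad index math: flat out index -> flat in index.
#include <cassert>
#include <cstdint>
#include <vector>

int64_t OutToInIndex(int64_t out_idx,
                     const std::vector<int64_t> &out_strides,  // outer dims only
                     const std::vector<int64_t> &in_strides,   // outer dims, step pre-applied
                     int64_t last_step) {                      // innermost step
  int64_t in_idx = 0;
  for (size_t d = 0; d < out_strides.size(); d++) {
    int64_t i_d = out_idx / out_strides[d];  // div_mod() in the kernel
    out_idx     = out_idx % out_strides[d];
    in_idx += i_d * in_strides[d];
  }
  // Remainder indexes the innermost dimension; with a unit element stride it is
  // scaled by the step directly, matching `in_idx += idx * step[Dims - 1]`.
  return in_idx + out_idx * last_step;
}

int main() {
  // Input 4x8, rows taken with step 1, columns starting at 0 with step 2 -> 4x4 output.
  std::vector<int64_t> out_strides = {4};  // output row stride
  std::vector<int64_t> in_strides  = {8};  // input row stride (step 1 already applied)
  assert(OutToInIndex(5, out_strides, in_strides, /*last_step=*/2) == 10);  // (1,1) -> (1,2)
  assert(OutToInIndex(7, out_strides, in_strides, 2) == 14);                // (1,3) -> (1,6)
}
```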
@@ -137,65 +141,64 @@ __device__ void SliceFuncNoPad(OutputType *__restrict__ out, const InputType *__
/**
* @brief General algorithm that allows for padding in any dimension
* @remarks `in` refers to the beginning of the input (not the slice anchor)
* @remarks `AllDims=true` means that Dims refer to the actual number of dimensions,
* meaning we haven't skipped last dimensions that have same input and output strides
*/
template <int Dims, typename OutputType, typename InputType, bool AllDims = true>
__device__ void SliceFunc(OutputType *__restrict__ out, const InputType *__restrict__ in,
const fast_div<uint64_t> *out_strides, const int64_t *in_strides,
const int64_t *out_shape, const int64_t *in_shape, const int64_t *anchor,
const OutputType *__restrict__ fill_values, int channel_dim,
uint64_t offset, uint64_t block_end) {
if (Dims > 1 && anchor[Dims - 1] == 0 && in_shape[Dims - 1] == out_shape[Dims - 1] &&
channel_dim != Dims - 1) {
const int64_t *step, const OutputType *__restrict__ fill_values,
int channel_dim, uint64_t offset, uint64_t block_end) {
if (Dims > 1 && step[Dims - 1] == 1 && step[Dims - 2] == 1 && anchor[Dims - 1] == 0 &&
in_shape[Dims - 1] == out_shape[Dims - 1] && channel_dim != Dims - 1) {
const int NextDims = Dims > 1 ? Dims - 1 : 1;
SliceFunc<NextDims, OutputType, InputType, false>(out, in, out_strides, in_strides, out_shape,
in_shape, anchor, fill_values, channel_dim,
offset, block_end);
in_shape, anchor, step, fill_values,
channel_dim, offset, block_end);
return;
}

constexpr int LastDim = Dims - 1;
int64_t inner_in_anchor = anchor[LastDim];
int64_t inner_in_extent = in_shape[LastDim];
if (!AllDims) { // if we fused dimensions, adjust inner dimension's anchor and extent
if (!AllDims) {
inner_in_anchor = anchor[LastDim] * in_strides[LastDim];
inner_in_extent = Dims > 1 ? in_strides[LastDim - 1] : in_shape[LastDim] * in_strides[LastDim];
}

for (; offset < block_end; offset += blockDim.x * PackedBuffer<OutputType>::kCapacity) {
PackedBuffer<OutputType> result;

uint64_t i;
#ifndef __clang__
#pragma unroll
#endif
for (i = 0; i < PackedBuffer<OutputType>::kCapacity; i++) {
uint64_t i = 0;
#ifndef __clang__
#pragma unroll
#endif
for (; i < PackedBuffer<OutputType>::kCapacity; i++) {
uint64_t idx = offset + i;
if (idx >= block_end) break;
if (idx >= block_end)
break;

// If no dimensions were skipped (AllDims=true) we can avoid division in the last dimension,
// because know the strides are 1 (or we treat them as 1 if we fused dimensions)
int i_c = 0;
int i_d;
bool out_of_bounds = false;
uint64_t in_idx = 0;

#pragma unroll
#pragma unroll
for (int d = 0; d < Dims - 1; d++) {
i_d = div_mod(idx, idx, out_strides[d]);
if (d == channel_dim)
i_c = i_d;
out_of_bounds |= is_out_of_bounds(anchor[d] + i_d, in_shape[d]);
i_d = anchor[d] + i_d * step[d];
out_of_bounds |= is_out_of_bounds(i_d, in_shape[d]);
in_idx += i_d * in_strides[d];
}

constexpr int d = LastDim;
i_d = idx; // out_strides[d] is 1
i_d = idx;
if (AllDims && d == channel_dim)
i_c = i_d;
out_of_bounds |= is_out_of_bounds(inner_in_anchor + i_d, inner_in_extent);
in_idx += i_d; // in_strides[d] is 1
i_d = inner_in_anchor + i_d * step[d];
out_of_bounds |= is_out_of_bounds(i_d, inner_in_extent);
in_idx += i_d;

// Fill values are reused a lot, so let's make sure they are cached (by using __ldg())
OutputType value = __ldg(&fill_values[i_c]);
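In the padded path each source coordinate becomes `anchor[d] + i_d * step[d]`, and if any coordinate falls outside the input extent the fill value is used instead of a load. A host-side sketch of that per-element rule, ignoring the fused-dimension (AllDims=false) adjustment and channel-dependent fill values (`PaddedSliceElement` is an illustrative name):

```cpp
// Per-element rule of the padded slice: out-of-bounds in any dimension -> fill value.
#include <cassert>
#include <cstdint>
#include <vector>

float PaddedSliceElement(const std::vector<float> &in,
                         const std::vector<int64_t> &in_shape,
                         const std::vector<int64_t> &in_strides,
                         const std::vector<int64_t> &out_coord,
                         const std::vector<int64_t> &anchor,
                         const std::vector<int64_t> &step,
                         float fill_value) {
  int64_t in_idx = 0;
  for (size_t d = 0; d < in_shape.size(); d++) {
    int64_t i_d = anchor[d] + out_coord[d] * step[d];  // source coordinate
    if (i_d < 0 || i_d >= in_shape[d])
      return fill_value;  // padding region
    in_idx += i_d * in_strides[d];
  }
  return in[in_idx];
}

int main() {
  // 1D input of 5 elements, anchor -1, step 2, 4 outputs:
  // source indices -1, 1, 3, 5 -> pad, 10, 30, pad.
  std::vector<float> in = {0, 10, 20, 30, 40};
  assert(PaddedSliceElement(in, {5}, {1}, {0}, {-1}, {2}, -1.f) == -1.f);
  assert(PaddedSliceElement(in, {5}, {1}, {1}, {-1}, {2}, -1.f) == 10.f);
  assert(PaddedSliceElement(in, {5}, {1}, {3}, {-1}, {2}, -1.f) == -1.f);
}
```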
@@ -213,20 +216,21 @@ __global__ void SliceKernel(const SliceSampleDesc<Dims> *samples, const SliceBlo
uint64_t offset = blocks[blockIdx.x].offset + threadIdx.x * PackedBuffer<OutputType>::kCapacity;
uint64_t block_end = blocks[blockIdx.x].offset + blocks[blockIdx.x].size;
auto sample = samples[sampleIdx];
auto *out = static_cast<OutputType*>(sample.out);
auto *in = static_cast<const InputType*>(sample.in);
auto *out = static_cast<OutputType *>(sample.out);
auto *in = static_cast<const InputType *>(sample.in);
auto *out_strides = sample.out_strides;
auto *in_strides = sample.in_strides.data();
auto *anchor = sample.anchor.data();
auto *step = sample.step.data();
if (SupportPad && sample.need_pad) {
auto *anchor = sample.anchor.data();
auto *in_shape = sample.in_shape.data();
auto *out_shape = sample.out_shape.data();
auto *fill_values = static_cast<const OutputType*>(sample.fill_values);
auto *fill_values = static_cast<const OutputType *>(sample.fill_values);
auto channel_dim = sample.channel_dim;
SliceFunc<Dims>(out, in, out_strides, in_strides, out_shape, in_shape, anchor, fill_values,
channel_dim, offset, block_end);
SliceFunc<Dims>(out, in, out_strides, in_strides, out_shape, in_shape, anchor, step,
fill_values, channel_dim, offset, block_end);
} else {
SliceFuncNoPad<Dims>(out, in, out_strides, in_strides, offset, block_end);
SliceFuncNoPad<Dims>(out, in, out_strides, in_strides, anchor, step, offset, block_end);
}
}
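Each CUDA block owns a contiguous span of flat output offsets described by its SliceBlockDesc; within it, each thread starts at `offset + threadIdx.x * kCapacity` and advances by `blockDim.x * kCapacity`. A host emulation of that coverage pattern, for intuition only (`CoveredOffsets` is a made-up helper):

```cpp
// Which flat offsets one thread handles inside its block's [offset, offset+size) span.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<uint64_t> CoveredOffsets(uint64_t block_offset, uint64_t block_size,
                                     unsigned block_dim, unsigned tid, size_t capacity) {
  std::vector<uint64_t> offsets;
  uint64_t block_end = block_offset + block_size;
  for (uint64_t offset = block_offset + tid * capacity; offset < block_end;
       offset += static_cast<uint64_t>(block_dim) * capacity) {
    offsets.push_back(offset);  // each iteration writes up to `capacity` elements
  }
  return offsets;
}

int main() {
  // Span [0, 64) with 4 threads and a packed capacity of 4.
  auto t0 = CoveredOffsets(0, 64, /*block_dim=*/4, /*tid=*/0, /*capacity=*/4);
  assert(t0 == (std::vector<uint64_t>{0, 16, 32, 48}));
  auto t1 = CoveredOffsets(0, 64, 4, 1, 4);
  assert(t1 == (std::vector<uint64_t>{4, 20, 36, 52}));
}
```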

@@ -244,15 +248,14 @@ class SliceGPU {
int blocks_per_sm_ = 0;

public:
KernelRequirements Setup(KernelContext &context,
const InListGPU<InputType, Dims> &in,
KernelRequirements Setup(KernelContext &context, const InListGPU<InputType, Dims> &in,
const std::vector<SliceArgs<OutputType, Dims>> &slice_args) {
KernelRequirements req;
ScratchpadEstimator se;
auto num_samples = in.size();

nfill_values_ = 0;
for (const auto& args : slice_args) {
for (const auto &args : slice_args) {
if (nfill_values_ == 0) {
nfill_values_ = args.fill_values.size();
} else {
@@ -265,10 +268,9 @@ class SliceGPU {
default_fill_values_ = true;
nfill_values_ = 1;
} else if (nfill_values_ > 1) {
for (const auto& args : slice_args) {
for (const auto &args : slice_args) {
if (args.channel_dim < 0 || args.channel_dim >= Dims)
throw std::invalid_argument(
"Channel dim must be valid for multi-channel fill values");
throw std::invalid_argument("Channel dim must be valid for multi-channel fill values");
if (nfill_values_ != args.shape[args.channel_dim])
throw std::invalid_argument(
"The number of fill values should match the number of channels in the output slice");
@@ -290,15 +292,18 @@ class SliceGPU {
}

if (blocks_per_sm_ == 0) {
CUDA_CALL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm_,
slice_impl::SliceKernel<OutputType, InputType, Dims, false>, kBlockDim, 0));
CUDA_CALL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&blocks_per_sm_, slice_impl::SliceKernel<OutputType, InputType, Dims, false>, kBlockDim,
0));
}
unsigned max_active_blocks = blocks_per_sm_ * GetSmCount();
uint64_t waves = div_ceil(total_volume + 1, kMaxBlockSize * max_active_blocks);
unsigned block_align = 32 * slice_impl::PackedBuffer<OutputType>::kCapacity;
block_size_ = align_up(div_ceil(total_volume, max_active_blocks * waves), block_align);
if (block_size_ < kMinBlockSize) block_size_ = kMinBlockSize;
if (block_size_ > kMaxBlockSize) block_size_ = kMaxBlockSize;
if (block_size_ < kMinBlockSize)
block_size_ = kMinBlockSize;
if (block_size_ > kMaxBlockSize)
block_size_ = kMaxBlockSize;

block_count_ = 0;
for (auto sample_size : sample_sizes) {
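The block size above is derived from the total output volume, the occupancy-limited number of active blocks, and a wave count, then aligned to 32 packed stores and clamped. A worked example with assumed numbers (the kMin/kMaxBlockSize values, occupancy, and SM count below are placeholders, not the real constants):

```cpp
// Worked example of the block-size heuristic with made-up occupancy figures.
#include <algorithm>
#include <cstdint>
#include <cstdio>

uint64_t div_ceil(uint64_t a, uint64_t b) { return (a + b - 1) / b; }
uint64_t align_up(uint64_t x, uint64_t a) { return div_ceil(x, a) * a; }

int main() {
  const uint64_t kMinBlockSize = 4 << 10;     // assumed values for illustration
  const uint64_t kMaxBlockSize = 64 << 10;
  uint64_t total_volume = 10000000;           // total number of output elements
  unsigned blocks_per_sm = 4, sm_count = 80;  // as if reported by the occupancy query
  unsigned max_active_blocks = blocks_per_sm * sm_count;  // 320
  uint64_t waves = div_ceil(total_volume + 1, kMaxBlockSize * max_active_blocks);  // 1
  unsigned block_align = 32 * 4;              // 32 threads' worth of packed stores (capacity 4)
  uint64_t block_size =
      align_up(div_ceil(total_volume, max_active_blocks * waves), block_align);
  block_size = std::clamp(block_size, kMinBlockSize, kMaxBlockSize);
  std::printf("block_size = %llu\n", static_cast<unsigned long long>(block_size));  // 31360
}
```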
@@ -309,12 +314,11 @@ class SliceGPU {
se.add<mm::memory_kind::device, slice_impl::SliceBlockDesc>(block_count_);
req.scratch_sizes = se.sizes;

req.output_shapes = { GetOutputShapes<Dims>(in.shape, slice_args) };
req.output_shapes = {GetOutputShapes<Dims>(in.shape, slice_args)};
return req;
}

void Run(KernelContext &context,
OutListGPU<OutputType, Dims> &out,
void Run(KernelContext &context, OutListGPU<OutputType, Dims> &out,
const InListGPU<InputType, Dims> &in,
const std::vector<SliceArgs<OutputType, Dims>> &slice_args) {
if (block_count_ == 0) {
@@ -355,20 +359,28 @@ class SliceGPU {
sample_desc.anchor = anchor;
sample_desc.in_shape = in_shape;
sample_desc.out_shape = out_shape;

const InputType *in_data = in.tensor_data(i);
// `sample_desc.in` is expected to point to the slice anchor
for (int d = 0; d < Dims; d++)
in_data += anchor[d] * sample_desc.in_strides[d];
sample_desc.step = slice_args[i].step;

sample_desc.out = out.tensor_data(i);
sample_desc.in = in_data;
sample_sizes[i] = volume(out_shape);

// fill values points to gpu memory
sample_desc.fill_values = fill_values_gpu + i * nfill_values_;
sample_desc.channel_dim = nfill_values_ > 1 ? slice_args[i].channel_dim : -1;
sample_desc.need_pad = NeedPad(Dims, anchor, in_shape, out_shape);

// pre-anchor and step if there is no padding
if (!sample_desc.need_pad) {
const InputType *in_data = in.tensor_data(i);
for (int d = 0; d < Dims; ++d) {
in_data += sample_desc.anchor[d] * sample_desc.in_strides[d];
sample_desc.in_strides[d] *= sample_desc.step[d];
}
sample_desc.in = in_data;
} else {
sample_desc.in = in.tensor_data(i);
}

any_padded_sample |= sample_desc.need_pad;
}
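The pre-application above rests on a simple identity: reading `in[anchor + i*step]` is the same as offsetting the base pointer by `anchor*stride` and scaling the stride by `step`, which is why the kernel can keep indexing with plain strides. A one-dimensional host check of that equivalence (sketch only):

```cpp
// Folding anchor into the base pointer and step into the stride is a no-op
// for the addressed element.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<int> in(32);
  for (int i = 0; i < 32; i++) in[i] = i;

  int64_t anchor = 3, step = 2, stride = 1;       // 1D, unit element stride
  const int *base = in.data() + anchor * stride;  // anchor folded into the pointer
  int64_t eff_stride = step * stride;             // step folded into the stride

  for (int64_t i = 0; i < 10; i++) {
    assert(in[anchor + i * step] == base[i * eff_stride]);  // same element both ways
  }
}
```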

@@ -386,17 +398,16 @@ class SliceGPU {

slice_impl::SliceSampleDesc<Dims> *sample_descs;
slice_impl::SliceBlockDesc *block_descs;
std::tie(sample_descs, block_descs) =
context.scratchpad->ToContiguousGPU(context.gpu.stream,
make_cspan(sample_descs_cpu, num_samples),
make_cspan(block_descs_cpu, block_count_));
std::tie(sample_descs, block_descs) = context.scratchpad->ToContiguousGPU(
context.gpu.stream, make_cspan(sample_descs_cpu, num_samples),
make_cspan(block_descs_cpu, block_count_));
CUDA_CALL(cudaGetLastError());

const auto grid = block_count_;
BOOL_SWITCH(any_padded_sample, NeedPad, (
slice_impl::SliceKernel<OutputType, InputType, Dims, NeedPad>
<<<grid, kBlockDim, 0, context.gpu.stream>>>(sample_descs, block_descs);
)); // NOLINT
BOOL_SWITCH(
any_padded_sample, NeedPad,
(slice_impl::SliceKernel<OutputType, InputType, Dims, NeedPad>
<<<grid, kBlockDim, 0, context.gpu.stream>>>(sample_descs, block_descs);)); // NOLINT
CUDA_CALL(cudaGetLastError());
}
