Skip to content

Commit

Permalink
Add GaussianBlur Gpu Operator (#2314)
Browse files Browse the repository at this point in the history
Plug SeparableConvolutionGpu that uses Cutlass based
ConvolutionGpu kernel into GaussianBlur Operator.

Signed-off-by: Krzysztof Lecki <klecki@nvidia.com>
  • Loading branch information
klecki committed Oct 2, 2020
1 parent f3b4930 commit ec53d42
Show file tree
Hide file tree
Showing 20 changed files with 689 additions and 115 deletions.
2 changes: 2 additions & 0 deletions dali/operators/image/convolution/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

add_subdirectory(gaussian_blur_gpu)

# Get all the source files and dump test files
collect_headers(DALI_INST_HDRS PARENT_SCOPE)
collect_sources(DALI_OPERATOR_SRCS PARENT_SCOPE)
Expand Down
82 changes: 9 additions & 73 deletions dali/operators/image/convolution/gaussian_blur.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,13 @@
#include "dali/kernels/imgproc/convolution/separable_convolution_cpu.h"
#include "dali/kernels/kernel_manager.h"
#include "dali/operators/image/convolution/gaussian_blur.h"
#include "dali/operators/image/convolution/gaussian_blur_params.h"
#include "dali/pipeline/data/views.h"
#include "dali/pipeline/operator/common.h"

namespace dali {

using namespace gaussian_blur; // NOLINT

constexpr static const char* kSigmaArgName = "sigma";
constexpr static const char* kWindowSizeArgName = "window_size";

DALI_SCHEMA(GaussianBlur)
.DocStr(R"code(Applies a Gaussian Blur to the input.
Expand Down Expand Up @@ -75,79 +71,16 @@ The same input can be provided as per-sample tensors.
Supported type: `FLOAT`. If not set, the input type is used.)code",
DALI_NO_TYPE);

/**
 * @brief Fill the result span with the argument which can be provided as:
 * * ArgumentInput - {result.size()}-shaped Tensor
 * * ArgumentInput - {1}-shaped Tensor, the value will be replicated `result.size()` times
 * * Vector input - single "repeated argument" of length {result.size()} or {1}
 * * scalar argument - it will be replicated `result.size()` times
 *
 * @param result      destination span; its size is the expected argument length
 * @param name        name of the argument to look up
 * @param sample_idx  index of the sample whose tensor argument is read
 * @param spec        operator spec queried for the tensor / regular argument
 * @param ws          workspace that provides the argument inputs
 *
 * A tensor argument is checked with DALI_ENFORCE to be 1D and to have an extent of either 1
 * or `result.size()`.
 *
 * TODO(klecki): we may want to make this a generic utility and propagate the span-approach to
 * the rest of the related argument getters
 */
template <typename T>
void GetGeneralizedArg(span<T> result, const std::string& name, int sample_idx,
                       const OpSpec& spec, const ArgumentWorkspace& ws) {
  const int argument_length = result.size();
  if (spec.HasTensorArgument(name)) {
    const auto& tv = ws.ArgumentInput(name);
    const auto& tensor = tv[sample_idx];
    const auto& shape = tensor.shape();
    DALI_ENFORCE(shape.sample_dim() == 1,
                 make_string("Argument ", name, " for sample ", sample_idx,
                             " is expected to be 1D, got: ", shape.sample_dim(), "."));
    const auto extent = shape[0];
    DALI_ENFORCE(extent == 1 || extent == argument_length,
                 make_string("Argument ", name, " for sample ", sample_idx,
                             " is expected to have shape equal to {1} or {", argument_length,
                             "}, got: ", shape, "."));
    // The data pointer does not change between iterations, so it is fetched once.
    const auto* src = tensor.data<T>();
    if (extent == 1) {
      // Single value provided - replicate it for every requested element.
      for (int i = 0; i < argument_length; i++) {
        result[i] = src[0];
      }
    } else {
      memcpy(result.data(), src, sizeof(T) * argument_length);
    }
    return;
  }
  std::vector<T> tmp;
  // we already handled the argument input, this handles spec-related arguments only
  GetSingleOrRepeatedArg(spec, tmp, name, argument_length);
  memcpy(result.data(), tmp.data(), sizeof(T) * argument_length);
}

/**
 * @brief Read and validate the `sigma` and `window_size` arguments for one sample.
 *
 * Both arguments are fetched with GetGeneralizedArg, one value per axis. For every axis it is
 * enforced that the two values are not both 0 and that neither of them is negative.
 * A `window_size` of 0 is then replaced by SigmaToDiameter(sigma); otherwise a `sigma` of 0
 * is replaced by DiameterToSigma(window_size).
 *
 * @param sample  index of the sample the parameters are read for
 * @param spec    operator spec holding the arguments
 * @param ws      workspace providing the argument inputs
 * @return parameters with `sigmas` and `window_sizes` filled for all `axes` axes
 */
template <int axes>
GaussianBlurParams<axes> GetSampleParams(int sample, const OpSpec& spec,
                                         const ArgumentWorkspace& ws) {
  GaussianBlurParams<axes> params;
  GetGeneralizedArg<float>(make_span(params.sigmas), kSigmaArgName, sample, spec, ws);
  GetGeneralizedArg<int>(make_span(params.window_sizes), kWindowSizeArgName, sample, spec, ws);
  for (int axis = 0; axis < axes; axis++) {
    auto& sigma = params.sigmas[axis];
    auto& window_size = params.window_sizes[axis];
    DALI_ENFORCE(
        sigma != 0 || window_size != 0,
        make_string("`sigma` and `window_size` shouldn't be 0 at the same time for sample: ",
                    sample, ", axis: ", axis, "."));
    DALI_ENFORCE(sigma >= 0,
                 make_string("`sigma` must have non-negative values, got ", sigma,
                             " for sample: ", sample, ", axis: ", axis, "."));
    DALI_ENFORCE(window_size >= 0,
                 make_string("`window_size` must have non-negative values, got ", window_size,
                             " for sample: ", sample, ", axis: ", axis, "."));
    // A value of 0 means "not specified" - derive it from the other parameter.
    if (window_size == 0) {
      window_size = SigmaToDiameter(sigma);
    } else if (sigma == 0.f) {
      sigma = DiameterToSigma(window_size);
    }
  }
  return params;
}

namespace gaussian_blur {
DimDesc ParseAndValidateDim(int ndim, TensorLayout layout) {
static constexpr int kMaxDim = 3;
if (layout.empty()) {
// assuming plain data with no channels
DALI_ENFORCE(ndim <= kMaxDim,
make_string("Input data with empty layout cannot have more than ", kMaxDim,
" dimensions, got input with ", ndim, " dimensions."));
return {0, ndim, false, false};
return {0, ndim, ndim, false, false};
}
// not-empty layout
int axes_start = 0;
Expand Down Expand Up @@ -177,7 +110,7 @@ DimDesc ParseAndValidateDim(int ndim, TensorLayout layout) {
DALI_ENFORCE(axes_count <= kMaxDim,
make_string("Too many dimensions, found: ", axes_count,
" data axes, maximum supported is: ", kMaxDim, "."));
return {axes_start, axes_count, has_channels, axes_start != 0};
return {axes_start, axes_count, axes_count + (axes_start != 0), has_channels, axes_start != 0};
}

// axes here is dimension of element processed by kernel - in case of sequence it's 1 less than the
Expand Down Expand Up @@ -206,7 +139,7 @@ class GaussianBlurOpCpu : public OpImplBase<CPUBackend> {
kmgr_.template Resize<Kernel>(nthreads, nsamples);

for (int i = 0; i < nsamples; i++) {
params_[i] = GetSampleParams<axes>(i, spec_, ws);
params_[i] = ObtainSampleParams<axes>(i, spec_, ws);
windows_[i].PrepareWindows(params_[i]);
// We take only last `ndim` significant dimensions to handle sequences as well
auto elem_shape = input[i].shape().template last<ndim>();
Expand Down Expand Up @@ -266,6 +199,9 @@ class GaussianBlurOpCpu : public OpImplBase<CPUBackend> {
std::vector<GaussianWindows<axes>> windows_;
};


} // namespace gaussian_blur

template <>
bool GaussianBlur<CPUBackend>::SetupImpl(std::vector<OutputDesc>& output_desc,
const workspace_t<CPUBackend>& ws) {
Expand All @@ -277,7 +213,7 @@ bool GaussianBlur<CPUBackend>::SetupImpl(std::vector<OutputDesc>& output_desc,
"Output data type must be same as input, FLOAT or skipped (defaults to input type)");

// clang-format off
TYPE_SWITCH(input.type().id(), type2id, In, GAUSSIAN_BLUR_SUPPORTED_TYPES, (
TYPE_SWITCH(input.type().id(), type2id, In, GAUSSIAN_BLUR_CPU_SUPPORTED_TYPES, (
VALUE_SWITCH(dim_desc.usable_axes_count, AXES, GAUSSIAN_BLUR_SUPPORTED_AXES, (
VALUE_SWITCH(static_cast<int>(dim_desc.has_channels), HAS_CHANNELS, (0, 1), (
constexpr bool has_ch = HAS_CHANNELS;
Expand All @@ -286,7 +222,7 @@ bool GaussianBlur<CPUBackend>::SetupImpl(std::vector<OutputDesc>& output_desc,
} else {
impl_ = std::make_unique<GaussianBlurOpCpu<float, In, AXES, has_ch>>(spec_, dim_desc);
}
), (DALI_FAIL("Got value different than {0, 1} when converting bool to int."))); // NOLINT
), ()); // NOLINT, no other possible conversion
), DALI_FAIL("Axis count out of supported range.")); // NOLINT
), DALI_FAIL(make_string("Unsupported data type: ", input.type().id()))); // NOLINT
// clang-format on
Expand Down
103 changes: 103 additions & 0 deletions dali/operators/image/convolution/gaussian_blur.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "dali/core/static_switch.h"
#include "dali/kernels/imgproc/convolution/separable_convolution_gpu.h"
#include "dali/kernels/kernel_manager.h"
#include "dali/operators/image/convolution/gaussian_blur.h"
#include "dali/operators/image/convolution/gaussian_blur_params.h"
#include "dali/pipeline/data/views.h"
#include "dali/pipeline/operator/common.h"
#include "dali/operators/image/convolution/gaussian_blur_gpu.h"

namespace dali {

using namespace gaussian_blur; // NOLINT

namespace gaussian_blur {

// Functions below are explicitly instantiated in separate compilation units to allow
// for parallel compilation of the underlying kernels.
// The `extern template` declarations keep this file from instantiating them implicitly.
// For every input type there is a same-type output variant and a float output variant.
// NOTE(review): float16 variants are declared here, although float16 is not listed in
// GAUSSIAN_BLUR_GPU_SUPPORTED_TYPES - confirm that the matching instantiations exist.

extern template op_impl_uptr GetGaussianBlurGpuImpl<uint8_t, uint8_t>(const OpSpec&, DimDesc);
extern template op_impl_uptr GetGaussianBlurGpuImpl<float, uint8_t>(const OpSpec&, DimDesc);

extern template op_impl_uptr GetGaussianBlurGpuImpl<int8_t, int8_t>(const OpSpec&, DimDesc);
extern template op_impl_uptr GetGaussianBlurGpuImpl<float, int8_t>(const OpSpec&, DimDesc);

extern template op_impl_uptr GetGaussianBlurGpuImpl<uint16_t, uint16_t>(const OpSpec&, DimDesc);
extern template op_impl_uptr GetGaussianBlurGpuImpl<float, uint16_t>(const OpSpec&, DimDesc);

extern template op_impl_uptr GetGaussianBlurGpuImpl<int16_t, int16_t>(const OpSpec&, DimDesc);
extern template op_impl_uptr GetGaussianBlurGpuImpl<float, int16_t>(const OpSpec&, DimDesc);

extern template op_impl_uptr GetGaussianBlurGpuImpl<uint32_t, uint32_t>(const OpSpec&, DimDesc);
extern template op_impl_uptr GetGaussianBlurGpuImpl<float, uint32_t>(const OpSpec&, DimDesc);

extern template op_impl_uptr GetGaussianBlurGpuImpl<int32_t, int32_t>(const OpSpec&, DimDesc);
extern template op_impl_uptr GetGaussianBlurGpuImpl<float, int32_t>(const OpSpec&, DimDesc);

extern template op_impl_uptr GetGaussianBlurGpuImpl<uint64_t, uint64_t>(const OpSpec&, DimDesc);
extern template op_impl_uptr GetGaussianBlurGpuImpl<float, uint64_t>(const OpSpec&, DimDesc);

extern template op_impl_uptr GetGaussianBlurGpuImpl<int64_t, int64_t>(const OpSpec&, DimDesc);
extern template op_impl_uptr GetGaussianBlurGpuImpl<float, int64_t>(const OpSpec&, DimDesc);

extern template op_impl_uptr GetGaussianBlurGpuImpl<float16, float16>(const OpSpec&, DimDesc);
extern template op_impl_uptr GetGaussianBlurGpuImpl<float, float16>(const OpSpec&, DimDesc);

extern template op_impl_uptr GetGaussianBlurGpuImpl<float, float>(const OpSpec&, DimDesc);

extern template op_impl_uptr GetGaussianBlurGpuImpl<double, double>(const OpSpec&, DimDesc);
extern template op_impl_uptr GetGaussianBlurGpuImpl<float, double>(const OpSpec&, DimDesc);

}  // namespace gaussian_blur

/**
 * @brief Picks the GPU implementation for the current input and forwards the setup to it.
 *
 * The dimension description is parsed from the input's sample dimensionality and layout.
 * When no output type was requested (DALI_NO_TYPE), the input type is used. The output type
 * has to be either the input type or DALI_FLOAT; the implementation is created accordingly
 * as the <In, In> or the <float, In> variant.
 */
template <>
bool GaussianBlur<GPUBackend>::SetupImpl(std::vector<OutputDesc>& output_desc,
                                         const workspace_t<GPUBackend>& ws) {
  const auto& input = ws.template InputRef<GPUBackend>(0);
  const auto dim_desc = ParseAndValidateDim(input.shape().sample_dim(), input.GetLayout());

  const auto in_type = input.type().id();
  if (dtype_ == DALI_NO_TYPE) {
    // Output type was skipped - fall back to the input type.
    dtype_ = in_type;
  }
  DALI_ENFORCE(dtype_ == in_type || dtype_ == DALI_FLOAT,
      "Output data type must be same as input, FLOAT or skipped (defaults to input type)");
  const bool keep_input_type = dtype_ == in_type;

  // clang-format off
  TYPE_SWITCH(in_type, type2id, In, GAUSSIAN_BLUR_GPU_SUPPORTED_TYPES, (
    if (keep_input_type) {
      impl_ = GetGaussianBlurGpuImpl<In, In>(spec_, dim_desc);
    } else {
      impl_ = GetGaussianBlurGpuImpl<float, In>(spec_, dim_desc);
    }
  ), DALI_FAIL(make_string("Unsupported data type: ", in_type)));  // NOLINT
  // clang-format on

  return impl_->SetupImpl(output_desc, ws);
}

// Runs the operator by forwarding the workspace to the implementation object held in impl_
// (impl_ is assigned in SetupImpl).
template <>
void GaussianBlur<GPUBackend>::RunImpl(workspace_t<GPUBackend>& ws) {
impl_->RunImpl(ws);
}

// Register GaussianBlur<GPUBackend> under the GaussianBlur operator name for the GPU backend.
DALI_REGISTER_OPERATOR(GaussianBlur, GaussianBlur<GPUBackend>, GPU);

} // namespace dali
49 changes: 47 additions & 2 deletions dali/operators/image/convolution/gaussian_blur.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,21 @@

#include "dali/pipeline/operator/operator.h"
#include "dali/pipeline/util/operator_impl_utils.h"
#include "dali/operators/image/convolution/gaussian_blur_params.h"
#include "dali/pipeline/operator/common.h"

namespace dali {

#define GAUSSIAN_BLUR_SUPPORTED_TYPES \
(uint8_t, int8_t, uint16_t, int16_t, uint32_t, int32_t, uint64_t, int64_t, float, float16)
#define GAUSSIAN_BLUR_CPU_SUPPORTED_TYPES \
(uint8_t, int8_t, uint16_t, int16_t, uint32_t, int32_t, uint64_t, int64_t, float16, float, double)

// TODO(klecki): float16 support - it's not easily compatible with float window,
// need to introduce some cast in between and expose it in the kernels
#define GAUSSIAN_BLUR_GPU_SUPPORTED_TYPES \
(uint8_t, int8_t, uint16_t, int16_t, uint32_t, int32_t, uint64_t, int64_t, float, double)

#define GAUSSIAN_BLUR_SUPPORTED_AXES (1, 2, 3)

template <typename Backend>
class GaussianBlur : public Operator<Backend> {
public:
Expand All @@ -50,6 +58,43 @@ class GaussianBlur : public Operator<Backend> {
std::unique_ptr<OpImplBase<Backend>> impl_;
};

namespace gaussian_blur {

constexpr static const char* kSigmaArgName = "sigma";
constexpr static const char* kWindowSizeArgName = "window_size";

/**
 * @brief Obtain the parameters needed for generating Gaussian Windows for GaussianBlur Operator.
 *
 * `sigma` and `window_size` are fetched with GetGeneralizedArg, one value per axis. For every
 * axis it is enforced that the two values are not both 0 and that neither of them is negative.
 * A `window_size` of 0 is then replaced by SigmaToDiameter(sigma); otherwise a `sigma` of 0
 * is replaced by DiameterToSigma(window_size).
 *
 * @param sample  index of the sample the parameters are read for
 * @param spec    operator spec holding the arguments
 * @param ws      workspace providing the argument inputs
 * @return parameters with `sigmas` and `window_sizes` filled for all `axes` axes
 */
template <int axes>
inline GaussianBlurParams<axes> ObtainSampleParams(int sample, const OpSpec& spec,
                                                   const ArgumentWorkspace& ws) {
  GaussianBlurParams<axes> params;
  GetGeneralizedArg<float>(make_span(params.sigmas), kSigmaArgName, sample, spec, ws);
  GetGeneralizedArg<int>(make_span(params.window_sizes), kWindowSizeArgName, sample, spec, ws);
  for (int axis = 0; axis < axes; axis++) {
    auto& sigma = params.sigmas[axis];
    auto& window_size = params.window_sizes[axis];
    DALI_ENFORCE(
        sigma != 0 || window_size != 0,
        make_string("`sigma` and `window_size` shouldn't be 0 at the same time for sample: ",
                    sample, ", axis: ", axis, "."));
    DALI_ENFORCE(sigma >= 0,
                 make_string("`sigma` must have non-negative values, got ", sigma,
                             " for sample: ", sample, ", axis: ", axis, "."));
    DALI_ENFORCE(window_size >= 0,
                 make_string("`window_size` must have non-negative values, got ", window_size,
                             " for sample: ", sample, ", axis: ", axis, "."));
    // A value of 0 means "not specified" - derive it from the other parameter.
    if (window_size == 0) {
      window_size = SigmaToDiameter(sigma);
    } else if (sigma == 0.f) {
      sigma = DiameterToSigma(window_size);
    }
  }
  return params;
}

DimDesc ParseAndValidateDim(int ndim, TensorLayout layout);

} // namespace gaussian_blur
} // namespace dali

#endif // DALI_OPERATORS_IMAGE_CONVOLUTION_GAUSSIAN_BLUR_H_
Loading

0 comments on commit ec53d42

Please sign in to comment.