Add GaussianBlur Gpu Operator (#2314)

Plug SeparableConvolutionGpu that uses Cutlass based ConvolutionGpu kernel into GaussianBlur Operator. Signed-off-by: Krzysztof Lecki <klecki@nvidia.com>
NVIDIA · Oct 2, 2020 · ec53d42 · ec53d42
1 parent f3b4930
commit ec53d42
Show file tree

Hide file tree

Showing 20 changed files with 689 additions and 115 deletions.
diff --git a/dali/operators/image/convolution/CMakeLists.txt b/dali/operators/image/convolution/CMakeLists.txt
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+add_subdirectory(gaussian_blur_gpu)
+
 # Get all the source files and dump test files
 collect_headers(DALI_INST_HDRS PARENT_SCOPE)
 collect_sources(DALI_OPERATOR_SRCS PARENT_SCOPE)

diff --git a/dali/operators/image/convolution/gaussian_blur.cc b/dali/operators/image/convolution/gaussian_blur.cc
@@ -22,17 +22,13 @@
 #include "dali/kernels/imgproc/convolution/separable_convolution_cpu.h"
 #include "dali/kernels/kernel_manager.h"
 #include "dali/operators/image/convolution/gaussian_blur.h"
-#include "dali/operators/image/convolution/gaussian_blur_params.h"
 #include "dali/pipeline/data/views.h"
 #include "dali/pipeline/operator/common.h"
 
 namespace dali {
 
 using namespace gaussian_blur;  // NOLINT
 
-constexpr static const char* kSigmaArgName = "sigma";
-constexpr static const char* kWindowSizeArgName = "window_size";
-
 DALI_SCHEMA(GaussianBlur)
     .DocStr(R"code(Applies a Gaussian Blur to the input.
 
@@ -75,79 +71,16 @@ The same input can be provided as per-sample tensors.
 Supported type: `FLOAT`. If not set, the input type is used.)code",
         DALI_NO_TYPE);
 
-/**
- * @brief Fill the result span with the argument which can be provided as:
- * * ArgumentInput - {result.size()}-shaped Tensor
- * * ArgumentInput - {1}-shaped Tensor, the value will be replicated `result.size()` times
- * * Vector input - single "repeated argument" of length {result.size()} or {1}
- * * scalar argument - it will be replicated `result.size()` times
- *
- * TODO(klecki): we may want to make this a generic utility and propagate the span-approach to
- * the rest of the related argument gettters
- */
-template <typename T>
-void GetGeneralizedArg(span<T> result, const std::string name, int sample_idx, const OpSpec& spec,
-                       const ArgumentWorkspace& ws) {
-  int argument_length = result.size();
-  if (spec.HasTensorArgument(name)) {
-    const auto& tv = ws.ArgumentInput(name);
-    const auto& tensor = tv[sample_idx];
-    DALI_ENFORCE(tensor.shape().sample_dim() == 1,
-                 make_string("Argument ", name, " for sample ", sample_idx,
-                             " is expected to be 1D, got: ", tensor.shape().sample_dim(), "."));
-    DALI_ENFORCE(tensor.shape()[0] == 1 || tensor.shape()[0] == argument_length,
-                 make_string("Argument ", name, " for sample ", sample_idx,
-                             " is expected to have shape equal to {1} or {", argument_length,
-                             "}, got: ", tensor.shape(), "."));
-    if (tensor.shape()[0] == 1) {
-      for (int i = 0; i < argument_length; i++) {
-        result[i] = tensor.data<T>()[0];
-      }
-    } else {
-      memcpy(result.data(), tensor.data<T>(), sizeof(T) * argument_length);
-    }
-    return;
-  }
-  std::vector<T> tmp;
-  // we already handled the argument input, this handles spec-related arguments only
-  GetSingleOrRepeatedArg(spec, tmp, name, argument_length);
-  memcpy(result.data(), tmp.data(), sizeof(T) * argument_length);
-}
-
-template <int axes>
-GaussianBlurParams<axes> GetSampleParams(int sample, const OpSpec& spec,
-                                         const ArgumentWorkspace& ws) {
-  GaussianBlurParams<axes> params;
-  GetGeneralizedArg<float>(make_span(params.sigmas), kSigmaArgName, sample, spec, ws);
-  GetGeneralizedArg<int>(make_span(params.window_sizes), kWindowSizeArgName, sample, spec, ws);
-  for (int i = 0; i < axes; i++) {
-    DALI_ENFORCE(
-        !(params.sigmas[i] == 0 && params.window_sizes[i] == 0),
-        make_string("`sigma` and `window_size` shouldn't be 0 at the same time for sample: ",
-                    sample, ", axis: ", i, "."));
-    DALI_ENFORCE(params.sigmas[i] >= 0,
-                 make_string("`sigma` must have non-negative values, got ", params.sigmas[i],
-                             " for sample: ", sample, ", axis: ", i, "."));
-    DALI_ENFORCE(params.window_sizes[i] >= 0,
-                 make_string("`window_size` must have non-negative values, got ",
-                             params.window_sizes[i], " for sample: ", sample, ", axis : ", i, "."));
-    if (params.window_sizes[i] == 0) {
-      params.window_sizes[i] = SigmaToDiameter(params.sigmas[i]);
-    } else if (params.sigmas[i] == 0.f) {
-      params.sigmas[i] = DiameterToSigma(params.window_sizes[i]);
-    }
-  }
-  return params;
-}
 
+namespace gaussian_blur {
 DimDesc ParseAndValidateDim(int ndim, TensorLayout layout) {
   static constexpr int kMaxDim = 3;
   if (layout.empty()) {
     // assuming plain data with no channels
     DALI_ENFORCE(ndim <= kMaxDim,
                  make_string("Input data with empty layout cannot have more than ", kMaxDim,
                              " dimensions, got input with ", ndim, " dimensions."));
-    return {0, ndim, false, false};
+    return {0, ndim, ndim, false, false};
   }
   // not-empty layout
   int axes_start = 0;
@@ -177,7 +110,7 @@ DimDesc ParseAndValidateDim(int ndim, TensorLayout layout) {
   DALI_ENFORCE(axes_count <= kMaxDim,
                make_string("Too many dimensions, found: ", axes_count,
                            " data axes, maximum supported is: ", kMaxDim, "."));
-  return {axes_start, axes_count, has_channels, axes_start != 0};
+  return {axes_start, axes_count, axes_count + (axes_start != 0), has_channels, axes_start != 0};
 }
 
 // axes here is dimension of element processed by kernel - in case of sequence it's 1 less than the
@@ -206,7 +139,7 @@ class GaussianBlurOpCpu : public OpImplBase<CPUBackend> {
     kmgr_.template Resize<Kernel>(nthreads, nsamples);
 
     for (int i = 0; i < nsamples; i++) {
-      params_[i] = GetSampleParams<axes>(i, spec_, ws);
+      params_[i] = ObtainSampleParams<axes>(i, spec_, ws);
       windows_[i].PrepareWindows(params_[i]);
       // We take only last `ndim` siginificant dimensions to handle sequences as well
       auto elem_shape = input[i].shape().template last<ndim>();
@@ -266,6 +199,9 @@ class GaussianBlurOpCpu : public OpImplBase<CPUBackend> {
   std::vector<GaussianWindows<axes>> windows_;
 };
 
+
+}  // namespace gaussian_blur
+
 template <>
 bool GaussianBlur<CPUBackend>::SetupImpl(std::vector<OutputDesc>& output_desc,
                                          const workspace_t<CPUBackend>& ws) {
@@ -277,7 +213,7 @@ bool GaussianBlur<CPUBackend>::SetupImpl(std::vector<OutputDesc>& output_desc,
                "Output data type must be same as input, FLOAT or skipped (defaults to input type)");
 
   // clang-format off
-  TYPE_SWITCH(input.type().id(), type2id, In, GAUSSIAN_BLUR_SUPPORTED_TYPES, (
+  TYPE_SWITCH(input.type().id(), type2id, In, GAUSSIAN_BLUR_CPU_SUPPORTED_TYPES, (
     VALUE_SWITCH(dim_desc.usable_axes_count, AXES, GAUSSIAN_BLUR_SUPPORTED_AXES, (
       VALUE_SWITCH(static_cast<int>(dim_desc.has_channels), HAS_CHANNELS, (0, 1), (
         constexpr bool has_ch = HAS_CHANNELS;
@@ -286,7 +222,7 @@ bool GaussianBlur<CPUBackend>::SetupImpl(std::vector<OutputDesc>& output_desc,
         } else {
           impl_ = std::make_unique<GaussianBlurOpCpu<float, In, AXES, has_ch>>(spec_, dim_desc);
         }
-      ), (DALI_FAIL("Got value different than {0, 1} when converting bool to int."))); // NOLINT
+      ), ()); // NOLINT, no other possible conversion
     ), DALI_FAIL("Axis count out of supported range."));  // NOLINT
   ), DALI_FAIL(make_string("Unsupported data type: ", input.type().id())));  // NOLINT
   // clang-format on

diff --git a/dali/operators/image/convolution/gaussian_blur.cu b/dali/operators/image/convolution/gaussian_blur.cu
@@ -0,0 +1,103 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "dali/core/static_switch.h"
+#include "dali/kernels/imgproc/convolution/separable_convolution_gpu.h"
+#include "dali/kernels/kernel_manager.h"
+#include "dali/operators/image/convolution/gaussian_blur.h"
+#include "dali/operators/image/convolution/gaussian_blur_params.h"
+#include "dali/pipeline/data/views.h"
+#include "dali/pipeline/operator/common.h"
+#include "dali/operators/image/convolution/gaussian_blur_gpu.h"
+
+namespace dali {
+
+using namespace gaussian_blur;  // NOLINT
+
+namespace gaussian_blur {
+
+// Functions below are explicitly instantiated in a separate compilation unit to allow
+// for parallel compilation of the underlying kernels.
+
+extern template op_impl_uptr GetGaussianBlurGpuImpl<uint8_t, uint8_t>(const OpSpec&, DimDesc);
+extern template op_impl_uptr GetGaussianBlurGpuImpl<float, uint8_t>(const OpSpec&, DimDesc);
+
+extern template op_impl_uptr GetGaussianBlurGpuImpl<int8_t, int8_t>(const OpSpec&, DimDesc);
+extern template op_impl_uptr GetGaussianBlurGpuImpl<float, int8_t>(const OpSpec&, DimDesc);
+
+extern template op_impl_uptr GetGaussianBlurGpuImpl<uint16_t, uint16_t>(const OpSpec&, DimDesc);
+extern template op_impl_uptr GetGaussianBlurGpuImpl<float, uint16_t>(const OpSpec&, DimDesc);
+
+extern template op_impl_uptr GetGaussianBlurGpuImpl<int16_t, int16_t>(const OpSpec&, DimDesc);
+extern template op_impl_uptr GetGaussianBlurGpuImpl<float, int16_t>(const OpSpec&, DimDesc);
+
+extern template op_impl_uptr GetGaussianBlurGpuImpl<uint32_t, uint32_t>(const OpSpec&, DimDesc);
+extern template op_impl_uptr GetGaussianBlurGpuImpl<float, uint32_t>(const OpSpec&, DimDesc);
+
+extern template op_impl_uptr GetGaussianBlurGpuImpl<int32_t, int32_t>(const OpSpec&, DimDesc);
+extern template op_impl_uptr GetGaussianBlurGpuImpl<float, int32_t>(const OpSpec&, DimDesc);
+
+extern template op_impl_uptr GetGaussianBlurGpuImpl<uint64_t, uint64_t>(const OpSpec&, DimDesc);
+extern template op_impl_uptr GetGaussianBlurGpuImpl<float, uint64_t>(const OpSpec&, DimDesc);
+
+extern template op_impl_uptr GetGaussianBlurGpuImpl<int64_t, int64_t>(const OpSpec&, DimDesc);
+extern template op_impl_uptr GetGaussianBlurGpuImpl<float, int64_t>(const OpSpec&, DimDesc);
+
+extern template op_impl_uptr GetGaussianBlurGpuImpl<float16, float16>(const OpSpec&, DimDesc);
+extern template op_impl_uptr GetGaussianBlurGpuImpl<float, float16>(const OpSpec&, DimDesc);
+
+extern template op_impl_uptr GetGaussianBlurGpuImpl<float, float>(const OpSpec&, DimDesc);
+
+extern template op_impl_uptr GetGaussianBlurGpuImpl<double, double>(const OpSpec&, DimDesc);
+extern template op_impl_uptr GetGaussianBlurGpuImpl<float, double>(const OpSpec&, DimDesc);
+
+}  // namespace gaussian_blur
+
+// Picks the GPU implementation matching the input element type and the requested output
+// type, stores it in `impl_` and forwards the output setup to it.
+template <>
+bool GaussianBlur<GPUBackend>::SetupImpl(std::vector<OutputDesc>& output_desc,
+                                         const workspace_t<GPUBackend>& ws) {
+  const auto& input = ws.template InputRef<GPUBackend>(0);
+  auto layout = input.GetLayout();
+  auto dim_desc = ParseAndValidateDim(input.shape().sample_dim(), layout);
+  const auto in_type = input.type().id();
+  // Resolve the output type into a local instead of overwriting `dtype_`. Writing the
+  // defaulted value back into the member would pin "defaults to input type" to the type of
+  // the first batch, so a later batch with a different input type would fail the check below.
+  const auto out_type = dtype_ != DALI_NO_TYPE ? dtype_ : in_type;
+  DALI_ENFORCE(out_type == in_type || out_type == DALI_FLOAT,
+               "Output data type must be same as input, FLOAT or skipped (defaults to input type)");
+
+  // clang-format off
+  TYPE_SWITCH(in_type, type2id, In, GAUSSIAN_BLUR_GPU_SUPPORTED_TYPES, (
+      if (out_type == in_type) {
+        // Output has the same element type as the input.
+        impl_ = GetGaussianBlurGpuImpl<In, In>(spec_, dim_desc);
+      } else {
+        // The only other accepted output type is FLOAT (enforced above).
+        impl_ = GetGaussianBlurGpuImpl<float, In>(spec_, dim_desc);
+      }
+  ), DALI_FAIL(make_string("Unsupported data type: ", in_type)));  // NOLINT
+  // clang-format on
+
+  return impl_->SetupImpl(output_desc, ws);
+}
+
+template <>
+void GaussianBlur<GPUBackend>::RunImpl(workspace_t<GPUBackend>& ws) {
+  impl_->RunImpl(ws);  // forward to the implementation object assigned to impl_ in SetupImpl
+}
+
+DALI_REGISTER_OPERATOR(GaussianBlur, GaussianBlur<GPUBackend>, GPU);
+
+}  // namespace dali
diff --git a/dali/operators/image/convolution/gaussian_blur.h b/dali/operators/image/convolution/gaussian_blur.h
@@ -20,13 +20,21 @@
 
 #include "dali/pipeline/operator/operator.h"
 #include "dali/pipeline/util/operator_impl_utils.h"
+#include "dali/operators/image/convolution/gaussian_blur_params.h"
+#include "dali/pipeline/operator/common.h"
 
 namespace dali {
 
-#define GAUSSIAN_BLUR_SUPPORTED_TYPES \
-  (uint8_t, int8_t, uint16_t, int16_t, uint32_t, int32_t, uint64_t, int64_t, float, float16)
+#define GAUSSIAN_BLUR_CPU_SUPPORTED_TYPES \
+  (uint8_t, int8_t, uint16_t, int16_t, uint32_t, int32_t, uint64_t, int64_t, float16, float, double)
+
+// TODO(klecki): float16 support - it's not easily compatible with float window,
+// need to introduce some cast in between and expose it in the kernels
+#define GAUSSIAN_BLUR_GPU_SUPPORTED_TYPES \
+  (uint8_t, int8_t, uint16_t, int16_t, uint32_t, int32_t, uint64_t, int64_t, float, double)
 
 #define GAUSSIAN_BLUR_SUPPORTED_AXES (1, 2, 3)
+
 template <typename Backend>
 class GaussianBlur : public Operator<Backend> {
  public:
@@ -50,6 +58,43 @@ class GaussianBlur : public Operator<Backend> {
   std::unique_ptr<OpImplBase<Backend>> impl_;
 };
 
+namespace gaussian_blur {
+
+constexpr static const char* kSigmaArgName = "sigma";
+constexpr static const char* kWindowSizeArgName = "window_size";
+
+/**
+ * @brief Obtain the parameters needed for generating Gaussian Windows for GaussianBlur Operator.
+ *
+ * The `sigma` and `window_size` arguments are fetched for the given sample with
+ * GetGeneralizedArg and validated axis by axis: both values must be non-negative and they
+ * must not both be 0 for the same axis. A value of 0 is treated as "not specified" and is
+ * derived from the other parameter (SigmaToDiameter / DiameterToSigma).
+ *
+ * @tparam axes number of axes for which sigma and window size are obtained
+ * @param sample index of the sample, used for argument lookup and in error messages
+ * @param spec operator specification forwarded to GetGeneralizedArg
+ * @param ws argument workspace forwarded to GetGeneralizedArg
+ * @return parameters with `sigmas` and `window_sizes` filled in for every axis
+ */
+template <int axes>
+inline GaussianBlurParams<axes> ObtainSampleParams(int sample, const OpSpec& spec,
+                                                   const ArgumentWorkspace& ws) {
+  GaussianBlurParams<axes> params;
+  GetGeneralizedArg<float>(make_span(params.sigmas), kSigmaArgName, sample, spec, ws);
+  GetGeneralizedArg<int>(make_span(params.window_sizes), kWindowSizeArgName, sample, spec, ws);
+  for (int i = 0; i < axes; i++) {
+    // At least one of the two parameters is needed to derive the other one.
+    DALI_ENFORCE(
+        !(params.sigmas[i] == 0 && params.window_sizes[i] == 0),
+        make_string("`sigma` and `window_size` shouldn't be 0 at the same time for sample: ",
+                    sample, ", axis: ", i, "."));
+    DALI_ENFORCE(params.sigmas[i] >= 0,
+                 make_string("`sigma` must have non-negative values, got ", params.sigmas[i],
+                             " for sample: ", sample, ", axis: ", i, "."));
+    DALI_ENFORCE(params.window_sizes[i] >= 0,
+                 make_string("`window_size` must have non-negative values, got ",
+                             params.window_sizes[i], " for sample: ", sample, ", axis: ", i, "."));
+    // 0 marks the parameter as unspecified - compute it from the one that was provided.
+    if (params.window_sizes[i] == 0) {
+      params.window_sizes[i] = SigmaToDiameter(params.sigmas[i]);
+    } else if (params.sigmas[i] == 0.f) {
+      params.sigmas[i] = DiameterToSigma(params.window_sizes[i]);
+    }
+  }
+  return params;
+}
+
+DimDesc ParseAndValidateDim(int ndim, TensorLayout layout);
+
+}  // namespace gaussian_blur
 }  // namespace dali
 
 #endif  // DALI_OPERATORS_IMAGE_CONVOLUTION_GAUSSIAN_BLUR_H_