Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TensorJoin kernel for CPU #2301

Merged
merged 9 commits into from
Oct 6, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 155 additions & 0 deletions dali/kernels/common/join/tensor_join_cpu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef DALI_TENSOR_JOIN_CPU_H
#define DALI_TENSOR_JOIN_CPU_H

#include <cstdint>
#include <vector>

#include "dali/core/format.h"
#include "dali/core/tensor_shape_print.h"
#include "dali/kernels/kernel.h"

namespace dali {
namespace kernels {

// Selects how the input tensors are joined together.
// Kept as an unscoped enum: callers use STACK/CONCAT unqualified as template arguments.
enum JoinMode {
  STACK,   // join along a new axis - output rank is input rank + 1
  CONCAT   // join along an existing axis - output rank equals input rank
};
szalpal marked this conversation as resolved.
Show resolved Hide resolved

namespace detail {

template<typename T>
void TransferBuffers(span<T> output, const TensorShape<> &output_shape,
szalpal marked this conversation as resolved.
Show resolved Hide resolved
span<TensorView<StorageCPU, const T>> inputs, int axis) {
vector<int64_t> copy_sizes(inputs.size());
szalpal marked this conversation as resolved.
Show resolved Hide resolved
for (int t = 0; t < inputs.size(); t++) {
copy_sizes[t] = volume(inputs[t].shape.begin() + axis, inputs[t].shape.end());
}
auto nouter = volume(output_shape.begin(), output_shape.end());
auto *out = output.data();
for (ptrdiff_t outer = 0; outer < nouter; outer++) {
for (int t = 0; t < inputs.size(); t++) {
auto *src = inputs[t].data + outer * copy_sizes[t];
for (ptrdiff_t inner = 0; inner < copy_sizes[t]; inner++) {
*out++ = src[inner];
}
}
}
}


/**
 * @brief Computes the shape of stacking `in_shapes` along a new axis inserted
 *        at position `axis`.
 *
 * The result has one more dimension than the inputs; its extent at `axis` is
 * the number of joined tensors. Only the first shape is inspected - the
 * shapes are assumed to be uniform.
 */
template<JoinMode mode=STACK>
TensorShape<> DetermineShape(span<const TensorShape<>> in_shapes, int axis) {
  const auto &first = in_shapes[0];
  TensorShape<> result;
  result.resize(first.size() + 1);
  int d = 0;
  for (; d < axis; d++)
    result[d] = first[d];             // dimensions before the new axis are copied as-is
  result[axis] = in_shapes.size();    // the new axis has one slice per input tensor
  for (; d < first.size(); d++)
    result[d + 1] = first[d];         // remaining dimensions are shifted by one
  return result;
}


/**
 * @brief Computes the shape of concatenating `in_shapes` along an existing
 *        axis.
 *
 * The result has the same rank as the inputs; its extent at `axis` is the sum
 * of the inputs' extents at `axis`. The remaining extents are taken from the
 * first shape (Setup verifies they match across inputs).
 */
// `inline` is required: a full specialization of a function template is an
// ordinary function, so defining it in a header without `inline` violates the
// ODR once the header is included in more than one translation unit.
template<>
inline TensorShape<> DetermineShape<CONCAT>(span<const TensorShape<>> in_shapes, int axis) {
  TensorShape<> ret = in_shapes[0];
  // Sum the extents instead of multiplying by the input count - CONCAT inputs
  // are allowed to differ along the join axis (Setup exempts `axis` from its
  // equality check), so `ret[axis] *= in_shapes.size()` was only correct for
  // equal extents.
  int64_t joined_extent = 0;
  for (const auto &sh : in_shapes)
    joined_extent += sh[axis];
  ret[axis] = joined_extent;
  return ret;
}

} // namespace detail

template<typename Out, typename In, JoinMode mode = STACK, int dims = -1>
struct TensorJoinCpu {
KernelRequirements Setup(KernelContext &ctx, span<const TensorShape<dims>> in_shapes, int axis) {
n_input_tensors_ = in_shapes.size();
orig_shapes_(in_shapes);
auto ndims = in_shapes[0].sample_dim();
DALI_ENFORCE(axis < ndims && axis > -ndims, "Incorrect axis. Actual: ", axis, ". Expected in [",
-ndims + 1, ", ", ndims - 1, "] interval");
axis_ = axis >= 0 ? axis : ndims + axis;

{
const auto &ref = in_shapes[0];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd remove this scope, and move this line to 117 (so that you can take auto ndims = ref.sample_dim())

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer to retain the scope - it brings some better structure to the function. It's apparent this way, which part of the function is for error checking, and which for actual processing

for (int i = 1; i < n_input_tensors_; i++) {
DALI_ENFORCE(in_shapes[i].sample_dim() == ref.sample_dim(),
"Every input shape must have the same number of dimensions.");
for (int j = 0; j < ref.size(); j++) {
if (mode == CONCAT) {
DALI_ENFORCE(in_shapes[i][j] == ref.shape[j] || (j == axis_ && mode == CONCAT),
make_string(
"Number of samples in every dimension "
"(but the one along which concatenation occurs) must be the same "
szalpal marked this conversation as resolved.
Show resolved Hide resolved
"(CONCAT mode). 0-th shape at index ", j, " has dimension ",
ref.shape[j], ", while ", i, "-th shape at index ", j,
" has dimension ", in_shapes[i][j]));
} else {
DALI_ENFORCE(in_shapes[i][j] == ref.shape[j], make_string(
"Number of samples in every dimension must be the same (STACK mode). "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"Number of samples in every dimension must be the same (STACK mode). "
"STACK: Number of samples in every dimension must be the same. "

"0-th shape at index ", j, " has dimension ", ref.shape[j], ", while ", i,
"-th shape at index ", j, " has dimension ", in_shapes[i][j]));
}
}
}
}

KernelRequirements kr;
output_shape_ = detail::DetermineShape<mode>(in_shapes, axis);
kr.output_shapes.resize(1);
TensorListShape<> tmp({output_shape_}); // clang's destructor bug still haunting
kr.output_shapes[0] = tmp;
return kr;
}


KernelRequirements Setup(KernelContext &ctx, span<const InTensorCPU<In, dims>> in, int axis) {
std::vector<TensorShape<>> in_shapes(in.size());
for (int i = 0; i < in.size(); i++) {
in_shapes[i] = in[i].shape;
}
return Setup(ctx, in_shapes, axis);
}


void Run(KernelContext &ctx, const OutTensorCPU<Out, dims> &out,
szalpal marked this conversation as resolved.
Show resolved Hide resolved
span<const InTensorCPU<In, dims>> in) {
DALI_ENFORCE(in.size() == n_input_tensors_, make_string(
"Input must have the same number of tensors as was specified in call to Setup. Expected: ",
n_input_tensors_, "Actual: ", in.size()));
szalpal marked this conversation as resolved.
Show resolved Hide resolved
for (int i = 0; i < n_input_tensors_; i++) {
DALI_ENFORCE(in[i].shape == orig_shapes_[i], make_string(
"Input must have the same shapes as was specified in call to Setup. Expected: ",
orig_shapes_[i], "Actual: ", in[i].shape));
}
szalpal marked this conversation as resolved.
Show resolved Hide resolved

auto output = make_span(out);
szalpal marked this conversation as resolved.
Show resolved Hide resolved
detail::TransferBuffers(output, output_shape_, in, axis_);


}


int axis_, n_input_tensors_;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
int axis_, n_input_tensors_;
int axis_ = -1, n_input_tensors_ = -1;

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

TensorShape<dims> output_shape_;
std::vector<TensorShape<dims>> orig_shapes_;
};

}
}
#endif //DALI_TENSOR_JOIN_CPU_H

127 changes: 127 additions & 0 deletions dali/kernels/common/join/tensor_join_cpu_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <gtest/gtest.h>
#include "tensor_join_cpu.h"

namespace dali {
namespace kernels {
namespace test {

TEST(TensorJoinCpuTest, DetermineShapeStack) {
  // Two identical input shapes. The original initializer
  //   std::vector<TensorShape<>> shin = {{4, 5, 7, 8}, 2};
  // is list-initialization, not the (count, value) constructor: it built two
  // *different* shapes, {4, 5, 7, 8} and {2}. The test still passed because
  // DetermineShape reads only the first shape and the element count; this
  // makes the fixture match its intent.
  std::vector<TensorShape<>> shin(2, TensorShape<>{4, 5, 7, 8});
  // Stacking two tensors inserts extent 2 at the requested axis.
  TensorShape<> sh0 = {2, 4, 5, 7, 8};
  TensorShape<> sh1 = {4, 2, 5, 7, 8};
  TensorShape<> sh2 = {4, 5, 2, 7, 8};
  TensorShape<> sh3 = {4, 5, 7, 2, 8};
  TensorShape<> sh4 = {4, 5, 7, 8, 2};
  EXPECT_EQ(detail::DetermineShape<STACK>(make_span(shin), 0), sh0);
  EXPECT_EQ(detail::DetermineShape<STACK>(make_span(shin), 1), sh1);
  EXPECT_EQ(detail::DetermineShape<STACK>(make_span(shin), 2), sh2);
  EXPECT_EQ(detail::DetermineShape<STACK>(make_span(shin), 3), sh3);
  EXPECT_EQ(detail::DetermineShape<STACK>(make_span(shin), 4), sh4);
}


TEST(TensorJoinCpuTest, DetermineShapeConcat) {
  // Two identical input shapes - the original `{{4, 5, 7, 8}, 2}` initializer
  // list-initialized a vector of the two shapes {4, 5, 7, 8} and {2} rather
  // than two copies of {4, 5, 7, 8}; use the (count, value) constructor.
  std::vector<TensorShape<>> shin(2, TensorShape<>{4, 5, 7, 8});
  // Concatenating two equal tensors doubles the extent at the join axis.
  TensorShape<> sh0 = {8, 5, 7, 8};
  TensorShape<> sh1 = {4, 10, 7, 8};
  TensorShape<> sh2 = {4, 5, 14, 8};
  TensorShape<> sh3 = {4, 5, 7, 16};
  EXPECT_EQ(detail::DetermineShape<CONCAT>(make_span(shin), 0), sh0);
  EXPECT_EQ(detail::DetermineShape<CONCAT>(make_span(shin), 1), sh1);
  EXPECT_EQ(detail::DetermineShape<CONCAT>(make_span(shin), 2), sh2);
  EXPECT_EQ(detail::DetermineShape<CONCAT>(make_span(shin), 3), sh3);
}


TEST(TensorJoinCpuTest, TransferBufferTest) {
  using std::vector;

  // Two 3x4 input tensors backed by plain vectors.
  vector<vector<int>> arr = {{6, 8, 5, 1, 3, 5, 1, 6, 8, 3, 7, 5},
                             {4, 5, 1, 8, 4, 4, 1, 4, 1, 7, 6, 6}};
  vector<TensorShape<>> sh = {{3, 4},
                              {3, 4}};
  vector<TensorView<StorageCPU, const int>> in;
  for (size_t i = 0; i < arr.size(); i++)
    in.emplace_back(arr[i].data(), sh[i]);

  // Expected outputs, one entry per join axis. The element order is identical
  // for STACK and CONCAT; only the interpretation of the shape differs.
  vector<vector<int>> ref_arr = {
      // axis 0: shape {2, 3, 4} (STACK) or {6, 4} (CONCAT)
      {6, 8, 5, 1, 3, 5, 1, 6, 8, 3, 7, 5, 4, 5, 1, 8, 4, 4, 1, 4, 1, 7, 6, 6},
      // axis 1: shape {3, 2, 4} (STACK) or {3, 8} (CONCAT)
      {6, 8, 5, 1, 4, 5, 1, 8, 3, 5, 1, 6, 4, 4, 1, 4, 8, 3, 7, 5, 1, 7, 6, 6},
      // axis 2: shape {4, 3, 2} (STACK); CONCAT unavailable
      {6, 4, 8, 5, 5, 1, 1, 8, 3, 4, 5, 4, 1, 1, 6, 4, 8, 1, 3, 7, 7, 6, 5, 6}};

  for (int ax = 0; ax < static_cast<int>(ref_arr.size()); ax++) {
    auto outsh = detail::DetermineShape<STACK>(make_span(sh), ax);
    vector<int> outbuf(volume(outsh));
    detail::TransferBuffers(make_span(outbuf), outsh, make_span(in), ax);
    EXPECT_EQ(outbuf, ref_arr[ax]);
  }

  for (int ax = 0; ax < static_cast<int>(ref_arr.size()) - 1; ax++) {
    auto outsh = detail::DetermineShape<CONCAT>(make_span(sh), ax);
    vector<int> outbuf(volume(outsh));
    detail::TransferBuffers(make_span(outbuf), outsh, make_span(in), ax);
    EXPECT_EQ(outbuf, ref_arr[ax]);
  }
}

//TEST(TensorStackCpuTest, KernelTest) {
// using namespace std;
//
// vector<vector<int>> arr = {{6, 8, 5, 1, 3, 5, 1, 6, 8, 3, 7, 5},
// {4, 5, 1, 8, 4, 4, 1, 4, 1, 7, 6, 6}};
// vector<TensorShape<>> sh = {{3, 4},
// {3, 4}};
// vector<TensorView<StorageCPU, const int>> in;
// for (size_t i = 0; i < arr.size(); i++) {
// in.emplace_back(arr[i].data(), sh[i]);
// }
//
// // Output shape for this buffer: {2, 3, 4} (STACK) or {6, 4} (CONCAT)
// vector<int> arr0 = {6, 8, 5, 1, 3, 5, 1, 6, 8, 3, 7, 5, 4, 5, 1, 8, 4, 4, 1, 4, 1, 7, 6, 6};
//
// // Output shape for this buffer: {3, 2, 4} (STACK) or {3, 8} (CONCAT)
// vector<int> arr1 = {6, 8, 5, 1, 4, 5, 1, 8, 3, 5, 1, 6, 4, 4, 1, 4, 8, 3, 7, 5, 1, 7, 6, 6};
//
// // Output shape for this buffer: {4, 3, 2} (STACK), CONCAT unavailable
// vector<int> arr2 = {6, 4, 8, 5, 5, 1, 1, 8, 3, 4, 5, 4, 1, 1, 6, 4, 8, 1, 3, 7, 7, 6, 5, 6};
//
// vector<vector<int>> ref_arr;
// ref_arr.emplace_back(arr0);
// ref_arr.emplace_back(arr1);
// ref_arr.emplace_back(arr2);
//
// TensorStackCpu<int, int> kernel;
// KernelContext ctx;
// auto kr = kernel.Setup(ctx, make_cspan(in), 0);
//
//
//}


} // namespace test
} // namespace kernels
} // namespace dali