Use true scalars + bug fixes #2318

Merged · 13 commits · Oct 4, 2020
3 changes: 2 additions & 1 deletion dali/kernels/imgproc/convolution/convolution_cpu.h
@@ -349,8 +349,9 @@ struct ConvolutionCpu {
if (axis == ndim - has_channels - 1) {
int num_channels = has_channels ? in_shape[ndim - 1] : 1;
return num_channels * window_size;
- } else {
- return kStripSize * window_size;
}
+ return kStripSize * window_size;
}
};

@@ -650,8 +650,9 @@ class PositionPredicatedTileIterator<Shape_, Element_, layout::PitchLinear, Adva
CUTLASS_DEVICE int get_distance(int abs_window_element, int lo_offset) {
if (mirrored) {
return get_mirrored_element(abs_window_element) - lo_offset;
- } else {
- return abs_window_element - lo_offset;
}
+ return abs_window_element - lo_offset;
}

struct aligned_offset_data {
6 changes: 3 additions & 3 deletions dali/kernels/reduce/reduce_all_gpu_test.cu
@@ -214,9 +214,9 @@ void ReduceAllGPUTest<Reduction>::TestReduceAllKernel(int min_size, int max_size

kernels::reduce::ReduceAllGPU<Out, In, Reduction> kernel;

- auto out_shape = TensorListShape<1>::make_uniform(nsamples, TensorShape<1>{1});
- TestTensorList<Out, 1> out;
- out.reshape(out_shape.to_static<1>());
+ auto out_shape = TensorListShape<0>(nsamples);
+ TestTensorList<Out, 0> out;
+ out.reshape(out_shape);

auto in_view_gpu = in.gpu();
auto out_view_gpu = out.gpu();
4 changes: 2 additions & 2 deletions dali/kernels/reduce/reduce_all_kernel_gpu.h
@@ -63,12 +63,12 @@ class DLL_PUBLIC ReduceAllGPU {

KernelRequirements req;
req.scratch_sizes = se.sizes;
- req.output_shapes = {TensorListShape<1>::make_uniform(num_samples, TensorShape<1>{1})};
+ req.output_shapes = {TensorListShape<0>(num_samples)};
return req;
}

DLL_PUBLIC void Run(KernelContext &context,
- const OutListGPU<Out, 1> &out,
+ const OutListGPU<Out, 0> &out,
const InListGPU<In, DynamicDimensions> &in) {
DALI_ENFORCE(out.is_contiguous(), "Reduce all kernel expects the output to be contiguous");
auto* out_start = out[0].data;
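The two reduce hunks above replace the old convention of returning a one-element 1D tensor per sample with a true 0-D scalar per sample. A minimal sketch of the difference, using the TensorListShape API exactly as it appears in the diff (the include path and function wrapper are assumptions for illustration):

#include "dali/core/tensor_shape.h"

// Sketch only: contrast the old and new descriptions of a batch of
// num_samples scalar outputs.
void DescribeScalarOutputs(int num_samples) {
  // Old convention: every sample is a 1D tensor holding exactly one element.
  auto fake_scalars = dali::TensorListShape<1>::make_uniform(
      num_samples, dali::TensorShape<1>{1});                  // sample_dim() == 1
  // New convention: every sample is a true scalar with zero dimensions.
  auto true_scalars = dali::TensorListShape<0>(num_samples);  // sample_dim() == 0
  // Both describe num_samples values in total; only the dimensionality differs.
  (void)fake_scalars;
  (void)true_scalars;
}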
2 changes: 1 addition & 1 deletion dali/kernels/signal/decibel/to_decibels_gpu.cu
@@ -71,7 +71,7 @@ KernelRequirements ToDecibelsGpu<T>::Setup(KernelContext &context,
template <typename T>
void ToDecibelsGpu<T>::Run(KernelContext &context, const OutListGPU<T, DynamicDimensions> &out,
const InListGPU<T, DynamicDimensions> &in, const ToDecibelsArgs<T> &args,
- InListGPU<T, 1> max_values) {
+ InListGPU<T, 0> max_values) {
DALI_ENFORCE(max_values.empty() || max_values.is_contiguous(),
"Reduce all kernel expects the output to be contiguous");
const T* max_values_data = max_values.empty() ? nullptr : max_values[0].data;
2 changes: 1 addition & 1 deletion dali/kernels/signal/decibel/to_decibels_gpu.h
@@ -42,7 +42,7 @@ class DLL_PUBLIC ToDecibelsGpu {
const OutListGPU<T, DynamicDimensions> &out,
const InListGPU<T, DynamicDimensions> &in,
const ToDecibelsArgs<T> &args,
- InListGPU<T, 1> max_values = {});
+ InListGPU<T, 0> max_values = {});
};

} // namespace signal
4 changes: 2 additions & 2 deletions dali/kernels/signal/decibel/to_decibels_gpu_test.cc
@@ -92,7 +92,7 @@ TEST_P(ToDecibelsGpuTest, ToDecibelsGpuTest) {

std::vector<T> max_values(batch_size, 0.0);
memory::KernelUniquePtr<T> max_values_gpu;
- InListGPU<T, 1> max_values_arg;
+ InListGPU<T, 0> max_values_arg;
if (args.ref_max) {
for (int b = 0; b < batch_size; ++b) {
int64_t sz = volume(data_shape_[b]);
@@ -106,7 +106,7 @@
cudaMemcpy(max_values_gpu.get(), max_values.data(), batch_size * sizeof(T),
cudaMemcpyHostToDevice);
max_values_arg = {max_values_gpu.get(),
- TensorListShape<1>::make_uniform(batch_size, TensorShape<1>{1})};
+ TensorListShape<0>(batch_size)};
}

kernel.Run(ctx, out.gpu(), in_.gpu(), args, max_values_arg);
2 changes: 1 addition & 1 deletion dali/operators/audio/nonsilence_op.cc
@@ -66,7 +66,7 @@ bool NonsilenceOperatorCpu::SetupImpl(std::vector<OutputDesc> &output_desc,
AcquireArgs(spec_, ws);
TypeInfo output_type;
output_type.SetType<int>(TypeTable::GetTypeID<int>());
- TensorShape<> scalar_shape = {1};
+ TensorShape<> scalar_shape = {};

output_desc.resize(detail::kNumOutputs);
for (int i = 0; i < detail::kNumOutputs; i++) {
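The nonsilence change above is the same idea at the level of a single tensor shape: an empty TensorShape<> is a zero-dimensional scalar, not a one-element vector. A short illustrative sketch (the include path is an assumption; sample_dim() and volume() are used the same way as elsewhere in this PR):

#include "dali/core/tensor_shape.h"

// Illustrative only: both shapes describe a single value, but only the
// second one is a true scalar.
dali::TensorShape<> one_element_1d = {1};  // sample_dim() == 1, volume(one_element_1d) == 1
dali::TensorShape<> true_scalar = {};      // sample_dim() == 0, volume(true_scalar) == 1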
6 changes: 3 additions & 3 deletions dali/operators/decoder/audio/audio_decoder_op.cc
@@ -73,7 +73,7 @@ AudioDecoderCpu::SetupImpl(std::vector<OutputDesc> &output_desc, const workspace
// Currently, metadata is only the sampling rate.
// On the event something else would emerge,
// this approach should be completely redefined
- TensorListShape<> shape_rate(batch_size, 1);
+ TensorListShape<> shape_rate(batch_size, 0);
TensorListShape<> shape_data(batch_size, downmix_ ? 1 : 2);

for (int i = 0; i < batch_size; i++) {
@@ -84,7 +84,7 @@ AudioDecoderCpu::SetupImpl(std::vector<OutputDesc> &output_desc, const workspace
TensorShape<> data_sample_shape = DecodedAudioShape(
meta, use_resampling_ ? target_sample_rates_[i] : -1.0f, downmix_);
shape_data.set_tensor_shape(i, data_sample_shape);
- shape_rate.set_tensor_shape(i, {1});
+ shape_rate.set_tensor_shape(i, {});
files_names_[i] = input[i].GetSourceInfo();
}

@@ -130,7 +130,7 @@ AudioDecoderCpu::DecodeSample(const TensorView<StorageCPU, OutputType, DynamicDi
template <typename OutputType, typename DecoderOutputType>
void AudioDecoderCpu::DecodeBatch(workspace_t<Backend> &ws) {
auto decoded_output = view<OutputType, DynamicDimensions>(ws.template OutputRef<Backend>(0));
- auto sample_rate_output = view<float, 1>(ws.template OutputRef<Backend>(1));
+ auto sample_rate_output = view<float, 0>(ws.template OutputRef<Backend>(1));
int batch_size = decoded_output.shape.num_samples();
auto &tp = ws.GetThreadPool();

9 changes: 5 additions & 4 deletions dali/operators/generic/constant.h
@@ -33,14 +33,15 @@ class Constant : public Operator<Backend> {
using Workspace = workspace_t<Backend>;

explicit Constant(const OpSpec &spec) : Operator<Backend>(spec) {
+ bool has_shape = spec.ArgumentDefined("shape");
spec.TryGetRepeatedArgument<int>(shape_arg_, "shape");
output_type_ = spec.GetArgument<DALIDataType>("dtype");
if (spec.HasArgument("fdata")) {
DALI_ENFORCE(!spec.HasArgument("idata"), "Constant node: `fdata` and `idata` arguments are "
"mutually exclusive");
fdata_ = spec.GetRepeatedArgument<float>("fdata");
- if (shape_arg_.empty()) {
- shape_arg_.push_back(fdata_.size());
+ if (!has_shape) {
+ shape_arg_ = { static_cast<int>(fdata_.size()) };
} else {
DALI_ENFORCE(fdata_.size() == static_cast<size_t>(volume(shape_arg_)) || fdata_.size() == 1,
"The number of values does not match the shape specified");
@@ -56,8 +57,8 @@ class Constant : public Operator<Backend> {
output_type_ = DALI_INT32;

idata_ = spec.GetRepeatedArgument<int>("idata");
- if (shape_arg_.empty()) {
- shape_arg_.push_back(idata_.size());
+ if (!has_shape) {
+ shape_arg_ = { static_cast<int>(idata_.size()) };
} else {
DALI_ENFORCE(idata_.size() == static_cast<size_t>(volume(shape_arg_)) || idata_.size() == 1,
"The number of values does not match the shape specified");
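The constant.h change is subtle: previously an empty shape_arg_ was interpreted as "no shape given" and replaced with the number of values, which made it impossible to request a scalar constant. Testing spec.ArgumentDefined("shape") keeps an explicitly empty shape intact. A rough sketch of the resulting decision, with hypothetical names rather than the operator's actual code:

#include <vector>

// Hypothetical helper mirroring the decision in Constant's constructor after
// this change; num_values stands for the size of fdata_/idata_.
std::vector<int> ResolveConstantShape(bool shape_argument_defined,
                                      const std::vector<int> &shape_arg,
                                      size_t num_values) {
  if (!shape_argument_defined) {
    // No "shape" argument at all: infer a 1D shape from the number of values
    // (the old fallback behavior).
    return {static_cast<int>(num_values)};
  }
  // "shape" was given explicitly, even if it is empty: keep it as-is, so an
  // empty shape now yields a true scalar instead of being overwritten.
  return shape_arg;
}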
6 changes: 4 additions & 2 deletions dali/operators/generic/slice/slice_attr.h
@@ -63,8 +63,8 @@ class SliceAttr {
crop_anchor.type().id(), " and ", crop_shape.type().id()));
auto args_dtype = crop_anchor.type().id();
TYPE_SWITCH(args_dtype, type2id, ArgsType, SLICE_ARGS_TYPES, (
- auto anchor_view = view<const ArgsType, 1>(crop_anchor);
- auto shape_view = view<const ArgsType, 1>(crop_shape);
+ auto anchor_view = view<const ArgsType>(crop_anchor);
+ auto shape_view = view<const ArgsType>(crop_shape);
for (size_t data_idx = 0; data_idx < batch_size__; data_idx++) {
VerifyArgsShape(anchor_view.tensor_shape(data_idx), shape_view.tensor_shape(data_idx));
ProcessArgumentsHelper(data_idx,
@@ -152,6 +152,8 @@ class SliceAttr {
void VerifyArgsShape(const TensorShape<>& crop_anchor_shape,
const TensorShape<>& crop_shape_shape) {
DALI_ENFORCE(crop_anchor_shape == crop_shape_shape);
+ DALI_ENFORCE(crop_anchor_shape.sample_dim() <= 1,
+     "Anchor and shape must be 1D tensors or scalars");
size_t args_size = volume(crop_anchor_shape);
auto axes_size = !axis_names_.empty() ? axis_names_.size() : axes_.size();
DALI_ENFORCE(args_size == axes_size,
4 changes: 2 additions & 2 deletions dali/operators/math/expressions/arithmetic.h
@@ -170,7 +170,7 @@ inline TensorListShape<> ShapePromotion(std::string op, span<const TensorListSha
*out_shape, ", ", *shapes[i], ")."));
}
}
- return out_shape ? *out_shape : uniform_list_shape(batch_size, {1});
+ return out_shape ? *out_shape : TensorListShape<0>(batch_size);
}

/**
@@ -183,7 +183,7 @@ DLL_PUBLIC inline const TensorListShape<> &PropagateShapes(ExprNode &expr,
const workspace_t<Backend> &ws,
int batch_size) {
if (expr.GetNodeType() == NodeType::Constant) {
- expr.SetShape(uniform_list_shape(batch_size, {1}));
+ expr.SetShape(TensorListShape<0>(batch_size));
return expr.GetShape();
}
if (expr.GetNodeType() == NodeType::Tensor) {
6 changes: 3 additions & 3 deletions dali/operators/math/expressions/arithmetic_meta.h
@@ -712,11 +712,11 @@ inline ArithmeticOp NameToOp(const std::string &op_name) {
/**
* @brief Check if input of given `shape` should be considered to represent (tensor of) scalars.
*
- * A tensor of scalars is uniform tensor with sample dimension equal 1 and only 1 (scalar) element
- * in every sample.
+ * As a backward compatibility, 1D 1-element tensors are considered scalars (in addition to true
+ * scalars).
*/
inline bool IsScalarLike(const TensorListShape<> &shape) {
- return is_uniform(shape) && shape.sample_dim() == 1 && shape.tensor_shape_span(0)[0] == 1;
+ return is_uniform(shape) && shape.sample_dim() <= 1 && volume(shape.tensor_shape_span(0)) == 1;
}

} // namespace dali
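With the relaxed predicate above, both true scalars and the legacy one-element 1D tensors count as scalar-like. Illustrative checks, assuming the snippet sits inside namespace dali next to the function and that the shape helpers behave as used throughout this PR:

#include <cassert>

// Sketch only: which batches the relaxed IsScalarLike() accepts.
void IsScalarLikeExamples() {
  auto true_scalars = TensorListShape<0>(4);         // 0D samples
  auto legacy_scalars = uniform_list_shape(4, {1});  // 1D samples, 1 element each
  auto vectors = uniform_list_shape(4, {2});         // 1D samples, 2 elements each
  assert(IsScalarLike(true_scalars));    // true scalars qualify
  assert(IsScalarLike(legacy_scalars));  // kept for backward compatibility
  assert(!IsScalarLike(vectors));        // more than one element per sample
}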
19 changes: 11 additions & 8 deletions dali/operators/math/expressions/arithmetic_test.cc
@@ -64,21 +64,21 @@ TEST(ArithmeticOpsTest, TreePropagation) {
EXPECT_EQ(func[1].GetOutputDesc(), "CC:int32");
}

- TEST(ArithmeticOpsTest, PropagateScalarLike) {
+
+ TEST(ArithmeticOpsTest, PropagateScalarInput) {
std::string expr_str = "sub(&0 $1:int32))";
auto expr = ParseExpressionString(expr_str);
auto &expr_ref = *expr;
HostWorkspace ws;
std::shared_ptr<TensorVector<CPUBackend>> in[1];
for (auto &ptr : in) {
ptr = std::make_shared<TensorVector<CPUBackend>>();
- ptr->Resize({{1}, {1}});
- ptr->SetLayout(TensorLayout("HW"));
+ ptr->Resize({{}, {}});
}
ws.AddInput(in[0]);

auto result_shape = PropagateShapes<CPUBackend>(expr_ref, ws, 2);
- auto expected_shpe = TensorListShape<>{{1}, {1}};
+ auto expected_shpe = TensorListShape<>{{}, {}};
EXPECT_EQ(result_shape, expected_shpe);
}

@@ -514,7 +514,7 @@ class ArithmeticOpsScalarTest : public ::testing::TestWithParam<shape_sequence>
for (int tensor_idx = 0; tensor_idx < result_shape.num_samples(); tensor_idx++) {
for (int j = 0; j < result_shape[tensor_idx].num_elements(); j++) {
auto is_scalar = [] (auto &shape, int tensor_idx) {
- return shape[tensor_idx] == TensorShape<>{1};
+ return volume(shape[tensor_idx]) == 1;
};
int expected = data0[offset_in[0] + (is_scalar(s[0], tensor_idx) ? 0 : j)] +
data1[offset_in[1] + (is_scalar(s[1], tensor_idx) ? 0 : j)];
@@ -541,9 +541,12 @@ namespace {

std::array<TensorListShape<>, 3> GetShapesForSequence(int batch_size, int left_elems,
int right_elems) {
- return {uniform_list_shape(batch_size, {left_elems}),
- uniform_list_shape(batch_size, {right_elems}),
- uniform_list_shape(batch_size, {std::max(left_elems, right_elems)})};
+ auto GetTensorOrScalar = [=](int elems) {
+   return elems != 1 ? uniform_list_shape(batch_size, {elems}) : TensorListShape<0>(batch_size);
+ };
+ return {GetTensorOrScalar(left_elems),
+         GetTensorOrScalar(right_elems),
+         GetTensorOrScalar(std::max(left_elems, right_elems))};
}

/**
2 changes: 1 addition & 1 deletion dali/operators/random/coin_flip.h
@@ -43,7 +43,7 @@ class CoinFlip : public Operator<CPUBackend> {

bool SetupImpl(std::vector<OutputDesc> &output_desc, const HostWorkspace &ws) override {
output_desc.resize(1);
- output_desc[0].shape = uniform_list_shape(batch_size_, {1});
+ output_desc[0].shape = TensorListShape<0>(batch_size_);
output_desc[0].type = TypeTable::GetTypeInfo(DALI_INT32);
return true;
}
2 changes: 1 addition & 1 deletion dali/operators/random/normal_distribution_op.h
@@ -120,7 +120,7 @@ class NormalDistribution : public Operator<Backend> {


TensorListShape<> ShapeForDefaultConfig(const workspace_t<Backend> &ws) {
- return uniform_list_shape(batch_size_, {1});
+ return TensorListShape<0>(batch_size_);
}


2 changes: 1 addition & 1 deletion dali/operators/random/uniform.h
@@ -32,7 +32,7 @@ class Uniform : public Operator<CPUBackend> {
GetSingleOrRepeatedArg(spec, range, "range", 2);
dis_ = std::uniform_real_distribution<float>(range[0], range[1]);

- std::vector<int> shape_arg{1};
+ std::vector<int> shape_arg{};
if (spec.HasArgument("shape"))
shape_arg = spec.GetRepeatedArgument<int>("shape");
shape_ = std::vector<int64_t>{std::begin(shape_arg), std::end(shape_arg)};
1 change: 0 additions & 1 deletion dali/operators/reader/loader/coco_loader.cc
@@ -227,7 +227,6 @@ void parse_annotations(
parser.NextArrayValue();
annotation.rle_.w_ = parser.GetInt();
parser.NextArrayValue();
- RAPIDJSON_ASSERT(parser.PeekType() == -1);
} else if (0 == std::strcmp(another_key, "counts")) {
annotation.rle_.rle_ = parser.GetString();
}
2 changes: 1 addition & 1 deletion dali/operators/reader/nemo_asr_reader_op.cc
@@ -163,7 +163,7 @@ void NemoAsrReader::RunImpl(SampleWorkspace &ws) {
int next_out_idx = 1;
if (read_sr_) {
auto &sample_rate = ws.Output<CPUBackend>(next_out_idx++);
- sample_rate.Resize({1});
+ sample_rate.Resize({});
sample_rate.set_type(TypeTable::GetTypeInfo(DALI_FLOAT));
sample_rate.mutable_data<float>()[0] = sample.audio_meta().sample_rate;
sample_rate.SetMeta(meta);
2 changes: 1 addition & 1 deletion dali/operators/signal/decibel/to_decibels_op_gpu.cu
@@ -94,7 +94,7 @@ void ToDecibelsImpl<T>::RunImpl(workspace_t<GPUBackend> &ws) {
if (args_.ref_max) {
max_out_.set_type(max_out_desc_[0].type);
max_out_.Resize(max_out_desc_[0].shape);
- auto max_values_view = view<T, 1>(max_out_);
+ auto max_values_view = view<T, 0>(max_out_);
kmgr_max_.Run<MaxKernel>(0, 0, ctx, max_values_view, in_view);
kmgr_todb_.Run<ToDecibelsKernel>(0, 0, ctx, out_view, in_view, args_, max_values_view);
} else {
6 changes: 3 additions & 3 deletions dali/pipeline/data/tensor_list_test.cc
@@ -270,11 +270,11 @@ TYPED_TEST(TensorListTest, TestMultipleZeroSizeResize) {
}
}

- TYPED_TEST(TensorListTest, TestScalarResize) {
+ TYPED_TEST(TensorListTest, TestFakeScalarResize) {
TensorList<TypeParam> tensor_list;

int num_scalar = this->RandInt(1, 128);
- auto shape = uniform_list_shape(num_scalar, {1});
+ auto shape = uniform_list_shape(num_scalar, {1});  // {1} on purpose
tensor_list.Resize(shape);

ASSERT_NE(tensor_list.template mutable_data<float>(), nullptr);
@@ -283,7 +283,7 @@ TYPED_TEST(TensorListTest, TestScalarResize) {
ASSERT_FALSE(tensor_list.shares_data());

for (int i = 0; i < num_scalar; ++i) {
- ASSERT_EQ(tensor_list.tensor_shape(i), TensorShape<>{1});
+ ASSERT_EQ(tensor_list.tensor_shape(i), TensorShape<>{1});  // {1} on purpose
ASSERT_EQ(tensor_list.tensor_offset(i), i);
}
}
6 changes: 3 additions & 3 deletions dali/pipeline/executor/executor.h
@@ -508,7 +508,7 @@ void Executor<WorkspacePolicy, QueuePolicy>::RunCPU() {

// Run the cpu-ops in the thread
// Process each CPU Op in batch
- for (int cpu_op_id = 0; cpu_op_id < graph_->NumOp(OpType::CPU); ++cpu_op_id) {
+ for (int cpu_op_id = 0; cpu_op_id < graph_->NumOp(OpType::CPU) && !exec_error_; ++cpu_op_id) {
OpNode &op_node = graph_->Node(OpType::CPU, cpu_op_id);
typename WorkspacePolicy::template ws_t<OpType::CPU> ws =
WorkspacePolicy::template GetWorkspace<OpType::CPU>(cpu_idxs, *graph_, cpu_op_id);
@@ -554,7 +554,7 @@ void Executor<WorkspacePolicy, QueuePolicy>::RunMixed() {

CUDA_CALL(cudaEventSynchronize(mixed_stage_event_));

- for (int i = 0; i < graph_->NumOp(OpType::MIXED); ++i) {
+ for (int i = 0; i < graph_->NumOp(OpType::MIXED) && !exec_error_; ++i) {
OpNode &op_node = graph_->Node(OpType::MIXED, i);
try {
typename WorkspacePolicy::template ws_t<OpType::MIXED> ws =
@@ -615,7 +615,7 @@ void Executor<WorkspacePolicy, QueuePolicy>::RunGPU() {
// iterations of a stage of the pipeline.
CUDA_CALL(cudaEventSynchronize(gpu_stage_event_));

- for (int i = 0; i < graph_->NumOp(OpType::GPU); ++i) {
+ for (int i = 0; i < graph_->NumOp(OpType::GPU) && !exec_error_; ++i) {
OpNode &op_node = graph_->Node(OpType::GPU, i);
try {
typename WorkspacePolicy::template ws_t<OpType::GPU> ws =
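The three executor hunks apply one bug fix: once exec_error_ is set by a failing operator, the remaining operators of that stage are no longer launched. Reduced to a self-contained sketch with placeholder names (this is not DALI's actual executor code):

#include <cstddef>
#include <cstdio>
#include <exception>
#include <functional>
#include <vector>

// Placeholder sketch of the guard added to RunCPU/RunMixed/RunGPU: the loop
// condition checks an error flag so a failure stops further scheduling.
void RunStage(const std::vector<std::function<void()>> &ops) {
  bool exec_error = false;  // plays the role of Executor's exec_error_ member
  for (std::size_t i = 0; i < ops.size() && !exec_error; ++i) {
    try {
      ops[i]();  // run one operator
    } catch (const std::exception &e) {
      exec_error = true;  // skip the remaining operators in this stage
      std::fprintf(stderr, "operator %zu failed: %s\n", i, e.what());
    }
  }
}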