Rework DeviceGuard to restore original context upon the exit #882

Merged 1 commit on May 29, 2019
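What the change amounts to: `DeviceGuard` snapshots the current CUDA driver context in its constructor and restores that exact context in its destructor, so guarded code can switch devices freely without disturbing the caller. A minimal usage sketch (the helper below is hypothetical, not part of this diff):

```cpp
#include "dali/core/device_guard.h"

// Hypothetical caller: runs work on device 1, then falls back to whatever
// context (and device) was current before the call.
void work_on_device_1() {
  dali::DeviceGuard g(1);        // saves the current context, then cudaSetDevice(1)
  // ... launch kernels / allocate on device 1 ...
}                                // ~DeviceGuard() restores the saved context
```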
49 changes: 49 additions & 0 deletions dali/core/device_guard.cc
@@ -0,0 +1,49 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "dali/core/device_guard.h"
#include "dali/core/error_handling.h"

namespace dali {

DeviceGuard::DeviceGuard() :
old_context_(NULL) {
DALI_ENFORCE(cuInitChecked(),
"Failed to load libcuda.so. "
"Check your library paths and if the driver is installed correctly.");
CUDA_CALL(cuCtxGetCurrent(&old_context_));
}

DeviceGuard::DeviceGuard(int new_device) :
old_context_(NULL) {
if (new_device >= 0) {
DALI_ENFORCE(cuInitChecked(),
"Failed to load libcuda.so. "
"Check your library paths and if the driver is installed correctly.");
CUDA_CALL(cuCtxGetCurrent(&old_context_));
CUDA_CALL(cudaSetDevice(new_device));
}
}

DeviceGuard::~DeviceGuard() {
if (old_context_ != NULL) {
CUresult err = cuCtxSetCurrent(old_context_);
if (err != CUDA_SUCCESS) {
std::cerr << "Failed to recover from DeviceGuard: " << err << std::endl;
std::terminate();
}
}
}

} // namespace dali
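A point worth noting about the implementation above, and the reason the tests below bother to create a context with `cuCtxCreate`: the guard saves and restores a `CUcontext` rather than re-issuing `cudaSetDevice`. `cudaSetDevice` binds the thread to the device's primary context, which would clobber any non-primary context the caller had made current; `cuCtxSetCurrent` puts back exactly what was there. A condensed sketch of that guarantee, in the style of the tests that follow (my reading of the code, not text from the PR):

```cpp
TEST(DeviceGuard, RestoresExactContext) {  // hypothetical test, condensed from Checkcontext below
  CUcontext before, after;
  ASSERT_EQ(cuCtxGetCurrent(&before), CUDA_SUCCESS);
  {
    DeviceGuard g(0);                      // internally calls cudaSetDevice(0)
  }
  ASSERT_EQ(cuCtxGetCurrent(&after), CUDA_SUCCESS);
  EXPECT_EQ(before, after);                // same CUcontext, not merely the same device
}
```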
145 changes: 145 additions & 0 deletions dali/core/device_guard_test.cc
@@ -0,0 +1,145 @@
// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <gtest/gtest.h>
#include "dali/core/dynlink_cuda.h"
#include "dali/core/cuda_utils.h"
#include "dali/core/device_guard.h"

namespace dali {

TEST(DeviceGuard, ConstructorWithDevice) {
int test_device = 0;
int guard_device = 0;
int current_device;
int count = 1;

EXPECT_EQ(cuInitChecked(), true);
EXPECT_EQ(cudaGetDeviceCount(&count), cudaSuccess);
if (count > 1) {
guard_device = 1;
}

EXPECT_EQ(cudaSetDevice(test_device), cudaSuccess);
EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
EXPECT_EQ(current_device, test_device);
{
DeviceGuard g(guard_device);
EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
EXPECT_EQ(current_device, guard_device);
}
EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
EXPECT_EQ(current_device, test_device);
}

TEST(DeviceGuard, ConstructorNoArgs) {
int test_device = 0;
int guard_device = 0;
int current_device;
int count = 1;

EXPECT_EQ(cuInitChecked(), true);
EXPECT_EQ(cudaGetDeviceCount(&count), cudaSuccess);
if (count > 1) {
guard_device = 1;
}

EXPECT_EQ(cudaSetDevice(test_device), cudaSuccess);
EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
EXPECT_EQ(current_device, test_device);
{
DeviceGuard g;
EXPECT_EQ(cudaSetDevice(guard_device), cudaSuccess);
EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
EXPECT_EQ(current_device, guard_device);
}
EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
EXPECT_EQ(current_device, test_device);
}

TEST(DeviceGuard, Checkcontext) {
int test_device = 0;
CUdevice cu_test_device;
CUcontext cu_test_ctx;
int guard_device = 0;
int current_device;
CUdevice cu_current_device;
CUcontext cu_current_ctx;
int count = 1;

EXPECT_EQ(cuInitChecked(), true);
EXPECT_EQ(cudaGetDeviceCount(&count), cudaSuccess);
if (count > 1) {
guard_device = 1;
}

EXPECT_EQ(cuDeviceGet(&cu_test_device, test_device), CUDA_SUCCESS);
EXPECT_EQ(cuCtxCreate(&cu_test_ctx, 0, cu_test_device), CUDA_SUCCESS);
EXPECT_EQ(cuCtxSetCurrent(cu_test_ctx), CUDA_SUCCESS);
EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
EXPECT_EQ(cuCtxGetDevice(&cu_current_device), CUDA_SUCCESS);
EXPECT_EQ(cu_current_ctx, cu_test_ctx);
EXPECT_EQ(cu_current_device, cu_test_device);
{
DeviceGuard g(guard_device);
EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
EXPECT_EQ(current_device, guard_device);
EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
EXPECT_NE(cu_current_ctx, cu_test_ctx);
}
EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
EXPECT_EQ(cuCtxGetDevice(&cu_current_device), CUDA_SUCCESS);
EXPECT_EQ(cu_current_ctx, cu_test_ctx);
EXPECT_EQ(cu_current_device, cu_test_device);
cuCtxDestroy(cu_test_ctx);
}

TEST(DeviceGuard, CheckcontextNoArgs) {
int test_device = 0;
CUdevice cu_test_device;
CUcontext cu_test_ctx;
int guard_device = 0;
int current_device;
CUdevice cu_current_device;
CUcontext cu_current_ctx;
int count = 1;

EXPECT_EQ(cuInitChecked(), true);
EXPECT_EQ(cudaGetDeviceCount(&count), cudaSuccess);
if (count > 1) {
guard_device = 1;
}

EXPECT_EQ(cuDeviceGet(&cu_test_device, test_device), CUDA_SUCCESS);
EXPECT_EQ(cuCtxCreate(&cu_test_ctx, 0, cu_test_device), CUDA_SUCCESS);
EXPECT_EQ(cuCtxSetCurrent(cu_test_ctx), CUDA_SUCCESS);
EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
EXPECT_EQ(cuCtxGetDevice(&cu_current_device), CUDA_SUCCESS);
EXPECT_EQ(cu_current_ctx, cu_test_ctx);
EXPECT_EQ(cu_current_device, cu_test_device);
{
DeviceGuard g;
EXPECT_EQ(cudaSetDevice(guard_device), cudaSuccess);
EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
EXPECT_NE(cu_current_ctx, cu_test_ctx);
}
EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
EXPECT_EQ(cuCtxGetDevice(&cu_current_device), CUDA_SUCCESS);
EXPECT_EQ(cu_current_ctx, cu_test_ctx);
EXPECT_EQ(cu_current_device, cu_test_device);
cuCtxDestroy(cu_test_ctx);
}

} // namespace dali
17 changes: 5 additions & 12 deletions dali/pipeline/data/buffer.h
@@ -27,6 +27,7 @@
#include "dali/core/error_handling.h"
#include "dali/pipeline/data/types.h"
#include "dali/core/util.h"
+#include "dali/core/device_guard.h"

namespace dali {

@@ -234,6 +235,8 @@ class Buffer {
    // re-allocating: get the device
    if (std::is_same<Backend, GPUBackend>::value) {
      CUDA_CALL(cudaGetDevice(&device_));
+    } else {
+      device_ = -1;
    }

    data_.reset();
@@ -261,19 +264,9 @@ class Buffer {

 protected:
  static void FreeMemory(void *ptr, size_t bytes, int device, bool pinned) {
-    // change to correct device for deletion
-    // Note: Can't use device guard due to potentially not GPUBackend.
-    int current_device = 0;
-    if (std::is_same<Backend, GPUBackend>::value) {
-      CUDA_CALL(cudaGetDevice(&current_device));
-      CUDA_CALL(cudaSetDevice(device));
-    }
+    // for device == -1 it is noop
+    DeviceGuard g(device);
    Backend::Delete(ptr, bytes, pinned);
-
-    // reset to original calling device for consistency
-    if (std::is_same<Backend, GPUBackend>::value) {
-      CUDA_CALL(cudaSetDevice(current_device));
-    }
  }

  // Helper to resize the underlying allocation
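The replacement works for both backends because of the comment above: `DeviceGuard(int)` does nothing for a negative id, so CPU buffers (which now record `device_ = -1`) pay no CUDA cost and need no `GPUBackend` special-casing. A small check of that contract, again in the style of the test file (a sketch I added, not part of the PR):

```cpp
TEST(DeviceGuard, NegativeDeviceIsNoop) {  // hypothetical test
  int before = -1, after = -1;
  EXPECT_EQ(cudaGetDevice(&before), cudaSuccess);
  {
    dali::DeviceGuard g(-1);  // constructor body is skipped for new_device < 0
  }
  EXPECT_EQ(cudaGetDevice(&after), cudaSuccess);
  EXPECT_EQ(before, after);   // no device switch, and the destructor restored nothing
}
```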
1 change: 1 addition & 0 deletions dali/pipeline/data/tensor_list.h
@@ -200,6 +200,7 @@ class DLL_PUBLIC TensorList : public Buffer<Backend> {
    shape_.clear();
    offsets_.clear();
    size_ = 0;
+    device_ = -1;

// Tensor view of this TensorList is no longer valid
if (tensor_view_) {
42 changes: 26 additions & 16 deletions dali/pipeline/operators/decoder/nvjpeg_decoder_decoupled_api.h
@@ -28,6 +28,7 @@
#include "dali/util/ocv.h"
#include "dali/image/image_factory.h"
#include "dali/pipeline/util/thread_pool.h"
+#include "dali/core/device_guard.h"

namespace dali {

@@ -53,6 +54,7 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
    device_buffer_(num_threads_),
    streams_(num_threads_),
    events_(num_threads_ * 2),
+    device_id_(spec.GetArgument<int>("device_id")),
thread_pool_(num_threads_,
spec.GetArgument<int>("device_id"),
true /* pin threads */) {
@@ -109,24 +111,31 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
      // Other images being processed might fail, but we don't care
    }

-    for (int i = 0; i < batch_size_; ++i) {
-      NVJPEG_CALL(nvjpegJpegStateDestroy(decoder_host_state_[i]));
-      NVJPEG_CALL(nvjpegJpegStateDestroy(decoder_huff_hybrid_state_[i]));
-    }
-    NVJPEG_CALL(nvjpegDecoderDestroy(decoder_huff_host_));
-    NVJPEG_CALL(nvjpegDecoderDestroy(decoder_huff_hybrid_));
-    for (int i = 0; i < num_threads_ * 2; ++i) {
-      NVJPEG_CALL(nvjpegJpegStreamDestroy(jpeg_streams_[i]));
-      NVJPEG_CALL(nvjpegBufferPinnedDestroy(pinned_buffer_[i]));
-      CUDA_CALL(cudaEventDestroy(events_[i]));
-    }
-
-    for (int i = 0; i < num_threads_; ++i) {
-      NVJPEG_CALL(nvjpegBufferDeviceDestroy(device_buffer_[i]));
-      CUDA_CALL(cudaStreamDestroy(streams_[i]));
-    }
-    CUDA_CALL(cudaEventDestroy(master_event_));
-    NVJPEG_CALL(nvjpegDestroy(handle_));
+    try {
+      DeviceGuard g(device_id_);
+      for (int i = 0; i < batch_size_; ++i) {
+        NVJPEG_CALL(nvjpegJpegStateDestroy(decoder_host_state_[i]));
+        NVJPEG_CALL(nvjpegJpegStateDestroy(decoder_huff_hybrid_state_[i]));
+      }
+      NVJPEG_CALL(nvjpegDecoderDestroy(decoder_huff_host_));
+      NVJPEG_CALL(nvjpegDecoderDestroy(decoder_huff_hybrid_));
+      for (int i = 0; i < num_threads_ * 2; ++i) {
+        NVJPEG_CALL(nvjpegJpegStreamDestroy(jpeg_streams_[i]));
+        NVJPEG_CALL(nvjpegBufferPinnedDestroy(pinned_buffer_[i]));
+        CUDA_CALL(cudaEventDestroy(events_[i]));
+      }
+
+      for (int i = 0; i < num_threads_; ++i) {
+        NVJPEG_CALL(nvjpegBufferDeviceDestroy(device_buffer_[i]));
+        CUDA_CALL(cudaStreamDestroy(streams_[i]));
+      }
+      CUDA_CALL(cudaEventDestroy(master_event_));
+      NVJPEG_CALL(nvjpegDestroy(handle_));
+    } catch (const std::exception &e) {
+      // If destroying nvJPEG resources failed we are leaking something so terminate
+      std::cerr << "Fatal error: exception in ~nvJPEGDecoder():\n" << e.what() << std::endl;
+      std::terminate();
+    }
  }

using dali::OperatorBase::Run;
@@ -363,6 +372,7 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
  std::vector<cudaEvent_t> events_;

  cudaEvent_t master_event_;
+  int device_id_;

ThreadPool thread_pool_;
};
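The destructor rework above is an instance of a general pattern: teardown can run on a thread whose current device is not the one that owns the resources, so the device is pinned for the whole cleanup and any failure is escalated to `std::terminate` instead of being thrown from a destructor. Stripped of the DALI specifics, the shape is roughly as follows (`SomeGpuResource` and `release_all` are illustrative names, not DALI APIs):

```cpp
#include <exception>
#include <iostream>
#include "dali/core/device_guard.h"

class SomeGpuResource {             // illustrative, not a DALI class
 public:
  ~SomeGpuResource() {
    try {
      dali::DeviceGuard g(device_id_);  // pin the owning device for all cleanup
      release_all();                    // wrapped CUDA/nvJPEG calls may throw
    } catch (const std::exception &e) {
      // A failed cleanup means leaked GPU state; stopping hard beats limping on.
      std::cerr << "Fatal error in ~SomeGpuResource(): " << e.what() << std::endl;
      std::terminate();
    }
  }

 private:
  void release_all() { /* destroy handles, streams, events owned on device_id_ */ }
  int device_id_ = 0;
};
```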
7 changes: 6 additions & 1 deletion dali/pipeline/operators/decoder/nvjpeg_decoder_gpu.h
@@ -22,6 +22,7 @@
#include "dali/pipeline/operators/operator.h"
#include "dali/pipeline/operators/decoder/nvjpeg_helper.h"
#include "dali/util/ocv.h"
+#include "dali/core/device_guard.h"

namespace dali {

@@ -31,7 +32,8 @@ class nvJPEGDecoderGPUStage : public Operator<MixedBackend> {
 public:
  explicit nvJPEGDecoderGPUStage(const OpSpec& spec) :
    Operator<MixedBackend>(spec),
-    output_image_type_(spec.GetArgument<DALIImageType>("output_type")) {
+    output_image_type_(spec.GetArgument<DALIImageType>("output_type")),
+    device_id_(spec.GetArgument<int>("device_id")) {
NVJPEG_CALL(nvjpegCreateSimple(&handle_));

NVJPEG_CALL(nvjpegDecoderCreate(handle_, NVJPEG_BACKEND_HYBRID, &decoder_host_));
@@ -47,6 +49,7 @@ class nvJPEGDecoderGPUStage : public Operator<MixedBackend> {
  }

  ~nvJPEGDecoderGPUStage() noexcept(false) {
+    DeviceGuard g(device_id_);
NVJPEG_CALL(nvjpegBufferDeviceDestroy(device_buffer_));
NVJPEG_CALL(nvjpegDecoderDestroy(decoder_host_));
NVJPEG_CALL(nvjpegDecoderDestroy(decoder_hybrid_));
@@ -157,6 +160,8 @@ class nvJPEGDecoderGPUStage : public Operator<MixedBackend> {
  nvjpegJpegDecoder_t decoder_hybrid_;

  nvjpegBufferDevice_t device_buffer_;
+
+  int device_id_;
};


4 changes: 2 additions & 2 deletions dali/pipeline/operators/optical_flow/optical_flow.cc
@@ -67,7 +67,7 @@ void OpticalFlow<GPUBackend>::RunImpl(Workspace<GPUBackend> *ws, const int) {
  // Extract calculation params
  ExtractParams(input, hints);

-  of_lazy_init(frames_width_, frames_height_, depth_, image_type_, ws->stream());
+  of_lazy_init(frames_width_, frames_height_, depth_, image_type_, device_id_, ws->stream());

std::vector<Dims> new_sizes;
auto out_shape = optical_flow_->GetOutputShape();
@@ -109,7 +109,7 @@ void OpticalFlow<GPUBackend>::RunImpl(Workspace<GPUBackend> *ws, const int) {
  // Extract calculation params
  ExtractParams(input);

-  of_lazy_init(frames_width_, frames_height_, depth_, image_type_, ws->stream());
+  of_lazy_init(frames_width_, frames_height_, depth_, image_type_, device_id_, ws->stream());

std::vector<Dims> new_sizes;
auto out_shape = optical_flow_->GetOutputShape();