Rework DeviceGuard to restore the original context on exit (#882)
- Some libraries do not use the primary context, while DALI does. When
  cudaSetDevice is called, the primary context is created and made
  current; the previously current context is lost, even though other
  code may still need it. DeviceGuard now saves the current context
  and restores it when the guard is destroyed.
- Removed CUContext, as it is no longer needed and can be replaced by
  DeviceGuard.

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
JanuszL committed May 29, 2019
1 parent 6f3c70f commit f7eb22b
Showing 24 changed files with 347 additions and 267 deletions.
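For orientation before the diff, here is a minimal usage sketch of the reworked guard (the enclosing function and its body are hypothetical; only DeviceGuard itself comes from this commit):

#include "dali/core/device_guard.h"

// Hypothetical caller: do work on a chosen device without disturbing
// the calling thread's CUDA context.
void ScopedDeviceWork(int device_id) {
  dali::DeviceGuard g(device_id);  // saves the current context, then calls cudaSetDevice(device_id)
  // ... launch kernels / allocate memory on device_id here ...
}  // ~DeviceGuard restores the saved context, even a non-primary one
   // created by another library with cuCtxCreate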
49 changes: 49 additions & 0 deletions dali/core/device_guard.cc
@@ -0,0 +1,49 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "dali/core/device_guard.h"
#include "dali/core/error_handling.h"

namespace dali {

DeviceGuard::DeviceGuard() :
  old_context_(NULL) {
  DALI_ENFORCE(cuInitChecked(),
               "Failed to load libcuda.so. "
               "Check your library paths and if the driver is installed correctly.");
  CUDA_CALL(cuCtxGetCurrent(&old_context_));
}

DeviceGuard::DeviceGuard(int new_device) :
  old_context_(NULL) {
  if (new_device >= 0) {
    DALI_ENFORCE(cuInitChecked(),
                 "Failed to load libcuda.so. "
                 "Check your library paths and if the driver is installed correctly.");
    CUDA_CALL(cuCtxGetCurrent(&old_context_));
    CUDA_CALL(cudaSetDevice(new_device));
  }
}

DeviceGuard::~DeviceGuard() {
  if (old_context_ != NULL) {
    CUresult err = cuCtxSetCurrent(old_context_);
    if (err != CUDA_SUCCESS) {
      std::cerr << "Failed to recover from DeviceGuard: " << err << std::endl;
      std::terminate();
    }
  }
}

} // namespace dali
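The matching header is not shown in this diff; reconstructed from the definitions above, it presumably looks roughly like this (the member name old_context_ is taken from the .cc, the rest is a sketch, not the verbatim DALI header):

#include <cuda.h>  // CUcontext

namespace dali {

class DeviceGuard {
 public:
  // Saves the current CUDA context; restores it on destruction.
  DeviceGuard();
  // Saves the current context, then switches to new_device via
  // cudaSetDevice; a negative new_device makes the guard a no-op.
  explicit DeviceGuard(int new_device);
  // Restores the saved context; terminates the process on failure.
  ~DeviceGuard();

 private:
  CUcontext old_context_;
};

}  // namespace dali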
145 changes: 145 additions & 0 deletions dali/core/device_guard_test.cc
@@ -0,0 +1,145 @@
// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <gtest/gtest.h>
#include "dali/core/dynlink_cuda.h"
#include "dali/core/cuda_utils.h"
#include "dali/core/device_guard.h"

namespace dali {

TEST(DeviceGuard, ConstructorWithDevice) {
  int test_device = 0;
  int guard_device = 0;
  int current_device;
  int count = 1;

  EXPECT_EQ(cuInitChecked(), true);
  EXPECT_EQ(cudaGetDeviceCount(&count), cudaSuccess);
  if (count > 1) {
    guard_device = 1;
  }

  EXPECT_EQ(cudaSetDevice(test_device), cudaSuccess);
  EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
  EXPECT_EQ(current_device, test_device);
  {
    DeviceGuard g(guard_device);
    EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
    EXPECT_EQ(current_device, guard_device);
  }
  EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
  EXPECT_EQ(current_device, test_device);
}

TEST(DeviceGuard, ConstructorNoArgs) {
  int test_device = 0;
  int guard_device = 0;
  int current_device;
  int count = 1;

  EXPECT_EQ(cuInitChecked(), true);
  EXPECT_EQ(cudaGetDeviceCount(&count), cudaSuccess);
  if (count > 1) {
    guard_device = 1;
  }

  EXPECT_EQ(cudaSetDevice(test_device), cudaSuccess);
  EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
  EXPECT_EQ(current_device, test_device);
  {
    DeviceGuard g;
    EXPECT_EQ(cudaSetDevice(guard_device), cudaSuccess);
    EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
    EXPECT_EQ(current_device, guard_device);
  }
  EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
  EXPECT_EQ(current_device, test_device);
}

TEST(DeviceGuard, Checkcontext) {
  int test_device = 0;
  CUdevice cu_test_device;
  CUcontext cu_test_ctx;
  int guard_device = 0;
  int current_device;
  CUdevice cu_current_device;
  CUcontext cu_current_ctx;
  int count = 1;

  EXPECT_EQ(cuInitChecked(), true);
  EXPECT_EQ(cudaGetDeviceCount(&count), cudaSuccess);
  if (count > 1) {
    guard_device = 1;
  }

  EXPECT_EQ(cuDeviceGet(&cu_test_device, test_device), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxCreate(&cu_test_ctx, 0, cu_test_device), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxSetCurrent(cu_test_ctx), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxGetDevice(&cu_current_device), CUDA_SUCCESS);
  EXPECT_EQ(cu_current_ctx, cu_test_ctx);
  EXPECT_EQ(cu_current_device, cu_test_device);
  {
    DeviceGuard g(guard_device);
    EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
    EXPECT_EQ(current_device, guard_device);
    EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
    EXPECT_NE(cu_current_ctx, cu_test_ctx);
  }
  EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxGetDevice(&cu_current_device), CUDA_SUCCESS);
  EXPECT_EQ(cu_current_ctx, cu_test_ctx);
  EXPECT_EQ(cu_current_device, cu_test_device);
  cuCtxDestroy(cu_test_ctx);
}

TEST(DeviceGuard, CheckcontextNoArgs) {
  int test_device = 0;
  CUdevice cu_test_device;
  CUcontext cu_test_ctx;
  int guard_device = 0;
  int current_device;
  CUdevice cu_current_device;
  CUcontext cu_current_ctx;
  int count = 1;

  EXPECT_EQ(cuInitChecked(), true);
  EXPECT_EQ(cudaGetDeviceCount(&count), cudaSuccess);
  if (count > 1) {
    guard_device = 1;
  }

  EXPECT_EQ(cuDeviceGet(&cu_test_device, test_device), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxCreate(&cu_test_ctx, 0, cu_test_device), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxSetCurrent(cu_test_ctx), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxGetDevice(&cu_current_device), CUDA_SUCCESS);
  EXPECT_EQ(cu_current_ctx, cu_test_ctx);
  EXPECT_EQ(cu_current_device, cu_test_device);
  {
    DeviceGuard g;
    EXPECT_EQ(cudaSetDevice(guard_device), cudaSuccess);
    EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
    EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
    EXPECT_NE(cu_current_ctx, cu_test_ctx);
  }
  EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxGetDevice(&cu_current_device), CUDA_SUCCESS);
  EXPECT_EQ(cu_current_ctx, cu_test_ctx);
  EXPECT_EQ(cu_current_device, cu_test_device);
  cuCtxDestroy(cu_test_ctx);
}

} // namespace dali
17 changes: 5 additions & 12 deletions dali/pipeline/data/buffer.h
@@ -27,6 +27,7 @@
 #include "dali/core/error_handling.h"
 #include "dali/pipeline/data/types.h"
 #include "dali/core/util.h"
+#include "dali/core/device_guard.h"
 
 namespace dali {
 
@@ -234,6 +235,8 @@ class Buffer {
     // re-allocating: get the device
     if (std::is_same<Backend, GPUBackend>::value) {
       CUDA_CALL(cudaGetDevice(&device_));
+    } else {
+      device_ = -1;
     }
 
     data_.reset();
@@ -261,19 +264,9 @@
 
  protected:
   static void FreeMemory(void *ptr, size_t bytes, int device, bool pinned) {
-    // change to correct device for deletion
-    // Note: Can't use device guard due to potentially not GPUBackend.
-    int current_device = 0;
-    if (std::is_same<Backend, GPUBackend>::value) {
-      CUDA_CALL(cudaGetDevice(&current_device));
-      CUDA_CALL(cudaSetDevice(device));
-    }
+    // for device == -1 it is a no-op
+    DeviceGuard g(device);
     Backend::Delete(ptr, bytes, pinned);
-
-    // reset to original calling device for consistency
-    if (std::is_same<Backend, GPUBackend>::value) {
-      CUDA_CALL(cudaSetDevice(current_device));
-    }
   }
 
   // Helper to resize the underlying allocation
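Why the GPUBackend special-casing can go away: Reset() above now records device_ = -1 for non-GPU buffers, and, per the constructor shown earlier, DeviceGuard(-1) makes no CUDA calls at all. A small illustration (hypothetical function, not DALI code):

void CleanupExample() {
  DeviceGuard cpu_noop(-1);  // device == -1: no cuInit, no context save, no cudaSetDevice
  DeviceGuard gpu_guard(0);  // saves the current context and switches to device 0
}  // gpu_guard restores the saved context on scope exit; cpu_noop does nothing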
1 change: 1 addition & 0 deletions dali/pipeline/data/tensor_list.h
@@ -200,6 +200,7 @@ class DLL_PUBLIC TensorList : public Buffer<Backend> {
     shape_.clear();
     offsets_.clear();
     size_ = 0;
+    device_ = -1;
 
     // Tensor view of this TensorList is no longer valid
     if (tensor_view_) {
42 changes: 26 additions & 16 deletions dali/pipeline/operators/decoder/nvjpeg_decoder_decoupled_api.h
@@ -28,6 +28,7 @@
 #include "dali/util/ocv.h"
 #include "dali/image/image_factory.h"
 #include "dali/pipeline/util/thread_pool.h"
+#include "dali/core/device_guard.h"
 
 namespace dali {
 
@@ -53,6 +54,7 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
       device_buffer_(num_threads_),
       streams_(num_threads_),
       events_(num_threads_ * 2),
+      device_id_(spec.GetArgument<int>("device_id")),
       thread_pool_(num_threads_,
                    spec.GetArgument<int>("device_id"),
                    true /* pin threads */) {
@@ -109,24 +111,31 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
       // As other images being processed might fail, but we don't care
     }
 
-    for (int i = 0; i < batch_size_; ++i) {
-      NVJPEG_CALL(nvjpegJpegStateDestroy(decoder_host_state_[i]));
-      NVJPEG_CALL(nvjpegJpegStateDestroy(decoder_huff_hybrid_state_[i]));
-    }
-    NVJPEG_CALL(nvjpegDecoderDestroy(decoder_huff_host_));
-    NVJPEG_CALL(nvjpegDecoderDestroy(decoder_huff_hybrid_));
-    for (int i = 0; i < num_threads_ * 2; ++i) {
-      NVJPEG_CALL(nvjpegJpegStreamDestroy(jpeg_streams_[i]));
-      NVJPEG_CALL(nvjpegBufferPinnedDestroy(pinned_buffer_[i]));
-      CUDA_CALL(cudaEventDestroy(events_[i]));
-    }
-
-    for (int i = 0; i < num_threads_; ++i) {
-      NVJPEG_CALL(nvjpegBufferDeviceDestroy(device_buffer_[i]));
-      CUDA_CALL(cudaStreamDestroy(streams_[i]));
-    }
-    CUDA_CALL(cudaEventDestroy(master_event_));
-    NVJPEG_CALL(nvjpegDestroy(handle_));
+    try {
+      DeviceGuard g(device_id_);
+      for (int i = 0; i < batch_size_; ++i) {
+        NVJPEG_CALL(nvjpegJpegStateDestroy(decoder_host_state_[i]));
+        NVJPEG_CALL(nvjpegJpegStateDestroy(decoder_huff_hybrid_state_[i]));
+      }
+      NVJPEG_CALL(nvjpegDecoderDestroy(decoder_huff_host_));
+      NVJPEG_CALL(nvjpegDecoderDestroy(decoder_huff_hybrid_));
+      for (int i = 0; i < num_threads_ * 2; ++i) {
+        NVJPEG_CALL(nvjpegJpegStreamDestroy(jpeg_streams_[i]));
+        NVJPEG_CALL(nvjpegBufferPinnedDestroy(pinned_buffer_[i]));
+        CUDA_CALL(cudaEventDestroy(events_[i]));
+      }
+
+      for (int i = 0; i < num_threads_; ++i) {
+        NVJPEG_CALL(nvjpegBufferDeviceDestroy(device_buffer_[i]));
+        CUDA_CALL(cudaStreamDestroy(streams_[i]));
+      }
+      CUDA_CALL(cudaEventDestroy(master_event_));
+      NVJPEG_CALL(nvjpegDestroy(handle_));
+    } catch (const std::exception &e) {
+      // If destroying nvJPEG resources failed, we are leaking something, so terminate
+      std::cerr << "Fatal error: exception in ~nvJPEGDecoder():\n" << e.what() << std::endl;
+      std::terminate();
+    }
   }
 
   using dali::OperatorBase::Run;
@@ -363,6 +372,7 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
   std::vector<cudaEvent_t> events_;
 
   cudaEvent_t master_event_;
+  int device_id_;
 
   ThreadPool thread_pool_;
 };
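Note the shape of the destructor above: CUDA_CALL/NVJPEG_CALL throw on failure, and an exception escaping a destructor calls std::terminate() with no diagnostic, so cleanup is wrapped to log first. A generic sketch of the pattern (class and method names are hypothetical):

#include <exception>
#include <iostream>

class ScopedNvResource {
 public:
  ~ScopedNvResource() {
    try {
      ReleaseAll();  // may throw, like the CUDA_CALL/NVJPEG_CALL wrappers
    } catch (const std::exception &e) {
      // A failed release means leaked device resources; report and stop
      // deliberately instead of letting the exception escape the destructor.
      std::cerr << "Fatal error in ~ScopedNvResource(): " << e.what() << std::endl;
      std::terminate();
    }
  }

 private:
  void ReleaseAll();  // throws on failure in this sketch
};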
7 changes: 6 additions & 1 deletion dali/pipeline/operators/decoder/nvjpeg_decoder_gpu.h
@@ -22,6 +22,7 @@
 #include "dali/pipeline/operators/operator.h"
 #include "dali/pipeline/operators/decoder/nvjpeg_helper.h"
 #include "dali/util/ocv.h"
+#include "dali/core/device_guard.h"
 
 namespace dali {
 
@@ -31,7 +32,8 @@ class nvJPEGDecoderGPUStage : public Operator<MixedBackend> {
  public:
   explicit nvJPEGDecoderGPUStage(const OpSpec& spec) :
     Operator<MixedBackend>(spec),
-    output_image_type_(spec.GetArgument<DALIImageType>("output_type")) {
+    output_image_type_(spec.GetArgument<DALIImageType>("output_type")),
+    device_id_(spec.GetArgument<int>("device_id")) {
     NVJPEG_CALL(nvjpegCreateSimple(&handle_));
 
     NVJPEG_CALL(nvjpegDecoderCreate(handle_, NVJPEG_BACKEND_HYBRID, &decoder_host_));
@@ -47,6 +49,7 @@ class nvJPEGDecoderGPUStage : public Operator<MixedBackend> {
   }
 
   ~nvJPEGDecoderGPUStage() noexcept(false) {
+    DeviceGuard g(device_id_);
     NVJPEG_CALL(nvjpegBufferDeviceDestroy(device_buffer_));
     NVJPEG_CALL(nvjpegDecoderDestroy(decoder_host_));
     NVJPEG_CALL(nvjpegDecoderDestroy(decoder_hybrid_));
@@ -157,6 +160,8 @@ class nvJPEGDecoderGPUStage : public Operator<MixedBackend> {
   nvjpegJpegDecoder_t decoder_hybrid_;
 
   nvjpegBufferDevice_t device_buffer_;
+
+  int device_id_;
 };
 
 
4 changes: 2 additions & 2 deletions dali/pipeline/operators/optical_flow/optical_flow.cc
@@ -67,7 +67,7 @@ void OpticalFlow<GPUBackend>::RunImpl(Workspace<GPUBackend> *ws, const int) {
   // Extract calculation params
   ExtractParams(input, hints);
 
-  of_lazy_init(frames_width_, frames_height_, depth_, image_type_, ws->stream());
+  of_lazy_init(frames_width_, frames_height_, depth_, image_type_, device_id_, ws->stream());
 
   std::vector<Dims> new_sizes;
   auto out_shape = optical_flow_->GetOutputShape();
@@ -109,7 +109,7 @@ void OpticalFlow<GPUBackend>::RunImpl(Workspace<GPUBackend> *ws, const int) {
   // Extract calculation params
   ExtractParams(input);
 
-  of_lazy_init(frames_width_, frames_height_, depth_, image_type_, ws->stream());
+  of_lazy_init(frames_width_, frames_height_, depth_, image_type_, device_id_, ws->stream());
 
   std::vector<Dims> new_sizes;
   auto out_shape = optical_flow_->GetOutputShape();
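The only change here is threading device_id_ through to the lazy initializer, presumably so the optical-flow handle is created on the operator's device under a DeviceGuard. A hypothetical sketch of that idea (not the actual of_lazy_init):

// Hypothetical sketch: pin one-time optical-flow initialization to the
// operator's device; the caller's context is restored on return.
void of_lazy_init_sketch(size_t width, size_t height, size_t channels,
                         DALIImageType image_type, int device_id,
                         cudaStream_t stream) {
  DeviceGuard g(device_id);
  // ... create the optical-flow handle for (width, height, channels) on
  // device_id, using stream for any initialization work ...
}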
