Rework DeviceGuard to restore the original context on exit (#882)
- Some libraries do not use the primary context, while DALI does. When
  cudaSetDevice is called, the primary context is created and made
  current; the previously current context is lost, even though other
  code may still need it. DeviceGuard now saves the current context
  and restores it when the guard is destroyed.
- Removed CUContext, as it is no longer needed and can be replaced by
  DeviceGuard.

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
JanuszL committed May 29, 2019
1 parent 6f3c70f commit f7eb22b
Showing 24 changed files with 347 additions and 267 deletions.
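For orientation before the diff, here is a minimal usage sketch of the reworked guard (the enclosing function and its body are hypothetical; only DeviceGuard itself comes from this commit):

#include "dali/core/device_guard.h"

// Hypothetical caller: do work on a chosen device without disturbing
// the calling thread's CUDA context.
void ScopedDeviceWork(int device_id) {
  dali::DeviceGuard g(device_id);  // saves the current context, then calls cudaSetDevice(device_id)
  // ... launch kernels / allocate memory on device_id here ...
}  // ~DeviceGuard restores the saved context, even a non-primary one
   // created by another library with cuCtxCreate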
49 changes: 49 additions & 0 deletions dali/core/device_guard.cc
@@ -0,0 +1,49 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "dali/core/device_guard.h"
#include "dali/core/error_handling.h"

namespace dali {

DeviceGuard::DeviceGuard() :
  old_context_(NULL) {
  DALI_ENFORCE(cuInitChecked(),
               "Failed to load libcuda.so. "
               "Check your library paths and if the driver is installed correctly.");
  CUDA_CALL(cuCtxGetCurrent(&old_context_));
}

DeviceGuard::DeviceGuard(int new_device) :
  old_context_(NULL) {
  if (new_device >= 0) {
    DALI_ENFORCE(cuInitChecked(),
                 "Failed to load libcuda.so. "
                 "Check your library paths and if the driver is installed correctly.");
    CUDA_CALL(cuCtxGetCurrent(&old_context_));
    CUDA_CALL(cudaSetDevice(new_device));
  }
}

DeviceGuard::~DeviceGuard() {
  if (old_context_ != NULL) {
    CUresult err = cuCtxSetCurrent(old_context_);
    if (err != CUDA_SUCCESS) {
      std::cerr << "Failed to recover from DeviceGuard: " << err << std::endl;
      std::terminate();
    }
  }
}

} // namespace dali
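The matching header is not shown in this diff; reconstructed from the definitions above, it presumably looks roughly like this (the member name old_context_ is taken from the .cc, the rest is a sketch, not the verbatim DALI header):

#include <cuda.h>  // CUcontext

namespace dali {

class DeviceGuard {
 public:
  // Saves the current CUDA context; restores it on destruction.
  DeviceGuard();
  // Saves the current context, then switches to new_device via
  // cudaSetDevice; a negative new_device makes the guard a no-op.
  explicit DeviceGuard(int new_device);
  // Restores the saved context; terminates the process on failure.
  ~DeviceGuard();

 private:
  CUcontext old_context_;
};

}  // namespace dali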
145 changes: 145 additions & 0 deletions dali/core/device_guard_test.cc
@@ -0,0 +1,145 @@
// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <gtest/gtest.h>
#include "dali/core/dynlink_cuda.h"
#include "dali/core/cuda_utils.h"
#include "dali/core/device_guard.h"

namespace dali {

TEST(DeviceGuard, ConstructorWithDevice) {
  int test_device = 0;
  int guard_device = 0;
  int current_device;
  int count = 1;

  EXPECT_EQ(cuInitChecked(), true);
  EXPECT_EQ(cudaGetDeviceCount(&count), cudaSuccess);
  if (count > 1) {
    guard_device = 1;
  }

  EXPECT_EQ(cudaSetDevice(test_device), cudaSuccess);
  EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
  EXPECT_EQ(current_device, test_device);
  {
    DeviceGuard g(guard_device);
    EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
    EXPECT_EQ(current_device, guard_device);
  }
  EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
  EXPECT_EQ(current_device, test_device);
}

TEST(DeviceGuard, ConstructorNoArgs) {
  int test_device = 0;
  int guard_device = 0;
  int current_device;
  int count = 1;

  EXPECT_EQ(cuInitChecked(), true);
  EXPECT_EQ(cudaGetDeviceCount(&count), cudaSuccess);
  if (count > 1) {
    guard_device = 1;
  }

  EXPECT_EQ(cudaSetDevice(test_device), cudaSuccess);
  EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
  EXPECT_EQ(current_device, test_device);
  {
    DeviceGuard g;
    EXPECT_EQ(cudaSetDevice(guard_device), cudaSuccess);
    EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
    EXPECT_EQ(current_device, guard_device);
  }
  EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
  EXPECT_EQ(current_device, test_device);
}

TEST(DeviceGuard, Checkcontext) {
  int test_device = 0;
  CUdevice cu_test_device;
  CUcontext cu_test_ctx;
  int guard_device = 0;
  int current_device;
  CUdevice cu_current_device;
  CUcontext cu_current_ctx;
  int count = 1;

  EXPECT_EQ(cuInitChecked(), true);
  EXPECT_EQ(cudaGetDeviceCount(&count), cudaSuccess);
  if (count > 1) {
    guard_device = 1;
  }

  EXPECT_EQ(cuDeviceGet(&cu_test_device, test_device), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxCreate(&cu_test_ctx, 0, cu_test_device), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxSetCurrent(cu_test_ctx), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxGetDevice(&cu_current_device), CUDA_SUCCESS);
  EXPECT_EQ(cu_current_ctx, cu_test_ctx);
  EXPECT_EQ(cu_current_device, cu_test_device);
  {
    DeviceGuard g(guard_device);
    EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
    EXPECT_EQ(current_device, guard_device);
    EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
    EXPECT_NE(cu_current_ctx, cu_test_ctx);
  }
  EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxGetDevice(&cu_current_device), CUDA_SUCCESS);
  EXPECT_EQ(cu_current_ctx, cu_test_ctx);
  EXPECT_EQ(cu_current_device, cu_test_device);
  cuCtxDestroy(cu_test_ctx);
}

TEST(DeviceGuard, CheckcontextNoArgs) {
  int test_device = 0;
  CUdevice cu_test_device;
  CUcontext cu_test_ctx;
  int guard_device = 0;
  int current_device;
  CUdevice cu_current_device;
  CUcontext cu_current_ctx;
  int count = 1;

  EXPECT_EQ(cuInitChecked(), true);
  EXPECT_EQ(cudaGetDeviceCount(&count), cudaSuccess);
  if (count > 1) {
    guard_device = 1;
  }

  EXPECT_EQ(cuDeviceGet(&cu_test_device, test_device), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxCreate(&cu_test_ctx, 0, cu_test_device), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxSetCurrent(cu_test_ctx), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxGetDevice(&cu_current_device), CUDA_SUCCESS);
  EXPECT_EQ(cu_current_ctx, cu_test_ctx);
  EXPECT_EQ(cu_current_device, cu_test_device);
  {
    DeviceGuard g;
    EXPECT_EQ(cudaSetDevice(guard_device), cudaSuccess);
    EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
    EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
    EXPECT_NE(cu_current_ctx, cu_test_ctx);
  }
  EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
  EXPECT_EQ(cuCtxGetDevice(&cu_current_device), CUDA_SUCCESS);
  EXPECT_EQ(cu_current_ctx, cu_test_ctx);
  EXPECT_EQ(cu_current_device, cu_test_device);
  cuCtxDestroy(cu_test_ctx);
}

} // namespace dali
17 changes: 5 additions & 12 deletions dali/pipeline/data/buffer.h
@@ -27,6 +27,7 @@
 #include "dali/core/error_handling.h"
 #include "dali/pipeline/data/types.h"
 #include "dali/core/util.h"
+#include "dali/core/device_guard.h"
 
 namespace dali {
 
@@ -234,6 +235,8 @@ class Buffer {
     // re-allocating: get the device
     if (std::is_same<Backend, GPUBackend>::value) {
       CUDA_CALL(cudaGetDevice(&device_));
+    } else {
+      device_ = -1;
     }
 
     data_.reset();
@@ -261,19 +264,9 @@
 
  protected:
   static void FreeMemory(void *ptr, size_t bytes, int device, bool pinned) {
-    // change to correct device for deletion
-    // Note: Can't use device guard due to potentially not GPUBackend.
-    int current_device = 0;
-    if (std::is_same<Backend, GPUBackend>::value) {
-      CUDA_CALL(cudaGetDevice(&current_device));
-      CUDA_CALL(cudaSetDevice(device));
-    }
+    // for device == -1 it is a no-op
+    DeviceGuard g(device);
     Backend::Delete(ptr, bytes, pinned);
-
-    // reset to original calling device for consistency
-    if (std::is_same<Backend, GPUBackend>::value) {
-      CUDA_CALL(cudaSetDevice(current_device));
-    }
   }
 
   // Helper to resize the underlying allocation
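Why the GPUBackend special-casing can go away: Reset() above now records device_ = -1 for non-GPU buffers, and, per the constructor shown earlier, DeviceGuard(-1) makes no CUDA calls at all. A small illustration (hypothetical function, not DALI code):

void CleanupExample() {
  DeviceGuard cpu_noop(-1);  // device == -1: no cuInit, no context save, no cudaSetDevice
  DeviceGuard gpu_guard(0);  // saves the current context and switches to device 0
}  // gpu_guard restores the saved context on scope exit; cpu_noop does nothing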
1 change: 1 addition & 0 deletions dali/pipeline/data/tensor_list.h
@@ -200,6 +200,7 @@ class DLL_PUBLIC TensorList : public Buffer<Backend> {
     shape_.clear();
     offsets_.clear();
     size_ = 0;
+    device_ = -1;
 
     // Tensor view of this TensorList is no longer valid
     if (tensor_view_) {
42 changes: 26 additions & 16 deletions dali/pipeline/operators/decoder/nvjpeg_decoder_decoupled_api.h
@@ -28,6 +28,7 @@
 #include "dali/util/ocv.h"
 #include "dali/image/image_factory.h"
 #include "dali/pipeline/util/thread_pool.h"
+#include "dali/core/device_guard.h"
 
 namespace dali {
 
@@ -53,6 +54,7 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
       device_buffer_(num_threads_),
       streams_(num_threads_),
       events_(num_threads_ * 2),
+      device_id_(spec.GetArgument<int>("device_id")),
       thread_pool_(num_threads_,
                    spec.GetArgument<int>("device_id"),
                    true /* pin threads */) {
@@ -109,24 +111,31 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
       // As other images being processed might fail, but we don't care
     }
 
-    for (int i = 0; i < batch_size_; ++i) {
-      NVJPEG_CALL(nvjpegJpegStateDestroy(decoder_host_state_[i]));
-      NVJPEG_CALL(nvjpegJpegStateDestroy(decoder_huff_hybrid_state_[i]));
-    }
-    NVJPEG_CALL(nvjpegDecoderDestroy(decoder_huff_host_));
-    NVJPEG_CALL(nvjpegDecoderDestroy(decoder_huff_hybrid_));
-    for (int i = 0; i < num_threads_ * 2; ++i) {
-      NVJPEG_CALL(nvjpegJpegStreamDestroy(jpeg_streams_[i]));
-      NVJPEG_CALL(nvjpegBufferPinnedDestroy(pinned_buffer_[i]));
-      CUDA_CALL(cudaEventDestroy(events_[i]));
-    }
-
-    for (int i = 0; i < num_threads_; ++i) {
-      NVJPEG_CALL(nvjpegBufferDeviceDestroy(device_buffer_[i]));
-      CUDA_CALL(cudaStreamDestroy(streams_[i]));
-    }
-    CUDA_CALL(cudaEventDestroy(master_event_));
-    NVJPEG_CALL(nvjpegDestroy(handle_));
+    try {
+      DeviceGuard g(device_id_);
+      for (int i = 0; i < batch_size_; ++i) {
+        NVJPEG_CALL(nvjpegJpegStateDestroy(decoder_host_state_[i]));
+        NVJPEG_CALL(nvjpegJpegStateDestroy(decoder_huff_hybrid_state_[i]));
+      }
+      NVJPEG_CALL(nvjpegDecoderDestroy(decoder_huff_host_));
+      NVJPEG_CALL(nvjpegDecoderDestroy(decoder_huff_hybrid_));
+      for (int i = 0; i < num_threads_ * 2; ++i) {
+        NVJPEG_CALL(nvjpegJpegStreamDestroy(jpeg_streams_[i]));
+        NVJPEG_CALL(nvjpegBufferPinnedDestroy(pinned_buffer_[i]));
+        CUDA_CALL(cudaEventDestroy(events_[i]));
+      }
+
+      for (int i = 0; i < num_threads_; ++i) {
+        NVJPEG_CALL(nvjpegBufferDeviceDestroy(device_buffer_[i]));
+        CUDA_CALL(cudaStreamDestroy(streams_[i]));
+      }
+      CUDA_CALL(cudaEventDestroy(master_event_));
+      NVJPEG_CALL(nvjpegDestroy(handle_));
+    } catch (const std::exception &e) {
+      // If destroying nvJPEG resources failed, we are leaking something, so terminate
+      std::cerr << "Fatal error: exception in ~nvJPEGDecoder():\n" << e.what() << std::endl;
+      std::terminate();
+    }
   }
 
   using dali::OperatorBase::Run;
@@ -363,6 +372,7 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
   std::vector<cudaEvent_t> events_;
 
   cudaEvent_t master_event_;
+  int device_id_;
 
   ThreadPool thread_pool_;
 };
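Note the shape of the destructor above: CUDA_CALL/NVJPEG_CALL throw on failure, and an exception escaping a destructor calls std::terminate() with no diagnostic, so cleanup is wrapped to log first. A generic sketch of the pattern (class and method names are hypothetical):

#include <exception>
#include <iostream>

class ScopedNvResource {
 public:
  ~ScopedNvResource() {
    try {
      ReleaseAll();  // may throw, like the CUDA_CALL/NVJPEG_CALL wrappers
    } catch (const std::exception &e) {
      // A failed release means leaked device resources; report and stop
      // deliberately instead of letting the exception escape the destructor.
      std::cerr << "Fatal error in ~ScopedNvResource(): " << e.what() << std::endl;
      std::terminate();
    }
  }

 private:
  void ReleaseAll();  // throws on failure in this sketch
};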
7 changes: 6 additions & 1 deletion dali/pipeline/operators/decoder/nvjpeg_decoder_gpu.h
@@ -22,6 +22,7 @@
 #include "dali/pipeline/operators/operator.h"
 #include "dali/pipeline/operators/decoder/nvjpeg_helper.h"
 #include "dali/util/ocv.h"
+#include "dali/core/device_guard.h"
 
 namespace dali {
 
@@ -31,7 +32,8 @@ class nvJPEGDecoderGPUStage : public Operator<MixedBackend> {
  public:
   explicit nvJPEGDecoderGPUStage(const OpSpec& spec) :
     Operator<MixedBackend>(spec),
-    output_image_type_(spec.GetArgument<DALIImageType>("output_type")) {
+    output_image_type_(spec.GetArgument<DALIImageType>("output_type")),
+    device_id_(spec.GetArgument<int>("device_id")) {
     NVJPEG_CALL(nvjpegCreateSimple(&handle_));
 
     NVJPEG_CALL(nvjpegDecoderCreate(handle_, NVJPEG_BACKEND_HYBRID, &decoder_host_));
@@ -47,6 +49,7 @@ class nvJPEGDecoderGPUStage : public Operator<MixedBackend> {
   }
 
   ~nvJPEGDecoderGPUStage() noexcept(false) {
+    DeviceGuard g(device_id_);
     NVJPEG_CALL(nvjpegBufferDeviceDestroy(device_buffer_));
     NVJPEG_CALL(nvjpegDecoderDestroy(decoder_host_));
     NVJPEG_CALL(nvjpegDecoderDestroy(decoder_hybrid_));
@@ -157,6 +160,8 @@ class nvJPEGDecoderGPUStage : public Operator<MixedBackend> {
   nvjpegJpegDecoder_t decoder_hybrid_;
 
   nvjpegBufferDevice_t device_buffer_;
+
+  int device_id_;
 };
 
 
4 changes: 2 additions & 2 deletions dali/pipeline/operators/optical_flow/optical_flow.cc
@@ -67,7 +67,7 @@ void OpticalFlow<GPUBackend>::RunImpl(Workspace<GPUBackend> *ws, const int) {
   // Extract calculation params
   ExtractParams(input, hints);
 
-  of_lazy_init(frames_width_, frames_height_, depth_, image_type_, ws->stream());
+  of_lazy_init(frames_width_, frames_height_, depth_, image_type_, device_id_, ws->stream());
 
   std::vector<Dims> new_sizes;
   auto out_shape = optical_flow_->GetOutputShape();
@@ -109,7 +109,7 @@ void OpticalFlow<GPUBackend>::RunImpl(Workspace<GPUBackend> *ws, const int) {
   // Extract calculation params
   ExtractParams(input);
 
-  of_lazy_init(frames_width_, frames_height_, depth_, image_type_, ws->stream());
+  of_lazy_init(frames_width_, frames_height_, depth_, image_type_, device_id_, ws->stream());
 
   std::vector<Dims> new_sizes;
   auto out_shape = optical_flow_->GetOutputShape();
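The only change here is threading device_id_ through to the lazy initializer, presumably so the optical-flow handle is created on the operator's device under a DeviceGuard. A hypothetical sketch of that idea (not the actual of_lazy_init):

// Hypothetical sketch: pin one-time optical-flow initialization to the
// operator's device; the caller's context is restored on return.
void of_lazy_init_sketch(size_t width, size_t height, size_t channels,
                         DALIImageType image_type, int device_id,
                         cudaStream_t stream) {
  DeviceGuard g(device_id);
  // ... create the optical-flow handle for (width, height, channels) on
  // device_id, using stream for any initialization work ...
}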
