Rework DeviceGuard to restore original context upon the exit #882

Merged 1 commit on May 29, 2019
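What the change amounts to: `DeviceGuard` snapshots the current CUDA driver context in its constructor and restores that exact context in its destructor, so guarded code can switch devices freely without disturbing the caller. A minimal usage sketch (the helper below is hypothetical, not part of this diff):

```cpp
#include "dali/core/device_guard.h"

// Hypothetical caller: runs work on device 1, then falls back to whatever
// context (and device) was current before the call.
void work_on_device_1() {
  dali::DeviceGuard g(1);        // saves the current context, then cudaSetDevice(1)
  // ... launch kernels / allocate on device 1 ...
}                                // ~DeviceGuard() restores the saved context
```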
49 changes: 49 additions & 0 deletions dali/core/device_guard.cc
@@ -0,0 +1,49 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "dali/core/device_guard.h"
#include "dali/core/error_handling.h"

namespace dali {

DeviceGuard::DeviceGuard() :
old_context_(NULL) {
DALI_ENFORCE(cuInitChecked(),
"Failed to load libcuda.so. "
"Check your library paths and if the driver is installed correctly.");
CUDA_CALL(cuCtxGetCurrent(&old_context_));
}

DeviceGuard::DeviceGuard(int new_device) :
old_context_(NULL) {
if (new_device >= 0) {
DALI_ENFORCE(cuInitChecked(),
"Failed to load libcuda.so. "
"Check your library paths and if the driver is installed correctly.");
CUDA_CALL(cuCtxGetCurrent(&old_context_));
CUDA_CALL(cudaSetDevice(new_device));
}
}

DeviceGuard::~DeviceGuard() {
if (old_context_ != NULL) {
CUresult err = cuCtxSetCurrent(old_context_);
if (err != CUDA_SUCCESS) {
std::cerr << "Failed to recover from DeviceGuard: " << err << std::endl;
std::terminate();
}
}
}

} // namespace dali
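A point worth noting about the implementation above, and the reason the tests below bother to create a context with `cuCtxCreate`: the guard saves and restores a `CUcontext` rather than re-issuing `cudaSetDevice`. `cudaSetDevice` binds the thread to the device's primary context, which would clobber any non-primary context the caller had made current; `cuCtxSetCurrent` puts back exactly what was there. A condensed sketch of that guarantee, in the style of the tests that follow (my reading of the code, not text from the PR):

```cpp
TEST(DeviceGuard, RestoresExactContext) {  // hypothetical test, condensed from Checkcontext below
  CUcontext before, after;
  ASSERT_EQ(cuCtxGetCurrent(&before), CUDA_SUCCESS);
  {
    DeviceGuard g(0);                      // internally calls cudaSetDevice(0)
  }
  ASSERT_EQ(cuCtxGetCurrent(&after), CUDA_SUCCESS);
  EXPECT_EQ(before, after);                // same CUcontext, not merely the same device
}
```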
145 changes: 145 additions & 0 deletions dali/core/device_guard_test.cc
@@ -0,0 +1,145 @@
// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <gtest/gtest.h>
#include "dali/core/dynlink_cuda.h"
#include "dali/core/cuda_utils.h"
#include "dali/core/device_guard.h"

namespace dali {

TEST(DeviceGuard, ConstructorWithDevice) {
int test_device = 0;
int guard_device = 0;
int current_device;
int count = 1;

EXPECT_EQ(cuInitChecked(), true);
EXPECT_EQ(cudaGetDeviceCount(&count), cudaSuccess);
if (count > 1) {
guard_device = 1;
}

EXPECT_EQ(cudaSetDevice(test_device), cudaSuccess);
EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
EXPECT_EQ(current_device, test_device);
{
DeviceGuard g(guard_device);
EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
EXPECT_EQ(current_device, guard_device);
}
EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
EXPECT_EQ(current_device, test_device);
}

TEST(DeviceGuard, ConstructorNoArgs) {
int test_device = 0;
int guard_device = 0;
int current_device;
int count = 1;

EXPECT_EQ(cuInitChecked(), true);
EXPECT_EQ(cudaGetDeviceCount(&count), cudaSuccess);
if (count > 1) {
guard_device = 1;
}

EXPECT_EQ(cudaSetDevice(test_device), cudaSuccess);
EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
EXPECT_EQ(current_device, test_device);
{
DeviceGuard g;
EXPECT_EQ(cudaSetDevice(guard_device), cudaSuccess);
EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
EXPECT_EQ(current_device, guard_device);
}
EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
EXPECT_EQ(current_device, test_device);
}

TEST(DeviceGuard, Checkcontext) {
int test_device = 0;
CUdevice cu_test_device;
CUcontext cu_test_ctx;
int guard_device = 0;
int current_device;
CUdevice cu_current_device;
CUcontext cu_current_ctx;
int count = 1;

EXPECT_EQ(cuInitChecked(), true);
EXPECT_EQ(cudaGetDeviceCount(&count), cudaSuccess);
if (count > 1) {
guard_device = 1;
}

EXPECT_EQ(cuDeviceGet(&cu_test_device, test_device), CUDA_SUCCESS);
EXPECT_EQ(cuCtxCreate(&cu_test_ctx, 0, cu_test_device), CUDA_SUCCESS);
EXPECT_EQ(cuCtxSetCurrent(cu_test_ctx), CUDA_SUCCESS);
EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
EXPECT_EQ(cuCtxGetDevice(&cu_current_device), CUDA_SUCCESS);
EXPECT_EQ(cu_current_ctx, cu_test_ctx);
EXPECT_EQ(cu_current_device, cu_test_device);
{
DeviceGuard g(guard_device);
EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
EXPECT_EQ(current_device, guard_device);
EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
EXPECT_NE(cu_current_ctx, cu_test_ctx);
}
EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
EXPECT_EQ(cuCtxGetDevice(&cu_current_device), CUDA_SUCCESS);
EXPECT_EQ(cu_current_ctx, cu_test_ctx);
EXPECT_EQ(cu_current_device, cu_test_device);
cuCtxDestroy(cu_test_ctx);
}

TEST(DeviceGuard, CheckcontextNoArgs) {
int test_device = 0;
CUdevice cu_test_device;
CUcontext cu_test_ctx;
int guard_device = 0;
int current_device;
CUdevice cu_current_device;
CUcontext cu_current_ctx;
int count = 1;

EXPECT_EQ(cuInitChecked(), true);
EXPECT_EQ(cudaGetDeviceCount(&count), cudaSuccess);
if (count > 1) {
guard_device = 1;
}

EXPECT_EQ(cuDeviceGet(&cu_test_device, test_device), CUDA_SUCCESS);
EXPECT_EQ(cuCtxCreate(&cu_test_ctx, 0, cu_test_device), CUDA_SUCCESS);
EXPECT_EQ(cuCtxSetCurrent(cu_test_ctx), CUDA_SUCCESS);
EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
EXPECT_EQ(cuCtxGetDevice(&cu_current_device), CUDA_SUCCESS);
EXPECT_EQ(cu_current_ctx, cu_test_ctx);
EXPECT_EQ(cu_current_device, cu_test_device);
{
DeviceGuard g;
EXPECT_EQ(cudaSetDevice(guard_device), cudaSuccess);
EXPECT_EQ(cudaGetDevice(&current_device), cudaSuccess);
EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
EXPECT_NE(cu_current_ctx, cu_test_ctx);
}
EXPECT_EQ(cuCtxGetCurrent(&cu_current_ctx), CUDA_SUCCESS);
EXPECT_EQ(cuCtxGetDevice(&cu_current_device), CUDA_SUCCESS);
EXPECT_EQ(cu_current_ctx, cu_test_ctx);
EXPECT_EQ(cu_current_device, cu_test_device);
cuCtxDestroy(cu_test_ctx);
}

} // namespace dali
17 changes: 5 additions & 12 deletions dali/pipeline/data/buffer.h
@@ -27,6 +27,7 @@
#include "dali/core/error_handling.h"
#include "dali/pipeline/data/types.h"
#include "dali/core/util.h"
+#include "dali/core/device_guard.h"

namespace dali {

@@ -234,6 +235,8 @@ class Buffer {
    // re-allocating: get the device
    if (std::is_same<Backend, GPUBackend>::value) {
      CUDA_CALL(cudaGetDevice(&device_));
+    } else {
+      device_ = -1;
    }

    data_.reset();
@@ -261,19 +264,9 @@ class Buffer {

 protected:
  static void FreeMemory(void *ptr, size_t bytes, int device, bool pinned) {
-    // change to correct device for deletion
-    // Note: Can't use device guard due to potentially not GPUBackend.
-    int current_device = 0;
-    if (std::is_same<Backend, GPUBackend>::value) {
-      CUDA_CALL(cudaGetDevice(&current_device));
-      CUDA_CALL(cudaSetDevice(device));
-    }
+    // for device == -1 it is noop
+    DeviceGuard g(device);
    Backend::Delete(ptr, bytes, pinned);
-
-    // reset to original calling device for consistency
-    if (std::is_same<Backend, GPUBackend>::value) {
-      CUDA_CALL(cudaSetDevice(current_device));
-    }
  }

  // Helper to resize the underlying allocation
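The replacement works for both backends because of the comment above: `DeviceGuard(int)` does nothing for a negative id, so CPU buffers (which now record `device_ = -1`) pay no CUDA cost and need no `GPUBackend` special-casing. A small check of that contract, again in the style of the test file (a sketch I added, not part of the PR):

```cpp
TEST(DeviceGuard, NegativeDeviceIsNoop) {  // hypothetical test
  int before = -1, after = -1;
  EXPECT_EQ(cudaGetDevice(&before), cudaSuccess);
  {
    dali::DeviceGuard g(-1);  // constructor body is skipped for new_device < 0
  }
  EXPECT_EQ(cudaGetDevice(&after), cudaSuccess);
  EXPECT_EQ(before, after);   // no device switch, and the destructor restored nothing
}
```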
1 change: 1 addition & 0 deletions dali/pipeline/data/tensor_list.h
@@ -200,6 +200,7 @@ class DLL_PUBLIC TensorList : public Buffer<Backend> {
    shape_.clear();
    offsets_.clear();
    size_ = 0;
+    device_ = -1;

// Tensor view of this TensorList is no longer valid
if (tensor_view_) {
42 changes: 26 additions & 16 deletions dali/pipeline/operators/decoder/nvjpeg_decoder_decoupled_api.h
@@ -28,6 +28,7 @@
#include "dali/util/ocv.h"
#include "dali/image/image_factory.h"
#include "dali/pipeline/util/thread_pool.h"
+#include "dali/core/device_guard.h"

namespace dali {

@@ -53,6 +54,7 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
    device_buffer_(num_threads_),
    streams_(num_threads_),
    events_(num_threads_ * 2),
+    device_id_(spec.GetArgument<int>("device_id")),
thread_pool_(num_threads_,
spec.GetArgument<int>("device_id"),
true /* pin threads */) {
@@ -109,24 +111,31 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
      // Other images being processed might fail, but we don't care
    }

-    for (int i = 0; i < batch_size_; ++i) {
-      NVJPEG_CALL(nvjpegJpegStateDestroy(decoder_host_state_[i]));
-      NVJPEG_CALL(nvjpegJpegStateDestroy(decoder_huff_hybrid_state_[i]));
-    }
-    NVJPEG_CALL(nvjpegDecoderDestroy(decoder_huff_host_));
-    NVJPEG_CALL(nvjpegDecoderDestroy(decoder_huff_hybrid_));
-    for (int i = 0; i < num_threads_ * 2; ++i) {
-      NVJPEG_CALL(nvjpegJpegStreamDestroy(jpeg_streams_[i]));
-      NVJPEG_CALL(nvjpegBufferPinnedDestroy(pinned_buffer_[i]));
-      CUDA_CALL(cudaEventDestroy(events_[i]));
-    }
-
-    for (int i = 0; i < num_threads_; ++i) {
-      NVJPEG_CALL(nvjpegBufferDeviceDestroy(device_buffer_[i]));
-      CUDA_CALL(cudaStreamDestroy(streams_[i]));
-    }
-    CUDA_CALL(cudaEventDestroy(master_event_));
-    NVJPEG_CALL(nvjpegDestroy(handle_));
+    try {
+      DeviceGuard g(device_id_);
+      for (int i = 0; i < batch_size_; ++i) {
+        NVJPEG_CALL(nvjpegJpegStateDestroy(decoder_host_state_[i]));
+        NVJPEG_CALL(nvjpegJpegStateDestroy(decoder_huff_hybrid_state_[i]));
+      }
+      NVJPEG_CALL(nvjpegDecoderDestroy(decoder_huff_host_));
+      NVJPEG_CALL(nvjpegDecoderDestroy(decoder_huff_hybrid_));
+      for (int i = 0; i < num_threads_ * 2; ++i) {
+        NVJPEG_CALL(nvjpegJpegStreamDestroy(jpeg_streams_[i]));
+        NVJPEG_CALL(nvjpegBufferPinnedDestroy(pinned_buffer_[i]));
+        CUDA_CALL(cudaEventDestroy(events_[i]));
+      }
+
+      for (int i = 0; i < num_threads_; ++i) {
+        NVJPEG_CALL(nvjpegBufferDeviceDestroy(device_buffer_[i]));
+        CUDA_CALL(cudaStreamDestroy(streams_[i]));
+      }
+      CUDA_CALL(cudaEventDestroy(master_event_));
+      NVJPEG_CALL(nvjpegDestroy(handle_));
+    } catch (const std::exception &e) {
+      // If destroying nvJPEG resources failed we are leaking something so terminate
+      std::cerr << "Fatal error: exception in ~nvJPEGDecoder():\n" << e.what() << std::endl;
+      std::terminate();
+    }
  }

using dali::OperatorBase::Run;
@@ -363,6 +372,7 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
  std::vector<cudaEvent_t> events_;

  cudaEvent_t master_event_;
+  int device_id_;

ThreadPool thread_pool_;
};
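The destructor rework above is an instance of a general pattern: teardown can run on a thread whose current device is not the one that owns the resources, so the device is pinned for the whole cleanup and any failure is escalated to `std::terminate` instead of being thrown from a destructor. Stripped of the DALI specifics, the shape is roughly as follows (`SomeGpuResource` and `release_all` are illustrative names, not DALI APIs):

```cpp
#include <exception>
#include <iostream>
#include "dali/core/device_guard.h"

class SomeGpuResource {             // illustrative, not a DALI class
 public:
  ~SomeGpuResource() {
    try {
      dali::DeviceGuard g(device_id_);  // pin the owning device for all cleanup
      release_all();                    // wrapped CUDA/nvJPEG calls may throw
    } catch (const std::exception &e) {
      // A failed cleanup means leaked GPU state; stopping hard beats limping on.
      std::cerr << "Fatal error in ~SomeGpuResource(): " << e.what() << std::endl;
      std::terminate();
    }
  }

 private:
  void release_all() { /* destroy handles, streams, events owned on device_id_ */ }
  int device_id_ = 0;
};
```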
7 changes: 6 additions & 1 deletion dali/pipeline/operators/decoder/nvjpeg_decoder_gpu.h
@@ -22,6 +22,7 @@
#include "dali/pipeline/operators/operator.h"
#include "dali/pipeline/operators/decoder/nvjpeg_helper.h"
#include "dali/util/ocv.h"
+#include "dali/core/device_guard.h"

namespace dali {

@@ -31,7 +32,8 @@ class nvJPEGDecoderGPUStage : public Operator<MixedBackend> {
 public:
  explicit nvJPEGDecoderGPUStage(const OpSpec& spec) :
    Operator<MixedBackend>(spec),
-    output_image_type_(spec.GetArgument<DALIImageType>("output_type")) {
+    output_image_type_(spec.GetArgument<DALIImageType>("output_type")),
+    device_id_(spec.GetArgument<int>("device_id")) {
NVJPEG_CALL(nvjpegCreateSimple(&handle_));

NVJPEG_CALL(nvjpegDecoderCreate(handle_, NVJPEG_BACKEND_HYBRID, &decoder_host_));
@@ -47,6 +49,7 @@ class nvJPEGDecoderGPUStage : public Operator<MixedBackend> {
  }

  ~nvJPEGDecoderGPUStage() noexcept(false) {
+    DeviceGuard g(device_id_);
NVJPEG_CALL(nvjpegBufferDeviceDestroy(device_buffer_));
NVJPEG_CALL(nvjpegDecoderDestroy(decoder_host_));
NVJPEG_CALL(nvjpegDecoderDestroy(decoder_hybrid_));
@@ -157,6 +160,8 @@ class nvJPEGDecoderGPUStage : public Operator<MixedBackend> {
  nvjpegJpegDecoder_t decoder_hybrid_;

  nvjpegBufferDevice_t device_buffer_;
+
+  int device_id_;
};


4 changes: 2 additions & 2 deletions dali/pipeline/operators/optical_flow/optical_flow.cc
@@ -67,7 +67,7 @@ void OpticalFlow<GPUBackend>::RunImpl(Workspace<GPUBackend> *ws, const int) {
  // Extract calculation params
  ExtractParams(input, hints);

-  of_lazy_init(frames_width_, frames_height_, depth_, image_type_, ws->stream());
+  of_lazy_init(frames_width_, frames_height_, depth_, image_type_, device_id_, ws->stream());

std::vector<Dims> new_sizes;
auto out_shape = optical_flow_->GetOutputShape();
@@ -109,7 +109,7 @@ void OpticalFlow<GPUBackend>::RunImpl(Workspace<GPUBackend> *ws, const int) {
  // Extract calculation params
  ExtractParams(input);

-  of_lazy_init(frames_width_, frames_height_, depth_, image_type_, ws->stream());
+  of_lazy_init(frames_width_, frames_height_, depth_, image_type_, device_id_, ws->stream());

std::vector<Dims> new_sizes;
auto out_shape = optical_flow_->GetOutputShape();