refactor(//core)!: Introducing a binding convention that will address
determinism issues with TensorRT

The new convention names engine bindings input_<n> / output_<n>; the
runtime looks bindings up by name and reorders outputs to match the
order expected by PyTorch.

BREAKING CHANGE: This changes the "ABI" of compiled TRTorch programs and
the runtime and breaks backwards compatibility between the runtime in
0.1.0+ and programs compiled pre-0.1.0

Signed-off-by: Naren Dasan <naren@narendasan.com>
Signed-off-by: Naren Dasan <narens@nvidia.com>
narendasan committed Aug 28, 2020
1 parent 7cfcca4 commit 5a105c6
Showing 6 changed files with 48 additions and 71 deletions.
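
Before the per-file diffs, a minimal standalone sketch of the convention may help. At conversion time, inputs and outputs are named input_<n> / output_<n>; at runtime the PyTorch-side position <n> is recovered from the name instead of trusting whatever binding order TensorRT happens to report. The helper below is illustrative only (the function name is mine, not from the TRTorch sources) and mirrors the substr/stoi parsing in TRTEngine.cpp further down.

// Illustrative sketch, not TRTorch code: recover the PyTorch-side index
// that the conversion step encodes into a binding name.
#include <cstdint>
#include <iostream>
#include <string>

uint64_t BindingIndexFromName(const std::string& name) {
  // "input_0" -> 0, "output_2" -> 2; everything after '_' is the index.
  return static_cast<uint64_t>(std::stoi(name.substr(name.find("_") + 1)));
}

int main() {
  std::cout << BindingIndexFromName("input_0") << "\n";   // prints 0
  std::cout << BindingIndexFromName("output_2") << "\n";  // prints 2
  return 0;
}
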
11 changes: 8 additions & 3 deletions core/conversion/conversion.cpp
@@ -143,11 +143,12 @@ void AddInputs(ConversionCtx* ctx,
for (size_t i = 0; i < input_tensors.size(); i++) {
auto in = input_tensors[i];
auto dims = input_dims[i];
std::string name = std::string("input_") + std::to_string(ctx->num_inputs);
LOG_INFO(ctx->logger,
"Adding Input " << in->debugName() \
<< " (conversion.AddInputs)");
<< " named " << name << " in engine (conversion.AddInputs)");
LOG_DEBUG(ctx->logger, "Input shape set to " << dims.input_shape);
auto trt_in = ctx->net->addInput(in->debugName().c_str(),
auto trt_in = ctx->net->addInput(name.c_str(),
ctx->input_type, dims.input_shape);
TRTORCH_CHECK(trt_in, "Failed to add input node: " << in->debugName() << " (conversion.AddInputs)");

@@ -160,6 +161,7 @@ void AddInputs(ConversionCtx* ctx,
}

ctx->value_tensor_map[in] = trt_in;
ctx->num_inputs += 1;
}

TRTORCH_CHECK(profile->isValid(), "Optimization profile is invalid, please check the input range provided (conversion.AddInputs)");
@@ -174,14 +176,17 @@ void AddInputs(ConversionCtx* ctx,

void MarkOutputs(ConversionCtx* ctx, at::ArrayRef<const torch::jit::Value*> outputs) {
for (auto out : outputs) {
std::string name = std::string("output_") + std::to_string(ctx->num_outputs);
auto it = ctx->value_tensor_map.find(out);
// Leaves the potential for unused outputs to be populated with nullptr "safely"
TRTORCH_CHECK(it != ctx->value_tensor_map.end() && it->second,
"No corresponding output TRT Tensor found for TorchScript output: " << out->debugName());
auto out_tensor = it->second;
out_tensor->setName(name.c_str());
ctx->net->markOutput(*out_tensor);
LOG_INFO(ctx->logger,
"Marking Output " << out->debugName() << " (ctx.MarkOutput)");
"Marking Output " << out->debugName() << " named " << name << " in engine (ctx.MarkOutput)");
ctx->num_outputs += 1;
}
}

2 changes: 2 additions & 0 deletions core/conversion/conversionctx/ConversionCtx.h
@@ -42,6 +42,8 @@ struct ConversionCtx {

~ConversionCtx();

uint64_t num_inputs = 0;
uint64_t num_outputs = 0;
bool input_is_dynamic = false;
nvinfer1::IBuilder* builder;
nvinfer1::INetworkDefinition* net;
7 changes: 7 additions & 0 deletions core/execution/TRTEngine.cpp
@@ -42,13 +42,20 @@ TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine)
uint64_t outputs = 0;

for (int64_t x = 0; x < cuda_engine->getNbBindings(); x++) {
std::string name = cuda_engine->getBindingName(x);
std::string idx_s = name.substr(name.find("_") + 1);
uint64_t idx = static_cast<uint64_t>(std::stoi(idx_s));

if(cuda_engine->bindingIsInput(x)) {
inputs++;
in_binding_map[x] = idx;
} else {
outputs++;
out_binding_map[x] = idx;
}
}
num_io = std::make_pair(inputs, outputs);

}

TRTEngine& TRTEngine::operator=(const TRTEngine& other) {
5 changes: 4 additions & 1 deletion core/execution/execution.h
@@ -22,6 +22,9 @@ struct TRTEngine : torch::CustomClassHolder {
std::string name;
util::logging::TRTorchLogger logger;

std::unordered_map<uint64_t, uint64_t> in_binding_map;
std::unordered_map<uint64_t, uint64_t> out_binding_map;

~TRTEngine();
TRTEngine(std::string serialized_engine);
TRTEngine(std::string mod_name, std::string serialized_engine);
@@ -30,7 +33,7 @@ struct TRTEngine : torch::CustomClassHolder {
//c10::List<at::Tensor> Run(c10::List<at::Tensor> inputs);
};

std::vector<at::Tensor> RunCudaEngine(nvinfer1::IExecutionContext* ctx, std::pair<uint64_t, uint64_t> io, std::vector<at::Tensor>& inputs);
std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine);

} // namespace execution
} // namespace core
49 changes: 21 additions & 28 deletions core/execution/register_trt_op.cpp
@@ -9,50 +9,43 @@
namespace trtorch {
namespace core {
namespace execution {
std::vector<at::Tensor> RunCudaEngine(nvinfer1::IExecutionContext* ctx, std::pair<uint64_t, uint64_t> io, std::vector<at::Tensor>& inputs) {

std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
LOG_DEBUG("Attempting to run engine (ID: " << compiled_engine->name << ")");
std::vector<void*> gpu_handles;

std::vector<at::Tensor> contig_inputs{};
contig_inputs.reserve(inputs.size());

for (size_t i = 0; i < inputs.size(); i++) {
TRTORCH_CHECK(inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
auto expected_type = util::toATenDType(ctx->getEngine().getBindingDataType(i));
TRTORCH_CHECK(inputs[i].dtype() == expected_type, "Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype());
auto dims = core::util::toDimsPad(inputs[i].sizes(), 1);
uint64_t pyt_idx = compiled_engine->in_binding_map[i];
TRTORCH_CHECK(inputs[pyt_idx].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[pyt_idx].device());
auto expected_type = util::toATenDType(compiled_engine->exec_ctx->getEngine().getBindingDataType(i));
TRTORCH_CHECK(inputs[pyt_idx].dtype() == expected_type, "Expected input tensors to have type " << expected_type << ", found type " << inputs[pyt_idx].dtype());
auto dims = core::util::toDimsPad(inputs[pyt_idx].sizes(), 1);
auto shape = core::util::toVec(dims);
contig_inputs.push_back(inputs[i].view(shape).contiguous());
contig_inputs.push_back(inputs[pyt_idx].view(shape).contiguous());
LOG_DEBUG("Input shape: " << dims);
ctx->setBindingDimensions(i, dims);
compiled_engine->exec_ctx->setBindingDimensions(i, dims);
gpu_handles.push_back(contig_inputs.back().data_ptr());
}

TRTORCH_CHECK(ctx->allInputDimensionsSpecified(), "Not enough inputs provided (execution.RunCudaEngine)");
TRTORCH_CHECK(compiled_engine->exec_ctx->allInputDimensionsSpecified(), "Not enough inputs provided (execution.RunCudaEngine)");

std::vector<at::Tensor> outputs;
for (uint64_t o = inputs.size(); o < (io.first + io.second); o++) {
auto out_shape = ctx->getBindingDimensions(o);
std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
for (size_t o = inputs.size(); o < (compiled_engine->num_io.first + compiled_engine->num_io.second); o++) {
uint64_t pyt_idx = compiled_engine->out_binding_map[o];
auto out_shape = compiled_engine->exec_ctx->getBindingDimensions(o);
LOG_DEBUG("Output shape: " << out_shape);
auto dims = core::util::toVec(out_shape);
auto type = util::toATenDType(ctx->getEngine().getBindingDataType(o));
outputs.push_back(at::empty(dims, {at::kCUDA}).to(type).contiguous());
gpu_handles.push_back(outputs[outputs.size() - 1].data_ptr());
auto type = util::toATenDType(compiled_engine->exec_ctx->getEngine().getBindingDataType(o));
std::cout << pyt_idx << std::endl;
outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous());
gpu_handles.push_back(outputs[pyt_idx].data_ptr());
}

// Is this the right stream?
c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream(inputs[0].device().index());

ctx->enqueueV2(gpu_handles.data(), stream, nullptr);

return outputs;
}

std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> engine) {
// Verify calling convention (right to left or left to right)
LOG_DEBUG("Attempting to run engine (ID: " << std::hex << engine->name << ")");

auto io = engine->num_io;
auto ctx = engine->exec_ctx;
auto outputs = RunCudaEngine(ctx, io, inputs);
compiled_engine->exec_ctx->enqueueV2(gpu_handles.data(), stream, nullptr);

return outputs;
}
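
The key behavioral change in execute_engine above is that outputs are no longer appended in whatever order TensorRT enumerates its bindings; out_binding_map sends each engine output slot to the position PyTorch expects, which is what the pre-sized outputs vector and the outputs[pyt_idx] assignment accomplish. A self-contained sketch of that reordering, with made-up binding order and map contents (illustrative only, not read from a real engine):

// Illustrative sketch of the output-reordering idea; the binding order and
// map contents below are hypothetical, not taken from a real TensorRT engine.
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  // Suppose the engine enumerates its output bindings in this order...
  std::vector<std::string> engine_order = {"output_2", "output_0", "output_1"};

  // ...and the binding map (built from those names) records, for each engine
  // output slot, where that result belongs on the PyTorch side.
  std::unordered_map<uint64_t, uint64_t> out_binding_map = {{0, 2}, {1, 0}, {2, 1}};

  // Place each engine-ordered result at its PyTorch-expected position.
  std::vector<std::string> pyt_order(engine_order.size());
  for (uint64_t slot = 0; slot < engine_order.size(); slot++) {
    pyt_order[out_binding_map[slot]] = engine_order[slot];
  }

  for (const auto& name : pyt_order) {
    std::cout << name << "\n";  // output_0, output_1, output_2
  }
  return 0;
}
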
45 changes: 6 additions & 39 deletions tests/util/run_graph_engine.cpp
@@ -3,7 +3,9 @@
#include "c10/cuda/CUDAStream.h"
#include "torch/csrc/jit/ir/ir.h"
#include "torch/csrc/jit/ir/irparser.h"
#include "torch/custom_class.h"
#include "core/conversion/conversion.h"
#include "core/execution/execution.h"
#include "cuda_runtime_api.h"

#include <vector>
@@ -28,7 +30,7 @@ std::vector<core::conversion::InputRange> toInputRangesDynamic(std::vector<at::T
auto opt = core::util::toVec(i.sizes());

std::vector<int64_t> min_range(opt);
std::vector<int64_t> max_range(opt);
std::vector<int64_t> max_range(opt);

min_range[1] = ceil(opt[1]/2.0);
max_range[1] = 2*opt[1];
@@ -40,44 +42,9 @@ std::vector<core::conversion::InputRange> toInputRangesDynamic(std::vector<at::T
}

std::vector<at::Tensor> RunEngine(std::string& eng, std::vector<at::Tensor> inputs) {
auto rt = nvinfer1::createInferRuntime(core::util::logging::get_logger());
auto engine = rt->deserializeCudaEngine(eng.c_str(), eng.size());
auto ctx = engine->createExecutionContext();

std::vector<void*> gpu_handles;

std::vector<at::Tensor> contig_inputs{};
contig_inputs.reserve(inputs.size());
for (size_t i = 0; i < inputs.size(); i++) {
TRTORCH_CHECK(inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
auto expected_type = core::util::toATenDType(ctx->getEngine().getBindingDataType(i));
TRTORCH_CHECK(inputs[i].dtype() == expected_type, "Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype());
auto dims = core::util::toDimsPad(inputs[i].sizes(), 1);
auto shape = core::util::toVec(dims);
contig_inputs.push_back(inputs[i].view(shape).contiguous());
LOG_DEBUG("In shape:" << shape);
ctx->setBindingDimensions(i, dims);
gpu_handles.push_back(contig_inputs.back().data_ptr());
}

TRTORCH_CHECK(ctx->allInputDimensionsSpecified(), "Not enough inputs provided (execution.RunCudaEngine)");

std::vector<at::Tensor> outputs;
for (int64_t o = inputs.size(); o < engine->getNbBindings(); o++) {
auto out_shape = ctx->getBindingDimensions(o);
LOG_DEBUG("Output: " << engine->getBindingName(o) << " out shape: " << out_shape);
auto dims = core::util::toVec(out_shape);
auto type = core::util::toATenDType(ctx->getEngine().getBindingDataType(o));
outputs.push_back(at::empty(dims, {at::kCUDA}).to(type).contiguous());
gpu_handles.push_back(outputs[outputs.size() - 1].data_ptr());
}

// Is this the right stream?
c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream(inputs[0].device().index());

ctx->enqueueV2(gpu_handles.data(), stream, nullptr);

stream.synchronize();
LOG_DEBUG("Running TRT version");
auto engine_ptr = c10::make_intrusive<trtorch::core::execution::TRTEngine>("test_engine", eng);
auto outputs = trtorch::core::execution::execute_engine(inputs, engine_ptr);
return outputs;
}

