TensorRT OSS - 8.5.3.1 Release (#2630)
Signed-off-by: Asfiya Baig <asfiyab@nvidia.com>
asfiyab-nvidia committed Feb 3, 2023
1 parent 8dc4e70 commit b0c259a
Showing 549 changed files with 107,910 additions and 2,382 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -7,3 +7,4 @@ build/
*.sln
*.vcxproj
externals/
**/.DS_Store
12 changes: 12 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,17 @@
# TensorRT OSS Release Changelog

## [8.5.3 GA](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/tensorrt-8.html#rel-8-5-3) - 2023-01-30

TensorRT OSS release corresponding to TensorRT 8.5.3.1 GA release.
- Updates since [TensorRT 8.5.2 GA release](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/tensorrt-8.html#rel-8-5-2).
- Please refer to the [TensorRT 8.5.3 GA release notes](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/tensorrt-8.html#rel-8-5-3) for more information.

Key Features and Updates:

- Added the following HuggingFace demos: GPT-J-6B, GPT2-XL, and GPT2-Medium
- Added nvinfer1::plugin namespace
- Optimized KV Cache performance for T5

## [8.5.2 GA](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/tensorrt-8.html#rel-8-5-2) - 2022-12-12

TensorRT OSS release corresponding to TensorRT 8.5.2.2 GA release.
3 changes: 2 additions & 1 deletion CMakeLists.txt
@@ -100,6 +100,7 @@ message(STATUS "cuDNN version set to ${CUDNN_VERSION}")
set_ifndef(PROTOBUF_VERSION ${DEFAULT_PROTOBUF_VERSION})
message(STATUS "Protobuf version set to ${PROTOBUF_VERSION}")

set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
if (BUILD_PLUGINS OR BUILD_PARSERS)
include(third_party/protobuf.cmake)
@@ -132,7 +133,7 @@ endif()
find_library_create_target(nvinfer nvinfer SHARED ${TRT_LIB_DIR})
find_library_create_target(nvuffparser nvparsers SHARED ${TRT_LIB_DIR})

find_library(CUDART_LIB cudart HINTS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib/x64 lib64)
find_library(CUDART_LIB cudart_static HINTS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib/x64 lib64)

if (NOT MSVC)
find_library(RT_LIB rt)
6 changes: 3 additions & 3 deletions README.md
@@ -26,7 +26,7 @@ You can skip the **Build** section to enjoy TensorRT with Python.
To build the TensorRT-OSS components, you will first need the following software packages.

**TensorRT GA build**
* [TensorRT](https://developer.nvidia.com/nvidia-tensorrt-download) v8.5.2.2
* [TensorRT](https://developer.nvidia.com/nvidia-tensorrt-download) v8.5.3.1

**System Packages**
* [CUDA](https://developer.nvidia.com/cuda-toolkit)
@@ -80,8 +80,8 @@ To build the TensorRT-OSS components, you will first need the following software

```bash
cd ~/Downloads
tar -xvzf TensorRT-8.5.1.7.Linux.x86_64-gnu.cuda-11.8.cudnn8.6.tar.gz
export TRT_LIBPATH=`pwd`/TensorRT-8.5.1.7
tar -xvzf TensorRT-8.5.3.1.Linux.x86_64-gnu.cuda-11.8.cudnn8.6.tar.gz
export TRT_LIBPATH=`pwd`/TensorRT-8.5.3.1
```


2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
8.5.2.2
8.5.3.1
9 changes: 5 additions & 4 deletions demo/BERT/infer_c/bert_infer.h
@@ -68,14 +68,14 @@ struct BertInference
std::vector<char> bytes(fsize);
input.read(bytes.data(), fsize);

auto runtime = TrtUniquePtr<IRuntime>(createInferRuntime(gLogger));
if (runtime == nullptr)
mRuntime = TrtUniquePtr<IRuntime>(createInferRuntime(gLogger));
if (mRuntime == nullptr)
{
gLogError << "Error creating TRT runtime\n";
gLogError << "Error creating TRT mRuntime\n";
exit(-1);
}

mEngine = TrtUniquePtr<ICudaEngine>(runtime->deserializeCudaEngine(bytes.data(), bytes.size()));
mEngine = TrtUniquePtr<ICudaEngine>(mRuntime->deserializeCudaEngine(bytes.data(), bytes.size()));
if (mEngine == nullptr)
{
gLogError << "Error deserializing CUDA engine\n";
@@ -338,6 +338,7 @@ struct BertInference
const int mSeqLength;
const bool mEnableGraph;

TrtUniquePtr<IRuntime> mRuntime{nullptr};
TrtUniquePtr<ICudaEngine> mEngine{nullptr};
TrtUniquePtr<IExecutionContext> mContext{nullptr};
std::vector<void*> mBindings;
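The bert_infer.h change above promotes the deserializing `IRuntime` from a constructor-local variable to the `mRuntime` member, presumably so the runtime stays alive for as long as the engine it deserialized. A minimal sketch of the same lifetime rule with the TensorRT Python API (the class name, engine path argument, and logger severity are illustrative, not part of this commit):

```python
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class BertInfer:
    def __init__(self, engine_path):
        # Keep the runtime on the object so it is not collected before the
        # engine that was deserialized from it.
        self.runtime = trt.Runtime(TRT_LOGGER)
        with open(engine_path, "rb") as f:
            self.engine = self.runtime.deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()
```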
24 changes: 24 additions & 0 deletions demo/BERT/infer_c/perf.cpp
@@ -52,6 +52,29 @@ void printHelpInfo()
std::cout << std::endl;
}

void printDeviceInfo()
{
int32_t device{};
gpuErrChk(cudaGetDevice(&device));

cudaDeviceProp properties{};
gpuErrChk(cudaGetDeviceProperties(&properties, device));

std::cout << "=== Device Information ===" << std::endl;
std::cout << "Selected Device: " << properties.name << std::endl;
std::cout << "Compute Capability: " << properties.major << "." << properties.minor << std::endl;
std::cout << "SMs: " << properties.multiProcessorCount << std::endl;
std::cout << "Compute Clock Rate: " << properties.clockRate / 1000000.0F << " GHz" << std::endl;
std::cout << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB" << std::endl;
std::cout << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB" << std::endl;
std::cout << "Memory Bus Width: " << properties.memoryBusWidth << " bits"
<< " (ECC " << (properties.ECCEnabled != 0 ? "enabled" : "disabled") << ")" << std::endl;
std::cout << "Memory Clock Rate: " << properties.memoryClockRate / 1000000.0F << " GHz" << std::endl;
std::cout << "=== Software Information ===" << std::endl;
std::cout << "Build time TensorRT version: " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR << "." << NV_TENSORRT_PATCH << std::endl;
std::cout << "Runtime linked TensorRT version: " << getInferLibVersion() << std::endl;
}

int main(int argc, char* argv[])
{

@@ -70,6 +93,7 @@ int main(int argc, char* argv[])
printHelpInfo();
return EXIT_FAILURE;
}
printDeviceInfo();

if (args.batchSize.empty())
{
1 change: 1 addition & 0 deletions demo/BERT/scripts/inference_benchmark.sh
@@ -98,6 +98,7 @@ if [ "$arg_help" -eq "1" ]; then
fi

mkdir -p /workspace/TensorRT/demo/BERT/engines
nvidia-smi -q

# BERT BASE

1 change: 1 addition & 0 deletions demo/BERT/scripts/inference_benchmark_megatron.sh
@@ -105,6 +105,7 @@ if [ "$arg_help" -eq "1" ]; then
fi;

mkdir -p /workspace/TensorRT/demo/BERT/engines
nvidia-smi -q

# BERT LARGE

5 changes: 5 additions & 0 deletions demo/Diffusion/demo-diffusion.py
@@ -28,6 +28,7 @@
from transformers import CLIPTokenizer
import tensorrt as trt
from utilities import Engine, DPMScheduler, LMSDiscreteScheduler, save_image, TRT_LOGGER
import gc

def parseArgs():
parser = argparse.ArgumentParser(description="Options for Stable Diffusion Demo")
@@ -226,6 +227,10 @@ def loadEngines(
output_names = obj.get_output_names(),
dynamic_axes=obj.get_dynamic_axes(),
)

del model
torch.cuda.empty_cache()
gc.collect()
else:
print(f"Found cached model: {onnx_path}")

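The demo-diffusion.py change above releases each PyTorch model once its ONNX export is written, freeing GPU memory for the next export and for engine building. A minimal sketch of that pattern (the helper name and export arguments are illustrative):

```python
import gc
import torch

def export_and_release(model, sample_inputs, onnx_path):
    # Export first, then drop the PyTorch copy of the weights.
    torch.onnx.export(model, sample_inputs, onnx_path)
    del model                 # remove the last Python reference
    torch.cuda.empty_cache()  # return unused cached CUDA memory
    gc.collect()              # finalize any lingering tensor objects
```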
8 changes: 7 additions & 1 deletion demo/Diffusion/models.py
@@ -271,6 +271,12 @@ def insert_layernorm_plugin(self):
else:
inputTensor = node.i().inputs[0] # UNet and VAE

# The first axis to normalize from can be inferred from the size of the `axes`
# parameter of (any of) the `ReduceMean` node(s)
reduceMeanNode = node.o().o(0).o()
assert reduceMeanNode.op == "ReduceMean"
firstNormAxis = -1 * np.size(np.array(reduceMeanNode.attrs["axes"]))

gammaNode = node.o().o().o().o().o().o().o()
index = [type(i) == gs.ir.tensor.Constant for i in gammaNode.inputs].index(True)
gamma = np.array(deepcopy(gammaNode.inputs[index].values.tolist()), dtype=np.float32)
@@ -283,7 +289,7 @@

inputList = [inputTensor, constantGamma, constantBeta]
layerNormV = gs.Variable("LayerNormV-" + str(nLayerNormPlugin), np.dtype(np.float32), inputTensor.shape)
layerNormN = gs.Node("LayerNorm", "LayerNormN-" + str(nLayerNormPlugin), inputs=inputList, attrs=OrderedDict([('epsilon', 1.e-5)]), outputs=[layerNormV])
layerNormN = gs.Node("LayerNorm", "LayerNormN-" + str(nLayerNormPlugin), inputs=inputList, attrs=OrderedDict([('epsilon', 1.e-5), ('axis', firstNormAxis)]), outputs=[layerNormV])
self.graph.nodes.append(layerNormN)
nLayerNormPlugin += 1

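The insert_layernorm_plugin change above infers the first axis to normalize from the length of the `ReduceMean` node's `axes` attribute and records it as an `axis` attribute on the inserted LayerNorm plugin node. A small sketch of that inference with toy tensors (variable names and shapes are placeholders, not taken from the Stable Diffusion graphs):

```python
from collections import OrderedDict

import numpy as np
import onnx_graphsurgeon as gs

# ReduceMean over the last k dimensions means LayerNorm starts at axis -k.
reduce_mean_axes = np.array([-1])                   # e.g. mean over the last dim
first_norm_axis = -1 * np.size(reduce_mean_axes)    # -> -1

x = gs.Variable("x", np.dtype(np.float32), None)
y = gs.Variable("y", np.dtype(np.float32), None)
layer_norm = gs.Node(
    "LayerNorm", "LayerNormN-0", inputs=[x], outputs=[y],
    attrs=OrderedDict([("epsilon", 1.0e-5), ("axis", first_norm_axis)]),
)
```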
7 changes: 7 additions & 0 deletions demo/HuggingFace/BART/BARTModelConfig.py
@@ -96,6 +96,13 @@ def add_benchmarking_args(parser: argparse.ArgumentParser) -> None:
class BARTModelTRTConfig(NNConfig):

TARGET_MODELS = ["facebook/bart-base", "facebook/bart-large", "facebook/bart-large-cnn", "facebook/mbart-large-50"]

MAX_DECODER_WORKSPACE_MB = {
TARGET_MODELS[0]: 3072,
TARGET_MODELS[1]: 3072,
TARGET_MODELS[2]: 3072,
TARGET_MODELS[3]: 3072,
}

# bart-base: 12-layer, 768-hidden, 139M parameters
# bart-large: 24-layer, 1024-hidden, 406M parameters
6 changes: 6 additions & 0 deletions demo/HuggingFace/BART/checkpoint.toml
@@ -9,12 +9,18 @@ label = "NVIDIA TensorRT-based applications perform up to 36X faster than CPU-on

[BART.all."facebook/bart-large".all.summarization]

[BART.all."facebook/mbart-large-50".all.summarization]

label = "NVIDIA TensorRT-based applications perform up to 36X faster than CPU-only platforms during inference, enabling developers to optimize neural network models trained on all major frameworks, calibrate for lower precision with high accuracy, and deploy to hyperscale data centers, embedded platforms, or automotive product platforms. TensorTM, built on the NVIDIA CUDA parallel programming model, enables developers of applications to optimise inference by leveraging libraries, development tools, and technologies in CUDA-X for AI, autonomous machines, high performance computing, and graphics. With new NVIDIA Ampere Architecture GPUs, Tensor RT also uses sparse tensor cores for an additional performance boost."
label = "NVIDIA TensorRT-based applications perform up to 36X faster than CPU-only platforms during inference, enabling developers to optimize neural network models trained on all major frameworks, calibrate for lower precision with high accuracy, and deploy to hyperscale data centers, embedded platforms, or automotive product platforms. Tensor RT is the first GPU-based inference platform to use NVIDIA's CUDA-X architecture. TenseRT, built on the NVIDIA CUDA parallel programming model, enables developers to analyze neural network data and perform inference by leveraging libraries, development tools, and technologies in CUDA, including CUDA for AI, autonomous machines, high performance computing, and graphics. With new NVIDIA Ampere Architecture GPUs, TensorRex also uses sparse tensor cores for an additional performance boost."

[BART.all."facebook/bart-large-cnn".all.summarization]

label = "TensorRT-based applications perform up to 36X faster than CPU-only platforms during inference. TensorRT is built on the NVIDIA CUDA parallel programming model. With new NVIDIA Ampere Architecture GPUs, Tensor RT also uses sparse tensor cores for an additional performance boost."

[BART.all."facebook/mbart-large-50".all.summarization]

label = "NVIDIA TensorRT-based applications perform up to 36X faster than CPU-only platforms during inference, enabling developers to optimize neural network models trained on all major frameworks, calibrate for lower precision with high accuracy, and deploy to hyperscale data centers, embedded platforms, or automotive product platforms. TensorTM, built on the NVIDIA CUDA parallel programming model, enables developers of applications to optimise inference by leveraging libraries, development tools, and technologies in CUDA-X for AI, autonomous machines, high performance computing, and graphics. With new NVIDIA Ampere Architecture GPUs, Tensor RT also uses sparse tensor cores for an additional performance boost."

# There is a weird bug in Frameworks where the output is incorrect
# when compared to OnnxRT. Frameworks only the first two sentence is generated.
46 changes: 28 additions & 18 deletions demo/HuggingFace/BART/export.py
@@ -125,7 +125,7 @@ def __init__(self, decoder, lm_head, final_logits_bias, config):
@staticmethod
def _reorder_cache(past, beam_idx):
return BartForConditionalGeneration._reorder_cache(past, beam_idx)

def prepare_inputs_for_generation(self, input_ids, past=None, use_cache=None, **kwargs):
# cut decoder_input_ids if past is used
if past is not None:
Expand All @@ -139,7 +139,7 @@ def prepare_inputs_for_generation(self, input_ids, past=None, use_cache=None, **
# To really enable KV cache in HuggingFace, these args must be passed. Just specifying use_cache = True in BartConfig is not enough. Also see the additional "past_key_values" fields in the forward() return below.
if self.config.use_cache:
ret["use_cache"] = use_cache
ret["past_key_values"] = past
ret["past_key_values"] = past

return ret

@@ -198,10 +198,10 @@ def __init__(self, model, network_metadata):

# TRT Engine File Encoding #
class BARTDecoderTRTEngine(TRTEngineFile):
DEFAULT_TRT_WORKSPACE_MB = 3072

def __init__(self, model, network_metadata):
super().__init__(model, BARTDecoderConverter, network_metadata)
self.max_trt_workspace = BARTModelTRTConfig.MAX_DECODER_WORKSPACE_MB[network_metadata.variant]

def get_network_definition(self, network_definition):
return add_extra_fp32(network_definition)
Expand All @@ -211,10 +211,10 @@ def use_obey_precision_constraints(self):


class BARTEncoderTRTEngine(TRTEngineFile):
DEFAULT_TRT_WORKSPACE_MB = 2048

def __init__(self, model, network_metadata):
super().__init__(model, BARTEncoderConverter, network_metadata)
self.max_trt_workspace = 2048

def get_network_definition(self, network_definition):
return add_extra_fp32(network_definition)
@@ -281,29 +281,37 @@ def _export_forward(*args, **kwargs):
**inputs.get_torch_dynamic_axis_encoding(),
**outputs.get_torch_dynamic_axis_encoding(),
},
training=False,
training=torch.onnx.TrainingMode.EVAL,
**opt_args
)
else:
encoder_hidden_states = simplified_encoder(input_ids)
decoder_output = decoder_with_lm_head_and_bias(input_ids[:,:-1], encoder_hidden_states) # decoder output at t-1 step (logits, past_key_values from 0 to t-1)
past_key_values = decoder_output[1]

decoder_root, decoder_fullname = os.path.split(output_fpath)
# Split kv and non kv onnx into separate folders to avoid weight overlap
non_kv_root = os.path.join(decoder_root, "non-kv")
kv_root = os.path.join(decoder_root, "kv")
decoder_name, decoder_ext = os.path.splitext(decoder_fullname)
non_kv_fpath = os.path.join(non_kv_root, decoder_name + "-non-kv" + decoder_ext)
kv_fpath = os.path.join(kv_root, decoder_fullname)

# This code allows for huggingface compatible torch class to use onnx exporter (change just before onnx.export)
old_forward = decoder_with_lm_head_and_bias.forward
def _export_forward(input_ids, encoder_hidden_states, past_key_values):
result = old_forward(input_ids, encoder_hidden_states, past_key_values=past_key_values)
return (result[0], result[1])
decoder_with_lm_head_and_bias.forward = _export_forward

torch.onnx.export(
decoder_with_lm_head_and_bias,
(input_ids[:,-1:], encoder_hidden_states,past_key_values),
# (1) input_ids should be the t token (last one) while past_key_values is 0 to t-1 caches
# (1) input_ids should be the t token (last one) while past_key_values is 0 to t-1 caches
# (2) since past_key_values is kwargs, ideally use "(input_ids[:,-1:], encoder_hidden_states, {"past_key_values": past_key_values})",
# but onnx.export seems to unable to take kwargs properly (although PyTorch 1.11 claims it supports already).
# but onnx.export seems to unable to take kwargs properly (although PyTorch 1.11 claims it supports already).
# Therefore, we need to wrap inside _export_forward() and make past_key_values indeed a kwargs
output_fpath,
kv_fpath,
export_params=True,
opset_version=12,
input_names=inputs.get_names(),
Expand All @@ -312,7 +320,7 @@ def _export_forward(input_ids, encoder_hidden_states, past_key_values):
**inputs.get_torch_dynamic_axis_encoding(),
**outputs.get_torch_dynamic_axis_encoding(),
},
training=False,
training=torch.onnx.TrainingMode.EVAL,
**opt_args
)

@@ -321,9 +329,6 @@ def _export_forward(input_ids, encoder_hidden_states, use_cache):
result = old_forward(input_ids, encoder_hidden_states, use_cache=use_cache)
return (result[0], result[1])
decoder_with_lm_head_and_bias.forward = _export_forward

fpath_root, fpath_ext = os.path.splitext(output_fpath)
output_fpath_non_kv = fpath_root + '-non-kv' + fpath_ext

# inputs are same as non-kv model
# outputs are same as kv model
@@ -334,7 +339,7 @@ def _export_forward(input_ids, encoder_hidden_states, use_cache):
torch.onnx.export(
decoder_with_lm_head_and_bias,
(input_ids[:,-1:], encoder_hidden_states, True),
output_fpath_non_kv,
non_kv_fpath,
export_params=True,
opset_version=12,
input_names=inputs_non_kv.get_names(),
@@ -343,14 +348,19 @@ def _export_forward(input_ids, encoder_hidden_states, use_cache):
**inputs_non_kv.get_torch_dynamic_axis_encoding(),
**outputs.get_torch_dynamic_axis_encoding(),
},
training=False,
training=torch.onnx.TrainingMode.EVAL,
**opt_args
)

if network_metadata.precision.fp16:
G_LOGGER.debug("Clamping FP16 weights for BART")
# move_t5_cast_op(output_fpath, output_fpath) # BART doesn't have T5's Add-Cast-Pow ordering issue
clamp_weights_onnx_to_fp16_bounds(output_fpath, output_fpath)
if network_metadata.other.kv_cache:
# both onnx files need clamp
clamp_weights_onnx_to_fp16_bounds(non_kv_fpath, non_kv_fpath)
clamp_weights_onnx_to_fp16_bounds(kv_fpath, kv_fpath)
else:
clamp_weights_onnx_to_fp16_bounds(output_fpath, output_fpath)

return BARTDecoderONNXFile(output_fpath, network_metadata)

@@ -396,7 +406,7 @@ def torch_to_onnx(
**inputs.get_torch_dynamic_axis_encoding(),
**outputs.get_torch_dynamic_axis_encoding(),
},
training=False,
training=torch.onnx.TrainingMode.EVAL,
**opt_args
)

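The export.py changes above route the KV-cache decoder export through a wrapped forward, because `torch.onnx.export` traces positional inputs and does not take `past_key_values` reliably as a keyword argument, and they write the kv and non-kv ONNX files into separate folders so their weights do not overlap. A condensed sketch of the wrapping trick (the helper name and argument list are illustrative, not the demo's API):

```python
import torch

def export_decoder_with_kv(decoder, input_ids, encoder_hidden_states, past_key_values, kv_fpath):
    old_forward = decoder.forward

    def _export_forward(input_ids, encoder_hidden_states, past_key_values):
        # Bind past_key_values positionally; keep only (logits, present_key_values).
        result = old_forward(input_ids, encoder_hidden_states, past_key_values=past_key_values)
        return (result[0], result[1])

    decoder.forward = _export_forward
    try:
        torch.onnx.export(
            decoder,
            (input_ids[:, -1:], encoder_hidden_states, past_key_values),
            kv_fpath,
            export_params=True,
            opset_version=12,
            training=torch.onnx.TrainingMode.EVAL,
        )
    finally:
        decoder.forward = old_forward  # restore the original HuggingFace forward
```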