TensorRT OSS - 8.5.3.1 Release (#2630)
Signed-off-by: Asfiya Baig <asfiyab@nvidia.com>
asfiyab-nvidia committed Feb 3, 2023
1 parent 8dc4e70 commit b0c259a
Showing 549 changed files with 107,910 additions and 2,382 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -7,3 +7,4 @@ build/
*.sln
*.vcxproj
externals/
**/.DS_Store
12 changes: 12 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,17 @@
# TensorRT OSS Release Changelog

## [8.5.3 GA](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/tensorrt-8.html#rel-8-5-3) - 2023-01-30

TensorRT OSS release corresponding to TensorRT 8.5.3.1 GA release.
- Updates since [TensorRT 8.5.2 GA release](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/tensorrt-8.html#rel-8-5-2).
- Please refer to the [TensorRT 8.5.3 GA release notes](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/tensorrt-8.html#rel-8-5-3) for more information.

Key Features and Updates:

- Added the following HuggingFace demos: GPT-J-6B, GPT2-XL, and GPT2-Medium
- Added nvinfer1::plugin namespace
- Optimized KV Cache performance for T5

## [8.5.2 GA](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/tensorrt-8.html#rel-8-5-2) - 2022-12-12

TensorRT OSS release corresponding to TensorRT 8.5.2.2 GA release.
3 changes: 2 additions & 1 deletion CMakeLists.txt
@@ -100,6 +100,7 @@ message(STATUS "cuDNN version set to ${CUDNN_VERSION}")
set_ifndef(PROTOBUF_VERSION ${DEFAULT_PROTOBUF_VERSION})
message(STATUS "Protobuf version set to ${PROTOBUF_VERSION}")

set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
if (BUILD_PLUGINS OR BUILD_PARSERS)
include(third_party/protobuf.cmake)
@@ -132,7 +133,7 @@ endif()
find_library_create_target(nvinfer nvinfer SHARED ${TRT_LIB_DIR})
find_library_create_target(nvuffparser nvparsers SHARED ${TRT_LIB_DIR})

find_library(CUDART_LIB cudart HINTS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib/x64 lib64)
find_library(CUDART_LIB cudart_static HINTS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib/x64 lib64)

if (NOT MSVC)
find_library(RT_LIB rt)
6 changes: 3 additions & 3 deletions README.md
@@ -26,7 +26,7 @@ You can skip the **Build** section to enjoy TensorRT with Python.
To build the TensorRT-OSS components, you will first need the following software packages.

**TensorRT GA build**
* [TensorRT](https://developer.nvidia.com/nvidia-tensorrt-download) v8.5.2.2
* [TensorRT](https://developer.nvidia.com/nvidia-tensorrt-download) v8.5.3.1

**System Packages**
* [CUDA](https://developer.nvidia.com/cuda-toolkit)
@@ -80,8 +80,8 @@ To build the TensorRT-OSS components, you will first need the following software

```bash
cd ~/Downloads
tar -xvzf TensorRT-8.5.1.7.Linux.x86_64-gnu.cuda-11.8.cudnn8.6.tar.gz
export TRT_LIBPATH=`pwd`/TensorRT-8.5.1.7
tar -xvzf TensorRT-8.5.3.1.Linux.x86_64-gnu.cuda-11.8.cudnn8.6.tar.gz
export TRT_LIBPATH=`pwd`/TensorRT-8.5.3.1
```


2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
8.5.2.2
8.5.3.1
9 changes: 5 additions & 4 deletions demo/BERT/infer_c/bert_infer.h
@@ -68,14 +68,14 @@ struct BertInference
std::vector<char> bytes(fsize);
input.read(bytes.data(), fsize);

auto runtime = TrtUniquePtr<IRuntime>(createInferRuntime(gLogger));
if (runtime == nullptr)
mRuntime = TrtUniquePtr<IRuntime>(createInferRuntime(gLogger));
if (mRuntime == nullptr)
{
gLogError << "Error creating TRT runtime\n";
gLogError << "Error creating TRT mRuntime\n";
exit(-1);
}

mEngine = TrtUniquePtr<ICudaEngine>(runtime->deserializeCudaEngine(bytes.data(), bytes.size()));
mEngine = TrtUniquePtr<ICudaEngine>(mRuntime->deserializeCudaEngine(bytes.data(), bytes.size()));
if (mEngine == nullptr)
{
gLogError << "Error deserializing CUDA engine\n";
@@ -338,6 +338,7 @@ struct BertInference
const int mSeqLength;
const bool mEnableGraph;

TrtUniquePtr<IRuntime> mRuntime{nullptr};
TrtUniquePtr<ICudaEngine> mEngine{nullptr};
TrtUniquePtr<IExecutionContext> mContext{nullptr};
std::vector<void*> mBindings;
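The bert_infer.h change above promotes the deserializing `IRuntime` from a constructor-local variable to the `mRuntime` member, presumably so the runtime stays alive for as long as the engine it deserialized. A minimal sketch of the same lifetime rule with the TensorRT Python API (the class name, engine path argument, and logger severity are illustrative, not part of this commit):

```python
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class BertInfer:
    def __init__(self, engine_path):
        # Keep the runtime on the object so it is not collected before the
        # engine that was deserialized from it.
        self.runtime = trt.Runtime(TRT_LOGGER)
        with open(engine_path, "rb") as f:
            self.engine = self.runtime.deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()
```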
24 changes: 24 additions & 0 deletions demo/BERT/infer_c/perf.cpp
@@ -52,6 +52,29 @@ void printHelpInfo()
std::cout << std::endl;
}

void printDeviceInfo()
{
int32_t device{};
gpuErrChk(cudaGetDevice(&device));

cudaDeviceProp properties{};
gpuErrChk(cudaGetDeviceProperties(&properties, device));

std::cout << "=== Device Information ===" << std::endl;
std::cout << "Selected Device: " << properties.name << std::endl;
std::cout << "Compute Capability: " << properties.major << "." << properties.minor << std::endl;
std::cout << "SMs: " << properties.multiProcessorCount << std::endl;
std::cout << "Compute Clock Rate: " << properties.clockRate / 1000000.0F << " GHz" << std::endl;
std::cout << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB" << std::endl;
std::cout << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB" << std::endl;
std::cout << "Memory Bus Width: " << properties.memoryBusWidth << " bits"
<< " (ECC " << (properties.ECCEnabled != 0 ? "enabled" : "disabled") << ")" << std::endl;
std::cout << "Memory Clock Rate: " << properties.memoryClockRate / 1000000.0F << " GHz" << std::endl;
std::cout << "=== Software Information ===" << std::endl;
std::cout << "Build time TensorRT version: " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR << "." << NV_TENSORRT_PATCH << std::endl;
std::cout << "Runtime linked TensorRT version: " << getInferLibVersion() << std::endl;
}

int main(int argc, char* argv[])
{

@@ -70,6 +93,7 @@ int main(int argc, char* argv[])
printHelpInfo();
return EXIT_FAILURE;
}
printDeviceInfo();

if (args.batchSize.empty())
{
1 change: 1 addition & 0 deletions demo/BERT/scripts/inference_benchmark.sh
@@ -98,6 +98,7 @@ if [ "$arg_help" -eq "1" ]; then
fi

mkdir -p /workspace/TensorRT/demo/BERT/engines
nvidia-smi -q

# BERT BASE

1 change: 1 addition & 0 deletions demo/BERT/scripts/inference_benchmark_megatron.sh
@@ -105,6 +105,7 @@ if [ "$arg_help" -eq "1" ]; then
fi;

mkdir -p /workspace/TensorRT/demo/BERT/engines
nvidia-smi -q

# BERT LARGE

5 changes: 5 additions & 0 deletions demo/Diffusion/demo-diffusion.py
@@ -28,6 +28,7 @@
from transformers import CLIPTokenizer
import tensorrt as trt
from utilities import Engine, DPMScheduler, LMSDiscreteScheduler, save_image, TRT_LOGGER
import gc

def parseArgs():
parser = argparse.ArgumentParser(description="Options for Stable Diffusion Demo")
@@ -226,6 +227,10 @@ def loadEngines(
output_names = obj.get_output_names(),
dynamic_axes=obj.get_dynamic_axes(),
)

del model
torch.cuda.empty_cache()
gc.collect()
else:
print(f"Found cached model: {onnx_path}")

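The demo-diffusion.py change above releases each PyTorch model once its ONNX export is written, freeing GPU memory for the next export and for engine building. A minimal sketch of that pattern (the helper name and export arguments are illustrative):

```python
import gc
import torch

def export_and_release(model, sample_inputs, onnx_path):
    # Export first, then drop the PyTorch copy of the weights.
    torch.onnx.export(model, sample_inputs, onnx_path)
    del model                 # remove the last Python reference
    torch.cuda.empty_cache()  # return unused cached CUDA memory
    gc.collect()              # finalize any lingering tensor objects
```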
8 changes: 7 additions & 1 deletion demo/Diffusion/models.py
@@ -271,6 +271,12 @@ def insert_layernorm_plugin(self):
else:
inputTensor = node.i().inputs[0] # UNet and VAE

# The first axis to normalize from can be inferred from the size of the `axes`
# parameter of (any of) the `ReduceMean` node(s)
reduceMeanNode = node.o().o(0).o()
assert reduceMeanNode.op == "ReduceMean"
firstNormAxis = -1 * np.size(np.array(reduceMeanNode.attrs["axes"]))

gammaNode = node.o().o().o().o().o().o().o()
index = [type(i) == gs.ir.tensor.Constant for i in gammaNode.inputs].index(True)
gamma = np.array(deepcopy(gammaNode.inputs[index].values.tolist()), dtype=np.float32)
@@ -283,7 +289,7 @@

inputList = [inputTensor, constantGamma, constantBeta]
layerNormV = gs.Variable("LayerNormV-" + str(nLayerNormPlugin), np.dtype(np.float32), inputTensor.shape)
layerNormN = gs.Node("LayerNorm", "LayerNormN-" + str(nLayerNormPlugin), inputs=inputList, attrs=OrderedDict([('epsilon', 1.e-5)]), outputs=[layerNormV])
layerNormN = gs.Node("LayerNorm", "LayerNormN-" + str(nLayerNormPlugin), inputs=inputList, attrs=OrderedDict([('epsilon', 1.e-5), ('axis', firstNormAxis)]), outputs=[layerNormV])
self.graph.nodes.append(layerNormN)
nLayerNormPlugin += 1

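The insert_layernorm_plugin change above infers the first axis to normalize from the length of the `ReduceMean` node's `axes` attribute and records it as an `axis` attribute on the inserted LayerNorm plugin node. A small sketch of that inference with toy tensors (variable names and shapes are placeholders, not taken from the Stable Diffusion graphs):

```python
from collections import OrderedDict

import numpy as np
import onnx_graphsurgeon as gs

# ReduceMean over the last k dimensions means LayerNorm starts at axis -k.
reduce_mean_axes = np.array([-1])                   # e.g. mean over the last dim
first_norm_axis = -1 * np.size(reduce_mean_axes)    # -> -1

x = gs.Variable("x", np.dtype(np.float32), None)
y = gs.Variable("y", np.dtype(np.float32), None)
layer_norm = gs.Node(
    "LayerNorm", "LayerNormN-0", inputs=[x], outputs=[y],
    attrs=OrderedDict([("epsilon", 1.0e-5), ("axis", first_norm_axis)]),
)
```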
7 changes: 7 additions & 0 deletions demo/HuggingFace/BART/BARTModelConfig.py
@@ -96,6 +96,13 @@ def add_benchmarking_args(parser: argparse.ArgumentParser) -> None:
class BARTModelTRTConfig(NNConfig):

TARGET_MODELS = ["facebook/bart-base", "facebook/bart-large", "facebook/bart-large-cnn", "facebook/mbart-large-50"]

MAX_DECODER_WORKSPACE_MB = {
TARGET_MODELS[0]: 3072,
TARGET_MODELS[1]: 3072,
TARGET_MODELS[2]: 3072,
TARGET_MODELS[3]: 3072,
}

# bart-base: 12-layer, 768-hidden, 139M parameters
# bart-large: 24-layer, 1024-hidden, 406M parameters
6 changes: 6 additions & 0 deletions demo/HuggingFace/BART/checkpoint.toml
@@ -9,12 +9,18 @@ label = "NVIDIA TensorRT-based applications perform up to 36X faster than CPU-on

[BART.all."facebook/bart-large".all.summarization]

[BART.all."facebook/mbart-large-50".all.summarization]

label = "NVIDIA TensorRT-based applications perform up to 36X faster than CPU-only platforms during inference, enabling developers to optimize neural network models trained on all major frameworks, calibrate for lower precision with high accuracy, and deploy to hyperscale data centers, embedded platforms, or automotive product platforms. TensorTM, built on the NVIDIA CUDA parallel programming model, enables developers of applications to optimise inference by leveraging libraries, development tools, and technologies in CUDA-X for AI, autonomous machines, high performance computing, and graphics. With new NVIDIA Ampere Architecture GPUs, Tensor RT also uses sparse tensor cores for an additional performance boost."
label = "NVIDIA TensorRT-based applications perform up to 36X faster than CPU-only platforms during inference, enabling developers to optimize neural network models trained on all major frameworks, calibrate for lower precision with high accuracy, and deploy to hyperscale data centers, embedded platforms, or automotive product platforms. Tensor RT is the first GPU-based inference platform to use NVIDIA's CUDA-X architecture. TenseRT, built on the NVIDIA CUDA parallel programming model, enables developers to analyze neural network data and perform inference by leveraging libraries, development tools, and technologies in CUDA, including CUDA for AI, autonomous machines, high performance computing, and graphics. With new NVIDIA Ampere Architecture GPUs, TensorRex also uses sparse tensor cores for an additional performance boost."

[BART.all."facebook/bart-large-cnn".all.summarization]

label = "TensorRT-based applications perform up to 36X faster than CPU-only platforms during inference. TensorRT is built on the NVIDIA CUDA parallel programming model. With new NVIDIA Ampere Architecture GPUs, Tensor RT also uses sparse tensor cores for an additional performance boost."

[BART.all."facebook/mbart-large-50".all.summarization]

label = "NVIDIA TensorRT-based applications perform up to 36X faster than CPU-only platforms during inference, enabling developers to optimize neural network models trained on all major frameworks, calibrate for lower precision with high accuracy, and deploy to hyperscale data centers, embedded platforms, or automotive product platforms. TensorTM, built on the NVIDIA CUDA parallel programming model, enables developers of applications to optimise inference by leveraging libraries, development tools, and technologies in CUDA-X for AI, autonomous machines, high performance computing, and graphics. With new NVIDIA Ampere Architecture GPUs, Tensor RT also uses sparse tensor cores for an additional performance boost."

# There is a weird bug in Frameworks where the output is incorrect
# when compared to OnnxRT. Frameworks only the first two sentence is generated.
46 changes: 28 additions & 18 deletions demo/HuggingFace/BART/export.py
@@ -125,7 +125,7 @@ def __init__(self, decoder, lm_head, final_logits_bias, config):
@staticmethod
def _reorder_cache(past, beam_idx):
return BartForConditionalGeneration._reorder_cache(past, beam_idx)

def prepare_inputs_for_generation(self, input_ids, past=None, use_cache=None, **kwargs):
# cut decoder_input_ids if past is used
if past is not None:
Expand All @@ -139,7 +139,7 @@ def prepare_inputs_for_generation(self, input_ids, past=None, use_cache=None, **
# To really enable KV cache in HuggingFace, these args must be passed. Just specifying use_cache = True in BartConfig is not enough. Also see the additional "past_key_values" fields in the forward() return below.
if self.config.use_cache:
ret["use_cache"] = use_cache
ret["past_key_values"] = past
ret["past_key_values"] = past

return ret

@@ -198,10 +198,10 @@ def __init__(self, model, network_metadata):

# TRT Engine File Encoding #
class BARTDecoderTRTEngine(TRTEngineFile):
DEFAULT_TRT_WORKSPACE_MB = 3072

def __init__(self, model, network_metadata):
super().__init__(model, BARTDecoderConverter, network_metadata)
self.max_trt_workspace = BARTModelTRTConfig.MAX_DECODER_WORKSPACE_MB[network_metadata.variant]

def get_network_definition(self, network_definition):
return add_extra_fp32(network_definition)
Expand All @@ -211,10 +211,10 @@ def use_obey_precision_constraints(self):


class BARTEncoderTRTEngine(TRTEngineFile):
DEFAULT_TRT_WORKSPACE_MB = 2048

def __init__(self, model, network_metadata):
super().__init__(model, BARTEncoderConverter, network_metadata)
self.max_trt_workspace = 2048

def get_network_definition(self, network_definition):
return add_extra_fp32(network_definition)
@@ -281,29 +281,37 @@ def _export_forward(*args, **kwargs):
**inputs.get_torch_dynamic_axis_encoding(),
**outputs.get_torch_dynamic_axis_encoding(),
},
training=False,
training=torch.onnx.TrainingMode.EVAL,
**opt_args
)
else:
encoder_hidden_states = simplified_encoder(input_ids)
decoder_output = decoder_with_lm_head_and_bias(input_ids[:,:-1], encoder_hidden_states) # decoder output at t-1 step (logits, past_key_values from 0 to t-1)
past_key_values = decoder_output[1]

decoder_root, decoder_fullname = os.path.split(output_fpath)
# Split kv and non kv onnx into separate folders to avoid weight overlap
non_kv_root = os.path.join(decoder_root, "non-kv")
kv_root = os.path.join(decoder_root, "kv")
decoder_name, decoder_ext = os.path.splitext(decoder_fullname)
non_kv_fpath = os.path.join(non_kv_root, decoder_name + "-non-kv" + decoder_ext)
kv_fpath = os.path.join(kv_root, decoder_fullname)

# This code allows for huggingface compatible torch class to use onnx exporter (change just before onnx.export)
old_forward = decoder_with_lm_head_and_bias.forward
def _export_forward(input_ids, encoder_hidden_states, past_key_values):
result = old_forward(input_ids, encoder_hidden_states, past_key_values=past_key_values)
return (result[0], result[1])
decoder_with_lm_head_and_bias.forward = _export_forward

torch.onnx.export(
decoder_with_lm_head_and_bias,
(input_ids[:,-1:], encoder_hidden_states,past_key_values),
# (1) input_ids should be the t token (last one) while past_key_values is 0 to t-1 caches
# (1) input_ids should be the t token (last one) while past_key_values is 0 to t-1 caches
# (2) since past_key_values is kwargs, ideally use "(input_ids[:,-1:], encoder_hidden_states, {"past_key_values": past_key_values})",
# but onnx.export seems to unable to take kwargs properly (although PyTorch 1.11 claims it supports already).
# but onnx.export seems to unable to take kwargs properly (although PyTorch 1.11 claims it supports already).
# Therefore, we need to wrap inside _export_forward() and make past_key_values indeed a kwargs
output_fpath,
kv_fpath,
export_params=True,
opset_version=12,
input_names=inputs.get_names(),
Expand All @@ -312,7 +320,7 @@ def _export_forward(input_ids, encoder_hidden_states, past_key_values):
**inputs.get_torch_dynamic_axis_encoding(),
**outputs.get_torch_dynamic_axis_encoding(),
},
training=False,
training=torch.onnx.TrainingMode.EVAL,
**opt_args
)

@@ -321,9 +329,6 @@ def _export_forward(input_ids, encoder_hidden_states, use_cache):
result = old_forward(input_ids, encoder_hidden_states, use_cache=use_cache)
return (result[0], result[1])
decoder_with_lm_head_and_bias.forward = _export_forward

fpath_root, fpath_ext = os.path.splitext(output_fpath)
output_fpath_non_kv = fpath_root + '-non-kv' + fpath_ext

# inputs are same as non-kv model
# outputs are same as kv model
@@ -334,7 +339,7 @@ def _export_forward(input_ids, encoder_hidden_states, use_cache):
torch.onnx.export(
decoder_with_lm_head_and_bias,
(input_ids[:,-1:], encoder_hidden_states, True),
output_fpath_non_kv,
non_kv_fpath,
export_params=True,
opset_version=12,
input_names=inputs_non_kv.get_names(),
@@ -343,14 +348,19 @@ def _export_forward(input_ids, encoder_hidden_states, use_cache):
**inputs_non_kv.get_torch_dynamic_axis_encoding(),
**outputs.get_torch_dynamic_axis_encoding(),
},
training=False,
training=torch.onnx.TrainingMode.EVAL,
**opt_args
)

if network_metadata.precision.fp16:
G_LOGGER.debug("Clamping FP16 weights for BART")
# move_t5_cast_op(output_fpath, output_fpath) # BART doesn't have T5's Add-Cast-Pow ordering issue
clamp_weights_onnx_to_fp16_bounds(output_fpath, output_fpath)
if network_metadata.other.kv_cache:
# both onnx files need clamp
clamp_weights_onnx_to_fp16_bounds(non_kv_fpath, non_kv_fpath)
clamp_weights_onnx_to_fp16_bounds(kv_fpath, kv_fpath)
else:
clamp_weights_onnx_to_fp16_bounds(output_fpath, output_fpath)

return BARTDecoderONNXFile(output_fpath, network_metadata)

@@ -396,7 +406,7 @@ def torch_to_onnx(
**inputs.get_torch_dynamic_axis_encoding(),
**outputs.get_torch_dynamic_axis_encoding(),
},
training=False,
training=torch.onnx.TrainingMode.EVAL,
**opt_args
)

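The export.py changes above route the KV-cache decoder export through a wrapped forward, because `torch.onnx.export` traces positional inputs and does not take `past_key_values` reliably as a keyword argument, and they write the kv and non-kv ONNX files into separate folders so their weights do not overlap. A condensed sketch of the wrapping trick (the helper name and argument list are illustrative, not the demo's API):

```python
import torch

def export_decoder_with_kv(decoder, input_ids, encoder_hidden_states, past_key_values, kv_fpath):
    old_forward = decoder.forward

    def _export_forward(input_ids, encoder_hidden_states, past_key_values):
        # Bind past_key_values positionally; keep only (logits, present_key_values).
        result = old_forward(input_ids, encoder_hidden_states, past_key_values=past_key_values)
        return (result[0], result[1])

    decoder.forward = _export_forward
    try:
        torch.onnx.export(
            decoder,
            (input_ids[:, -1:], encoder_hidden_states, past_key_values),
            kv_fpath,
            export_params=True,
            opset_version=12,
            training=torch.onnx.TrainingMode.EVAL,
        )
    finally:
        decoder.forward = old_forward  # restore the original HuggingFace forward
```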