In [None]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Accelerating HuggingFace T5 Inference with TensorRT

T5 is an encoder-decoder model that converts all NLP problems into a text-to-text format. More specifically, it does so by encoding different tasks as text directives in the input stream. This enables a single model to be trained in a supervised fashion on a wide variety of NLP tasks such as translation, classification, Q&A and summarization.

This notebook shows 3 easy steps to convert a [HuggingFace PyTorch T5 model](https://huggingface.co/transformers/model_doc/t5.html) to a TensorRT engine for high-performance inference.

1. [Download HuggingFace T5 model](#1)
1. [Convert to ONNX format](#2)
1. [Convert to TensorRT engine](#3)

## Prerequisite

Follow the instructions at https://github.com/NVIDIA/TensorRT to build the TensorRT-OSS docker container required to run this notebook.

Next, we install some extra dependencies.

In [None]:
%%capture
!pip3 install -r ../requirements.txt

**Note:** After this step, you should restart the Jupyter kernel for the change to take effect.

In [None]:
import os
import sys

# Put the parent directory on the import path; later cells import the `T5` and
# `NNDF` packages, which presumably live there. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
ROOT_DIR = os.path.abspath("../")
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

import torch
import tensorrt as trt

# huggingface
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5Config,
)

<a id="1"></a>

## 1. Download HuggingFace T5 model

First, we download the original HuggingFace PyTorch T5 model from the HuggingFace model hub, together with its associated tokenizer.

The T5 variants that are supported by TensorRT 8 are: t5-small (60M), t5-base (220M), t5-large (770M).

In [None]:
T5_VARIANT = 't5-small' # choices: t5-small | t5-base | t5-large

# Fetch the pretrained weights and the matching tokenizer for the chosen variant.
t5_model = T5ForConditionalGeneration.from_pretrained(T5_VARIANT)
tokenizer = T5Tokenizer.from_pretrained(T5_VARIANT)
# Load the variant's published configuration. Constructing T5Config(T5_VARIANT)
# directly would pass the variant name as the first positional argument
# (vocab_size) and leave every other field at its default.
config = T5Config.from_pretrained(T5_VARIANT)

In [None]:
# save model locally
pytorch_model_dir = './models/{}/pytorch'.format(T5_VARIANT)
!mkdir -p $pytorch_model_dir

t5_model.save_pretrained(pytorch_model_dir)
print("Pytorch Model saved to {}".format(pytorch_model_dir))

### Inference with PyTorch model

Next, we will carry out inference with the PyTorch model.

#### Single example inference

In [None]:
# Switch the model to evaluation mode before the forward pass.
t5_model.eval()

# Tokenize the example prompt into PyTorch tensors.
inputs = tokenizer("translate English to German: That is good.", return_tensors="pt")

# One forward pass without gradient tracking; the input ids are reused as labels.
with torch.no_grad():
    outputs = t5_model(labels=inputs["input_ids"], **inputs)

logits = outputs.logits

In [None]:
# Move the model to the first GPU, then generate an output sequence for the
# tokenized prompt (also moved to that GPU) and print it as text.
t5_model.to('cuda:0')
outputs = t5_model.generate(inputs.input_ids.to('cuda:0'))
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

#### Model inference benchmark: encoder and decoder stacks

For benchmarking purposes, we will employ the helper functions `encoder_inference` and `decoder_inference`, which execute the inference repeatedly for the T5 encoder and decoder stacks separately and measure the end-to-end execution time. Let's take note of this execution time for comparison with TensorRT. 
 
`TimingProfile` is a named tuple that specifies the number of experiments and number of times to call the function per iteration (and number of warm-up calls although it is not used here).

In [None]:
# Benchmark helpers and export wrappers from the demo's local packages
from T5.measurements import decoder_inference, encoder_inference, full_inference_greedy
from T5.export import T5EncoderTorchFile, T5DecoderTorchFile
from NNDF.networks import TimingProfile

# Wrap the encoder stack on its own so it can be run and timed separately
t5_torch_encoder = T5EncoderTorchFile.TorchModule(t5_model.encoder)
# The decoder wrapper additionally receives the LM head and the model config
t5_torch_decoder = T5DecoderTorchFile.TorchModule(
    t5_model.decoder, t5_model.lm_head, t5_model.config
)

In [None]:
# Token ids of the example prompt tokenized earlier
input_ids = inputs.input_ids

# Time the PyTorch encoder stack. The helper returns the encoder output and a
# timing value (presumably the median end-to-end time, going by the name).
encoder_last_hidden_state, encoder_e2e_median_time = encoder_inference(
    t5_torch_encoder, input_ids, TimingProfile(iterations=10, number=1, warmup=1, duration=0)
)
# Bare expression so the notebook displays the measured time
encoder_e2e_median_time

In [None]:
# Time the PyTorch decoder stack on the same token ids, feeding it the encoder
# output computed above; the decoder's own output is discarded.
_, decoder_e2e_median_time = decoder_inference(
    t5_torch_decoder, input_ids, encoder_last_hidden_state, TimingProfile(iterations=10, number=1, warmup=1, duration=0)
)
# Bare expression so the notebook displays the measured time
decoder_e2e_median_time

#### Full model inference and benchmark

Next, we will try the T5 model for the task of translation from English to German.

For benchmarking purposes, we will employ a helper function `full_inference_greedy` which executes the inference repeatedly and measures end to end execution time. Let's take note of this execution time for comparison with TensorRT. 

In [None]:
# Per-variant TensorRT settings (e.g. the maximum sequence length) and the T5 metadata type
from T5.T5ModelConfig import T5ModelTRTConfig, T5Metadata

# Time full encoder + decoder greedy generation with the PyTorch modules.
# max_length is taken from the per-variant table so that the TensorRT run
# later in the notebook can use the same limit.
decoder_output_greedy, full_e2e_median_runtime = full_inference_greedy(
    t5_torch_encoder,
    t5_torch_decoder,
    input_ids,
    tokenizer,
    TimingProfile(iterations=10, number=1, warmup=1, duration=0),
    max_length=T5ModelTRTConfig.MAX_SEQUENCE_LENGTH[T5_VARIANT],
)
# Bare expression so the notebook displays the measured runtime
full_e2e_median_runtime

Let us decode the model's output back into text.

In [None]:
# De-tokenize output to raw text: decode the first generated sequence and drop
# special tokens so that only the readable text is printed
print(tokenizer.decode(decoder_output_greedy[0], skip_special_tokens=True))

<a id="2"></a>

## 2. Convert to ONNX

Prior to converting the model to a TensorRT engine, we will first convert the PyTorch model to an intermediate universal format.

ONNX is an open format for machine learning and deep learning models. It allows you to convert deep learning and machine learning models from different frameworks such as TensorFlow, PyTorch, MATLAB, Caffe, and Keras to a single format.

The steps to convert a PyTorch model to TensorRT are as follows:
- Convert the pretrained PyTorch model into ONNX.
- Import the ONNX model into TensorRT.
- Apply optimizations and generate an engine.
- Perform inference on the GPU. 

For the T5 model, we will convert the encoder and decoder separately.

In [None]:
# helpers
from NNDF.networks import NetworkMetadata, Precision

In [None]:
onnx_model_path = './models/{}/ONNX'.format(T5_VARIANT)
!mkdir -p $onnx_model_path

metadata=NetworkMetadata(variant='t5-small', precision=Precision(fp16=True), other=T5Metadata(kv_cache=False))

encoder_onnx_model_fpath = T5_VARIANT + "-encoder.onnx"
decoder_onnx_model_fpath = T5_VARIANT + "-decoder-with-lm-head.onnx"

t5_encoder = T5EncoderTorchFile(t5_model.to('cpu'), metadata)
t5_decoder = T5DecoderTorchFile(t5_model.to('cpu'), metadata)

onnx_t5_encoder = t5_encoder.as_onnx_model(
    os.path.join(onnx_model_path, encoder_onnx_model_fpath), force_overwrite=False
)
onnx_t5_decoder = t5_decoder.as_onnx_model(
    os.path.join(onnx_model_path, decoder_onnx_model_fpath), force_overwrite=False
)

<a id="3"></a>

## 3. Convert to TensorRT

Now we are ready to parse the ONNX encoder and decoder models and convert them to optimized TensorRT engines.

Since the models contain dynamic input shapes, we can specify a valid input range with a TensorRT optimization profile.

In [None]:
from T5.export import T5DecoderONNXFile, T5EncoderONNXFile
from polygraphy.backend.trt import Profile

In [None]:
tensorrt_model_path = './models/{}/tensorrt'.format(T5_VARIANT)
!mkdir -p tensorrt_model_path
# Decoder optimization profiles
batch_size = 1
max_sequence_length = T5ModelTRTConfig.MAX_SEQUENCE_LENGTH[T5_VARIANT]
decoder_profile = Profile()
decoder_profile.add(
    "input_ids",
    min=(batch_size, 1),
    opt=(batch_size, max_sequence_length // 2),
    max=(batch_size, max_sequence_length),
)
decoder_profile.add(
    "encoder_hidden_states",
    min=(batch_size, 1, max_sequence_length),
    opt=(batch_size, max_sequence_length // 2, max_sequence_length),
    max=(batch_size, max_sequence_length, max_sequence_length),
)

# Encoder optimization profiles
encoder_profile = Profile()
encoder_profile.add(
    "input_ids",
    min=(batch_size, 1),
    opt=(batch_size, max_sequence_length // 2),
    max=(batch_size, max_sequence_length),
)


In [None]:
# Resolve the ONNX inputs and the engine output locations up front.
encoder_onnx_fpath = os.path.join(onnx_model_path, encoder_onnx_model_fpath)
decoder_onnx_fpath = os.path.join(onnx_model_path, decoder_onnx_model_fpath)
encoder_engine_fpath = os.path.join(tensorrt_model_path, encoder_onnx_model_fpath) + ".engine"
decoder_engine_fpath = os.path.join(tensorrt_model_path, decoder_onnx_model_fpath) + ".engine"

# Build one TensorRT engine per ONNX graph, each with its own optimization profile.
t5_trt_encoder_engine = T5EncoderONNXFile(encoder_onnx_fpath, metadata).as_trt_engine(
    encoder_engine_fpath, profiles=[encoder_profile]
)

t5_trt_decoder_engine = T5DecoderONNXFile(decoder_onnx_fpath, metadata).as_trt_engine(
    decoder_engine_fpath, profiles=[decoder_profile]
)

### Inference with TensorRT engine

Great, if you have reached this stage, it means we now have an optimized TensorRT engine for the T5 model, ready for us to carry out inference. 

#### Single example inference
The T5 model with TensorRT backend can now be employed in place of the original HuggingFace T5 model.


In [None]:
# Initialize TensorRT engines
from T5.trt import T5TRTEncoder, T5TRTDecoder

tfm_config = T5Config(
    use_cache=True,
    num_layers=T5ModelTRTConfig.NUMBER_OF_LAYERS[T5_VARIANT],
)
    
t5_trt_encoder = T5TRTEncoder(
                t5_trt_encoder_engine, metadata, tfm_config
            )
t5_trt_decoder = T5TRTDecoder(
                t5_trt_decoder_engine, metadata, tfm_config
            )

In [None]:
# Inference on a single sample
encoder_last_hidden_state = t5_trt_encoder(input_ids=input_ids)
outputs = t5_trt_decoder(input_ids, encoder_last_hidden_state)

In [None]:
# Generate sequence for an input
from transformers.generation_stopping_criteria import (
    MaxLengthCriteria,
    StoppingCriteriaList,
)

max_length = 64

decoder_input_ids = torch.full(
    (1, 1), tokenizer.convert_tokens_to_ids(tokenizer.pad_token), dtype=torch.int32
).to("cuda:0")

encoder_last_hidden_state = t5_trt_encoder(input_ids=input_ids)

In [None]:
# Greedy decoding with the TensorRT decoder, stopping once max_length is reached.
stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length)])
outputs = t5_trt_decoder.greedy_search(
    input_ids=decoder_input_ids,
    encoder_hidden_states=encoder_last_hidden_state,
    stopping_criteria=stopping_criteria,
)
# Print the first generated sequence as text without special tokens.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

#### TRT engine inference benchmark: encoder and decoder stacks
First, we will benchmark the encoder and decoder stacks as before.

In [None]:
# Time the TensorRT encoder with the same helper and timing profile that were
# used for the PyTorch encoder, so the two measurements are comparable.
encoder_last_hidden_state, encoder_e2e_median_time = encoder_inference(
    t5_trt_encoder, input_ids, TimingProfile(iterations=10, number=1, warmup=1, duration=0)
)
# Bare expression so the notebook displays the measured time
encoder_e2e_median_time


In [None]:
# Time the TensorRT decoder on the same token ids, feeding it the TensorRT
# encoder output from the previous cell; the decoder's own output is discarded.
_, decoder_e2e_median_time = decoder_inference(
    t5_trt_decoder, input_ids, encoder_last_hidden_state, TimingProfile(iterations=10, number=1, warmup=1, duration=0)
)
# Bare expression so the notebook displays the measured time
decoder_e2e_median_time

### Full model inference benchmark

Next, we will try the full TensorRT T5 engine for the task of translation. As before, note the time difference.

In [None]:
# Time full encoder + decoder greedy generation with the TensorRT engines.
# max_length is keyed on T5_VARIANT, exactly as in the PyTorch baseline and in
# the optimization profiles, so both runs decode under the same length limit.
decoder_output_greedy, full_e2e_median_runtime = full_inference_greedy(
    t5_trt_encoder,
    t5_trt_decoder,
    input_ids,
    tokenizer,
    TimingProfile(iterations=10, number=1, warmup=1, duration=0),
    max_length=T5ModelTRTConfig.MAX_SEQUENCE_LENGTH[T5_VARIANT],
    # NOTE(review): the PyTorch baseline call does not pass use_cuda; confirm
    # in T5/measurements.py what this flag controls.
    use_cuda=False,
)

# Show the translated text, then display the measured runtime.
print(tokenizer.decode(decoder_output_greedy[0], skip_special_tokens=True))
full_e2e_median_runtime


You can now compare the output of the original PyTorch model and the TensorRT engine. Notice the speed difference. On an NVIDIA V100 32GB GPU, this results in up to ~10x performance improvement (from 0.0802s to 0.0082s for the T5-small variant).

## Conclusion and where-to next?

This notebook has walked you through the process of converting a HuggingFace PyTorch T5 model to an optimized TensorRT engine for inference in 3 easy steps. The TensorRT inference engine can be conveniently used as a drop-in replacement for the original HuggingFace T5 model while providing a significant speed-up. 

If you are interested in further details of the conversion process, check out [T5/trt.py](../T5/trt.py)