In [None]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Accelerating HuggingFace GPT-2 Inference with TensorRT

GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. The model was pretrained on the raw texts to guess the next word in sentences. As no human labeling was required, GPT-2 pretraining can use lots of publicly available data with an automatic process to generate inputs and labels from those data.

This notebook shows 3 easy steps to convert a [HuggingFace PyTorch GPT-2 model](https://huggingface.co/gpt2) to a TensorRT engine for high-performance inference, followed by two advanced topics (KV cache and beam search).

1. [Download HuggingFace GPT-2 model ](#1)
1. [Convert to ONNX format](#2)
1. [Convert to TensorRT engine](#3)
1. [Advanced Topic: KV Cache](#4)
1. [Advanced Topic: Beam Search](#5)

## Prerequisite

Follow the instructions at https://github.com/NVIDIA/TensorRT to build the TensorRT-OSS docker container required to run this notebook.

Next, we install some extra dependencies.

In [None]:
%%capture
!pip3 install -r ../requirements.txt

**Note:** After this step, you should restart the Jupyter kernel for the change to take effect.

In [None]:
import os
import sys
ROOT_DIR = os.path.abspath("../")
sys.path.append(ROOT_DIR)

import torch 

# huggingface
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    GPT2Config,
)

<a id="1"></a>

## 1. Download HuggingFace GPT-2 model 

First, we download the original HuggingFace PyTorch GPT-2 model from the HuggingFace model hub, together with its associated tokenizer.

The GPT-2 variants supported by TensorRT 8 are: gpt2 (117M), gpt2-large (774M).

In [None]:
# Download the model and its tokenizer
GPT2_VARIANT = 'gpt2' # choices: gpt2 | gpt2-medium | gpt2-large | gpt2-xl
# Load the configuration that belongs to the chosen variant. The first positional
# parameter of GPT2Config is vocab_size, so GPT2Config(GPT2_VARIANT) would store the
# variant name as the vocabulary size instead of loading the variant's settings.
config = GPT2Config.from_pretrained(GPT2_VARIANT)

model = GPT2LMHeadModel.from_pretrained(GPT2_VARIANT, force_download=False)
tokenizer = GPT2Tokenizer.from_pretrained(GPT2_VARIANT)

In [None]:
# Save the model locally
pytorch_model_dir = './models/{}/pytorch'.format(GPT2_VARIANT)
# Create the target directory from Python instead of interpolating the path into a
# `mkdir -p` shell command, so the cell does not depend on a POSIX shell.
os.makedirs(pytorch_model_dir, exist_ok=True)

model.save_pretrained(pytorch_model_dir)
print("Pytorch Model saved to {}".format(pytorch_model_dir))

### Inference with PyTorch model

In [None]:
# Tokenize a single prompt; `inputs` keeps the tokenizer output as returned,
# while `input_ids` is a copy of the token ids placed on the first GPU.
input_str = "Hello, my dog is "
inputs = tokenizer(input_str, return_tensors="pt")
input_ids = inputs["input_ids"].to("cuda:0")

In [None]:
input_ids, input_ids.shape

#### Single example inference

In [None]:
# Put the model in evaluation mode, then run one forward pass with autograd disabled.
model.eval()
with torch.no_grad():
    outputs = model(
        **inputs,
        labels=inputs["input_ids"],
        use_cache=False,
    )

# Only the logits are inspected in the following cell.
logits = outputs.logits

In [None]:
logits, logits.shape

For benchmarking purposes, we will employ a helper function `gpt2_inference` which executes the inference on a single batch repeatedly and measures end to end execution time. Let's take note of this execution time for later comparison with TensorRT. 
 
`TimingProfile` is a named tuple that specifies the number of experiments and number of times to call the function per iteration (and number of warm-up calls although it is not used here).

In [None]:
from GPT2.measurements import gpt2_inference
from NNDF.networks import TimingProfile

# Benchmarking PyTorch performance on single batch.
# `model` is the HuggingFace PyTorch model moved to the GPU; the TensorRT engine is
# benchmarked with the same helper later in the notebook.
_, decoder_e2e_median_time = gpt2_inference(
            model.to('cuda:0'), input_ids, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50)
        )
decoder_e2e_median_time

#### Open-end text generation
Next, we will employ the PyTorch model for the open-end text generation task, which GPT-2 is particularly good at. 

In [None]:
from GPT2.GPT2ModelConfig import GPT2ModelTRTConfig
# MAX_LENGTH is the maximum sequence length GPT-2 should be used with for text generation.
# It corresponds to max_length in task_specific_params for text-generation, which is 50 for each model config.
# If the length exceeds max_length, the output becomes meaningless for the specific task.
max_length = GPT2ModelTRTConfig.MAX_LENGTH[GPT2_VARIANT]

In [None]:
# Generate a continuation of the prompt with the PyTorch model on the GPU, without kv cache.
sample_output = model.to('cuda:0').generate(
    input_ids,
    max_length=max_length,
    use_cache=False,
)

# Turn the first generated sequence back into raw text.
tokenizer.decode(sample_output[0], skip_special_tokens=True)

For benchmarking purposes, we will employ a helper function `full_inference` which executes the inference repeatedly and measures end to end execution time. Let's take note of this execution time for later comparison with TensorRT. 

TimingProfile is a named tuple that specifies the number of experiments and number of times to call the function per iteration (and number of warm-up calls although it is not used here).

In [None]:
from GPT2.measurements import full_inference

# Get the complete decoder inference result and its timing profile.
# The model is moved to the GPU, so pass `input_ids` (the CUDA copy of the prompt, as
# used by every other generate/benchmark call in this notebook) rather than the
# tokenizer output `inputs.input_ids`, which was never moved off the CPU.
_, full_e2e_median_runtime = full_inference(
    model.to('cuda:0'), input_ids, tokenizer, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    max_length=max_length
)
full_e2e_median_runtime

<a id="2"></a>

## 2. Convert to ONNX format

Prior to converting the model to a TensorRT engine, we will first convert the PyTorch model to an intermediate universal format: ONNX.

ONNX is an open format for machine learning and deep learning models. It allows you to convert deep learning and machine learning models from different frameworks such as TensorFlow, PyTorch, MATLAB, Caffe, and Keras to a single format.

At a high level, the steps to convert a PyTorch model to TensorRT are as follows:
- Convert the pretrained GPT-2 PyTorch model into ONNX.
- Import the ONNX model into TensorRT.
- Apply optimizations and generate an engine.
- Perform inference on the GPU with the TensorRT engine. 

In [None]:
from NNDF.networks import NetworkMetadata, Precision
from GPT2.export import GPT2TorchFile
from GPT2.GPT2ModelConfig import GPT2Metadata

In [None]:
# kv_cache is disabled because it exports extra input/output to the model
metadata = NetworkMetadata(
    variant=GPT2_VARIANT,
    precision=Precision(fp16=False),
    other=GPT2Metadata(kv_cache=False),
)
# The export wrapper receives the model after it has been moved back to the CPU.
gpt2 = GPT2TorchFile(model.to('cpu'), metadata)

In [None]:
onnx_path = './models/{}/ONNX/{}.onnx'.format(GPT2_VARIANT, GPT2_VARIANT)
# Create the output directory from Python instead of interpolating GPT2_VARIANT into a
# `mkdir -p` shell command, so the cell does not depend on a POSIX shell.
os.makedirs(os.path.dirname(onnx_path), exist_ok=True)

# force_overwrite=False is passed so that an existing export is not overwritten.
gpt2.as_onnx_model(onnx_path, force_overwrite=False)

Let's take a look at the ONNX file and investigate its inputs and outputs. You should see "input_ids" as the input and "logits" as the output.

In [None]:
import onnx

In [None]:
onnx_model = onnx.load(onnx_path)

In [None]:
onnx_model.graph.input

In [None]:
onnx_model.graph.output

<a id="3"></a>

## 3. Convert to TensorRT engine

Now we are ready to parse the ONNX model and convert it to an optimized TensorRT model.

Since the model contains dynamic input shapes, we can specify a valid input range with a TensorRT optimization profile.

Note: As TensorRT carries out many optimizations, this conversion process might take a while for the larger models.

In [None]:
from polygraphy.backend.trt import Profile
from tensorrt import PreviewFeature
from GPT2.export import GPT2ONNXFile, GPT2TRTEngine

In [None]:
trt_engine_folder = './models/{}/trt-engine'.format(GPT2_VARIANT)
# Create the engine directory from Python instead of a `mkdir -p` shell command.
os.makedirs(trt_engine_folder, exist_ok=True)

# Create optimization profile for dynamic shape input. Can modify batch_size / max_sequence_length to build engines for different shapes
batch_size = 1
preview_dynamic_shapes = True # preview_dynamic_shapes optimizes the TRT engine building time
# We can either use input length as the optimal length, or use max_length // 2.
# In T5 or BART, input_length is better, but in GPT-2, max_length // 2 is better because we need to generate max_length number of tokens

use_input_length = False
# `input_ids` is the tokenized prompt; its second dimension is the prompt length.
opt_length = input_ids.shape[1] if use_input_length else max_length // 2
# Create different engine tags for different configurations
engine_tag = f"bs{batch_size}"
preview_features = []
if preview_dynamic_shapes:
    preview_features = [PreviewFeature.FASTER_DYNAMIC_SHAPES_0805]
    engine_tag += "-previewFasterDynamicShapes"

# A single profile that gives the min / opt / max shape of the only input, "input_ids".
profiles = [Profile().add(
    "input_ids",
    min=(batch_size, 1),
    opt=(batch_size, opt_length), # Optimized based on the inputs.
    max=(batch_size, max_length),
)]

In [None]:
profiles

In [None]:
# The engine file name encodes the variant and the build configuration tag.
engine_path = os.path.join(trt_engine_folder, f"{GPT2_VARIANT}-{engine_tag}.engine")
if os.path.exists(engine_path):
    # Reuse an engine that already exists at this path.
    gpt2_engine = GPT2TRTEngine(engine_path, metadata)
else:
    # No engine file yet: build one from the ONNX model.
    gpt2_engine = GPT2ONNXFile(onnx_path, metadata).as_trt_engine(
        output_fpath=engine_path,
        profiles=profiles,
        preview_features=preview_features,
    )

### Inference with TensorRT engine

Great, if you have reached this stage, it means we now have an optimized TensorRT engine for the GPT-2 model, ready for us to carry out inference. 

The GPT-2 model with TensorRT backend can now be employed in place of the original HuggingFace GPT-2 model.

#### Single batch inference


In [None]:
from GPT2.trt import GPT2TRTDecoder
# Load the variant's configuration with kv cache disabled. The first positional
# parameter of GPT2Config is vocab_size, so GPT2Config(GPT2_VARIANT, ...) would store
# the variant name as the vocabulary size instead of loading the variant's settings.
config = GPT2Config.from_pretrained(GPT2_VARIANT, use_cache=False)

In [None]:
gpt2_trt = GPT2TRTDecoder(gpt2_engine, metadata, config)

In [None]:
# Benchmark the TensorRT decoder on a single batch, using the same timing settings
# as the PyTorch run.
_, decoder_e2e_median_time = gpt2_inference(
    gpt2_trt,
    input_ids,
    TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
)
decoder_e2e_median_time

In [None]:
# One forward pass through the TensorRT decoder; only the logits are kept.
with torch.no_grad():
    outputs = gpt2_trt(
        input_ids=input_ids,
    )
logits = outputs.logits

In [None]:
logits, logits.shape

#### Open-end text generation
Let's run the same generation task again. Since GPT-2 is an open-ended model, a small perturbation in the model might lead to a very different result. Since we made some format changes and restricted the inputs/outputs while exporting the model, you might see a different result compared to the raw HuggingFace model.

In [None]:
# Generate a continuation of the prompt with the TensorRT decoder.
sample_output = gpt2_trt.generate(
    input_ids,
    max_length=max_length,
)

# Turn the first generated sequence back into raw text.
tokenizer.decode(sample_output[0], skip_special_tokens=True)

In [None]:
# Run the complete generation loop with the TensorRT decoder and collect its timing.
_, full_e2e_median_runtime = full_inference(
    gpt2_trt,
    input_ids,
    tokenizer,
    TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    max_length=max_length,
)
full_e2e_median_runtime

You can now compare the output of the original PyTorch model and the TensorRT engine. Notice the speed difference. On an NVIDIA V100 32GB GPU, this results in about 5x performance improvement for the GPT-2 model (from an average of 0.704s to 0.134s).

Now you know how to convert a model to ONNX, build a TRT engine from it and run optimized inference. As you may recall, using kv cache and beam search are two important ways to improve the performance of decoder models. We have recently added support for both to our HuggingFace demo.

<a id="4"></a>

## 4. Advanced Topic: KV Cache

As you have seen above, we put `use_cache = False` in some code blocks. This is because in the simplified model, we only take `input_ids` as input and `logits` as output. `input_ids` grows as the sequence gets longer. In practice, we can cache the self-attention keys and values of each layer and reuse them in later computations. This allows us to feed only the last generated token as `input_ids`. This is a trade-off between space and time. When the model is small or the sequence is short, the D2D data copy time usually outweighs the performance improvement of the model. However, performance improvements have been found in larger models with larger sequence lengths like 512.

In [None]:
use_cache = True
# Load the variant's configuration with kv cache enabled. The first positional
# parameter of GPT2Config is vocab_size, so GPT2Config(GPT2_VARIANT, ...) would store
# the variant name as the vocabulary size instead of loading the variant's settings.
kv_config = GPT2Config.from_pretrained(GPT2_VARIANT, use_cache=use_cache)

#### Raw HuggingFace

The model that we download with `GPT2LMHeadModel.from_pretrained` is dynamic in its inputs. It supports both kv and non-kv configurations; changing `use_cache` switches between them. You can see that changing this configuration changes the output.

In [None]:
# Get the complete decoder inference result and its timing profile, now with kv cache.
# The model is moved to the GPU, so pass `input_ids` (the CUDA copy of the prompt, as
# used by the generate call in the next cell) rather than the tokenizer output
# `inputs.input_ids`, which was never moved off the CPU.
_, full_e2e_median_runtime = full_inference(
    model.to('cuda:0'), input_ids, tokenizer, TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    max_length=max_length, use_cache = use_cache
)
full_e2e_median_runtime

In [None]:
# Generate with the PyTorch model again, this time with the kv-cache setting chosen above.
sample_output = model.to('cuda:0').generate(
    input_ids,
    max_length=max_length,
    use_cache=use_cache,
)

# Turn the first generated sequence back into raw text.
tokenizer.decode(sample_output[0], skip_special_tokens=True)

#### TensorRT

For the 1st decoding step, we take `input_ids` and generate both `logits` and the kv cache. In other steps, we take the new `input_ids` with `past` kv-cache and the outputs are `logits` and the updated `present` kv-cache. Taking dynamic number of inputs for trt is not currently supported in our demo, so we need to output 2 onnx files and build 2 engines.

In [None]:
# Same metadata as for the plain export, except that kv_cache follows `use_cache`.
kv_metadata = NetworkMetadata(
    variant=GPT2_VARIANT,
    precision=Precision(fp16=False),
    other=GPT2Metadata(kv_cache=use_cache),
)
# The export wrapper receives the model after it has been moved back to the CPU.
kv_gpt2 = GPT2TorchFile(model.to('cpu'), kv_metadata)

In [None]:
# The kv-cache export lives next to the plain ONNX model, with a "-kv_cache" suffix.
kv_onnx_path = f'./models/{GPT2_VARIANT}/ONNX/{GPT2_VARIANT}-kv_cache.onnx'
kv_gpt2.as_onnx_model(
    kv_onnx_path,
    force_overwrite=False,
)

In [None]:
# Helper that derives the path of the non-kv counterpart of a kv-cache file.
def get_non_kv_path(kv_path):
    """Return `kv_path` with '-non-kv' inserted in front of its file extension.

    Example: 'model.onnx' -> 'model-non-kv.onnx'. A path without an extension
    simply gets '-non-kv' appended.
    """
    # splitext() yields (root, ext); joining the pair with the marker puts it between them.
    return '-non-kv'.join(os.path.splitext(kv_path))

In [None]:
kv_onnx_non_kv_path = get_non_kv_path(kv_onnx_path)

In [None]:
# Load both ONNX graphs so that their inputs/outputs can be compared below.
# NOTE(review): nothing in this notebook writes kv_onnx_non_kv_path explicitly; presumably
# kv_gpt2.as_onnx_model() emits the non-kv model next to the kv one — TODO confirm.
kv_onnx_model = onnx.load(kv_onnx_path)
kv_onnx_non_kv_model = onnx.load(kv_onnx_non_kv_path)

We can see that the kv model has #inputs = #outputs = num_layers * 2 + 1, while the non-kv model has #inputs = 1 and the same number of outputs as the kv model.

In [None]:
len(kv_onnx_model.graph.input), len(kv_onnx_model.graph.output)

In [None]:
len(kv_onnx_non_kv_model.graph.input), len(kv_onnx_non_kv_model.graph.output)

The next blocks will set up the profile and build the engine. The only difference is that we now have the profile for kv cache, and have 2 engines.

In [None]:
import copy

In [None]:
batch_size = 1
preview_dynamic_shapes = True

# Create different engine tags for different configurations
engine_tag = "bs{}".format(batch_size)

preview_features = []
if preview_dynamic_shapes:
    preview_features = [PreviewFeature.FASTER_DYNAMIC_SHAPES_0805]
    engine_tag += "-previewFasterDynamicShapes"

# Optimal sequence length: either the prompt length or half of max_length (the default here).
use_input_length = False
# `input_ids` is the tokenized prompt; its second dimension is the prompt length.
opt_length = input_ids.shape[1] if use_input_length else max_length // 2

# Setup profiles: start with the "input_ids" entry, which both engines share.
kv_profiles = Profile().add(
    "input_ids",
    min=(batch_size, 1),
    opt=(batch_size, opt_length),
    max=(batch_size, max_length),
)

# still need non-kv engine in kv mode. Copy the profile BEFORE the kv-cache inputs
# are added below, so that the copy only describes "input_ids".
kv_profiles_non_kv = copy.deepcopy(kv_profiles)
kv_profiles_non_kv_list = [kv_profiles_non_kv]

num_heads = GPT2ModelTRTConfig.NUMBER_OF_HEADS[GPT2_VARIANT]
embedding_size_per_head = GPT2ModelTRTConfig.EMBEDDING_SIZE[GPT2_VARIANT] // num_heads
num_decoder_layers = GPT2ModelTRTConfig.NUMBER_OF_LAYERS[GPT2_VARIANT]

# Every layer uses the same shape range for its cached keys and values, so the
# dictionary is built once instead of once per loop iteration.
self_attention_profile = {
    "min": (batch_size, num_heads, 1, embedding_size_per_head),
    "opt": (batch_size, num_heads, opt_length, embedding_size_per_head),
    "max": (batch_size, num_heads, max_length, embedding_size_per_head),
}
for i in range(num_decoder_layers):
    # add() registers the entry on kv_profiles itself (kv_profiles is what gets used
    # below), so the return value does not need to be stored.
    kv_profiles.add(f"past_key_values.{i}.decoder.key", **self_attention_profile)
    kv_profiles.add(f"past_key_values.{i}.decoder.value", **self_attention_profile)
kv_profiles_list = [kv_profiles]

In [None]:
kv_profiles_non_kv_list, kv_profiles_list

In [None]:
# Paths of the two engines used in kv mode: the kv engine and its non-kv counterpart.
kv_engine_path = os.path.join(trt_engine_folder, f"{GPT2_VARIANT}-kv_cache_{engine_tag}.engine")
kv_engine_non_kv_path = get_non_kv_path(kv_engine_path)

# Engine with the kv-cache inputs/outputs: reuse the file if present, otherwise build it.
if os.path.exists(kv_engine_path):
    kv_gpt2_engine = GPT2TRTEngine(kv_engine_path, kv_metadata)
else:
    kv_gpt2_engine = GPT2ONNXFile(kv_onnx_path, kv_metadata).as_trt_engine(
        kv_engine_path,
        profiles=kv_profiles_list,
        preview_features=preview_features,
    )

# Starter engine (no kv input): same reuse-or-build logic.
if os.path.exists(kv_engine_non_kv_path):
    kv_gpt2_non_kv_engine = GPT2TRTEngine(kv_engine_non_kv_path, kv_metadata)
else:
    kv_gpt2_non_kv_engine = GPT2ONNXFile(kv_onnx_non_kv_path, kv_metadata).as_trt_engine(
        kv_engine_non_kv_path,
        profiles=kv_profiles_non_kv_list,
        preview_features=preview_features,
    )

# The decoder is created from the kv engine; the starter engine is attached afterwards.
kv_gpt2_trt = GPT2TRTDecoder(
    kv_gpt2_engine,
    kv_metadata,
    kv_config,
    batch_size=batch_size,
)
kv_gpt2_trt._set_non_kv_engine_for_kv_mode(kv_gpt2_non_kv_engine)

Since we have 2 engines, benchmarking single-run runtime does not make sense. We instead use `full_inference` to measure the time for the entire inference cycle.

In [None]:
# Run the complete generation loop with the kv-cache TensorRT decoder and collect its timing.
_, full_e2e_median_runtime = full_inference(
    kv_gpt2_trt,
    input_ids,
    tokenizer,
    TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    max_length=max_length,
    use_cache=use_cache,
)
full_e2e_median_runtime

In [None]:
# NOTE(review): reset() is called before generating again after the benchmark run;
# presumably it clears state kept by the decoder — confirm against GPT2TRTDecoder.
kv_gpt2_trt.reset()
# Generate with the kv-cache decoder and decode the first sequence back into raw text.
kv_sample_output = kv_gpt2_trt.generate(input_ids, max_length=max_length)
tokenizer.decode(kv_sample_output[0], skip_special_tokens=True)

In this short example, kv cache does not improve the performance, and may even be slightly slower than non-kv-cache mode. However, with larger input sequences, kv cache will perform better.

<a id="5"></a>

## 5. Advanced Topic: Beam Search

Beam search is a way to increase the model quality. It looks for the top `num_beams` possible words and picks the one that best fits the current position. Similarly, the original HuggingFace PyTorch model supports beam search natively, while we need to build a separate TRT engine for each value of `num_beams`.

In [None]:
# Load the variant's configuration with kv cache disabled. The first positional
# parameter of GPT2Config is vocab_size, so GPT2Config(GPT2_VARIANT, ...) would store
# the variant name as the vocabulary size instead of loading the variant's settings.
beam_config = GPT2Config.from_pretrained(GPT2_VARIANT, use_cache=False)
beam_metadata = NetworkMetadata(variant=GPT2_VARIANT, precision=Precision(fp16=False), other=GPT2Metadata(kv_cache=False))
# Number of beams used by both the HuggingFace and the TensorRT runs below.
num_beams = 3

#### HuggingFace

In [None]:
# Run the complete generation loop with the PyTorch model using beam search and collect its timing.
_, full_e2e_median_runtime = full_inference(
    model.to('cuda:0'),
    input_ids,
    tokenizer,
    TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    max_length=max_length,
    num_beams=num_beams,
)
full_e2e_median_runtime

In [None]:
# Generate with the PyTorch model using `num_beams` beams.
sample_output = model.to('cuda:0').generate(
    input_ids,
    max_length=max_length,
    num_beams=num_beams,
)

# Turn the first generated sequence back into raw text.
tokenizer.decode(sample_output[0], skip_special_tokens=True)

You could see that the output is very different from the original one. If you change `num_beams`, the result will also change significantly.

#### TensorRT
It uses the same ONNX file as the original configuration, but the engine is set up differently, because the first dimension of the inputs is expanded by `num_beams`.

In [None]:
# Create optimization profile for dynamic shape input. Can modify batch_size / max_sequence_length to build engines for different shapes
batch_size = 1
preview_dynamic_shapes = True # preview_dynamic_shapes optimizes the TRT engine building time
# We can either use input length as the optimal length, or use max_length // 2.
# In T5 or BART, input_length is better, but in GPT-2, max_length // 2 is better because we need to generate max_length number of tokens

use_input_length = False
# `input_ids` is the tokenized prompt; its second dimension is the prompt length.
opt_length = input_ids.shape[1] if use_input_length else max_length // 2
# Create different engine tags for different configurations
engine_tag = f"bs{batch_size}-beam{num_beams}"

preview_features = []
if preview_dynamic_shapes:
    preview_features = [PreviewFeature.FASTER_DYNAMIC_SHAPES_0805]
    engine_tag += "-previewFasterDynamicShapes"

# With beam search the first input dimension is batch_size * num_beams.
beam_profiles = [Profile().add(
    "input_ids",
    min=(batch_size * num_beams, 1),
    opt=(batch_size * num_beams, opt_length), # Optimized based on the inputs.
    max=(batch_size * num_beams, max_length),
)]

In [None]:
beam_profiles

In [None]:
# The engine file name encodes the variant and the beam-specific configuration tag.
beam_engine_path = os.path.join(trt_engine_folder, f"{GPT2_VARIANT}-{engine_tag}.engine")
if os.path.exists(beam_engine_path):
    # Reuse an engine that already exists at this path.
    beam_gpt2_engine = GPT2TRTEngine(beam_engine_path, beam_metadata)
else:
    # No engine file yet: build one from the same ONNX model as the plain configuration.
    beam_gpt2_engine = GPT2ONNXFile(onnx_path, beam_metadata).as_trt_engine(
        output_fpath=beam_engine_path,
        profiles=beam_profiles,
        preview_features=preview_features,
    )

In [None]:
beam_gpt2_trt = GPT2TRTDecoder(beam_gpt2_engine, beam_metadata, beam_config, num_beams = num_beams)

In [None]:
# Run the complete generation loop with the beam-search TensorRT decoder and collect its timing.
_, full_e2e_median_runtime = full_inference(
    beam_gpt2_trt,
    input_ids,
    tokenizer,
    TimingProfile(iterations=10, number=1, warmup=1, duration=0, percentile=50),
    max_length=max_length,
    num_beams=num_beams,
)
full_e2e_median_runtime

In [None]:
# Generate with the beam-search TensorRT decoder, then decode the first sequence into raw text.
beam_sample_output = beam_gpt2_trt.generate(
    input_ids,
    max_length=max_length,
    num_beams=num_beams,
)
tokenizer.decode(beam_sample_output[0], skip_special_tokens=True)

We could see that because of larger batch size, beam search will take slightly longer, but for most sequences, it will generate more meaningful outputs.

## Conclusion and where-to next?

This notebook has walked you through the process of converting a HuggingFace PyTorch GPT-2 model to an optimized TensorRT engine for inference in 3 easy steps. The TensorRT inference engine can be conveniently used as a drop-in replacement for the original HuggingFace GPT-2 model while providing a significant speed-up.

If you are interested in further details of the conversion process, check out [GPT2/trt.py](../GPT2/trt.py)