In [None]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# T5 Playground

This notebook demonstrates the T5 model on two tasks: translation and text summarization.

The TensorRT HuggingFace T5 model is a drop-in replacement for the original PyTorch HuggingFace T5 model.



**Notes**: 
 - For "CPU - PyTorch" and "GPU - PyTorch", the T5 small model from the HuggingFace model repository is used. Inference is carried out with PyTorch in FP32 precision. All models run with batch size 1.
Each measurement performs one warm-up run followed by 5 timed runs; the average run time across the 5 timed runs is reported.
 - Prior to running this notebook, run [t5.ipynb](t5.ipynb) to download the T5 model and generate the TensorRT engine.

In [None]:
import os
import sys
# Put the notebook's parent directory on the import path (presumably where the
# T5 and NNDF packages imported further down live -- verify against the repo layout).
ROOT_DIR = os.path.abspath(os.pardir)
sys.path.append(ROOT_DIR)

import torch 

# huggingface
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5Config,
)

# download HuggingFace model and tokenizer
T5_VARIANT = 't5-small'

t5_model = T5ForConditionalGeneration.from_pretrained(T5_VARIANT)
tokenizer = T5Tokenizer.from_pretrained(T5_VARIANT)
# The first positional parameter of T5Config is vocab_size, so the variant name
# must go through from_pretrained to load that checkpoint's configuration.
config = T5Config.from_pretrained(T5_VARIANT)

# load TensorRT engine
from T5.trt import T5TRTEncoder, T5TRTDecoder, TRTHFRunner
from T5.T5ModelConfig import T5ModelTRTConfig, T5Metadata
from T5.export import T5DecoderTRTEngine, T5EncoderTRTEngine
from NNDF.networks import NetworkMetadata, Precision

from transformers.generation_stopping_criteria import (
    MaxLengthCriteria,
    StoppingCriteriaList,
)

# Transformer configuration handed to the TensorRT encoder/decoder wrappers.
tfm_config = T5Config(
    num_layers=T5ModelTRTConfig.NUMBER_OF_LAYERS[T5_VARIANT],
    use_cache=True,
)

# Network description passed to both the engine objects and their wrappers.
metadata = NetworkMetadata(
    variant=T5_VARIANT,
    precision=Precision(fp16=True),
    other=T5Metadata(kv_cache=False),
)

from os.path import exists
# Locations of the serialized TensorRT engines for the selected variant.
engine_dir = './models/{}/tensorrt'.format(T5_VARIANT)
encoder_path = '{}/{}-encoder.onnx.engine'.format(engine_dir, T5_VARIANT)
decoder_path = '{}/{}-decoder-with-lm-head.onnx.engine'.format(engine_dir, T5_VARIANT)

# Stop here with a clear message when an engine is missing; merely printing
# would let the cell continue and fail later with an unrelated NameError.
for engine_path in (encoder_path, decoder_path):
    if not exists(engine_path):
        raise FileNotFoundError(
            "Error: TensorRT engine not found at {}. Please run t5.ipynb to generate the TensorRT engine first!".format(engine_path)
        )

# Each engine file is wrapped in its matching engine class.
encoder_engine = T5EncoderTRTEngine(encoder_path, metadata)
decoder_engine = T5DecoderTRTEngine(decoder_path, metadata)

t5_trt_encoder = T5TRTEncoder(encoder_engine, metadata, tfm_config)
t5_trt_decoder = T5TRTDecoder(decoder_engine, metadata, tfm_config)

# Initial decoder input used by the TensorRT greedy search: a 1x1 int32 tensor
# holding the tokenizer's pad token id, moved to the first CUDA device.
decoder_input_ids = torch.full(
    size=(1, 1),
    fill_value=tokenizer.convert_tokens_to_ids(tokenizer.pad_token),
    dtype=torch.int32,
)
decoder_input_ids = decoder_input_ids.to("cuda:0")

In [None]:
import ipywidgets as widgets
import numpy as np
import time

# Back-end selector; `generate` branches on the chosen value.
device = widgets.RadioButtons(
    description="Device:",
    options=["CPU - PyTorch", "GPU - PyTorch", "GPU - TensorRT"],
    disabled=False,
)

# Task selector; `generate` derives the prompt prefix from the chosen value.
task = widgets.RadioButtons(
    description="Task:",
    options=["En -> German", "Summarize"],
    disabled=False,
)

# Editable input text. Adjacent string literals are concatenated verbatim, so
# every fragment except the last ends with a space to keep the words separated.
paragraph_text = widgets.Textarea(
    value=(
        'TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps '
        'such as recommenders, speech and image/video on NVIDIA GPUs. It includes parsers to import models, and plugins to support novel ops '
        'and layers before applying optimizations for inference. Today NVIDIA is open-sourcing parsers and plugins in TensorRT so that the deep '
        'learning community can customize and extend these components to take advantage of powerful TensorRT optimizations for your apps.'
    ),
    placeholder='Type something',
    description='Context:',
    disabled=False,
    layout=widgets.Layout(width="auto"),
    rows=5,
)


# Text area that `generate` fills with the decoded model output.
generated_text = widgets.Textarea(
    description="T5 output:",
    value="...",
    placeholder="Context",
    rows=5,
    layout=widgets.Layout(width="auto"),
    disabled=False,
)

# Button whose click handler is registered at the end of this cell.
button = widgets.Button(description="Generate")

# Import `display` before its first use instead of relying on the notebook
# kernel having injected it into the namespace.
from IPython.display import display

# Show the text areas and the two selectors, in on-screen order.
display(paragraph_text)
display(generated_text)
display(device)
display(task)

# Column flex layout that centres the "Generate" button across the full width.
box_layout = widgets.Layout(display='flex',
                flex_flow='column',
                align_items='center',
                width='100%')

# Inference runs per click. `generate` discards the first run's timing as a
# warm-up and averages the remaining N_RUN - 1 runs.
N_RUN = 6
progress_bar = widgets.IntProgress(
    value=0,
    min=0,
    max=N_RUN,
    description='Progress:',
    bar_style='', # 'success', 'info', 'warning', 'danger' or ''
    style={'bar_color': 'green'},
    orientation='horizontal', 
    layout=widgets.Layout(width='100%', height='50px')
)

box = widgets.HBox(children=[button],layout=box_layout)
# Output widget that captures whatever `generate` prints.
output = widgets.Output()
display(box)
display(progress_bar)
display(output)

# Upper bound on the generated sequence length, shared by all three back-ends.
MAX_LENGTH = 256


def _run_tensorrt(input_ids):
    """Run one encoder pass followed by one greedy decode on the TensorRT engines."""
    encoder_last_hidden_state = t5_trt_encoder(input_ids=input_ids)
    return t5_trt_decoder.greedy_search(
        input_ids=decoder_input_ids,
        encoder_hidden_states=encoder_last_hidden_state,
        stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(MAX_LENGTH)]),
    )


def _run_pytorch(input_ids, torch_device):
    """Run one HuggingFace ``generate`` call with the inputs moved to ``torch_device``."""
    return t5_model.generate(input_ids.to(torch_device), max_length=MAX_LENGTH)


def _timed_runs(run_once, n_runs, sync_cuda):
    """Call ``run_once`` ``n_runs`` times, timing each call and advancing the progress bar.

    Args:
        run_once: Zero-argument callable that performs one full inference.
        n_runs: Number of times ``run_once`` is called.
        sync_cuda: When True, wait for outstanding CUDA work before reading the
            clock so that asynchronously queued GPU work is included.

    Returns:
        Tuple ``(outputs, timings)``: the result of the last call (``None`` if
        ``n_runs`` is 0) and the per-call durations in seconds.
    """
    timings = []
    outputs = None
    for _ in range(n_runs):
        # perf_counter is the clock intended for measuring short intervals.
        start_time = time.perf_counter()
        outputs = run_once()
        if sync_cuda:
            torch.cuda.synchronize()
        timings.append(time.perf_counter() - start_time)
        progress_bar.value += 1
    return outputs, timings


def generate(b):
    """Click handler: run the selected task on the selected back-end and report timing.

    Builds the prompt from the task selector and the context text area, runs
    inference ``N_RUN`` times on the back-end chosen in ``device``, writes the
    decoded text of the last run to ``generated_text`` and prints the average
    inference time in milliseconds into ``output``.

    Args:
        b: Argument supplied by ``button.on_click``; unused.
    """
    progress_bar.value = 0
    prefix = 'translate English to German' if task.value == 'En -> German' else 'summarize'
    inputs = tokenizer("{}: {}".format(prefix, paragraph_text.value), return_tensors="pt")
    backend = device.value

    with output:
        # torch_device is None for TensorRT, which receives the tokenizer output as-is.
        if backend == 'GPU - TensorRT':
            torch_device, sync_cuda = None, True
        elif backend == 'CPU - PyTorch':
            torch_device, sync_cuda = 'cpu', False
        elif backend == 'GPU - PyTorch':
            torch_device, sync_cuda = 'cuda:0', True
        else:
            return

        if torch_device is not None:
            # Moving the model does not depend on the run index, so do it once
            # up front rather than inside every timed iteration.
            t5_model.to(torch_device)

        def run_once():
            if torch_device is None:
                return _run_tensorrt(inputs.input_ids)
            return _run_pytorch(inputs.input_ids, torch_device)

        outputs, inference_time_arr = _timed_runs(run_once, N_RUN, sync_cuda)

        # de-tokenize model output to raw text
        text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_text.value = text

        # The first run is a warm-up and is excluded; fall back to all timings
        # when there is only one so the mean is never taken over an empty list.
        timed = inference_time_arr[1:] or inference_time_arr
        print("%s - Average inference time: %.2f (ms)" % (backend, 1000 * np.mean(timed)))


button.on_click(generate)