In [None]:
# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

##### <img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Playground

This notebook demonstrates HuggingFace models on a variety of NLP tasks. You can select a model from a list of pretrained HuggingFace models, convert it into TRT engines to speed up decoding, and run any custom prompt. Even if a model is not in the list, it is quite possible that TRT can run it! Let's try.

In [None]:
import os
import sys
ROOT_DIR = os.path.abspath("../")
sys.path.append(ROOT_DIR)

from BART.frameworks import BARTHF
from BART.trt import BARTTRT
from BART.BARTModelConfig import BARTModelTRTConfig

from T5.frameworks import T5HF
from T5.trt import T5TRT
from T5.T5ModelConfig import T5ModelTRTConfig


from GPT2.frameworks import GPT2HF
from GPT2.trt import GPT2TRT
from GPT2.GPT2ModelConfig import GPT2ModelTRTConfig


from BLOOM.frameworks import BLOOMHF
from BLOOM.trt import BLOOMTRT
from BLOOM.BLOOMModelConfig import BLOOMModelTRTConfig


from OPT.frameworks import OPTHF
from OPT.trt import OPTTRT
from OPT.OPTModelConfig import OPTModelTRTConfig


from Seq2Seq.frameworks import Seq2SeqHF
from Seq2Seq.trt import Seq2SeqTRT

import ipywidgets as widgets
# Shared style and layout objects passed to the widgets built in the cells below.
widget_style = {'description_width': 'initial'}
widget_layout = widgets.Layout(width='auto')

In [None]:
# Variant collections exposed by each model family's TRT config class.
BART_VARIANTS = BARTModelTRTConfig.TARGET_MODELS
T5_VARIANTS = T5ModelTRTConfig.TARGET_MODELS
GPT2_VARIANTS = GPT2ModelTRTConfig.TARGET_MODELS
BLOOM_VARIANTS = BLOOMModelTRTConfig.TARGET_MODELS
OPT_VARIANTS = OPTModelTRTConfig.TARGET_MODELS

# Model family name -> the variants selectable for that family.
VARIANTS = dict(
    BART=BART_VARIANTS,
    T5=T5_VARIANTS,
    GPT2=GPT2_VARIANTS,
    BLOOM=BLOOM_VARIANTS,
    OPT=OPT_VARIANTS,
)

# Entries offered by the model picker: every dedicated family, followed by the
# catch-all entry for models that have no dedicated family.
SUPPORT_MODELS = [*VARIANTS, "Other Seq2Seq"]


## Select your model!

You may want to:
1. Select the model variant from the list of models supported by our demo. You can also try other Seq2Seq models from [HuggingFace](https://huggingface.co/): just type the model name into the text box and see whether you get a pleasant surprise!
2. Select the model configuration. If you have run our other notebooks or the command-line demo, you will recognize the following options:
- `use_cache`: kv cache to speed decoding
- `num_beams`: beam search for better results
- `fp16`: Using float16 to speed decoding
- `batch_size`: batch size for the inputs.
- `cpu`: Only affects PyTorch model. If `cpu` is specified, PyTorch model will run on CPU instead.

In [None]:
# Top-level picker for the model family.
model_selection_widget = widgets.RadioButtons(
    options=SUPPORT_MODELS,
    description='Please select a model and variant:',
    disabled=False,
    style=widget_style,
    layout=widget_layout
)

# One variant picker per model family, generated from VARIANTS so that a family
# only has to be registered in one place. A picker starts enabled only when its
# family is the one the model picker currently has selected, instead of
# hard-coding which family happens to be listed first.
variant_selection_widgets = {
    family: widgets.RadioButtons(
        options=variants,
        description=f'{family}:',
        disabled=(family != model_selection_widget.value),
        style=widget_style,
        layout=widget_layout
    )
    for family, variants in VARIANTS.items()
}

def display_model_selection(change):
    """Enable the variant picker that matches the newly selected model family.

    Observer callback for the model picker's ``value``.

    Args:
        change: Change dictionary delivered by the widget; its ``"new"`` and
            ``"old"`` entries are the newly and previously selected entries.
    """
    current = change["new"]
    if current == "Other Seq2Seq":
        # No predefined variants apply to a custom model: grey out every picker.
        for picker in variant_selection_widgets.values():
            picker.disabled = True
        return

    previous = change["old"]
    if previous != "Other Seq2Seq":
        # The previous family has a picker of its own; switch it off.
        variant_selection_widgets[previous].disabled = True
    variant_selection_widgets[current].disabled = False

# Re-evaluate which variant picker is enabled whenever the model family changes.
model_selection_widget.observe(display_model_selection, names='value')

# All variant pickers in a single row, in dictionary order.
variant_hbox = widgets.HBox(list(variant_selection_widgets.values()))

# Keyword arguments common to every option widget created in this cell.
_option_widget_kwargs = dict(disabled=False, style=widget_style, layout=widget_layout)

# Free-text model name for variants that are not offered by the pickers above.
model_variant_text = widgets.Text(
    value='',
    placeholder='Default = None',
    description='Not in the list?',
    **_option_widget_kwargs
)

# Boolean and integer options read later when the models are constructed.
fp16_widget = widgets.Checkbox(
    value=True, description='fp16', indent=False, **_option_widget_kwargs
)
cache_widget = widgets.Checkbox(
    value=True, description='Use KV Cache', indent=False, **_option_widget_kwargs
)
batch_size_widget = widgets.BoundedIntText(
    value=1, min=1, max=100000, step=1, description='Batch size', **_option_widget_kwargs
)
num_beam_widget = widgets.BoundedIntText(
    value=3, min=1, max=100000, step=1, description='Number of beams', **_option_widget_kwargs
)
cpu_widget = widgets.Checkbox(
    value=False, description='Use CPU for PyTorch', indent=False, **_option_widget_kwargs
)

# Stack the whole selection form vertically and render it.
widgets_all = widgets.VBox([
    model_selection_widget,
    variant_hbox,
    model_variant_text,
    fp16_widget,
    cache_widget,
    batch_size_widget,
    num_beam_widget,
    cpu_widget,
])

display(widgets_all)

## Build TRT Engine

As in the other [notebooks](.), you need to call the APIs to build the TRT engines. All the PyTorch, ONNX, and TRT models are stored in the [models](./models) folder for you to inspect.

In [None]:
def select_model(model, args):
    """Instantiate the PyTorch and TensorRT wrappers for a model family.

    Args:
        model: Model family name (e.g. "BART", "T5"). Any name without a
            dedicated wrapper pair falls back to the generic Seq2Seq wrappers.
        args: Dictionary of keyword arguments forwarded unchanged to both
            wrapper constructors.

    Returns:
        A ``(torch_model, trt_model)`` tuple.
    """
    wrappers_by_family = {
        "BART": (BARTHF, BARTTRT),
        "T5": (T5HF, T5TRT),
        "GPT2": (GPT2HF, GPT2TRT),
        "BLOOM": (BLOOMHF, BLOOMTRT),
        "OPT": (OPTHF, OPTTRT),
    }
    hf_cls, trt_cls = wrappers_by_family.get(model, (Seq2SeqHF, Seq2SeqTRT))
    # Build the PyTorch wrapper first, then the TensorRT one.
    return hf_cls(**args), trt_cls(**args)

In [None]:
# Resolve the model family and variant from the selection widgets.
model = model_selection_widget.value
# Surrounding whitespace is never part of a model name; dropping it also makes
# a whitespace-only entry count as "nothing typed".
customized_variant = model_variant_text.value.strip()
if model == "Other Seq2Seq":
    # There is no variant picker for this choice, so the free-text name is
    # mandatory. Raise explicitly rather than relying on `assert`, which is
    # skipped when Python runs with optimizations enabled.
    if not customized_variant:
        raise ValueError("Please specify a model variant for the demo")
    variant = customized_variant
else:
    selected_variant = variant_selection_widgets[model].value
    # A typed-in name takes precedence over the radio-button choice.
    variant = customized_variant or selected_variant

# Keyword arguments handed to both the PyTorch and the TensorRT wrapper.
# The non-widget entries are passed through as fixed values; presumably they
# are logging/timing settings -- confirm against the wrapper classes.
args = {
    "variant": variant,
    "use_cache": cache_widget.value,
    "fp16": fp16_widget.value,
    "num_beams": num_beam_widget.value,
    "batch_size": batch_size_widget.value,
    "working_dir": "models",
    "info": True,
    "iterations": 10,
    "number": 1,
    "warmup": 3,
    "duration": 0,
    "percentile": 50,
    "cpu": cpu_widget.value,
}


torch_model, trt_model = select_model(model, args)
# Each wrapper sets up its own tokenizer and model; the result is stored on
# the wrapper's `models` attribute.
torch_model.models = torch_model.setup_tokenizer_and_model()
trt_model.models = trt_model.setup_tokenizer_and_model()


## Run Inference!

Now it's time to play with the tasks. Each model supports a subset of the tasks below. If you are not satisfied with the examples we provide, you can supply your own prompt. Enjoy playing with the models!

In [None]:
# Every task the demo offers. The position of a task in this list is the
# column index of its flag in the support matrix below.
tasks = [
    'Text Generation',
    'Summarize',
    'Mask Filling',
    'Text Classification',
    'Translate English to German',
    'Translate English to French',
    'Other',
]

# Support matrix: one row per model family, one flag per entry of `tasks`.
tasks_per_model = {
    "BART":          [False, True,  True,  False, False, False, True],
    "T5":            [False, True,  False, True,  True,  True,  True],
    "GPT2":          [True,  False, False, False, False, False, True],
    "BLOOM":         [True,  False, False, False, False, False, True],
    "OPT":           [True,  False, False, False, False, False, True],
    "Other Seq2Seq": [True,  True,  True,  True,  True,  True,  True],
}

# Keep only the tasks flagged for the model family selected earlier.
valid_tasks = [
    task_name
    for task_name, supported in zip(tasks, tasks_per_model[model])
    if supported
]

task_widget = widgets.RadioButtons(
    options=valid_tasks, description='Task:', disabled=False
)

# Default input shown for each task.
example_text = {
    'Text Generation':
        "TensorRT is a machine learning inference accelerator.",
    'Summarize':
        "NVIDIA TensorRT-based applications perform up to 36X faster than CPU-only platforms during inference, enabling developers to optimize neural network models trained on all major frameworks, calibrate for lower precision with high accuracy, and deploy to hyperscale data centers, embedded platforms, or automotive product platforms.",
    'Mask Filling':
        "My friends are <mask> but they eat too many carbs.",
    'Text Classification':
        "premise: I do not like vegetable. hypothesis: I like eating lettuce a lot.",
    'Translate English to German':
        "TensorRT is a machine learning inference accelerator.",
    'Translate English to French':
        "TensorRT is a machine learning inference accelerator.",
    'Other':
        "What is inside your mind?"
}

# Which backend the "Generate" button should run.
framework_widget = widgets.RadioButtons(
    options=['PyTorch',
             'TensorRT'],
    description='Framework:',
    disabled=False
)

# Seed the input box with the example of the task that is actually selected.
# The first task overall ('Text Generation') is not offered for every model
# family, so indexing by the task picker's current value keeps the initial
# text consistent with what the picker shows.
paragraph_text = widgets.Textarea(
    value=example_text[task_widget.value],
    placeholder='Type something',
    description='Input:',
    disabled=False,
    style=widget_style,
    layout=widget_layout,
    rows=5,
)

# Text area that receives the model output after each "Generate" click.
generated_text = widgets.Textarea(
    value='...',
    placeholder='...',
    description='Model output:',
    disabled=False,
    layout=widgets.Layout(width="auto"),
    rows=5,
)
button = widgets.Button(description="Generate")
# Output area used as a context manager by the callbacks below.
output = widgets.Output()

# Render the controls top to bottom: task, framework, input, result, log, button.
display(
    task_widget,
    framework_widget,
    paragraph_text,
    generated_text,
    output,
    button,
)

def switch_task(change):
    """Replace the input text with the example for the newly selected task.

    Args:
        change: Change dictionary delivered by the task picker (not used; the
            picker's current value is read directly).
    """
    with output:
        selected_task = task_widget.value
        paragraph_text.value = example_text[selected_task]

# Fire on every change of the task picker's value.
task_widget.observe(switch_task, names='value')


def generate(b):
    """Run the current input through the selected framework and show the result.

    Click handler for the "Generate" button: builds the prompt from the input
    box, picks the PyTorch or TensorRT wrapper, and writes the first generated
    string into the output text area.

    Args:
        b: The clicked button (not used).
    """
    # Instruction prefixes for the tasks that need one; every other task
    # sends the input exactly as typed.
    task_prefixes = {
        "Translate English to German": "translate English to German: ",
        "Translate English to French": "translate English to French: ",
        "Summarize": "summarize: ",
    }
    prompt = task_prefixes.get(task_widget.value, "") + paragraph_text.value

    backend = framework_widget.value
    on_pytorch = backend == 'PyTorch'
    engine = torch_model if on_pytorch else trt_model

    with output:
        # use_cuda is switched off only for PyTorch with the CPU box ticked.
        run_on_gpu = not (on_pytorch and cpu_widget.value)
        _, decoded = engine.generate(input_str=prompt, use_cuda=run_on_gpu)
        # Only the first generated entry is displayed.
        generated_text.value = decoded[0]


button.on_click(generate)

## Performance Benchmarking

We are curious about how much acceleration TRT can provide for end-to-end (e2e) generation. We can run the demo in benchmarking mode, which uses randomized inputs/outputs of a user-specified length.

In [None]:
# You can change these benchmarking parameters, or loop through multiple configs
# You can change these benchmarking parameters, or loop through multiple configs.
# The benchmark works on its own `bench_*` names so that it never rebinds
# `model`, `variant`, `args`, `torch_model` or `trt_model`: the interactive
# cells above (in particular the "Generate" button callback) keep reading
# those globals and must not end up pointing at a benchmark-only wrapper.
bench_model = model
bench_variant = variant
input_seq_len = 256
output_seq_len = 512
num_beams = 1
batch_size = 1

# Wrappers and their results, keyed first by framework and then by a
# "use_cache:<bool>-precision:<fp32|fp16>" label.
model_dir = {"torch": {}, "trt": {}}
result_dir = {"torch": {}, "trt": {}}
print(f"Running variant:{bench_variant}, batch_size={batch_size}, num_beams={num_beams}, input_seq_len={input_seq_len}, output_seq_len={output_seq_len}")
for use_cache in [False, True]:
    for precision in ["fp32", "fp16"]:
        metadata = f"use_cache:{use_cache}-precision:{precision}"
        print("Running:", metadata)
        bench_args = {
            "variant": bench_variant,
            "use_cache": use_cache,
            "fp16": precision == "fp16",
            "num_beams": num_beams,
            "batch_size": batch_size,
            "working_dir": "models",
            "info": True,
            "iterations": 10,
            "number": 1,
            "warmup": 3,
            "duration": 0,
            "percentile": 50,
            # Benchmark-only settings.
            "action": "benchmark",
            "input_seq_len": input_seq_len,
            "output_seq_len": output_seq_len,
        }
        bench_torch_model, bench_trt_model = select_model(bench_model, bench_args)
        model_dir["torch"][metadata] = bench_torch_model
        model_dir["trt"][metadata] = bench_trt_model
        result_dir["torch"][metadata] = bench_torch_model.run()
        result_dir["trt"][metadata] = bench_trt_model.run()
        

In [None]:
from tabulate import tabulate
def process_results(result_dir):
    """Flatten the benchmark results into table rows plus a header row.

    Args:
        result_dir: Mapping with "torch" and "trt" entries, each mapping a
            "use_cache:<bool>-precision:<fp32|fp16>" label to a result object
            whose ``median_runtime`` is a sequence of items carrying ``name``
            and ``runtime`` attributes.

    Returns:
        ``(rows, headers)``: ``rows`` holds one list per (configuration,
        framework) pair, torch before trt; ``headers`` holds the three fixed
        column names followed by the segment names of the first torch result.
    """
    configurations = [
        (cached, prec) for cached in (False, True) for prec in ("fp32", "fp16")
    ]
    table = []
    columns = None
    for cached, prec in configurations:
        label = f"use_cache:{cached}-precision:{prec}"
        timings = {
            "torch": result_dir["torch"][label].median_runtime,
            "trt": result_dir["trt"][label].median_runtime,
        }
        if columns is None:
            # Column names are taken once, from the first torch result.
            segment_names = [segment.name for segment in timings["torch"]]
            columns = ["use_cache", "precision", "framework"] + segment_names
        for framework in ("torch", "trt"):
            runtimes = [segment.runtime for segment in timings[framework]]
            table.append([cached, prec, framework] + runtimes)
    return table, columns

# Flatten the benchmark results collected above and print them as a text table.
rows, headers = process_results(result_dir)
print(tabulate(rows, headers=headers))