In [None]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# GPT-2 Playground

This notebook demonstrates the GPT-2 model for open-ended text generation.

The TensorRT HuggingFace GPT-2 model is a drop-in replacement for the original PyTorch HuggingFace GPT-2 model.


**Notes**: 
 - For "CPU - PyTorch" and "GPU - PyTorch", the GPT-2 small model from the HuggingFace model repository is used, and inference is carried out with PyTorch in FP32 precision. All models run with batch size 1.
Each measurement executes 6 runs; the first run is treated as a warm-up and discarded, and the average run time across the remaining 5 runs is reported.
 - Prior to running this notebook, run [gpt2.ipynb](gpt2.ipynb) to download the GPT-2 model and generate the TensorRT engine.

In [None]:
import os
import sys
ROOT_DIR = os.path.abspath("../")
sys.path.append(ROOT_DIR)

import warnings
warnings.filterwarnings('ignore')

import torch 

# huggingface
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    GPT2Config,
)

from GPT2.trt import GPT2TRTDecoder, GPT2TRTEngine
from NNDF.networks import NetworkMetadata, Precision
from collections import namedtuple 
from GPT2.GPT2ModelConfig import GPT2ModelTRTConfig, GPT2Metadata

from os.path import exists

# Download the HuggingFace model, its configuration, and the tokenizer.
GPT2_VARIANT = 'gpt2' # choices: gpt2 | gpt2-large
model = GPT2LMHeadModel.from_pretrained(GPT2_VARIANT)
# GPT2Config's first positional parameter is vocab_size, so GPT2Config(GPT2_VARIANT)
# would store the variant name as the vocabulary size. from_pretrained loads the
# configuration that belongs to the selected checkpoint instead.
config = GPT2Config.from_pretrained(GPT2_VARIANT)
tokenizer = GPT2Tokenizer.from_pretrained(GPT2_VARIANT)

# Load the TensorRT engine.
# NOTE(review): the engine path is fixed to the 'gpt2' variant; confirm where
# gpt2.ipynb writes the engine before switching GPT2_VARIANT.
TRT_ENGINE_PATH = './models/gpt2/trt-engine/gpt2.onnx.engine'
metadata = NetworkMetadata(variant=GPT2_VARIANT, precision=Precision(fp16=False), other=GPT2Metadata(kv_cache=False))
if not exists(TRT_ENGINE_PATH):
    # Keep the names defined (and clear any stale objects from a previous run of
    # this cell) so later cells can detect that the engine is unavailable.
    gpt2_engine = None
    gpt2_trt = None
    print("Error: TensorRT engine not found at %s. Please run gpt2.ipynb to generate the TensorRT engine first!" % TRT_ENGINE_PATH)
else:
    gpt2_engine = GPT2TRTEngine(TRT_ENGINE_PATH, metadata)
    gpt2_trt = GPT2TRTDecoder(gpt2_engine, metadata, config)

In [None]:
import ipywidgets as widgets
import numpy as np
import time

# Import display explicitly before its first use instead of relying on the
# name IPython injects into the interactive namespace.
from IPython.display import display

# Number of generate() calls per button click. The click handler averages the
# timings with the first run left out, so 6 runs yield a 5-run average.
N_RUN = 6

# Backend selector.
device = widgets.RadioButtons(
    options=['CPU - PyTorch', 
             'GPU - PyTorch', 
             'GPU - TensorRT'],
    description='Device:',
    disabled=False
)

# Prompt fed to the model.
paragraph_text = widgets.Textarea(
    value='TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps '\
'such as recommenders, speech and image/video on NVIDIA GPUs. ',
    placeholder='Type something',
    description='Context:',
    disabled=False,
    layout=widgets.Layout(width="auto"),
    rows=5,  
)

# Text area that receives the generated continuation.
generated_text = widgets.Textarea(
    value='...',
    placeholder='GPT-2 generated text',
    description='GPT-2:',
    disabled=False,
    layout=widgets.Layout(width="auto"),
    rows=5,
)
button = widgets.Button(description="Generate")

display(paragraph_text)
display(generated_text)
display(device)

box_layout = widgets.Layout(display='flex',
                flex_flow='column',
                align_items='center',
                width='100%')

# One progress step per run.
progress_bar = widgets.IntProgress(
    value=0,
    min=0,
    max=N_RUN,
    description='Progress:',
    bar_style='', # 'success', 'info', 'warning', 'danger' or ''
    style={'bar_color': 'green'},
    orientation='horizontal', 
    layout=widgets.Layout(width='100%', height='50px')
)

box = widgets.HBox(children=[button],layout=box_layout)
# Captures whatever the click handler prints.
output = widgets.Output()
display(box)
display(progress_bar)
display(output)

def generate(b):
    """Generate text with the selected backend and report the average latency.

    Reads the prompt from ``paragraph_text`` and the backend from ``device``,
    calls ``generate`` ``N_RUN`` times, writes the decoded result of the last
    run to ``generated_text`` and prints the mean time of all runs except the
    first (warm-up) one into ``output``.

    Args:
        b: The argument supplied by ``button.on_click``; unused.
    """
    backend = device.value
    progress_bar.value = 0
    with output:
        if backend == 'GPU - TensorRT':
            # The TensorRT decoder exists only if the engine was loaded; look it
            # up defensively so a missing engine gives a message, not a NameError.
            runner = globals().get('gpt2_trt')
            if runner is None:
                print("Error: TensorRT engine is not loaded. Please run gpt2.ipynb to generate the TensorRT engine first!")
                return
            target = 'cuda:0'
        elif backend == 'CPU - PyTorch':
            target = 'cpu'
            runner = model.to(target)
        elif backend == 'GPU - PyTorch':
            target = 'cuda:0'
            runner = model.to(target)
        else:
            # Unknown selection: nothing to do.
            return

        # Tokenize once and move the prompt to the target device outside the
        # timed region so only generation is measured.
        input_ids = tokenizer(paragraph_text.value, return_tensors="pt").input_ids.to(target)
        if input_ids.numel() == 0:
            # An empty prompt produces no tokens to condition on.
            print("Error: the context is empty. Please type some text first.")
            return

        max_length = GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT]
        on_gpu = target != 'cpu'
        inference_time_arr = []
        for _ in range(N_RUN):
            start_time = time.perf_counter()
            sample_output = runner.generate(input_ids, max_length=max_length)
            if on_gpu:
                # Wait for outstanding GPU work so the elapsed time covers it.
                torch.cuda.synchronize()
            inference_time_arr.append(time.perf_counter() - start_time)
            progress_bar.value += 1

        # De-tokenize the model output to raw text.
        generated_text.value = tokenizer.decode(sample_output[0], skip_special_tokens=True)
        # The first run is a warm-up and is excluded from the average.
        print("%s - Average inference time: %.2f (ms)" % (backend, 1000*np.mean(inference_time_arr[1:])))

button.on_click(generate)