In [None]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# TensorRT: Q&A with BERT

This notebook provides a playground for testing various BERT QA models on the CPU and GPU.

For "CPU - Framework (PyTorch)" and "GPU - Framework (PyTorch)", a SpanBERT large model from the HuggingFace model repository is used, and inference is carried out with PyTorch in FP32 precision. All models run with batch size 1.
The average run time across 10 runs is reported.

**Notes**: 
 - Prior to running this notebook, run [BERT-TRT-FP16.ipynb](BERT-TRT-FP16.ipynb) and [BERT-TRT-INT8-QAT-sparse.ipynb](BERT-TRT-INT8-QAT-sparse.ipynb) to generate the TensorRT engines.

In [None]:
# Silence all Python warnings before the heavier imports below run.
import warnings
warnings.filterwarnings('ignore')

# Extend the module search path; this has to happen before the
# `from helpers import ...` lines further down.
# NOTE(review): presumably this directory is where the `helpers` package lives -- confirm.
import sys
sys.path.append('/workspace/TensorRT/demo/BERT')

import ipywidgets as widgets
import tensorrt as trt;
# The TensorRT version string is reused later to build the engine directory names.
TRT_VERSION = trt.__version__
print("TensorRT version: ", TRT_VERSION)

import time
import json
import ctypes
import argparse
import collections
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
# Imported for its side effect only (no name from it is referenced).
# NOTE(review): pycuda.autoinit is expected to create the CUDA context -- confirm.
import pycuda.autoinit

from helpers import tokenization as tokenization
from helpers import data_processing as dp

# Logger shared by both TensorRT runtimes created later in the notebook.
TRT_LOGGER = trt.Logger(trt.Logger.INFO)

################################################## PyTorch inference #######################################################
# Optional environment setup (left commented out): installs a customized
# HuggingFace build that adds DL model inference timing.
#!pip3 install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio===0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
#!rm -rf /tmp/transformers
#!cd /tmp && git clone https://github.com/vinhngx/transformers && cd transformers && pip install .
# SpanBERT large model (340M params): https://github.com/facebookresearch/SpanBERT#finetuned-models-squad-1120-relation-extraction-coreference-resolution
from transformers import AutoTokenizer, BertForQuestionAnswering, pipeline

# Alternative checkpoint:
#modelname = 'deepset/bert-base-cased-squad2'
modelname = 'mrm8488/spanbert-large-finetuned-squadv2'

# CPU question-answering pipeline.
model = BertForQuestionAnswering.from_pretrained(modelname)
nlp = pipeline(
    'question-answering',
    model=model,
    tokenizer="SpanBERT/spanbert-large-cased",
)

# GPU question-answering pipeline: a second copy of the model, moved to CUDA
# and served from device 0.
model_gpu = BertForQuestionAnswering.from_pretrained(modelname).cuda()
nlp_gpu = pipeline(
    'question-answering',
    model=model_gpu,
    tokenizer="SpanBERT/spanbert-large-cased",
    device=0,
)

################################################## TensorRT inference #######################################################
def inference_FP16(trt_context, d_inputs, h_output, d_output, features, tokens):
    """Run the FP16 TensorRT engine on every feature and decode the answer.

    All copies and the engine execution are issued on the notebook-level CUDA
    ``stream``.

    Args:
        trt_context: TensorRT execution context used to launch inference.
        d_inputs: Device buffers that receive, in order, the input ids, the
            segment ids and the input mask.
        h_output: Host array the engine output is copied back into.
        d_output: Device buffer the engine writes its output to.
        features: Non-empty sequence of features exposing ``input_ids``,
            ``segment_ids`` and ``input_mask``.
        tokens: Document tokens, forwarded to ``dp.get_predictions``.

    Returns:
        A tuple ``(eval_time_elapsed, prediction, nbest_json)``.
        ``eval_time_elapsed`` is the mean wall-clock time in seconds per
        feature for the host-to-device copies plus engine execution; the
        device-to-host copy and the post-processing are not timed.

    Raises:
        ValueError: If ``features`` is empty (there is nothing to run and the
            per-feature average would be undefined).
    """
    if not features:
        raise ValueError("features is empty: nothing to run inference on")

    _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
            "NetworkOutput",
            ["start_logits", "end_logits", "feature_index"])
    networkOutputs = []

    # The binding addresses do not change between features, so build them once.
    bindings = [int(d_inp) for d_inp in d_inputs] + [int(d_output)]

    eval_time_elapsed = 0
    for feature_index, feature in enumerate(features):
        # Batch size is always 1, so each flattened input is simply a private,
        # contiguous copy of the feature array, registered with CUDA for the
        # asynchronous copies below.
        input_ids = cuda.register_host_memory(
            np.ascontiguousarray(np.array(feature.input_ids).ravel()))
        segment_ids = cuda.register_host_memory(
            np.ascontiguousarray(np.array(feature.segment_ids).ravel()))
        input_mask = cuda.register_host_memory(
            np.ascontiguousarray(np.array(feature.input_mask).ravel()))

        eval_start_time = time.time()
        cuda.memcpy_htod_async(d_inputs[0], input_ids, stream)
        cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream)
        cuda.memcpy_htod_async(d_inputs[2], input_mask, stream)

        # Run inference
        trt_context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        # Synchronize the stream so the measured time covers the whole execution
        stream.synchronize()
        eval_time_elapsed += (time.time() - eval_start_time)

        # Transfer predictions back from GPU (not part of the timed region)
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        stream.synchronize()

        for batch in h_output:
            # Data Post-processing: column 0 holds the start logits and
            # column 1 the end logits.
            logits = batch.squeeze()
            networkOutputs.append(_NetworkOutput(
                start_logits=np.array(logits[:, 0]),
                end_logits=np.array(logits[:, 1]),
                feature_index=feature_index
                ))

    eval_time_elapsed /= len(features)

    # The total number of n-best predictions to generate in the nbest_predictions.json output file
    n_best_size = 20

    # The maximum length of an answer that can be generated. This is needed
    #  because the start and end predictions are not conditioned on one another
    max_answer_length = 30

    prediction, nbest_json, _ = dp.get_predictions(tokens, features,
        networkOutputs, n_best_size, max_answer_length)

    return eval_time_elapsed, prediction, nbest_json

def inference_INT8(trt_context, d_inputs, h_output, d_output, features, tokens):
    """Run the INT8 TensorRT engine on every feature and decode the answer.

    Inputs are trimmed to the number of tokens counted by the feature's input
    mask, and the binding shapes are updated whenever that length changes.
    All copies and the engine execution are issued on the notebook-level
    ``INT8_stream``.

    Args:
        trt_context: TensorRT execution context used to launch inference.
        d_inputs: Device buffers for the engine inputs. The first three
            receive the input ids, the segment ids and ``cu_seq_lens``; any
            further buffer is passed as a binding without data being copied
            into it here.
        h_output: Flat host array the engine output is copied back into.
        d_output: Device buffer the engine writes its output to.
        features: Non-empty sequence of features exposing ``input_ids``,
            ``segment_ids`` and ``input_mask``.
        tokens: Document tokens, forwarded to ``dp.get_predictions``.

    Returns:
        A tuple ``(eval_time_elapsed, prediction, nbest_json)``.
        ``eval_time_elapsed`` is the mean wall-clock time in seconds per
        feature for the host-to-device copies plus engine execution; the
        device-to-host copy and the post-processing are not timed.

    Raises:
        ValueError: If ``features`` is empty (there is nothing to run and the
            per-feature average would be undefined).
    """
    if not features:
        raise ValueError("features is empty: nothing to run inference on")

    _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
            "NetworkOutput",
            ["start_logits", "end_logits", "feature_index"])
    networkOutputs = []

    # The binding addresses do not change between features, so build them once.
    bindings = [int(d_inp) for d_inp in d_inputs] + [int(d_output)]

    eval_time_elapsed = 0
    for feature_index, feature in enumerate(features):
        # Number of tokens flagged by the mask, as a plain Python int so it
        # can be used for slicing and for binding shapes without relying on
        # implicit numpy-scalar conversion.
        S = int(np.sum(feature.input_mask))
        input_ids = feature.input_ids[0:S]
        segment_ids = feature.segment_ids[0:S]
        # NOTE(review): presumably the cumulative sequence-length offsets for
        # the single sequence in the batch -- confirm against the engine.
        cu_seq_lens = np.array([0, S], dtype=np.int32)

        # Only touch a binding shape when the required length differs from
        # the one currently set. Binding 2 (cu_seq_lens) always has 2 entries.
        for binding_index, length in ((0, S), (1, S), (2, 2), (3, S)):
            if trt_context.get_binding_shape(binding_index)[0] != length:
                trt_context.set_binding_shape(binding_index, (length,))

        h_input_ids = cuda.register_host_memory(np.ascontiguousarray(input_ids.ravel()))
        h_segment_ids = cuda.register_host_memory(np.ascontiguousarray(segment_ids.ravel()))
        h_cu_seq_lens = cuda.register_host_memory(np.ascontiguousarray(cu_seq_lens.ravel()))

        eval_start_time = time.time()
        cuda.memcpy_htod_async(d_inputs[0], h_input_ids, INT8_stream)
        cuda.memcpy_htod_async(d_inputs[1], h_segment_ids, INT8_stream)
        cuda.memcpy_htod_async(d_inputs[2], h_cu_seq_lens, INT8_stream)

        # Run inference
        trt_context.execute_async_v2(bindings=bindings, stream_handle=INT8_stream.handle)
        # Synchronize the stream so the measured time covers the whole execution
        INT8_stream.synchronize()
        eval_time_elapsed += (time.time() - eval_start_time)

        # Transfer predictions back from GPU (not part of the timed region)
        cuda.memcpy_dtoh_async(h_output, d_output, INT8_stream)
        INT8_stream.synchronize()

        # Only retrieve and post-process the first batch: the first S values
        # are read as start logits and the following S values as end logits.
        networkOutputs.append(_NetworkOutput(
            start_logits=np.array(h_output[0:S]),
            end_logits=np.array(h_output[S:S*2]),
            feature_index=feature_index
            ))

    eval_time_elapsed /= len(features)

    # Total number of n-best predictions to generate in the nbest_predictions.json output file
    n_best_size = 20

    # The maximum length of an answer that can be generated. This is needed
    # because the start and end predictions are not conditioned on one another
    max_answer_length = 30

    prediction, nbest_json, _ = dp.get_predictions(tokens, features,
            networkOutputs, n_best_size, max_answer_length)

    return eval_time_elapsed, prediction, nbest_json
        
def print_single_query(eval_time_elapsed, prediction, nbest_json):
    """Print the predicted answer followed by the top n-best probability.

    Args:
        eval_time_elapsed: Accepted for call-site symmetry; not used here.
        prediction: Answer text to display.
        nbest_json: Sequence whose first entry holds a ``'probability'``
            value in the range the caller produced (printed as a percentage).
    """
    print("Answer: '{}'".format(prediction))
    top_entry = nbest_json[0]
    confidence_pct = top_entry['probability'] * 100.0
    print("With probability: {:.2f}%".format(confidence_pct))
    
def question_features(tokens, question):
    """Convert a tokenized passage and a question into model input features.

    The notebook-level ``tokenizer``, ``max_seq_length``, ``doc_stride`` and
    ``max_query_length`` are looked up at call time and passed through to
    ``dp.convert_example_to_features``.
    """
    features = dp.convert_example_to_features(
        tokens,
        question,
        tokenizer,
        max_seq_length,
        doc_stride,
        max_query_length,
    )
    return features

# Feature-extraction settings read by question_features().
doc_stride, max_query_length = 128, 64

# Vocabulary of the fine-tuned checkpoint; tokens are lower-cased.
vocab_file = "models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_384_v19.03.1/vocab.txt"
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_file,
    do_lower_case=True,
)

### FP16 TRT model
engine_path = "engines_{}/bert_large_384.engine".format(TRT_VERSION)
max_seq_length = 384
batch_size = 1

runtime = trt.Runtime(TRT_LOGGER)
# Read the serialized engine through a context manager so the file handle is
# closed as soon as the bytes are in memory.
with open(engine_path, 'rb') as engine_file:
    engine = runtime.deserialize_cuda_engine(engine_file.read())
# Deserialization reports failure by returning None; fail here with a clear
# message instead of an AttributeError on the next line.
if engine is None:
    raise RuntimeError("Failed to deserialize TensorRT engine: {}".format(engine_path))
context = engine.create_execution_context()

# We always use batch size 1.
input_shape = (1, max_seq_length)
input_nbytes = trt.volume(input_shape) * trt.int32.itemsize

# Allocate device memory for inputs (one buffer per input binding).
d_inputs = [cuda.mem_alloc(input_nbytes) for _ in range(3)]

# Specify input shapes. These must be within the min/max bounds of the active profile (0th profile in this case)
# Note that input shapes can be specified on a per-inference basis, but in this case, we only have a single shape.
for binding in range(3):
    context.set_binding_shape(binding, input_shape)
assert context.all_binding_shapes_specified

# Allocate output buffer by querying the size from the context. This may be different for different input shapes.
h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(3)), dtype=np.float32)
d_output = cuda.mem_alloc(h_output.nbytes)

# Create a stream in which to copy inputs/outputs and run inference.
stream = cuda.Stream()

### INT8 TRT model
engine_path = "engines_%s/megatron_large_seqlen384_int8qat_sparse.engine"%TRT_VERSION
max_seq_length = 384

INT8_runtime = trt.Runtime(TRT_LOGGER)
# Read the serialized engine through a context manager so the file handle is
# closed as soon as the bytes are in memory.
with open(engine_path, 'rb') as engine_file:
    INT8_engine = INT8_runtime.deserialize_cuda_engine(engine_file.read())
# Deserialization reports failure by returning None; fail here with a clear
# message instead of an AttributeError on the next line.
if INT8_engine is None:
    raise RuntimeError("Failed to deserialize TensorRT engine: {}".format(engine_path))
INT8_context = INT8_engine.create_execution_context()

# select engine profile
INT8_context.active_optimization_profile = 0

# Each input buffer is sized for the longest supported sequence of int32 values.
input_nbytes = max_seq_length * trt.int32.itemsize

# Allocate device memory for inputs (one buffer per input binding).
INT8_d_inputs = [cuda.mem_alloc(input_nbytes) for _ in range(4)]

# Allocate a flat output buffer with room for two float32 values per position
# at the maximum sequence length.
INT8_h_output = cuda.pagelocked_empty((2 * max_seq_length), dtype=np.float32)
INT8_d_output = cuda.mem_alloc(INT8_h_output.nbytes)

# Create a stream in which to copy inputs/outputs and run inference.
INT8_stream = cuda.Stream()

In [None]:
# Import `display` before its first use below instead of relying on the
# IPython-injected builtin.
from IPython.display import display

# Backend selector; the option strings are matched verbatim by answer().
device = widgets.RadioButtons(
    options=['CPU - Framework (PyTorch)',
             'GPU - Framework (PyTorch)',
             'GPU - TensorRT FP16',
             'GPU - TensorRT INT8'],
    description='Device:',
    disabled=False
)

# Default passage. Each fragment ends with a space so that the implicit
# string concatenation does not glue words together across line breaks.
paragraph_text = widgets.Textarea(
    value=(
        'TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps '
        'such as recommenders, speech and image/video on NVIDIA GPUs. It includes parsers to import models, and plugins to support novel ops '
        'and layers before applying optimizations for inference. Today NVIDIA is open-sourcing parsers and plugins in TensorRT so that the deep '
        'learning community can customize and extend these components to take advantage of powerful TensorRT optimizations for your apps.'
    ),
    placeholder='Type something',
    description='Passage:',
    disabled=False,
    layout=widgets.Layout(width="auto"),
    rows=10,
)

question_text = widgets.Textarea(
    value='What is TensorRT?',
    placeholder='Type something',
    description='Question:',
    disabled=False,
    layout=widgets.Layout(width="auto"),
    rows=2,
)
display(paragraph_text)
display(question_text)

box_layout = widgets.Layout(display='flex',
                flex_flow='column',
                align_items='center',
                width='100%')

button = widgets.Button(description="Answer Me!")
# Output area that answer() prints its results into.
output = widgets.Output()
box = widgets.HBox(children=[button],layout=box_layout)

# Number of timed repetitions per click; also the progress bar's maximum.
N_RUN = 10

def answer(b):
    """Button callback: answer the current question with the selected backend.

    Resets ``progress_bar``, then runs the backend chosen in ``device``
    ``N_RUN`` times, advancing the progress bar after each run. The answer and
    probability of the last run and the mean latency over all runs are printed
    into the ``output`` widget, whose previous content is cleared first.

    For the TensorRT backends the latency is the value reported by the
    inference function; for the PyTorch backends it is the wall-clock time of
    the whole pipeline call.

    Args:
        b: The clicked button, supplied by ipywidgets; not used.
    """
    def _print_average(timings):
        # Shared latency summary line, reported in milliseconds.
        print("Average inference time (over {} runs): {:.2f} ms".format(N_RUN, 1000*np.mean(timings)))

    def _run_tensorrt(infer_fn, trt_args):
        # Time N_RUN TensorRT runs and print the result of the last one.
        timings = []
        for _ in range(N_RUN):
            doc_tokens = dp.convert_doc_tokens(paragraph_text.value)
            features = question_features(doc_tokens, question_text.value)
            eval_time_elapsed, prediction, nbest_json = infer_fn(*trt_args, features, doc_tokens)
            progress_bar.value += 1
            timings.append(eval_time_elapsed)

        print_single_query(eval_time_elapsed, prediction, nbest_json)
        _print_average(timings)

    def _run_pytorch(qa_pipeline):
        # Time N_RUN pipeline calls and print the result of the last one.
        timings = []
        for _ in range(N_RUN):
            start_time = time.time()
            result = qa_pipeline({
                    'question': question_text.value,
                    'context': paragraph_text.value
                    })
            # Stop the clock before touching the widget so the progress-bar
            # update is not counted as inference time.
            timings.append(time.time() - start_time)
            progress_bar.value += 1

        print("Answer: '{}'".format(result['answer']))
        print("With probability: {:.2f}%".format(result['score']*100))
        _print_average(timings)

    # Lambdas defer the lookup of the engine/pipeline globals until the
    # selected backend actually runs.
    backends = {
        'GPU - TensorRT FP16': lambda: _run_tensorrt(
            inference_FP16, (context, d_inputs, h_output, d_output)),
        'GPU - TensorRT INT8': lambda: _run_tensorrt(
            inference_INT8, (INT8_context, INT8_d_inputs, INT8_h_output, INT8_d_output)),
        'CPU - Framework (PyTorch)': lambda: _run_pytorch(nlp),
        'GPU - Framework (PyTorch)': lambda: _run_pytorch(nlp_gpu),
    }

    progress_bar.value = 0
    with output:
        run_backend = backends.get(device.value)
        # An unrecognized selection leaves the output untouched.
        if run_backend is not None:
            output.clear_output()
            run_backend()
            
# Register the click handler and show the backend selector, the button row
# and the output area, in that order.
button.on_click(answer)
display(device, box, output)

# The progress bar is created after the handler is registered; that is fine
# because answer() only looks up `progress_bar` when the button is clicked.
# Its maximum equals N_RUN, and answer() increments it once per run.
progress_bar = widgets.IntProgress(
    value=0,
    min=0,
    max=N_RUN,
    description='Progress:',
    bar_style='', # 'success', 'info', 'warning', 'danger' or ''
    style={'bar_color': 'green'},
    orientation='horizontal', 
    layout=widgets.Layout(width='100%', height='50px')
)
display(progress_bar)