In [4]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# BERT TensorRT Benchmarking: FP16 vs. INT8 QAT+Sparsity

In this notebook, we benchmark different BERT Large TensorRT engines at different batch sizes.

**Notes**: 
 - Prior to running this notebook, run [BERT-TRT-FP16.ipynb](BERT-TRT-FP16.ipynb) and [BERT-TRT-INT8-QAT-sparse.ipynb](BERT-TRT-INT8-QAT-sparse.ipynb) to generate the TensorRT engines.
 - This benchmark measures only the compute (inference) portion, using synthetic inputs; pre- and post-processing time is not included.

In [5]:
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('/workspace/TensorRT/demo/BERT')

import tensorrt as trt;
TRT_VERSION = trt.__version__

import time
import argparse
import ctypes
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from tqdm import tqdm

import ipywidgets as widgets
from ipywidgets import IntProgress
from ipywidgets import Button, Layout
from IPython.display import display

import helpers.tokenization as tokenization
import helpers.data_processing as dp

# TensorRT logger created with the ERROR severity level; it is handed to
# trt.Runtime when the benchmark functions deserialize an engine.
TRT_LOGGER = trt.Logger(trt.Logger.ERROR)

class DeviceBuffer:
    """One device allocation obtained from ``cuda.mem_alloc``.

    The allocation is exposed as ``self.buf``; its size is the element count
    of ``shape`` multiplied by the item size of ``dtype``.
    """

    def __init__(self, shape, dtype=trt.int32):
        """Allocate ``trt.volume(shape) * dtype.itemsize`` bytes."""
        num_bytes = trt.volume(shape) * dtype.itemsize
        self.buf = cuda.mem_alloc(num_bytes)

    def binding(self):
        """Return the allocation converted to ``int`` (the form placed in a bindings list)."""
        address = int(self.buf)
        return address

    def free(self):
        """Release the allocation by calling ``free()`` on it."""
        self.buf.free()

# Sequence-related constants.
doc_stride = 128
max_query_length = 64
max_seq_length = 384

# WordPiece vocabulary shipped with the fine-tuned checkpoint, and a tokenizer built from it.
vocab_file = "models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_384_v19.03.1/vocab.txt"
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)

# Benchmark options. They are declared through argparse so the defaults live in
# one place; the benchmark functions read them from `args`.
parser = argparse.ArgumentParser(description='BERT Inference Benchmark')
parser.add_argument(
    "-e", "--engine",
    default='',
    help='Path to BERT TensorRT engine',
)
parser.add_argument(
    '-b', '--batch-size',
    action="append",
    default=[],
    type=int,
    help='Batch size(s) to benchmark. Can be specified multiple times for more than one batch size. This script assumes that the engine has been built with one optimization profile for each batch size, and that these profiles are in order of increasing batch size.',
)
parser.add_argument(
    '-s', '--sequence-length',
    default=384,
    type=int,
    help='Sequence length of the BERT model',
)
parser.add_argument(
    '-i', '--iterations',
    default=1000,
    type=int,
    help='Number of iterations to run when benchmarking each batch size.',
)
parser.add_argument(
    '-w', '--warm-up-runs',
    default=10,
    type=int,
    help='Number of iterations to run prior to benchmarking.',
)
parser.add_argument(
    '-r', '--random-seed',
    required=False,
    default=12345,
    type=int,
    help='Random seed.',
)

# parse_known_args() returns the arguments this parser does not define as a
# second value instead of failing on them; that second value is discarded.
args, _ = parser.parse_known_args()
if not args.batch_size:
    # No batch size given: fall back to a single batch of 1.
    args.batch_size = [1]

# Import necessary plugins for BERT TensorRT
ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)

### INT8 TRT model
def run_benchmark_INT8(b):
    """Benchmark the INT8 QAT + sparsity engine on synthetic inputs and print latency statistics.

    The batch size is read from ``batchsize_selector`` and the number of timed
    iterations from ``iteration_selector``. ``progress_bar`` is advanced once
    per timed iteration and the report is printed inside the ``output`` widget.

    Args:
        b: Value forwarded by the caller (``run_benchmark`` passes its own
            argument through); it is not used here.

    Raises:
        RuntimeError: If none of the engine's optimization profiles covers the
            selected batch size.
    """
    engine_path = "engines_%s/megatron_large_seqlen384_int8qat_sparse.engine" % TRT_VERSION
    with open(engine_path, 'rb') as f, \
            trt.Runtime(TRT_LOGGER) as runtime, \
            runtime.deserialize_cuda_engine(f.read()) as engine, \
            engine.create_execution_context() as context:
        with output:
            args.batch_size = [int(batchsize_selector.value)]
            max_batch_size = max(args.batch_size)
            # Inputs are stored flattened: one entry per token of the largest batch.
            flat_length = args.sequence_length * max_batch_size

            # Allocate buffers large enough to store the largest batch size
            buffers = [
                DeviceBuffer((flat_length, )),
                DeviceBuffer((flat_length, )),
                DeviceBuffer((max_batch_size + 1, )),
                DeviceBuffer((args.sequence_length, )),
                DeviceBuffer((flat_length, 2, 1, 1)),
            ]

            bench_times = {}
            # try/finally so the device buffers are released even when profile
            # selection, shape setup or inference raises.
            try:
                # Prepare random input
                pseudo_vocab_size = 30522
                pseudo_type_vocab_size = 2
                np.random.seed(args.random_seed)
                test_word_ids = np.random.randint(0, pseudo_vocab_size, flat_length, dtype=np.int32)
                test_segment_ids = np.random.randint(0, pseudo_type_vocab_size, flat_length, dtype=np.int32)
                # Offsets 0, S, 2S, ..., S * max_batch_size (max_batch_size + 1 entries).
                test_cu_seq_lens = np.arange(0, flat_length + 1, args.sequence_length, dtype=np.int32)

                # Copy input h2d
                cuda.memcpy_htod(buffers[0].buf, test_word_ids.ravel())
                cuda.memcpy_htod(buffers[1].buf, test_segment_ids.ravel())
                cuda.memcpy_htod(buffers[2].buf, test_cu_seq_lens.ravel())

                num_binding_per_profile = engine.num_bindings // engine.num_optimization_profiles

                for batch_size in sorted(args.batch_size):
                    # Pick the first profile whose [min, max] range on its first
                    # binding contains this batch size.
                    for profile_idx in range(engine.num_optimization_profiles):
                        binding_idx_offset = profile_idx * num_binding_per_profile
                        profile_shape = engine.get_profile_shape(profile_index=profile_idx, binding=binding_idx_offset)
                        if profile_shape[0][0] <= batch_size <= profile_shape[2][0]:
                            context.active_optimization_profile = profile_idx
                            break
                    else:
                        # Previously this fell through with binding_idx_offset
                        # unbound (or left over from an earlier batch size).
                        raise RuntimeError(
                            "No optimization profile in {} supports batch size {}".format(engine_path, batch_size))

                    # Each profile has unique bindings
                    bindings = [0] * binding_idx_offset + [buf.binding() for buf in buffers]
                    # NOTE(review): the buffers above are sized for flattened 1-D
                    # inputs (sequence_length * batch_size, batch_size + 1 and
                    # sequence_length entries), yet a 2-D (batch_size,
                    # sequence_length) shape is set here and only on the first
                    # three bindings -- confirm this matches how the engine's
                    # inputs were declared.
                    input_shape = (batch_size, args.sequence_length)
                    for binding in range(3):
                        context.set_binding_shape(binding_idx_offset + binding, input_shape)
                    assert context.all_binding_shapes_specified

                    # Inference
                    start = cuda.Event()
                    end = cuda.Event()
                    stream = cuda.Stream()

                    # Warmup
                    for _ in range(args.warm_up_runs):
                        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
                        stream.synchronize()

                    # Timing loop: each iteration is bracketed by two events
                    # recorded on the same stream as the inference call.
                    times = []
                    progress_bar.value = 0
                    for _ in range(iteration_selector.value):
                        start.record(stream)
                        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
                        end.record(stream)
                        stream.synchronize()
                        times.append(end.time_since(start))
                        progress_bar.value += 1

                    bench_times[batch_size] = times
            finally:
                for buf in buffers:
                    buf.free()

            # Compute average time, 95th percentile time and 99th percentile time.
            for batch_size, times in bench_times.items():
                total_time = sum(times)
                avg_time = total_time / float(len(times))
                times.sort()
                percentile95 = times[int(len(times) * 0.95)]
                percentile99 = times[int(len(times) * 0.99)]
                # Report the number of runs actually timed rather than
                # args.iterations, which is only in sync when called via run_benchmark.
                print("BERT TRT INT8: Running {:} iterations with Batch Size: {:}\n\tTotal Time: {:.2f} ms \tAverage Time: {:.2f} ms\t95th Percentile Time: {:.2f} ms\t99th Percentile Time: {:.2f}".format(len(times), batch_size, total_time, avg_time, percentile95, percentile99))


### FP16 TRT model
def run_benchmark_FP16(b):
    """Benchmark the FP16 BERT Large engine on synthetic inputs and print latency statistics.

    The batch size is read from ``batchsize_selector`` and the number of timed
    iterations from ``iteration_selector``. ``progress_bar`` is advanced once
    per timed iteration and the report is printed inside the ``output`` widget.

    Args:
        b: Value forwarded by the caller (``run_benchmark`` passes its own
            argument through); it is not used here.

    Raises:
        RuntimeError: If none of the engine's optimization profiles covers the
            selected batch size.
    """
    engine_path = "engines_%s/bert_large_384.engine" % TRT_VERSION
    with open(engine_path, 'rb') as f, \
            trt.Runtime(TRT_LOGGER) as runtime, \
            runtime.deserialize_cuda_engine(f.read()) as engine, \
            engine.create_execution_context() as context:
        with output:
            args.batch_size = [int(batchsize_selector.value)]
            max_batch_size = max(args.batch_size)

            # Allocate buffers large enough to store the largest batch size
            max_input_shape = (max_batch_size, args.sequence_length)
            max_output_shape = (max_batch_size, args.sequence_length, 2, 1, 1)
            buffers = [
                DeviceBuffer(max_input_shape),
                DeviceBuffer(max_input_shape),
                DeviceBuffer(max_input_shape),
                DeviceBuffer(max_output_shape),
            ]

            bench_times = {}
            # try/finally so the device buffers are released even when profile
            # selection, shape setup or inference raises.
            try:
                # Prepare random input
                pseudo_vocab_size = 30522
                pseudo_type_vocab_size = 2
                np.random.seed(args.random_seed)
                test_word_ids = np.random.randint(0, pseudo_vocab_size, max_input_shape, dtype=np.int32)
                test_segment_ids = np.random.randint(0, pseudo_type_vocab_size, max_input_shape, dtype=np.int32)
                test_input_mask = np.ones(max_input_shape, dtype=np.int32)

                # Copy input h2d
                cuda.memcpy_htod(buffers[0].buf, test_word_ids.ravel())
                cuda.memcpy_htod(buffers[1].buf, test_segment_ids.ravel())
                cuda.memcpy_htod(buffers[2].buf, test_input_mask.ravel())

                # Loop-invariant, so computed once.
                num_binding_per_profile = engine.num_bindings // engine.num_optimization_profiles

                for batch_size in sorted(args.batch_size):
                    # Pick the first profile whose [min, max] range on its first
                    # binding contains this batch size.
                    for profile_idx in range(engine.num_optimization_profiles):
                        binding_idx_offset = profile_idx * num_binding_per_profile
                        profile_shape = engine.get_profile_shape(profile_index=profile_idx, binding=binding_idx_offset)
                        if profile_shape[0][0] <= batch_size <= profile_shape[2][0]:
                            context.active_optimization_profile = profile_idx
                            break
                    else:
                        # Previously this fell through with binding_idx_offset
                        # unbound (or left over from an earlier batch size).
                        raise RuntimeError(
                            "No optimization profile in {} supports batch size {}".format(engine_path, batch_size))

                    # Each profile has unique bindings
                    bindings = [0] * binding_idx_offset + [buf.binding() for buf in buffers]
                    input_shape = (batch_size, args.sequence_length)
                    for binding in range(3):
                        context.set_binding_shape(binding_idx_offset + binding, input_shape)
                    assert context.all_binding_shapes_specified

                    # Inference
                    start = cuda.Event()
                    end = cuda.Event()
                    stream = cuda.Stream()

                    # Warmup
                    for _ in range(args.warm_up_runs):
                        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
                        stream.synchronize()

                    # Timing loop: each iteration is bracketed by two events
                    # recorded on the same stream as the inference call.
                    times = []
                    progress_bar.value = 0
                    for _ in range(iteration_selector.value):
                        start.record(stream)
                        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
                        end.record(stream)
                        stream.synchronize()
                        times.append(end.time_since(start))
                        progress_bar.value += 1

                    bench_times[batch_size] = times
            finally:
                for buf in buffers:
                    buf.free()

            # Compute average time, 95th percentile time and 99th percentile time.
            for batch_size, times in bench_times.items():
                total_time = sum(times)
                avg_time = total_time / float(len(times))
                times.sort()
                percentile95 = times[int(len(times) * 0.95)]
                percentile99 = times[int(len(times) * 0.99)]
                # Report the number of runs actually timed rather than
                # args.iterations, which is only in sync when called via run_benchmark.
                print("BERT TRT FP16: Running {:} iterations with Batch Size: {:}\n\tTotal Time: {:.2f} ms \tAverage Time: {:.2f} ms\t95th Percentile Time: {:.2f} ms\t99th Percentile Time: {:.2f}".format(len(times), batch_size, total_time, avg_time, percentile95, percentile99))


In [8]:
# UI elements
# Keyword arguments that only restated the widget defaults have been left out.

# Engine to benchmark; run_benchmark compares the selected label against these strings.
engine_selector = widgets.RadioButtons(
    description='Engine:',
    options=['GPU - TensorRT FP16', 'GPU - TensorRT INT8'],
)

# Batch sizes are offered as strings; the benchmark functions convert the
# selection with int().
batchsize_selector = widgets.RadioButtons(
    description='Batch size:',
    options=['1', '32', '64', '128'],
)

# Number of timed iterations per run.
iteration_selector = widgets.IntSlider(
    description='Iterations:',
    value=500,
    min=100,
    max=1000,
    continuous_update=False,
)

# Trigger for the benchmark, and the area that receives its printed report.
button = widgets.Button(description="Run benchmark")
output = widgets.Output()

def run_benchmark(b):
    """Run the benchmark that matches the current ``engine_selector`` choice.

    Copies the slider value into ``args.iterations`` and ``progress_bar.max``,
    then calls ``run_benchmark_FP16`` or ``run_benchmark_INT8`` inside the
    ``output`` context. Any other selection does nothing.

    Args:
        b: Passed through unchanged to the selected benchmark function.
    """
    num_iterations = iteration_selector.value
    args.iterations = num_iterations
    progress_bar.max = num_iterations

    with output:
        choice = engine_selector.value
        if choice == 'GPU - TensorRT FP16':
            run_benchmark_FP16(b)
            return
        if choice == 'GPU - TensorRT INT8':
            run_benchmark_INT8(b)
        
# Register the click handler, then show the controls in order: selectors,
# the button with the output area, and finally the progress bar.
button.on_click(run_benchmark)
display(engine_selector, batchsize_selector, iteration_selector)

from IPython.display import display
box_layout = widgets.Layout(
    display='flex',
    flex_flow='column',
    align_items='center',
    width='100%',
)
box = widgets.HBox(children=[button], layout=box_layout)
display(box, output)

# Progress indicator driven by the benchmark functions; run_benchmark resets
# `max` to the selected iteration count before each run. Keyword arguments
# that only restated the widget defaults have been left out.
progress_bar = widgets.IntProgress(
    description='Progress:',
    value=0,
    max=1000,
    style={'bar_color': 'green'},
    layout=Layout(width='100%', height='50px'),
)
display(progress_bar)
    

RadioButtons(description='Engine:', options=('GPU - TensorRT FP16', 'GPU - TensorRT INT8'), value='GPU - Tenso…

RadioButtons(description='Batch size:', options=('1', '32', '64', '128'), value='1')

IntSlider(value=500, continuous_update=False, description='Iterations:', max=1000, min=100)

HBox(children=(Button(description='Run benchmark', style=ButtonStyle()),), layout=Layout(align_items='center',…

Output()

IntProgress(value=0, description='Progress:', layout=Layout(height='50px', width='100%'), max=1000, style=Prog…