In [None]:
# Copyright 2023 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Accelerating HuggingFace BLIP Inference with TensorRT

BLIP is an encoder-decoder multimodal model that can perform different vision-language tasks. Here we show BLIP working on an image captioning task; for this task the model architecture consists of a vision encoder and a text decoder.

This notebook shows easy steps to convert a [HuggingFace PyTorch BLIP model](https://huggingface.co/docs/transformers/model_doc/blip) to a TensorRT engine for high-performance inference in a few lines of code.

## Prerequisite

Follow the instructions at https://github.com/NVIDIA/TensorRT to build the TensorRT-OSS docker container required to run this notebook.

Next, we install some extra dependencies.

In [None]:
%%capture
!pip3 install -r ../requirements.txt

**Note:** After this step, you should restart the Jupyter kernel for the change to take effect.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
ROOT_DIR = os.path.abspath("../")
sys.path.append(ROOT_DIR)

import torch

from BLIP.frameworks import BLIPHF
from BLIP.trt import BLIPTRT


## API usage

We have wrapped the process of importing models from PyTorch, exporting them to ONNX files, and building TRT engines into a single class. We introduce new `BLIPHF` and `BLIPTRT` classes that both expose `generate` as the main entry point to run BLIP. `BLIPTRT` will automatically perform all three steps according to the user inputs. Here is an example:


### Specify model arguments

You pick your favorite model and configuration, and TRT will run it for you! The main choices that you need to make are:
- `use_cache`: use the kv cache to speed up decoding
- `num_beams`: use beam search for better results
- `fp16`: use float16 to speed up decoding

In [None]:
# Keyword arguments unpacked into both the BLIPHF and BLIPTRT constructors below.
args = {
    "variant": "Salesforce/blip-image-captioning-base", # A HuggingFace model variant name. Required.
    "use_cache": True, # Use the decoder kv cache during generation. Default: True
    "fp16": True, # Use float16 to speed up decoding. Default: True
    "num_beams": 3, # Beam-search width used during generation. Default: 1
    "batch_size": 1, # Also read later in this notebook to repeat pixel_values. Default: 1
    "use_mask": False, # Default: False
    # Folder name. Required. All the PyTorch, ONNX and TRT Engines will be stored in the folder.
    "working_dir": "models",
    # Log level. NOTE(review): presumably True enables info-level logging — confirm against the wrapper classes.
    "info": True,
    # Benchmarking args.
    # NOTE(review): their exact semantics are defined by the wrapper classes and are not visible in this notebook.
    "iterations": 10,
    "number": 1,
    "warmup": 3,
    "duration": 0,
    "percentile": 50,
}


### Initialize the models
Calling the API is just this easy...

In [None]:
framework_model = BLIPHF(**args)
trt_model = BLIPTRT(**args)

### Try your photo!
Both `BLIPHF` and `BLIPTRT` expose `setup_tokenizer_and_model` and `generate`. If `setup_tokenizer_and_model` is not called prior to `generate`, it will be called first.

Load the image captioning dataset

In [None]:
from datasets import load_dataset 

dataset = load_dataset("lambdalabs/pokemon-blip-captions", split="train")

In [None]:
from transformers import AutoProcessor

# Load the processor for the same checkpoint the models are configured with,
# so changing args["variant"] cannot leave the preprocessing out of sync.
processor = AutoProcessor.from_pretrained(args["variant"])

In [None]:
# load image
example = dataset[3]
image = example["image"]
display(image)

In [None]:
 # prepare image for the model
device = "cuda" if torch.cuda.is_available() else "cpu"

inputs = processor(images=image, text="", return_tensors="pt").to(device)
pixel_values = inputs.pixel_values

#### Run BLIP HF model

In [None]:
framework_model.models = framework_model.setup_tokenizer_and_model()

In [None]:
# Always start from the processor output so that re-running this cell does not
# multiply the batch dimension again (the cell stays idempotent).
pixel_values = inputs.pixel_values
if args['batch_size'] > 1:
    # Repeat the image along dim 0 to build a batch of args['batch_size'] entries.
    pixel_values = pixel_values.repeat_interleave(args['batch_size'], 0)

In [None]:
hf_generated_caption = framework_model.generate(pixel_values=pixel_values, input="")

print(hf_generated_caption[0])
print(hf_generated_caption[1])

#### Run BLIP HF model - reference run with HF BLIP workflow (in fp32)

In [None]:
from transformers import BlipForConditionalGeneration

# Reference run with the plain HuggingFace workflow, using the same checkpoint
# as the wrapped models so the comparison stays valid if args["variant"] changes.
model = BlipForConditionalGeneration.from_pretrained(args["variant"])
model = model.to(device)

# Feed the processor's own tensors: `pixel_values` may have been repeated to
# args['batch_size'] rows, which would no longer match the single-row
# inputs.input_ids.
hf_fp32_generated_ids = model.generate(pixel_values=inputs.pixel_values, input_ids=inputs.input_ids)
hf_fp32_generated_caption = processor.batch_decode(hf_fp32_generated_ids, skip_special_tokens=True)[0]

print(hf_fp32_generated_ids)
print(hf_fp32_generated_caption)

#### Run BLIP TRT model

In [None]:
trt_model.models = trt_model.setup_tokenizer_and_model()

In [None]:
trt_generated_caption = trt_model.generate(pixel_values=pixel_values, input="")

print(trt_generated_caption[0])
print(trt_generated_caption[1])


### Evaluate

Image captioning models are typically evaluated with the [Rouge Score](https://huggingface.co/spaces/evaluate-metric/rouge) or [Word Error Rate](https://huggingface.co/spaces/evaluate-metric/wer). For this guide, you will use the Word Error Rate (WER). 

We use the 🤗 Evaluate library to do so. For potential limitations and other gotchas of the WER, refer to [this guide](https://huggingface.co/spaces/evaluate-metric/wer).

We will evaluate the WER between the framework model (pytorch/HF) and the TRT model.

In [None]:
!pip install evaluate jiwer -q

In [None]:
from evaluate import load

wer = load("wer")

In [None]:
print("hf_fp32_captions:", hf_fp32_generated_caption, "\nhf_generated_caption", hf_generated_caption[1][0], "\ntrt_generated_caption", trt_generated_caption[1][0])

In [None]:
wer_score = wer.compute(predictions=[trt_generated_caption[1][0]], references=[hf_generated_caption[1][0]])
print("wer_score", wer_score)


### Performance benchmark
You can see that TRT and PyTorch generate the same result, which is expected. To measure their performance, both `BLIPHF` and `BLIPTRT` expose `execute_inference`, `full_inference`, `encoder_inference` and `decoder_inference` to measure the inference time. Let's take a look at how our latest TRT performs.

In [None]:
from tabulate import tabulate

data = [
    ['full p50(s)', 'decoder p50(s)', 'encoder p50(s)'],
]

def format_result(result):
    """Format the runtime of every segment in a benchmark result.

    Args:
        result: Object whose ``runtime`` attribute is an iterable of segments,
            each exposing a numeric ``runtime`` attribute.

    Returns:
        A list of strings, one per segment and in the same order, with each
        runtime rendered to four decimal places.
    """
    return [f"{segment.runtime:.4f}" for segment in result.runtime]

In [None]:
framework_result = framework_model.execute_inference(pixel_values=pixel_values, input=[""])
data.append(format_result(framework_result))

In [None]:
trt_result = trt_model.execute_inference(pixel_values=pixel_values, input=[""])
data.append(format_result(trt_result))

In [None]:
print(tabulate(data, headers='firstrow', tablefmt='github'))

In [None]:
framework_result

In [None]:
trt_result

### Variable Input/Output Performance Benchmarking

We can run more tests by varying input/output length, while using the same engines.

Note that TensorRT performance depends on optimal selection of the kernels in the engine. The variable length test here uses the same engine built with max input/output length profile = `max_length` in the HuggingFace config to represent the best use of the model. If you want to change the length, please change this field prior to calling `setup_tokenizer_and_model`.

In [None]:
# (input_len, output_len) pairs to benchmark with the already-built engines.
input_output_len_list = [
    (2, 32),
    (16, 8),
    (32, 32),
    (64, 128),
]

data = [
    ['(input_len, output_len)', 'HF p50 (s)', 'TRT p50 (s)'],
]

# The loop below overrides the min/max output length on both models' configs.
# Remember the current values so they can be put back afterwards; otherwise
# re-running the earlier generate() cells would keep using the last benchmark
# length instead of the default generation settings.
benchmarked_models = (framework_model, trt_model)
length_fields = ("min_output_length", "max_output_length")
saved_lengths = [
    {field: getattr(m.config, field) for field in length_fields if hasattr(m.config, field)}
    for m in benchmarked_models
]

try:
    for (in_len, out_len) in input_output_len_list:
        # Random token ids of the requested input length, one row per batch entry.
        input_ids = torch.randint(0, framework_model.config.vocab_size, (framework_model.config.batch_size, in_len))

        # Setting min == max fixes the output length for benchmarking purposes only.
        total_len = in_len + out_len
        for m in benchmarked_models:
            m.config.min_output_length = total_len
            m.config.max_output_length = total_len

        _, framework_e2e = framework_model.full_inference(input_ids=input_ids, pixel_values=pixel_values)
        _, trt_e2e = trt_model.full_inference(input_ids=input_ids, pixel_values=pixel_values)

        data.append([(in_len, out_len), framework_e2e, trt_e2e])
finally:
    # Put the original generation-length settings back, even if a run failed.
    for m, saved in zip(benchmarked_models, saved_lengths):
        for field, value in saved.items():
            setattr(m.config, field, value)

print(tabulate(data, headers='firstrow', tablefmt='github'))

Did TensorRT's performance amaze you?

## Conclusion and where-to next?

Is this the end? The API sounds too simple. I am used to the previous version that walks me step by step, and/or I want to know more about the conversion process. Just follow the directory and you will find that the PyTorch model, ONNX files and TRT engines are there. Feel free to investigate them. We have wrapped the entire model conversion process in `setup_tokenizer_and_model`. The TensorRT inference engine can be conveniently used as a drop-in replacement for the original HuggingFace BLIP model while providing a significant speedup. If you are interested in further details of the conversion process, check out [BLIP](../BLIP) and [Vision2Seq/trt.py](../Vision2Seq/trt.py). You will find that all the Vision2Seq models can be treated in a similar way!