In [1]:
import onnx
import torch
import numpy as np
import tensorrt as trt
from transformers import AutoModel, AutoTokenizer
import logging

# Quieten every logger already registered under a name containing "transformers"
# so that model loading below does not flood the cell output.
for registered_name in list(logging.root.manager.loggerDict):
    registered_logger = logging.getLogger(registered_name)
    if "transformers" in registered_logger.name.lower():
        registered_logger.setLevel(logging.ERROR)

# Load the pre-trained BERT encoder and its matching tokenizer.
model_id = "bert-base-uncased"
model = AutoModel.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [2]:
import torch

# Dummy batch used to smoke-test the model here and as example input for the
# ONNX export in the next cell.
batch_size, seq_len = 8, 128
input_ids = torch.randint(0, len(tokenizer), (batch_size, seq_len))

# torch.randint's upper bound is exclusive, so randint(0, 1, ...) can only ever
# produce zeros. An all-zero attention mask tells the model to ignore every
# token, so build the mask explicitly as all ones (attend everywhere) and the
# segment ids explicitly as all zeros (single segment).
attention_mask = torch.ones((batch_size, seq_len), dtype=torch.long)
token_type_ids = torch.zeros((batch_size, seq_len), dtype=torch.long)

# Inference mode: disable dropout and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    _ = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

In [3]:
# Export the encoder to ONNX with a dynamic batch dimension (axis 0).
#
# Only `input_ids` and `attention_mask` are supplied as example inputs, so only
# those two names are declared. Declaring a third name ('token_type_ids')
# without passing a matching tensor leaves the declared interface out of step
# with the exported graph; the TensorRT build of this file reports 2 inputs.
# NOTE(review): if segment ids are needed at inference time, pass
# `token_type_ids` here AND bind it in the engine-execution code.
torch.onnx.export(
    model,
    (input_ids, attention_mask),
    "weights/model.onnx",
    input_names=['input_ids', 'attention_mask'],
    output_names=['last_hidden_state', 'pooler_output'],
    dynamic_axes={
        'input_ids': {0: 'batch_size'},
        'attention_mask': {0: 'batch_size'},
        'last_hidden_state': {0: 'batch_size'},
        'pooler_output': {0: 'batch_size'},
    },
    opset_version=17
)

In [4]:
# TensorRT logger/builder used for the whole engine build.
logger = trt.Logger(min_severity=trt.Logger.INFO)
builder = trt.Builder(logger)

# Populate an explicit-batch network definition from the exported ONNX file.
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

# parse_from_file signals failure through its boolean return value, not an
# exception; surface the parser's own diagnostics instead of reporting success
# unconditionally.
if not parser.parse_from_file("weights/model.onnx"):
    parse_errors = [parser.get_error(i).desc() for i in range(parser.num_errors)]
    raise RuntimeError(
        "Failed to parse weights/model.onnx:\n" + "\n".join(parse_errors)
    )
print("Model parsed successfully.....")

[08/22/2024-08:58:09] [TRT] [I] [MemUsageChange] Init CUDA: CPU +18, GPU +0, now: CPU 165, GPU 1021 (MiB)
[08/22/2024-08:58:12] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +2088, GPU +386, now: CPU 2408, GPU 1407 (MiB)
[08/22/2024-08:58:13] [TRT] [I] ----------------------------------------------------------------
[08/22/2024-08:58:13] [TRT] [I] Input filename:   weights/model.onnx
[08/22/2024-08:58:13] [TRT] [I] ONNX IR version:  0.0.8
[08/22/2024-08:58:13] [TRT] [I] Opset version:    17
[08/22/2024-08:58:13] [TRT] [I] Producer name:    pytorch
[08/22/2024-08:58:13] [TRT] [I] Producer version: 2.2.0
[08/22/2024-08:58:13] [TRT] [I] Domain:           
[08/22/2024-08:58:13] [TRT] [I] Model version:    0
[08/22/2024-08:58:13] [TRT] [I] Doc string:       
[08/22/2024-08:58:13] [TRT] [I] ----------------------------------------------------------------
[08/22/2024-08:58:13] [TRT] [W] ModelImporter.cpp:420: Make sure input input_ids has Int64 binding.
[08/22/2024-08:58:13] [TR

In [5]:
config = builder.create_builder_config()
profile = builder.create_optimization_profile()

# The engine is specialised for one fixed shape: batch 1, sequence length 128.
min_shape = (1, 128)
opt_shape = (1, 128)
max_shape = (1, 128)

# Register the shape range for every input the parsed network actually exposes,
# rather than hard-coding names that may not exist in the graph (the build log
# reports only 2 inputs, so 'token_type_ids' is not one of them).
# Assumes every network input shares the (batch, sequence) shape - TODO confirm
# if further inputs are ever added to the export.
for input_index in range(network.num_inputs):
    profile.set_shape(network.get_input(input_index).name,
                      min_shape, opt_shape, max_shape)

# Allow FP16 kernels where TensorRT considers them beneficial.
config.set_flag(trt.BuilderFlag.FP16)
config.add_optimization_profile(profile)

0

In [6]:
# Build the serialized engine (plan) from the parsed network and builder config.
engine = builder.build_serialized_network(network, config)

# A failed build returns None rather than raising; check before opening the
# output file so an existing engine is not truncated by a failed rebuild.
if engine is None:
    raise RuntimeError(
        "TensorRT engine build failed; see the [TRT] log messages for details."
    )

with open("weights/model.trt", "wb") as f:
    f.write(engine)

[08/22/2024-08:58:21] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.
[08/22/2024-08:58:48] [TRT] [I] Detected 2 inputs and 2 output network tensors.
[08/22/2024-08:58:48] [TRT] [I] Total Host Persistent Memory: 32
[08/22/2024-08:58:48] [TRT] [I] Total Device Persistent Memory: 0
[08/22/2024-08:58:48] [TRT] [I] Total Scratch Memory: 5177344
[08/22/2024-08:58:48] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 1 steps to complete.
[08/22/2024-08:58:48] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 0.017443ms to assign 1 blocks to 1 nodes requiring 5177344 bytes.
[08/22/2024-08:58:48] [TRT] [I] Total Activation Memory: 5177344
[08/22/2024-08:58:48] [TRT] [I] Total Weights Memory: 437930368
[08/22/2024-08:58:48] [TRT] [I] Engine generation completed in 27.1043 seconds.
[08/22/2024-08:58:48] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 208 MiB, GPU 423 MiB
[08/22/2024-

In [1]:
import onnx
import time
import torch
import numpy as np
import tensorrt as trt
from tqdm.notebook import tqdm
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer

import pycuda.driver as cuda
import pycuda.autoinit

In [2]:
# Sample dataset
prompt_dataset = load_dataset("fka/awesome-chatgpt-prompts")
prompt_dataset

DatasetDict({
    train: Dataset({
        features: ['act', 'prompt'],
        num_rows: 153
    })
})

In [3]:
# TensorRT logger at INFO severity; handed to trt.Runtime when the engine is deserialized.
logger = trt.Logger(trt.Logger.INFO)

In [4]:
# Tokenizer for the same checkpoint id the engine is benchmarked against;
# only the tokenizer is loaded in this cell, not the PyTorch model.
model_id = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [5]:
# Benchmark the serialized TensorRT engine on every prompt (batch 1, 128 tokens).
# The measured time per prompt covers tokenization, host->device copies,
# engine execution and device->host copies of both outputs.
with open("weights/model.trt", "rb") as f, \
    trt.Runtime(logger) as runtime, \
    runtime.deserialize_cuda_engine(f.read()) as engine, \
    engine.create_execution_context() as context:

    input_shape = (1, 128)

    # Create the stream
    stream = cuda.Stream()

    # Set the shape (must happen before output shapes are queried)
    context.set_input_shape('input_ids', input_shape)
    context.set_input_shape('attention_mask', input_shape)

    # Page-locked host staging buffers for the inputs, allocated once.
    # The element type is read from the engine instead of being assumed: the
    # ONNX parser warned that input_ids needs an Int64 binding, so a hard-coded
    # int32 buffer may not match what the engine actually reads.
    h_input_ids = cuda.pagelocked_empty(
        input_shape, dtype=trt.nptype(engine.get_tensor_dtype('input_ids')))
    h_attention_mask = cuda.pagelocked_empty(
        input_shape, dtype=trt.nptype(engine.get_tensor_dtype('attention_mask')))

    # Allocate device memory sized exactly to the host buffers
    d_input_ids = cuda.mem_alloc(h_input_ids.nbytes)
    d_attention_mask = cuda.mem_alloc(h_attention_mask.nbytes)

    # Output buffers, looked up by tensor name rather than by positional index
    h_pooler = cuda.pagelocked_empty(
        tuple(context.get_tensor_shape('pooler_output')),
        dtype=trt.nptype(engine.get_tensor_dtype('pooler_output')))
    d_pooler = cuda.mem_alloc(h_pooler.nbytes)

    h_hidden = cuda.pagelocked_empty(
        tuple(context.get_tensor_shape('last_hidden_state')),
        dtype=trt.nptype(engine.get_tensor_dtype('last_hidden_state')))
    d_hidden = cuda.mem_alloc(h_hidden.nbytes)

    # The device buffers never change, so bind their addresses once.
    context.set_tensor_address("input_ids", int(d_input_ids))
    context.set_tensor_address("attention_mask", int(d_attention_mask))
    context.set_tensor_address("pooler_output", int(d_pooler))
    context.set_tensor_address("last_hidden_state", int(d_hidden))

    prompts = prompt_dataset["train"]["prompt"]

    # Loop and accumulate the total time (ms)
    total = 0
    for text in tqdm(prompts, total=len(prompts)):
        # Start
        start = time.perf_counter()

        # Tokenize the text; truncation keeps every sample at exactly 128
        # tokens so it always fits the fixed engine shape and buffers.
        inputs = tokenizer(text, return_tensors="np", padding="max_length",
                           truncation=True, max_length=128)

        # Stage into the pinned buffers, casting to the engine's dtype.
        # The mask comes from the tokenizer's attention_mask, not from input_ids.
        np.copyto(h_input_ids, inputs['input_ids'], casting='same_kind')
        np.copyto(h_attention_mask, inputs['attention_mask'], casting='same_kind')

        # Copy to device on the stream
        cuda.memcpy_htod_async(d_input_ids, h_input_ids, stream)
        cuda.memcpy_htod_async(d_attention_mask, h_attention_mask, stream)

        # Run the engine via the stream
        if not context.execute_async_v3(stream_handle=stream.handle):
            raise RuntimeError("TensorRT failed to enqueue inference")

        # Fetch both outputs, then wait once for everything queued on the stream
        cuda.memcpy_dtoh_async(h_pooler, d_pooler, stream)
        cuda.memcpy_dtoh_async(h_hidden, d_hidden, stream)
        stream.synchronize()

        end = time.perf_counter()
        total += (end - start) * 1000

print(f"Avergae time (ms) : {total / len(prompts)}")

[08/22/2024-10:55:32] [TRT] [I] Loaded engine size: 418 MiB
[08/22/2024-10:55:32] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +5, now: CPU 0, GPU 422 (MiB)


  0%|          | 0/153 [00:00<?, ?it/s]

Avergae time (ms) : 5.426266614128561


In [6]:
import torch
import onnx
import time
import torch
import numpy as np
import tensorrt as trt
from tqdm.notebook import tqdm
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer


import logging
def silence_loggers(name_fragment, level=logging.ERROR):
    """Set `level` on every registered logger whose name contains `name_fragment`.

    The match is case-insensitive. The work is done inside a function so that
    no loop variable leaks into the notebook namespace: a module-level
    `for logger in ...` loop would overwrite the TensorRT `logger` object that
    the engine-loading cell relies on.

    Args:
        name_fragment: Substring to look for in registered logger names.
        level: Logging level to apply to each match (default: logging.ERROR).
    """
    fragment = name_fragment.lower()
    # Snapshot the names first; getLogger() may touch the registry while we iterate.
    for registered_name in list(logging.root.manager.loggerDict):
        if fragment in registered_name.lower():
            logging.getLogger(registered_name).setLevel(level)


silence_loggers("transformers")

In [7]:
# Sample dataset
prompt_dataset = load_dataset("fka/awesome-chatgpt-prompts")
prompt_dataset

DatasetDict({
    train: Dataset({
        features: ['act', 'prompt'],
        num_rows: 153
    })
})

In [8]:
# Load pre-trained BERT model
model_id = "bert-base-uncased"
model = AutoModel.from_pretrained(model_id).cuda()
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.eval()

total = 0
for text in tqdm(prompt_dataset["train"]["prompt"], total=len(prompt_dataset["train"]["prompt"])):
    # Start
    start = time.time()
    
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", max_length=128)
    inputs = {k: v.cuda() for k, v in inputs.items()}

    # Get the inference
    with torch.no_grad():
        _ = model(**inputs)
    
    end = time.time()
    total += (end - start) * 1000
    
print(f"Avergae time (ms) : {total / len(prompt_dataset['train']['prompt'])}")

  0%|          | 0/153 [00:00<?, ?it/s]

Avergae time (ms) : 9.301696727478426
