In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
!pip install onnx onnxruntime
# !python3 -m pip install mlc-ai -f https://mlc.ai/wheels
# !pip install numpy==1.25.2
!pip install onnxruntime-gpu
# !pip install apache-tvm
import onnx
import onnxruntime

import tvm
from tvm import relay
from tvm.contrib import graph_executor

from torch.utils.data import Dataset, DataLoader
from transformers import BertForQuestionAnswering, BertTokenizerFast
from torch.optim import AdamW
import time

In [None]:
# Display the filesystem location(s) the imported `tvm` package was loaded from.
tvm.__path__

In [None]:
# Load the sample questions and tokenize the first (question, context) pair
# to a fixed length of 128 tokens. The resulting tensors are reused by the
# benchmark cells below.
df = pd.read_csv("test_df_sample.csv")
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

question = df.iloc[0]["question"]
context = df.iloc[0]["contexts"]

base_inputs = tokenizer(
    question,
    context,
    truncation=True,
    padding="max_length",
    max_length=128,
    return_offsets_mapping=True,
    return_tensors="pt",
)

# Only these tensors are passed to the models; any other tokenizer output
# (such as the offset mapping requested above) is dropped.
valid_keys = {"input_ids", "attention_mask", "token_type_ids"}

inputs_cpu = {
    k: v.clone().detach().to("cpu")
    for k, v in base_inputs.items()
    if k in valid_keys
}

# Stays None when CUDA is unavailable. The previous version then called
# `.items()` on None and raised AttributeError, which made the whole
# notebook unusable on CPU-only machines.
inputs_gpu = (
    {
        k: v.clone().detach().to("cuda")
        for k, v in base_inputs.items()
        if k in valid_keys
    }
    if torch.cuda.is_available()
    else None
)

# NumPy copies for ONNX Runtime / TVM. Built from the CPU tensors so this
# works with or without a GPU; the values are identical either way.
np_inputs = {k: v.detach().cpu().numpy() for k, v in inputs_cpu.items()}

In [None]:
# Inspect the first row of the sample frame (the row the inputs above are built from).
df.iloc[0]

## Inference with PyTorch

In [None]:
def _avg_latency_ms(model, inputs, n_warmup=10, n_iters=100, sync=None):
    """Return the mean forward-pass latency of ``model`` in milliseconds.

    Args:
        model: Callable model invoked as ``model(**inputs)``.
        inputs: Mapping of keyword arguments forwarded to the model.
        n_warmup: Number of untimed forward passes run first.
        n_iters: Number of timed forward passes.
        sync: Optional zero-argument callable invoked right before the timer
            starts and right before it stops (e.g. ``torch.cuda.synchronize``
            so queued GPU work is included in the measurement).

    Returns:
        Average time per forward pass, in milliseconds.
    """
    with torch.no_grad():
        for _ in range(n_warmup):
            model(**inputs)
        if sync is not None:
            sync()
        # perf_counter is the high-resolution clock intended for measuring
        # short durations; time.time() is wall-clock time and may be adjusted.
        t0 = time.perf_counter()
        for _ in range(n_iters):
            model(**inputs)
        if sync is not None:
            sync()
        elapsed = time.perf_counter() - t0
    return elapsed * 1000 / n_iters


# NOTE(review): torch.load unpickles the checkpoint file; only load weights
# that come from a trusted source.
model_cpu = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
model_cpu.load_state_dict(torch.load("bert_qa_model_weights.pt", map_location="cpu"))
model_cpu.eval().to("cpu")

# Warm-up + timing on CPU
print(f"CPU avg time: {_avg_latency_ms(model_cpu, inputs_cpu):.2f} ms")

# Load model on GPU (if available)
if torch.cuda.is_available():
    model_gpu = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
    model_gpu.load_state_dict(
        torch.load("bert_qa_model_weights.pt", map_location="cuda")
    )
    model_gpu.eval().to("cuda")

    # Warm-up + timing on GPU, synchronizing around the timed region.
    gpu_ms = _avg_latency_ms(model_gpu, inputs_gpu, sync=torch.cuda.synchronize)
    print(f"GPU avg time: {gpu_ms:.2f} ms")
else:
    print("GPU not available.")

## Inference with ONNX Runtime

In [None]:
import onnxruntime as ort

# List the execution providers available in this onnxruntime installation.
print(ort.get_available_providers())

In [None]:
# Create an ONNX Runtime session on the CUDA provider when it is available,
# otherwise on the CPU provider.
# NOTE: "bert_qa.onnx" must already exist on disk; the cell that exports it
# appears further down in this notebook.
if "CUDAExecutionProvider" in ort.get_available_providers():
    ort_session = ort.InferenceSession(
        "bert_qa.onnx", providers=["CUDAExecutionProvider"]
    )
    print("Using GPU for inference (CUDAExecutionProvider).")
else:
    ort_session = ort.InferenceSession(
        "bert_qa.onnx", providers=["CPUExecutionProvider"]
    )
    print("Using CPU for inference.")

# The feed does not change between runs, so build it once outside the loops.
ort_feed = {
    name: np_inputs[name]
    for name in ("input_ids", "attention_mask", "token_type_ids")
}

# Untimed warm-up runs, matching the PyTorch benchmark, so one-time
# initialisation cost is not folded into the reported average.
for _ in range(10):
    ort_session.run(None, ort_feed)

# perf_counter is the clock intended for measuring short durations.
start = time.perf_counter()
for _ in range(100):
    ort_outputs = ort_session.run(None, ort_feed)
end = time.perf_counter()
print(f"ONNX avg time: {(end - start) * 1000 / 100:.2f} ms")

## Load Baseline Model

In [None]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

# Replace the pretrained weights with the ones stored in the local checkpoint.
model.load_state_dict(
    torch.load("bert_qa_model_weights.pt", map_location=device)
)
# Inference mode, moved to the chosen device (the model is the cell output).
model.eval().to(device)

In [None]:
# Take the question/context pair from the first row of the sample frame.
question, context = df.iloc[0]["question"], df.iloc[0]["contexts"]

# Names given to the graph inputs and outputs when exporting to ONNX.
input_names = ["input_ids", "attention_mask", "token_type_ids"]
output_names = ["start_logits", "end_logits"]

# Tokenize to a fixed length of 128 and move every returned tensor to `device`.
inputs = {
    key: tensor.to(device)
    for key, tensor in tokenizer(
        question,
        context,
        return_tensors="pt",
        return_offsets_mapping=True,
        max_length=128,
        padding="max_length",
        truncation=True,
    ).items()
}

##  Export BERT to ONNX

In [None]:
# Export the model to "bert_qa.onnx" (opset 14). Every named input and output
# declares axis 0 as "batch_size" and axis 1 as "seq_len" so both dimensions
# stay dynamic in the exported graph.
torch.onnx.export(
    model,
    tuple(inputs[name] for name in input_names),
    "bert_qa.onnx",
    opset_version=14,
    input_names=input_names,
    output_names=output_names,
    dynamic_axes={
        tensor_name: {0: "batch_size", 1: "seq_len"}
        for tensor_name in input_names + output_names
    },
)

###  Load the ONNX Model into TVM

In [None]:
# Static shape used when importing the graph into TVM.
batch_size, seq_length = 1, 128  # seq_length could be 384 for long QA

onnx_model = onnx.load("bert_qa.onnx")

# All three model inputs share the same (batch, sequence) shape.
input_shapes = {
    name: (batch_size, seq_length)
    for name in ("input_ids", "attention_mask", "token_type_ids")
}

In [None]:
# Import the ONNX graph into Relay using the fixed input shapes declared in `input_shapes`.
mod, params = relay.frontend.from_onnx(onnx_model, shape=input_shapes)

In [None]:
# Compile for CUDA when TVM reports a CUDA device, otherwise for LLVM.
if tvm.cuda().exist:
    target = "cuda"
else:
    target = "llvm"
dev = tvm.device(target, 0)

# Build the Relay module with optimization level 3.
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, params=params, target=target)

# Wrap the built library in a graph executor bound to `dev`.
module = graph_executor.GraphModule(lib["default"](dev))

In [None]:
# To try a custom example, override `question` / `context` here, e.g.
#   question = "Who was the president of the US in 2008?"
#   context = "Barack Obama was elected president of the US in 2008."

# Tokenize as NumPy arrays, padded/truncated to the compiled sequence length.
inputs = tokenizer(
    question,
    context,
    truncation=True,
    max_length=seq_length,
    padding="max_length",
    return_tensors="np",
)

# Bind each tokenizer output to the graph input with the same name.
for input_name in ("input_ids", "attention_mask", "token_type_ids"):
    module.set_input(input_name, inputs[input_name])

In [None]:
# Execute the compiled graph once, then fetch outputs 0 and 1 as NumPy arrays.
module.run()
start_logits, end_logits = (
    module.get_output(out_idx).numpy() for out_idx in range(2)
)

In [None]:
# Highest-scoring start and end positions (argmax over the flattened logits).
start, end = (logits.argmax() for logits in (start_logits, end_logits))

# Map the input ids back to tokens and join the selected span into text.
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
answer = tokenizer.convert_tokens_to_string(tokens[start : end + 1])
print("Answer:", answer)

In [None]:
# Save the compiled library to disk so a later cell can reload it with tvm.runtime.load_module.
lib.export_library("bert_qa_tvm.so")

## Inference with TVM

In [None]:
# target = tvm.target.Target("cuda -arch=sm_75")
# target = 'cuda -arch=sm_75'

In [None]:
# Select the same target/device pair used at build time.
if tvm.cuda().exist:
    target = "cuda"
else:
    target = "llvm"
dev = tvm.device(target, 0)

# Reload the previously exported library instead of rebuilding from ONNX.
loaded_lib = tvm.runtime.load_module("bert_qa_tvm.so")
module = graph_executor.GraphModule(loaded_lib["default"](dev))

# Feed the shared NumPy inputs, cast to int64, as TVM NDArrays.
for input_name in ("input_ids", "attention_mask", "token_type_ids"):
    module.set_input(
        input_name, tvm.nd.array(np_inputs[input_name].astype("int64"))
    )

In [None]:
# Untimed warm-up runs, matching the PyTorch benchmark, so one-time
# initialisation cost is not folded into the reported average.
for _ in range(10):
    module.run()
    module.get_output(0).numpy()
    module.get_output(1).numpy()

# perf_counter is the clock intended for measuring short durations.
start = time.perf_counter()
for _ in range(100):
    module.run()
    # Output retrieval stays inside the loop, so converting both outputs
    # to NumPy is part of the measured time.
    start_logits = module.get_output(0).numpy()
    end_logits = module.get_output(1).numpy()
end = time.perf_counter()
print(f"TVM avg time: {(end - start) * 1000 / 100:.2f} ms")

In [None]:
# Whether TVM reports a CUDA device; this flag selects "cuda" vs "llvm" as the target above.
tvm.cuda().exist

In [None]:
import torch

# Cross-check: does PyTorch see a CUDA device in this runtime?
print(torch.cuda.is_available())

In [None]:
%cd /content
!git clone --recursive https://github.com/apache/tvm tvm
%cd tvm
!mkdir build
!cp cmake/config.cmake build/

In [None]:
!sed -i 's/USE_LLVM OFF/USE_LLVM ON/' build/config.cmake
!sed -i 's/USE_CUDA OFF/USE_CUDA ON/' build/config.cmake

In [None]:
# Configure and compile TVM from inside build/, which holds the edited config.cmake.
%cd build
!cmake ..
# -j4: run up to four make jobs in parallel.
!make -j4

In [None]:
# Install the Python package from the source tree's python/ directory in editable mode.
%cd ../python
!pip install -e .

In [None]:
# Sanity check after installation: import TVM and report its version and
# whether it reports a CUDA device.
import tvm
from tvm import relay

print("TVM version:", tvm.__version__)
print("CUDA available:", tvm.cuda().exist)

In [None]:
# List the contents of the build directory to inspect what the build produced.
!ls /content/tvm/build

In [None]:
import os
import sys

# Locations of the TVM checkout and its Python package.
TVM_HOME = "/content/tvm"
TVM_PYTHON = "/content/tvm/python"

os.environ["TVM_HOME"] = TVM_HOME

# Prepend the TVM package to PYTHONPATH for child processes (e.g. `!python`)
# while keeping any entries that were already set instead of overwriting them.
_existing_paths = [
    p for p in os.environ.get("PYTHONPATH", "").split(os.pathsep) if p
]
if TVM_PYTHON not in _existing_paths:
    _existing_paths.insert(0, TVM_PYTHON)
os.environ["PYTHONPATH"] = os.pathsep.join(_existing_paths)

# PYTHONPATH is only read when an interpreter starts, so changing it here
# does not affect the running kernel. Extend sys.path as well so
# `import tvm` resolves in this session.
if TVM_PYTHON not in sys.path:
    sys.path.insert(0, TVM_PYTHON)