# Evaluation mode


## Load the model and tokenizer


In [1]:
from transformers import AutoModel, AutoTokenizer

transformer = AutoModel.from_pretrained(
    "sentence-transformers/multi-qa-mpnet-base-cos-v1"
)
tokenizer = AutoTokenizer.from_pretrained(
    "sentence-transformers/multi-qa-mpnet-base-cos-v1"
)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

## Create a sample input text and tokenize it


In [2]:
text = """
Pope John Paul II[b] (born Karol Józef Wojtyła;[c] 18 May 1920 – 2 April 2005) was head of the Catholic Church and sovereign of the Vatican City from 16 October 1978 until his death in 2005. He was the first non-Italian pope since Adrian VI in the 16th century, as well as the third-longest-serving pope in history, after Pius IX and St. Peter.[d]

In his youth, Wojtyła dabbled in stage acting. He graduated with excellent grades from an all-boys high school in Wadowice, Poland, in 1938, soon after which World War II broke out. During the war, to avoid being kidnapped and sent to a German forced labour camp, he signed up for work in harsh conditions in a quarry. Wojtyła eventually took up acting and developed a love for the profession and participated at a local theatre. The linguistically skilled Wojtyła wanted to study Polish at university. Encouraged by a conversation with Adam Stefan Sapieha, he decided to study theology and become a priest. Eventually, Wojtyła rose to the position of Archbishop of Kraków and then a cardinal, both positions held by his mentor. Wojtyła was elected pope on the third day of the October 1978 conclave, becoming one of the youngest popes in history. The conclave was called after the death of John Paul I, who served only 33 days as pope. Wojtyła adopted the name of his predecessor in tribute to him.
"""
tokenized_text = tokenizer(
    text=text, padding=True, truncation=True, return_tensors="pt"
)

In [3]:
tokenized_text

{'input_ids': tensor([[    0,  4835,  2202,  2707,  2466,  1035,  1042,  1037,  1010,  2145,
         10560, 13157, 23262, 24189,  3505,  3727, 22976,  1029,  1035,  1043,
          1037,  2328,  2093,  4448,  1520,  1020,  2262,  2388,  1011,  2005,
          2136,  2001,  2000,  3238,  2281,  2002, 11078,  2001,  2000, 12115,
          2107,  2017,  2389,  2259,  3305,  2131,  2014,  2335,  2003,  2388,
          1016,  2006,  2005,  2000,  2038,  2516,  1015,  3063,  4835,  2148,
          7922,  6823,  2003,  2000,  5771,  2305,  1014,  2008,  2096,  2008,
          2000,  2357,  1015,  6497,  1015,  3533,  4835,  2003,  2385,  1014,
          2048, 14367, 11818,  2002,  2362,  1016,  2852,  1016,  1035,  1044,
          1037,  2003,  2014,  3364,  1014, 24189,  3505,  3727, 22976,  4834,
         12824,  2003,  2758,  3776,  1016,  2006,  3856,  2011,  6585,  7026,
          2017,  2023,  2039,  1015,  3341,  2156,  2086,  2003, 11337,  3531,
         23429,  1014,  3739,  1014,  

## Measure the inference time of the model in various inference modes


In [4]:
import time

import torch

device = "cpu"
transformer.to(device)

num_runs = 100

start = time.time()
for _ in range(num_runs):
    outputs = transformer(**tokenized_text)
time_no_opt = (time.time() - start) / num_runs

transformer.eval()
start = time.time()
for _ in range(num_runs):
    outputs = transformer(**tokenized_text)
time_eval = (time.time() - start) / num_runs

transformer.eval()
start = time.time()
for _ in range(num_runs):
    with torch.no_grad():
        outputs = transformer(**tokenized_text)
time_eval_no_grad = (time.time() - start) / num_runs

transformer.eval()
start = time.time()
for _ in range(num_runs):
    with torch.inference_mode():
        outputs = transformer(**tokenized_text)
time_eval_inference = (time.time() - start) / num_runs

print("=" * 60)
print("INFERENCE TIME COMPARISON (average over 100 runs)")
print("=" * 60)
print(f"1. No optimizations:           {time_no_opt * 1000:.4f} ms")
print(f"2. transformer.eval():               {time_eval * 1000:.4f} ms")
print(f"3. transformer.eval() + no_grad():   {time_eval_no_grad * 1000:.4f} ms")
print(f"4. transformer.eval() + inference:   {time_eval_inference * 1000:.4f} ms")
print("=" * 60)
print(f"Speedup (no_grad vs baseline):      {time_no_opt / time_eval_no_grad:.2f}x")
print(f"Speedup (inference vs baseline):    {time_no_opt / time_eval_inference:.2f}x")
print(
    f"Speedup (inference vs no_grad):     {time_eval_no_grad / time_eval_inference:.2f}x"
)

INFERENCE TIME COMPARISON (average over 100 runs)
1. No optimizations:           884.1026 ms
2. transformer.eval():               889.8580 ms
3. transformer.eval() + no_grad():   868.4453 ms
4. transformer.eval() + inference:   866.7855 ms
Speedup (no_grad vs baseline):      1.02x
Speedup (inference vs baseline):    1.02x
Speedup (inference vs no_grad):     1.00x


# PyTorch model compilation


## Compile the model using `torch.compile()`


In [5]:
start_time = time.time()
transformer.eval()
transformer.compile()
transformer(**tokenized_text)
compilation_plus_warm_up_time = time.time() - start_time

## Measure the inference time, Calculate the speedup


In [6]:
start = time.time()
for _ in range(num_runs):
    outputs = transformer(**tokenized_text)
time_compiled = (time.time() - start) / num_runs


print("=" * 60)
print("COMPILED INFERENCE TIME COMPARISON (average over 100 runs)")
print("=" * 60)
print(f"Compilation + warm_up:           {compilation_plus_warm_up_time:.2f} s")
print("=" * 60)
print(f"Speedup (compiled vs eval):      {time_eval / time_compiled:.2f}x")
print(f"Speedup (compiled vs no_grad):      {time_eval_no_grad / time_compiled:.2f}x")
print(f"Speedup (compiled vs inference):    {time_eval_inference / time_compiled:.2f}x")

COMPILED INFERENCE TIME COMPARISON (average over 100 runs)
Compilation + warm_up:           41.34 s
Speedup (compiled vs eval):      0.80x
Speedup (compiled vs no_grad):      0.78x
Speedup (compiled vs inference):    0.78x


# Quantization


## Quantize the model


In [7]:
torch.backends.quantized.engine = "qnnpack"
model_quantized = torch.ao.quantization.quantize_dynamic(
    model=transformer, dtype=torch.qint8
)

For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  model_quantized = torch.ao.quantization.quantize_dynamic(


In [8]:
print(model_quantized)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (k): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (v): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (o): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (dropout): Dropout(p=0.1, inplace=F

## Save both models to disk


In [9]:
from pathlib import Path

models_directory = Path("models")
models_directory.mkdir(parents=True, exist_ok=True)
torch.save(transformer.state_dict(), f=f"{models_directory}/orig_model.pt")
torch.save(model_quantized.state_dict(), f=f"{models_directory}/quantized_model.pt")

In [10]:
import os

orig_model_size = os.path.getsize(f"{models_directory}/orig_model.pt")
quantized_model_size = os.path.getsize(f"{models_directory}/quantized_model.pt")

print(f"Size of the original model: {orig_model_size / 1024 / 1024:.2f} MB")
print(f"Size of quantized model: {quantized_model_size / 1024 / 1024:.2f} MB")


Size of the original model: 417.73 MB
Size of quantized model: 173.10 MB


## Compare the inference speed and speedup on CPU


In [11]:
start_time = time.time()
model_quantized.eval()
model_quantized.compile()
model_quantized(**tokenized_text)
quantized_compilation_plus_warm_up_time = time.time() - start_time

In [None]:
start = time.time()
for _ in range(num_runs):
    outputs = model_quantized(**tokenized_text)
quantized_time_compiled = (time.time() - start) / num_runs


print("=" * 60)
print("ORIGINAL MODEL COMPILED INFERENCE TIME COMPARISON")
print("=" * 60)
print(f"Compilation + warm_up:           {compilation_plus_warm_up_time:.2f} s")
print("=" * 60)
print(f"Speedup (compiled vs eval):      {time_eval / time_compiled:.2f}x")
print(f"Speedup (compiled vs no_grad):      {time_eval_no_grad / time_compiled:.2f}x")
print(
    f"Speedup (compiled vs inference):    {time_eval_inference / time_compiled:.2f}x\n"
)

print("=" * 60)
print("QUANTIZED MODEL COMPILED INFERENCE TIME COMPARISON")
print("=" * 60)
print(f"Compilation + warm_up:           {compilation_plus_warm_up_time:.2f} s")
print("=" * 60)
print(f"Speedup (compiled vs eval):      {time_eval / quantized_time_compiled:.2f}x")
print(
    f"Speedup (compiled vs no_grad):      {time_eval_no_grad / quantized_time_compiled:.2f}x"
)
print(
    f"Speedup (compiled vs inference):    {time_eval_inference / quantized_time_compiled:.2f}x"
)

ORIGINAL MODEL COMPILED INFERENCE TIME COMPARISON
Compilation + warm_up:           41.34 s
Speedup (compiled vs eval):      0.80x
Speedup (compiled vs no_grad):      0.78x
Speedup (compiled vs inference):    0.78x

QUANTIZED MODEL COMPILED INFERENCE TIME COMPARISON
Compilation + warm_up:           41.34 s
Speedup (compiled vs eval):      0.38x
Speedup (compiled vs no_grad):      0.37x
Speedup (compiled vs inference):    0.37x


In [13]:
start = time.time()
for _ in range(num_runs):
    outputs = model_quantized(**tokenized_text)
quantized_time_no_opt = (time.time() - start) / num_runs

transformer.eval()
start = time.time()
for _ in range(num_runs):
    outputs = model_quantized(**tokenized_text)
quantized_time_eval = (time.time() - start) / num_runs

transformer.eval()
start = time.time()
for _ in range(num_runs):
    with torch.no_grad():
        outputs = model_quantized(**tokenized_text)
quantized_time_eval_no_grad = (time.time() - start) / num_runs

transformer.eval()
start = time.time()
for _ in range(num_runs):
    with torch.inference_mode():
        outputs = model_quantized(**tokenized_text)
quantized_time_eval_inference = (time.time() - start) / num_runs

In [None]:
print("=" * 60)
print("SPEEDUP: BASE MODEL vs QUANTIZED MODEL")
print("=" * 60)
print(f"1. No optimizations:           {time_no_opt / quantized_time_no_opt:.2f}x")
print(f"2. eval():                     {time_eval / quantized_time_eval:.2f}x")
print(
    f"3. eval() + no_grad():         {time_eval_no_grad / quantized_time_eval_no_grad:.2f}x"
)
print(
    f"4. eval() + inference_mode():  {time_eval_inference / quantized_time_eval_inference:.2f}x"
)

print("\n" + "=" * 60)
print("SPEEDUP: BASE MODEL vs QUANTIZED + COMPILED MODEL")
print("=" * 60)
print(f"Compilation + warm_up time:    {compilation_plus_warm_up_time:.4f} s")
print(f"vs Base no opt:                {time_no_opt / quantized_time_compiled:.2f}x")
print(f"vs Base eval():                {time_eval / quantized_time_compiled:.2f}x")
print(
    f"vs Base no_grad():             {time_eval_no_grad / quantized_time_compiled:.2f}x"
)
print(
    f"vs Base inference_mode():      {time_eval_inference / quantized_time_compiled:.2f}x"
)

print("\n" + "=" * 60)
print("BEST CONFIGURATION COMPARISON")
print("=" * 60)
print(f"Base (best):                   {time_eval_inference * 1000:.4f} ms")
print(f"Quantized (best):              {quantized_time_eval_inference * 1000:.4f} ms")
print(f"Quantized + Compiled:          {quantized_time_compiled * 1000:.4f} ms")
print(
    f"\nQuantization speedup:          {time_eval_inference / quantized_time_eval_inference:.2f}x"
)
print(
    f"Quantization + Compilation:    {time_eval_inference / quantized_time_compiled:.2f}x"
)

SPEEDUP: BASE MODEL vs QUANTIZED MODEL
1. No optimizations:           0.38x
2. eval():                     0.38x
3. eval() + no_grad():         0.36x
4. eval() + inference_mode():  0.37x

SPEEDUP: BASE MODEL vs QUANTIZED + COMPILED MODEL
Compilation + warm_up time:    41.3387 s
vs Base no opt:                0.38x
vs Base eval():                0.38x
vs Base no_grad():             0.37x
vs Base inference_mode():      0.37x

BEST CONFIGURATION COMPARISON
Base (best):                   866.7855 ms
Quantized (best):              2337.9440 ms
Quantized + Compiled:          2340.1477 ms

Quantization speedup:          0.37x
Quantization + Compilation:    0.37x


Quantization is not beneficial for CPU inference on this model size

# GPU optimization strategies


## Compare inference time of:

- torch.compile() with default settings
- torch.compile() with mode="max-autotune"
- torch.compile() with mode="max-autotune-no-cudagraphs"


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

text_100 = text[:100]
text_500 = text[:500]

text_sizes = {"100 chars": text_100, "500 chars": text_500, "1350 chars": text}


def compare_models(text: str, text_size: str):
    print("=" * 60)
    print(f"COMPARING MODELS INPUT TEXT SIZE OF: {text_size}")
    print("=" * 60)
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    inputs = {k: v.pin_memory() for k, v in inputs.items()}

    model_compiled_default = torch.compile(transformer)
    model_compiled_default.eval()
    with torch.inference_mode():
        for _ in range(10):
            _ = model_compiled_default(**inputs)

    start = time.time()
    with torch.inference_mode():
        for _ in range(num_runs):
            _ = model_compiled_default(**inputs)
    time_compiled_default = (time.time() - start) / num_runs

    model_compiled_max_autotune = torch.compile(transformer, mode="max-autotune")
    model_compiled_max_autotune.eval()
    with torch.inference_mode():
        for _ in range(10):
            _ = model_compiled_max_autotune(**inputs)

    start = time.time()
    with torch.inference_mode():
        for _ in range(num_runs):
            _ = model_compiled_max_autotune(**inputs)
    time_compiled_max_autotune = (time.time() - start) / num_runs

    model_compiled_no_cudagraphs = torch.compile(
        transformer, mode="max-autotune-no-cudagraphs"
    )
    model_compiled_no_cudagraphs.eval()
    with torch.inference_mode():
        for _ in range(10):
            _ = model_compiled_no_cudagraphs(**inputs)

    start = time.time()
    with torch.inference_mode():
        for _ in range(num_runs):
            _ = model_compiled_no_cudagraphs(**inputs)
    time_compiled_no_cudagraphs = (time.time() - start) / num_runs

    print("\nTORCH.COMPILE() MODE COMPARISON")
    print("=" * 60)
    print(f"Default mode:                  {time_compiled_default * 1000:.4f} ms")
    print(f"max-autotune:                  {time_compiled_max_autotune * 1000:.4f} ms")
    print(f"max-autotune-no-cudagraphs:    {time_compiled_no_cudagraphs * 1000:.4f} ms")
    print("=" * 60)
    print(
        f"Speedup (max-autotune vs default):        {time_compiled_default / time_compiled_max_autotune:.2f}x"
    )
    print(
        f"Speedup (no-cudagraphs vs default):       {time_compiled_default / time_compiled_no_cudagraphs:.2f}x"
    )
    print(
        f"Speedup (max-autotune vs no-cudagraphs):  {time_compiled_no_cudagraphs / time_compiled_max_autotune:.2f}x"
    )

In [16]:
for text_name, text_chunk in text_sizes.items():
    compare_models(text_chunk, text_name)

COMPARING MODELS INPUT TEXT SIZE OF: 100 chars

TORCH.COMPILE() MODE COMPARISON
Default mode:                  129.1312 ms
max-autotune:                  130.1898 ms
max-autotune-no-cudagraphs:    137.9546 ms
Speedup (max-autotune vs default):        0.99x
Speedup (no-cudagraphs vs default):       0.94x
Speedup (max-autotune vs no-cudagraphs):  1.06x
COMPARING MODELS INPUT TEXT SIZE OF: 500 chars

TORCH.COMPILE() MODE COMPARISON
Default mode:                  406.4013 ms
max-autotune:                  406.9343 ms
max-autotune-no-cudagraphs:    416.1094 ms
Speedup (max-autotune vs default):        1.00x
Speedup (no-cudagraphs vs default):       0.98x
Speedup (max-autotune vs no-cudagraphs):  1.02x
COMPARING MODELS INPUT TEXT SIZE OF: 1350 chars

TORCH.COMPILE() MODE COMPARISON
Default mode:                  1089.6288 ms
max-autotune:                  1099.9176 ms
max-autotune-no-cudagraphs:    1096.8534 ms
Speedup (max-autotune vs default):        0.99x
Speedup (no-cudagraphs vs default

The advanced compilation modes provide no meaningful performance improvement - they're essentially the same as default mode or slightly slower

# Changing numerical precision


## Check if your GPU supports Tensor Cores (capability >= (7,0))


In [17]:
import torch

capability = torch.cuda.get_device_capability()
print(f"CUDA device capability: {capability}")

# Tensor Cores are available on NVidia GPUs with CUDA >= 7 (e.g. Volta, Turing, Ampere, Hopper)
if capability >= (7, 0):
    print("Tensor Cores available: fast float16 supported.")
else:
    print("Tensor Cores not available: float16 may be slow or unsupported.")

CUDA device capability: (7, 5)
Tensor Cores available: fast float16 supported.


## Measure inference time with:

- full precision (float32)
- manual half-precision (float16)
- automatic mixed precision (torch.autocast)


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Running on device: {device}")

print("\nTesting full precision (float32)...")
transformer_fp32 = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-cos-v1")  
transformer_fp32 = transformer_fp32.to(device).float()
transformer_fp32.eval()

tokenized_text_fp32 = {k: v.to(device) for k, v in tokenized_text.items()}

with torch.inference_mode():
    for _ in range(10):
        _ = transformer_fp32(**tokenized_text_fp32)
    
    start = time.time()
    for _ in range(num_runs):
        _ = transformer_fp32(**tokenized_text_fp32)
    time_float32 = (time.time() - start) / num_runs

print(f"  Time: {time_float32*1000:.4f} ms")

print("\nTesting manual half-precision (float16)...")
transformer_fp16 = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-cos-v1")  
transformer_fp16 = transformer_fp16.to(device).half()
transformer_fp16.eval()

tokenized_text_fp16 = {k: v.to(device) for k, v in tokenized_text.items()}

with torch.inference_mode():
    for _ in range(10):
        _ = transformer_fp16(**tokenized_text_fp16)
    
    start = time.time()
    for _ in range(num_runs):
        _ = transformer_fp16(**tokenized_text_fp16)
    time_float16 = (time.time() - start) / num_runs

print(f"  Time: {time_float16*1000:.4f} ms")

print("\nTesting automatic mixed precision (autocast)...")
transformer_autocast = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-cos-v1")
transformer_autocast = transformer_autocast.to(device).float()
transformer_autocast.eval()

tokenized_text_autocast = {k: v.to(device) for k, v in tokenized_text.items()}

with torch.inference_mode():
    for _ in range(10):
        with torch.autocast(device_type=device, dtype=torch.float16):
            _ = transformer_autocast(**tokenized_text_autocast)
    
    start = time.time()
    for _ in range(num_runs):
        with torch.autocast(device_type=device, dtype=torch.float16):
            _ = transformer_autocast(**tokenized_text_autocast)
    time_autocast = (time.time() - start) / num_runs

print(f"  Time: {time_autocast*1000:.4f} ms")

print("\n" + "="*60)
print("PRECISION COMPARISON (average over 100 runs)")
print("="*60)
print(f"Full precision (float32):      {time_float32*1000:.4f} ms")
print(f"Manual half (float16):         {time_float16*1000:.4f} ms")
print(f"Automatic mixed (autocast):    {time_autocast*1000:.4f} ms")
print("="*60)
print(f"Speedup (float16 vs float32):    {time_float32/time_float16:.2f}x")
print(f"Speedup (autocast vs float32):   {time_float32/time_autocast:.2f}x")
print(f"Speedup (float16 vs autocast):   {time_autocast/time_float16:.2f}x")
print("="*60)

Running on device: cuda

Testing full precision (float32)...
  Time: 23.0370 ms

Testing manual half-precision (float16)...
  Time: 12.3941 ms

Testing automatic mixed precision (autocast)...
  Time: 12.0268 ms

PRECISION COMPARISON (average over 100 runs)
Full precision (float32):      23.0370 ms
Manual half (float16):         12.3941 ms
Automatic mixed (autocast):    12.0268 ms
Speedup (float16 vs float32):    1.86x
Speedup (autocast vs float32):   1.92x
Speedup (float16 vs autocast):   0.97x


In practice, I would use autocast, because it achieved best performance while being the easiest to implement

# ONNX


## Measure cold start time (including session creation) of the ONNX model using online and offline optimization modes on CPU and measure inference time


In [24]:
!pip install onnx

Collecting onnx
  Downloading onnx-1.19.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (7.0 kB)
Downloading onnx-1.19.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m116.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: onnx
Successfully installed onnx-1.19.1


In [25]:
model_cpu = transformer = (
    AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-cos-v1")
    .eval()
    .cpu()
)

sample_text = "This is a sample input text for ONNX export."
sample_input = tokenizer(
    sample_text,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

print("Exporting model to ONNX...")
torch.onnx.export(
    model_cpu,
    (sample_input["input_ids"], sample_input["attention_mask"]),
    "model.onnx",
    opset_version=17,
    input_names=["input_ids", "attention_mask"],
    output_names=["output"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "output": {0: "batch_size"},
    },
    export_params=True,
    do_constant_folding=True,
    dynamo=False,
)

Exporting model to ONNX...


  torch.onnx.export(


In [28]:
!pip install onnxruntime

Collecting onnxruntime
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (17.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m106.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstal

In [29]:
import onnxruntime as ort

ort_inputs = {
    "input_ids": sample_input["input_ids"].numpy(),
    "attention_mask": sample_input["attention_mask"].numpy(),
}

print("\n" + "=" * 60)
print("ONNX RUNTIME OPTIMIZATION COMPARISON")
print("=" * 60)


print("\n1. Testing ONLINE optimization...")
start_cold = time.time()
sess_options_online = ort.SessionOptions()
sess_options_online.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
ort_session_online = ort.InferenceSession(
    "model.onnx", sess_options=sess_options_online, providers=["CPUExecutionProvider"]
)
_ = ort_session_online.run(None, ort_inputs)
cold_start_online = time.time() - start_cold

for _ in range(10):
    _ = ort_session_online.run(None, ort_inputs)

start = time.time()
for _ in range(num_runs):
    _ = ort_session_online.run(None, ort_inputs)
inference_time_online = (time.time() - start) / num_runs

print(f"   Cold start time: {cold_start_online:.4f} s")
print(f"   Inference time:  {inference_time_online * 1000:.4f} ms")


ONNX RUNTIME OPTIMIZATION COMPARISON

1. Testing ONLINE optimization...
   Cold start time: 0.6458 s
   Inference time:  57.6660 ms


In [None]:
print("\n2. Testing OFFLINE optimization...")
print("   Creating offline optimized model...")
start_optimize = time.time()
sess_options_offline = ort.SessionOptions()
sess_options_offline.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
sess_options_offline.optimized_model_filepath = "model_optimized.onnx"
_ = ort.InferenceSession("model.onnx", sess_options_offline)
optimization_time = time.time() - start_optimize
print(f"   Optimization time: {optimization_time:.4f} s")

start_cold = time.time()
sess_options_load = ort.SessionOptions()
sess_options_load.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
ort_session_offline = ort.InferenceSession(
    "model_optimized.onnx", 
    sess_options=sess_options_load, 
    providers=["CPUExecutionProvider"]
)
_ = ort_session_offline.run(None, ort_inputs)
cold_start_offline = time.time() - start_cold

for _ in range(10):
    _ = ort_session_offline.run(None, ort_inputs)

start = time.time()
for _ in range(num_runs):
    _ = ort_session_offline.run(None, ort_inputs)
inference_time_offline = (time.time() - start) / num_runs

print(f"   Cold start time: {cold_start_offline:.4f} s")
print(f"   Inference time:  {inference_time_offline*1000:.4f} ms")


2. Testing OFFLINE optimization...
   Creating offline optimized model...
   Optimization time: 2.1047 s
   Cold start time: 0.8375 s
   Inference time:  57.8757 ms


In [33]:
print("RESULTS SUMMARY")
print("=" * 60)
print("\nCold Start Time (session creation + first inference):")
print(f"   Online optimization:   {cold_start_online:.4f} s")
print(f"   Offline optimization:  {cold_start_offline:.4f} s")
print(f"   Speedup:               {cold_start_online / cold_start_offline:.2f}x")

print("\nInference Time (average over 100 runs):")
print(f"   Online optimization:   {inference_time_online * 1000:.4f} ms")
print(f"   Offline optimization:  {inference_time_offline * 1000:.4f} ms")
print(
    f"   Speedup:               {inference_time_online / inference_time_offline:.2f}x"
)

RESULTS SUMMARY

Cold Start Time (session creation + first inference):
   Online optimization:   0.6458 s
   Offline optimization:  0.8375 s
   Speedup:               0.77x

Inference Time (average over 100 runs):
   Online optimization:   57.6660 ms
   Offline optimization:  57.8757 ms
   Speedup:               1.00x
