In [6]:
import torch.utils.benchmark as benchmark
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from optimum.intel import OVModelForSequenceClassification

In [7]:
model_id = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_id)

model_non_optimized = AutoModelForSequenceClassification.from_pretrained(model_id)

model_optimized = OVModelForSequenceClassification.from_pretrained(model_id, export=True)


Framework not specified. Using pt to export to ONNX.
Using framework PyTorch: 2.0.1+cu117
  op1 = operator(*args, **kwargs)


verbose: False, log level: Level.ERROR



Compiling the model...
Set CACHE_DIR to /tmp/tmpe8ahvj51/model_cache


In [8]:
def run_inference_non_optimized():
    inputs = tokenizer("Hello, how are you?", return_tensors="pt")
    outputs = model_non_optimized(**inputs)

def run_inference_optimized():
    inputs = tokenizer("Hello, how are you?", return_tensors="pt")
    outputs = model_optimized(**inputs)

In [9]:
timer_non_optimized = benchmark.Timer(
    stmt="run_inference_non_optimized()",
    setup="from __main__ import run_inference_non_optimized",
    num_threads=1,
)

timer_optimized = benchmark.Timer(
    stmt="run_inference_optimized()",
    setup="from __main__ import run_inference_optimized",
    num_threads=1,
)

In [11]:
result_non_optimized = timer_non_optimized.timeit(100)
print("Non-optimized model:", result_non_optimized)

Non-optimized model: <torch.utils.benchmark.utils.common.Measurement object at 0x7f08166f4d50>
run_inference_non_optimized()
setup: from __main__ import run_inference_non_optimized
  51.80 ms
  1 measurement, 100 runs , 1 thread


In [12]:
result_optimized = timer_optimized.timeit(100)
print("Optimized model:", result_optimized)

Optimized model: <torch.utils.benchmark.utils.common.Measurement object at 0x7f082eae54d0>
run_inference_optimized()
setup: from __main__ import run_inference_optimized
  5.01 ms
  1 measurement, 100 runs , 1 thread
