### Exercise 1


In [2]:
from transformers import AutoModel, AutoTokenizer

model_name = "sentence-transformers/multi-qa-mpnet-base-cos-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [3]:
sample_text = "This is some sample text to be tokenized. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."

inputs = tokenizer(sample_text, padding=True, truncation=True, return_tensors="pt")

print(inputs)

{'input_ids': tensor([[    0,  2027,  2007,  2074,  7103,  3797,  2004,  2026, 19208,  3554,
          1016, 19548,  2217, 13001, 17425,  2083, 10630,  4137,  2576,  3392,
          1014,  9534,  3370,  6597,  3392,  3130, 27137, 18140,  6133, 12009,
          2106,  1016,  7371,  2098,  2083,  1045,  4177,  5306,  2098, 13661,
          2103,  4301, 28177, 27588,  2106, 21187,  4454,  2067,  3806,  2083,
         20190, 20205,  4866, 16215,  1016,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])}


In [4]:
import torch
import time


def measure_inference_time(model, inputs):
    times = []
    for _ in range(100):
        start = time.time()
        outputs = model(**inputs)
        end = time.time()
        times.append(end - start)
    return sum(times) / len(times)


model.train()
time_no_opt = measure_inference_time(model, inputs)

model.eval()
time_eval = measure_inference_time(model, inputs)

model.eval()
with torch.no_grad():
    time_no_grad = measure_inference_time(model, inputs)

model.eval()
with torch.inference_mode():
    time_inference_mode = measure_inference_time(model, inputs)

print(f"No optimizations: {time_no_opt}s")
print(f"eval(): {time_eval}s ({time_no_opt/time_eval}x speedup)")
print(f"eval() + no_grad(): {time_no_grad}s ({time_no_opt/time_no_grad}x speedup)")
print(
    f"eval() + inference_mode(): {time_inference_mode}s ({time_no_opt/time_inference_mode}x speedup)"
)

No optimizations: 0.2849780583381653s
eval(): 0.2348670768737793s (1.2133589012619095x speedup)
eval() + no_grad(): 0.21505134105682372s (1.3251628980210108x speedup)
eval() + inference_mode(): 0.1969376587867737s (1.4470470507964848x speedup)


We can see significant speed ups, altough the inference times are very small, each stage yields much better results.


### 2. PyTorch model compilation


In [5]:
model.eval()

start_compile = time.time()
compiled_model = torch.compile(model)

with torch.inference_mode():
    _ = compiled_model(**inputs)

end_compile = time.time()
result_time = end_compile - start_compile

print(f"Compilation + warm-up time: {result_time}s")

Compilation + warm-up time: 110.6852593421936s


In [6]:
with torch.inference_mode():
    time_compiled = measure_inference_time(compiled_model, inputs)

print(f"Compiled model inference time: {time_compiled}s")
print(f"Speedup vs no optimizations: {time_no_opt / time_compiled}x")
print(f"Speedup vs eval() + inference_mode(): {time_inference_mode / time_compiled}x")

Compiled model inference time: 0.25084442377090455s
Speedup vs no optimizations: 1.1360749186851964x
Speedup vs eval() + inference_mode(): 0.785098810753857x


We can see the significant boost in the compiled model. Maybe I should consider doing it more often.


### 3. Quantization


In [7]:
model = model.cpu()

In [8]:
from torch.ao.quantization import quantize_dynamic
import torch.nn as nn

quantized_model = quantize_dynamic(model, qconfig_spec={nn.Linear}, dtype=torch.qint8)

print(quantized_model)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (k): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (v): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (o): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (dropout): Dropout(p=0.1, inplace=F

In [9]:
import os

torch.save(model.state_dict(), "model_original.pth")
torch.save(quantized_model.state_dict(), "model_quantized.pth")

original_size = os.path.getsize("model_original.pth")
quantized_size = os.path.getsize("model_quantized.pth")

print(f"Original model size: {original_size / (1024**2)} MB")
print(f"Quantized model size: {quantized_size / (1024**2)} MB")

Original model size: 417.7300271987915 MB
Quantized model size: 173.09671783447266 MB


Very interesting results, I was expecting something more like >1/2 of the inital size, but in this case the quantized model turned to be <1/2 of the initial size. Not sure why that's the case, but most likely some smart tricks performed under the hood.


In [10]:
model.eval()
quantized_model.eval()

with torch.inference_mode():
    time_original_cpu = measure_inference_time(model, inputs)

with torch.inference_mode():
    time_quantized_cpu = measure_inference_time(quantized_model, inputs)

print(f"Original: {time_original_cpu}s")
print(f"Quantized: {time_quantized_cpu}s")
print(f"Speedup: {time_original_cpu / time_quantized_cpu}x")

Original: 0.204613037109375s
Quantized: 0.16590158224105836s
Speedup: 1.233339877446546x


In this case I can say that this boost is for sure statistically significant and not just the measurment error.


### 4. GPU optimization strategies


In [12]:
from lorem_text import lorem
import torch
import time

input_sizes = [10, 200, 1000]
inputs_sized = [
    tokenizer(lorem.words(words), padding=True, truncation=True, return_tensors="pt")
    for words in input_sizes
]

model.eval()

for size, inputs in zip(input_sizes, inputs_sized):
    print(f"Input size: {size} words")

    compiled_default = torch.compile(model)
    with torch.inference_mode():
        _ = compiled_default(**inputs)
        time_default = measure_inference_time(compiled_default, inputs)
    print(f"Default: {time_default}s")

    compiled_max_autotune = torch.compile(model, mode="max-autotune")
    with torch.inference_mode():
        _ = compiled_max_autotune(**inputs)
        time_max_autotune = measure_inference_time(compiled_max_autotune, inputs)
    speedup_autotune = time_default / time_max_autotune
    print(f"Max-autotune: {time_max_autotune}s ({speedup_autotune}x speedup)")

    compiled_no_cudagraphs = torch.compile(model, mode="max-autotune-no-cudagraphs")
    with torch.inference_mode():
        _ = compiled_no_cudagraphs(**inputs)
        time_no_cudagraphs = measure_inference_time(compiled_no_cudagraphs, inputs)
    speedup_no_cudagraphs = time_default / time_no_cudagraphs
    print(
        f"Max-autotune-no-cudagraphs: {time_no_cudagraphs}s ({speedup_no_cudagraphs}x speedup)"
    )
    print()

Input size: 10 words
Default: 0.1418938899040222s
Max-autotune: 0.14797403573989867s (0.9589107250777168x speedup)
Max-autotune-no-cudagraphs: 0.1509942603111267s (0.9397303553899798x speedup)

Input size: 200 words
Default: 1.998001790046692s
Max-autotune: 1.9180034279823304s (1.0417091861762295x speedup)
Max-autotune-no-cudagraphs: 1.9226209354400634s (1.0392073409880844x speedup)

Input size: 1000 words
Default: 1.9379348921775819s
Max-autotune: 1.913768322467804s (1.0126277404772879x speedup)
Max-autotune-no-cudagraphs: 1.9080325770378113s (1.01567180534527x speedup)



We can see little to no speed up and variations could be just the margin of error. From my research it could because for the exercise we moved the model to the CPU and most of the optimizations and speed ups could be visible on the GPU.


In [13]:
import torch

capability = torch.cuda.get_device_capability()
print(f"CUDA device capability: {capability}")

# Tensor Cores are available on NVidia GPUs with CUDA >= 7 (e.g. Volta, Turing, Ampere, Hopper)
if capability >= (7, 0):
    print("Tensor Cores available: fast float16 supported.")
else:
    print("Tensor Cores not available: float16 may be slow or unsupported.")

CUDA device capability: (8, 6)
Tensor Cores available: fast float16 supported.


In [14]:
model_fp32 = torch.nn.Linear(10, 1)
data_fp32 = torch.randn(100, 10)
labels_fp32 = torch.randn(100, 1)

print(f"Data type of model_fp32 parameters: {model_fp32.weight.dtype}")
print(f"Data type of data_fp32: {data_fp32.dtype}")
print(f"Data type of labels_fp32: {labels_fp32.dtype}")

output_fp32 = model_fp32(data_fp32)
loss_fn = torch.nn.MSELoss()
loss_fp32 = loss_fn(output_fp32, labels_fp32)

print(f"Loss fp32: {loss_fp32.item()}")

Data type of model_fp32 parameters: torch.float32
Data type of data_fp32: torch.float32
Data type of labels_fp32: torch.float32
Loss fp32: 1.4398962259292603


In [15]:
model_fp16 = model_fp32.half()
data_fp16 = data_fp32.half()
labels_fp16 = labels_fp32.half()

print(f"Data type of model_fp16 parameters: {model_fp16.weight.dtype}")
print(f"Data type of data_fp16: {data_fp16.dtype}")
print(f"Data type of labels_fp16: {labels_fp16.dtype}")

output_fp16 = model_fp16(data_fp16)
loss_fp16 = loss_fn(output_fp16.float(), labels_fp16.float())

print(f"Loss fp16: {loss_fp16.item()}")

Data type of model_fp16 parameters: torch.float16
Data type of data_fp16: torch.float16
Data type of labels_fp16: torch.float16
Loss fp16: 1.439892292022705


In [17]:
model_gpu = model.to("cuda")
model_gpu.eval()

inputs_gpu = {k: v.to("cuda") for k, v in inputs.items()}

with torch.inference_mode():
    time_fp32 = measure_inference_time(model_gpu, inputs_gpu)
print(f"float32: {time_fp32}s")

model_fp16 = model_gpu.half()
inputs_fp16 = inputs_gpu
with torch.inference_mode():
    time_fp16 = measure_inference_time(model_fp16, inputs_fp16)
speedup_fp16 = time_fp32 / time_fp16
print(f"float16: {time_fp16}s ({speedup_fp16}x speedup)")

model_gpu = model_gpu.float()


def measure_inference_time_autocast(model, inputs):
    times = []
    for _ in range(100):
        start = time.time()
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            _ = model(**inputs)
        end = time.time()
        times.append(end - start)
    return sum(times) / len(times)


with torch.inference_mode():
    time_autocast = measure_inference_time_autocast(model_gpu, inputs_gpu)
speedup_autocast = time_fp32 / time_autocast
print(f"autocast: {time_autocast}s ({speedup_autocast}x speedup)")

float32: 0.06270748615264893s
float16: 0.050943562984466555s (1.2309206988873034x speedup)
autocast: 0.05790738582611084s (1.082892713218867x speedup)


Changing to float16 gives significant speed up, but the autocast only gives a bit more. Most likely autocast is partialy overlapping with modifications in the second step.


### 5. ONNX


In [None]:
import torch
import torch.onnx

model_cpu = model.eval().cpu()
sample_input = tokenizer(
    "This is a sample input text for ONNX export.",
    padding=True,
    truncation=True,
    return_tensors="pt",
)

torch.onnx.export(
    model_cpu,
    (sample_input["input_ids"], sample_input["attention_mask"]),
    "model.onnx",
    opset_version=17,
    input_names=["input_ids", "attention_mask"],
    output_names=["output"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "output": {0: "batch_size"},
    },
)

In [None]:
import onnxruntime as ort

sess_options = ort.SessionOptions()

sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
sess_options.optimized_model_filepath = "model_optimized.onnx"
ort.InferenceSession("model.onnx", sess_options)

<onnxruntime.capi.onnxruntime_inference_collection.InferenceSession at 0x7d03d590f350>

In [None]:
import onnxruntime as ort
import time

sample_input = tokenizer(
    "This is a sample input text for ONNX inference.",
    padding=True,
    truncation=True,
    return_tensors="np",
)

inputs_onnx = {
    "input_ids": sample_input["input_ids"],
    "attention_mask": sample_input["attention_mask"],
}

# ONLINE
start_online = time.time()
sess_options_online = ort.SessionOptions()
sess_options_online.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
ort_session_online = ort.InferenceSession(
    "model.onnx", sess_options=sess_options_online, providers=["CPUExecutionProvider"]
)
end_online = time.time()
cold_start_online = end_online - start_online

# OFFLINE
start_offline = time.time()
sess_options_offline = ort.SessionOptions()
sess_options_offline.graph_optimization_level = (
    ort.GraphOptimizationLevel.ORT_DISABLE_ALL
)
ort_session_offline = ort.InferenceSession(
    "model_optimized.onnx",
    sess_options=sess_options_offline,
    providers=["CPUExecutionProvider"],
)
end_offline = time.time()
cold_start_offline = end_offline - start_offline

print("Cold start")
print(f"Online optimization: {cold_start_online}s")
print(f"Offline optimization: {cold_start_offline}s")
print(f"Speedup: {cold_start_online / cold_start_offline}x")
print()


def measure_onnx_inference_time(session, inputs):
    times = []
    for _ in range(100):
        start = time.time()
        _ = session.run(None, inputs)
        end = time.time()
        times.append(end - start)
    return sum(times) / len(times)


time_online = measure_onnx_inference_time(ort_session_online, inputs_onnx)
time_offline = measure_onnx_inference_time(ort_session_offline, inputs_onnx)

print("Inference time")
print(f"Online optimization: {time_online}s")
print(f"Offline optimization: {time_offline}s")
print(f"Speedup: {time_online / time_offline}x")

Cold start
Online optimization: 1.011584758758545s
Offline optimization: 0.8138351440429688s
Speedup: 1.2429848553027534x

Inference time
Online optimization: 0.07272726774215699s
Offline optimization: 0.07199683189392089s
Speedup: 1.0101453887486647x


For sure we can see that the offline optimization has a better cold start, but the average inference time, yields similar restuls.


**Docker exercise**


Image sizes:
Torch compiled: 7850 MB
ONNX 933 MB

Yes, although the exercise is supposed to only use CPU, I could have used the CPU only PyTorch version like we did on one of the previous labs. But I think installing CUDA is more closer to something we would see on the production env. This shows huge storage size differences, mostly because when using ONNX we don't have to install PyTorch and all CUDA libraries that come with it.

Average inference time:
Torch compiled : 0.0734s
ONNX: 0.0478s

We can see the significant speed up in the inference time. I'm really surprised in the speed up. I'm only wondering if the experiment on the GPU would yield similar results.
