# Lab 07 MLops

In [1]:
!nvidia-smi

Fri Nov 21 16:12:15 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.274.02             Driver Version: 535.274.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB           On  | 00000000:5B:00.0 Off |                    0 |
| N/A   41C    P0              42W / 300W |      0MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Imports

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import time

## Exercise 1

### Load model

In [3]:
model_name = "sentence-transformers/multi-qa-mpnet-base-cos-v1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

### Sample input - used for experiment

In [4]:
texts = [
    "Hello World1!",
    "Java is better programming language than python - change my mind.",
    "Transformers are powerful models for NLP.",
    "Batch processing allows multiple sentences to be encoded together.",
    "This is another example input."
]

encoded_inputs = tokenizer(
    texts,
    padding=True,          
    truncation=True,      
    return_tensors="pt"    
)

input_ids = encoded_inputs["input_ids"]

for text, ids in zip(texts, input_ids):
    token_list = tokenizer.convert_ids_to_tokens(ids.tolist())
    print(f"\nTEXT: {text}\nTOKENS: {token_list}\nIDS:{ids.tolist()}")


TEXT: Hello World1!
TOKENS: ['<s>', 'hello', 'world', '##1', '!', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
IDS:[0, 7596, 2092, 2491, 1003, 2, 1, 1, 1, 1, 1, 1, 1, 1]

TEXT: Java is better programming language than python - change my mind.
TOKENS: ['<s>', 'java', 'is', 'better', 'programming', 'language', 'than', 'python', '-', 'change', 'my', 'mind', '.', '</s>']
IDS:[0, 9266, 2007, 2492, 4734, 2657, 2088, 18754, 1015, 2693, 2030, 2572, 1016, 2]

TEXT: Transformers are powerful models for NLP.
TOKENS: ['<s>', 'transformers', 'are', 'powerful', 'models', 'for', 'nl', '##p', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>']
IDS:[0, 19085, 2028, 3932, 4279, 2009, 17957, 2365, 1016, 2, 1, 1, 1, 1]

TEXT: Batch processing allows multiple sentences to be encoded together.
TOKENS: ['<s>', 'batch', 'processing', 'allows', 'multiple', 'sentences', 'to', 'be', 'encoded', 'together', '.', '</s>', '<pad>', '<pad>']
IDS:[0, 14112, 6368, 4477, 3678, 11750, 2004

In [5]:
encoded_inputs_cpu = {k: v.to("cpu") for k, v in encoded_inputs.items()}
encoded_inputs_gpu = {k: v.to("cuda") for k, v in encoded_inputs.items()}

### Measuring inference time

In [9]:
def measure_inference_time(model, encoded_inputs, runs=200):
    _ = model(**encoded_inputs) # warmup

    start_time = time.perf_counter()
    
    for _ in range(runs):
        outputs = model(**encoded_inputs)
    
    end_time = time.perf_counter()
    total_time = end_time - start_time
    return total_time / runs

#### Plain Pytorch

In [7]:
model.to(device="cpu")
plain_cpu = measure_inference_time(model, encoded_inputs_cpu)
print(f"Time Plain Torch CPU: {plain_cpu:.4f} s")

model.to(device="cuda")
plain_gpu = measure_inference_time(model, encoded_inputs_gpu)
print(f"Time Plain Torch GPU: {plain_gpu:.4f} s")

Time Plain Torch CPU: 0.2303 s
Time Plain Torch GPU: 0.0081 s


#### Eval mode

In [8]:
model.to(device="cpu")
model.eval()
eval_cpu = measure_inference_time(model, encoded_inputs_cpu)
print(f"Time Eval Torch CPU: {eval_cpu:.4f} s")

model.to(device="cuda")
model.eval()
eval_gpu = measure_inference_time(model, encoded_inputs_gpu)
print(f"Time Eval Torch GPU: {eval_gpu:.4f} s")

Time Eval Torch CPU: 0.2332 s
Time Eval Torch GPU: 0.0080 s


#### Eval mode and no grad

In [9]:
model.to(device="cpu")
model.eval()
with torch.no_grad():
    eval_no_grad_cpu = measure_inference_time(model, encoded_inputs_cpu)
    print(f"Time Eval and no Grad Torch CPU: {eval_no_grad_cpu:.4f} s")

model.to(device="cuda")
model.eval()
with torch.no_grad():
    eval_no_grad_gpu = measure_inference_time(model, encoded_inputs_gpu)
    print(f"Time Eval and no Grad Torch GPU: {eval_no_grad_gpu:.4f} s")

Time Eval and no Grad Torch CPU: 0.2250 s
Time Eval and no Grad Torch GPU: 0.0063 s


#### Full inference mode

In [10]:
model.to(device="cpu")
with torch.inference_mode():
    eval_inference_cpu = measure_inference_time(model, encoded_inputs_cpu)
    print(f"Time Inference Mode CPU: {eval_inference_cpu:.4f} s")

model.to(device="cuda")
with torch.inference_mode():
    eval_inference_gpu = measure_inference_time(model, encoded_inputs_gpu)
    print(f"Time Inference Mode GPU: {eval_inference_gpu:.4f} s")

Time Inference Mode CPU: 0.2214 s
Time Inference Mode GPU: 0.0056 s


### Result of experiment

| Method                     | CPU Time (s) | GPU Time (s) |
|----------------------------|-------------|-------------|
| Plain Torch                | 0.2303      | 0.0081      |
| Eval Torch                 | 0.2332      | 0.0080      |
| Eval Torch (no Grad)       | 0.2250      | 0.0063      |
| Inference Mode             | 0.2214      | 0.0056      |


The fastest is the inference mode. Interestingly, the Eval Torch on CPU is slower than plan torch on CPU. For the GPU Difference is small, but eval is faster.


## Exercise 2

Compiling only for GPU.

### 1. Compiling models

In [13]:
model.eval()
model.to(device="cuda")
compiled_model_gpu = torch.compile(model) 

### 2. Measuring time

In [14]:
eval_compiled_gpu = measure_inference_time(compiled_model_gpu, encoded_inputs_gpu)
print(f"Time Compiled GPU: {eval_compiled_gpu:.4f} s")

Time Compiled GPU: 0.0048 s


### 3. Speedup

In [None]:
speed_up = 0.0081 / 0.0048
print(speed_up)

1.6875


Compiled model is faster than all of previous ones.

## Exercise 3

### 1. Ensuring model on cpu

In [20]:
model.to(device="cpu")
model.eval()

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

### 2. Quantize model

### 3. Save model to variable, veryfing

In [23]:
import torch.ao.quantization
import torch.nn as nn

model_quantized = torch.ao.quantization.quantize_dynamic(
    model,
    {nn.Linear}, 
    dtype=torch.qint8
)

print("Quantized model:")
print(model_quantized)

Quantized model:
MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (k): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (v): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (o): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (dropout): Dropout

### 4. Saving model

In [28]:
import os

torch.save(model.state_dict(), "model_original.pt")
torch.save(model_quantized.state_dict(), "model_quantized.pt")

size_original = os.path.getsize("model_original.pt")
size_quantized = os.path.getsize("model_quantized.pt")

size_original_mb = size_original / (1024 * 1024)
size_quantized_mb = size_quantized / (1024 * 1024)

print(f"Original model size: {size_original_mb:.2f} MB")
print(f"Quantized model size: {size_quantized_mb:.2f} MB")
print(f"Compression ratio: {size_original_mb / size_quantized_mb:.2f}×")

Original model size: 417.73 MB
Quantized model size: 173.10 MB
Compression ratio: 2.41×


### 5. Inference comparision

In [27]:
quantized_cpu = measure_inference_time(model_quantized, encoded_inputs_cpu)
print(f"Time Quantized CPU: {quantized_cpu:.4f} s")

no_quantized_cpu = measure_inference_time(model, encoded_inputs_cpu)
print(f"Time Non Quantized CPU: {no_quantized_cpu:.4f} s")

Time Quantized CPU: 0.0626 s
Time Non Quantized CPU: 0.2359 s


### 6. Comments on comparission

Quantized model has size of 173.10MB while original 417.73MB which is over 2.41 more. Furthermore, quantized model is almost 4 times faster than plain model.

## Exercise 4

### 1. Comparing inference

In [8]:
model.eval()
model.to(device="cuda")
compiled_model_gpu = torch.compile(model) 
compiled_model_gpu_max_autotune = torch.compile(model, mode="max-autotune") 
compiled_model_gpu_max_autotune_no_cuda_graphs = torch.compile(model, mode="max-autotune-no-cudagraphs")

In [9]:
with torch.inference_mode():
    _ = compiled_model_gpu(**encoded_inputs_gpu)
    _ = compiled_model_gpu_max_autotune(**encoded_inputs_gpu)
    _ = compiled_model_gpu_max_autotune_no_cuda_graphs(**encoded_inputs_gpu)

In [10]:
with torch.inference_mode():
    eval_compiled_gpu = measure_inference_time(compiled_model_gpu, encoded_inputs_gpu)
    print(f"Time Compiled GPU (inference_mode): {eval_compiled_gpu:.4f} s")

    eval_compiled_gpu_max_autotune = measure_inference_time(compiled_model_gpu_max_autotune, encoded_inputs_gpu)
    print(f"Time Compiled GPU Max Autotune (inference_mode): {eval_compiled_gpu_max_autotune:.4f} s")

    eval_compiled_gpu_max_autotune_no_cuda_graphs = measure_inference_time(
        compiled_model_gpu_max_autotune_no_cuda_graphs, encoded_inputs_gpu
    )
    print(f"Time Compiled GPU Max Autotune No CUDA Graphs (inference_mode): {eval_compiled_gpu_max_autotune_no_cuda_graphs:.4f} s")

Time Compiled GPU (inference_mode): 0.0031 s
Time Compiled GPU Max Autotune (inference_mode): 0.0022 s
Time Compiled GPU Max Autotune No CUDA Graphs (inference_mode): 0.0032 s


### 2. Comment on results

| Model                                    | Time |
| ---------------------------------------- | ------------- |
| Compiled GPU (default)                   | 0.0031 s      |
| Compiled GPU Max Autotune                | 0.0022 s      |
| Compiled GPU Max Autotune No CUDA Graphs | 0.0032 s      |


The fastest was Compiled GPU Max Autotune.

## Exercise 5

### 1. Check capability of GPU Tensor Cores

In [6]:
import torch

capability = torch.cuda.get_device_capability()
print(f"CUDA device capability: {capability}")

if capability >= (7, 0):
    print("Tensor Cores available: fast float16 supported.")
else:
    print("Tensor Cores not available: float16 may be slow or unsupported.")

CUDA device capability: (7, 0)
Tensor Cores available: fast float16 supported.


### 2. Measruing inference time

In [12]:
model.eval()
model.to(device="cuda")

model_half = model.half().to('cuda')
model_half.eval()

with torch.inference_mode():
    full_precision_gpu = measure_inference_time(model, encoded_inputs_gpu)
    print(f"Time Full Precisionn Mode GPU: {full_precision_gpu:.4f} s")


with torch.inference_mode():
    half_precision_gpu = measure_inference_time(model_half, encoded_inputs_gpu)
    print(f"Time HAlf Precisionn Mode GPU: {half_precision_gpu:.4f} s")

with torch.inference_mode():
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        auto_precision_gpu = measure_inference_time(model, encoded_inputs_gpu)
        print(f"Time Autocast Precisionn Mode GPU: {auto_precision_gpu:.4f} s")
    

Time Full Precisionn Mode GPU: 0.0058 s
Time HAlf Precisionn Mode GPU: 0.0057 s
Time Autocast Precisionn Mode GPU: 0.0070 s


### 3. Time comparision

| Type                  | Time [s]   |
|-----------------------|------------|
| Full Precision (FP32) | 0.0058     |
| Half Precision (FP16) | 0.0057     |
| Autocast (Mixed FP16) | 0.0070     |


I would use Half precision. It is th fastest there and it is supported well for majority of gpus. However, the test samples count may not be enough to test real differences. 

## Exercise 6

In [29]:
import onnxruntime as ort

def measure_inference_time_onnx(session: ort.InferenceSession, encoded_inputs: dict, runs: int = 100):
    _ = session.run(None, encoded_inputs) #warmup

    start_time = time.perf_counter()

    for _ in range(runs):
        outputs = session.run(None, encoded_inputs)

    end_time = time.perf_counter()
    total_time = end_time - start_time
    return total_time / runs

### 1. Measure online and offline model loading

**transforming to onnx**

In [15]:
import torch.onnx

model_cpu = model.eval().cpu()

sample_input = tokenizer(
    "This is a sample input text for ONNX export.",
    padding=True,
    truncation=True,
    return_tensors="pt",
)

torch.onnx.export(
    model_cpu,
    (sample_input["input_ids"], sample_input["attention_mask"]),
    "non_opt_model.onnx",
    opset_version=17,
    input_names=["input_ids", "attention_mask"],
    output_names=["output"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "output": {0: "batch_size"},
    },
)

In [48]:
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
sess_options.intra_op_num_threads = 1
sess_options.inter_op_num_threads = 2
sess_options.enable_cpu_mem_arena = True
sess_options.optimized_model_filepath = "model_optimized.onnx"
session = ort.InferenceSession("non_opt_model.onnx", sess_options)

**comparision**

In [49]:
opt_load_start = time.perf_counter()
sess_opt_options = ort.SessionOptions()
sess_opt_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
sess_opt_options.intra_op_num_threads = 1
sess_opt_options.inter_op_num_threads = 2 
sess_opt_options.enable_cpu_mem_arena = True

ort_session_optimized = ort.InferenceSession(
    "model_optimized.onnx", 
    sess_options=sess_opt_options, 
    providers=['CPUExecutionProvider']
)
opt_load_end = time.perf_counter()
opt_load_time = opt_load_end - opt_load_start
print(f"Opt model loading time: {opt_load_time:.4f} s")


Opt model loading time: 0.2370 s


In [50]:
no_opt_load_start = time.perf_counter()
sess_no_opt_options = ort.SessionOptions()
sess_no_opt_options.intra_op_num_threads = 1
sess_no_opt_options.inter_op_num_threads = 2
sess_no_opt_options.enable_cpu_mem_arena = True
sess_no_opt_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

ort_no_opt_session = ort.InferenceSession(
    "non_opt_model.onnx", sess_options=sess_no_opt_options, providers=["CPUExecutionProvider"]
)
no_opt_load_end = time.perf_counter()
no_opt_load_time = no_opt_load_end - no_opt_load_start
print(f"No Opt model loading time: {no_opt_load_time:.4f} s")

No Opt model loading time: 0.9791 s


The online optimization model loading took 4 times more time, then when you du that before saving.

### 2. Inference for both models

In [51]:
inputs_onnx = {
    "input_ids": encoded_inputs["input_ids"].cpu().numpy(),
    "attention_mask": encoded_inputs["attention_mask"].cpu().numpy()
}


opt_time = measure_inference_time_onnx(ort_session_optimized, inputs_onnx)
print(f"Time Inference ONNX CPU optimized before save: {opt_time:.4f} s")

non_opt_time = measure_inference_time_onnx(ort_no_opt_session, inputs_onnx)
print(f"Time Inference ONNX CPU optimized after model load: {non_opt_time:.4f} s")


Time Inference ONNX CPU optimized before save: 0.2621 s
Time Inference ONNX CPU optimized after model load: 0.1251 s


There is huge difference: Model which was optimized after loading outperformed the one with optimization before saving. 

Furthermore, it is faster than on torch.

### 3. Docker deplyoment

The pytorch app deployment is in the directory "torch_app".

The onnx app deployment is in the directory "onnx_app".

### 4. Comparision

Builded container weights 988MB.

Builded container weights 1.38GB.

Command:
```bash
docker images
```

Result:

| IMAGE            | ID             | DISK USAGE | CONTENT SIZE | EXTRA |
|-----------------|----------------|------------|--------------|-------|
| onnx-app:latest  | a529f8ed6d24   | 988MB      | 332MB        | U     |
| torch-app:latest | 237bcf77cd19   | 1.88GB     | 419MB        | U     |
