# Lab 07 MLOps

In [1]:
# Show the GPU, driver version and CUDA version visible to this kernel.
!nvidia-smi

Wed Nov 19 08:46:30 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.274.02             Driver Version: 535.274.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB           On  | 00000000:2D:00.0 Off |                    0 |
| N/A   39C    P0              43W / 300W |      0MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Imports

In [17]:
from transformers import AutoTokenizer, AutoModel
import torch
import time

## Exercise 1

### Load model

In [5]:
# Single checkpoint name shared by the tokenizer and the model so the
# vocabulary and the weights come from the same source.
model_name = "sentence-transformers/multi-qa-mpnet-base-cos-v1"

# NOTE(review): the Transformers docs state that `from_pretrained` returns the
# model already in eval mode — confirm; if so, later `model.eval()` calls are
# no-ops and the "plain" benchmark is effectively an eval-mode run.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

### Sample input - used for experiment

In [16]:
# Fixed batch of five sentences reused by every benchmark below.
texts = [
    "Hello World1!",
    "Java is better programming language than python - change my mind.",
    "Transformers are powerful models for NLP.",
    "Batch processing allows multiple sentences to be encoded together.",
    "This is another example input.",
]

# Tokenize the whole batch in one call and get PyTorch tensors back.
# padding=True is what produces the trailing <pad> tokens in the printed
# output, so every row has the same length.
encoded_inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
input_ids = encoded_inputs["input_ids"]

# Print each sentence next to its tokens and ids as a sanity check.
for text, ids in zip(texts, input_ids):
    token_list = tokenizer.convert_ids_to_tokens(ids.tolist())
    print(f"\nTEXT: {text}\nTOKENS: {token_list}\nIDS:{ids.tolist()}")


TEXT: Hello World1!
TOKENS: ['<s>', 'hello', 'world', '##1', '!', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
IDS:[0, 7596, 2092, 2491, 1003, 2, 1, 1, 1, 1, 1, 1, 1, 1]

TEXT: Java is better programming language than python - change my mind.
TOKENS: ['<s>', 'java', 'is', 'better', 'programming', 'language', 'than', 'python', '-', 'change', 'my', 'mind', '.', '</s>']
IDS:[0, 9266, 2007, 2492, 4734, 2657, 2088, 18754, 1015, 2693, 2030, 2572, 1016, 2]

TEXT: Transformers are powerful models for NLP.
TOKENS: ['<s>', 'transformers', 'are', 'powerful', 'models', 'for', 'nl', '##p', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>']
IDS:[0, 19085, 2028, 3932, 4279, 2009, 17957, 2365, 1016, 2, 1, 1, 1, 1]

TEXT: Batch processing allows multiple sentences to be encoded together.
TOKENS: ['<s>', 'batch', 'processing', 'allows', 'multiple', 'sentences', 'to', 'be', 'encoded', 'together', '.', '</s>', '<pad>', '<pad>']
IDS:[0, 14112, 6368, 4477, 3678, 11750, 2004

In [21]:
# Keep one copy of the tokenized batch per device so each benchmark can feed
# the model tensors that live on the same device as the model.
encoded_inputs_cpu = {
    name: tensor.to("cpu") for name, tensor in encoded_inputs.items()
}
encoded_inputs_gpu = {
    name: tensor.to("cuda") for name, tensor in encoded_inputs.items()
}

### Measuring inference time

In [29]:
def measure_inference_time(model, encoded_inputs, runs=200, warmup=10):
    """Return the average wall-clock time, in seconds, of one forward pass.

    The model is first called ``warmup`` times without timing. Then ``runs``
    timed calls are made and the elapsed time is divided by ``runs``.

    CUDA work is queued asynchronously, so ``model(...)`` can return before
    the GPU has finished. When any input tensor lives on a CUDA device, that
    device is synchronized right before the timer starts (so warmup work does
    not leak into the measurement) and right before it stops (so all timed
    work is actually complete).

    The function does not change train/eval mode or gradient tracking; the
    caller controls those (e.g. by wrapping the call in ``torch.no_grad()``).

    Args:
        model: Callable invoked as ``model(**encoded_inputs)``.
        encoded_inputs: Mapping of keyword arguments for the model.
        runs: Number of timed forward passes; must be positive.
        warmup: Number of untimed forward passes executed first.

    Returns:
        Average duration of a single forward pass in seconds.

    Raises:
        ValueError: If ``runs`` is not positive.
    """
    if runs <= 0:
        # Fail early with a clear message instead of a ZeroDivisionError
        # after the warmup has already been executed.
        raise ValueError(f"runs must be a positive integer, got {runs}")

    # Devices that need an explicit barrier; stays empty for CPU-only inputs
    # and for non-tensor values.
    cuda_devices = {
        value.device
        for value in encoded_inputs.values()
        if getattr(value, "is_cuda", False)
    }

    def _synchronize():
        # Block until all queued kernels on the involved GPUs have finished.
        for device in cuda_devices:
            torch.cuda.synchronize(device)

    for _ in range(warmup):
        model(**encoded_inputs)

    _synchronize()
    start_time = time.perf_counter()

    for _ in range(runs):
        model(**encoded_inputs)

    _synchronize()
    end_time = time.perf_counter()

    return (end_time - start_time) / runs

#### Plain PyTorch

In [30]:
# Baseline: the model exactly as loaded, with no no_grad/inference_mode
# context around the forward passes.
# NOTE(review): `from_pretrained` is documented to return the model in eval
# mode, so this baseline likely matches the "Eval mode" run — confirm.

# CPU run
model.to("cpu")
plain_cpu = measure_inference_time(model, encoded_inputs_cpu)
print(f"Time Plain Torch CPU: {plain_cpu:.4f} s")

# GPU run
model.to("cuda")
plain_gpu = measure_inference_time(model, encoded_inputs_gpu)
print(f"Time Plain Torch GPU: {plain_gpu:.4f} s")

Time Plain Torch CPU: 0.2386 s
Time Plain Torch GPU: 0.0078 s


#### Eval mode

In [31]:
# Eval mode only: gradient tracking stays enabled because no
# no_grad/inference_mode context is used in this cell.

# CPU run
model.to("cpu").eval()
eval_cpu = measure_inference_time(model, encoded_inputs_cpu)
print(f"Time Eval Torch CPU: {eval_cpu:.4f} s")

# GPU run
model.to("cuda").eval()
eval_gpu = measure_inference_time(model, encoded_inputs_gpu)
print(f"Time Eval Torch GPU: {eval_gpu:.4f} s")

Time Eval Torch CPU: 0.2405 s
Time Eval Torch GPU: 0.0076 s


#### Eval mode and no grad

In [33]:
# Eval mode with the forward passes wrapped in torch.no_grad().

# CPU run
model.to("cpu").eval()
with torch.no_grad():
    eval_no_grad_cpu = measure_inference_time(model, encoded_inputs_cpu)
print(f"Time Eval and no Grad Torch CPU: {eval_no_grad_cpu:.4f} s")

# GPU run
model.to("cuda").eval()
with torch.no_grad():
    eval_no_grad_gpu = measure_inference_time(model, encoded_inputs_gpu)
print(f"Time Eval and no Grad Torch GPU: {eval_no_grad_gpu:.4f} s")

Time Eval and no Grad Torch CPU: 0.2318 s
Time Eval and no Grad Torch GPU: 0.0062 s


#### Full inference mode

In [35]:
# Eval mode with the forward passes wrapped in torch.inference_mode().
# model.eval() is called explicitly, as in the no_grad cell, so this cell does
# not depend on the mode left behind by whichever cell ran before it.

# CPU run
model.to(device="cpu")
model.eval()
with torch.inference_mode():
    eval_inference_cpu = measure_inference_time(model, encoded_inputs_cpu)
    print(f"Time Inference Mode CPU: {eval_inference_cpu:.4f} s")

# GPU run
model.to(device="cuda")
model.eval()
with torch.inference_mode():
    eval_inference_gpu = measure_inference_time(model, encoded_inputs_gpu)
    print(f"Time Inference Mode GPU: {eval_inference_gpu:.4f} s")

Time Inference Mode CPU: 0.2312 s
Time Inference Mode GPU: 0.0055 s


### Results of the experiment

| Mode                 | CPU (s) | GPU (s) |
| -------------------- | ------- | ------- |
| Plain Torch          | 0.2386  | 0.0078  |
| Eval Torch           | 0.2405  | 0.0076  |
| Eval + no_grad Torch | 0.2318  | 0.0062  |
| Inference Mode       | 0.2312  | 0.0055  |
