<div class="markdown-google-sans">
  <h2>Machine Learning Hardware Course</h2>
</div>

<div class="markdown-google-sans">
  <h2>Lab 4b: HW benchmarking</h2>
</div>


Run the code below twice: once on a CPU and once on a GPU. Before re-running the code on the other hardware, record your results in this notebook TO AVOID LOSING YOUR PROGRESS!

In [None]:
from transformers import ViTFeatureExtractor, ViTForImageClassification
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision import datasets, models
from tqdm import tqdm
import time

# Download one sample image from the COCO val2017 set.
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
# A timeout keeps a stalled connection from hanging the notebook forever.
response = requests.get(url, stream=True, timeout=30)
# Fail here on an HTTP error instead of letting PIL choke on an error page.
response.raise_for_status()
pil_image = Image.open(response.raw)

# ImageNet-style preprocessing shared by the torchvision models profiled below
transform = T.Compose([
    T.Resize((224, 224)),  # Resize to the 224x224 input size used by the profiled models
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalization for ImageNet
])

# Apply preprocessing; `image` is the batched tensor that later cells pass to the models.
image = transform(pil_image).unsqueeze(0)  # Add batch dimension

In [None]:
# Pick the execution backend: Apple MPS first, then CUDA, otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    dev_name = "mps"
elif torch.cuda.is_available():
    dev_name = "cuda"
else:
    dev_name = "cpu"
# `device` is the torch handle; `dev_name` is the plain string used to choose a timing strategy.
device = torch.device(dev_name)

In [None]:
def profile_workload(model, device, dev_name, image, iterations=100):
    """Measure the mean latency of one forward pass of ``model`` on ``image``.

    The model is moved to ``device``, switched to evaluation mode and called
    ``iterations`` times with gradient tracking disabled.

    Args:
        model: Module invoked as ``model(image)``. It is moved to ``device``
            and left in eval mode when this function returns.
        device: ``torch.device`` on which the model and input are placed.
        dev_name: One of ``"cpu"``, ``"cuda"`` or ``"mps"``; selects how the
            run is timed and synchronized.
        image: Input tensor handed to the model (moved to ``device`` first).
        iterations: Number of timed forward passes; must be at least 1.

    Returns:
        Mean latency of a single forward pass, in milliseconds.

    Raises:
        ValueError: If ``dev_name`` is not supported or ``iterations`` < 1.
    """
    # Validate up front so an unsupported backend fails with a clear message
    # instead of an UnboundLocalError after the model has already been moved.
    if dev_name not in ("cpu", "cuda", "mps"):
        raise ValueError(f"unsupported dev_name {dev_name!r}; expected 'cpu', 'cuda' or 'mps'")
    if iterations < 1:
        raise ValueError(f"iterations must be >= 1, got {iterations}")

    print(f"profiling {type(model).__name__} on {dev_name}...")
    model.to(device)
    # Inference benchmark: Dropout/BatchNorm layers must not run in training mode.
    model.eval()
    image = image.to(device)

    def run_forward_passes():
        for _ in tqdm(range(iterations), desc ='profiling latency is in progress...'):
            model(image)

    with torch.no_grad():
        if dev_name == "cuda":
            torch.cuda.synchronize()  # Ensure any pending tasks are done
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            run_forward_passes()
            end.record()
            torch.cuda.synchronize()  # Wait for all kernels to finish
            # Event.elapsed_time already reports milliseconds.
            return start.elapsed_time(end) / iterations

        # CPU and MPS are timed with the host clock.
        if dev_name == "mps":
            torch.mps.synchronize()  # Drain pending work before starting the clock
        start_time = time.perf_counter()
        run_forward_passes()
        if dev_name == "mps":
            # Synchronize BEFORE reading the clock so queued MPS work is included.
            torch.mps.synchronize()
        elapsed_time = time.perf_counter() - start_time

    return elapsed_time / iterations * 1000

In [None]:
def profile_workload_on_ViT(device, dev_name, iterations=100):
    """Measure the mean forward-pass latency of ViT-Large on one COCO image.

    Downloads a sample image, loads the ``google/vit-large-patch16-224``
    processor and classifier, and times ``iterations`` forward passes with
    gradient tracking disabled.

    Note: ``ViTImageProcessor`` must already be imported in the notebook when
    this function is called.

    Args:
        device: ``torch.device`` on which the model and inputs are placed.
        dev_name: One of ``"cpu"``, ``"cuda"`` or ``"mps"``; selects how the
            run is timed and synchronized.
        iterations: Number of timed forward passes; must be at least 1.

    Returns:
        Mean latency of a single forward pass, in milliseconds.

    Raises:
        ValueError: If ``dev_name`` is not supported or ``iterations`` < 1.
    """
    # Validate before any download so a bad argument fails fast and clearly
    # instead of ending in an UnboundLocalError after the model was loaded.
    if dev_name not in ("cpu", "cuda", "mps"):
        raise ValueError(f"unsupported dev_name {dev_name!r}; expected 'cpu', 'cuda' or 'mps'")
    if iterations < 1:
        raise ValueError(f"iterations must be >= 1, got {iterations}")

    # Load an image from COCO dataset
    url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
    response = requests.get(url, stream=True, timeout=30)  # do not hang on a stalled connection
    response.raise_for_status()  # fail on HTTP errors before PIL tries to decode the body
    image = Image.open(response.raw)

    # Use ViTImageProcessor instead of the deprecated ViTFeatureExtractor
    processor = ViTImageProcessor.from_pretrained('google/vit-large-patch16-224')
    ViT_large = ViTForImageClassification.from_pretrained('google/vit-large-patch16-224')
    # Apply the processor directly on the raw image
    inputs = processor(images=image, return_tensors="pt")

    ViT_large.to(device)
    inputs = inputs.to(device)

    def run_forward_passes():
        for _ in tqdm(range(iterations), desc ='profiling latency is in progress...'):
            ViT_large(**inputs)

    with torch.no_grad():
        if dev_name == "cuda":
            torch.cuda.synchronize()  # Ensure any pending tasks are done
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            run_forward_passes()
            end.record()
            torch.cuda.synchronize()  # Wait for all kernels to finish
            # Event.elapsed_time already reports milliseconds.
            return start.elapsed_time(end) / iterations

        # CPU and MPS are timed with the host clock.
        if dev_name == "mps":
            torch.mps.synchronize()  # Drain pending work before starting the clock
        start_time = time.perf_counter()
        run_forward_passes()
        if dev_name == "mps":
            # Synchronize BEFORE reading the clock so queued MPS work is included.
            torch.mps.synchronize()
        elapsed_time = time.perf_counter() - start_time

    return elapsed_time / iterations * 1000

In [None]:
# Fetch the pretrained AlexNet from the torchvision hub, then time 100 forward passes.
alexnet = torch.hub.load(
    'pytorch/vision:v0.10.0',
    'alexnet',
    pretrained=True,
)
alexnet_inference_latency = profile_workload(
    alexnet,
    device,
    dev_name,
    image,
    iterations=100,
)
print(f"\n\nAlexNet inference latency: {alexnet_inference_latency:.2f} ms")

In [None]:
# Fetch the pretrained ResNet152 from the torchvision hub, then time 100 forward passes.
resnet152 = torch.hub.load(
    'pytorch/vision:v0.10.0',
    'resnet152',
    pretrained=True,
)
resnet152_inference_latency = profile_workload(
    resnet152,
    device,
    dev_name,
    image,
    iterations=100,
)
print(f"\n\nResNet152 inference latency: {resnet152_inference_latency:.2f} ms")

In [None]:
from transformers import ViTImageProcessor, ViTForImageClassification
import torch
from PIL import Image
import requests

# Time 100 forward passes of ViT-Large on the selected device.
ViTLarge_inference_latency = profile_workload_on_ViT(
    device,
    dev_name,
    iterations=100,
)
print(f"\n\nViTLarge inference latency: {ViTLarge_inference_latency:.2f} ms")

## Record the profiled latency on:
- CPU AlexNet latency: ## ms
- CPU ResNet152 latency: ## ms
- CPU ViT-Large latency: ## ms
- GPU AlexNet latency: ## ms
- GPU ResNet152 latency: ## ms
- GPU ViT-Large latency: ## ms

Compare your results for the different DNN models you profiled on the different hardware:

Why did you get different latencies for each DNN model?

Why did you get different latencies for different hardware?