# Advanced Inference Optimization Workshop
This notebook contains code snippets for profiling, quantization, pruning, TVM compilation, serving, and performance benchmarking.

In [None]:
%%bash
pip install torch torchvision psutil onnx onnxruntime matplotlib numpy

In [None]:
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity

# Profile a single ResNet-50 forward pass on the GPU when available, else on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): `pretrained=` is deprecated in torchvision >= 0.13 in favour of `weights=`;
# left unchanged here so the cell keeps working on older installs.
model = models.resnet50(pretrained=True).to(device)
model.eval()

# Random batch of 32 RGB 224x224 images. Deliberately NOT named `input`, which would
# shadow the Python builtin for the rest of the kernel session.
profile_batch = torch.randn(32, 3, 224, 224).to(device)

# Request CUDA activity (and sort by CUDA time) only when the model actually runs on a
# GPU; on a CPU-only machine the CUDA column is all zeros, so sorting by it is meaningless.
on_gpu = device.type == 'cuda'
activities = [ProfilerActivity.CPU]
if on_gpu:
    activities.append(ProfilerActivity.CUDA)
sort_key = "cuda_time_total" if on_gpu else "cpu_time_total"

# One untimed warm-up pass so one-time setup costs (lazy initialisation, allocator
# growth) are not attributed to the profiled inference.
with torch.no_grad():
    model(profile_batch)

with profile(activities=activities,
             record_shapes=True,
             profile_memory=True,
             with_stack=True) as prof:
    with record_function("model_inference"):
        with torch.no_grad():
            model(profile_batch)

# Show the ten most expensive operators by total time on the active device.
print(prof.key_averages().table(sort_by=sort_key, row_limit=10))

In [None]:
import torch
import torchvision.models as models
import time
import torch._C
import psutil
import platform

# Benchmark single-image ResNet-50 inference on CPU with an optimized TorchScript module.
# `model`, `dummy_input`, `N`, `latencies` and `script_mt_time` are read by later cells.
model = models.resnet50(pretrained=True).eval().to("cpu")
dummy_input = torch.randn(1, 3, 224, 224)

# TorchScript compilation
scripted_model = torch.jit.trace(model, dummy_input)
scripted_model = torch.jit.optimize_for_inference(scripted_model)

# Set intra-op threads and profile
torch.set_num_threads(4)
N = 100
SCRIPT_WARMUP_RUNS = 10  # untimed runs so first-call overhead does not skew the stats
latencies = []
with torch.no_grad():
    for _ in range(SCRIPT_WARMUP_RUNS):
        scripted_model(dummy_input)
    for _ in range(N):
        # perf_counter is monotonic and high-resolution; time.time() is wall-clock
        # and can jump if the system clock is adjusted.
        start = time.perf_counter()
        scripted_model(dummy_input)
        latencies.append((time.perf_counter() - start) * 1000)  # seconds -> ms
script_mt_time = sum(latencies) / N

print(f"TorchScript optimized latency (4 threads): {script_mt_time:.2f} ms")
print(f"Min: {min(latencies):.2f} ms | Max: {max(latencies):.2f} ms | Std Dev: {torch.std(torch.tensor(latencies)):.2f} ms")
print(f"System: {platform.processor()}, Threads: {psutil.cpu_count(logical=True)}")

In [None]:
import onnx
import onnxruntime as ort
import torch.onnx

# Export the eager CPU model to ONNX and benchmark it with ONNX Runtime's CPU provider.
# Relies on `model`, `dummy_input`, `N` and `time` from the TorchScript benchmark cell.
onnx_path = "resnet50.onnx"
torch.onnx.export(model, dummy_input, onnx_path, input_names=['input'], output_names=['output'],
                  dynamic_axes={'input': {0: 'batch_size'}})

# NOTE(review): the TorchScript cell pins 4 intra-op threads, while this session uses
# ONNX Runtime's default thread settings - confirm the comparison is meant to be like-for-like.
session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
ort_input = {session.get_inputs()[0].name: dummy_input.numpy()}

ORT_WARMUP_RUNS = 10  # untimed runs so first-call overhead does not skew the stats
for _ in range(ORT_WARMUP_RUNS):
    session.run(None, ort_input)

ort_latencies = []
for _ in range(N):
    # perf_counter is monotonic and high-resolution; time.time() is wall-clock
    # and can jump if the system clock is adjusted.
    start = time.perf_counter()
    session.run(None, ort_input)
    ort_latencies.append((time.perf_counter() - start) * 1000)  # seconds -> ms
onnx_time = sum(ort_latencies) / N
print(f"ONNX Runtime latency (CPU EP): {onnx_time:.2f} ms")
print(f"Min: {min(ort_latencies):.2f} ms | Max: {max(ort_latencies):.2f} ms | Std Dev: {torch.std(torch.tensor(ort_latencies)):.2f} ms")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Compare mean latency of the two runtimes, with the per-run standard deviation as error bars.
# Relies on `script_mt_time`, `latencies`, `onnx_time` and `ort_latencies` from earlier cells.
methods = ["TorchScript (4T)", "ONNX Runtime"]
means = [script_mt_time, onnx_time]
# torch.std defaults to the sample (Bessel-corrected) standard deviation. `.item()`
# turns each 0-dim tensor into a plain float so matplotlib and the f-string below do
# not depend on implicit tensor conversion.
errors = [
    torch.std(torch.tensor(latencies)).item(),
    torch.std(torch.tensor(ort_latencies)).item(),
]

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(methods, means, yerr=errors, capsize=5, color=['blue', 'green'])
ax.set_ylabel("Latency (ms)")
ax.set_title("Advanced Inference Optimizations on ResNet (with Variability)")
ax.grid(True)
fig.tight_layout()
plt.show()

# Print Comparative Summary
for name, time_val, err in zip(methods, means, errors):
    print(f"{name}: {time_val:.2f} ± {err:.2f} ms")
