In [2]:
# Day 52 - Matrix Multiplication on GPU (PyTorch)

import torch
import time

# Check GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Matrix sizes
N = 500  # Reduced size to avoid OutOfMemoryError
A = torch.randn(N, N, device=device)
B = torch.randn(N, N, device=device)

# ---------------- Built-in PyTorch GPU multiplication ----------------
start = time.time()
C = torch.matmul(A, B)  # GPU accelerated
torch.cuda.synchronize() if device == "cuda" else None
end = time.time()

print(f"PyTorch GPU matmul time: {end - start:.4f} seconds")

# ---------------- Manual Element-wise Multiplication Simulation ----------------
# (not as fast as torch.matmul, but shows GPU tensor operations)
start = time.time()
C_manual = (A.unsqueeze(2) * B.t().unsqueeze(0)).sum(dim=1)
torch.cuda.synchronize() if device == "cuda" else None
end = time.time()

print(f"Manual GPU matmul (tensor ops) time: {end - start:.4f} seconds")

# Validate correctness
print("Difference between results:", torch.norm(C - C_manual).item())

Using device: cuda
PyTorch GPU matmul time: 0.0008 seconds
Manual GPU matmul (tensor ops) time: 0.0360 seconds
Difference between results: 15814.509765625
