# In [None]:
# First, let's install the required libraries if they are not already installed
!pip install torch

# Import required libraries
import torch
import time

# Pick the compute device: CUDA when PyTorch reports it as available, else CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Matrix dimensions: A(MxK) * B(KxN) = C(MxN)
M = K = N = 512

# Random operands generated on the CPU (A is drawn before B)
A = torch.randn((M, K), device='cpu')
B = torch.randn((K, N), device='cpu')

# Versions of the operands placed on the selected device
A_gpu, B_gpu = A.to(device), B.to(device)

# Zero-filled MxN result tensor allocated on the selected device
C_gpu = torch.zeros((M, N), device=device)

# Function for matrix multiplication on the CPU
def matrix_multiply_cpu(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    """Return the matrix product ``A @ B`` of two 2-D tensors.

    The function performs no device transfer; the product is computed
    from the operands exactly as they are passed in.
    """
    product = A.mm(B)
    return product

# Function for matrix multiplication on the GPU
def matrix_multiply_gpu(A_gpu: torch.Tensor, B_gpu: torch.Tensor) -> torch.Tensor:
    """Return the matrix product ``A_gpu @ B_gpu`` of two 2-D tensors.

    The function does not move its arguments; callers are expected to
    pass tensors that already live on the intended device.
    """
    product = A_gpu.mm(B_gpu)
    return product

# Measure the time taken for matrix multiplication on the CPU.
# time.perf_counter() is used because it is a monotonic, high-resolution
# clock meant for measuring short intervals (time.time() can jump).
start_time = time.perf_counter()
C_cpu = matrix_multiply_cpu(A, B)
cpu_time = time.perf_counter() - start_time
print(f"Matrix multiplication on CPU took {cpu_time:.4f} seconds.")

# Measure the time taken for matrix multiplication on the GPU.
# CUDA kernels are launched asynchronously, so the device must be
# synchronized before the clock is read; otherwise only the kernel-launch
# overhead is measured. One untimed warm-up call keeps one-time CUDA
# initialization out of the measurement.
if device.type == 'cuda':
    matrix_multiply_gpu(A_gpu, B_gpu)
    torch.cuda.synchronize()
start_time = time.perf_counter()
C_gpu_result = matrix_multiply_gpu(A_gpu, B_gpu)
if device.type == 'cuda':
    torch.cuda.synchronize()
gpu_time = time.perf_counter() - start_time
print(f"Matrix multiplication on GPU took {gpu_time:.4f} seconds.")

# Check if the results match.
# The GPU result is copied to the CPU first because torch.allclose needs both
# tensors on the same device. The tolerances are looser than the defaults
# (rtol=1e-5, atol=1e-8): float32 sums accumulated in a different order on the
# two devices differ by small rounding errors, especially for entries near 0.
print(f"Are the results from CPU and GPU identical? {torch.allclose(C_cpu, C_gpu_result.cpu(), rtol=1e-4, atol=1e-4)}")
