In [41]:
!pip -q install gputil psutil humanize
# Import packages
import os,sys,humanize,psutil,GPUtil

# Define function
def mem_report():
  print("CPU RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ))

  GPUs = GPUtil.getGPUs()
  for i, gpu in enumerate(GPUs):
    print('GPU {:d} ... Mem Free: {:.0f}MB / {:.0f}MB | Utilization {:3.0f}%'
    .format(i, gpu.memoryFree, gpu.memoryTotal, gpu.memoryUtil*100))

# Execute function
mem_report()

CPU RAM Free: 11.7 GB
GPU 0 ... Mem Free: 14861MB / 15360MB | Utilization   2%


In [39]:
!nvidia-smi

Mon Dec 30 00:32:13 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0              28W /  70W |    241MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

Tesla T4 → Compute Capability 7.5 → Architecture 7.5

In [40]:
import os
os.environ['TORCH_CUDA_ARCH_LIST'] = "7.5"

In [None]:
!sudo apt-get install python3-pybind11

In [None]:
%%writefile matmul_kernel.cu

#include <torch/extension.h>

template <typename T>
__global__ void matmul_kernel(const T* A, const T* B, T* C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < N && col < N) {
        T sum = 0;
        for (int k = 0; k < N; ++k) {
            sum += A[row * N + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}

template <typename T>
void matmul_launcher(torch::Tensor A, torch::Tensor B, torch::Tensor C, int N) {
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((N + 15) / 16, (N + 15) / 16);
    matmul_kernel<<<numBlocks, threadsPerBlock>>>(A.data_ptr<T>(), B.data_ptr<T>(), C.data_ptr<T>(), N);
}

void matmul_binding(torch::Tensor A, torch::Tensor B, torch::Tensor C, int N) {
    AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "matmul_launcher", ([&] {
        matmul_launcher<scalar_t>(A, B, C, N);
    }));
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("matmul", &matmul_binding, "Matrix multiplication kernel");
}


In [None]:
from torch.utils.cpp_extension import load

matmul_kernel = load(
    name="matmul_kernel",
    sources=["matmul_kernel.cu"],
    extra_cuda_cflags=["-O3"]
)

In [None]:
import torch

def matmul_cuda(A, B, N):
    C = torch.zeros((N, N), dtype=A.dtype, device='cuda')
    matmul_kernel.matmul(A, B, C, N)
    return C


In [38]:
N = 2048
A = torch.randn((N, N), dtype=torch.float32, device='cuda')
B = torch.randn((N, N), dtype=torch.float32, device='cuda')



import time


start = time.time()
C_pytorch = torch.matmul(A, B)
print("PyTorch Time:", time.time() - start)


start = time.time()
C_cuda = matmul_cuda(A, B, N)
print("CUDA Kernel Time:", time.time() - start)

PyTorch Time: 0.0005533695220947266
CUDA Kernel Time: 0.0004603862762451172
