
# CUDA Kernel Performance Comparison

This notebook compares two CUDA convolution kernels:
- **Standard tiled kernel**
- **PTX-optimized kernel**

We measure and visualize:
- Execution time
- Memory throughput (based on roofline estimates)
- Performance implications for CNN-like workloads

Ensure you have `nvcc`, `pycuda`, and a CUDA-capable GPU, and that `kernel1.cu` and `kernel2.cu` are in the notebook's working directory.


In [None]:

!pip install pycuda matplotlib --quiet


In [None]:

import pycuda.autoinit
import pycuda.driver as cuda
import numpy as np
import matplotlib.pyplot as plt
from subprocess import run, PIPE
from time import time


In [None]:

def compile_kernels(sources=("kernel1.cu", "kernel2.cu"), arch="sm_52"):
    """Compile CUDA source files to PTX with ``nvcc``.

    Parameters
    ----------
    sources : iterable of str
        Paths of the ``.cu`` files to compile. Each output is written next to
        its source with the suffix replaced by ``.ptx``.
    arch : str
        Value passed to nvcc's ``-arch`` flag. The default keeps the original
        ``sm_52`` target; pass the architecture of the GPU in use if the
        installed toolkit does not accept it.

    Raises
    ------
    subprocess.CalledProcessError
        If any nvcc invocation exits with a non-zero status (``check=True``).
    """
    from pathlib import Path  # local import: only needed to derive output names

    for src in sources:
        ptx_path = str(Path(src).with_suffix(".ptx"))
        run(["nvcc", f"-arch={arch}", "-ptx", str(src), "-o", ptx_path], check=True)
    print("Kernels compiled.")
    
# Run nvcc now so kernel1.ptx / kernel2.ptx are produced; a failing nvcc
# invocation raises here because the calls use check=True.
compile_kernels()


In [None]:

from pycuda.compiler import SourceModule

def load_module(file):
    """Read the text of ``file`` and build a ``SourceModule`` from it.

    Parameters
    ----------
    file : str
        Path of the source file to read.

    Returns
    -------
    The object returned by ``SourceModule`` for the file's contents.
    """
    with open(file, 'r') as handle:
        source_text = handle.read()
    return SourceModule(source_text)

def run_kernel(mod, func_name, image_size=(256, 256), channels=3, block=(16,16,1)):
    """Launch one kernel on random data and time the launch with CUDA events.

    Parameters
    ----------
    mod
        Compiled module exposing ``get_function`` (e.g. the result of
        ``load_module``).
    func_name : str
        Name of the kernel to look up in ``mod``.
    image_size : tuple
        ``(height, width)`` of the random input image.
    channels : int
        Number of channels in the input image.
    block : tuple
        Thread-block dimensions. The grid is sized by ceil-dividing ``width``
        by ``block[0]`` and ``height`` by ``block[1]``.

    Returns
    -------
    tuple
        ``(ms, I, P)``: the elapsed time in milliseconds between the two
        recorded events, the float32 input array of shape
        ``(height, width, channels)``, and the output array copied back from
        the device (same shape and dtype as the input).

    Notes
    -----
    The kernel is called as ``(I, M, P, channels, width, height)``; the kernel
    signature itself is not visible here, so this order must match the ``.cu``
    source.
    """
    func = mod.get_function(func_name)
    height, width = image_size
    # The mask width is never passed to the kernel, so the kernels presumably
    # hard-code a 5x5 mask -- confirm against the .cu sources before changing.
    mask_width = 5

    I = np.random.rand(height, width, channels).astype(np.float32)
    M = (np.random.rand(mask_width, mask_width) / (mask_width * mask_width / 4.0)).astype(np.float32)
    P = np.zeros_like(I)  # already float32 because I is float32

    grid = ((width + block[0] - 1) // block[0], (height + block[1] - 1) // block[1])

    allocated = []

    def _alloc(nbytes):
        # Track every device buffer so it can be released even on failure.
        buf = cuda.mem_alloc(nbytes)
        allocated.append(buf)
        return buf

    try:
        I_gpu = _alloc(I.nbytes)
        M_gpu = _alloc(M.nbytes)
        P_gpu = _alloc(P.nbytes)

        cuda.memcpy_htod(I_gpu, I)
        cuda.memcpy_htod(M_gpu, M)
        # mem_alloc does not initialize memory: zero the output buffer so any
        # element the kernel does not write comes back as 0 rather than garbage.
        # Done before start.record(), so it is not part of the measured time.
        cuda.memcpy_htod(P_gpu, P)

        start = cuda.Event()
        end = cuda.Event()
        start.record()
        func(I_gpu, M_gpu, P_gpu,
             np.int32(channels), np.int32(width), np.int32(height),
             block=block, grid=grid)
        end.record()
        end.synchronize()  # wait for the kernel before reading the timer
        ms = start.time_till(end)  # milliseconds

        cuda.memcpy_dtoh(P, P_gpu)
    finally:
        # Release device memory now instead of waiting for garbage collection.
        for buf in allocated:
            buf.free()

    return ms, I, P

# Build both modules from their CUDA sources.
kernel1_mod = load_module("kernel1.cu")
kernel2_mod = load_module("kernel2.cu")

# Number of timed launches per kernel; the reported time is their median so a
# single noisy sample does not decide the comparison.
N_TIMED_RUNS = 10

# Untimed warm-up launches: the first launch of a kernel can include one-time
# setup cost, which would otherwise be charged to whichever kernel runs first.
run_kernel(kernel1_mod, "convolution")
run_kernel(kernel2_mod, "convolution_with_ptx")

t1 = float(np.median([run_kernel(kernel1_mod, "convolution")[0] for _ in range(N_TIMED_RUNS)]))
t2 = float(np.median([run_kernel(kernel2_mod, "convolution_with_ptx")[0] for _ in range(N_TIMED_RUNS)]))
print(f"Kernel 1 time: {t1:.3f} ms")
print(f"Kernel 2 time (with PTX): {t2:.3f} ms")


In [None]:

# Bar chart comparing the two measured kernel times (t1 and t2 are set by the
# timing cell above).
labels = ['Standard', 'PTX-Optimized']
times = [t1, t2]
fig, ax = plt.subplots(figsize=(6,4))
ax.bar(labels, times, color=['skyblue', 'orange'])
ax.set_ylabel("Execution Time (ms)")
ax.set_title("Kernel Timing Comparison")
ax.grid(True, axis='y')  # horizontal guide lines only
plt.show()
