In [21]:
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from scipy.signal import correlate, convolve2d

# Read the CUDA source text that contains the convolution kernels.
with open('./kernels.cpp') as f:
    kernel = f.read()

# Build the module from that source and pick the kernel to run.
# Other kernel names handled by the launch code in this script:
#   "conv2d_shared"
#   "conv2d_naive"
mod = SourceModule(kernel)
name = "conv2d_constShared"
func = mod.get_function(name)

# Thread-block dimensions (x, y, z) used to build the launch configuration.
threadx, thready, threadz = 32, 32, 1


def initMatrix(xshape=(4,4), kshape=(5,5)):
    """Build the test input, the filter, and the flipped filter.

    Args:
        xshape: Shape of the input matrix. The product of its first two
            entries is the number of elements generated.
        kshape: Shape of the filter.

    Returns:
        Tuple ``(X, K, Kflipped)`` of float32 arrays. ``X`` holds the values
        1..N laid out in row-major order, ``K`` is all ones, and
        ``Kflipped`` is a copy of ``K`` reversed along axes 0 and 1.
    """
    element_count = xshape[0] * xshape[1]
    ramp = np.arange(1, element_count + 1)
    X = ramp.reshape(xshape).astype(np.float32)

    # An upper-triangular filter (np.triu of the ones matrix) is a
    # non-symmetric alternative to the all-ones filter used here.
    K = np.ones(kshape, dtype=np.float32)
    Kflipped = np.flip(K, axis=(0, 1)).astype(np.float32)
    return X, K, Kflipped
# Validate the selected CUDA kernel against SciPy for several square inputs.
#
# Two calling conventions are supported:
#   * "conv2d_shared" / "conv2d_naive" receive the filter as a device array
#     argument together with its dimensions;
#   * "conv2d_constShared" reads the filter from the module-level `kConst`
#     symbol, which is filled from the host before each launch.
uses_const_filter = name == "conv2d_constShared"
if not uses_const_filter and name not in ("conv2d_shared", "conv2d_naive"):
    # Without this guard an unknown name would launch nothing and the check
    # below would compare uninitialised memory against the reference.
    raise ValueError(f"Unsupported kernel name: {name!r}")

print(f"Performing {name}")
for size in [32, 64, 128, 512, 1024, 4096]:
    xshape = (size, size)
    kshape = (5, 5)
    blocksize = (threadx, thready, threadz)
    # Integer ceiling division: enough blocks to cover every element even
    # when the extent is not a multiple of the block size.
    gridsize = (
        (xshape[0] + threadx - 1) // threadx,
        (xshape[1] + thready - 1) // thready,
        1,
    )

    X, K, Kflipped = initMatrix(xshape, kshape)

    # CPU reference. Convolving with the flipped filter equals
    # cross-correlating with K, which is presumably what the kernels
    # compute from the unflipped K uploaded below -- confirm in kernels.cpp.
    Y_cpu = convolve2d(X, Kflipped, mode='same')

    X_gpu = gpuarray.to_gpu(X)
    # Allocate the output directly on the device; there is no point in
    # uploading an uninitialised host buffer just to reserve space.
    Y_gpu = gpuarray.empty_like(X_gpu)

    # NOTE(review): shape[0] is passed as "width" and shape[1] as "height".
    # That is harmless for the square inputs used here; confirm against the
    # kernel signature before trying non-square shapes.
    xwidth = np.int32(X.shape[0])
    xheight = np.int32(X.shape[1])
    kwidth = np.int32(K.shape[0])
    kheight = np.int32(K.shape[1])
    print(f"grid = {gridsize}, xshape = {X.shape}, kshape={K.shape}")

    if uses_const_filter:
        k_const_ptr, k_const_nbytes = mod.get_global("kConst")
        if K.nbytes > k_const_nbytes:
            raise ValueError(
                f"Filter needs {K.nbytes} bytes but kConst holds only "
                f"{k_const_nbytes} bytes"
            )
        cuda.memcpy_htod(k_const_ptr, K)
        func(X_gpu, Y_gpu, xwidth, xheight,
             block=blocksize, grid=gridsize)
    else:
        K_gpu = gpuarray.to_gpu(K)
        func(X_gpu, K_gpu, Y_gpu, xwidth, xheight, kwidth, kheight,
             block=blocksize, grid=gridsize)

    # Wait for the kernel before reading the result back, so any launch
    # failure is reported here rather than during the copy.
    cuda.Context.synchronize()
    Y = Y_gpu.get()

    valid = np.allclose(Y, Y_cpu)
    print(f"valid = {valid}")
    if not valid:
        print(f"\tHUGE diff = {np.mean(np.abs(Y - Y_cpu))}")

Performing conv2d_constShared
grid = (1, 1, 1), xshape = (32, 32), kshape=(5, 5)
valid = True
grid = (2, 2, 1), xshape = (64, 64), kshape=(5, 5)
valid = True
grid = (4, 4, 1), xshape = (128, 128), kshape=(5, 5)
valid = True
grid = (16, 16, 1), xshape = (512, 512), kshape=(5, 5)
valid = True
grid = (32, 32, 1), xshape = (1024, 1024), kshape=(5, 5)
valid = True
grid = (128, 128, 1), xshape = (4096, 4096), kshape=(5, 5)
valid = True
