In [1]:
import ctypes
from ctypes import c_float, POINTER
import numpy as np
import time
import random
import shutil
# Build instructions for add.dll, kept as a bare string literal. Because it is
# the last expression in the cell, its repr is echoed as the cell's output.
# NOTE(review): sm_86 is the Ampere (RTX 30-series) target, whereas an RTX 4090
# is compute capability 8.9 (sm_89) — confirm which -arch is intended.
"""
Build command (rtx 4090)
nvcc --shared add.cu -o add.dll -Xcompiler "/MD" -arch=sm_86
"""

'\nBuild command (rtx 4090)\nnvcc --shared add.cu -o add.dll -Xcompiler "/MD" -arch=sm_86\n'

In [None]:
# The notebook kernel keeps a loaded DLL marked as in use even after the calls
# have finished, which would block rebuilding add.dll. So we load a *copy* and
# leave add.dll itself free to be overwritten by the next nvcc build.
dll_source = "./add.dll"
dll_copy = "./use_add.dll"
try:
    shutil.copy2(dll_source, dll_copy)
except PermissionError:
    # This cell was already run in this kernel: the previous copy is still
    # loaded and cannot be overwritten. Fall back to a uniquely named copy so
    # a freshly rebuilt add.dll can be picked up without restarting the kernel.
    dll_copy = f"./use_add_{int(time.time() * 1000)}.dll"
    shutil.copy2(dll_source, dll_copy)

lib = ctypes.CDLL(dll_copy)
rf = ctypes.byref  # short alias used by later cells when passing scalars by reference

# Every exported function takes float* arguments and returns nothing.
float_ptr = POINTER(c_float)

# Scalar kernels: (a, b, out)
lib.add_cuda.argtypes = [float_ptr, float_ptr, float_ptr]
lib.add_cuda.restype = None

lib.multiply_cuda.argtypes = [float_ptr, float_ptr, float_ptr]
lib.multiply_cuda.restype = None

# Matrix kernel: three float buffers followed by two ints
# (called later in this notebook as A, B, C, rows, cols).
lib.mat_add_cuda.argtypes = [
    float_ptr,
    float_ptr,
    float_ptr,
    ctypes.c_int,
    ctypes.c_int,
]
lib.mat_add_cuda.restype = None


In [10]:
# Scalar smoke test: pass two floats by reference and read the answer back
# from the third argument after each call.
a, b = c_float(3.5), c_float(2.25)
result = c_float()

for label, kernel in (
    ("Result from add CUDA:", lib.add_cuda),
    ("Result from multiply CUDA:", lib.multiply_cuda),
):
    kernel(ctypes.byref(a), ctypes.byref(b), ctypes.byref(result))
    print(label, result.value)

Result from add CUDA: 5.75
Result from multiply CUDA: 7.875


In [11]:
# Benchmark operands: two random float32 matrices plus a zero-filled float32
# buffer of the same shape for the result.
rows = cols = 10240
shape = (rows, cols)

A = np.random.rand(*shape).astype(np.float32)
B = np.random.rand(*shape).astype(np.float32)
C = np.zeros(shape, dtype=np.float32)

In [12]:
# --- CUDA: time a single call into the compiled library --------------------
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# short intervals. The timed region covers everything mat_add_cuda does
# internally (presumably including host<->device copies, since it is handed
# host pointers — confirm in add.cu).
s = time.perf_counter()
lib.mat_add_cuda(
    A.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
    B.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
    C.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
    rows,
    cols
)
end = time.perf_counter()
print(f"CUDA matrix addition for size {rows}*{cols}:  {end-s:.4f}s")

# --- Pure Python baseline --------------------------------------------------
# Build two rows x cols nested lists of random ints in [0, 4) and an output
# list with one (initially empty) row per input row. Note these are separate
# integer operands, not the float32 data in A and B.
mat1 = []
mat2 = []
mat3 = []
for y in range(rows):
    mat1.append([])
    mat2.append([])
    mat3.append([])
    for x in range(cols):
        mat1[-1].append(random.randrange(0,4))
        mat2[-1].append(random.randrange(0,4))

# Only the element-wise addition is timed, not the setup above.
s2 = time.perf_counter()
for y in range(rows):
    for x in range(cols):
        # Append into row y so mat3 ends up rows x cols; appending to mat3
        # itself would leave the rows empty and flatten the sums after them.
        mat3[y].append(mat1[y][x]+mat2[y][x])
print(f"Python matrix addition for size {rows}*{cols}:  {time.perf_counter()-s2:.4f}s")



CUDA matrix addition for size 10240*10240:  0.3671s
Python matrix addition for size 10240*10240:  12.2197s
