In [None]:
import ctypes
from ctypes import c_float, POINTER, c_int
import numpy as np
import time
import random
import os
import platform

# Build command (rtx 4090), windows
# cd C:\Users\luka\source\repos\Artifical-Neural-Networks-From-Scratch\cuda\main
# nvcc -shared math_ops.cu nn_layers.cu -o nn.dll -arch=sm_89 -Xcompiler "/MD"

# Linux: nvcc -shared ./cuda/main/math_ops.cu ./cuda/main/nn_layers.cu -o ./cuda/main/libnn.so -arch=sm_86 -Xcompiler -fPIC


In [None]:

class CudaNN:
    """ctypes binding for the routines exported by the compiled ``nn`` shared library.

    Wraps three native entry points -- ``mat_add_cuda``, ``dot_product_cuda`` and
    ``forward_layer_cuda`` -- as methods that accept NumPy arrays. Every array is
    converted to a C-contiguous ``float32`` buffer before its address is handed to
    native code, because ctypes passes the raw pointer without checking dtype,
    memory layout or length.
    """

    # Default build outputs, relative to the notebook's working directory.
    _WINDOWS_LIB = "./main/nn.dll"
    _POSIX_LIB = "./main/libnn.so"

    def __init__(self, dll_path="./main/nn.dll"):
        """Load the shared library and declare the signatures of its functions.

        Args:
            dll_path: Path of the shared library to load. ``None`` selects the
                default build for the current platform. The default value is the
                Windows build; on other platforms it is replaced by
                ``./main/libnn.so``, since a ``.dll`` cannot be loaded there.

        Raises:
            FileNotFoundError: If the resolved library path does not exist.
            OSError: If the file exists but the dynamic loader cannot load it.
        """
        on_windows = platform.system() == "Windows"
        if dll_path is None or (not on_windows and dll_path == self._WINDOWS_LIB):
            lib_path = self._WINDOWS_LIB if on_windows else self._POSIX_LIB
        else:
            lib_path = os.fspath(dll_path)

        # Jupyter runs from the directory the .ipynb file is in, so relative
        # paths are resolved against that directory.
        # The path that is checked is the same path that is loaded.
        if not os.path.exists(lib_path):
            raise FileNotFoundError(f"Library not found at {lib_path}. Check your notebook's current working directory: {os.getcwd()}")
        self.lib = ctypes.CDLL(lib_path)

        # Declare argument and return types so ctypes converts the Python
        # arguments correctly instead of guessing.
        float_ptr = POINTER(c_float)

        self.lib.mat_add_cuda.argtypes = [float_ptr, float_ptr, float_ptr, c_int, c_int]
        self.lib.mat_add_cuda.restype = None

        self.lib.dot_product_cuda.argtypes = [float_ptr, float_ptr, float_ptr, c_int]
        self.lib.dot_product_cuda.restype = None

        self.lib.forward_layer_cuda.argtypes = [
            float_ptr, float_ptr, float_ptr, float_ptr, c_int, c_int
        ]
        self.lib.forward_layer_cuda.restype = None

    @staticmethod
    def _as_f32(array):
        """Return ``array`` as a C-contiguous float32 ndarray (no copy if it already is one)."""
        return np.ascontiguousarray(array, dtype=np.float32)

    @staticmethod
    def _ptr(array):
        """Return a ``float*`` pointing at the first element of ``array``."""
        return array.ctypes.data_as(POINTER(c_float))

    def mat_add(self, A, B):
        """Add two equally shaped 2-D matrices through ``mat_add_cuda``.

        Args:
            A: 2-D array-like of shape ``(rows, cols)``.
            B: Array-like with the same shape as ``A``.

        Returns:
            A new ``float32`` array of shape ``(rows, cols)`` filled by the library.

        Raises:
            ValueError: If ``A`` is not 2-D or ``B`` has a different shape.
        """
        A = self._as_f32(A)
        B = self._as_f32(B)
        if A.ndim != 2:
            raise ValueError(f"mat_add expects 2-D matrices, got an array with {A.ndim} dimension(s)")
        if A.shape != B.shape:
            raise ValueError(f"mat_add shape mismatch: {A.shape} vs {B.shape}")

        rows, cols = A.shape
        # Allocate from the shape (not zeros_like) so the output is always C-ordered.
        C = np.zeros((rows, cols), dtype=np.float32)

        self.lib.mat_add_cuda(self._ptr(A), self._ptr(B), self._ptr(C), rows, cols)
        return C

    def dot_product(self, a, b):
        """Compute the dot product of two equally sized vectors through ``dot_product_cuda``.

        Args:
            a: Array-like; treated as a flat vector of ``a.size`` elements.
            b: Array-like with the same number of elements as ``a``.

        Returns:
            The value the library writes to its output, as a Python float.

        Raises:
            ValueError: If the two inputs hold a different number of elements.
        """
        a = self._as_f32(a).ravel()
        b = self._as_f32(b).ravel()
        if a.size != b.size:
            raise ValueError(f"dot_product size mismatch: {a.size} vs {b.size}")

        # The native function writes its scalar result through this pointer.
        res = c_float()
        self.lib.dot_product_cuda(self._ptr(a), self._ptr(b), ctypes.byref(res), int(a.size))
        return res.value

    def forward_layer(self, inputs, weights, bias, n_out):
        """Run one layer forward pass through ``forward_layer_cuda``.

        The notebook's own check compares the result with
        ``weights.reshape(n_out, n_in) @ inputs + bias``, i.e. the weights are laid
        out as one row of ``n_in`` values per output.

        Args:
            inputs: Array-like of ``n_in`` input values.
            weights: Array-like holding ``n_in * n_out`` values (flat or 2-D).
            bias: Array-like holding ``n_out`` values.
            n_out: Number of outputs of the layer.

        Returns:
            A new ``float32`` array of length ``n_out`` filled by the library.

        Raises:
            ValueError: If ``weights`` or ``bias`` do not match ``n_in`` / ``n_out``.
        """
        inputs = self._as_f32(inputs).ravel()
        weights = self._as_f32(weights).ravel()
        bias = self._as_f32(bias).ravel()
        n_in = int(inputs.size)
        n_out = int(n_out)

        # Guard the sizes here: native code would otherwise read past the buffers.
        if weights.size != n_in * n_out:
            raise ValueError(
                f"forward_layer expects {n_in * n_out} weights ({n_out} x {n_in}), got {weights.size}"
            )
        if bias.size != n_out:
            raise ValueError(f"forward_layer expects {n_out} bias values, got {bias.size}")

        output = np.zeros(n_out, dtype=np.float32)
        self.lib.forward_layer_cuda(
            self._ptr(inputs), self._ptr(weights), self._ptr(bias), self._ptr(output),
            n_in, n_out
        )
        return output

In [10]:
# Load the library built for the current OS so this cell runs unchanged on
# Windows (nn.dll) and Linux (libnn.so). The path is relative to the notebook's
# working directory.
lib_file = "nn.dll" if platform.system() == "Windows" else "libnn.so"
cuda_nn = CudaNN(f"./main/{lib_file}")

FileNotFoundError: Library not found at ./cuda/main/libnn.so. Check your notebook's current working directory: /home/luka/src/kaggle/ANN_FromScratch/cuda

In [11]:
# --- Test 1: Matrix Addition ---
rows, cols = 5120, 5120
A = np.random.rand(rows, cols).astype(np.float32)
B = np.random.rand(rows, cols).astype(np.float32)

# perf_counter() is the high-resolution monotonic clock meant for measuring
# short intervals; time.time() can jump and has coarser resolution.
# The measured interval covers the whole wrapper call, not only the kernel.
start = time.perf_counter()
C = cuda_nn.mat_add(A, B)
elapsed = time.perf_counter() - start
print(f"CUDA Add time: {elapsed:.4f}s")

# A timing alone says nothing about correctness: compare against NumPy.
np.testing.assert_allclose(C, A + B, rtol=1e-6)

CUDA Add time: 0.0542s


In [12]:
# --- Test 2: Dot Product ---
# Two float32 vectors of length n: one filled with 1.0, the other with 3.0.
n = 10240
vec_b = np.full(n, 3.0, dtype=np.float32)
vec_a = np.ones_like(vec_b)

dot_res = cuda_nn.dot_product(vec_a, vec_b)
print(f"Dot product: {dot_res}")

Dot product: 30720.0


In [13]:
# --- Test 3: Forward Layer ---
# Layer under test: 4 inputs feeding 3 outputs.
input_vec = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
bias = np.array([0.1, 0.2, 0.3], dtype=np.float32)
# One row of 4 weights per neuron, flattened to the 1-D layout passed to the library.
weights = np.array([
    [0.1, 0.2, 0.3, 0.4],
    [0.5, 0.6, 0.7, 0.8],
    [0.9, 1.0, 1.1, 1.2],
], dtype=np.float32).ravel()

cuda_out = cuda_nn.forward_layer(input_vec, weights, bias, n_out=3)

# Reference result computed on the CPU with NumPy.
numpy_out = weights.reshape(3, 4) @ input_vec + bias

print(f"CUDA:  {cuda_out}")
print(f"Numpy: {numpy_out}")

print(
    "Success: CUDA matches NumPy"
    if np.allclose(cuda_out, numpy_out, atol=1e-5)
    else "Error: Results do not match"
)

CUDA:  [ 3.1  7.2 11.3]
Numpy: [ 3.1  7.2 11.3]
Success: CUDA matches NumPy
