 ## Environment Setup & Imports

In [1]:
import os

# 1. Create Directories
os.makedirs('src', exist_ok=True)
os.makedirs('kernels', exist_ok=True)
os.makedirs('notebooks', exist_ok=True)

print("✅ Directories created: src, kernels, notebooks")

with open('src/__init__.py', 'w') as f:
    pass


✅ Directories created: src, kernels, notebooks


In [2]:
%%writefile src/utils.py
import time
import numpy as np
import cupy as cp

def generate_matrices(n, dtype=np.float32):
    """
    Generates two random N*N matrices A and B.
    Using float32 is standard for GPU programming (single precision).
    Args:
        n: Size of the matrices (N x N).
        dtype: Data type of the matrices (default: np.float32).
    Returns: 
        Two N x N matrices A and B.
    """
    
    A = np.random.rand(n, n).astype(dtype)
    B = np.random.rand(n, n).astype(dtype)
    return A, B

def check_correctness(target, reference, tolerance=1e-4):
    """
    Compares two matrices using NumPy's allclose.
    Args:
        target: The matrix to test.
        reference: The reference matrix.
        tolerance: The tolerance for comparison (default: 1e-4).
    Returns:
        True if matrices are close within the given tolerance, False otherwise.
    """
    if hasattr(target, 'get'): 
        target = target.get()
    if hasattr(reference, 'get'): 
        reference = reference.get()
    try:
        np.testing.assert_allclose(target, reference, atol=tolerance, rtol=tolerance)
        return True
    except AssertionError:
        return False

def benchmark_function(func, name, *args):
    """
    Benchmarks the execution time of a given function.
    Args:
        func: The function to benchmark.
        name: Name of the function (for reporting).
        *args: Arguments to pass to the function.
    Returns:
        A tuple containing the result of the function and the execution time in milliseconds.
    """
    cp.cuda.Device(0).synchronize()
    
    start_time = time.perf_counter()
    result = func(*args)
    
    cp.cuda.Device(0).synchronize()

    end_time = time.perf_counter()
    
    execution_time_ms = (end_time - start_time) * 1000
    print(f"[{name}] Execution Time: {execution_time_ms:.4f} ms")
    return result, execution_time_ms

Overwriting src/utils.py


In [3]:
%%writefile src/cpu_baseline.py
import numpy as np

def cpu_matmul(A, B):
    """
    Standard Matrix Multiplication using Triple Nested Loops.
    C[i][j] = sum(A[i][k] * B[k][j])
    Args:
        A: First input matrix.
        B: Second input matrix.
    Returns:
        The resulting matrix after multiplication C = A * B.
    """

    A = np.array(A)
    B = np.array(B)
    
    rows_A, cols_A = A.shape
    rows_B, cols_B = B.shape
    
    if cols_A != rows_B:
        raise ValueError("Cannot multiply: Dimensions do not match.")
        
    C = np.zeros((rows_A, cols_B), dtype=A.dtype)
    
    for i in range(rows_A):          
        for j in range(cols_B):      
            total = 0
            for k in range(cols_A):  
                total += A[i, k] * B[k, j]
            C[i, j] = total
            
    return C

Overwriting src/cpu_baseline.py


In [4]:
%%writefile src/gpu_ops.py

import cupy as cp
import numpy as np
import os

def transfer_to_gpu(A_host: np.ndarray, B_host: np.ndarray) -> tuple:
    
    """
    Transfers numpy arrays from Host (CPU) to Device (GPU).
    Args:
        A_host: First input matrix on host (CPU).
        B_host: Second input matrix on host (CPU).
    Returns:
        Two matrices A and B on device (GPU).
    """
    A_gpu = cp.asarray(A_host)
    B_gpu = cp.asarray(B_host)
    return A_gpu, B_gpu

def cupy_matmul_library(A_gpu: cp.ndarray, B_gpu: cp.ndarray) -> cp.ndarray:
    """
    Performs Matrix Multiplication using CuPy's optimized library.
    Args:
        A_gpu: First input matrix on device (GPU).
        B_gpu: Second input matrix on device (GPU).
    Returns:    
        The resulting matrix after multiplication C = A * B on device (GPU).
    """
    return cp.matmul(A_gpu, B_gpu)

def run_custom_kernel(kernel_source: str, function_name: str, grid: tuple, block: tuple, args: tuple):
    """
    Compiles and executes a raw CUDA kernel.
    Args:
        kernel_source: The source code of the CUDA kernel as a string.
        function_name: The name of the kernel function to execute.
        grid: The grid dimensions for kernel launch.
        block: The block dimensions for kernel launch.
        args: The arguments to pass to the kernel.
    """
    module = cp.RawModule(code=kernel_source)
    kernel = module.get_function(function_name)
    kernel(grid, block, args)
    
def run_naive_kernel(A_gpu, B_gpu, N, block_size=(16, 16)):
    """
    Runs the naive matrix multiplication kernel.
    Args:
        A_gpu: First input matrix on device (GPU).
        B_gpu: Second input matrix on device (GPU).
        N: Size of the matrices (N x N).
        block_size: The block dimensions for kernel launch (default: (16, 16)).
    Returns:    
        The resulting matrix after multiplication C = A * B on device (GPU).
    """
    with open('kernels/matmul.cu', 'r') as f:
        kernel_code = f.read()
    
    kernel = cp.RawKernel(kernel_code, 'matmul_kernel')
    
    C_gpu = cp.zeros((N, N), dtype=cp.float32)
    
    grid_x = (N + block_size[0] - 1) // block_size[0]
    grid_y = (N + block_size[1] - 1) // block_size[1]
    grid_dim = (grid_x, grid_y)
    
    kernel(grid_dim, block_size, (A_gpu, B_gpu, C_gpu, cp.int32(N)))
    
    return C_gpu

Overwriting src/gpu_ops.py


In [5]:
%%writefile kernels/matmul.cu
extern "C" {
    __global__ void matmul_kernel(const float* A, const float* B, float* C, int N) {
        

        int row = blockIdx.y * blockDim.y + threadIdx.y;
        int col = blockIdx.x * blockDim.x + threadIdx.x;

        if (row < N && col < N) {
            
            float sum = 0.0f;
            
            for (int k = 0; k < N; k++) {
                float a = A[row * N + k];
                float b = B[k * N + col];
                sum += a * b;
            }

            C[row * N + col] = sum;
        }
    }
}

Overwriting kernels/matmul.cu


## CPU Baseline

In [6]:
import sys
import os
import numpy as np

from src.utils import generate_matrices, check_correctness, benchmark_function
from src.cpu_baseline import cpu_matmul

print("Modules imported successfully.")


N = 512
print(f"Part 1: CPU Baseline Benchmark (N={N})")

A_host, B_host = generate_matrices(N)


C_cpu, time_cpu = benchmark_function(cpu_matmul, "CPU Naive", A_host, B_host)

C_ref = np.dot(A_host, B_host)

if check_correctness(C_cpu, C_ref):
    print("PASS: CPU implementation matches NumPy reference.")
else:
    print("FAIL: CPU implementation is incorrect.")

Modules imported successfully.
Part 1: CPU Baseline Benchmark (N=512)
[CPU Naive] Execution Time: 51823.6198 ms
PASS: CPU implementation matches NumPy reference.


## GPU using CuPy

In [7]:
import cupy as cp
from src.gpu_ops import transfer_to_gpu, cupy_matmul_library

print("Part 2: CuPy (GPU) Implementation")

print(f"\nExperiment A: Small Matrix (N={N})")


print("Transferring data to GPU...", end=" ")
A_gpu, B_gpu = transfer_to_gpu(A_host, B_host) 
print("Done.")

print("Warming up GPU...", end=" ")
cupy_matmul_library(A_gpu, B_gpu)
cp.cuda.Stream.null.synchronize()
print("Done.")

C_gpu, time_gpu = benchmark_function(cupy_matmul_library, "CuPy Library", A_gpu, B_gpu)


if check_correctness(C_gpu, C_ref):
    print("PASS: CuPy result matches Reference.")
    print(f"Speedup vs CPU: {time_cpu / time_gpu:.2f}x")
else:
    print("FAIL: CuPy result incorrect.")


N_large = 2000
print(f"\nExperiment B: Large Matrix (N={N_large})")

A_large, B_large = generate_matrices(N_large)
A_large_gpu, B_large_gpu = transfer_to_gpu(A_large, B_large)

C_large_gpu, time_large_gpu = benchmark_function(cupy_matmul_library, f"CuPy (N={N_large})", A_large_gpu, B_large_gpu)

print(f"Note: A CPU naive loop for N={N_large} would take hours.")

Part 2: CuPy (GPU) Implementation

Experiment A: Small Matrix (N=512)
Transferring data to GPU... Done.
Warming up GPU... Done.
[CuPy Library] Execution Time: 0.3865 ms
PASS: CuPy result matches Reference.
Speedup vs CPU: 134081.97x

Experiment B: Large Matrix (N=2000)
[CuPy (N=2000)] Execution Time: 6.5000 ms
Note: A CPU naive loop for N=2000 would take hours.


## Custom Kernel 

In [8]:
import importlib
from src.gpu_ops import run_naive_kernel, transfer_to_gpu

print("Part 3: Custom Kernel Benchmark")

N = 2000 
print(f"Matrix Size: {N}x{N}")

A_host, B_host = generate_matrices(N)
A_gpu, B_gpu = transfer_to_gpu(A_host, B_host)

block_sizes = [(8, 8), (16, 16), (32, 32)]

for bs in block_sizes:
    print(f"\n--- Testing Block Size: {bs} ---")
    
    func_to_test = lambda: run_naive_kernel(A_gpu, B_gpu, N, block_size=bs)
    

    C_custom, time_custom = benchmark_function(func_to_test, f"Naive Kernel {bs}")
    
    C_ref_gpu = cupy_matmul_library(A_gpu, B_gpu)
    if check_correctness(C_custom, C_ref_gpu, tolerance=1e-3):
        print("Result Correct")
    else:
        print("Result Incorrect")

print("\n--- Comparison ---")

_, time_lib = benchmark_function(lambda: cupy_matmul_library(A_gpu, B_gpu), "CuPy Library")
print(f"Library is {time_custom/time_lib:.2f}x faster than the Naive Kernel.")

Part 3: Custom Kernel Benchmark
Matrix Size: 2000x2000

--- Testing Block Size: (8, 8) ---
[Naive Kernel (8, 8)] Execution Time: 116.8105 ms
Result Correct

--- Testing Block Size: (16, 16) ---
[Naive Kernel (16, 16)] Execution Time: 48.0739 ms
Result Correct

--- Testing Block Size: (32, 32) ---
[Naive Kernel (32, 32)] Execution Time: 41.4584 ms
Result Correct

--- Comparison ---
[CuPy Library] Execution Time: 4.1911 ms
Library is 9.89x faster than the Naive Kernel.
