 ## Environment Setup & Imports

In [9]:
import os

# 1. Create the project directories (a no-op for any that already exist).
project_dirs = ('src', 'kernels', 'notebooks')
for directory in project_dirs:
    os.makedirs(directory, exist_ok=True)

print(f"✅ Directories created: {', '.join(project_dirs)}")

# 2. Make sure src/ is an importable package. Append mode creates the file
#    when it is missing but, unlike 'w', never truncates an existing
#    __init__.py, so re-running this cell is safe.
with open('src/__init__.py', 'a'):
    pass


✅ Directories created: src, kernels, notebooks


In [11]:
%%writefile src/utils.py
import time
import numpy as np

def generate_matrices(n, dtype=np.float32, seed=None):
    """
    Generates two random N*N matrices A and B.
    Using float32 is standard for GPU programming (single precision).
    Args:
        n: Size of the matrices (N x N).
        dtype: Data type of the matrices (default: np.float32).
        seed: Optional seed for reproducible matrices. When None (default),
            values are drawn from NumPy's global random state, exactly as
            before this parameter existed.
    Returns:
        Two N x N matrices A and B with uniform values in [0, 1).
    """
    # RandomState exposes the same .rand() API as the np.random module, so
    # one code path serves both the seeded and the unseeded case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # .rand() produces float64; cast to the requested dtype afterwards.
    A = rng.rand(n, n).astype(dtype)
    B = rng.rand(n, n).astype(dtype)
    return A, B

def check_correctness(target, reference, tolerance=1e-4):
    """
    Reports whether `target` agrees with `reference` within a tolerance.

    The comparison is delegated to np.testing.assert_allclose, with the same
    value used for both the absolute and the relative tolerance.
    Args:
        target: The matrix to test.
        reference: The reference matrix.
        tolerance: Value passed as both atol and rtol (default: 1e-4).
    Returns:
        True if assert_allclose accepts the pair, False if it raises
        AssertionError.
    """
    try:
        np.testing.assert_allclose(
            target, reference, rtol=tolerance, atol=tolerance
        )
    except AssertionError:
        matches = False
    else:
        matches = True
    return matches

def benchmark_function(func, name, *args, **kwargs):
    """
    Benchmarks the execution time of a single call to a given function.
    Args:
        func: The function to benchmark.
        name: Name of the function (for reporting).
        *args: Positional arguments to pass to the function.
        **kwargs: Keyword arguments to pass to the function.
    Returns:
        A tuple containing the result of the function and the execution time in milliseconds.
    """
    # perf_counter is a high-resolution wall clock; only the call itself
    # sits between the two readings.
    start_time = time.perf_counter()
    result = func(*args, **kwargs)
    end_time = time.perf_counter()

    execution_time_ms = (end_time - start_time) * 1000  # seconds -> ms
    print(f"[{name}] Execution Time: {execution_time_ms:.4f} ms")
    return result, execution_time_ms

Overwriting src/utils.py


In [12]:
%%writefile src/cpu_baseline.py
import numpy as np

def cpu_matmul(A, B):
    """
    Standard Matrix Multiplication using Triple Nested Loops.
    C[i][j] = sum(A[i][k] * B[k][j])
    Args:
        A: First input matrix (2-D array or nested sequence).
        B: Second input matrix (2-D array or nested sequence).
    Returns:
        The resulting matrix after multiplication C = A * B. Its dtype is the
        NumPy promotion of both input dtypes, so mixing integer and
        floating-point inputs does not truncate the products.
    Raises:
        ValueError: If either input is not 2-D, or if the number of columns
            of A differs from the number of rows of B.
    """

    # asarray avoids copying inputs that are already ndarrays; neither input
    # is modified below, so sharing their memory is safe.
    A = np.asarray(A)
    B = np.asarray(B)

    if A.ndim != 2 or B.ndim != 2:
        raise ValueError("Cannot multiply: Inputs must be 2-D matrices.")

    rows_A, cols_A = A.shape
    rows_B, cols_B = B.shape

    if cols_A != rows_B:
        raise ValueError("Cannot multiply: Dimensions do not match.")

    # Allocate the output in the promoted dtype of both operands; using
    # A.dtype alone would silently truncate when A is integer and B is float.
    C = np.zeros((rows_A, cols_B), dtype=np.result_type(A, B))

    # Deliberately naive O(n^3) loops: this is the unoptimized CPU baseline.
    for i in range(rows_A):
        for j in range(cols_B):
            total = 0
            for k in range(cols_A):
                total += A[i, k] * B[k, j]
            C[i, j] = total

    return C

Writing src/cpu_baseline.py


## CPU Baseline

In [15]:
import sys
import os
import numpy as np

from src.utils import generate_matrices, check_correctness, benchmark_function
from src.cpu_baseline import cpu_matmul

print("Modules imported successfully.")

# Problem size: both operands are N x N.
N = 512
print(f"Part 1: CPU Baseline Benchmark (N={N})")

# Host-side input matrices.
A_host, B_host = generate_matrices(N)

# Time a single run of the triple-loop implementation.
C_cpu, time_cpu = benchmark_function(cpu_matmul, "CPU Naive", A_host, B_host)

# NumPy's own product serves as the reference result.
C_ref = np.dot(A_host, B_host)

print(
    "PASS: CPU implementation matches NumPy reference."
    if check_correctness(C_cpu, C_ref)
    else "FAIL: CPU implementation is incorrect."
)

Modules imported successfully.
Part 1: CPU Baseline Benchmark (N=512)
[CPU Naive] Execution Time: 52450.3938 ms
PASS: CPU implementation matches NumPy reference.
