In [3]:
%%writefile main.cu
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>

#define CUDA_CHECK(call) do { \
    cudaError_t err = call; \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA error in %s:%d: %s\n", __FILE__, __LINE__, \
                cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
} while(0)

__global__ void ELL_kernel(const float* A, const float* X, float* data_ell,
                           int* indices_ell, float* data_coo, int* row_coo,
                           int* col_coo, float* output_matrix, const int threshold,
                           const int N, const int M, int* global_coo_counter) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= N) return;

    int counter = 0;

    for (int col = 0; col < M; ++col) {
        float val = A[row * M + col];
        if (val != 0) {
            if (counter < threshold) {
                data_ell[counter * N + row] = val;
                indices_ell[counter * N + row] = col;
                counter++;
            } else {
                int coo_index = atomicAdd(global_coo_counter, 1);
                data_coo[coo_index] = val;
                row_coo[coo_index] = row;
                col_coo[coo_index] = col;
            }
        }
    }

    for (int i = counter; i < threshold; ++i) {
        data_ell[i * N + row] = 0;
        indices_ell[i * N + row] = -1;
    }

    float acc = 0.0f;
    for (int p = 0; p < threshold; ++p) {
        int index = indices_ell[p * N + row];
        if (index != -1) {
            acc += data_ell[p * N + row] * X[index];
        }
    }

    // Add COO contribution
    for (int i = 0; i < *global_coo_counter; ++i) {
        if (row_coo[i] == row) {
            acc += data_coo[i] * X[col_coo[i]];
        }
    }

    output_matrix[row] = acc;
}

int main() {
    const int N = 1000;        // Rows (will be replaced by Python)
    const int M = 1000;        // Cols (will be replaced by Python)
    const int threshold = 20;  // ELL slots per row (will be replaced by Python)

    float* A = new float[N * M];
    float* data_ell = new float[N * threshold]();
    float* data_coo = new float[N * M]();
    int* indices_ell = new int[N * threshold]();
    int* row_coo = new int[N * M]();
    int* col_coo = new int[N * M]();
    float* X = new float[M];
    float* output_matrix = new float[N];

    int* d_global_coo_counter;
    CUDA_CHECK(cudaMalloc(&d_global_coo_counter, sizeof(int)));
    CUDA_CHECK(cudaMemset(d_global_coo_counter, 0, sizeof(int)));

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            A[i * M + j] = (i + j) % 3 == 0 ? i + j : 0;
        }
    }
    for (int i = 0; i < M; i++) X[i] = 1.0f;

    float *d_A, *d_X, *d_data_ell, *d_data_coo, *d_output_matrix;
    int *d_indices_ell, *d_row_coo, *d_col_coo;

    CUDA_CHECK(cudaMalloc(&d_A, N * M * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_X, M * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_data_ell, N * threshold * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_data_coo, N * M * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_indices_ell, N * threshold * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_row_coo, N * M * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_col_coo, N * M * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_output_matrix, N * sizeof(float)));

    CUDA_CHECK(cudaMemcpy(d_A, A, N * M * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_X, X, M * sizeof(float), cudaMemcpyHostToDevice));

    int block_size = 256;
    int num_blocks = (N + block_size - 1) / block_size;

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start));

    ELL_kernel<<<num_blocks, block_size>>>(d_A, d_X, d_data_ell, d_indices_ell,
                                           d_data_coo, d_row_coo, d_col_coo,
                                           d_output_matrix, threshold, N, M, d_global_coo_counter);

    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
    std::cout << "CUDA kernel time: " << milliseconds / 1000.0 << " seconds" << std::endl;

    CUDA_CHECK(cudaMemcpy(data_ell, d_data_ell, N * threshold * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(data_coo, d_data_coo, N * M * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(indices_ell, d_indices_ell, N * threshold * sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(row_coo, d_row_coo, N * M * sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(col_coo, d_col_coo, N * M * sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(output_matrix, d_output_matrix, N * sizeof(float), cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    int h_global_coo_counter;
    CUDA_CHECK(cudaMemcpy(&h_global_coo_counter, d_global_coo_counter, sizeof(int), cudaMemcpyDeviceToHost));
    for (int i = 0; i < 10; ++i) {
        std::cout << "COO[" << i << "]: val = " << data_coo[i] << ", row = " << row_coo[i] << ", col = " << col_coo[i] << std::endl;
    }

    FILE *output_file = fopen("cuda_results.txt", "w");
    if (output_file == nullptr) {
        std::cerr << "Failed to open output file!" << std::endl;
        return EXIT_FAILURE;
    }
    for (int i = 0; i < N; i++) fprintf(output_file, "%.10f\n", output_matrix[i]);
    fclose(output_file);
    std::cout << "Wrote " << N << " values to cuda_results.txt" << std::endl;

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_X));
    CUDA_CHECK(cudaFree(d_data_ell));
    CUDA_CHECK(cudaFree(d_data_coo));
    CUDA_CHECK(cudaFree(d_indices_ell));
    CUDA_CHECK(cudaFree(d_row_coo));
    CUDA_CHECK(cudaFree(d_col_coo));
    CUDA_CHECK(cudaFree(d_output_matrix));

    delete[] A;
    delete[] data_ell;
    delete[] data_coo;
    delete[] indices_ell;
    delete[] row_coo;
    delete[] col_coo;
    delete[] X;
    delete[] output_matrix;
    return 0;
}


Writing main.cu


In [4]:
%%writefile run_benchmark.py
import subprocess, numpy as np, torch, time, matplotlib.pyplot as plt, psutil, os, gc

def get_memory_info_gb():
    m = psutil.virtual_memory()
    return m.used / (1024**3), m.total / (1024**3)

def estimate_memory_usage_gb(N, M):
    # ~1/3 nonzeros for (i+j)%3==0, PyTorch uses 64-bit indices, float32 values
    nnz = (N * M) // 3
    bytes_per_nnz = 8 + 8 + 4  # row idx + col idx + value
    return nnz * bytes_per_nnz / (1024**3)

def compile_cuda_program():
    subprocess.run(["nvcc", "mainy.cu", "-O3", "-o", "mainy"], check=True)

def run_cuda_program(N, M):
    with open("main.cu", "r") as f:
        content = f.read()

    content = content.replace("const int N = 1000;", f"const int N = {N};")
    content = content.replace("const int M = 1000;", f"const int M = {M};")

    thr = max(1, min(M, int(np.floor(M/3))))
    content = content.replace("const int threshold = 20;", f"const int threshold = {thr};")

    with open("mainy.cu", "w") as f:
        f.write(content)

    compile_cuda_program()
    result = subprocess.run(["./mainy"], capture_output=True, text=True)
    out = result.stdout
    line = [l for l in out.splitlines() if "CUDA kernel time:" in l][0]
    secs = float(line.split(":")[1].strip().split()[0])
    return secs

def create_sparse_matrix_and_vector(N, M, device):
    # Build the same pattern as CUDA: (i+j)%3==0 ? i+j : 0
    # Chunked to keep memory reasonable while constructing indices/values
    chunk_rows = max(1, 10_000 // max(1, M))
    idx_rows, idx_cols, vals = [], [], []

    for r0 in range(0, N, chunk_rows):
        r1 = min(N, r0 + chunk_rows)
        for i in range(r0, r1):
            # Positions j where (i+j)%3==0 -> j ≡ -i (mod 3)
            start_j = (3 - (i % 3)) % 3
            for j in range(start_j, M, 3):
                idx_rows.append(i)
                idx_cols.append(j)
                vals.append(float(i + j))

    indices = torch.tensor([idx_rows, idx_cols], dtype=torch.long, device=device)
    values  = torch.tensor(vals, dtype=torch.float32, device=device)
    A = torch.sparse_coo_tensor(indices, values, (N, M), device=device).coalesce()
    X = torch.ones(M, 1, dtype=torch.float32, device=device)
    return A, X

def run_torch_program(N, M, iters=20):
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA GPU not available for PyTorch.")
    device = torch.device("cuda")
    A, X = create_sparse_matrix_and_vector(N, M, device)

    # Warmup
    torch.cuda.synchronize()
    _ = torch.sparse.mm(A, X)
    torch.cuda.synchronize()

    times_ms = []
    for _ in range(iters):
        torch.cuda.synchronize()
        start = torch.cuda.Event(enable_timing=True)
        end   = torch.cuda.Event(enable_timing=True)
        start.record()
        Y = torch.sparse.mm(A, X)
        end.record()
        torch.cuda.synchronize()
        times_ms.append(start.elapsed_time(end))
    del A, X, Y
    torch.cuda.empty_cache()
    return float(np.mean(times_ms) / 1000.0)

def main():
    sizes = [(500,500), (1000,1000), (2000,2000), (3000,3000)]
    used, total = get_memory_info_gb()
    print(f"System memory: used {used:.2f} GB / total {total:.2f} GB")
    print(f"PyTorch CUDA available: {torch.cuda.is_available()}")

    labels, cuda_times, torch_times = [], [], []
    for (N, M) in sizes:
        est = estimate_memory_usage_gb(N, M)
        safe = est <= total * 0.5   # skip if too big for sparse tensor construction
        print(f"\nSize {N}x{M} | estimated sparse memory ~ {est:.2f} GB | safe: {safe}")
        if not safe:
            print("Skipping PyTorch for this size due to memory estimate.")
        labels.append(f"{N}x{M}")

        # CUDA (always runs – small dense arrays in CUDA code are on device)
        try:
            t_cuda = run_cuda_program(N, M)
            cuda_times.append(t_cuda)
            print(f"Custom CUDA time: {t_cuda:.6f} s")
        except Exception as e:
            print(f"CUDA run failed: {e}")
            cuda_times.append(None)

        # PyTorch sparse timing (only if safe)
        if safe:
            try:
                t_torch = run_torch_program(N, M, iters=20)
                torch_times.append(t_torch)
                print(f"PyTorch sparse time: {t_torch:.6f} s")
            except Exception as e:
                print(f"PyTorch run failed: {e}")
                torch_times.append(None)
        else:
            torch_times.append(None)

        gc.collect()
        torch.cuda.empty_cache()

    # Plot
    import matplotlib.pyplot as plt
    plt.figure(figsize=(7,4))
    plt.plot(labels, cuda_times, marker="o", label="Custom CUDA (ELL+COO)")
    plt.plot(labels, torch_times, marker="s", label="PyTorch sparse.mm")
    plt.xlabel("Matrix size")
    plt.ylabel("Time (s)")
    plt.title("SpMV performance comparison")
    plt.legend()
    plt.xticks(rotation=30)
    plt.grid(True, linestyle="--", alpha=0.3)
    plt.show()

if __name__ == "__main__":
    main()


Writing run_benchmark.py


In [2]:
# Step 1: Install CUDA compiler (nvcc)
!apt-get update -qq
!apt-get install -y cuda-toolkit-11-8
!nvcc --version


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
E: Unable to locate package cuda-toolkit-11-8
/bin/bash: line 1: nvcc: command not found
