In [21]:
%%writefile vector_add.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdlib.h>

// Helper macro to check CUDA runtime calls.
// Evaluates `call` exactly once; if the result is not cudaSuccess, prints the
// source location, the numeric error code and the cudaGetErrorString() text
// to stderr, then terminates the process with exit status 1.
// The do { ... } while (0) wrapper makes `CHECK(x);` a single statement, so it
// is safe inside an unbraced if/else. The trailing underscore on the local
// avoids clashing with a caller variable named `error` used inside `call`.
#define CHECK(call) \
do { \
    const cudaError_t error_ = (call); \
    if (error_ != cudaSuccess) \
    { \
        fflush(stdout); /* keep earlier progress output ahead of the error */ \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \
        fprintf(stderr, "code:%d, reason: %s\n", (int)error_, cudaGetErrorString(error_)); \
        exit(1); \
    } \
} while (0)

// Kernel: element-wise vector sum, c[i] = a[i] + b[i] for i in [0, n).
// Each thread derives one flat 1D index from its block and thread
// coordinates and handles at most that one element; threads whose index is
// at or beyond n return without touching memory.
__global__ void vectorAdd(const float *a, const float *b, float *c, int n) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) {
        return;
    }
    const float lhs = a[gid];
    const float rhs = b[gid];
    c[gid] = lhs + rhs;
}

// Host driver for the vectorAdd sample.
// Fills two host vectors, computes a CPU reference sum, runs vectorAdd on the
// GPU, copies the result back and spot-checks the first few elements.
// Returns EXIT_SUCCESS when the spot-check passes, EXIT_FAILURE otherwise
// (any failing CUDA call terminates the process through CHECK).
int main(int argc, char *argv[]) {
    // Command-line arguments are not used by this sample.
    (void)argc;
    (void)argv;

    const int n = 10000000;
    const size_t bytes = (size_t)n * sizeof(float);
    printf("Debug Mode: Vector Add N=%d\n", n);

    // Host Alloc
    float *h_a = (float *)malloc(bytes);
    float *h_b = (float *)malloc(bytes);
    float *h_c = (float *)malloc(bytes);
    float *cpu_ref = (float *)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL || cpu_ref == NULL) {
        fprintf(stderr, "Error: host allocation of %zu bytes failed\n", bytes);
        // free(NULL) is a no-op, so releasing all four is safe here.
        free(h_a); free(h_b); free(h_c); free(cpu_ref);
        return EXIT_FAILURE;
    }

    // Init with non-zero values
    for (int i = 0; i < n; i++) {
        h_a[i] = sinf(i) * 10.0f;
        h_b[i] = cosf(i) * 5.0f;
        cpu_ref[i] = h_a[i] + h_b[i];
    }

    // Device Alloc
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    printf("Allocating GPU memory...\n");
    CHECK(cudaMalloc(&d_a, bytes));
    CHECK(cudaMalloc(&d_b, bytes));
    CHECK(cudaMalloc(&d_c, bytes));

    // Copy TO Device
    printf("Copying to GPU...\n");
    CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    // Kernel Launch: one thread per element, grid size rounded up.
    const int blockSize = 256;
    const int gridSize = (n + blockSize - 1) / blockSize;
    printf("Launching Kernel...\n");

    vectorAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);

    // Launch-configuration errors are reported by cudaGetLastError();
    // faults raised while the kernel runs surface at the synchronizing call.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    // Copy FROM Device
    printf("Copying from GPU...\n");
    CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

    // Verify (debug spot-check: only the first 10 elements are compared)
    printf("Verifying...\n");
    int errors = 0;
    for (int i = 0; i < 10; i++) { // Check first 10
        if (fabsf(h_c[i] - cpu_ref[i]) > 1e-5f) {
            printf("Mismatch at %d: GPU=%.5f vs CPU=%.5f\n", i, h_c[i], cpu_ref[i]);
            errors++;
        }
    }

    if (errors == 0) printf("SUCCESS: GPU calculated correctly.\n");
    else printf("FAILURE: GPU returned zeros or garbage.\n");

    // Cleanup: release device and host buffers before exiting.
    CHECK(cudaFree(d_a));
    CHECK(cudaFree(d_b));
    CHECK(cudaFree(d_c));
    free(h_a); free(h_b); free(h_c); free(cpu_ref);

    // Report verification failure to the caller through the exit status.
    return (errors == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}

Overwriting vector_add.cu


In [22]:
%%writefile multiply_scale.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdlib.h>

// Helper macro to catch CUDA errors immediately.
// Evaluates `call` exactly once; if the result is not cudaSuccess, prints the
// source location, the numeric error code and the cudaGetErrorString() text
// to stderr, then terminates the process with exit status 1.
// The do { ... } while (0) wrapper makes `CHECK(x);` a single statement, so it
// is safe inside an unbraced if/else. The trailing underscore on the local
// avoids clashing with a caller variable named `error` used inside `call`.
#define CHECK(call) \
do { \
    const cudaError_t error_ = (call); \
    if (error_ != cudaSuccess) \
    { \
        fflush(stdout); /* keep earlier progress output ahead of the error */ \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \
        fprintf(stderr, "code:%d, reason: %s\n", (int)error_, cudaGetErrorString(error_)); \
        exit(1); \
    } \
} while (0)

// Kernel: scaled element-wise product, c[i] = alpha * a[i] * b[i]
// for i in [0, n). Each thread computes one flat 1D index from its block and
// thread coordinates and processes at most that element; indices at or past
// n are skipped.
__global__ void multiplyScale(const float *a, const float *b, float *c, float alpha, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;
    }
    const float lhs = a[i];
    const float rhs = b[i];
    // Same evaluation order as the formula above: (alpha * a) * b.
    c[i] = alpha * lhs * rhs;
}

// Host driver for the multiplyScale sample.
// Usage: multiply_scale [N]   (N = element count, default 10000000)
// Fills two host vectors, computes a CPU reference, times one launch of
// multiplyScale with CUDA events, reports the effective bandwidth and compares
// every GPU result against the reference.
// Returns EXIT_SUCCESS when all elements match, EXIT_FAILURE on a bad
// argument, a failed host allocation or a verification mismatch (any failing
// CUDA call terminates the process through CHECK).
int main(int argc, char *argv[]) {
    int n = 10000000;
    if (argc > 1) {
        // Parse with strtol so that non-numeric, non-positive or out-of-int
        // values are rejected up front instead of producing an empty grid or
        // an enormous allocation request.
        char *end = NULL;
        const long requested = strtol(argv[1], &end, 10);
        if (end == argv[1] || requested <= 0 || (long)(int)requested != requested) {
            fprintf(stderr, "Error: N must be a positive integer that fits in int (got '%s')\n", argv[1]);
            return EXIT_FAILURE;
        }
        n = (int)requested;
    }

    const float alpha = 2.5f;
    const size_t bytes = (size_t)n * sizeof(float);

    printf("Multiply & Scale (N=%d, Alpha=%.2f)\n", n, alpha);

    // Host Memory
    float *h_a = (float *)malloc(bytes);
    float *h_b = (float *)malloc(bytes);
    float *h_c = (float *)malloc(bytes);
    float *ref = (float *)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL || ref == NULL) {
        fprintf(stderr, "Error: host allocation of %zu bytes failed\n", bytes);
        // free(NULL) is a no-op, so releasing all four is safe here.
        free(h_a); free(h_b); free(h_c); free(ref);
        return EXIT_FAILURE;
    }

    // Initialize
    for (int i = 0; i < n; i++) {
        h_a[i] = sinf(i);
        h_b[i] = cosf(i);
        ref[i] = alpha * h_a[i] * h_b[i];
    }

    // GPU Memory
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    CHECK(cudaMalloc(&d_a, bytes));
    CHECK(cudaMalloc(&d_b, bytes));
    CHECK(cudaMalloc(&d_c, bytes));

    // Copy to GPU
    CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    // Grid Setup: one thread per element. Written as (n - 1) / B + 1 so the
    // rounding-up cannot overflow int when n is close to INT_MAX (n > 0 here).
    const int blockSize = 256;
    const int gridSize = (n - 1) / blockSize + 1;

    // Timing
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    printf("Launching Kernel...\n");
    CHECK(cudaEventRecord(start));
    multiplyScale<<<gridSize, blockSize>>>(d_a, d_b, d_c, alpha, n);
    CHECK(cudaEventRecord(stop));

    // Launch-configuration errors are reported by cudaGetLastError();
    // faults raised while the kernel runs surface at the synchronizing call,
    // which also guarantees the stop event has completed before it is read.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    float ms = 0;
    CHECK(cudaEventElapsedTime(&ms, start, stop));

    // Three arrays of n floats are touched per launch (a and b read, c written).
    const double bandwidth = (3.0 * (double)n * sizeof(float)) / ((double)ms * 1e6);
    printf("Time: %.3f ms\n", ms);
    printf("Bandwidth: %.2f GB/s\n", bandwidth);

    // Copy back
    CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

    // Verify every element; only the first few mismatches are printed.
    int errors = 0;
    for (int i = 0; i < n; i++) {
        if (fabsf(h_c[i] - ref[i]) > 1e-5f) {
            errors++;
            if (errors < 5) printf("Error index %d: GPU=%.5f vs CPU=%.5f\n", i, h_c[i], ref[i]);
        }
    }

    if (errors == 0) printf("SUCCESS: Passed verification.\n");
    else printf("FAILURE: Found %d errors.\n", errors);

    // Cleanup: events, device buffers, then host buffers.
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    CHECK(cudaFree(d_a));
    CHECK(cudaFree(d_b));
    CHECK(cudaFree(d_c));
    free(h_a); free(h_b); free(h_c); free(ref);

    // Report verification failure to the caller through the exit status.
    return (errors == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}

Overwriting multiply_scale.cu


In [23]:
%%writefile relu.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdlib.h>

// Helper macro to catch CUDA errors immediately.
// Evaluates `call` exactly once; if the result is not cudaSuccess, prints the
// source location, the numeric error code and the cudaGetErrorString() text
// to stderr, then terminates the process with exit status 1.
// The do { ... } while (0) wrapper makes `CHECK(x);` a single statement, so it
// is safe inside an unbraced if/else. The trailing underscore on the local
// avoids clashing with a caller variable named `error` used inside `call`.
#define CHECK(call) \
do { \
    const cudaError_t error_ = (call); \
    if (error_ != cudaSuccess) \
    { \
        fflush(stdout); /* keep earlier progress output ahead of the error */ \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \
        fprintf(stderr, "code:%d, reason: %s\n", (int)error_, cudaGetErrorString(error_)); \
        exit(1); \
    } \
} while (0)

// Kernel: ReLU activation, y[i] = fmaxf(x[i], 0) for i in [0, n).
// Each thread computes one flat 1D index from its block and thread
// coordinates and processes at most that element; indices at or past n are
// skipped.
__global__ void relu(const float *x, float *y, int n) {
    const int pos = threadIdx.x + blockDim.x * blockIdx.x;
    if (pos >= n) {
        return;
    }
    const float value = x[pos];
    y[pos] = fmaxf(value, 0.0f);
}

// Host driver for the relu sample.
// Usage: relu [N]   (N = element count, default 10000000)
// Fills a host vector with mixed-sign values, computes a CPU reference, times
// one launch of relu with CUDA events, reports the effective bandwidth and
// compares every GPU result against the reference.
// Returns EXIT_SUCCESS when all elements match, EXIT_FAILURE on a bad
// argument, a failed host allocation or a verification mismatch (any failing
// CUDA call terminates the process through CHECK).
int main(int argc, char *argv[]) {
    int n = 10000000;
    if (argc > 1) {
        // Parse with strtol so that non-numeric, non-positive or out-of-int
        // values are rejected up front instead of producing an empty grid or
        // an enormous allocation request.
        char *end = NULL;
        const long requested = strtol(argv[1], &end, 10);
        if (end == argv[1] || requested <= 0 || (long)(int)requested != requested) {
            fprintf(stderr, "Error: N must be a positive integer that fits in int (got '%s')\n", argv[1]);
            return EXIT_FAILURE;
        }
        n = (int)requested;
    }

    const size_t bytes = (size_t)n * sizeof(float);
    printf("ReLU Activation (N=%d)\n", n);

    // Host Alloc
    float *h_x = (float *)malloc(bytes);
    float *h_y = (float *)malloc(bytes);
    float *ref = (float *)malloc(bytes);
    if (h_x == NULL || h_y == NULL || ref == NULL) {
        fprintf(stderr, "Error: host allocation of %zu bytes failed\n", bytes);
        // free(NULL) is a no-op, so releasing all three is safe here.
        free(h_x); free(h_y); free(ref);
        return EXIT_FAILURE;
    }

    // Init data (mix of positive and negative)
    int neg_count = 0;
    for (int i = 0; i < n; i++) {
        h_x[i] = sinf(i) * 100.0f - 50.0f; // Range approx [-150, 50]
        ref[i] = fmaxf(h_x[i], 0.0f);
        if (h_x[i] < 0) neg_count++;
    }
    printf("Data initialized (%d negatives)\n", neg_count);

    // GPU Alloc
    float *d_x = NULL, *d_y = NULL;
    CHECK(cudaMalloc(&d_x, bytes));
    CHECK(cudaMalloc(&d_y, bytes));

    CHECK(cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice));

    // Grid setup: one thread per element. Written as (n - 1) / B + 1 so the
    // rounding-up cannot overflow int when n is close to INT_MAX (n > 0 here).
    const int blockSize = 256;
    const int gridSize = (n - 1) / blockSize + 1;

    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    printf("Launching Kernel...\n");
    CHECK(cudaEventRecord(start));
    relu<<<gridSize, blockSize>>>(d_x, d_y, n);
    CHECK(cudaEventRecord(stop));

    // Launch-configuration errors are reported by cudaGetLastError();
    // faults raised while the kernel runs surface at the synchronizing call,
    // which also guarantees the stop event has completed before it is read.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    float ms = 0;
    CHECK(cudaEventElapsedTime(&ms, start, stop));

    // Two arrays of n floats are touched per launch (x read, y written).
    const double bandwidth = (2.0 * (double)n * sizeof(float)) / ((double)ms * 1e6);
    printf("Time: %.3f ms\n", ms);
    printf("Bandwidth: %.2f GB/s\n", bandwidth);

    CHECK(cudaMemcpy(h_y, d_y, bytes, cudaMemcpyDeviceToHost));

    // Verify every element; only the first few mismatches are printed.
    int errors = 0;
    for (int i = 0; i < n; i++) {
        if (fabsf(h_y[i] - ref[i]) > 1e-5f) {
            errors++;
            if (errors < 5) printf("Error at %d: got %f expected %f\n", i, h_y[i], ref[i]);
        }
    }

    if (errors == 0) printf("SUCCESS: Passed verification.\n");
    else printf("FAILURE: Found %d errors.\n", errors);

    // Cleanup: events, device buffers, then host buffers.
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_y));
    free(h_x); free(h_y); free(ref);

    // Report verification failure to the caller through the exit status.
    return (errors == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}

Overwriting relu.cu


In [25]:
#Cell code credits: Gemini
#This cell was used to check if the files were loaded in the correct environment
# Each sample is compiled with nvcc for the sm_75 architecture and then run;
# every binary prints its own progress and verification result.

# Vector addition sample.
!nvcc -arch=sm_75 -o vector_add vector_add.cu
!./vector_add

# Element-wise multiply-and-scale sample.
!nvcc -arch=sm_75 -o multiply_scale multiply_scale.cu
!./multiply_scale

# ReLU activation sample.
!nvcc -arch=sm_75 -o relu relu.cu
!./relu

Debug Mode: Vector Add N=10000000
Allocating GPU memory...
Copying to GPU...
Launching Kernel...
Copying from GPU...
Verifying...
SUCCESS: GPU calculated correctly.
Multiply & Scale (N=10000000, Alpha=2.50)
Launching Kernel...
Time: 0.562 ms
Bandwidth: 213.64 GB/s
SUCCESS: Passed verification.
ReLU Activation (N=10000000)
Data initialized (6666664 negatives)
Launching Kernel...
Time: 0.434 ms
Bandwidth: 184.42 GB/s
SUCCESS: Passed verification.
