# Setup CUDA Directory Environment

In [1]:
import os

# Add the directory containing the executable to the PATH
os.environ["PATH"] += os.pathsep + "/usr/local/cuda/bin"

# Check if the directory is added to the PATH
print(os.environ["PATH"])

/opt/tljh/user/bin:/bin:/usr/bin:/usr/local/cuda/bin


## C program


In [18]:
%%writefile C_mvp.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

void c_version(float* A, float* x, float* y, int m, int n) {
    int i, j;

    // The result vector 'y' must be initialized to zero before summation.
    for (i = 0; i < m; i++) {
        y[i] = 0.0f;
    }

    // Standard Matrix-Vector Multiplication loop structure:
    // y_i = sum_j ( A_ij * x_j )
    for (i = 0; i < m; i++) { // Loop over rows of A (index for y)
        float sum = 0.0f;
        for (j = 0; j < n; j++) { // Loop over columns of A (index for x)
            // A[i * n + j] accesses the element A_ij in row-major order
            sum += A[i * n + j] * x[j];
        }
        y[i] = sum;
    }
}

int main(int argc, char** argv)
{
    const size_t M = 1 << 14; 
    const size_t N = M;       

    const size_t MATRIX_SIZE = M * N; 
    const size_t VECTOR_SIZE = N;     

    const size_t MATRIX_BYTES = MATRIX_SIZE * sizeof(float);
    const size_t VECTOR_BYTES = VECTOR_SIZE * sizeof(float);

    const size_t loope = 30;

    //declare array 
    float *A, *x, *y;

    A = (float*)malloc(MATRIX_BYTES);
    x = (float*)malloc(VECTOR_BYTES);
    y = (float*)malloc(VECTOR_BYTES);

    // Check for allocation failure 
    if (A == NULL || x == NULL || y == NULL) {
        // Use fprintf(stderr, ...) like daxpy
        fprintf(stderr, "Failed to allocate memory for matrix/vectors.\n");
        // Free any that might have succeeded
        free(A);
        free(x);
        free(y);
        return 1; // Exit with an error code
    }

  //timer variables
    clock_t start, end;

    // Initialize Matrix A:
    printf("Initializing Matrix A (%lu elements, %lu x %lu)...\n", MATRIX_SIZE, M, N);
    for (size_t i = 0; i < M; i++) {
        for (size_t j = 0; j < N; j++) {
            // A[i * n + j] = 1.0 / (i + j + 1.0)
            A[i * N + j] = 1.0f / ((float)i + (float)j + 1.0f);
        }
    }

    // Initialize Vector x:
    printf("Initializing Vector x (%lu elements)...\n", VECTOR_SIZE);
    for (size_t j = 0; j < N; j++) {
        x[j] = sinf((float)j * 0.01f) * cosf((float)j * 0.007f) + 1.0f;
    }
    printf("Initialization complete.\n");


    // fill-in cache
    c_version(A, x, y, M, N);

    // time here***
    double elapse = 0.0, time_taken;
    
    for (size_t i = 0; i < loope; i++){
        start = clock();
        c_version(A, x, y, M, N);
        end = clock();
        time_taken = ((double)(end - start)) * 1E3 / CLOCKS_PER_SEC;
        elapse += time_taken;
    }

    printf("\n--- Benchmark Results ---\n");
    printf("Function c_version (in C) average time for %lu loops is %f milliseconds for matrix (m=%lu, n=%lu)\n", 
           loope, elapse / (double)loope, M, N);

    printf("\n--- Output Vector Y (m=%lu) ---\n", M);
    
    // Print first 3 elements
    printf("First 3 elements:\n");
    for(size_t i = 0; i < 3 && i < M; i++) {
        printf("y[%lu] = %f\n", i, y[i]);
    }

    // Print last 3 elements
    printf("\nLast 3 elements:\n");
    size_t start_index = (M > 3) ? (M - 3) : 0; 
    for(size_t i = start_index; i < M; i++) {
        printf("y[%lu] = %f\n", i, y[i]);
    }

    // error checking routine here -- 
    printf("\nStarting error check...\n");
    size_t err_count = 0;
    for (size_t i = 0; i < M; i++){ // For each element in y
        // Recalculate the correct value for y[i]
        float correct_y = 0.0f;
        for (size_t j = 0; j < N; j++) {
            correct_y += A[i * N + j] * x[j];
        }

        float epsilon = 1e-5f; // A small tolerance for floating point errors
        if (fabsf(y[i] - correct_y) > epsilon) {
            err_count++;
        }
    }
    printf("Error count (C program): %lu\n", err_count);

    // Free memory 
    free(A);
    free(x);
    free(y);
    return 0;
}

Overwriting C_mvp.c


In [19]:
%%bash
gcc C_mvp.c -o C_mvp -lm

In [20]:
%%bash
./C_mvp

Initializing Matrix A (268435456 elements, 16384 x 16384)...
Initializing Vector x (16384 elements)...
Initialization complete.

--- Benchmark Results ---
Function c_version (in C) average time for 30 loops is 2078.384433 milliseconds for matrix (m=16384, n=16384)

--- Output Vector Y (m=16384) ---
First 3 elements:
y[0] = 11.800177
y[1] = 10.766556
y[2] = 10.237693

Last 3 elements:
y[16381] = 0.703410
y[16382] = 0.703379
y[16383] = 0.703348

Starting error check...
Error count (C program): 0


## CUDA Grid-stride loop

In [23]:
%%writefile CUDA_mvp2.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

__global__
void mvp_kernel(const float* A, const float* x, float* y, size_t m, size_t n)
{
    size_t index = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;

    for (size_t i = index; i < m; i += stride) {
        float sum = 0.0f;
        for (size_t j = 0; j < n; j++) {
            // A[i * n + j] accesses the element A_ij
            sum += A[i * n + j] * x[j];
        }
        y[i] = sum;
    }
}

int main(int argc, char** argv)
{
    const size_t M = 1 << 14; 
    const size_t N = M;       

    const size_t MATRIX_SIZE = M * N; 
    const size_t VECTOR_SIZE = N;     

    const size_t MATRIX_BYTES = MATRIX_SIZE * sizeof(float);
    const size_t VECTOR_BYTES = VECTOR_SIZE * sizeof(float);

    const size_t loope = 30;

    //declare array 
    float *A, *x, *y;

    // Allocate managed memory 
    cudaMallocManaged(&A, MATRIX_BYTES);
    cudaMallocManaged(&x, VECTOR_BYTES);
    cudaMallocManaged(&y, VECTOR_BYTES);

    // Check for allocation failure
    if (A == NULL || x == NULL || y == NULL) {
        fprintf(stderr, "Failed to allocate memory for matrix/vectors.\n");
        cudaFree(A);
        cudaFree(x);
        cudaFree(y);
        return 1;
    }

    // ***--- initialize your array here ---------
    printf("Initializing Matrix A (%lu elements, %lu x %lu)...\n", MATRIX_SIZE, M, N);
    for (size_t i = 0; i < M; i++) {
        for (size_t j = 0; j < N; j++) {
            A[i * N + j] = 1.0f / ((float)i + (float)j + 1.0f);
        }
    }
    printf("Initializing Vector x (%lu elements)...\n", VECTOR_SIZE);
    for (size_t j = 0; j < N; j++) {
        x[j] = sinf((float)j * 0.01f) * cosf((float)j * 0.007f) + 1.0f;
    }
    printf("Initialization complete.\n");

    // *** setup CUDA kernel
    size_t numThreads = 1024; 
    size_t numBlocks = (M + numThreads - 1) / numThreads;

    printf("\n*** function = Matrix Vector Product (CUDA)\n");
    printf("Matrix (m x n) = %lu x %lu\n", M, N);
    printf("numBlocks = %lu, numThreads = %lu \n", numBlocks, numThreads);

    for (size_t i = 0; i < loope; i++){
        mvp_kernel<<<numBlocks, numThreads>>>(A, x, y, M, N);
    }

    //barrier
    cudaDeviceSynchronize();

    printf("\n--- Output Vector Y (m=%lu) ---\n", M);
    
    // Print first 3 elements
    printf("First 3 elements:\n");
    for(size_t i = 0; i < 3 && i < M; i++) {
        printf("y[%lu] = %f\n", i, y[i]);
    }

    // Print last 3 elements
    printf("\nLast 3 elements:\n");
    size_t start_index = (M > 3) ? (M - 3) : 0; 
    for(size_t i = start_index; i < M; i++) {
        printf("y[%lu] = %f\n", i, y[i]);
    }


    // error checking routine here 
    printf("\nStarting error check...\n");
    size_t err_count = 0;
    for (size_t i = 0; i < M; i++){ 
        float correct_y = 0.0f;
        for (size_t j = 0; j < N; j++) {
            correct_y += A[i * N + j] * x[j];
        }
        float epsilon = 1e-5f; // A small tolerance for floating point errors
        if (fabsf(y[i] - correct_y) > epsilon) {
            err_count++;
        }
    }
    printf("Error count (CUDA program): %lu\n", err_count);

    // Free memory 
    cudaFree(A);
    cudaFree(x);
    cudaFree(y);
    
    return 0;
}


Overwriting CUDA_mvp2.cu


In [24]:
%%bash
nvcc CUDA_mvp2.cu -o CUDA_mvp2 -lm -Wno-deprecated-gpu-targets

In [25]:
%%bash
nvprof ./CUDA_mvp2

==1047855== NVPROF is profiling process 1047855, command: ./CUDA_mvp2


Initializing Matrix A (268435456 elements, 16384 x 16384)...
Initializing Vector x (16384 elements)...
Initialization complete.

*** function = Matrix Vector Product (CUDA)
Matrix (m x n) = 16384 x 16384
numBlocks = 16, numThreads = 1024 

--- Output Vector Y (m=16384) ---
First 3 elements:
y[0] = 11.800177
y[1] = 10.766556
y[2] = 10.237693

Last 3 elements:
y[16381] = 0.703410
y[16382] = 0.703379
y[16383] = 0.703348

Starting error check...
Error count (CUDA program): 0


==1047855== Profiling application: ./CUDA_mvp2
==1047855== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  1.79134s        30  59.711ms  42.401ms  484.03ms  mvp_kernel(float const *, float const *, float*, unsigned long, unsigned long)
      API calls:   52.16%  2.08889s         3  696.30ms  198.69us  2.08756s  cudaMallocManaged
                   44.79%  1.79343s         1  1.79343s  1.79343s  1.79343s  cudaDeviceSynchronize
                    2.73%  109.37ms         3  36.457ms  656.90us  107.73ms  cudaFree
                    0.27%  10.679ms        30  355.98us  13.560us  9.9582ms  cudaLaunchKernel
                    0.04%  1.4876ms       114  13.048us     163ns  587.47us  cuDeviceGetAttribute
                    0.01%  423.36us         1  423.36us  423.36us  423.36us  cuDeviceGetName
                    0.00%  69.786us         1  69.786us  69.786us  69.786us  cuDeviceTotalMem
                    0.00% 

In [1]:
%%bash
nsys profile  -o CUDA_mvp2 ./CUDA_mvp2

bash: line 1: nsys: command not found


CalledProcessError: Command 'b'nsys profile  -o CUDA_mvp2 ./CUDA_mvp2\n'' returned non-zero exit status 127.

## CUDA Grid-stride loop with prefetching 

In [6]:
%%writefile CUDA_mvp3.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

__global__
void mvp_kernel(const float* A, const float* x, float* y, size_t m, size_t n)
{
    size_t index = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;

    for (size_t i = index; i < m; i += stride) {
        float sum = 0.0f;
        for (size_t j = 0; j < n; j++) {
            // A[i * n + j] accesses the element A_ij
            sum += A[i * n + j] * x[j];
        }
        y[i] = sum;
    }
}

int main(int argc, char** argv)
{
    const size_t M = 1 << 14; 
    const size_t N = M;       

    const size_t MATRIX_SIZE = M * N; 
    const size_t VECTOR_SIZE = N;     

    const size_t MATRIX_BYTES = MATRIX_SIZE * sizeof(float);
    const size_t VECTOR_BYTES = VECTOR_SIZE * sizeof(float);

    const size_t loope = 30;

    //declare array 
    float *A, *x, *y;

    // Allocate managed memory 
    cudaMallocManaged(&A, MATRIX_BYTES);
    cudaMallocManaged(&x, VECTOR_BYTES);
    cudaMallocManaged(&y, VECTOR_BYTES);

    // Check for allocation failure
    if (A == NULL || x == NULL || y == NULL) {
        fprintf(stderr, "Failed to allocate memory for matrix/vectors.\n");
        cudaFree(A);
        cudaFree(x);
        cudaFree(y);
        return 1;
    }

    //get gpu id
    int device = -1;
    cudaGetDevice(&device);
    
    // ***--- initialize your array here ---------
    printf("Initializing Matrix A (%lu elements, %lu x %lu)...\n", MATRIX_SIZE, M, N);
    for (size_t i = 0; i < M; i++) {
        for (size_t j = 0; j < N; j++) {
            A[i * N + j] = 1.0f / ((float)i + (float)j + 1.0f);
        }
    }
    printf("Initializing Vector x (%lu elements)...\n", VECTOR_SIZE);
    for (size_t j = 0; j < N; j++) {
        x[j] = sinf((float)j * 0.01f) * cosf((float)j * 0.007f) + 1.0f;
    }
    printf("Initialization complete.\n");

    //"Prefetch data" from CPU-GPU
    cudaMemPrefetchAsync(A, MATRIX_BYTES, device, NULL);
    cudaMemPrefetchAsync(x, VECTOR_BYTES, device, NULL);
    
    // *** setup CUDA kernel
    size_t numThreads = 1024; 
    size_t numBlocks = (M + numThreads - 1) / numThreads;

    printf("\n*** function = Matrix Vector Product (CUDA)\n");
    printf("Matrix (m x n) = %lu x %lu\n", M, N);
    printf("numBlocks = %lu, numThreads = %lu \n", numBlocks, numThreads);

    for (size_t i = 0; i < loope; i++){
        mvp_kernel<<<numBlocks, numThreads>>>(A, x, y, M, N);
    }

    //barrier
    cudaDeviceSynchronize();

    //"Prefetch data" from GPU-CPU
    cudaMemPrefetchAsync(A, MATRIX_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(x, VECTOR_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(y, VECTOR_BYTES, cudaCpuDeviceId, NULL);

    // --- Display first 3 and last 3 elements 
    printf("\n--- Output Vector Y (m=%lu) ---\n", M);
    
    // Print first 3 elements
    printf("First 3 elements:\n");
    for(size_t i = 0; i < 3 && i < M; i++) {
        printf("y[%lu] = %f\n", i, y[i]);
    }

    // Print last 3 elements
    printf("\nLast 3 elements:\n");
    size_t start_index = (M > 3) ? (M - 3) : 0; 
    for(size_t i = start_index; i < M; i++) {
        printf("y[%lu] = %f\n", i, y[i]);
    }

    // error checking routine here 
    printf("\nStarting error check...\n");
    size_t err_count = 0;
    for (size_t i = 0; i < M; i++){ 
        float correct_y = 0.0f;
        for (size_t j = 0; j < N; j++) {
            correct_y += A[i * N + j] * x[j];
        }
        float epsilon = 1e-5f; // A small tolerance for floating point errors
        if (fabsf(y[i] - correct_y) > epsilon) {
            err_count++;
        }
    }
    printf("Error count (CUDA program): %lu\n", err_count);

    // Free memory 
    cudaFree(A);
    cudaFree(x);
    cudaFree(y);
    
    return 0;
}


Overwriting CUDA_mvp3.cu


In [7]:
%%bash
nvcc CUDA_mvp3.cu -o CUDA_mvp3 -lm -Wno-deprecated-gpu-targets

In [8]:
%%bash
nvprof ./CUDA_mvp3

==1060552== NVPROF is profiling process 1060552, command: ./CUDA_mvp3


Initializing Matrix A (268435456 elements, 16384 x 16384)...
Initializing Vector x (16384 elements)...
Initialization complete.

*** function = Matrix Vector Product (CUDA)
Matrix (m x n) = 16384 x 16384
numBlocks = 16, numThreads = 1024 

--- Output Vector Y (m=16384) ---
First 3 elements:
y[0] = 11.800177
y[1] = 10.766556
y[2] = 10.237693

Last 3 elements:
y[16381] = 0.703410
y[16382] = 0.703379
y[16383] = 0.703348

Starting error check...
Error count (CUDA program): 0


==1060552== Profiling application: ./CUDA_mvp3
==1060552== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  866.34ms        30  28.878ms  27.642ms  30.216ms  mvp_kernel(float const *, float const *, float*, unsigned long, unsigned long)
      API calls:   55.84%  1.97716s         3  659.05ms  22.848us  1.97659s  cudaMallocManaged
                   24.54%  868.90ms         1  868.90ms  868.90ms  868.90ms  cudaDeviceSynchronize
                   15.49%  548.39ms         5  109.68ms  124.60us  437.56ms  cudaMemPrefetchAsync
                    2.60%  92.084ms         3  30.695ms  384.29us  90.848ms  cudaFree
                    1.48%  52.479ms        30  1.7493ms  11.521us  51.744ms  cudaLaunchKernel
                    0.03%  1.0209ms       114  8.9550us     136ns  395.08us  cuDeviceGetAttribute
                    0.01%  383.52us         1  383.52us  383.52us  383.52us  cuDeviceGetName
                    0.

In [9]:
%%bash
nsys profile  -o CUDA_mvp3 ./CUDA_mvp3

         This may increase runtime overhead and the likelihood of false
         dependencies across CUDA Streams. If you wish to avoid this, please
         disable the feature with --cuda-event-trace=false.
Try the 'nsys status --environment' command to learn more.

Try the 'nsys status --environment' command to learn more.



Initializing Matrix A (268435456 elements, 16384 x 16384)...
Initializing Vector x (16384 elements)...
Initialization complete.

*** function = Matrix Vector Product (CUDA)
Matrix (m x n) = 16384 x 16384
numBlocks = 16, numThreads = 1024 

--- Output Vector Y (m=16384) ---
First 3 elements:
y[0] = 11.800177
y[1] = 10.766556
y[2] = 10.237693

Last 3 elements:
y[16381] = 0.703410
y[16382] = 0.703379
y[16383] = 0.703348

Starting error check...
Error count (CUDA program): 0


Failed to create '/home/jupyter-gerald_corpuz@dlsu-94f82/CUDA_mvp3.nsys-rep': File exists.
Use `--force-overwrite true` to overwrite existing files.


Collecting data...
Generating '/tmp/nsys-report-b846.qdstrm'
Generated:
	/tmp/nsys-report-6055.nsys-rep


### CUDA Grid-stride loop with prefetch and page creation

In [30]:
%%writefile CUDA_mvp4.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

__global__
void mvp_kernel(const float* A, const float* x, float* y, size_t m, size_t n)
{
    size_t index = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;

    for (size_t i = index; i < m; i += stride) {
        float sum = 0.0f;
        for (size_t j = 0; j < n; j++) {
            // A[i * n + j] accesses the element A_ij
            sum += A[i * n + j] * x[j];
        }
        y[i] = sum;
    }
}

int main(int argc, char** argv)
{
    const size_t M = 1 << 14; 
    const size_t N = M;       

    const size_t MATRIX_SIZE = M * N; 
    const size_t VECTOR_SIZE = N;     

    const size_t MATRIX_BYTES = MATRIX_SIZE * sizeof(float);
    const size_t VECTOR_BYTES = VECTOR_SIZE * sizeof(float);

    const size_t loope = 30;

    //declare array 
    float *A, *x, *y;

    // Allocate managed memory 
    cudaMallocManaged(&A, MATRIX_BYTES);
    cudaMallocManaged(&x, VECTOR_BYTES);
    cudaMallocManaged(&y, VECTOR_BYTES);

    // Check for allocation failure
    if (A == NULL || x == NULL || y == NULL) {
        fprintf(stderr, "Failed to allocate memory for matrix/vectors.\n");
        cudaFree(A);
        cudaFree(x);
        cudaFree(y);
        return 1;
    }

    //get gpu id
    int device = -1;
    cudaGetDevice(&device);

    //"prefetch data" to create CPU page memory
    cudaMemPrefetchAsync(A,MATRIX_BYTES,cudaCpuDeviceId,NULL);
    cudaMemPrefetchAsync(x,VECTOR_BYTES,cudaCpuDeviceId,NULL);
    //"prefetch data" to create GPU page memory
    cudaMemPrefetchAsync(y,VECTOR_BYTES,device,NULL);

    // ***--- initialize your array here ---------
    printf("Initializing Matrix A (%lu elements, %lu x %lu)...\n", MATRIX_SIZE, M, N);
    for (size_t i = 0; i < M; i++) {
        for (size_t j = 0; j < N; j++) {
            A[i * N + j] = 1.0f / ((float)i + (float)j + 1.0f);
        }
    }
    printf("Initializing Vector x (%lu elements)...\n", VECTOR_SIZE);
    for (size_t j = 0; j < N; j++) {
        x[j] = sinf((float)j * 0.01f) * cosf((float)j * 0.007f) + 1.0f;
    }
    printf("Initialization complete.\n");

    //"Prefetch data" from CPU-GPU
    cudaMemPrefetchAsync(A, MATRIX_BYTES, device, NULL);
    cudaMemPrefetchAsync(x, VECTOR_BYTES, device, NULL);
    
    // *** setup CUDA kernel
    size_t numThreads = 1024; 
    size_t numBlocks = (M + numThreads - 1) / numThreads;

    printf("\n*** function = Matrix Vector Product (CUDA)\n");
    printf("Matrix (m x n) = %lu x %lu\n", M, N);
    printf("numBlocks = %lu, numThreads = %lu \n", numBlocks, numThreads);

    for (size_t i = 0; i < loope; i++){
        mvp_kernel<<<numBlocks, numThreads>>>(A, x, y, M, N);
    }

    //barrier
    cudaDeviceSynchronize();

    //"Prefetch data" from GPU-CPU
    cudaMemPrefetchAsync(A, MATRIX_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(x, VECTOR_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(y, VECTOR_BYTES, cudaCpuDeviceId, NULL);

    // --- Display first 3 and last 3 elements 
    printf("\n--- Output Vector Y (m=%lu) ---\n", M);
    
    // Print first 3 elements
    printf("First 3 elements:\n");
    for(size_t i = 0; i < 3 && i < M; i++) {
        printf("y[%lu] = %f\n", i, y[i]);
    }

    // Print last 3 elements
    printf("\nLast 3 elements:\n");
    size_t start_index = (M > 3) ? (M - 3) : 0; 
    for(size_t i = start_index; i < M; i++) {
        printf("y[%lu] = %f\n", i, y[i]);
    }

    // error checking routine here 
    printf("\nStarting error check...\n");
    size_t err_count = 0;
    for (size_t i = 0; i < M; i++){ 
        float correct_y = 0.0f;
        for (size_t j = 0; j < N; j++) {
            correct_y += A[i * N + j] * x[j];
        }
        float epsilon = 1e-5f; // A small tolerance for floating point errors
        if (fabsf(y[i] - correct_y) > epsilon) {
            err_count++;
        }
    }
    printf("Error count (CUDA program): %lu\n", err_count);

    // Free memory 
    cudaFree(A);
    cudaFree(x);
    cudaFree(y);
    
    return 0;
}


Writing CUDA_mvp4.cu


In [31]:
%%bash
nvcc CUDA_mvp4.cu -o CUDA_mvp4 -lm -Wno-deprecated-gpu-targets

In [32]:
%%bash
nvprof ./CUDA_mvp4

==1048884== NVPROF is profiling process 1048884, command: ./CUDA_mvp4


Initializing Matrix A (268435456 elements, 16384 x 16384)...
Initializing Vector x (16384 elements)...
Initialization complete.

*** function = Matrix Vector Product (CUDA)
Matrix (m x n) = 16384 x 16384
numBlocks = 16, numThreads = 1024 

--- Output Vector Y (m=16384) ---
First 3 elements:
y[0] = 11.800177
y[1] = 10.766556
y[2] = 10.237693

Last 3 elements:
y[16381] = 0.703410
y[16382] = 0.703379
y[16383] = 0.703348

Starting error check...
Error count (CUDA program): 0


==1048884== Profiling application: ./CUDA_mvp4
==1048884== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  858.08ms        30  28.603ms  13.258ms  246.49ms  mvp_kernel(float const *, float const *, float*, unsigned long, unsigned long)
      API calls:   37.21%  1.73487s         8  216.86ms  268.88us  1.19062s  cudaMemPrefetchAsync
                   35.44%  1.65244s         3  550.81ms  67.413us  1.65152s  cudaMallocManaged
                   19.73%  919.92ms         1  919.92ms  919.92ms  919.92ms  cudaDeviceSynchronize
                    3.82%  178.08ms        30  5.9361ms  19.024us  176.98ms  cudaLaunchKernel
                    3.77%  175.62ms         3  58.540ms  728.49us  172.52ms  cudaFree
                    0.03%  1.3627ms       114  11.953us     129ns  576.11us  cuDeviceGetAttribute
                    0.01%  523.28us         1  523.28us  523.28us  523.28us  cuDeviceGetName
                    0.

In [33]:
%%bash
nsys profile  -o CUDA_mvp4 ./CUDA_mvp4

         This may increase runtime overhead and the likelihood of false
         dependencies across CUDA Streams. If you wish to avoid this, please
         disable the feature with --cuda-event-trace=false.
Try the 'nsys status --environment' command to learn more.

Try the 'nsys status --environment' command to learn more.



Initializing Matrix A (268435456 elements, 16384 x 16384)...
Initializing Vector x (16384 elements)...
Initialization complete.

*** function = Matrix Vector Product (CUDA)
Matrix (m x n) = 16384 x 16384
numBlocks = 16, numThreads = 1024 

--- Output Vector Y (m=16384) ---
First 3 elements:
y[0] = 11.800177
y[1] = 10.766556
y[2] = 10.237693

Last 3 elements:
y[16381] = 0.703410
y[16382] = 0.703379
y[16383] = 0.703348

Starting error check...
Error count (CUDA program): 0
Collecting data...
Generating '/tmp/nsys-report-6cff.qdstrm'
Generated:
	/home/jupyter-gerald_corpuz@dlsu-94f82/CUDA_mvp4.nsys-rep


### CUDA Grid-stride loop with prefetch and page creation and mem advise

In [34]:
%%writefile CUDA_mvp5.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

__global__
void mvp_kernel(const float* A, const float* x, float* y, size_t m, size_t n)
{
    size_t index = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;

    for (size_t i = index; i < m; i += stride) {
        float sum = 0.0f;
        for (size_t j = 0; j < n; j++) {
            // A[i * n + j] accesses the element A_ij
            sum += A[i * n + j] * x[j];
        }
        y[i] = sum;
    }
}

int main(int argc, char** argv)
{
    const size_t M = 1 << 14; 
    const size_t N = M;       

    const size_t MATRIX_SIZE = M * N; 
    const size_t VECTOR_SIZE = N;     

    const size_t MATRIX_BYTES = MATRIX_SIZE * sizeof(float);
    const size_t VECTOR_BYTES = VECTOR_SIZE * sizeof(float);

    const size_t loope = 30;

    //declare array 
    float *A, *x, *y;

    // Allocate managed memory 
    cudaMallocManaged(&A, MATRIX_BYTES);
    cudaMallocManaged(&x, VECTOR_BYTES);
    cudaMallocManaged(&y, VECTOR_BYTES);

    // Check for allocation failure
    if (A == NULL || x == NULL || y == NULL) {
        fprintf(stderr, "Failed to allocate memory for matrix/vectors.\n");
        cudaFree(A);
        cudaFree(x);
        cudaFree(y);
        return 1;
    }

    //get gpu id
    int device = -1;
    cudaGetDevice(&device);

    // memory advise
    cudaMemAdvise(A, MATRIX_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(A, MATRIX_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId);
    cudaMemAdvise(x, VECTOR_BYTES, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    cudaMemAdvise(x, VECTOR_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId);

    //"prefetch data" to create CPU page memory
    cudaMemPrefetchAsync(A,MATRIX_BYTES,cudaCpuDeviceId,NULL);
    cudaMemPrefetchAsync(x,VECTOR_BYTES,cudaCpuDeviceId,NULL);
    //"prefetch data" to create GPU page memory
    cudaMemPrefetchAsync(y,VECTOR_BYTES,device,NULL);

    // ***--- initialize your array here ---------
    printf("Initializing Matrix A (%lu elements, %lu x %lu)...\n", MATRIX_SIZE, M, N);
    for (size_t i = 0; i < M; i++) {
        for (size_t j = 0; j < N; j++) {
            A[i * N + j] = 1.0f / ((float)i + (float)j + 1.0f);
        }
    }
    printf("Initializing Vector x (%lu elements)...\n", VECTOR_SIZE);
    for (size_t j = 0; j < N; j++) {
        x[j] = sinf((float)j * 0.01f) * cosf((float)j * 0.007f) + 1.0f;
    }
    printf("Initialization complete.\n");

    //"Prefetch data" from CPU-GPU
    cudaMemPrefetchAsync(A, MATRIX_BYTES, device, NULL);
    cudaMemPrefetchAsync(x, VECTOR_BYTES, device, NULL);
    
    // *** setup CUDA kernel
    size_t numThreads = 1024; 
    size_t numBlocks = (M + numThreads - 1) / numThreads;

    printf("\n*** function = Matrix Vector Product (CUDA)\n");
    printf("Matrix (m x n) = %lu x %lu\n", M, N);
    printf("numBlocks = %lu, numThreads = %lu \n", numBlocks, numThreads);

    for (size_t i = 0; i < loope; i++){
        mvp_kernel<<<numBlocks, numThreads>>>(A, x, y, M, N);
    }
    
    //barrier
    cudaDeviceSynchronize();

    //"Prefetch data" from GPU-CPU
    cudaMemPrefetchAsync(A, MATRIX_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(x, VECTOR_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(y, VECTOR_BYTES, cudaCpuDeviceId, NULL);

    // --- Display first 3 and last 3 elements 
    printf("\n--- Output Vector Y (m=%lu) ---\n", M);
    
    // Print first 3 elements
    printf("First 3 elements:\n");
    for(size_t i = 0; i < 3 && i < M; i++) {
        printf("y[%lu] = %f\n", i, y[i]);
    }

    // Print last 3 elements
    printf("\nLast 3 elements:\n");
    size_t start_index = (M > 3) ? (M - 3) : 0; 
    for(size_t i = start_index; i < M; i++) {
        printf("y[%lu] = %f\n", i, y[i]);
    }

    // error checking routine here 
    printf("\nStarting error check...\n");
    size_t err_count = 0;
    for (size_t i = 0; i < M; i++){ 
        float correct_y = 0.0f;
        for (size_t j = 0; j < N; j++) {
            correct_y += A[i * N + j] * x[j];
        }
        float epsilon = 1e-5f; // A small tolerance for floating point errors
        if (fabsf(y[i] - correct_y) > epsilon) {
            err_count++;
        }
    }
    printf("Error count (CUDA program): %lu\n", err_count);

    // Free memory 
    cudaFree(A);
    cudaFree(x);
    cudaFree(y);
    
    return 0;
}

Writing CUDA_mvp5.cu


In [35]:
%%bash
nvcc CUDA_mvp5.cu -o CUDA_mvp5 -lm -Wno-deprecated-gpu-targets

In [36]:
%%bash
nvprof ./CUDA_mvp5

==1049512== NVPROF is profiling process 1049512, command: ./CUDA_mvp5


Initializing Matrix A (268435456 elements, 16384 x 16384)...
Initializing Vector x (16384 elements)...
Initialization complete.

*** function = Matrix Vector Product (CUDA)
Matrix (m x n) = 16384 x 16384
numBlocks = 16, numThreads = 1024 

--- Output Vector Y (m=16384) ---
First 3 elements:
y[0] = 11.800177
y[1] = 10.766556
y[2] = 10.237693

Last 3 elements:
y[16381] = 0.703410
y[16382] = 0.703379
y[16383] = 0.703348

Starting error check...
Error count (CUDA program): 0


==1049512== Profiling application: ./CUDA_mvp5
==1049512== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  431.93ms        30  14.398ms  13.257ms  14.873ms  mvp_kernel(float const *, float const *, float*, unsigned long, unsigned long)
      API calls:   50.53%  2.19424s         3  731.41ms  81.789us  2.19316s  cudaMallocManaged
                   37.36%  1.62221s         8  202.78ms  619.46us  1.35592s  cudaMemPrefetchAsync
                    9.93%  431.12ms         1  431.12ms  431.12ms  431.12ms  cudaDeviceSynchronize
                    2.03%  88.136ms         3  29.379ms  722.90us  86.625ms  cudaFree
                    0.11%  4.5782ms        30  152.60us  14.801us  3.6960ms  cudaLaunchKernel
                    0.03%  1.3738ms       114  12.050us     127ns  549.41us  cuDeviceGetAttribute
                    0.01%  379.43us         1  379.43us  379.43us  379.43us  cuDeviceGetName
                    0.

In [37]:
%%bash
nsys profile  -o CUDA_mvp5 ./CUDA_mvp5

         This may increase runtime overhead and the likelihood of false
         dependencies across CUDA Streams. If you wish to avoid this, please
         disable the feature with --cuda-event-trace=false.
Try the 'nsys status --environment' command to learn more.

Try the 'nsys status --environment' command to learn more.



Initializing Matrix A (268435456 elements, 16384 x 16384)...
Initializing Vector x (16384 elements)...
Initialization complete.

*** function = Matrix Vector Product (CUDA)
Matrix (m x n) = 16384 x 16384
numBlocks = 16, numThreads = 1024 

--- Output Vector Y (m=16384) ---
First 3 elements:
y[0] = 11.800177
y[1] = 10.766556
y[2] = 10.237693

Last 3 elements:
y[16381] = 0.703410
y[16382] = 0.703379
y[16383] = 0.703348

Starting error check...
Error count (CUDA program): 0
Collecting data...
Generating '/tmp/nsys-report-e8a4.qdstrm'
Generated:
	/home/jupyter-gerald_corpuz@dlsu-94f82/CUDA_mvp5.nsys-rep


### Classic MemCopy method (no Unified memory)

In [6]:
%%writefile CUDA_mvp6.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

__global__
void mvp_kernel(const float* A, const float* x, float* y, size_t m, size_t n)
{
    size_t index = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;

    for (size_t i = index; i < m; i += stride) {
        float sum = 0.0f;
        for (size_t j = 0; j < n; j++) {
            // A[i * n + j] accesses the element A_ij
            sum += A[i * n + j] * x[j];
        }
        y[i] = sum;
    }
}

int main(int argc, char** argv)
{
    const size_t M = 1 << 14; 
    const size_t N = M;       

    const size_t MATRIX_SIZE = M * N; 
    const size_t VECTOR_SIZE = N;     

    const size_t MATRIX_BYTES = MATRIX_SIZE * sizeof(float);
    const size_t VECTOR_BYTES = VECTOR_SIZE * sizeof(float);

    const size_t loope = 30;

    //declare host arrays 
    float *h_A, *h_x, *h_y;
    //declare device arrays
    float *d_A, *d_x, *d_y;

    // Allocate host memory 
    h_A = (float*)malloc(MATRIX_BYTES);
    h_x = (float*)malloc(VECTOR_BYTES);
    h_y = (float*)malloc(VECTOR_BYTES);

    if (h_A == NULL || h_x == NULL || h_y == NULL) {
        fprintf(stderr, "Failed to allocate host memory for matrix/vectors.\n");
        free(h_A);
        free(h_x);
        free(h_y);
        return 1;
    }

    // Allocate device memory
    cudaMalloc(&d_A, MATRIX_BYTES);
    cudaMalloc(&d_x, VECTOR_BYTES);
    cudaMalloc(&d_y, VECTOR_BYTES);
    
    if (d_A == NULL || d_x == NULL || d_y == NULL) {
        fprintf(stderr, "Failed to allocate device memory for matrix/vectors.\n");
        cudaFree(d_A);
        cudaFree(d_x);
        cudaFree(d_y);
        free(h_A);
        free(h_x);
        free(h_y);
        return 1;
    }


    // ***--- initialize your array on the HOST here --------
    printf("Initializing Matrix A (%lu elements, %lu x %lu)... on host\n", MATRIX_SIZE, M, N);
    for (size_t i = 0; i < M; i++) {
        for (size_t j = 0; j < N; j++) {
            h_A[i * N + j] = 1.0f / ((float)i + (float)j + 1.0f);
        }
    }
    printf("Initializing Vector x (%lu elements)... on host\n", VECTOR_SIZE);
    for (size_t j = 0; j < N; j++) {
        h_x[j] = sinf((float)j * 0.01f) * cosf((float)j * 0.007f) + 1.0f;
    }
    printf("Initialization complete.\n");

    // *** Copy data from Host to Device
    printf("Copying data from Host to Device...\n");
    cudaMemcpy(d_A, h_A, MATRIX_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, h_x, VECTOR_BYTES, cudaMemcpyHostToDevice);
    printf("Copy complete.\n");

    // *** setup CUDA kernel
    size_t numThreads = 1024; 
    size_t numBlocks = (M + numThreads - 1) / numThreads;

    printf("\n*** function = Matrix Vector Product (CUDA - Classic MemCpy)\n");
    printf("Matrix (m x n) = %lu x %lu\n", M, N);
    printf("numBlocks = %lu, numThreads = %lu \n", numBlocks, numThreads);

    for (size_t i = 0; i < loope; i++){
        // Launch kernel with DEVICE pointers
        mvp_kernel<<<numBlocks, numThreads>>>(d_A, d_x, d_y, M, N);
    }

    //barrier
    cudaDeviceSynchronize();
    printf("Kernel execution complete.\n");

    // *** Copy result from Device to Host
    printf("Copying result from Device to Host...\n");
    cudaMemcpy(h_y, d_y, VECTOR_BYTES, cudaMemcpyDeviceToHost);
    printf("Copy complete.\n");


    printf("\n--- Output Vector Y (m=%lu) ---\n", M);
    
    // Print first 3 elements (from host array)
    printf("First 3 elements:\n");
    for(size_t i = 0; i < 3 && i < M; i++) {
        printf("y[%lu] = %f\n", i, h_y[i]);
    }

    // Print last 3 elements (from host array)
    printf("\nLast 3 elements:\n");
    size_t start_index = (M > 3) ? (M - 3) : 0; 
    for(size_t i = start_index; i < M; i++) {
        printf("y[%lu] = %f\n", i, h_y[i]);
    }


    // error checking routine here (using host arrays)
    printf("\nStarting error check...\n");
    size_t err_count = 0;
    for (size_t i = 0; i < M; i++){ 
        float correct_y = 0.0f;
        for (size_t j = 0; j < N; j++) {
            correct_y += h_A[i * N + j] * h_x[j];
        }
        float epsilon = 1e-5f; // A small tolerance for floating point errors
        if (fabsf(h_y[i] - correct_y) > epsilon) {
            err_count++;
        }
    }
    printf("Error count (CUDA program): %lu\n", err_count);

    // Free device memory 
    cudaFree(d_A);
    cudaFree(d_x);
    cudaFree(d_y);
    
    // Free host memory
    free(h_A);
    free(h_x);
    free(h_y);
    
    return 0;
}

Overwriting CUDA_mvp6.cu


In [7]:
%%bash
nvcc CUDA_mvp6.cu -o CUDA_mvp6 -lm -Wno-deprecated-gpu-targets

In [8]:
%%bash
nvprof ./CUDA_mvp6

==1065633== NVPROF is profiling process 1065633, command: ./CUDA_mvp6


Initializing Matrix A (268435456 elements, 16384 x 16384)... on host
Initializing Vector x (16384 elements)... on host
Initialization complete.
Copying data from Host to Device...
Copy complete.

*** function = Matrix Vector Product (CUDA - Classic MemCpy)
Matrix (m x n) = 16384 x 16384
numBlocks = 16, numThreads = 1024 
Kernel execution complete.
Copying result from Device to Host...
Copy complete.

--- Output Vector Y (m=16384) ---
First 3 elements:
y[0] = 11.800177
y[1] = 10.766556
y[2] = 10.237693

Last 3 elements:
y[16381] = 0.703410
y[16382] = 0.703379
y[16383] = 0.703348

Starting error check...
Error count (CUDA program): 0


==1065633== Profiling application: ./CUDA_mvp6
==1065633== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   77.77%  1.48308s         2  741.54ms  12.928us  1.48306s  [CUDA memcpy HtoD]
                   22.22%  423.80ms        30  14.127ms  13.408ms  14.833ms  mvp_kernel(float const *, float const *, float*, unsigned long, unsigned long)
                    0.00%  20.095us         1  20.095us  20.095us  20.095us  [CUDA memcpy DtoH]
      API calls:   47.69%  1.90540s         3  635.13ms  16.899us  1.90463s  cudaMalloc
                   37.20%  1.48623s         3  495.41ms  419.03us  1.48472s  cudaMemcpy
                   10.59%  423.15ms         1  423.15ms  423.15ms  423.15ms  cudaDeviceSynchronize
                    4.41%  176.14ms         3  58.712ms  306.34us  172.83ms  cudaFree
                    0.09%  3.5588ms        30  118.63us  17.666us  2.8505ms  cudaLaunchKernel
                    0.02%  827.94us   

In [9]:
%%bash
nsys profile  -o CUDA_mvp6 ./CUDA_mvp6

         This may increase runtime overhead and the likelihood of false
         dependencies across CUDA Streams. If you wish to avoid this, please
         disable the feature with --cuda-event-trace=false.
Try the 'nsys status --environment' command to learn more.

Try the 'nsys status --environment' command to learn more.



Initializing Matrix A (268435456 elements, 16384 x 16384)... on host
Initializing Vector x (16384 elements)... on host
Initialization complete.
Copying data from Host to Device...
Copy complete.

*** function = Matrix Vector Product (CUDA - Classic MemCpy)
Matrix (m x n) = 16384 x 16384
numBlocks = 16, numThreads = 1024 
Kernel execution complete.
Copying result from Device to Host...
Copy complete.

--- Output Vector Y (m=16384) ---
First 3 elements:
y[0] = 11.800177
y[1] = 10.766556
y[2] = 10.237693

Last 3 elements:
y[16381] = 0.703410
y[16382] = 0.703379
y[16383] = 0.703348

Starting error check...
Error count (CUDA program): 0


Failed to create '/home/jupyter-gerald_corpuz@dlsu-94f82/CUDA_mvp6.nsys-rep': File exists.
Use `--force-overwrite true` to overwrite existing files.


Collecting data...
Generating '/tmp/nsys-report-31b7.qdstrm'
Generated:
	/tmp/nsys-report-bc7e.nsys-rep


###  CUDA Kernel with Data Initialization

In [10]:
%%writefile CUDA_mvp7.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

__global__
void init_A_kernel(float* A, size_t M, size_t N)
{
    size_t index = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;
    size_t total_elements = M * N;

    for (size_t idx = index; idx < total_elements; idx += stride)
    {
        size_t i = idx / N; // Get row
        size_t j = idx % N; // Get col
        // A[idx] is the same as A[i * N + j]
        A[idx] = 1.0f / ((float)i + (float)j + 1.0f);
    }
}

__global__
void init_x_kernel(float* x, size_t N)
{
    size_t index = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)blockIdx.x * gridDim.x;

    for (size_t j = index; j < N; j += stride)
    {
        x[j] = sinf((float)j * 0.01f) * cosf((float)j * 0.007f) + 1.0f;
    }
}


__global__
void mvp_kernel(const float* A, const float* x, float* y, size_t m, size_t n)
{
    size_t index = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;

    for (size_t i = index; i < m; i += stride) {
        float sum = 0.0f;
        for (size_t j = 0; j < n; j++) {
            // A[i * n + j] accesses the element A_ij
            sum += A[i * n + j] * x[j];
        }
        y[i] = sum;
    }
}

int main(int argc, char** argv)
{
    const size_t M = 1 << 14; 
    const size_t N = M;       

    const size_t MATRIX_SIZE = M * N; 
    const size_t VECTOR_SIZE = N;     

    const size_t MATRIX_BYTES = MATRIX_SIZE * sizeof(float);
    const size_t VECTOR_BYTES = VECTOR_SIZE * sizeof(float);

    const size_t loope = 30;

    //declare array 
    float *A, *x, *y;

    // Allocate managed memory 
    cudaMallocManaged(&A, MATRIX_BYTES);
    cudaMallocManaged(&x, VECTOR_BYTES);
    cudaMallocManaged(&y, VECTOR_BYTES);

    // Check for allocation failure
    if (A == NULL || x == NULL || y == NULL) {
        fprintf(stderr, "Failed to allocate memory for matrix/vectors.\n");
        cudaFree(A);
        cudaFree(x);
        cudaFree(y);
        return 1;
    }

    // *** setup CUDA kernel launch parameters
    size_t numThreads = 1024; 
    
    // ***--- initialize your array here using CUDA Kernels ---
    printf("Initializing Matrix A (%lu elements, %lu x %lu)... on GPU\n", MATRIX_SIZE, M, N);
    size_t total_elements_A = M * N;
    size_t numBlocks_A = (total_elements_A + numThreads - 1) / numThreads;
    init_A_kernel<<<numBlocks_A, numThreads>>>(A, M, N);
    
    printf("Initializing Vector x (%lu elements)... on GPU\n", VECTOR_SIZE);
    size_t numBlocks_x = (N + numThreads - 1) / numThreads;
    init_x_kernel<<<numBlocks_x, numThreads>>>(x, N);

    // Synchronize to ensure initialization is complete
    cudaDeviceSynchronize();
    printf("Initialization complete.\n");


    // *** setup CUDA kernel for MVP
    size_t numBlocks = (M + numThreads - 1) / numThreads;

    printf("\n*** function = Matrix Vector Product (CUDA - Kernel Init)\n");
    printf("Matrix (m x n) = %lu x %lu\n", M, N);
    printf("numBlocks = %lu, numThreads = %lu \n", numBlocks, numThreads);

    for (size_t i = 0; i < loope; i++){
        mvp_kernel<<<numBlocks, numThreads>>>(A, x, y, M, N);
    }

    //barrier
    cudaDeviceSynchronize();

    printf("\n--- Output Vector Y (m=%lu) ---\n", M);
    
    // Print first 3 elements
    // This will trigger Device-to-Host page faults/migration for y
    printf("First 3 elements:\n");
    for(size_t i = 0; i < 3 && i < M; i++) {
        printf("y[%lu] = %f\n", i, y[i]);
    }

    // Print last 3 elements
    printf("\nLast 3 elements:\n");
    size_t start_index = (M > 3) ? (M - 3) : 0; 
    for(size_t i = start_index; i < M; i++) {
        printf("y[%lu] = %f\n", i, y[i]);
    }


    // error checking routine here 
    // This will trigger Device-to-Host page faults/migration for A and x
    printf("\nStarting error check...\n");
    size_t err_count = 0;
    for (size_t i = 0; i < M; i++){ 
        float correct_y = 0.0f;
        for (size_t j = 0; j < N; j++) {
            correct_y += A[i * N + j] * x[j];
        }
        float epsilon = 1e-5f; // A small tolerance for floating point errors
        if (fabsf(y[i] - correct_y) > epsilon) {
            err_count++;
        }
    }
    printf("Error count (CUDA program): %lu\n", err_count);

    // Free memory 
    cudaFree(A);
    cudaFree(x);
    cudaFree(y);
    
    return 0;
}

Overwriting CUDA_mvp7.cu


In [11]:
%%bash
nvcc CUDA_mvp7.cu -o CUDA_mvp7 -lm -Wno-deprecated-gpu-targets

In [None]:
%%bash
nvprof ./CUDA_mvp7

In [None]:
%%bash
nsys profile  -o CUDA_mvp7 ./CUDA_mvp7