<a href="https://colab.research.google.com/github/Neel-Dandiwala/CUDA-Programs/blob/master/ArrayAdditionCUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
%%writefile add_grid.cu

#include <math.h>
#include <iostream>

// Element-wise in-place addition: y[i] = x[i] + y[i] for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks (only the .x dimensions are used).
// Each thread starts at its global thread index and advances by the total
// number of launched threads (blockDim.x * gridDim.x), so any 1D launch
// configuration covers all n elements, including a single-thread launch.
//
// Parameters:
//   n  number of elements to process; n <= 0 performs no work.
//   x  read-only input, at least n floats, dereferenced on the device.
//   y  input/output, at least n floats, dereferenced on the device;
//      overwritten with the sums.
// No shared memory is used.
__global__
void add(int n, const float *x, float *y)
{
    // 64-bit index arithmetic: with 32-bit ints, `i += stride` can overflow
    // when n is close to INT_MAX, which would make `i < n` true again and
    // cause out-of-bounds accesses.
    const long long stride = (long long)blockDim.x * gridDim.x;
    long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    for (; i < n; i += stride)
        y[i] = x[i] + y[i];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Adds two arrays of 2^20 floats on the GPU using managed memory and prints
// the largest deviation of any result from the expected value 3.0.
// Returns 0 when every CUDA call succeeded, 1 otherwise.
int main(void){
    const int N = 1<<20;
    const size_t bytes = N * sizeof(float);
    float *x = nullptr, *y = nullptr;

    // Managed allocations are written by the host loop below and read by the
    // kernel; bail out early if either allocation fails instead of
    // dereferencing an invalid pointer.
    if (!checkCuda(cudaMallocManaged(&x, bytes), "cudaMallocManaged(x)"))
        return 1;
    if (!checkCuda(cudaMallocManaged(&y, bytes), "cudaMallocManaged(y)")) {
        cudaFree(x);
        return 1;
    }

    // Host-side initialisation: every sum should come out as exactly 3.0f.
    for(int i = 0; i < N; i++){
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    // Ceil-division so that numBlocks * blockSize >= N.
    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;
    add<<<numBlocks, blockSize>>>(N, x, y);

    // A launch does not return a status itself: configuration errors are
    // reported by cudaGetLastError(), execution errors by the next sync.
    bool ok = checkCuda(cudaGetLastError(), "add kernel launch");

    // The host must wait for the kernel before reading y.
    ok = ok && checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    if (ok) {
        float maxError = 0.0f;
        for (int i = 0; i < N; i++)
            maxError = fmaxf(maxError, fabsf(y[i] - 3.0f));
        std::cout << "Max Error: " << maxError << std::endl;
    }

    cudaFree(x);
    cudaFree(y);

    return ok ? 0 : 1;
}

Writing add_grid.cu


In [5]:
%%shell 

# Compile the source file written by the %%writefile cell into ./add_grid.
nvcc add_grid.cu -o add_grid
# Run the binary under the nsys profiler; -o sets the report output name and
# --stats=true prints the summary tables (e.g. CUDA API statistics) afterwards.
nsys profile -o prefetch --stats=true ./add_grid

Collecting data...
Max Error: 0
Processing events...
Capturing symbol files...
Saving temporary "/tmp/nsys-report-8a67-2a55-663a-a0fa.qdstrm" file to disk...
Creating final output files...

Saved report file to "/tmp/nsys-report-8a67-2a55-663a-a0fa.qdrep"

Exported successfully to
/tmp/nsys-report-8a67-2a55-663a-a0fa.sqlite

Generating CUDA API Statistics...
CUDA API Statistics (nanoseconds)

Time(%)      Total Time       Calls         Average         Minimum         Maximum  Name                                                                            
-------  --------------  ----------  --------------  --------------  --------------  --------------------------------------------------------------------------------
   99.1       340789488           2     170394744.0           43539       340745949  cudaMallocManaged                                                               
    0.7         2402811           1       2402811.0         2402811         2402811  cudaDeviceSynchronize

