### **CUDA**
Below is an example that runs native CUDA code.

1.   We check the CUDA version, the driver and the available GPU with `nvcc --version` and `nvidia-smi`
2.   We use the IPython magic command "%%writefile filename" to save a *.cu program
3.   We then compile and run the *.cu program with nvcc







In [None]:
!nvcc --version
!nvidia-smi

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0
Thu Oct 27 15:43:18 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+------


## Next, we write a naive CUDA code and save it as 'vectorAdd_naive.cu'


In [None]:
%%writefile vectorAdd_naive.cu
#include <stdio.h>
#include <stdlib.h>

/*
 * add: kernel that sums two integers and stores the result.
 *
 *   a, b  pointers to the two operands; exactly one int is read from each
 *   c     pointer that receives a[0] + b[0]
 *
 * All three pointers are dereferenced inside the kernel, so they must be
 * addressable from device code. There is no per-thread indexing: every
 * thread that runs this kernel performs the same single store.
 */
__global__ void add(int *a, int *b, int *c) {
  const int lhs = a[0];
  const int rhs = b[0];
  c[0] = lhs + rhs;
}

/*
 * Host driver: adds 3 + 5 on the GPU with the add() kernel and prints the sum.
 *
 * Every CUDA runtime call and the kernel launch are checked. On the first
 * failure the error is reported on stderr, the remaining GPU steps are
 * skipped, the device buffers are released and the process exits with 1.
 * On success it prints "result is <sum>" and exits with 0.
 */
int main() {

  // Reports a failed CUDA call on stderr; returns true when the call succeeded.
  auto cudaOk = [](cudaError_t err, const char *what) -> bool {
    if (err == cudaSuccess) {
      return true;
    }
    fprintf(stderr, "CUDA error %s: %s\n", what, cudaGetErrorString(err));
    return false;
  };

  // host copies of variables a, b & c
  int a = 3;
  int b = 5;
  int c = 0;

  // device copies of variables a, b & c (NULL so cleanup is safe after a
  // partial failure: cudaFree(NULL) performs no operation)
  int *d_a = NULL, *d_b = NULL, *d_c = NULL;

  const size_t size = sizeof(int);

  // Allocate space for device copies of a, b, c, then copy the inputs over.
  // The && chain stops at the first failing step.
  bool ok = cudaOk(cudaMalloc((void **)&d_a, size), "allocating d_a") &&
            cudaOk(cudaMalloc((void **)&d_b, size), "allocating d_b") &&
            cudaOk(cudaMalloc((void **)&d_c, size), "allocating d_c") &&
            cudaOk(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice),
                   "copying a to Device") &&
            cudaOk(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice),
                   "copying b to Device");

  if (ok) {
    // Launch add() kernel on GPU
    add<<<1,1>>>(d_a, d_b, d_c);

    // A kernel launch returns nothing; launch errors are read back with
    // cudaGetLastError(). Errors raised while the kernel runs are reported
    // by the blocking cudaMemcpy that follows.
    ok = cudaOk(cudaGetLastError(), "launching add kernel") &&
         cudaOk(cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost),
                "copying to Host");
  }

  // Only print the result when it was actually produced by the GPU.
  if (ok) {
    printf("result is %d\n", c);
  }

  // Cleanup
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  return ok ? 0 : 1;
}

Overwriting vectorAdd_naive.cu


## We compile the saved CUDA code using the nvcc compiler

In [None]:
!nvcc vectorAdd_naive.cu -o vectorAdd_naive
!ls


sample_data  vectorAdd_naive  vectorAdd_naive.cu


## Finally, we execute the binary of the compiled code

In [None]:
!./vectorAdd_naive

result is 8



## Next, we write a more complete version of vectorAdd CUDA code and save it as 'vectorAdd_v2.cu'


In [None]:
%%writefile vectorAdd_v2.cu
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#define DataType double

/*
 * vecAdd: element-wise vector addition, out[i] = in1[i] + in2[i].
 *
 *   in1, in2  input vectors, each holding at least `len` elements
 *   out       output vector with room for at least `len` elements
 *   len       number of elements to process
 *
 * Each thread handles at most one element, selected by its flat 1D global
 * index (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index is
 * at or beyond `len` do nothing, so the launch may contain more threads
 * than elements; it must contain at least `len` threads in the x dimension
 * for every element to be written.
 */
__global__ void vecAdd(DataType *in1, DataType *in2, DataType *out, int len) {
  const int gid = blockIdx.x * blockDim.x + threadIdx.x;

  // Tail guard: the grid rarely divides the data evenly.
  if (gid >= len) {
    return;
  }

  out[gid] = in1[gid] + in2[gid];
}

//@@ Insert code to implement timer
// Wall-clock timestamps shared by myCPUTimer_start() and myCPUTimer_stop().
struct timeval t_start;
struct timeval t_end;

// Records the current wall-clock time in t_start as the beginning of a
// timed region.
void myCPUTimer_start(){
  gettimeofday(&t_start, NULL);
}
//@@ Insert code to implement timer
// Ends the timed region started by myCPUTimer_start() and prints the elapsed
// wall-clock time in microseconds.
//
// The device is synchronized before the end timestamp is taken so that
// previously issued GPU work is included in the measurement. If that
// synchronization reports an error it is written to stderr, because the
// printed time then does not describe a successful run.
void myCPUTimer_stop(){
  const cudaError_t syncStatus = cudaDeviceSynchronize();
  gettimeofday(&t_end, 0);

  if (syncStatus != cudaSuccess) {
    fprintf(stderr, "CUDA error while synchronizing the device: %s\n",
            cudaGetErrorString(syncStatus));
  }

  // Seconds are scaled to microseconds and combined with the microsecond
  // fields of the two timestamps.
  const double elapsedUs = 1000000.0 * (t_end.tv_sec - t_start.tv_sec)
                         + (t_end.tv_usec - t_start.tv_usec);
  printf("Elapsed %6.1f microseconds \n", elapsedUs);
}

/*
 * Host driver for vecAdd.
 *
 * Usage: <program> <inputLength>
 *
 * Builds two input vectors of inputLength elements on the host, adds them on
 * the GPU with vecAdd, and compares the GPU output against a reference sum
 * computed on the CPU.
 *
 * Returns 0 when every element matches (and prints "valid"). Returns 1 when
 * the argument is missing or invalid, when a host allocation or any CUDA call
 * fails, or when the GPU result differs from the reference. All buffers are
 * released on every path that allocated them.
 */
int main(int argc, char **argv) {

  // Reports a failed CUDA call on stderr; returns true when the call succeeded.
  auto cudaOk = [](cudaError_t err, const char *what) -> bool {
    if (err == cudaSuccess) {
      return true;
    }
    fprintf(stderr, "CUDA error while %s: %s\n", what, cudaGetErrorString(err));
    return false;
  };

  //@@ Read in inputLength from args
  if (argc < 2) {
    fprintf(stderr, "Usage: %s <inputLength>\n",
            argc > 0 ? argv[0] : "vectorAdd");
    return 1;
  }
  // strtol (unlike atoi) lets us reject non-numeric text, trailing junk and
  // values that do not fit in the int the kernel takes as its length.
  char *parseEnd = NULL;
  const long requested = strtol(argv[1], &parseEnd, 10);
  if (parseEnd == argv[1] || *parseEnd != '\0' ||
      requested <= 0 || requested > INT_MAX) {
    fprintf(stderr,
            "inputLength must be an integer between 1 and %d, got '%s'\n",
            INT_MAX, argv[1]);
    return 1;
  }
  const int inputLength = (int)requested;
  printf("The input length is %d\n", inputLength);

  const size_t bytes = (size_t)inputLength * sizeof(DataType);

  //@@ Allocate Host memory for input and output
  DataType *hostInput1 = (DataType *)malloc(bytes);
  DataType *hostInput2 = (DataType *)malloc(bytes);
  DataType *hostOutput = (DataType *)malloc(bytes);
  DataType *resultRef  = (DataType *)malloc(bytes);

  // Device pointers start as NULL so the cleanup below is safe after a
  // partial failure: cudaFree(NULL) performs no operation.
  DataType *deviceInput1 = NULL;
  DataType *deviceInput2 = NULL;
  DataType *deviceOutput = NULL;

  int exitCode = 1;

  bool ok = hostInput1 != NULL && hostInput2 != NULL &&
            hostOutput != NULL && resultRef != NULL;
  if (!ok) {
    fprintf(stderr, "Failed to allocate host buffers of %zu bytes each\n",
            bytes);
  }

  if (ok) {
    //@@ Initialize hostInput1 and hostInput2 (constant test values, not
    //@@ random numbers) and create the reference result on the CPU
    for (int i = 0; i < inputLength; i++) {
      hostInput1[i] = 1.0;
      hostInput2[i] = 2.0;
      resultRef[i]  = hostInput1[i] + hostInput2[i];
    }

    //@@ Allocate GPU memory, then copy the inputs to the GPU.
    // The && chain stops at the first failing step.
    ok = cudaOk(cudaMalloc((void **)&deviceInput1, bytes),
                "allocating deviceInput1") &&
         cudaOk(cudaMalloc((void **)&deviceInput2, bytes),
                "allocating deviceInput2") &&
         cudaOk(cudaMalloc((void **)&deviceOutput, bytes),
                "allocating deviceOutput") &&
         cudaOk(cudaMemcpy(deviceInput1, hostInput1, bytes,
                           cudaMemcpyHostToDevice),
                "copying hostInput1 to the GPU") &&
         cudaOk(cudaMemcpy(deviceInput2, hostInput2, bytes,
                           cudaMemcpyHostToDevice),
                "copying hostInput2 to the GPU");
  }

  if (ok) {
    //@@ Initialize the 1D grid and block dimensions.
    // Integer ceil-division: the previous float-based ceil() rounded
    // lengths above 2^24 and could launch one block too few. The sum is
    // done in size_t so it cannot overflow for lengths close to INT_MAX.
    const unsigned int threadsPerBlock = 32;
    const unsigned int blocksPerGrid =
        (unsigned int)(((size_t)inputLength + threadsPerBlock - 1) /
                       threadsPerBlock);

    //@@ Launch the GPU Kernel
    vecAdd<<<blocksPerGrid, threadsPerBlock>>>(deviceInput1, deviceInput2,
                                               deviceOutput, inputLength);

    //@@ Check the launch, then copy the GPU memory back to the CPU.
    // Launch errors are read back with cudaGetLastError(); errors raised
    // while the kernel runs are reported by the blocking cudaMemcpy.
    ok = cudaOk(cudaGetLastError(), "launching vecAdd") &&
         cudaOk(cudaMemcpy(hostOutput, deviceOutput, bytes,
                           cudaMemcpyDeviceToHost),
                "copying the result back to the host");
  }

  if (ok) {
    //@@ Compare the output with the reference
    bool valid = true;
    for (int i = 0; i < inputLength; i++) {
      if (hostOutput[i] != resultRef[i]) {
        printf("hostOutput[%d] = %f != %f\n", i, hostOutput[i], resultRef[i]);
        valid = false;
      }
    }
    if (valid) {
      printf("valid\n");
      exitCode = 0;
    }
  }

  //@@ Free the GPU memory
  cudaFree(deviceInput1);
  cudaFree(deviceInput2);
  cudaFree(deviceOutput);

  //@@ Free the CPU memory (free(NULL) is a no-op)
  free(hostInput1);
  free(hostInput2);
  free(hostOutput);
  free(resultRef);

  return exitCode;
}


Writing vectorAdd_v2.cu


In [None]:
!nvcc -o vectorAdd vectorAdd_v2.cu

In [None]:
!./vectorAdd 131070

The input length is 131070
valid


# Next, we use NVPROF to profile the program and get basic timings of the activities on the GPU

In [None]:
!nvprof ./vectorAdd 131070

The input length is 131070
==3374== NVPROF is profiling process 3374, command: ./vectorAdd 131070
valid
==3374== Profiling application: ./vectorAdd 131070
==3374== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   59.70%  175.55us         2  87.774us  87.486us  88.062us  [CUDA memcpy HtoD]
                   34.12%  100.32us         1  100.32us  100.32us  100.32us  [CUDA memcpy DtoH]
                    6.18%  18.176us         1  18.176us  18.176us  18.176us  vecAdd(double*, double*, double*, int)
      API calls:   98.77%  243.64ms         3  81.212ms  6.3460us  243.55ms  cudaMalloc
                    0.61%  1.5094ms         3  503.14us  247.05us  851.02us  cudaMemcpy
                    0.43%  1.0728ms         1  1.0728ms  1.0728ms  1.0728ms  cuDeviceGetPCIBusId
                    0.10%  257.65us         3  85.883us  24.121us  119.73us  cudaFree
                    0.05%  134.95us       101  1.3360us     133ns  55