# Raw code

In [1]:
%%writefile raw_add.cu
#include <stdio.h>
#include <stdlib.h>

// Device kernel: computes a + b and stores it in the int that `res` points to.
__global__ void add(int a, int b, int *res) {
  const int sum = a + b;
  *res = sum;
}


// Host entry point: launches add() on a single GPU thread and prints whatever
// is copied back from the device. No CUDA return code is inspected here.
int main() {
  int *d_res = NULL;  // device-side result pointer; nothing is allocated for it
  int res = 0;        // host-side copy of the result

  // One block containing one thread.
  const dim3 grid(1);
  const dim3 block(1);
  add<<<grid, block>>>(2, 2, d_res);

  // Copy one int from the device pointer into the host variable.
  cudaMemcpy(&res, d_res, sizeof res, cudaMemcpyDeviceToHost);

  printf("2 + 2 = %d\n", res);
  return EXIT_SUCCESS;
}

Writing raw_add.cu


In [2]:
!nvcc raw_add.cu -o raw_add

In [3]:
!./raw_add

2 + 2 = 0


# Debugging

In [4]:
%%writefile add.cu
#include <stdio.h>
#include <stdlib.h>

__global__ void add(int a, int b, int *res) {  // kernel: stores a + b in *res
  *res = a + b;  // res is dereferenced on the GPU, so it has to point to memory the kernel may write
}
// Host entry point of the debug build: launches add() on one GPU thread and
// prints the value copied back. No CUDA return code is checked in this version.
int main() {
  int res=0;  // host-side destination for the result
  int *d_res = NULL;  // device pointer handed to the kernel; it stays NULL, no allocation is made

  // Launch add() kernel on GPU (1 block, 1 thread)
  add<<<1,1>>>(2, 2, d_res);

  cudaMemcpy(&res, d_res, sizeof(int), cudaMemcpyDeviceToHost);  // copy sizeof(int) bytes from d_res into res
  printf("2 + 2 = %d\n", res);

  return EXIT_SUCCESS;
}

Writing add.cu


In [5]:
! nvcc -g -G add.cu -o add

**Debugging file**

cuda-gdb is interactive (you are expected to type commands as you go along), but running programs in the Colab environment is not, so the commands are written to a file and replayed in batch mode. Typical commands would go like this:
1. set the debugger up to check lots of possible errors:
    1. stop in case of API failures: `set cuda api_failures stop`,
    2. stop on exceptions: catch throw,
2. run the program (possibly with command line options): r option1 option2,
3. show the kernel call stack (GPU): bt,
4. print all local variables: info locals,
5. switch to the host thread: thread 1
6. and show the host program call stack (CPU): bt

In [6]:
%%writefile debug_instructions.txt

set cuda api_failures stop
catch throw
r
bt
info locals
thread 1
bt

Writing debug_instructions.txt


In [7]:
! cuda-gdb -batch -x debug_instructions.txt ./add

Catchpoint 1 (throw)
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
[New Thread 0x7ffff5fff000 (LWP 1339)]
[New Thread 0x7ffff4bff000 (LWP 1342)]
[Detaching after fork from child process 1343]
[New Thread 0x7fffeefde000 (LWP 1348)]
Cuda API error detected: cudaLaunchKernel returned (0xde)
#0  0x00007ffff61ad970 in cudbgReportDriverApiError () from /usr/lib64-nvidia/libcuda.so.1
#1  0x00007ffff642b32b in ?? () from /usr/lib64-nvidia/libcuda.so.1
#2  0x00007ffff4d54ba7 in ?? () from /usr/lib64-nvidia/libcudadebugger.so.1
#3  0x00007ffff4d31b2e in ?? () from /usr/lib64-nvidia/libcudadebugger.so.1
#4  0x00007ffff4d42fda in ?? () from /usr/lib64-nvidia/libcudadebugger.so.1
#5  0x00007ffff4d280d7 in ?? () from /usr/lib64-nvidia/libcudadebugger.so.1
#6  0x00007ffff4e9e526 in ?? () from /usr/lib64-nvidia/libcudadebugger.so.1
#7  0x00007ffff6442066 in ?? () from /usr/lib64-nvidia/libcuda.so.1
#8  0x00005555555c82f8 in cud

**Analysis**

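Cause: the CUDA program fails because `cudaLaunchKernel` returns an error (0xde = 222, which the CUDA runtime error enum lists as `cudaErrorUnsupportedPtxVersion`), so the kernel is never actually started. This code normally points at how the binary was built (this cell compiles without `-arch`, unlike the later cells) rather than at the kernel arguments; the NULL pointer spotted in step 2 is a second, independent problem.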

Step by step:


1. An invalid CUDA operation was attempted:
```
Cuda API error detected: cudaLaunchKernel returned (0xde)
```
2. The error trace shows a call to cudaLaunchKernel, which internally tries to launch your kernel (add) but fails:
```
add (__cuda_0=2, __cuda_1=2, __cuda_2=0x0) at /content/add.cu:4
```
This line shows that the third argument (the pointer `res`) is `0x0` (NULL), meaning no device memory was allocated for it.

# Code with error management

In [8]:
%%writefile add.cu
#include <stdio.h>
#include <stdlib.h>

// Device kernel: computes a + b and stores it in the int that `res` points to.
__global__ void add(int a, int b, int *res) {
  const int sum = a + b;
  *res = sum;
}


// Host entry point: launches add() on one GPU thread and checks each CUDA step.
// The first failing step prints its message on stderr and exits with the CUDA
// error code as exit status.
int main() {
  int *d_res = NULL;  // device result pointer; never allocated in this program
  int res = 0;        // host copy of the result

  // Launch add() kernel on GPU (1 block, 1 thread)
  add<<<1, 1>>>(2, 2, d_res);

  // Stage 1: errors detected when the kernel is launched.
  const cudaError_t launch_status = cudaPeekAtLastError();
  if (launch_status != cudaSuccess) {
    fprintf(stderr,"GPUassert: add launch failed with the error : %s \n", cudaGetErrorString(launch_status));
    exit(launch_status);
  }

  // Stage 2: wait for the kernel; errors raised during execution surface here.
  const cudaError_t exec_status = cudaDeviceSynchronize();
  if (exec_status != cudaSuccess) {
    fprintf(stderr,"GPUassert: add execution failed with the error : %s \n", cudaGetErrorString(exec_status));
    exit(exec_status);
  }

  // Stage 3: copy the result from the device to the host.
  const cudaError_t copy_status = cudaMemcpy(&res, d_res, sizeof(int), cudaMemcpyDeviceToHost);
  if (copy_status != cudaSuccess) {
    fprintf(stderr,"GPUassert: cudaMemcpy failed with the error : %s \n", cudaGetErrorString(copy_status));
    exit(copy_status);
  }

  printf("2 + 2 = %d\n", res);
  return EXIT_SUCCESS;
}

Overwriting add.cu


This code:
- checks for an invalid kernel launch (for example bad launch arguments) using the error code returned by `cudaPeekAtLastError`,
- checks whether errors occurred during the kernel execution using the error code returned by `cudaDeviceSynchronize`.

Note: `cudaMemcpy` also acts as a synchronization point, so the explicit `cudaDeviceSynchronize` call would then be redundant. In that case the `cudaMemcpy` call can return either an error that occurred during the kernel execution or an error from the memory copy itself.

In [9]:
! nvcc -arch=sm_75 add.cu -o add

In [10]:
!./add

GPUassert: add execution failed with the error : an illegal memory access was encountered 


**Analysis**

- `cudaDeviceSynchronize()` catches the error because it forces all prior operations to complete, revealing the illegal memory access.
- The kernel launched by `add<<<1,1>>>(2, 2, d_res);` executes `*res = a + b;` with `res` equal to `d_res`, leading to an illegal memory access.
- `d_res` is a `NULL` pointer because memory was not allocated using cudaMalloc.

# CUDA error management Utilities in a separate cell


In [11]:
%%writefile cuda_stuff.cuh
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

#ifndef cuda_stuff_H
#define cuda_stuff_H

//MACRO TO DEBUG CUDA FUNCTIONS
/** Error checking,
 *  taken from https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
 *
 *  gpuErrchk(call) forwards the cudaError_t produced by `call`, together with
 *  the file name and line number of the call site, to gpuAssert().
 *
 *  The body is wrapped in do { ... } while (0) so that `gpuErrchk(x);` is one
 *  single statement. A bare { ... } block followed by `;` would break
 *  `if (cond) gpuErrchk(x); else ...`. Call sites must end with a semicolon.
 */
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Reports a failed CUDA call on stderr and optionally terminates the program.
 *
 * code  : status returned by the CUDA call being checked
 * file  : source file of the call site (printed in the message)
 * line  : source line of the call site (printed in the message)
 * abort : when true (the default), exit with `code` as the exit status;
 *         when false, only print the message and return
 *
 * Does nothing when `code` is cudaSuccess.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

#endif

Writing cuda_stuff.cuh


In [12]:
%%writefile addition.cu
#include <stdio.h>
#include <stdlib.h>

#include "cuda_stuff.cuh"

__global__ void add(int a, int b, int *res) {  // kernel: stores a + b in *res
  *res = a + b;  // res is dereferenced on the GPU, so it has to point to memory the kernel may write
}


int main() {  // launches add() on one GPU thread and checks every CUDA step with gpuErrchk
  int res=0;  // host-side destination for the result
  int *d_res = NULL;  // device pointer passed to the kernel; never allocated in this program

  // Launch add() kernel on GPU (1 block, 1 thread)
  add<<<1,1>>>(2, 2, d_res);
  gpuErrchk( cudaPeekAtLastError() );  // reports an error from the launch itself
  gpuErrchk( cudaDeviceSynchronize() );  // waits for the kernel; reports errors raised while it ran

  gpuErrchk(cudaMemcpy(&res, d_res, sizeof(int), cudaMemcpyDeviceToHost));  // copy the result back to the host
  printf("2 + 2 = %d\n", res);

  return EXIT_SUCCESS;
}

Writing addition.cu


In [13]:
!nvcc -arch=sm_75 addition.cu -o addition

In [14]:
! ./addition

GPUassert: an illegal memory access was encountered addition.cu 18


# Error fix

In [15]:
%%writefile add_fixed.cu
#include <stdio.h>
#include <stdlib.h>

// Device kernel: computes a + b and stores it in the int that `res` points to.
__global__ void add(int a, int b, int *res) {
  const int sum = a + b;
  *res = sum;
}
// Prints a message on stderr and exits if a CUDA runtime call did not succeed.
// `what` names the step that was being attempted.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s failed with the error : %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Host entry point of the corrected program: allocates the device int the
// kernel writes to, launches add() on one GPU thread, copies the result back,
// prints it and releases the device memory. Every CUDA call is checked.
int main() {
  int res = 0;        // host-side copy of the result
  int *d_res = NULL;  // device-side result, allocated just below

  // fix: Allocate memory on the device
  checkCuda(cudaMalloc((void**)&d_res, sizeof(int)), "cudaMalloc");

  // Launch add() kernel on GPU (1 block, 1 thread)
  add<<<1,1>>>(2, 2, d_res);
  checkCuda(cudaPeekAtLastError(), "add launch");

  // cudaMemcpy also synchronizes with the kernel, so an error raised during
  // the kernel execution is reported by this call as well.
  checkCuda(cudaMemcpy(&res, d_res, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy");
  printf("2 + 2 = %d\n", res);

  // Release the device allocation made above.
  checkCuda(cudaFree(d_res), "cudaFree");

  return EXIT_SUCCESS;
}

Writing add_fixed.cu


In [16]:
!nvcc -arch=sm_75 add_fixed.cu -o add_fixed

In [17]:
! ./add_fixed

2 + 2 = 4
