<a href="https://colab.research.google.com/github/MatteoOnger/GPU_Project/blob/main/GPU_EA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **GPU Project**

*   **Author:** Matteo Onger
*   **Date:** June 2024

**Documentation**:
*   [CUDA](https://docs.nvidia.com/cuda/cuda-c-programming-guide/contents.html)
*   [Nvcc4jupyter](https://github.com/MatteoOnger/nvcc4jupyter)

**Notes**:
* A GPU-equipped runtime is required to execute this notebook.


## **VM Setup**


In [None]:
# Download nvcc4jupyter extension
# The clone is skipped when the directory already exists, so re-running this
# cell on the same VM does not fail with "destination path already exists".
!test -d nvcc4jupyter || git clone https://github.com/MatteoOnger/nvcc4jupyter.git

Cloning into 'nvcc4jupyter'...
remote: Enumerating objects: 429, done.[K
remote: Counting objects: 100% (308/308), done.[K
remote: Compressing objects: 100% (203/203), done.[K
remote: Total 429 (delta 141), reused 171 (delta 89), pack-reused 121[K
Receiving objects: 100% (429/429), 107.12 KiB | 884.00 KiB/s, done.
Resolving deltas: 100% (179/179), done.


In [None]:
# Load and configure the extension
# Enter the cloned repository before loading the extension
# (presumably so that `%load_ext` can import the package from there — confirm).
%cd ./nvcc4jupyter/

# The extension is loaded first, then its working directory is set to './src'
# and the extension is reloaded: after the reload the plugin reports that
# source files are saved in "./src" instead of a temporary folder.
%load_ext nvcc4jupyter
%config NVCCPlugin.wd = './src'
%reload_ext nvcc4jupyter

# Go back to /content: later cells refer to the saved sources as /content/src/...
%cd /content

/content/nvcc4jupyter
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp42rtmhlq".
Detected platform "Colab". Running its setup...
Source files will be saved in "./src".
/content


In [None]:
# Download and install NVIDIA Nsight Systems
# `-O` pins the output file name, so a re-run overwrites the package instead of
# saving a numbered copy next to a stale download.
!wget -O nsight-systems-2024.3.1_2024.3.1.75-1_amd64.deb https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2024_3/nsight-systems-2024.3.1_2024.3.1.75-1_amd64.deb
!apt-get update

# `-y` answers the confirmation prompts, which would otherwise block a
# non-interactive "Run all" execution.
!apt-get install -y ./nsight-systems-2024.3.1_2024.3.1.75-1_amd64.deb
!apt-get --fix-broken install -y

# `-f` keeps the clean-up from failing when the package is already gone.
!rm -f ./nsight-systems-2024.3.1_2024.3.1.75-1_amd64.deb

--2024-06-06 06:55:03--  https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2024_3/nsight-systems-2024.3.1_2024.3.1.75-1_amd64.deb
Resolving developer.nvidia.com (developer.nvidia.com)... 152.195.19.142
Connecting to developer.nvidia.com (developer.nvidia.com)|152.195.19.142|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://developer.download.nvidia.com/assets/tools/secure/nsight-systems/2024_3/nsight-systems-2024.3.1_2024.3.1.75-1_amd64.deb?Y-GnG1V4Msw1sA9Y-iWV4IZ4TPds7FgS4WLWSVKq3IXJ702t4lzhIPgOaZ_uf7-owW8MPNjD3th0A7fxPr4_cNIH2bqddWRug2bziDsTqeOnF21CTHMOeaisnNTVf3Yz0w0q3MpgncXgeqm6L3v9jedWe4P4A1O5-W5cyyLMl7gT0yS_mQ_uChnWBvjbKeak8tl-xEyhPsU= [following]
--2024-06-06 06:55:03--  https://developer.download.nvidia.com/assets/tools/secure/nsight-systems/2024_3/nsight-systems-2024.3.1_2024.3.1.75-1_amd64.deb?Y-GnG1V4Msw1sA9Y-iWV4IZ4TPds7FgS4WLWSVKq3IXJ702t4lzhIPgOaZ_uf7-owW8MPNjD3th0A7fxPr4_cNIH2bqddWRug2bziDsTqeOnF21CTHMOeaisnNTV

## **CUDA Code**

In [None]:
%%cuda_group_save --group shared --name "check.h"
// -------- CHECK FUNCS  --------
#pragma once

#include <stdio.h>


/*
  Evaluates a CUDA runtime call exactly once and, when the returned status is
  not cudaSuccess, prints the source location, the numeric error code and the
  textual reason. Execution continues after the report: the macro neither
  aborts the program nor returns from the calling function.

  The body is wrapped in do { ... } while (0) so that the macro behaves as a
  single statement (safe inside unbraced if/else); call sites must end with a
  semicolon, e.g.  CHECK(cudaDeviceSynchronize());
*/
#define CHECK(call)                                                                             \
do {                                                                                            \
    const cudaError_t checkStatus_ = (call);                                                    \
    if (checkStatus_ != cudaSuccess)                                                            \
    {                                                                                           \
        printf("Error: %s:%d, ", __FILE__, __LINE__);                                           \
        printf("code: %d, reason: %s\n", (int) checkStatus_, cudaGetErrorString(checkStatus_)); \
    }                                                                                           \
} while (0)

In [None]:
%%cuda_group_save --group shared --name "utils.h"
// -------- UTILITY FUNCS  --------
// The cell magic must be the first line of the cell; the banner is written as
// a C++ comment because everything below the magic becomes the header content.
// The guard prevents redefinition errors if the header is included twice.
#pragma once

#include <stdio.h>


/*
  Returns the ID of the calling thread's block considering a linearized grid:
  blockIdx.x varies fastest, then blockIdx.y, then blockIdx.z.
*/
__device__ unsigned int getLinBlockIdx(){
    // Glob. block ID
    // `unsigned int` (instead of the non-standard `uint` alias) matches the
    // declared return type and the other helpers of this header.
    const unsigned int bId = blockIdx.x     //1D
      + blockIdx.y * gridDim.x              //2D
      + blockIdx.z * gridDim.x * gridDim.y; //3D

    return bId;
}


/*
  Flattens the (x, y, z) coordinates of the calling thread into a single
  index that is local to its block: x varies fastest, then y, then z.
*/
__device__ unsigned int getLinThreadIdx(){
    // Contribution of the y coordinate (one full row of x per step)
    const unsigned int rowOffset = threadIdx.y * blockDim.x;
    // Contribution of the z coordinate (one full x-y plane per step)
    const unsigned int planeOffset = threadIdx.z * blockDim.x * blockDim.y;

    return threadIdx.x + rowOffset + planeOffset;
}


/*
  Returns the grid-wide ID of the calling thread, with both the grid and the
  blocks linearized: linear block ID times block size, plus the local thread ID.
*/
__device__ unsigned int getGlobalLinThreadIdx(){
    // Number of threads contained in one block
    const unsigned int threadsPerBlock = blockDim.x * blockDim.y * blockDim.z;
    // Index of the first thread of the calling thread's block
    const unsigned int blockOffset = getLinBlockIdx() * threadsPerBlock;

    return blockOffset + getLinThreadIdx();
}


/*
  Returns the grid-wide position of the calling thread as separate
  x, y and z coordinates (no linearization is applied).
*/
__device__ uint3 getGlobalThreadIdx(){
    // Per axis: offset of the block inside the grid + offset inside the block
    return make_uint3(
        blockIdx.x * blockDim.x + threadIdx.x,
        blockIdx.y * blockDim.y + threadIdx.y,
        blockIdx.z * blockDim.z + threadIdx.z
    );
}

In [None]:
%%cuda_group_save --group "project" --name "main.cu"
#include <stdio.h>
#include <time.h>

#include "/content/src/shared/check.h"
#include "/content/src/shared/utils.h"

/*
  Demo kernel: every thread writes one slot in the lower half and one slot in
  the upper half of a 64-element shared array, then prints both values.

  The shared array is indexed with the block-local thread ID, and threads whose
  local ID does not fit in half of the array exit early, so the kernel never
  writes outside the array whatever the launch configuration is. With the
  <<<1,32>>> launch used by main() the printed output is unchanged.
*/
__global__ void kernel(){
  const int len = 64;
  const int half = len / 2;
  __shared__ int smem[len];

  // Local ID selects the shared-memory slot, global ID only labels the output
  const int tid = getLinThreadIdx();
  const int gid = getGlobalLinThreadIdx();

  // Only the first `half` threads of a block own a pair of slots
  if (tid >= half) return;

  smem[tid] = tid;
  smem[half + tid] = half + tid;

  printf("Thread %d -> %d, %d\n", gid, smem[tid], smem[tid + half]);
}

/*
  Launches the demo kernel on one block of 32 threads, waits for it to finish
  and releases the device. Every CUDA call is routed through CHECK so that
  failures are reported on the console.
*/
int main(){
  kernel<<<1,32>>>();

  // A kernel launch returns no status: launch errors are read back here
  CHECK(cudaGetLastError());
  // Wait for the kernel so that execution errors are reported as well
  CHECK(cudaDeviceSynchronize());

  CHECK(cudaDeviceReset());
  return 0;
}

In [None]:
# Compile the project and run it only if the build succeeded, so that a failed
# compilation never executes a stale binary left over from a previous build.
!nvcc -o src/project/main.exe src/project/main.cu && src/project/main.exe

In [None]:
# Profile the compiled executable with the `ncu` command-line profiler
# (NVIDIA Nsight Compute); requires the binary built by the previous cell.
! ncu /content/src/project/main.exe

In [None]:
# Record a profile of the compiled executable with Nsight Systems and store the
# report next to the sources.
# NOTE(review): `%n` in the report name is presumably an nsys placeholder for an
# auto-incremented number — confirm against the Nsight Systems CLI documentation.
!nsys profile -o src/project/main%n.nsys-rep src/project/main.exe