<a href="https://colab.research.google.com/github/Harshal292004/learn-cuda/blob/main/CUDA_in_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install CUDA C++ plugin for Colab:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpy5ta76xn".


In [None]:
# Detect the selected GPU and its NVIDIA architecture (e.g. "sm_75"):
import subprocess

# getstatusoutput also returns the exit code, so every nvidia-smi failure is
# caught here, not only the shell's "command not found" message.
status, gpu_info = subprocess.getstatusoutput(
    "nvidia-smi --query-gpu=name,compute_cap --format=csv,noheader,nounits"
)
if status != 0 or not gpu_info.strip() or "not found" in gpu_info.lower():
    raise RuntimeError("Error: No GPU found. Please select a GPU runtime environment.")

# nvidia-smi prints one CSV line per GPU; use the first line so a multi-GPU
# runtime does not break the two-value unpacking below. Splitting from the
# right keeps a GPU name that itself contains a comma intact.
first_gpu_line = gpu_info.strip().splitlines()[0]
gpu_name, compute_cap = map(str.strip, first_gpu_line.rsplit(",", 1))

# Compute capability "7.5" becomes the nvcc architecture name "sm_75".
gpu_arch = f"sm_{compute_cap.replace('.', '')}"

print(f"{'GPU Name':<15}: {gpu_name}")
print(f"{'Architecture':<15}: {gpu_arch}")

GPU Name       : Tesla T4
Architecture   : sm_75


In [None]:
%%cuda -c "--gpu-architecture $gpu_arch"
#include <stdio.h>

/*
 * one_d_kernel: each thread prints its block index, its index inside the
 * block, and its flattened global index for a 1D launch.
 *
 * With the launch used in main(), <<<4 blocks, 1 thread per block>>>:
 *
 *     blockIdx.x  :   0   1   2   3
 *     threadIdx.x :   0   0   0   0
 *     global id   :   0   1   2   3
 *
 * General 1D formula:
 *     global id = blockIdx.x * blockDim.x + threadIdx.x
 *
 * The printed lines are not guaranteed to appear in block order.
 */
__global__ void one_d_kernel(){
    const int blockNo = blockIdx.x;
    const int laneInBlock = threadIdx.x;

    // Skip over all threads that belong to earlier blocks, then add our own slot.
    const int flatId = blockNo * blockDim.x + laneInBlock;

    printf("Block id %d , Thread id in the block %d  Global thread id %d\n",
           blockNo, laneInBlock, flatId);
}

// Launches one_d_kernel on a 1D grid of 4 blocks with 1 thread each and waits
// for the device to finish before exiting.
// Returns 0 on success, 1 if the launch or the kernel run reported a CUDA error.
int main(){
    int numBlocks = 4;
    int threadsPerBlock = 1;

    one_d_kernel<<<numBlocks, threadsPerBlock>>>();

    // A <<<...>>> launch has no return value: launch errors are read back with
    // cudaGetLastError(); errors raised while the kernel runs come from the sync.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}


Block id 2 , Thread id in the block 0  Global thread id 2
Block id 0 , Thread id in the block 0  Global thread id 0
Block id 3 , Thread id in the block 0  Global thread id 3
Block id 1 , Thread id in the block 0  Global thread id 1



In [None]:
%%cuda -c "--gpu-architecture $gpu_arch"
#include <stdio.h>

/*
 * two_d_kernel: each thread prints its 2D block / thread coordinates together
 * with the flattened block id, flattened thread id and global thread id.
 *
 * Launch used in main(): gridDim = (2, 3), blockDim = (2, 2).
 *
 * Blocks (blockIdx.x, blockIdx.y)          flattened: x + y * gridDim.x
 *
 *     y=2   (0,2) (1,2)                    y=2:   4   5
 *     y=1   (0,1) (1,1)                    y=1:   2   3
 *     y=0   (0,0) (1,0)                    y=0:   0   1
 *            x=0   x=1
 *
 * Threads inside a block (threadIdx.x, threadIdx.y)
 *                                          flattened: x + y * blockDim.x
 *
 *     y=1   (0,1) (1,1)                    y=1:   2   3
 *     y=0   (0,0) (1,0)                    y=0:   0   1
 *
 * Global thread id:
 *     flattened thread id + flattened block id * (blockDim.x * blockDim.y)
 *
 * i.e. every earlier block reserves blockDim.x * blockDim.y consecutive ids.
 */
__global__ void two_d_kernel(){
    // Row-major flattening: a full row (one y step) is worth gridDim.x blocks.
    const int flatBlock = blockIdx.y * gridDim.x + blockIdx.x;

    // Same idea inside the block: one y step is worth blockDim.x threads.
    const int flatThread = threadIdx.y * blockDim.x + threadIdx.x;

    const unsigned int threadsInBlock = blockDim.x * blockDim.y;
    const int flatGlobal = flatBlock * threadsInBlock + flatThread;

    printf("Block(%d,%d) blockId=%d | Thread(%d,%d) threadId=%d | Global Thread Id =%d\n",
        blockIdx.x, blockIdx.y, flatBlock,
        threadIdx.x, threadIdx.y, flatThread,
        flatGlobal
    );
}

// Launches two_d_kernel on a 2x3 grid of 2x2-thread blocks (24 threads total)
// and waits for the device to finish before exiting.
// Returns 0 on success, 1 if the launch or the kernel run reported a CUDA error.
int main(){
    dim3 numBlocks(2, 3);       // 2 in X, 3 in Y
    dim3 threadsPerBlock(2, 2); // 2 in X, 2 in Y

    two_d_kernel<<<numBlocks, threadsPerBlock>>>();

    // A <<<...>>> launch has no return value: launch errors are read back with
    // cudaGetLastError(); errors raised while the kernel runs come from the sync.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}


In [None]:
%%cuda -c "--gpu-architecture $gpu_arch"
#include <stdio.h>
// gridDim  holds the size of the grid in blocks along each axis (x, y, z)
// blockDim holds the size of a block in threads along each axis (x, y, z)

/*
 * whoami: each thread prints one line with
 *   - its global id (zero-padded to 4 digits),
 *   - its 3D block coordinates and the flattened block id,
 *   - its 3D thread coordinates and the flattened thread id within the block.
 *
 * Flattening is x-fastest, then y, then z, for both blocks and threads:
 *     flat = x + y * dim.x + z * dim.x * dim.y
 * and the global id is
 *     flattened block id * threads per block + flattened thread id.
 */
__global__ void whoami(void) {
    // (z * dim.y + y) * dim.x + x is the same x-fastest flattening, factored.
    const int block_id =
        (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
    const int thread_offset =
        (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

    // Every earlier block reserves one full block's worth of ids.
    const unsigned int threads_in_block = blockDim.x * blockDim.y * blockDim.z;
    const int block_offset = block_id * threads_in_block;

    const int id = block_offset + thread_offset;

    printf("%04d | Block(%d %d %d) = %3d | Thread(%d %d %d) = %3d\n",
        id,
        blockIdx.x, blockIdx.y, blockIdx.z, block_id,
        threadIdx.x, threadIdx.y, threadIdx.z, thread_offset);
}

// Prints the launch geometry, launches whoami on a 2x3x4 grid of 4x4x4-thread
// blocks, and waits for the device to finish before exiting.
// argc/argv are accepted but not used.
// Returns 0 on success, 1 if the launch or the kernel run reported a CUDA error.
int main(int argc, char **argv) {
    const int b_x = 2, b_y = 3, b_z = 4;
    // 4*4*4 = 64 threads per block; with a warp size of 32 threads that is
    // 2 warps per block.
    const int t_x = 4, t_y = 4, t_z = 4;

    int blocks_per_grid = b_x * b_y * b_z;
    int threads_per_block = t_x * t_y * t_z;

    printf("%d blocks/grid\n", blocks_per_grid);
    printf("%d threads/block\n", threads_per_block);
    printf("%d total threads\n", blocks_per_grid * threads_per_block);

    dim3 blocksPerGrid(b_x, b_y, b_z); // 3d cube of shape 2*3*4 = 24
    dim3 threadsPerBlock(t_x, t_y, t_z); // 3d cube of shape 4*4*4 = 64

    whoami<<<blocksPerGrid, threadsPerBlock>>>();

    // A <<<...>>> launch has no return value: launch errors are read back with
    // cudaGetLastError(); errors raised while the kernel runs come from the sync.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}

In [None]:
%%cuda -c "--gpu-architecture $gpu_arch"
#include <stdio.h>

/*
 * three_d_kernel: each thread prints its 3D block / thread coordinates together
 * with the flattened block id, flattened thread id and global id.
 *
 * Launch used in main():
 *     gridDim  = (2, 2, 2)  ->  8 blocks
 *     blockDim = (2, 2, 2)  ->  8 threads per block
 *
 * ---------------------------- BLOCK FLATTENING ----------------------------
 *
 * Block coordinates (x, y, z)              blockId = x + y*gridDim.x
 *                                                    + z*(gridDim.x*gridDim.y)
 *
 *     z=1:   (0,1,1) (1,1,1)               z=1:   6 7
 *            (0,0,1) (1,0,1)                      4 5
 *
 *     z=0:   (0,1,0) (1,1,0)               z=0:   2 3
 *            (0,0,0) (1,0,0)                      0 1
 *
 * --------------------------- THREAD FLATTENING ----------------------------
 *
 * Thread coordinates inside a block use the same layout with blockDim:
 *
 *     threadId = x + y*blockDim.x + z*(blockDim.x*blockDim.y)
 *
 *     z=1:   (0,1,1) (1,1,1)               z=1:   6 7
 *            (0,0,1) (1,0,1)                      4 5
 *
 *     z=0:   (0,1,0) (1,1,0)               z=0:   2 3
 *            (0,0,0) (1,0,0)                      0 1
 *
 * ------------------------ GLOBAL THREAD FLATTENING ------------------------
 *
 * Each block owns 8 consecutive ids:
 *
 *     block 0 ->  0..7
 *     block 1 ->  8..15
 *     block 2 -> 16..23
 *     ...
 *     block 7 -> 56..63
 *
 *     globalId = threadId + blockId*(blockDim.x*blockDim.y*blockDim.z)
 */
__global__ void three_d_kernel(){
    // Sizes of one z-layer, in blocks (grid) and in threads (block).
    const unsigned int blocksPerLayer = gridDim.x * gridDim.y;
    const unsigned int threadsPerLayer = blockDim.x * blockDim.y;

    const int flatBlock = blockIdx.z * blocksPerLayer
                        + blockIdx.y * gridDim.x
                        + blockIdx.x;

    const int flatThread = threadIdx.z * threadsPerLayer
                         + threadIdx.y * blockDim.x
                         + threadIdx.x;

    // A whole block is blockDim.z layers of threads.
    const unsigned int threadsPerBlock = threadsPerLayer * blockDim.z;
    const int flatGlobal = flatBlock * threadsPerBlock + flatThread;

    printf("Block(%d,%d,%d) blockId=%2d | Thread(%d,%d,%d) threadId=%2d | GlobalId=%2d\n",
        blockIdx.x, blockIdx.y, blockIdx.z, flatBlock,
        threadIdx.x, threadIdx.y, threadIdx.z, flatThread,
        flatGlobal);
}

// Launches three_d_kernel on a 2x2x2 grid of 2x2x2-thread blocks (64 threads
// total) and waits for the device to finish before exiting.
// Returns 0 on success, 1 if the launch or the kernel run reported a CUDA error.
int main(){
    dim3 blocks(2,2,2);        // 8 blocks
    dim3 threads(2,2,2);       // 8 threads per block

    three_d_kernel<<<blocks, threads>>>();

    // A <<<...>>> launch has no return value: launch errors are read back with
    // cudaGetLastError(); errors raised while the kernel runs come from the sync.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}


In [None]:
%%cuda -c "--gpu-architecture $gpu_arch"
// The cell magic above must stay on the first line of the cell; it references
// the Python variable gpu_arch (underscore, not hyphen) set by the GPU-detection cell.

#include <stdio.h>

// Host reference implementation: stores a[k] + b[k] into c[k] for every
// k in [0, n), walking the arrays front to back. Does nothing when n <= 0.
void vector_add_cpu(int *a,int *b, int *c , int n){
    int k = 0;
    while (k < n) {
        c[k] = a[k] + b[k];
        ++k;
    }
}


/*
 * GPU element-wise addition: c[i] = a[i] + b[i] for 0 <= i < n, one element
 * per thread, indexed by the 1D global thread id.
 *
 * a, b and c are dereferenced on the device, so they must point to
 * device-accessible memory holding at least n ints each.
 */
__global__ void vector_add_gpu(int *a, int *b, int *c, int n){
    int globalThreadId= threadIdx.x + blockIdx.x * blockDim.x;
    // The launch may create more threads than there are elements; the surplus
    // threads must not read or write past the end of the arrays.
    if(globalThreadId<n)
      c[globalThreadId]=a[globalThreadId]+b[globalThreadId];
}

// Allocates three host arrays of n ints and hands them back through the
// out-parameters a, b and c (each is the address of the caller's int*).
//   (*a)[i] = i
//   (*b)[i] = i + 1
//   (*c)[i] = 0      (value-initialised so the output buffer starts defined)
// The caller owns the arrays and must release each one with delete[].
void vector_init(int **a, int **b, int **c, int n){
    *a=new int[n];
    *b=new int[n];
    *c=new int[n]();
    for(int i=0;i<n;i++){
        // Parentheses are required: *a[i] would index the pointer-to-pointer.
        (*a)[i]=i;
        (*b)[i]=i+1;
    }
}

// Adds two small vectors on the CPU and on the GPU and compares the results.
//
// Steps:
//   1. build the host inputs and compute the CPU reference into c;
//   2. allocate device buffers and copy the inputs to the device;
//   3. launch vector_add_gpu with enough blocks to cover all n elements;
//   4. copy the device result back, print it and compare it with the reference;
//   5. release device and host memory.
//
// Returns 0 when the GPU result matches the CPU result, 1 on a CUDA error
// or a mismatch.
int main(){
    int *a = nullptr, *b = nullptr, *c = nullptr;
    const int n=5;
    const size_t bytes = n * sizeof(int);

    vector_init(&a,&b,&c,n);
    vector_add_cpu(a,b,c,n);            // c now holds the CPU reference

    // A kernel cannot dereference ordinary host pointers, so the inputs are
    // copied into device buffers and the output is copied back afterwards.
    int *gpu_result = new int[n];
    int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;

    // Each step runs only if every previous one succeeded; cleanup below is
    // shared by the success and failure paths.
    cudaError_t err = cudaMalloc(&d_a, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&d_b, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&d_c, bytes);
    if (err == cudaSuccess) err = cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        const int number_of_threads_per_block=4;
        // Round up so that blocks * threads >= n; the kernel's bounds check
        // idles the surplus threads of the last block.
        const int number_of_blocks=
            (n + number_of_threads_per_block - 1) / number_of_threads_per_block;
        vector_add_gpu<<<number_of_blocks, number_of_threads_per_block>>>(d_a, d_b, d_c, n);
        err = cudaGetLastError();       // launch/configuration errors
    }
    if (err == cudaSuccess) err = cudaDeviceSynchronize();   // execution errors
    if (err == cudaSuccess) err = cudaMemcpy(gpu_result, d_c, bytes, cudaMemcpyDeviceToHost);

    bool ok = (err == cudaSuccess);
    if (ok) {
        for (int i = 0; i < n; i++) {
            printf("%d + %d = %d (cpu %d)\n", a[i], b[i], gpu_result[i], c[i]);
            if (gpu_result[i] != c[i]) ok = false;
        }
        printf("%s\n", ok ? "GPU result matches CPU result" : "GPU result does NOT match CPU result");
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    // cudaFree accepts a null pointer, so this is safe after a partial failure.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    delete[] gpu_result;
    delete[] a;
    delete[] b;
    delete[] c;

    return ok ? 0 : 1;
}