## Defining Multi-Dimensional Kernels

In [1]:
%%writefile 1.cu

#include <stdio.h>
#include <cuda_runtime.h>

// Prints, for every launched thread, its block and thread coordinates together
// with a flattened global index.
//
// The index is built row-major at both levels:
//   block rank  = blockIdx.y * gridDim.x + blockIdx.x
//   thread rank = threadIdx.y * blockDim.x + threadIdx.x
//   global idx  = block rank * (threads per block) + thread rank
// Only the x/y dimensions are used; the z dimension does not enter the index.
__global__ void kernel(){
  // Copy the built-in coordinates into ints so they match the %d specifiers below.
  const int blockCol   = blockIdx.x;
  const int blockRow   = blockIdx.y;
  const int threadCol  = threadIdx.x;
  const int threadRow  = threadIdx.y;
  const int gridWidth  = gridDim.x;
  const int blockWidth = blockDim.x;

  const unsigned int blockRank      = blockRow * gridDim.x + blockCol;
  const unsigned int threadsInBlock = blockDim.x * blockDim.y;
  const unsigned int threadRank     = threadRow * blockDim.x + threadCol;

  // All threads of lower-ranked blocks come first, then this thread's rank in its block.
  const int globalIdx = blockRank * threadsInBlock + threadRank;

  printf("Block (%d, %d), Thread (%d, %d), Griddim.x (%d), Blockdim.x (%d), Global index: %d\n", blockCol, blockRow, threadCol, threadRow, gridWidth, blockWidth, globalIdx);
}

// Launches `kernel` on a 2x2 grid of 2x2 blocks (16 threads) and waits for it
// to finish. Returns 0 on success, 1 if the launch or the execution failed.
int main(){
  dim3 A(2,2);          // grid dimensions (blocks per grid)
  dim3 blockDim(2,2);   // block dimensions (threads per block)

  kernel<<<A, blockDim>>>();

  // A kernel launch returns no status; configuration errors are reported
  // through cudaGetLastError().
  cudaError_t err = cudaGetLastError();
  if(err != cudaSuccess){
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  // Block until the kernel completes so its printf output is flushed before
  // the process exits; errors raised during execution surface here.
  err = cudaDeviceSynchronize();
  if(err != cudaSuccess){
    fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  return 0;

}


Writing 1.cu


In [2]:
!nvcc -o 1 1.cu

In [3]:
!./1

Block (0, 1), Thread (0, 0), Griddim.x (2), Blockdim.x (2), Global index: 8
Block (0, 1), Thread (1, 0), Griddim.x (2), Blockdim.x (2), Global index: 9
Block (0, 1), Thread (0, 1), Griddim.x (2), Blockdim.x (2), Global index: 10
Block (0, 1), Thread (1, 1), Griddim.x (2), Blockdim.x (2), Global index: 11
Block (0, 0), Thread (0, 0), Griddim.x (2), Blockdim.x (2), Global index: 0
Block (0, 0), Thread (1, 0), Griddim.x (2), Blockdim.x (2), Global index: 1
Block (0, 0), Thread (0, 1), Griddim.x (2), Blockdim.x (2), Global index: 2
Block (0, 0), Thread (1, 1), Griddim.x (2), Blockdim.x (2), Global index: 3
Block (1, 1), Thread (0, 0), Griddim.x (2), Blockdim.x (2), Global index: 12
Block (1, 1), Thread (1, 0), Griddim.x (2), Blockdim.x (2), Global index: 13
Block (1, 1), Thread (0, 1), Griddim.x (2), Blockdim.x (2), Global index: 14
Block (1, 1), Thread (1, 1), Griddim.x (2), Blockdim.x (2), Global index: 15
Block (1, 0), Thread (0, 0), Griddim.x (2), Blockdim.x (2), Global index: 4
Block 

In [12]:
  %%writefile 2.cu

  #include <stdio.h>
  #include <cuda_runtime.h>

  #define M 4
  #define N 5

  // Element-wise sum of two M x N int matrices stored row-major:
  //   d_result[r][c] = d_array1[r][c] + d_array2[r][c]
  //
  // Launch layout: a 2D grid of 2D blocks that covers at least N threads in x
  // (columns) and M threads in y (rows). Threads outside the M x N range still
  // print their thread number but do not touch memory.
  //
  // d_result, d_array1, d_array2: buffers indexed here as row * N + col, so
  // each is expected to hold M*N ints.
  __global__ void addArrays2D(int *d_result, int* d_array1, int *d_array2){
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Global 2D coordinates of this thread across the whole grid.
    int row = by * blockDim.y + ty;
    int col = bx * blockDim.x + tx;

    // Flatten with the full grid width in threads (gridDim.x * blockDim.x).
    // Using blockDim.x alone is only unique when the grid has a single block
    // in x; with more blocks, different threads would share the same number.
    int tid = row * (gridDim.x * blockDim.x) + col;

    printf("Globally unique thread number: %d\n", tid);

    // Guard: the grid may be larger than the matrix when sizes are rounded up.
    if(row < M && col < N){
      int result = d_array1[row * N + col] + d_array2[row * N + col];
      d_result[row * N + col] = result;
    }
  }

  // Reports a failed CUDA runtime call on stderr.
  // Returns true when `status` is cudaSuccess, false otherwise.
  static bool cudaOk(cudaError_t status, const char *what){
    if(status != cudaSuccess){
      fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
      return false;
    }
    return true;
  }

  // Adds two M x N host matrices on the device with addArrays2D and prints the
  // result. Returns 0 on success, 1 if any CUDA call failed (the result is not
  // printed in that case).
  int main(){
    int host_array1[M][N] = {
      {1,2,3,4,5},
      {6,7,8,9,10},
      {11,12,13,14,15},
      {16,17,18,19,20},
    };
    int host_array2[M][N] = {
      {21,22,23,24,25},
      {26,27,28,29,30},
      {31,32,33,34,35},
      {36,37,38,39,40}
    };
    int host_result[M][N];

    const size_t bytes = M*N * sizeof(int);

    // Start from NULL so the cudaFree calls at the end are safe even when an
    // allocation failed part-way through.
    int *d_array1 = NULL, *d_array2 = NULL, *d_result = NULL;

    // && short-circuits: the first failing call stops the chain.
    bool ok = cudaOk(cudaMalloc((void**)&d_array1, bytes), "cudaMalloc(d_array1)")
           && cudaOk(cudaMalloc((void**)&d_array2, bytes), "cudaMalloc(d_array2)")
           && cudaOk(cudaMalloc((void**)&d_result, bytes), "cudaMalloc(d_result)")
           && cudaOk(cudaMemcpy(d_array1, host_array1, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_array1)")
           && cudaOk(cudaMemcpy(d_array2, host_array2, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_array2)");

    if(ok){
      // One thread per matrix element: block is N wide (columns) and M tall (rows).
      int threadsPerBlockX = N;
      int threadsPerBlockY = M;

      dim3 threadsPerBlock(threadsPerBlockX,threadsPerBlockY,1) ;

      // Ceiling division so the grid covers the whole matrix.
      int numBlockX = (N + threadsPerBlockX -1) / threadsPerBlockX;
      int numBlockY = (M + threadsPerBlockY -1) / threadsPerBlockY;
      dim3 blocksPerGrid(numBlockX, numBlockY, 1);

      addArrays2D<<<blocksPerGrid, threadsPerBlock>>> (d_result, d_array1, d_array2);

      // The launch itself returns no status; query it explicitly. The blocking
      // cudaMemcpy that follows waits for the kernel and reports execution errors.
      ok = cudaOk(cudaGetLastError(), "addArrays2D launch")
        && cudaOk(cudaMemcpy(host_result, d_result, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(host_result)");
    }

    if(ok){
      printf("Resulting Array: \n");
      for(int i=0; i<M; i++){
        for(int j=0; j<N; j++){
          printf("%d ", host_result[i][j]);
        }
        printf("\n");
      }
    }

    // Released on every path; cudaFree on a NULL pointer performs no operation.
    cudaFree(d_array1);
    cudaFree(d_array2);
    cudaFree(d_result);

    return ok ? 0 : 1;
  }

Overwriting 2.cu


In [13]:
!nvcc -o 2 2.cu

In [14]:
!./2

Globally unique thread number: 0
Globally unique thread number: 1
Globally unique thread number: 2
Globally unique thread number: 3
Globally unique thread number: 4
Globally unique thread number: 5
Globally unique thread number: 6
Globally unique thread number: 7
Globally unique thread number: 8
Globally unique thread number: 9
Globally unique thread number: 10
Globally unique thread number: 11
Globally unique thread number: 12
Globally unique thread number: 13
Globally unique thread number: 14
Globally unique thread number: 15
Globally unique thread number: 16
Globally unique thread number: 17
Globally unique thread number: 18
Globally unique thread number: 19
Resulting Array: 
22 24 26 28 30 
32 34 36 38 40 
42 44 46 48 50 
52 54 56 58 60 
