In [None]:
pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [None]:
# Q1
%%writefile cuda_device_info.cu
#include <stdio.h>
#include <stdlib.h>

// Enumerates the CUDA devices visible to the runtime and prints a summary of
// each device's properties to stdout.
// Returns 0 on success, EXIT_FAILURE if a CUDA runtime query fails.
int main()
{
    // Start at 0 so a failed query can never leave an indeterminate count.
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }
    if (deviceCount == 0) {
        printf("There is no device supporting CUDA\n");
    }
    for (int dev = 0; dev < deviceCount; ++dev)
    {
        cudaDeviceProp deviceProp;
        err = cudaGetDeviceProperties(&deviceProp, dev);
        if (err != cudaSuccess) {
            // Do not print fields of a struct the runtime failed to fill in.
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n", dev, cudaGetErrorString(err));
            return EXIT_FAILURE;
        }
        // The device-count banner is printed once, before the first device.
        if (dev == 0) {
            if (deviceProp.major < 1) {
                printf("There is no device supporting CUDA.\n");
            } else if (deviceCount == 1) {
                printf("There is 1 device supporting CUDA\n");
            } else {
                printf("There are %d devices supporting CUDA\n", deviceCount);
            }
        }
        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
        printf("  Major revision number:                         %d\n", deviceProp.major);
        printf("  Minor revision number:                         %d\n", deviceProp.minor);
        printf("  Total amount of global memory:                 %zu bytes\n", deviceProp.totalGlobalMem);
        printf("  Total amount of constant memory:               %zu bytes\n", deviceProp.totalConstMem);
        printf("  Total amount of shared memory per block:       %zu bytes\n", deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Multiprocessor count:                          %d\n",deviceProp.multiProcessorCount );
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],  deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %zu bytes\n", deviceProp.memPitch);
        printf("  Texture alignment:                             %zu bytes\n", deviceProp.textureAlignment);
        printf("  Clock rate:                                    %d kilohertz\n", deviceProp.clockRate);
    }
    return 0;
}


Writing cuda_device_info.cu


In [None]:
!nvcc cuda_device_info.cu -o cuda_device_info

In [None]:
!./cuda_device_info

There is 1 device supporting CUDA

Device 0: "Tesla T4"
  Major revision number:                         7
  Minor revision number:                         5
  Total amount of global memory:                 15835660288 bytes
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Multiprocessor count:                          40
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     2147483647 x 65535 x 65535
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Clock rate:                                    1590000 kilohertz


In [10]:
%%writefile hello.cu
#include<stdio.h>

// Kernel: every thread prints its own x-index within the block.
// Only threadIdx.x is reported, so a 1D block is the natural launch shape.
__global__ void hello()
{
  const int tid = static_cast<int>(threadIdx.x);
  printf("Hello from thread : %d \n", tid);
}

// Launches `hello` with 1 block of 4 threads and waits for it to finish.
// Returns 0 on success, 1 if the launch or the kernel execution fails.
int main()
{
  hello<<<1,4>>>();

  // A kernel launch returns nothing; configuration errors are only visible
  // through cudaGetLastError().
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "hello launch failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  // Wait for the kernel to complete; execution errors are reported here.
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    fprintf(stderr, "hello execution failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  return 0;
}

Overwriting hello.cu


In [9]:
!nvcc hello.cu -o hello_cuda
!./hello_cuda

Hello from thread : 0 
Hello from thread : 1 
Hello from thread : 2 
Hello from thread : 3 
Hello from thread : 0 
Hello from thread : 1 
Hello from thread : 2 
Hello from thread : 3 


In [None]:
%%writefile hello_cuda2.cu
#include <stdio.h>

// Kernel: every thread prints its block index, its thread index within the
// block, and the flat global index derived from them (1D launch, x only).
__global__ void helloWorld() {
    int blockId = blockIdx.x;
    int threadId = threadIdx.x;
    // Flat index across the whole grid: threads of block b start at b * blockDim.x.
    int gi = threadId + blockId * blockDim.x;
    // The format previously ended in "\n ", which indented every following
    // line by one stray space; the line now ends cleanly at the newline.
    printf("Hello World from block %d, thread %d, global Id : %d\n", blockId, threadId, gi);
}

// Launches `helloWorld` with 3 blocks of 5 threads and waits for completion.
// Returns 0 on success, 1 if the launch or the kernel execution fails.
int main() {

    helloWorld<<<3, 5>>>();

    // Launch-configuration errors are only reported via cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "helloWorld launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Wait for the kernel to complete; execution errors are reported here.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "helloWorld execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    return 0;
}


Writing hello_cuda2.cu


In [None]:
!nvcc -o hello2 hello_cuda2.cu
!./hello2

Hello World from block 1, thread 0, global Id : 5
 Hello World from block 1, thread 1, global Id : 6
 Hello World from block 1, thread 2, global Id : 7
 Hello World from block 1, thread 3, global Id : 8
 Hello World from block 1, thread 4, global Id : 9
 Hello World from block 2, thread 0, global Id : 10
 Hello World from block 2, thread 1, global Id : 11
 Hello World from block 2, thread 2, global Id : 12
 Hello World from block 2, thread 3, global Id : 13
 Hello World from block 2, thread 4, global Id : 14
 Hello World from block 0, thread 0, global Id : 0
 Hello World from block 0, thread 1, global Id : 1
 Hello World from block 0, thread 2, global Id : 2
 Hello World from block 0, thread 3, global Id : 3
 Hello World from block 0, thread 4, global Id : 4
 

In [11]:
%%writefile hello3.cu
#include <stdio.h>

// Kernel: every thread prints its 2D global coordinate and the matching
// row-major 1D index for a 2D grid of 2D blocks.
__global__ void solve()
{
    // Global column / row of this thread across the whole grid.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Number of threads along x in the entire grid, i.e. the length of one row.
    const int rowWidth = gridDim.x * blockDim.x;

    // Row-major flattening of (col, row).
    const int flat = row * rowWidth + col;

    printf("2D (%d, %d) | 1D (%d)\n", col, row, flat);
}

// Launches `solve` on a 2x2 grid of 3x3 blocks and waits for completion.
// Returns 0 on success, 1 if the launch or the kernel execution fails.
int main()
{
  dim3 gridSize (2,2);
  dim3 blockSize(3,3);

  solve<<<gridSize, blockSize>>>();

  // Launch-configuration errors are only reported via cudaGetLastError().
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "solve launch failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  // Wait for the kernel to complete; execution errors are reported here.
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    fprintf(stderr, "solve execution failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  return 0;
}

Writing hello3.cu


In [12]:
!nvcc -o hello3 hello3.cu
!./hello3

2D (3, 0) | 1D (3)
2D (4, 0) | 1D (4)
2D (5, 0) | 1D (5)
2D (3, 1) | 1D (9)
2D (4, 1) | 1D (10)
2D (5, 1) | 1D (11)
2D (3, 2) | 1D (15)
2D (4, 2) | 1D (16)
2D (5, 2) | 1D (17)
2D (3, 3) | 1D (21)
2D (4, 3) | 1D (22)
2D (5, 3) | 1D (23)
2D (3, 4) | 1D (27)
2D (4, 4) | 1D (28)
2D (5, 4) | 1D (29)
2D (3, 5) | 1D (33)
2D (4, 5) | 1D (34)
2D (5, 5) | 1D (35)
2D (0, 3) | 1D (18)
2D (1, 3) | 1D (19)
2D (2, 3) | 1D (20)
2D (0, 4) | 1D (24)
2D (1, 4) | 1D (25)
2D (2, 4) | 1D (26)
2D (0, 5) | 1D (30)
2D (1, 5) | 1D (31)
2D (2, 5) | 1D (32)
2D (0, 0) | 1D (0)
2D (1, 0) | 1D (1)
2D (2, 0) | 1D (2)
2D (0, 1) | 1D (6)
2D (1, 1) | 1D (7)
2D (2, 1) | 1D (8)
2D (0, 2) | 1D (12)
2D (1, 2) | 1D (13)
2D (2, 2) | 1D (14)


**Practise**

In [None]:
%%writefile temp.cu
#include <stdio.h>
// Kernel: every thread prints its x-index within the block and the x-index
// of the block it belongs to.
__global__ void solve()
{
  const int tid = static_cast<int>(threadIdx.x);
  const int bid = static_cast<int>(blockIdx.x);
  printf("Hello from thread %d from block %d\n", tid, bid);
}

// Launches `solve` with 2 blocks of 5 threads and waits for completion.
// Returns 0 on success, 1 if the launch or the kernel execution fails.
int main()
{
  solve<<<2,5>>>();

  // Launch-configuration errors are only reported via cudaGetLastError().
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "solve launch failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  // Wait for the kernel to complete; execution errors are reported here.
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    fprintf(stderr, "solve execution failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  return 0;
}

Overwriting temp.cu


In [None]:
!nvcc -o temp temp.cu
! ./temp

Hello from thread 0 from block 0
Hello from thread 1 from block 0
Hello from thread 2 from block 0
Hello from thread 3 from block 0
Hello from thread 4 from block 0
Hello from thread 0 from block 1
Hello from thread 1 from block 1
Hello from thread 2 from block 1
Hello from thread 3 from block 1
Hello from thread 4 from block 1


In [24]:
%%writefile add.cu
#include<stdio.h>

// Kernel: adds the two integers pointed to by `a` and `b` and stores the sum
// through `c`. Every launched thread performs the same store, so the caller
// launches a single thread.
__global__ void add(int *a , int *b, int *c)
{
  const int lhs = *a;
  const int rhs = *b;
  *c = lhs + rhs;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess) return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Adds two integers on the device with a single-thread kernel and prints the
// sum. Returns 0 on success, 1 if any CUDA call fails (nothing is printed to
// stdout in that case, instead of an uninitialised value).
int main()
{
  int a = 2, b = 3, c = 0;

  // NULL-initialised so the cleanup below is safe on every path.
  int *d_a = NULL, *d_b = NULL, *d_c = NULL;

  int sz = sizeof(int);

  // && short-circuits: the first failure stops the remaining steps.
  bool ok = cudaOk(cudaMalloc((void **)&d_a, sz), "cudaMalloc(d_a)")
         && cudaOk(cudaMalloc((void **)&d_b, sz), "cudaMalloc(d_b)")
         && cudaOk(cudaMalloc((void **)&d_c, sz), "cudaMalloc(d_c)")
         && cudaOk(cudaMemcpy(d_a, &a, sz, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)")
         && cudaOk(cudaMemcpy(d_b, &b, sz, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");

  if (ok) {
    add<<<1,1>>>(d_a,d_b,d_c);

    // cudaGetLastError() surfaces launch errors; the device-to-host copy
    // that follows reports any error raised while the kernel ran.
    ok = cudaOk(cudaGetLastError(), "add kernel launch")
      && cudaOk(cudaMemcpy(&c, d_c, sz, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
  }

  if (ok) {
    printf("Sum : %d",c);
  }

  // cudaFree on a null pointer performs no operation.
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);

  return ok ? 0 : 1;
}

Overwriting add.cu


In [25]:
!nvcc -o add add.cu
!./add

Sum : 5

In [None]:
%%writefile addv.cu
#include<stdio.h>
#define N 4
// Kernel: element-wise sum c[i] = a[i] + b[i] for the first N elements.
// The index is threadIdx.x only, so elements are addressed per block-local
// thread index; threads with an index of N or more do nothing.
__global__ void add(int *a, int *b, int *c)
{
  const int i = threadIdx.x;
  if (i >= N) return;  // outside the N-element arrays

  c[i] = a[i] + b[i];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess) return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Adds two N-element host vectors on the device using one block of N threads
// and prints the result. Returns 0 on success, 1 if an allocation or a CUDA
// call fails.
int main()
{
  // NULL-initialised so the cleanup below is safe on every path.
  int *a = NULL, *b = NULL, *c = NULL;
  int *d_a = NULL, *d_b = NULL, *d_c = NULL;
  int size = N * sizeof(int);

  a = (int *)malloc(size);
  b = (int *)malloc(size);
  c = (int *)malloc(size);

  bool ok = (a != NULL) && (b != NULL) && (c != NULL);
  if (!ok) {
    fprintf(stderr, "host allocation failed\n");
  }

  // && short-circuits: the first failure stops the remaining steps.
  ok = ok
    && cudaOk(cudaMalloc((void **)&d_a, size), "cudaMalloc(d_a)")
    && cudaOk(cudaMalloc((void **)&d_b, size), "cudaMalloc(d_b)")
    && cudaOk(cudaMalloc((void **)&d_c, size), "cudaMalloc(d_c)");

  if (ok) {
    for (int i = 0; i < N; i++)
    {
      a[i] = i + 1;
      b[i] = i + 1;
    }

    ok = cudaOk(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)")
      && cudaOk(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");
  }

  if (ok) {
    add<<<1, N>>>(d_a, d_b, d_c);

    // cudaGetLastError() surfaces launch errors; the device-to-host copy
    // that follows reports any error raised while the kernel ran.
    ok = cudaOk(cudaGetLastError(), "add kernel launch")
      && cudaOk(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
  }

  if (ok) {
    for (int i = 0; i < N; i++)
    {
      printf(" %d ", c[i]);
    }
  }

  // Host buffers came from malloc, so they are released with free();
  // passing them to cudaFree was an invalid call that also leaked them.
  free(a);
  free(b);
  free(c);

  // cudaFree on a null pointer performs no operation.
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);

  return ok ? 0 : 1;
}

Overwriting addv.cu


In [None]:
!nvcc -o addv addv.cu
!./addv

 2  4  6  8 

In [None]:
%%writefile a.cu
#include<stdio.h>
#define N 12
#define M 2
// Kernel: element-wise sum c[i] = a[i] + b[i] over N elements, where i is the
// flat global thread index of a 1D launch. Threads whose index is N or more
// do nothing, so the grid may be larger than the data.
__global__ void add(int *a, int *b, int *c)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= N) return;  // surplus thread past the end of the arrays

  c[i] = a[i] + b[i];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess) return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Adds two N-element host vectors on the device using blocks of M threads
// (enough blocks to cover N) and prints the result. Returns 0 on success,
// 1 if an allocation or a CUDA call fails.
int main()
{
  // NULL-initialised so the cleanup below is safe on every path.
  int *a = NULL, *b = NULL, *c = NULL;
  int *d_a = NULL, *d_b = NULL, *d_c = NULL;
  int size = N * sizeof(int);

  a = (int *)malloc(size);
  b = (int *)malloc(size);
  c = (int *)malloc(size);

  bool ok = (a != NULL) && (b != NULL) && (c != NULL);
  if (!ok) {
    fprintf(stderr, "host allocation failed\n");
  }

  // && short-circuits: the first failure stops the remaining steps.
  ok = ok
    && cudaOk(cudaMalloc((void **)&d_a, size), "cudaMalloc(d_a)")
    && cudaOk(cudaMalloc((void **)&d_b, size), "cudaMalloc(d_b)")
    && cudaOk(cudaMalloc((void **)&d_c, size), "cudaMalloc(d_c)");

  if (ok) {
    for (int i = 0; i < N; i++)
    {
      a[i] = i + 1;
      b[i] = i + 1;
    }

    ok = cudaOk(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)")
      && cudaOk(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");
  }

  if (ok) {
    // Ceiling division: enough M-thread blocks to cover all N elements.
    add<<<(N+M-1)/M, M>>>(d_a, d_b, d_c);

    // cudaGetLastError() surfaces launch errors; the device-to-host copy
    // that follows reports any error raised while the kernel ran.
    ok = cudaOk(cudaGetLastError(), "add kernel launch")
      && cudaOk(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
  }

  if (ok) {
    for (int i = 0; i < N; i++)
    {
      printf(" %d ", c[i]);
    }
  }

  // Host buffers came from malloc, so they are released with free();
  // passing them to cudaFree was an invalid call that also leaked them.
  free(a);
  free(b);
  free(c);

  // cudaFree on a null pointer performs no operation.
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);

  return ok ? 0 : 1;
}

Overwriting a.cu


In [None]:
!nvcc -o a a.cu

!./a

 2  4  6  8  10  12  14  16  18  20  22  24 