In [None]:
%%writefile 1.cu

#include <stdio.h>
#include <cuda_runtime.h>

// Kernel: every launched thread prints its flat global index and stores
// a + b into result at that index.
//
// There is no bounds guard, so `result` must have room for at least
// gridDim.x * blockDim.x ints. Device printf is for demonstration only.
__global__ void add (int a, int b, int *result){
    // Flat 1D index: offset of this block plus the thread's position inside it.
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    const int sum = a + b;

    printf("Thread index: %d\n", tid);
    result[tid] = sum;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what){
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Launches add<<<blocks, threads>>>, waits for it to finish, copies `n`
// results back into h_result and prints them.
// Returns false (after reporting on stderr) if any step fails.
static bool runAndPrint(int blocks, int threads, int a, int b,
                        int *d_result, int *h_result, int n){
    // The kernel has no bounds guard, so refuse configurations that would
    // write past the end of the n-element device buffer.
    if (blocks * threads > n) {
        fprintf(stderr, "Launch of %d x %d threads exceeds buffer of %d ints\n",
                blocks, threads, n);
        return false;
    }

    add<<<blocks, threads>>>(a, b, d_result);

    // Launch-configuration errors are only visible through cudaGetLastError();
    // faults during execution surface at the synchronizing call.
    if (!cudaOk(cudaGetLastError(), "kernel launch")) {
        return false;
    }
    if (!cudaOk(cudaDeviceSynchronize(), "kernel execution")) {
        return false;
    }
    if (!cudaOk(cudaMemcpy(h_result, d_result, n * sizeof(int), cudaMemcpyDeviceToHost),
                "cudaMemcpy (device to host)")) {
        return false;
    }

    for(int i=0; i<n; ++i){
        printf("Thread %d result: %d\n", i, h_result[i]);
    }
    return true;
}

// Runs the add kernel over 10 elements with two launch shapes
// (1 block x 10 threads, then 10 blocks x 1 thread) and prints the results.
// Returns 0 on success, 1 if any CUDA call fails.
int main(){
    const int N = 10;
    int a = 5, b = 10;
    int h_result[N];
    int *d_result = NULL;

    if (!cudaOk(cudaMalloc(&d_result, N * sizeof(int)), "cudaMalloc")) {
        return 1;
    }

    printf("Running with add<<<1, 10>>>...\n");
    bool ok = runAndPrint(1, 10, a, b, d_result, h_result, N);

    if (ok) {
        printf("\n Running with add<<<10, 1>>>...\n");
        ok = runAndPrint(10, 1, a, b, d_result, h_result, N);
    }

    // Always release the device buffer, even when a run failed.
    const bool freed = cudaOk(cudaFree(d_result), "cudaFree");

    return (ok && freed) ? 0 : 1;

}

Overwriting 1.cu


In [None]:
!nvcc -o 1 1.cu

In [None]:
!./1

Running with add<<<1, 10>>>...
Thread index: 0
Thread index: 1
Thread index: 2
Thread index: 3
Thread index: 4
Thread index: 5
Thread index: 6
Thread index: 7
Thread index: 8
Thread index: 9
Thread 0 result: 15
Thread 1 result: 15
Thread 2 result: 15
Thread 3 result: 15
Thread 4 result: 15
Thread 5 result: 15
Thread 6 result: 15
Thread 7 result: 15
Thread 8 result: 15
Thread 9 result: 15

 Running with add<<<10, 1>>>...
Thread index: 0
Thread index: 5
Thread index: 2
Thread index: 7
Thread index: 3
Thread index: 8
Thread index: 1
Thread index: 6
Thread index: 4
Thread index: 9
Thread 0 result: 15
Thread 1 result: 15
Thread 2 result: 15
Thread 3 result: 15
Thread 4 result: 15
Thread 5 result: 15
Thread 6 result: 15
Thread 7 result: 15
Thread 8 result: 15
Thread 9 result: 15


In [None]:
%%writefile 2.cu

#include <stdio.h>
#include <cuda_runtime.h>

// Kernel: writes a + b into result[g] and prints g, where g is the calling
// thread's flat global index across the 1D grid.
//
// No bounds check is performed; the caller must launch no more threads in
// total than `result` has elements.
__global__ void add (int a, int b, int *result){
    const int blockStart = blockIdx.x * blockDim.x;   // first index owned by this block
    const int g = blockStart + threadIdx.x;

    printf("Thread index: %d\n", g);
    result[g] = a + b;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what){
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Launches add<<<blocks, threads>>>, waits for it to finish, copies `n`
// results back into h_result and prints them.
// Returns false (after reporting on stderr) if any step fails.
static bool runAndPrint(int blocks, int threads, int a, int b,
                        int *d_result, int *h_result, int n){
    // The kernel has no bounds guard, so refuse configurations that would
    // write past the end of the n-element device buffer.
    if (blocks * threads > n) {
        fprintf(stderr, "Launch of %d x %d threads exceeds buffer of %d ints\n",
                blocks, threads, n);
        return false;
    }

    add<<<blocks, threads>>>(a, b, d_result);

    // Without these checks a failed launch is silent: no device printf output
    // appears and the host just prints whatever the buffer happened to hold.
    if (!cudaOk(cudaGetLastError(), "kernel launch")) {
        return false;
    }
    if (!cudaOk(cudaDeviceSynchronize(), "kernel execution")) {
        return false;
    }
    if (!cudaOk(cudaMemcpy(h_result, d_result, n * sizeof(int), cudaMemcpyDeviceToHost),
                "cudaMemcpy (device to host)")) {
        return false;
    }

    for(int i=0; i<n; ++i){
        printf("Thread %d result: %d\n", i, h_result[i]);
    }
    return true;
}

// Runs the add kernel over 10 elements with three launch shapes
// (1x10, 10x1 and 2x5 blocks x threads) and prints the results of each.
// Returns 0 on success, 1 if any CUDA call fails.
int main(){
    const int N = 10;
    int a = 5, b = 10;
    int h_result[N];
    int *d_result = NULL;

    if (!cudaOk(cudaMalloc(&d_result, N * sizeof(int)), "cudaMalloc")) {
        return 1;
    }

    printf("Running with add<<<1, 10>>>...\n");
    bool ok = runAndPrint(1, 10, a, b, d_result, h_result, N);

    if (ok) {
        printf("\n Running with add<<<10, 1>>>...\n");
        ok = runAndPrint(10, 1, a, b, d_result, h_result, N);
    }

    if (ok) {
        // Banner now names the configuration that is actually launched (2 x 5).
        printf("\n Test3 Running with add<<<2, 5>>>...\n");
        ok = runAndPrint(2, 5, a, b, d_result, h_result, N);
    }

    // Always release the device buffer, even when a run failed.
    const bool freed = cudaOk(cudaFree(d_result), "cudaFree");

    return (ok && freed) ? 0 : 1;

}

Writing 1.cu


In [None]:
!nvcc -o 2 2.cu

In [None]:
!./2

Running with add<<<1, 10>>>...
Thread 0 result: 0
Thread 1 result: 0
Thread 2 result: 0
Thread 3 result: 0
Thread 4 result: 0
Thread 5 result: 0
Thread 6 result: 0
Thread 7 result: 0
Thread 8 result: 0
Thread 9 result: 0

 Running with add<<<10, 1>>>...
Thread 0 result: 0
Thread 1 result: 0
Thread 2 result: 0
Thread 3 result: 0
Thread 4 result: 0
Thread 5 result: 0
Thread 6 result: 0
Thread 7 result: 0
Thread 8 result: 0
Thread 9 result: 0


In [None]:
%%writefile 3.cu

#include <stdio.h>
#include <cuda_runtime.h>

// Kernel: element-wise sum c[i] = a[i] + b[i] for the first `size` elements.
//
// Each thread handles the single element at its flat global index. Threads
// whose index is >= size touch no memory and only report that they were idle.
// If fewer than `size` threads are launched, the remaining elements of `c`
// are left unwritten.
__global__ void add_arrays( int *c, const int *a, const int *b, int size){
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i < size){
    c[i] = a[i] + b[i];
    // Trailing newline keeps each thread's message on its own line.
    printf("This thread has done processing ... Thread id: %d\n", i);
  }else{
    printf("This thread has not done processing ... Thread id: %d\n", i);
  }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what){
  if (status == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Adds two fixed 5-element arrays on the GPU and prints the result.
// Returns 0 on success, 1 if any CUDA call fails.
int main(){
  const int size = 5;
  const size_t bytes = size * sizeof(int);
  int a[size]={1,2,3,4,5};
  int b[size]={1,2,3,4,5};
  // Host result buffer lives on the stack: nothing to free, and no risk of
  // handing a host pointer to cudaFree().
  int c[size]={0};

  int *d_a = NULL, *d_b = NULL, *d_c = NULL;

  bool ok = cudaOk(cudaMalloc((void **)&d_c, bytes), "cudaMalloc(d_c)")
         && cudaOk(cudaMalloc((void **)&d_a, bytes), "cudaMalloc(d_a)")
         && cudaOk(cudaMalloc((void **)&d_b, bytes), "cudaMalloc(d_b)")
         // Zero the output so elements no thread writes (fewer threads than
         // elements) print as 0 instead of indeterminate device memory.
         && cudaOk(cudaMemset(d_c, 0, bytes), "cudaMemset(d_c)")
         && cudaOk(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)")
         && cudaOk(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)");

  if (ok) {
    //add_arrays<<<1, 5>>>(d_c, d_a, d_b, size);           //Total No of thread = No of elements
    add_arrays<<<1, 3>>>(d_c, d_a, d_b, size);        //Total No of thread < No of elements
    // add_arrays<<<1, 10>>>(d_c, d_a, d_b, size);       //Total No of thread > No of elements

    // Launch errors are only reported via cudaGetLastError(); execution
    // faults surface at the synchronizing call.
    ok = cudaOk(cudaGetLastError(), "kernel launch")
      && cudaOk(cudaDeviceSynchronize(), "kernel execution")
      && cudaOk(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> c)");
  }

  if (ok) {
    for (int i=0; i<size; ++i){
      printf("%d ", c[i]);
    }
    printf("\n");
  }

  // Release device memory on every path; cudaFree(NULL) performs no operation,
  // so pointers whose allocation never happened are safe to pass.
  const bool freedA = cudaOk(cudaFree(d_a), "cudaFree(d_a)");
  const bool freedB = cudaOk(cudaFree(d_b), "cudaFree(d_b)");
  const bool freedC = cudaOk(cudaFree(d_c), "cudaFree(d_c)");

  return (ok && freedA && freedB && freedC) ? 0 : 1;
  }

Overwriting 3.cu


In [None]:
!nvcc -o 3 3.cu

In [None]:
!./3

0 0 0 0 0 


In [None]:
%%writefile 4.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel: element-wise sum of `a` and `b` into `c` over the first `size`
// elements, one element per thread (flat 1D global index).
//
// Threads indexed at or beyond `size` access no memory; they only print a
// message saying they did no work.
__global__ void add_arrays(int *c, const int *a, const int *b, int size) {
  const int tid = threadIdx.x + blockDim.x * blockIdx.x;

  // Surplus thread: report and leave without touching the arrays.
  if (tid >= size) {
    printf("This thread has not done processing ... Thread id: %d\n", tid);
    return;
  }

  c[tid] = a[tid] + b[tid];
  printf("This thread has done processing ... Thread id: %d\n", tid);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Prints `prompt`, then reads `count` integers from stdin into `dst`.
// Returns false (after reporting on stderr) if any value cannot be read,
// so the caller never works with uninitialized elements.
static bool readInts(const char *prompt, const char *label, int *dst, int count) {
  printf("%s", prompt);
  for (int i = 0; i < count; i++) {
    if (scanf("%d", &dst[i]) != 1) {
      fprintf(stderr, "Failed to read element %d of array %s\n", i, label);
      return false;
    }
  }
  return true;
}

// Reads two 5-element integer arrays from stdin, adds them on the GPU and
// prints the result. Returns 0 on success, 1 on bad input, allocation
// failure or any CUDA error.
int main() {
  const int size = 5;
  const size_t bytes = size * sizeof(int);
  int a[size];
  int b[size];

  if (!readInts("Enter the elements of array a: \n", "a", a, size)) {
    return 1;
  }
  if (!readInts("Enter the elements of array b: \n", "b", b, size)) {
    return 1;
  }

  int *c = (int *)malloc(bytes);
  if (c == NULL) {
    fprintf(stderr, "Host allocation of %d ints failed\n", size);
    return 1;
  }

  int *d_a = NULL, *d_b = NULL, *d_c = NULL;

  bool ok = cudaOk(cudaMalloc((void **)&d_c, bytes), "cudaMalloc(d_c)")
         && cudaOk(cudaMalloc((void **)&d_a, bytes), "cudaMalloc(d_a)")
         && cudaOk(cudaMalloc((void **)&d_b, bytes), "cudaMalloc(d_b)")
         // Zero the output so that, with the "Threads < Elements" variant,
         // unwritten elements print as 0 rather than indeterminate memory.
         && cudaOk(cudaMemset(d_c, 0, bytes), "cudaMemset(d_c)")
         && cudaOk(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)")
         && cudaOk(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)");

  if (ok) {
    // Launch kernel with different configurations
    // Uncomment one line at a time to test:
    // add_arrays<<<1, 5>>>(d_c, d_a, d_b, size);   // Threads == Elements
    // add_arrays<<<1, 3>>>(d_c, d_a, d_b, size);      // Threads < Elements
    add_arrays<<<1, 10>>>(d_c, d_a, d_b, size);  // Threads > Elements

    // Launch errors are only reported via cudaGetLastError(); execution
    // faults surface at the synchronizing call.
    ok = cudaOk(cudaGetLastError(), "kernel launch")
      && cudaOk(cudaDeviceSynchronize(), "kernel execution")
      && cudaOk(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> c)");
  }

  if (ok) {
    for (int i = 0; i < size; ++i) {
      printf("%d ", c[i]);
    }
    printf("\n");
  }

  // Release resources on every path; cudaFree(NULL) performs no operation,
  // so pointers whose allocation never happened are safe to pass.
  free(c);
  const bool freedA = cudaOk(cudaFree(d_a), "cudaFree(d_a)");
  const bool freedB = cudaOk(cudaFree(d_b), "cudaFree(d_b)");
  const bool freedC = cudaOk(cudaFree(d_c), "cudaFree(d_c)");

  return (ok && freedA && freedB && freedC) ? 0 : 1;
}


Overwriting 4.cu


In [None]:
!nvcc -o 4 4.cu


In [None]:
!./4

Enter the elements of array a: 
^C
