<a href="https://colab.research.google.com/github/JayVeer18/CUDA-Programs/blob/main/Dot_Product.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Step 1: Remove any existing CUDA and NVIDIA packages

In [None]:
# Remove the cuda, nvidia* and libnvidia-* packages together with their configuration files.
!apt-get --purge remove cuda nvidia* libnvidia-*
# Purge every package whose dpkg listing line contains "cuda-", one package per dpkg call.
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
!apt-get remove cuda-*
# Remove dependencies that are no longer required, then refresh the package index.
!apt autoremove
!apt-get update

### Step 2: Download and install CUDA 9.2

In [None]:

# Download the CUDA 9.2 local repository package for Ubuntu 16.04 and save it with a .deb extension.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# Install the repository package and add the public key it ships to apt's trusted keys.
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
# Refresh the package index so the new repository is visible, then install cuda-9.2.
!apt-get update
!apt-get install cuda-9.2

### Step 3: Check the nvcc version

In [None]:
# Print the version of the nvcc compiler found on PATH.
!nvcc --version

### Step 4: Install the nvcc4jupyter plugin

In [None]:
# Install the nvcc4jupyter plugin straight from its GitHub repository.
# The HTTPS transport is used because GitHub no longer serves the
# unauthenticated git:// protocol, so a git+git:// URL fails to clone.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

### Step 5: Load the nvcc plugin extension

In [5]:
# Load the nvcc_plugin IPython extension; it is expected to provide the %%cu
# cell magic used by the CUDA cells below.
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


### Dot Product using Shared Memory

In [10]:
%%cu
#include <stdio.h>
#include <stdlib.h>
// Minimum of two values. Every argument use and the whole expansion are
// parenthesized so that an operator inside an argument (e.g. ?: or &) cannot
// re-associate with the comparison. Each argument may be evaluated twice, so
// do not pass expressions with side effects.
#define imin(a,b) ((a)<(b)?(a):(b))

// Number of elements in each input vector.
const int N = 33*1024;
// Threads per block. This is also the length of the kernel's shared-memory
// cache, and the halving reduction in the kernel needs it to be a power of two.
const int threadsperblock = 256;
// Blocks per grid: enough blocks to cover N elements, capped at 32.
const int blockspergrid = imin(32, (N+threadsperblock-1)/threadsperblock);
// Computes per-block partial sums of the dot product of a and b.
//
// Parameters:
//   a, b - device arrays holding N ints each.
//   c    - device array with one int per block; block k writes its partial
//          sum to c[k]. The caller adds the gridDim.x entries together.
//
// Launch requirements:
//   - 1D grid and 1D blocks.
//   - blockDim.x must not exceed threadsperblock (the size of the shared
//     cache) and must be a power of two, because the reduction halves the
//     active range on every step.
//
// Products and sums are accumulated in int, so large inputs wrap around.
__global__ void dot(int *a, int *b, int *c){
    __shared__ int cache[threadsperblock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheindex = threadIdx.x;
    int temp = 0;
    // Each thread accumulates the products of the elements it owns, stepping
    // by the total number of launched threads until the end of the vectors.
    while(tid < N){
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }
    cache[cacheindex] = temp;
    // Every thread's partial sum must be visible before the reduction starts.
    __syncthreads();
    // Tree reduction in shared memory: on each step the lower half of the
    // active range adds in the upper half.
    int i = blockDim.x/2;
    while(i != 0){
        if(cacheindex < i){
            cache[cacheindex] += cache[cacheindex + i];
        }
        // The barrier is outside the branch so that ALL threads of the block
        // reach it; a __syncthreads() inside a divergent branch is undefined
        // behaviour and can hang or produce wrong sums.
        __syncthreads();
        i /= 2;
    }
    // Thread 0 publishes the block's result.
    if(cacheindex == 0){
        c[blockIdx.x] = cache[0];
    }
}
// Aborts the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Fills two N-element vectors (a[i] = i, b[i] = 2*i), computes their dot
// product on the GPU with the dot kernel and on the CPU as a reference, and
// prints both values.
int main()
{
  const size_t size = N * sizeof(int);
  const size_t partialSize = blockspergrid * sizeof(int);

  int *a = new int[N];
  int *b = new int[N];
  int *partial_c = new int[blockspergrid];
  int *da = NULL, *db = NULL, *dc = NULL;

  checkCuda(cudaMalloc((void **)&da, size), "cudaMalloc(da)");
  checkCuda(cudaMalloc((void **)&db, size), "cudaMalloc(db)");
  checkCuda(cudaMalloc((void **)&dc, partialSize), "cudaMalloc(dc)");

  // The reference sum is accumulated in unsigned arithmetic: the products
  // exceed INT_MAX for i > 32767, and signed overflow is undefined behaviour
  // on the host, whereas unsigned arithmetic wraps modulo 2^32.
  unsigned int res = 0;
  for (int i = 0; i < N; i++) {
    a[i] = i;
    b[i] = i * 2;
    res += (unsigned int)a[i] * (unsigned int)b[i];
  }

  checkCuda(cudaMemcpy(da, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(da)");
  checkCuda(cudaMemcpy(db, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(db)");

  dot<<<blockspergrid,threadsperblock>>>(da, db, dc);
  // A kernel launch returns nothing; configuration errors show up here.
  checkCuda(cudaGetLastError(), "dot kernel launch");

  // This blocking copy also waits for the kernel and reports execution errors.
  checkCuda(cudaMemcpy(partial_c, dc, partialSize, cudaMemcpyDeviceToHost),
            "cudaMemcpy(partial_c)");

  // Final reduction of the per-block partial sums, again with wrapping
  // unsigned arithmetic.
  unsigned int c = 0;
  for (int i = 0; i < blockspergrid; i++) {
    c += (unsigned int)partial_c[i];
  }
  printf("Dot Product from GPU : %d \n Dot Product from CPU : %d",(int)c,(int)res);

  checkCuda(cudaFree(da), "cudaFree(da)");
  checkCuda(cudaFree(db), "cudaFree(db)");
  checkCuda(cudaFree(dc), "cudaFree(dc)");
  // Release the host buffers allocated with new[].
  delete[] a;
  delete[] b;
  delete[] partial_c;
  return 0;
}

Dot Product from GPU : 1005595648 
 Dot Product from CPU : 1005595648


### Dot Product using Atomics


In [19]:
%%cu
#include <stdio.h>
#include <stdlib.h>

// Number of elements in each input vector.
const int N = 500;

// Adds a[i]*b[i] for every i in [0, N) into *c using atomicAdd.
//
// Parameters:
//   a, b - device arrays holding N ints each.
//   c    - device pointer to the accumulator; the caller must initialise it
//          (to 0 for a plain dot product) before the launch.
//
// Works for any 1D launch configuration: each thread starts at its global
// index and advances by the total number of launched threads, and no element
// at or beyond N is read. With a single block of N threads each thread handles
// exactly one element.
__global__ void dotProduct(int *a, int *b, int *c)
{
  int stride = blockDim.x * gridDim.x;
  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += stride) {
    atomicAdd(c, a[i]*b[i]);
  }
}
// Aborts the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Fills two N-element vectors (a[i] = i, b[i] = 2*i), computes their dot
// product on the GPU with the atomic dotProduct kernel and on the CPU as a
// reference, and prints both values.
int main()
{
  const size_t size = N * sizeof(int);
  int *a = new int[N];
  int *b = new int[N];
  int *da = NULL, *db = NULL, *dc = NULL;

  int res = 0, c = 0;
  for (int i = 0; i < N; i++) {
    a[i] = i;
    b[i] = i * 2;
    res += (a[i] * b[i]);
  }

  checkCuda(cudaMalloc((void **)&da, size), "cudaMalloc(da)");
  checkCuda(cudaMalloc((void **)&db, size), "cudaMalloc(db)");
  checkCuda(cudaMalloc((void **)&dc, sizeof(int)), "cudaMalloc(dc)");

  checkCuda(cudaMemcpy(da, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(da)");
  checkCuda(cudaMemcpy(db, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(db)");
  // Copy the zero-valued c to the device so the accumulator starts at 0.
  checkCuda(cudaMemcpy(dc, &c, sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(dc)");

  dotProduct<<<1,N>>>(da, db, dc);
  // A kernel launch returns nothing; configuration errors show up here.
  checkCuda(cudaGetLastError(), "dotProduct kernel launch");

  // This blocking copy also waits for the kernel and reports execution errors.
  checkCuda(cudaMemcpy(&c, dc, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
  printf("Dot Product from GPU : %d \n Dot Product from CPU : %d",c,res);

  checkCuda(cudaFree(da), "cudaFree(da)");
  checkCuda(cudaFree(db), "cudaFree(db)");
  checkCuda(cudaFree(dc), "cudaFree(dc)");
  // Release the host buffers allocated with new[].
  delete[] a;
  delete[] b;
  return 0;
}

Dot Product from GPU : 83083500 
 Dot Product from CPU : 83083500
