In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-joj1pcxy
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-joj1pcxy
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=213d290f802b04ffab32d6894e75694bfdf512e825139868f298c0b342cf3760
  Stored in directory: /tmp/pip-ephem-wheel-cache-4t6nnhc3/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [None]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [18]:
%%cu
#include <cuda.h>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
using namespace std;

#define N 10

// Reduction #1: interleaved addressing with a divergent branch.
//
// Each block sums blockDim.x consecutive ints of g_idata and stores the
// partial sum in g_odata[blockIdx.x].
//
// Launch requirements:
//   * 1-D grid, 1-D block (any block size, power of two or not).
//   * Dynamic shared memory: blockDim.x * sizeof(int) bytes.
//   * g_idata holds at least gridDim.x * blockDim.x elements. The kernel has
//     no length parameter, so it cannot bounds-check the global load itself.
//   * g_odata holds at least gridDim.x elements.
//
// The `tid % s` test keeps the working threads scattered over every warp;
// that divergence is what this variant is meant to illustrate.
__global__ void reduce_1(int* g_idata, int* g_odata){

  extern __shared__ int sdata[];

  // Each thread loads one element from global to shared memory.
  int tid = threadIdx.x;
  // 64-bit index so blockIdx.x * blockDim.x cannot wrap on very large grids.
  size_t i = (size_t)blockIdx.x*blockDim.x + threadIdx.x;
  sdata[tid] = g_idata[i];

  __syncthreads();

  // Tree reduction in shared memory. `s` is the spacing between surviving
  // partial sums after this step; `half` is the distance to the partner.
  // Looping while half < dim (rather than s <= dim) runs the extra level a
  // non-power-of-two block needs to fold its tail into sdata[0].
  int dim = blockDim.x;
  for (int s=2; s/2<dim; s=s*2){
      int half = s/2;
      // The range guard skips partners that fall outside the block's data.
      if (tid%s == 0 && tid+half < dim)
          sdata[tid] += sdata[tid+half];

      // Kept outside the branch so every thread of the block reaches it.
      __syncthreads();
  }

  // Thread 0 publishes this block's partial sum.
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

// Reduction #2: interleaved addressing with strided indexing.
//
// Removes the divergent `tid % s` test of reduce_1 by letting consecutive
// threads own the surviving elements (index = 2 * s * tid). The resulting
// power-of-two strides into shared memory cause bank conflicts, which is the
// weakness this variant is meant to illustrate.
//
// Each block sums blockDim.x consecutive ints of g_idata and stores the
// partial sum in g_odata[blockIdx.x].
//
// Launch requirements:
//   * 1-D grid, 1-D block (any block size, power of two or not).
//   * Dynamic shared memory: blockDim.x * sizeof(int) bytes.
//   * g_idata holds at least gridDim.x * blockDim.x elements. The kernel has
//     no length parameter, so it cannot bounds-check the global load itself.
//   * g_odata holds at least gridDim.x elements.
__global__ void reduce_2(int* g_idata, int* g_odata){

  extern __shared__ int sdata[];

  // Each thread loads one element from global to shared memory.
  unsigned int tid = threadIdx.x;
  // 64-bit index so blockIdx.x * blockDim.x cannot wrap on very large grids.
  size_t i = (size_t)blockIdx.x*blockDim.x + threadIdx.x;
  sdata[tid] = g_idata[i];

  __syncthreads();

  // Tree reduction in shared memory; `s` is the distance to the partner.
  for (unsigned int s=1; s<blockDim.x; s*=2){
      // Unsigned throughout, so the comparison with blockDim.x is not a
      // signed/unsigned mix.
      unsigned int index = 2 * s * tid;

      // Guard on the partner (index + s), not just on index: for a
      // non-power-of-two block the partner can lie past the block's data.
      if (index + s < blockDim.x)  sdata[index] += sdata[index+s];

      // Kept outside the branch so every thread of the block reaches it.
      __syncthreads();
  }

  // Thread 0 publishes this block's partial sum.
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];

}

// Reduction #3: sequential addressing.
//
// At every step the lower part of the still-active range adds in the upper
// part, so the working threads are contiguous and each thread touches
// consecutive shared-memory words.
//
// Each block sums blockDim.x consecutive ints of g_idata and stores the
// partial sum in g_odata[blockIdx.x].
//
// Launch requirements:
//   * 1-D grid, 1-D block (any block size, power of two or not).
//   * Dynamic shared memory: blockDim.x * sizeof(int) bytes.
//   * g_idata holds at least gridDim.x * blockDim.x elements. The kernel has
//     no length parameter, so it cannot bounds-check the global load itself.
//   * g_odata holds at least gridDim.x elements.
__global__ void reduce_3(int* g_idata, int* g_odata){

  extern __shared__ int sdata[];

  // Each thread loads one element from global to shared memory.
  int tid = threadIdx.x;
  // 64-bit index so blockIdx.x * blockDim.x cannot wrap on very large grids.
  size_t i = (size_t)blockIdx.x*blockDim.x + threadIdx.x;
  sdata[tid] = g_idata[i];

  __syncthreads();

  // Tree reduction in shared memory. `active` is the number of partial sums
  // still live in sdata[0 .. active). Halving rounds up so that, when
  // `active` is odd, the unpaired middle element is carried to the next
  // step instead of being dropped.
  for (int active=blockDim.x; active>1; ){
      int half = (active + 1) / 2;
      // Writes land in [0, active - half), reads in [half, active); the two
      // ranges never overlap, so there is no race within a step.
      if (tid + half < active){
          sdata[tid] += sdata[tid+half];
      }

      // Kept outside the branch so every thread of the block reaches it.
      __syncthreads();
      active = half;
  }

  // Thread 0 publishes this block's partial sum.
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

// Reduction #4: first add during the global load.
//
// Every thread adds two input elements while loading, so a block of
// blockDim.x threads reduces a contiguous chunk of 2 * blockDim.x ints and
// no thread sits idle during the first reduction step. The remainder is the
// sequential-addressing tree reduction in shared memory.
//
// Block b sums g_idata[b * 2 * blockDim.x .. (b + 1) * 2 * blockDim.x) and
// stores the partial sum in g_odata[b].
//
// Launch requirements:
//   * 1-D grid, 1-D block (any block size, power of two or not).
//   * Dynamic shared memory: blockDim.x * sizeof(int) bytes.
//   * g_idata holds at least 2 * gridDim.x * blockDim.x elements. The kernel
//     has no length parameter, so it cannot bounds-check the global loads.
//   * g_odata holds at least gridDim.x elements.
__global__ void reduce_4(int* g_idata, int* g_odata){

  extern __shared__ int sdata[];

  // Each thread loads two elements, blockDim.x apart, and stores their sum.
  // Indexing relative to the block's own chunk replaces a hard-coded
  // `1023 - i` partner, which was only valid for exactly 1024 inputs.
  int tid = threadIdx.x;
  // 64-bit index so the block offset cannot wrap on very large grids.
  size_t i = (size_t)blockIdx.x*(blockDim.x*2) + threadIdx.x;
  sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];

  __syncthreads();

  // Tree reduction in shared memory. `active` is the number of partial sums
  // still live in sdata[0 .. active). Halving rounds up so that, when
  // `active` is odd, the unpaired middle element is carried to the next
  // step instead of being dropped.
  for (int active=blockDim.x; active>1; ){
      int half = (active + 1) / 2;
      // Writes land in [0, active - half), reads in [half, active); the two
      // ranges never overlap, so there is no race within a step.
      if (tid + half < active){
          sdata[tid] += sdata[tid+half];
      }

      // Kept outside the branch so every thread of the block reaches it.
      __syncthreads();
      active = half;
  }

  // Thread 0 publishes this block's partial sum.
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

// Terminates the program with a readable message when a CUDA runtime call
// reports anything other than cudaSuccess. `what` names the failing step.
static void checkCuda(cudaError_t status, const char* what){
  if (status != cudaSuccess) {
      fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
      exit(EXIT_FAILURE);
  }
}

// Sums 1024 ones on the GPU with reduce_4, prints the kernel time reported
// by CUDA events (milliseconds) and the resulting sum.
int main(){

  const int count = 1024;
  // reduce_4 consumes two inputs per thread, so half as many threads as
  // elements are launched, and shared memory is sized per thread.
  const int threads = count / 2;
  const size_t size = count * sizeof(int);

  int h[count];
  for (int i = 0; i < count; i++) {
      h[i] = 1;
  }

  cudaEvent_t start, stop;
  checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
  checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

  int* d = NULL;   // device input, `count` ints
  int* g = NULL;   // device output, one int (single block)
  int result = 0;

  checkCuda(cudaMalloc(&d, size), "cudaMalloc(input)");
  checkCuda(cudaMalloc(&g, sizeof(int)), "cudaMalloc(output)");
  checkCuda(cudaMemcpy(d, h, size, cudaMemcpyHostToDevice), "copy input to device");

  checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

  reduce_4<<<1, threads, threads*sizeof(int)>>>(d, g);

  checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
  // A kernel launch returns no status; a bad configuration is reported here.
  checkCuda(cudaGetLastError(), "reduce_4 launch");
  // The stop event must have completed before the elapsed time is read;
  // this also surfaces any fault raised while the kernel was running.
  checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

  float diff = 0.0f;
  checkCuda(cudaEventElapsedTime(&diff, start, stop), "cudaEventElapsedTime");

  checkCuda(cudaMemcpy(&result, g, sizeof(int), cudaMemcpyDeviceToHost), "copy result to host");

  std::cout<<std::endl<<"\nTime: "<< diff;
  std::cout << "\nSum is " << result << std::endl;

  // Release every resource acquired above.
  checkCuda(cudaFree(d), "cudaFree(input)");
  checkCuda(cudaFree(g), "cudaFree(output)");
  checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
  checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

  return 0;
}



Time: 0.020448
Sum is 1024

