In [2]:
! pip install git+git://github.com/frehseg/nvcc4jupyter.git

Collecting git+git://github.com/frehseg/nvcc4jupyter.git
  Cloning git://github.com/frehseg/nvcc4jupyter.git to /tmp/pip-req-build-_ghd6ecr
  Running command git clone -q git://github.com/frehseg/nvcc4jupyter.git /tmp/pip-req-build-_ghd6ecr
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.1-cp36-none-any.whl size=2095 sha256=e895917d61d9ee133a53c18ff56201de5ee4bd1f1e2cd6de066252e5985bee66
  Stored in directory: /tmp/pip-ephem-wheel-cache-9wzdbfvq/wheels/a4/a5/24/17a2b61f9a725a10155cc6fca753aae28436921df21fa16114
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.1


In [0]:
%load_ext nvcc_plugin

In [7]:
%%cu
#include <cuda_runtime.h>
#include <stdio.h>
#define DIM 1024


// Recursive Implementation 
int recursiveReduce(int *data, int const size)
{
    if (size == 1) return data[0];

    int const stride = size / 2;

    for (int i = 0; i < stride; i++)
        data[i] += data[i + stride];

    return recursiveReduce(data, stride);
}

__global__ void reduce1(int *g_idata, int *g_odata,
                                     unsigned int  n)
{
    extern __shared__ int smem[];

    // set thread ID
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // boundary check
    if (idx >= n) return;

    smem[tid] = g_idata[idx];
    __syncthreads();

    // in-place reduction in global memory
    for (int stride = 1; stride < blockDim.x; stride *= 2)
    {
        if ((tid % (2 * stride)) == 0)
        {
            smem[tid] += smem[tid + stride];
        }

        // synchronize within threadblock
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = smem[0];
}

__global__ void reduce2(int *g_idata, int *g_odata,
                                     unsigned int  n)
{
    extern __shared__ int smem[];

    // set thread ID
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // boundary check
    if (idx >= n) return;

    smem[tid] = g_idata[idx];
    __syncthreads();

    // in-place reduction in global memory
    for (int stride = 1; stride < blockDim.x; stride *= 2)
    {
        int index = 2 * stride * tid;
     
        if (index < blockDim.x)
        {
            smem[index] += smem[index + stride];
        }

        // synchronize within threadblock
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = smem[0];
}


__global__ void reduce3(int *g_idata, int *g_odata,
                                     unsigned int  n)
{
    extern __shared__ int smem[];

    // set thread ID
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // boundary check
    if (idx >= n) return;

    smem[tid] = g_idata[idx];
    __syncthreads();

    // in-place reduction in global memory
    for (int stride =blockDim.x/2; stride>0; stride>>=1)
    {     
        if (tid < stride)
        {
            smem[tid] += smem[tid + stride];
        }

        // synchronize within threadblock
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = smem[0];
}


__global__ void reduce_unroll(int *g_idata, int *g_odata, unsigned int n)
{
    extern __shared__ int smem[];

    // set thread ID
    unsigned int tid = threadIdx.x;

    // boundary check
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= n) return;

    // convert global data pointer to the local pointer of this block
    int *idata = g_idata + blockIdx.x * blockDim.x;

    // set to smem by each threads
    smem[tid] = idata[tid];
    __syncthreads();

    // in-place reduction in shared memory
    if (blockDim.x >= 1024 && tid < 512) smem[tid] += smem[tid + 512];

    __syncthreads();

    if (blockDim.x >= 512 && tid < 256) smem[tid] += smem[tid + 256];

    __syncthreads();

    if (blockDim.x >= 256 && tid < 128) smem[tid] += smem[tid + 128];

    __syncthreads();

    if (blockDim.x >= 128 && tid < 64)  smem[tid] += smem[tid + 64];

    __syncthreads();

    // unrolling warp
    if (tid < 32)
    {
        volatile int *vsmem = smem;
        vsmem[tid] += vsmem[tid + 32];
        vsmem[tid] += vsmem[tid + 16];
        vsmem[tid] += vsmem[tid +  8];
        vsmem[tid] += vsmem[tid +  4];
        vsmem[tid] += vsmem[tid +  2];
        vsmem[tid] += vsmem[tid +  1];
    }

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = smem[0];
}

int main()
{
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("device %d: %s ", dev, deviceProp.name);
    cudaSetDevice(dev);
    
    int power = 10000;

    int blocksize = DIM;   

    /* Number of elements to reduce */
    int size = 2 << power; 
    printf("With array size %d  :", size);

    dim3 block (blocksize, 1);
    dim3 grid  ((size + block.x - 1) / block.x, 1);
    printf("grid %d block %d\n", grid.x, block.x);
 
    double cpu_time = 0.0;
    double red1_time = 0.0;
    double red2_time = 0.0;
    double red3_time = 0.0;
    double red_unroll_time = 0.0;

    /* Allocate host memory */
    size_t bytes = size * sizeof(int);
    int *h_idata = (int *) malloc(bytes);
    int *h_odata = (int *) malloc(grid.x * sizeof(int));
    int *tmp     = (int *) malloc(bytes);

    /* Initialize the array */
    for (int i = 0; i < size; i++)
        h_idata[i] = (int)( rand() & 0xFF );

    memcpy (tmp, h_idata, bytes);

    int gpu_sum = 0;

    /* Allocate device memory */
    int *d_idata = NULL;
    int *d_odata = NULL;
    cudaMalloc((void **) &d_idata, bytes);
    cudaMalloc((void **) &d_odata, grid.x * sizeof(int));
 
    int n_iter = 10;
    for(int i = 0; i < n_iter; i++){

    /* Cpu reduction */
    clock_t start = clock();
    int cpu_sum = recursiveReduce (tmp, size);
    clock_t end = clock();
    cpu_time += ((double) end-start)/CLOCKS_PER_SEC; // in seconds
    if(i == 0){
    printf("cpu reduce: %d\n", cpu_sum);
    }

    /* reduce1 */
    cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice);
    start = clock();
    reduce1<<<grid.x, block, blocksize*sizeof(int)>>>(d_idata, d_odata,
            size);
    end = clock();
    cudaMemcpy(h_odata, d_odata, grid.x * sizeof(int),
                     cudaMemcpyDeviceToHost);
    gpu_sum = 0;

    for (int i = 0; i < grid.x; i++) gpu_sum += h_odata[i];
    red1_time += ((double) end-start)/CLOCKS_PER_SEC; // in seconds
    if(i == 0){
    printf("reduce1: %d <<<grid %d block %d>>>\n", gpu_sum, grid.x,
           block.x);
    }
 
     /* reduce2 */
    cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice);
    start = clock();
    reduce2<<<grid.x, block, blocksize*sizeof(int)>>>(d_idata, d_odata,
            size);
    end = clock();
    cudaMemcpy(h_odata, d_odata, grid.x * sizeof(int),
                     cudaMemcpyDeviceToHost);
    gpu_sum = 0;

    for (int i = 0; i < grid.x; i++) gpu_sum += h_odata[i];
    red2_time += ((double) end-start)/CLOCKS_PER_SEC; // in seconds
    if(i == 0){
    printf("reduce2: %d <<<grid %d block %d>>>\n", gpu_sum, grid.x,
           block.x);
    }

    /* reduce3 */
    cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice);
    start = clock();
    reduce3<<<grid.x, block, blocksize*sizeof(int)>>>(d_idata, d_odata,
            size);
    end = clock();
    cudaMemcpy(h_odata, d_odata, grid.x * sizeof(int),
                     cudaMemcpyDeviceToHost);
    gpu_sum = 0;

    for (int i = 0; i < grid.x; i++) gpu_sum += h_odata[i];
 
    red3_time += ((double) end-start)/CLOCKS_PER_SEC; // in seconds
    if(i == 0){
    printf("reduce3: %d <<<grid %d block %d>>>\n", gpu_sum, grid.x,
           block.x);
    }

    /* reduce_unroll */
    cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice);
    start = clock();
    reduce_unroll<<<grid.x, block, blocksize*sizeof(int)>>>(d_idata, d_odata,
            size);
    end = clock();
    cudaMemcpy(h_odata, d_odata, grid.x * sizeof(int),
                     cudaMemcpyDeviceToHost);
    gpu_sum = 0;

    for (int i = 0; i < grid.x; i++) gpu_sum += h_odata[i];
 
    red_unroll_time += ((double) end-start)/CLOCKS_PER_SEC; // in seconds
    if(i == 0){
    printf("reduce_unroll: %d <<<grid %d block %d>>>\n", gpu_sum, grid.x,
           block.x);
    }
    }
    printf("\nRecursive version kernel took %f seconds to execute \n", cpu_time/n_iter);
    printf("\nInterleaved addressing with divergent branching took %f seconds to execute \n", red1_time/n_iter);
    printf("\nInterleaved addressing with bank conflicts took %f seconds to execute \n", red2_time/n_iter);
    printf("\nSequential addressing took %f seconds to execute \n", red3_time/n_iter);
    printf("\nUnroll version took %f seconds to execute \n", red_unroll_time/n_iter);

    // free host memory
    free(h_idata);
    free(h_odata);

    // free device memory
    cudaFree(d_idata);
    cudaFree(d_odata);

    // reset device
    cudaDeviceReset();

    return EXIT_SUCCESS;
}

device 0: Tesla T4 With array size 131072  :grid 128 block 1024
cpu reduce: 16747465
reduce1: 16747465 <<<grid 128 block 1024>>>
reduce2: 16747465 <<<grid 128 block 1024>>>
reduce3: 16747465 <<<grid 128 block 1024>>>
reduce_unroll: 16747465 <<<grid 128 block 1024>>>

Recursive version kernel took 0.000443 seconds to execute 

Interleaved addressing with divergent branching took 0.000008 seconds to execute 

Interleaved addressing with bank conflicts took 0.000005 seconds to execute 

Sequential addressing took 0.000005 seconds to execute 

Unroll version took 0.000005 seconds to execute 



In [13]:
%%cu
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <time.h>


int main(void) {
  printf(" Thrust Library version \n");

  thrust::device_vector<int> D(131072, 1);
  int sum_t;
 
  for (int s = 1; s < 10; s++) {
    clock_t start_gpu = clock();
    sum_t = thrust::reduce(D.begin(), D.end());
    clock_t end_gpu = clock();
    int time_thrust = end_gpu - start_gpu;
    printf("  t = %d µs \n", time_thrust);
  }

  printf("\n   S = %d ", sum_t);


  return 0;
}

 Thrust Library version 
  t = 80 µs 
  t = 44 µs 
  t = 39 µs 
  t = 37 µs 
  t = 38 µs 
  t = 38 µs 
  t = 37 µs 
  t = 37 µs 
  t = 37 µs 

   S = 131072 
