In [None]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-joj1pcxy
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-joj1pcxy
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=213d290f802b04ffab32d6894e75694bfdf512e825139868f298c0b342cf3760
  Stored in directory: /tmp/pip-ephem-wheel-cache-4t6nnhc3/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [None]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


# Scan operation on the CPU

In [22]:
%%cu
#include <iostream>

#define N 8

using namespace std;

// Sequential inclusive prefix sum (scan) on the host.
//
// After the call, output[j] == input[0] + ... + input[j] for every j in
// [0, length). Each result is also echoed to stdout on a single line that
// starts with "output is ".
//
// Parameters:
//   output - destination array; must hold at least `length` elements
//   input  - source array; must hold at least `length` elements
//   length - number of elements to scan; a value <= 0 makes the call a no-op
void scan( float* output, float* input, int length)
{
  // Empty range: return before touching element 0 of either array.
  if (length <= 0) {
    return;
  }

  // Inclusive scan: the first result is the first input itself
  // (an exclusive "prescan" would start from 0 instead).
  output[0] = input[0];
  std::cout << "output is " << output[0] << " ";

  // Every later element is its input plus the running total before it.
  for (int j = 1; j < length; ++j) {
    output[j] = input[j] + output[j - 1];
    std::cout << output[j] << " ";
  }
}


// Demo driver for the host scan: fills a five-element array with 1..5,
// prints it after "input is ", then lets scan() compute and print the
// running totals.
int main()
{
    const int count = 5;
    float input[count];
    float output[count];

    std::cout << "input is ";
    int idx = 0;
    while (idx < count)
    {
        // Element idx holds the value idx + 1, giving the sequence 1..5.
        input[idx] = idx + 1;
        std::cout << input[idx] << " ";
        ++idx;
    }
    std::cout << std::endl;

    scan(output, input, count);
    return 0;
}

input is 1 2 3 4 5 
output is 1 3 6 10 15 


# Scan operation on the GPU

In [25]:
%%cu
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cuda_runtime_api.h>
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <time.h>
#include <iostream>
#include "device_launch_parameters.h"

#define N 8


using namespace std;

// Single-block inclusive prefix sum (scan) using a doubling-offset scheme in
// shared memory: g_odata[i] = g_idata[0] + ... + g_idata[i].
//
// Launch requirements:
//   - one block (indexing uses threadIdx.x only), with blockDim.x >= n;
//     elements at indices >= blockDim.x are not processed
//   - dynamic shared memory of at least n * sizeof(int) bytes
//
// Parameters:
//   g_odata - output array, at least n ints
//   g_idata - input array, at least n ints
//   n       - number of elements to scan
__global__ void scan(int *g_odata, int *g_idata, int n)
{
  extern __shared__ int temp[]; // allocated on invocation
  const int thid = threadIdx.x;

  // Threads past the end of the data never touch memory, but they still
  // take part in every barrier below.
  const bool in_range = thid < n;

  // Load input into shared memory.
  if (in_range) {
    temp[thid] = g_idata[thid];
  }
  __syncthreads();

  for (int offset = 1; offset < n; offset *= 2) {
    const bool has_partner = in_range && thid >= offset;

    // Read phase: fetch the partial sum `offset` slots to the left.
    int t = 0;
    if (has_partner) {
      t = temp[thid - offset];
    }
    // Barriers sit outside the conditionals so every thread of the block
    // reaches them; __syncthreads() in a divergent branch is undefined.
    __syncthreads();

    // Write phase: safe only after all reads of this round have finished.
    if (has_partner) {
      temp[thid] += t;
    }
    __syncthreads();
  }

  // Write output.
  if (in_range) {
    g_odata[thid] = temp[thid];
  }
}




// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess. `what` names the failing step.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Demo driver for the GPU scan: fills N random digits on the host, runs the
// scan kernel in a single block of N threads, prints input and output, and
// checks the device result against a sequential running sum.
// Returns 0 on success, EXIT_FAILURE if the device result is wrong; any CUDA
// runtime error terminates the program through checkCuda().
int main()
{
    const size_t bytes_in = sizeof(int) * N;
    const size_t bytes_out = sizeof(int) * N;
    srand(time(NULL));

    // Host-side (h_*) and device-side (d_*) buffers.
    int* h_a = NULL;
    int* h_c = NULL;
    int* d_a = NULL;
    int* d_c = NULL;

    // Allocate pinned host memory.
    checkCuda(cudaMallocHost(&h_a, bytes_in), "cudaMallocHost(h_a)");
    checkCuda(cudaMallocHost(&h_c, bytes_out), "cudaMallocHost(h_c)");

    // Fill the input with random digits 0..9.
    for (int i = 0; i < N; i++) {
        h_a[i] = rand() % 10;
    }
    std::cout << "values of array a " << std::endl;
    for (int i = 0; i < N; i++) {
        std::cout << h_a[i] << " ";
    }
    std::cout << std::endl << "----------------- " << std::endl;

    // Allocate memory on the device.
    checkCuda(cudaMalloc(&d_a, bytes_in), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc(&d_c, bytes_out), "cudaMalloc(d_c)");

    // Copy data from the host to the device (CPU -> GPU).
    checkCuda(cudaMemcpy(d_a, h_a, bytes_in, cudaMemcpyHostToDevice),
              "cudaMemcpy(host -> device)");

    // One block of N threads; the third launch argument sizes the kernel's
    // dynamic shared-memory array (one int per element).
    scan<<<1, N, bytes_out>>>(d_c, d_a, N);
    // A kernel launch returns nothing; a rejected launch configuration only
    // shows up through cudaGetLastError().
    checkCuda(cudaGetLastError(), "scan kernel launch");

    // Blocking copy: waits for the kernel to finish, so execution errors
    // are reported here.
    checkCuda(cudaMemcpy(h_c, d_c, bytes_out, cudaMemcpyDeviceToHost),
              "cudaMemcpy(device -> host)");

    std::cout << std::endl << "values of array c " << std::endl;
    for (int i = 0; i < N; i++) {
        std::cout << h_c[i] << " ";
    }

    // Verify against a sequential inclusive running sum before claiming
    // success.
    bool matches = true;
    int running = 0;
    for (int i = 0; i < N; i++) {
        running += h_a[i];
        if (h_c[i] != running) {
            matches = false;
        }
    }

    // Free pinned memory.
    checkCuda(cudaFreeHost(h_a), "cudaFreeHost(h_a)");
    checkCuda(cudaFreeHost(h_c), "cudaFreeHost(h_c)");

    // Free memory on device.
    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");

    if (!matches) {
        std::cerr << std::endl
                  << "SCAN RESULT DOES NOT MATCH CPU REFERENCE" << std::endl;
        return EXIT_FAILURE;
    }

    std::cout << std::endl << "\nCOMPLETED SUCCESSFULLY\n";

    return 0;
}

values of array a 
6 7 9 1 5 4 4 7 
----------------- 

values of array c 
6 13 22 23 28 32 36 43 

COMPLETED SUCCESSFULLY

