In [None]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-joj1pcxy
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-joj1pcxy
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=213d290f802b04ffab32d6894e75694bfdf512e825139868f298c0b342cf3760
  Stored in directory: /tmp/pip-ephem-wheel-cache-4t6nnhc3/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [None]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


## Stencil 1D operation in CPU 

In [3]:
%%cu
#include <cuda.h>
#include <iostream>
using namespace std;

// CPU reference: sliding-window sum with a fixed radius of 3 (window of 7).
//   in   - input values; elements in[0 .. tout+5] are read
//   out  - receives tout sums; out[i] = in[i] + in[i+1] + ... + in[i+6]
//   tout - number of output elements to produce
void stencil_1D(int in[], int out[], int tout){
    const int radius = 3;
    const int width = 2 * radius + 1;   // number of elements in one window

    for (int pos = 0; pos < tout; ++pos){
        // The window centred on in[pos + radius] starts at in[pos].
        const int* window = in + pos;
        int total = 0;
        for (int k = 0; k < width; ++k){
            total += window[k];
        }
        out[pos] = total;
    }
}

// Runs the CPU stencil over the values 1..20 and prints the resulting
// window sums on one line, separated by spaces.
int main(){
    const int radius = 3;                 // must match the radius hard-coded in stencil_1D
    const int n_in = 20;                  // number of input elements
    const int tout = n_in - 2 * radius;   // one output per full window: 14

    int arr[n_in] =  {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20};
    // tout is a compile-time constant, so this is a regular fixed-size array
    // rather than a (non-standard in C++) variable-length array.
    int out[tout];
    stencil_1D(arr, out, tout);

    for (int i=0; i<tout; i++){
        cout << out[i] << " ";
    }

  return 0;
}

28 35 42 49 56 63 70 77 84 91 98 105 112 119 


## Stencil 1D in GPU (Using Global memory)

In [4]:
%%cu
#include <cuda.h>
#include <iostream>
using namespace std;

// GPU stencil reading directly from global memory: each thread produces one
// output element as the sum of a 7-element window (radius 3).
//   in  - device pointer; elements in[0 .. len+5] must be readable
//   out - device pointer; receives len sums, out[i] = in[i] + ... + in[i+6]
//   len - number of output elements
// Launch layout: 1D grid of 1D blocks with at least `len` threads in total.
// Threads whose global index is >= len do nothing.
__global__ void stencil_1D(int in[], int out[], int len){

    const int radius = 3;

    // Global output index; equals threadIdx.x for a single-block launch.
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against launches with more threads than outputs, which would
    // otherwise read past the input and write past the output.
    if (i >= len) return;

    // The window centred on in[i + radius] spans in[i] .. in[i + 2*radius].
    int sum = 0;
    for (int offset = 0; offset <= 2 * radius; offset++){
        sum += in[i + offset];
    }
    out[i] = sum;

}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what){
    if (status == cudaSuccess) return true;
    cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << endl;
    return false;
}

// Runs the global-memory stencil kernel over the values 1..20 with a single
// block of 14 threads and prints the 14 window sums on one line.
// Returns 0 on success and 1 if any CUDA call fails.
int main(){
    const int radius = 3;                 // must match the radius hard-coded in the kernel
    const int n_in = 20;                  // number of input elements
    const int tout = n_in - 2 * radius;   // one output per full window: 14

    int h_in[n_in] =  {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20};
    int h_out[tout];
    const size_t bytes_in = n_in * sizeof(int);
    const size_t bytes_out = tout * sizeof(int);

    // Device-side (GPU-side) buffers, each sized for the data it holds.
    int* d_in = 0;
    int* d_out = 0;
    bool ok = cuda_ok(cudaMalloc(&d_in, bytes_in), "cudaMalloc(d_in)")
           && cuda_ok(cudaMalloc(&d_out, bytes_out), "cudaMalloc(d_out)")
           // Only the input goes host -> device; the output buffer is fully
           // written by the kernel, so it needs no initial upload.
           && cuda_ok(cudaMemcpy(d_in, h_in, bytes_in, cudaMemcpyHostToDevice),
                      "cudaMemcpy(host -> device)");

    if (ok){
        stencil_1D<<<1, tout>>>(d_in, d_out, tout);
        // A launch does not return a status; cudaGetLastError reports bad
        // launch configurations, and the blocking copy that follows reports
        // faults raised while the kernel ran.
        ok = cuda_ok(cudaGetLastError(), "stencil_1D launch")
          && cuda_ok(cudaMemcpy(h_out, d_out, bytes_out, cudaMemcpyDeviceToHost),
                     "cudaMemcpy(device -> host)");
    }

    if (ok){
        for (int i=0; i<tout; i++)
          cout << h_out[i] << " ";
        cout << endl;
    }

    // cudaFree accepts a null pointer, so this is safe after a failed allocation.
    cudaFree(d_in);
    cudaFree(d_out);

  return ok ? 0 : 1;
}

28 35 42 49 56 63 70 77 84 91 98 105 112 119 



## Stencil 1D with 2D array in GPU (Using Shared memory)

In [5]:
# 1D stencil applied to each row of a 2D array (GPU, shared memory)

%%cu
#include <cuda.h>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
using namespace std;

#define N 20
#define BLOCK_SIZE 14
#define RADIUS 3

// The kernel below stages one whole input row (BLOCK_SIZE outputs plus a halo
// of RADIUS elements on each side) in shared memory, so a row must be exactly
// that wide.
#if N != BLOCK_SIZE + 2*RADIUS
#error "stencil_1d_array2d requires N == BLOCK_SIZE + 2*RADIUS"
#endif


// Row-wise 1D stencil over a row-major 2D array, staged through shared memory.
//   in  - device pointer to gridDim.x rows of N ints each
//   out - device pointer to gridDim.x rows of BLOCK_SIZE ints each;
//         out[row*BLOCK_SIZE + t] = sum of in[row*N + t .. row*N + t + 2*RADIUS]
// Launch layout: one block per row, exactly BLOCK_SIZE threads per block.
// Shared memory: (BLOCK_SIZE + 2*RADIUS) ints per block, statically allocated.
__global__ void stencil_1d_array2d(int* in, int* out){

   __shared__ int temp[BLOCK_SIZE + 2*RADIUS];  // one input row: tile plus both halos

   int row = blockIdx.x;                        // each block handles one row
   // Input rows are N elements apart, output rows are blockDim.x elements
   // apart, so the two arrays need separate flat indices.
   int in_index  = row * N + threadIdx.x;
   int out_index = row * blockDim.x + threadIdx.x;
   int lindex = threadIdx.x + RADIUS;           // this thread's slot inside temp

   // Read input elements into shared memory: every thread loads the element
   // at the centre of its own window.
   temp[lindex] = in[in_index + RADIUS];

   // The first RADIUS threads also load the left and right halo elements.
   if (threadIdx.x < RADIUS){
       temp[lindex - RADIUS] = in[in_index];
       temp[lindex + BLOCK_SIZE] = in[(in_index + RADIUS) + BLOCK_SIZE];
   }

   // Synchronize so the whole row is in shared memory before any thread reads
   // its neighbours' elements.
   __syncthreads();

   // Apply the stencil
   int result = 0;
   for (int offset = -RADIUS; offset <= RADIUS; offset++)
       result += temp[lindex + offset];

   // Store the result
   out[out_index] = result;

}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_call_ok(cudaError_t status, const char* what){
  if (status == cudaSuccess) return true;
  cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << endl;
  return false;
}

// Fills an N x N array with random values in [0, 99], prints it, runs the
// shared-memory stencil kernel with one block per row, and prints the
// N x BLOCK_SIZE result. The success banner is printed only when every CUDA
// call succeeded. Returns 0 on success and 1 otherwise.
int main(){

  size_t bytes_in = sizeof(int) * N*N;
  size_t bytes_out = sizeof(int) * BLOCK_SIZE*N;
  srand(time(NULL));

  // Host-side (CPU-side) data
  int* h_a = NULL, * h_c = NULL;
  // Device-side (GPU-side) data
  int* d_a = NULL, * d_c = NULL;

  // Allocate pinned host memory
  bool ok = cuda_call_ok(cudaMallocHost(&h_a, bytes_in), "cudaMallocHost(h_a)")
         && cuda_call_ok(cudaMallocHost(&h_c, bytes_out), "cudaMallocHost(h_c)");

  if (ok){
    // Initialize the input with random numbers
    for (int i=0; i<N*N; i++)
      h_a[i] = rand() %100;

    cout<<"values of array a "<<endl;
    for (int i=0; i<N; i++){
      for (int j=0; j<N; j++)
        cout<<h_a[i*N+j]<<" ";
      cout<<endl;   // end of row i
    }

    cout << endl << "-------------------------------------------------------------" << endl;

    // Allocate memory on the device and copy the input over (CPU -> GPU)
    ok = cuda_call_ok(cudaMalloc(&d_a, bytes_in), "cudaMalloc(d_a)")
      && cuda_call_ok(cudaMalloc(&d_c, bytes_out), "cudaMalloc(d_c)")
      && cuda_call_ok(cudaMemcpy(d_a, h_a, bytes_in, cudaMemcpyHostToDevice),
                      "cudaMemcpy(host -> device)");
  }

  if (ok){
    // Launch the kernel: N blocks of BLOCK_SIZE threads
    stencil_1d_array2d<<<N, BLOCK_SIZE>>>(d_a, d_c);
    // A launch does not return a status; cudaGetLastError reports bad launch
    // configurations, and the blocking copy that follows reports faults
    // raised while the kernel ran.
    ok = cuda_call_ok(cudaGetLastError(), "stencil_1d_array2d launch")
      // Copy the result back (GPU -> CPU)
      && cuda_call_ok(cudaMemcpy(h_c, d_c, bytes_out, cudaMemcpyDeviceToHost),
                      "cudaMemcpy(device -> host)");
  }

  if (ok){
    cout << endl << "values of array c " << endl;
    for (int i=0; i<N; i++){
      for (int j=0; j<BLOCK_SIZE; j++)
        cout << h_c[i*BLOCK_SIZE+j] << " ";
      cout << endl;   // end of row i
    }
  }

  // Free pinned memory (only what was actually allocated)
  if (h_a) cudaFreeHost(h_a);
  if (h_c) cudaFreeHost(h_c);

  // Free memory on device; cudaFree accepts a null pointer
  cudaFree(d_a);
  cudaFree(d_c);

  if (ok)
    cout << endl << " COMPLETED SUCCESSFULLY\n";

  return ok ? 0 : 1;
}

values of array a 
23 25 98 42 13 64 94 46 72 29 25 17 49 44 54 62 59 35 54 98 
67 49 83 77 60 97 71 58 40 13 61 63 91 11 57 4 76 4 3 0 
33 28 69 35 25 75 49 36 62 3 34 29 53 69 59 65 67 30 24 7 
95 37 23 86 49 80 43 77 36 46 29 70 26 51 5 51 26 6 87 89 
10 74 70 15 43 81 32 10 63 8 70 59 46 93 97 47 25 40 24 62 
86 5 84 13 56 41 16 83 47 56 72 9 30 94 24 25 76 9 88 39 
17 58 98 63 3 96 10 80 88 86 94 75 92 78 40 0 71 56 83 71 
64 7 80 94 2 57 72 78 66 60 17 35 18 68 99 73 64 61 53 4 
0 48 31 44 78 71 44 2 80 28 73 44 35 5 91 89 14 15 67 32 
75 37 68 45 5 19 18 21 32 23 25 32 71 57 76 2 80 21 56 60 
1 81 57 88 86 0 78 53 15 97 85 90 34 5 35 39 24 5 12 57 
28 38 41 52 95 18 54 27 91 62 88 92 43 97 80 81 97 10 34 12 
8 72 54 94 77 41 86 2 46 98 11 74 36 52 78 83 22 32 11 65 
46 51 57 89 48 90 71 97 52 5 61 12 77 15 59 7 8 45 9 6 
43 72 80 32 76 11 67 51 95 30 16 42 33 26 31 33 68 2 30 72 
60 91 85 89 58 44 96 18 41 57 24 36 81 57 68 58 68 88 61 63 
70 77 5 4 55 37 37 23 91 20 96 3 11 33 