In [None]:
# GitHub no longer serves the unauthenticated git:// protocol, so fetch the plugin over HTTPS.
# NOTE(review): this installs the repository's current default branch - confirm it still
# provides the `nvcc_plugin` extension and the `%%cu` magic used by the cells below.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-joj1pcxy
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-joj1pcxy
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=213d290f802b04ffab32d6894e75694bfdf512e825139868f298c0b342cf3760
  Stored in directory: /tmp/pip-ephem-wheel-cache-4t6nnhc3/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [None]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


## Stencil 2D (on the CPU)

In [6]:
%%cu
#include <cuda.h>
#include <iostream>
using namespace std;

#define N 4

// Host implementation of a 5-point (plus-shaped) stencil over the interior of
// an N x N row-major grid.
//
//   in       - input grid, indexed as in[row*N + col]
//   out      - destination; the sum for interior cell (i, j) is stored at
//              out[i*N + j], i.e. result rows are N ints apart (not size_out),
//              so the caller must provide at least (size_out-1)*N + size_out ints
//   size_out - number of interior rows and columns to process
//
// Each result is the value at (i+1, j+1) plus its left, right, upper and lower
// neighbours in the input grid.
void stencil_2D(int in[], int out[], int size_out){

  for (int row = 0; row < size_out; ++row){
      // Offsets of the three input rows touched by this output row.
      const int upper_row  = row * N;
      const int centre_row = (row + 1) * N;
      const int lower_row  = (row + 2) * N;

      for (int col = 0; col < size_out; ++col){
          const int c = col + 1;   // column of the stencil centre in the input

          int sum = in[centre_row + c];    // centre
          sum += in[centre_row + c - 1];   // left neighbour
          sum += in[centre_row + c + 1];   // right neighbour
          sum += in[upper_row + c];        // neighbour above
          sum += in[lower_row + c];        // neighbour below

          // Output keeps the input's row stride: row*N + col.
          out[upper_row + col] = sum;
      }
  }
}

// Demo driver for the host stencil: fills an N x N grid with random digits,
// prints it, runs stencil_2D over the interior and prints the
// (N-2) x (N-2) result. Returns 0.
int main(){
  const int size_out = N - 2;   // interior cells per dimension

  int in[N*N];
  // stencil_2D stores result row i starting at out[i*N], so the buffer has to
  // hold size_out rows of N ints each. A compact (N-2)*(N-2) array is too
  // small: the last element written/read is at (N-3)*N + (N-3).
  int out[(N-2)*N] = {0};
  srand(time(NULL));

  // Random digits 0..9 as input data.
  for (int i=0; i<N*N; i++){
      in[i] = rand()%10;
  }

  // Print the input grid row by row.
  for (int i=0; i<N; i++){
      for (int j=0; j<N; j++){
          cout << in[i*N+j] << " ";
      }
      cout << endl;
  }

  cout << "-------------Result-----------\n";

  stencil_2D(in, out, size_out);

  // Print the interior results; they are laid out with row stride N.
  for (int i=0; i<size_out; i++){
      for (int j=0; j<size_out; j++){
          cout << out[i*N+j] << " ";
      }
      cout << endl;
  }


  return 0;
}

3 7 2 4 
3 7 0 6 
3 7 5 3 
3 4 5 0 
-------------Result-----------
24 20 
26 20 



## Stencil 2D on the GPU

In [7]:
%%cu
#include <cuda.h>
#include <iostream>
using namespace std;

#define N 4

// 5-point (plus-shaped) stencil kernel over the interior of an N x N
// row-major grid; one thread computes one interior cell.
//
// Launch layout: 1-D grid of 1-D blocks. blockIdx.x selects the interior row
// and threadIdx.x the interior column, so the intended launch is
// <<<N-2, N-2>>>. No shared memory is used.
//
//   in  - input grid, indexed as in[row*N + col]; must be readable from the device
//   out - destination; the sum for interior cell (i, j) is stored at
//         out[i*N + j], i.e. result rows are N ints apart, so the buffer
//         needs at least (N-3)*N + (N-2) ints
__global__ void stencil_2D(const int in[], int out[]){

    const int i = blockIdx.x;    // interior row
    const int j = threadIdx.x;   // interior column

    // Threads from an oversized launch must not index outside the grid.
    if (i >= N-2 || j >= N-2) return;

    const int index = (i+1)*N+(j+1);      // stencil centre in the input grid
    const int left  = index - 1;
    const int right = index + 1;
    const int above = i*N + (j+1);
    const int below = (i+2)*N + (j+1);

    out[i*N + j] = in[index] + in[left] + in[right] + in[above] + in[below];

}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what){
    if (status == cudaSuccess) return true;
    cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
    return false;
}

// Demo driver for the GPU stencil: fills an N x N grid with random digits,
// prints it, runs the stencil_2D kernel over the interior and prints the
// (N-2) x (N-2) result. Returns 0 on success and 1 if any CUDA call failed.
int main(){

    const int size_out = N - 2;   // interior cells per dimension

    size_t bytes_in = N * N * sizeof(int);
    // The kernel stores result row i starting at out[i*N], so both the device
    // and the host result buffers must hold size_out rows of N ints each;
    // (N-2)*(N-2) ints would be overrun by the kernel and by the print loop.
    size_t bytes_out = (N-2) * N * sizeof(int);

    int* in = NULL;      // pinned host input grid
    int* out = NULL;     // pinned host result buffer
    int* d_in = NULL;    // device copy of the input grid
    int* d_out = NULL;   // device result buffer

    srand(time(NULL));

    bool ok = cuda_ok(cudaMallocHost(&in, bytes_in), "cudaMallocHost(in)")
           && cuda_ok(cudaMallocHost(&out, bytes_out), "cudaMallocHost(out)");

    if (ok){
        // Random digits 0..9 as input data.
        for (int i=0; i<N*N; i++){
            in[i] = rand()%10;
        }

        // Print the input grid row by row.
        for (int i=0; i<N; i++){
            for (int j=0; j<N; j++){
                cout << in[i*N+j] << " ";
            }
            cout << endl;
        }

        cout << "-------------START-----------\n";

        // Allocate memory on the device and copy the input over (CPU -> GPU).
        ok = cuda_ok(cudaMalloc(&d_in, bytes_in), "cudaMalloc(d_in)")
          && cuda_ok(cudaMalloc(&d_out, bytes_out), "cudaMalloc(d_out)")
          && cuda_ok(cudaMemcpy(d_in, in, bytes_in, cudaMemcpyHostToDevice),
                     "copy input to device");
    }

    if (ok){
        // Launch the kernel: one block per interior row, one thread per interior column.
        stencil_2D<<<size_out, size_out>>>(d_in, d_out);

        // A launch reports configuration errors only through cudaGetLastError().
        // The blocking device -> host copy waits for the kernel to finish, so
        // `out` is complete before it is read below.
        ok = cuda_ok(cudaGetLastError(), "stencil_2D launch")
          && cuda_ok(cudaMemcpy(out, d_out, bytes_out, cudaMemcpyDeviceToHost),
                     "copy result to host");
    }

    if (ok){
        // Print the interior results; they are laid out with row stride N.
        for (int i=0; i<size_out; i++){
            for (int j=0; j<size_out; j++){
                cout << out[i*N+j] << " ";
            }
            cout << endl;
        }
    }

    // Release everything that was successfully allocated.
    if (d_in)  cudaFree(d_in);
    if (d_out) cudaFree(d_out);
    if (in)    cudaFreeHost(in);
    if (out)   cudaFreeHost(out);

  return ok ? 0 : 1;
}

9 5 0 9 
9 0 6 0 
8 4 1 9 
8 4 8 8 
-------------START-----------
24 7 
17 28 

