In [None]:
# Install the nvcc4jupyter plugin over HTTPS: GitHub no longer serves the
# unauthenticated git:// protocol, so the previous git+git:// URL cannot be cloned.
# NOTE(review): this installs the repository's current HEAD; if `%load_ext nvcc_plugin`
# is not found afterwards, pin the revision this notebook was written against.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-joj1pcxy
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-joj1pcxy
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=213d290f802b04ffab32d6894e75694bfdf512e825139868f298c0b342cf3760
  Stored in directory: /tmp/pip-ephem-wheel-cache-4t6nnhc3/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [None]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


## Matrix Multiplication on the CPU

In [8]:
%%cu
#include <cuda.h>
#include <iostream>
using namespace std;

#define N 4

// Host-side product of two square N x N integer matrices stored row-major: c = a * b.
//   a, b : inputs, each read as N*N ints
//   c    : output, N*N ints; every element is overwritten
void matrix_mul(int* a, int* b, int* c){
    for (int r = 0; r < N; ++r) {
        // Row r of the left operand and of the result share the same offset.
        int* lhsRow = a + r * N;
        int* outRow = c + r * N;

        for (int j = 0; j < N; ++j) {
            // Dot product of row r of a with column j of b.
            int acc = 0;
            for (int k = 0; k < N; ++k) {
                acc += lhsRow[k] * b[k * N + j];
            }
            outRow[j] = acc;
        }
    }
}

// CPU demo driver: fills two N x N matrices with random values in [0, 4], prints them,
// multiplies them on the host with matrix_mul and prints the product.
// Returns 0 on success and 1 if a CUDA runtime call fails.
int main(){

    // Reports a failed CUDA runtime call on stderr; true means the call succeeded.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            cerr << what << " failed: " << cudaGetErrorString(status) << endl;
            return false;
        }
        return true;
    };

    // Prints one row-major N x N matrix, one row per line, each value followed by a space.
    auto printMatrix = [](const int* m) {
        for (int i=0; i<N; i++){
            for (int j=0; j<N; j++){
                cout << m[i*N+j] << " ";
            }
            cout << endl;
        }
    };

    int* h_a = nullptr, * h_b = nullptr, * h_c = nullptr;
    size_t bytes = N * N * sizeof(int);
    srand(time(NULL));

    // Releases whichever pinned host buffers were successfully allocated.
    auto release = [&]() {
        if (h_a) cudaFreeHost(h_a);
        if (h_b) cudaFreeHost(h_b);
        if (h_c) cudaFreeHost(h_c);
    };

    // Pinned host allocations can fail (e.g. no usable CUDA device); check each one
    // instead of writing through an unallocated pointer below.
    if (!(ok(cudaMallocHost(&h_a, bytes), "cudaMallocHost(h_a)") &&
          ok(cudaMallocHost(&h_b, bytes), "cudaMallocHost(h_b)") &&
          ok(cudaMallocHost(&h_c, bytes), "cudaMallocHost(h_c)"))) {
        release();
        return 1;
    }

    for (int i=0; i<N*N; i++){
        h_a[i] = rand()%5;
        h_b[i] = rand()%5;
    }

    cout << "Values of array h_a:" << endl;
    printMatrix(h_a);
    cout << "\nValues of array h_b:" << endl;
    printMatrix(h_b);
    cout << "\n-------------START-----------\n";

    matrix_mul(h_a, h_b, h_c);

    printMatrix(h_c);

    // Memory obtained from cudaMallocHost must be returned with cudaFreeHost.
    release();

    return 0;
}

Values of array h_a:
1 2 2 2 
1 3 2 4 
0 0 2 3 
2 2 1 2 

Values of array h_b:
4 2 0 4 
4 0 4 1 
0 0 0 4 
3 1 0 4 

-------------START-----------
18 4 8 22 
28 6 12 31 
9 3 0 20 
22 6 8 22 



## Matrix Multiplication on the GPU

In [10]:
%%cu
#include <cuda.h>
#include <iostream>
#define N  4
#define Width  4
#define TILE_WIDTH  2
using namespace std;

// Kernel: each thread computes one element of the row-major N x N product c = a * b.
//
// Launch layout: 2D grid of 2D blocks. The thread's global y index selects the row and
// its global x index selects the column. Threads that land outside the N x N matrix
// return without touching memory, so the grid may over-cover the matrix.
//
//   a, b : inputs, each read as N*N ints; must be dereferenceable from device code
//   c    : output, N*N ints; element (row, col) is written by exactly one thread
__global__ void matrix_mul(int* a, int* b, int* c){

    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail: without this, an over-covering launch reads and writes
    // past the end of the buffers.
    if (row >= N || col >= N) {
        return;
    }

    // Dot product of row `row` of a with column `col` of b.
    int sum = 0;
    for (int k=0; k<N; k++){
        sum += a[row*N + k] * b[k*N + col];
    }
    c[row*N+col] = sum;
}

// GPU demo driver: fills two N x N matrices with random values in [0, 4], prints them,
// copies them to the device, multiplies them with the matrix_mul kernel, copies the
// product back and prints it.
// Returns 0 on success and 1 if any CUDA runtime call or the kernel launch fails.
int main(){

    // The grid below is sized from Width while the kernel indexes with N, and the
    // division truncates; reject configurations where the launch would not cover
    // the matrix exactly.
    static_assert(Width == N && N % TILE_WIDTH == 0,
                  "Width must equal N and N must be a multiple of TILE_WIDTH");

    // Reports a failed CUDA runtime call on stderr; true means the call succeeded.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            cerr << what << " failed: " << cudaGetErrorString(status) << endl;
            return false;
        }
        return true;
    };

    // Prints one row-major N x N matrix, one row per line, each value followed by a space.
    auto printMatrix = [](const int* m) {
        for (int i=0; i<N; i++){
            for (int j=0; j<N; j++){
                cout << m[i*N+j] << " ";
            }
            cout << endl;
        }
    };

    int* h_a = nullptr, * h_b = nullptr, * h_c = nullptr;
    // Device buffers
    int* d_a = nullptr, * d_b = nullptr, * d_c = nullptr;
    size_t bytes = N * N * sizeof(int);
    srand(time(NULL));

    // Releases whichever device and pinned host buffers were successfully allocated.
    auto release = [&]() {
        if (d_a) cudaFree(d_a);
        if (d_b) cudaFree(d_b);
        if (d_c) cudaFree(d_c);
        if (h_a) cudaFreeHost(h_a);
        if (h_b) cudaFreeHost(h_b);
        if (h_c) cudaFreeHost(h_c);
    };

    if (!(ok(cudaMallocHost(&h_a, bytes), "cudaMallocHost(h_a)") &&
          ok(cudaMallocHost(&h_b, bytes), "cudaMallocHost(h_b)") &&
          ok(cudaMallocHost(&h_c, bytes), "cudaMallocHost(h_c)"))) {
        release();
        return 1;
    }

    for (int i=0; i<N*N; i++){
        h_a[i] = rand()%5;
        h_b[i] = rand()%5;
    }

    cout << "Values of array h_a:" << endl;
    printMatrix(h_a);
    cout << "\nValues of array h_b:" << endl;
    printMatrix(h_b);
    cout << "\n-------------START-----------\n";

    // Allocate memory on the device, then copy the inputs host -> device (CPU -> GPU)
    if (!(ok(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)") &&
          ok(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)") &&
          ok(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)") &&
          ok(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a <- h_a)") &&
          ok(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_b <- h_b)"))) {
        release();
        return 1;
    }

    dim3 dimGrid(Width/TILE_WIDTH, Width/TILE_WIDTH, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);

    // Launch the kernel
    matrix_mul<<<dimGrid, dimBlock>>>(d_a, d_b, d_c);

    // A launch returns no status itself: cudaGetLastError reports a bad launch
    // configuration, and the blocking device -> host copy (GPU -> CPU) reports
    // any fault raised while the kernel was running.
    if (!(ok(cudaGetLastError(), "matrix_mul launch") &&
          ok(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_c <- d_c)"))) {
        release();
        return 1;
    }

    cout << endl << "values of array h_c " << endl;
    printMatrix(h_c);

    release();

    return 0;
}

Values of array h_a:
1 2 2 4 
0 0 1 1 
4 1 4 1 
0 1 0 2 

Values of array h_b:
0 3 0 2 
2 1 4 0 
2 1 1 0 
4 4 1 1 

-------------START-----------

values of array h_c 
24 23 14 6 
6 5 2 1 
14 21 9 9 
10 9 6 2 

