In [None]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-joj1pcxy
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-joj1pcxy
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=213d290f802b04ffab32d6894e75694bfdf512e825139868f298c0b342cf3760
  Stored in directory: /tmp/pip-ephem-wheel-cache-4t6nnhc3/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [None]:
# Load the nvcc_plugin IPython extension; the %%cu cells further down
# presumably rely on it to compile and run their CUDA source.
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


## Matrix transpose (on the CPU)

In [11]:
%%cu
#include <cuda.h>
#include <iostream>
using namespace std;

#define N 4

// Writes the transpose of the N x N row-major matrix `in` into `out`
// (out[r][c] = in[c][r]). Despite the name, no multiplication is performed.
// Both pointers must reference at least N*N ints.
void matrix_mul(int* in, int* out){
    // Walk the output in flat row-major order and derive (r, c) from the index.
    for (int idx = 0; idx < N * N; ++idx){
        const int r = idx / N;
        const int c = idx % N;
        out[idx] = in[c * N + r];
    }
}

int main(){
    
    int* in, * out ;
    size_t bytes = N * N * sizeof(int);
    srand(time(NULL));
    
    cudaMallocHost(&in, bytes);
    cudaMallocHost(&out, bytes);

    for (int i=0; i<N*N; i++){
        in[i] = rand()%5;
    }

    cout << "Values of array h_in:" << endl;
    for (int i=0; i<N; i++){
        for (int j=0; j<N; j++){
            cout << in[i*N+j] << " ";
        }
        cout << endl;
    }
  
    cout << "\n-------------START-----------\n";

    matrix_mul(in, out);

    for (int i=0; i<N; i++){
        for (int j=0; j<N; j++){
            cout << out[i*N+j] << " ";
        }
        cout << endl;
    }


  return 0;
}

Values of array h_in:
1 0 1 2 
4 3 3 4 
2 1 3 3 
1 3 0 2 

-------------START-----------
1 4 2 1 
0 3 1 3 
1 3 3 0 
2 4 3 2 



## Matrix transpose (on the GPU)


In [15]:
%%cu
#include <cuda.h>
#include <iostream>
#define N  4
#define Width  4
#define TILE_WIDTH  2
using namespace std;

// Transposes an N x N row-major int matrix: out[row][col] = in[col][row].
//
// Launch layout: 1-D grid of 1-D blocks. blockIdx.x selects the output row and
// threadIdx.x the output column, so <<<N, N>>> covers the whole matrix.
// Uses no shared memory. `in` and `out` are expected to be distinct device
// buffers of at least N*N ints each.
__global__ void matrix_transpose(int* in, int* out){

    int row = blockIdx.x;
    int col = threadIdx.x;

    // Guard so that a launch with more than N blocks or more than N threads
    // per block cannot index past the end of either buffer.
    if (row < N && col < N){
        out[row*N + col] = in[col*N + row];
    }
}

int main(){
    
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    int* h_in, * h_out;
    size_t bytes = N * N * sizeof(int);
    srand(time(NULL));
    
    cudaMallocHost(&h_in, bytes);
    cudaMallocHost(&h_out, bytes);

    for (int i=0; i<N*N; i++){
        h_in[i] = rand()%5;
    }

    cout << "Values of array h_in:" << endl;
    for (int i=0; i<N; i++){
        for (int j=0; j<N; j++){
            cout << h_in[i*N+j] << " ";
        }
        cout << endl;
    }

    cout << "\n-------------START-----------\n";

     # // Create device variables
    int* d_in, * d_out;

    # // Allocate memory on the device
    cudaMalloc(&d_in, bytes);
    cudaMalloc(&d_out, bytes);

    # // Copy data from the host to the device (CPU -> GPU)
    cudaMemcpy(d_in, h_in, bytes, cudaMemcpyHostToDevice);

    # // Lauch the kernel
    cudaEventRecord(start, 0);
    matrix_transpose<<<N, N>>>(d_in, d_out);
    cudaEventRecord(stop, 0);
    # // Copy data from the device to the host (GPU -> CPU)
    cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost);
 
    cout << endl << "values of array h_out " << endl;
    for (int i=0; i<N; i++){
        for (int j=0; j<N; j++){
            cout << h_out[i*N+j] << " ";
        }
        cout << endl;
    }

    float diff;
    cudaEventElapsedTime(&diff, start, stop);
    cout << "time: " << diff;

  return 0;
}

Values of array h_in:
4 1 3 3 
3 3 1 1 
2 1 4 4 
2 1 4 2 

-------------START-----------

values of array h_out 
4 3 2 2 
1 3 1 1 
3 1 4 4 
3 1 4 2 
time: 0.017696
