In [None]:
# Clean out any CUDA / NVIDIA packages that are already installed on the machine.
# Purge the cuda meta-package plus every package matching nvidia* and libnvidia-*.
!apt-get --purge remove cuda nvidia* libnvidia-*
# List installed packages, keep the lines containing "cuda-", take the package-name
# column, and purge those packages one at a time.
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
# Remove anything matching cuda-* that is still registered with apt.
!apt-get remove cuda-*
# Drop dependencies that are no longer required by any installed package.
!apt autoremove
# Refresh the package index.
!apt-get update

In [None]:
# Download the CUDA 9.2 local-repository installer; the URL has no file extension,
# so -O saves it under a .deb name that dpkg can consume.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# Install the downloaded repository package.
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# Add the repository's public key to apt's trusted keys.
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
# Refresh the package index so the cuda-9.2 package becomes visible.
!apt-get update
# Install the CUDA 9.2 package.
!apt-get install cuda-9.2

In [None]:
# Print the version of the nvcc compiler found on PATH.
!nvcc --version

In [None]:
# Install the nvcc4jupyter plugin straight from its GitHub repository.
# The URL uses https because GitHub no longer serves the unauthenticated git:// protocol,
# so the former git+git:// form fails to clone.
# NOTE(review): this installs the repository's current default branch; newer revisions of
# the plugin may expose a different extension / cell-magic name than the `nvcc_plugin`
# and `%%cu` used later in this notebook — confirm against the installed version.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

In [None]:
# Load the nvcc_plugin IPython extension (the CUDA cell further down starts with %%cu).
%load_ext nvcc_plugin

In [33]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <stdio.h>
#include <fstream>

//#include "cpu_bitmap.h"

// Tile edge length: the kernel's shared-memory tiles are BLOCK_SIZE x BLOCK_SIZE floats,
// and main() launches thread blocks of BLOCK_SIZE x BLOCK_SIZE threads.
#define BLOCK_SIZE 2

// Edge length of the square matrices that main() builds and multiplies.
const int N = 2;

/*
 * Tiled matrix product c = a * b for square n x n float matrices stored row-major
 * in flat arrays.
 *
 * Launch layout: 2D thread blocks of exactly BLOCK_SIZE x BLOCK_SIZE threads (the
 * shared tiles are indexed directly by threadIdx). Each thread produces one element
 * of c: row = blockIdx.y * BLOCK_SIZE + threadIdx.y,
 *       col = blockIdx.x * BLOCK_SIZE + threadIdx.x.
 * Shared memory: two static BLOCK_SIZE x BLOCK_SIZE float tiles per block.
 *
 * n does not have to be a multiple of BLOCK_SIZE: tile elements that fall outside
 * the matrices are loaded as 0 and threads outside the output do not store. To
 * cover the whole output the grid needs ceil(n / BLOCK_SIZE) blocks along x and y.
 *
 * a, b : input matrices (device pointers, n * n floats each)
 * n    : matrix edge length
 * c    : output matrix (device pointer, n * n floats)
 */
__global__ void kernel(float * a, float * b, int n, float * c)
{
    const int tx = threadIdx.x, ty = threadIdx.y;

    // Output element owned by this thread.
    const int row = blockIdx.y * BLOCK_SIZE + ty;
    const int col = blockIdx.x * BLOCK_SIZE + tx;

    __shared__ float as[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float bs[BLOCK_SIZE][BLOCK_SIZE];

    float sum = 0.0f;

    // Walk the tiles along the shared dimension: a's tile moves right along the
    // row band, b's tile moves down along the column band.
    for (int tile = 0; tile < n; tile += BLOCK_SIZE)
    {
        const int aCol = tile + tx;
        const int bRow = tile + ty;

        // Out-of-range elements contribute 0 to the dot product. size_t keeps
        // the flat index from overflowing int for large n.
        as[ty][tx] = (row < n && aCol < n) ? a[(size_t)row * n + aCol] : 0.0f;
        bs[ty][tx] = (bRow < n && col < n) ? b[(size_t)bRow * n + col] : 0.0f;

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        for (int k = 0; k < BLOCK_SIZE; k++)
            sum += as[ty][k] * bs[k][tx];

        // Every thread must be done reading before the next iteration overwrites
        // the tiles. All threads reach both barriers: there is no early exit above.
        __syncthreads();
    }

    // Only threads that map to a real output element store a result.
    if (row < n && col < n)
        c[(size_t)row * n + col] = sum;
}

/*
 * Host-side reference matrix product: c = a * b for square N x N float matrices
 * stored row-major in flat arrays.
 *
 * a, b : input matrices (N * N floats each)
 * N    : matrix edge length (this parameter hides the file-level constant N)
 * c    : output matrix (N * N floats); every element is zeroed and then
 *        accumulated in place, one product at a time
 */
void CPU_keernel(float *a, float *b, int N, float *c)
{
    for (int row = 0; row < N; ++row)
    {
        float *outRow = c + row * N;   // row `row` of the result
        float *lhsRow = a + row * N;   // row `row` of the left operand

        for (int col = 0; col < N; ++col)
        {
            outRow[col] = 0;

            // Dot product of a's row with b's column, accumulated directly in c.
            for (int k = 0; k < N; ++k)
                outRow[col] += lhsRow[k] * b[k * N + col];
        }
    }
}

/*
 * Runs `kernel` on the GPU for two N x N host matrices and times the launch.
 *
 * a, b      : host input matrices (N * N floats each)
 * c         : host output buffer (N * N floats), filled only on success
 * elapsedMs : receives the time between the events recorded immediately before
 *             and after the kernel launch, in milliseconds
 *
 * Returns cudaSuccess, or the first CUDA error encountered. Device buffers and
 * events are released on every path.
 */
static cudaError_t multiplyOnGpu(float *a, float *b, float *c, float *elapsedMs)
{
    const size_t bytes = (size_t)N * N * sizeof(float);

    float *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    cudaEvent_t start = NULL, stop = NULL;

    const dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 blocks(N / threads.x, N / threads.y);

    // Each step runs only if everything before it succeeded, so `status` always
    // holds the first failure and the cleanup below is reached exactly once.
    cudaError_t status = cudaMalloc((void**)&dev_a, bytes);
    if (status == cudaSuccess) status = cudaMalloc((void**)&dev_b, bytes);
    if (status == cudaSuccess) status = cudaMalloc((void**)&dev_c, bytes);

    if (status == cudaSuccess) status = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (status == cudaSuccess) status = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);

    if (status == cudaSuccess) status = cudaEventCreate(&start);
    if (status == cudaSuccess) status = cudaEventCreate(&stop);
    if (status == cudaSuccess) status = cudaEventRecord(start, 0);

    if (status == cudaSuccess)
    {
        kernel<<<blocks, threads>>>(dev_a, dev_b, N, dev_c);
        // A launch does not return an error itself; a bad configuration is
        // reported through cudaGetLastError().
        status = cudaGetLastError();
    }

    if (status == cudaSuccess) status = cudaEventRecord(stop, 0);
    // Waiting on `stop` also surfaces errors raised while the kernel was running.
    if (status == cudaSuccess) status = cudaEventSynchronize(stop);
    if (status == cudaSuccess) status = cudaEventElapsedTime(elapsedMs, start, stop);

    if (status == cudaSuccess) status = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);

    if (start != NULL) cudaEventDestroy(start);
    if (stop != NULL) cudaEventDestroy(stop);
    cudaFree(dev_a);   // cudaFree(NULL) is a no-op, so no guards are needed
    cudaFree(dev_b);
    cudaFree(dev_c);

    return status;
}

/*
 * Fills two N x N matrices with constants (3 and 25), multiplies them on the
 * GPU, then prints the kernel time and the result matrix.
 *
 * Returns 0 on success, 1 if the configuration is unsupported or a CUDA call fails.
 */
int main()
{
    // The grid is N / BLOCK_SIZE blocks per axis (integer division), so a
    // remainder would leave the last rows and columns of the result uncomputed.
    if (N % BLOCK_SIZE != 0)
    {
        fprintf(stderr, "N (%d) must be a multiple of BLOCK_SIZE (%d)\n", N, BLOCK_SIZE);
        return 1;
    }

    // Element count, not byte count: new[] already scales by sizeof(float).
    const size_t count = (size_t)N * N;
    float *a = new float[count];
    float *b = new float[count];
    float *c = new float[count];

    for (size_t i = 0; i < count; i++)
    {
        a[i] = 3;//j * N + i;
        b[i] = 25;//j * N * 2 + i * 2;
    }

    float gpu_time = 0.0f;
    const cudaError_t status = multiplyOnGpu(a, b, c, &gpu_time);

    int exitCode = 0;
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        exitCode = 1;
    }
    else
    {
        printf("time on gpu = %2fmiliseconds\n\n", gpu_time);

        // NOTE(review): every printed value is the matrix element multiplied by
        // nn1 * nn2 (10^10 * 10^10), not the element itself. Kept as in the
        // original output format — confirm whether the scaling is still wanted.
        double nn1 = pow(10,10);
        double nn2 = pow(10,10);

        for (int j = 0; j < N; j++)
        {
            for (int i = 0; i < N; i++)
            {
                printf(" %2.f", c[j*N + i]*nn1*nn2);
            }

            printf("\n\n");
        }
    }

    delete[] a;
    delete[] b;
    delete[] c;

    return exitCode;

}

time on gpu = 0.130336miliseconds

 150000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 150000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

 150000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 150000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000


