In [3]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-pe88ow06
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-pe88ow06
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=a383f1e8f23ba461b0995c1c0876691ccebb7b4e0e7698c3ea5b30f1435e1c62
  Stored in directory: /tmp/pip-ephem-wheel-cache-1q5af7hi/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [4]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [17]:
%%cu
#define inf 9999

__global__ void funct(int n, int k, float* x, int* qx) {

    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int j = ix & (n - 1);
    float temp2 = x[ix - j + k] + x[k * n + j];
    if (x[ix] > temp2) {
        x[ix] = temp2;
        qx[ix] = k;
    }
    if (x[ix] == inf) {
        qx[ix] = -2;
    }
}

__global__ void funct2(int n, int k, float* x, int* qx) {
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int j = ix & (n - 1);
    float temp2 = x[ix - j + k] + x[k * n + j];
    if (x[ix] > temp2) {
        x[ix] = temp2;
        qx[ix] = k;
    }
}


#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include  <time.h>
#include <sys/time.h>

int main(int argc, char **argv) {

    struct timeval first, second, lapsed, third, fourth, lapsed2;
    struct timezone tzp, tzp2;
    float *host_A, *host_D;
    int *host_Q;
    float *dev_x;
    int *dev_qx;
    float *A;
    int *Q;
    float *D;
    float tolerance = 0.001;

    int i, j, bk;
    int k = 0;
    int n = 512; // atoi(argv[1]);

    printf("\n");
    printf("RUNNING WITH %d VERTICES \n", n);
    printf("\n");

    cudaMalloc(&dev_x, n * n * sizeof (float));
    cudaMalloc(&dev_qx, n * n * sizeof (float));

    //CPU arrays
    A = (float *) malloc(n * n * sizeof (float)); //initial matrix A
    D = (float *) malloc(n * n * sizeof (float)); //initinal matrix D
    Q = (int *) malloc(n * n * sizeof (int)); //initinal matrix Q

    //GPU arrays
    host_A = (float *) malloc(n * n * sizeof (float));
    host_D = (float *) malloc(n * n * sizeof (float));
    host_Q = (int *) malloc(n * n * sizeof (int));

    srand(time(NULL));
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            if (i == j) {
                A[i * n + j] = 0;
            } else {
                A[i * n + j] = 1200 * (float) rand() / RAND_MAX + 1;
                if (A[i * n + j] > 1000) {
                    A[i * n + j] = inf;
                }
            }
        }
    }
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            Q[i * n + j] = -1;
        }
    }
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            D[i * n + j] = A[i * n + j];
        }

    }
    for (i = 0; i < n; i++) //copy of A to host_A
    {
        for (j = 0; j < n; j++) {
            host_A[i * n + j] = A[i * n + j];
        }

    }
    for (i = 0; i < n; i++) //copy of Q to host_Q
    {
        for (j = 0; j < n; j++) {
            host_Q[i * n + j] = Q[i * n + j];
        }
    }
    gettimeofday(&third, &tzp2);
    ////////////////////////////First Mem Copy////////////////////
    gettimeofday(&first, &tzp);
    cudaMemcpy(dev_x, host_A, n * n * sizeof (float), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_qx, host_Q, n * n * sizeof (int), cudaMemcpyHostToDevice);
    gettimeofday(&second, &tzp);
    if (first.tv_usec > second.tv_usec) {
        second.tv_usec += 1000000;
        second.tv_sec--;
    }
    lapsed.tv_usec = second.tv_usec - first.tv_usec;
    lapsed.tv_sec = second.tv_sec - first.tv_sec;
    printf("First Transfer CPU to GPU  Time elapsed: %lu, %lu s\n", lapsed.tv_sec, lapsed.tv_usec);
    ////////////////////////////////////////////////////GPU Calculation////////////////////////////////
    int gputhreads = 1;
    bk = (int) (n * n / gputhreads);
    
    if (bk > 0) {
        gputhreads = 1;
    } else {
        bk = 1;
        gputhreads = n*n;
    }
   
    printf(" \n");
    printf("BLOCKS :   %d      GPU THREADS:     %d \n", bk, gputhreads);
    printf(" \n");
    gettimeofday(&first, &tzp);
    funct << <bk, gputhreads>>>(n, k, dev_x, dev_qx);
    for (k = 1; k < n; k++) {
        funct2 << <bk, gputhreads>>>(n, k, dev_x, dev_qx);
    }
    cudaThreadSynchronize();
    gettimeofday(&second, &tzp);
    if (first.tv_usec > second.tv_usec) {
        second.tv_usec += 1000000;
        second.tv_sec--;
    }

    lapsed.tv_usec = second.tv_usec - first.tv_usec;
    lapsed.tv_sec = second.tv_sec - first.tv_sec;
    printf("GPU Calculation Time elapsed: %lu, %lu s\n", lapsed.tv_sec, lapsed.tv_usec);
    printf("\n");

    //////////////////////////////////////////////////////////////////////////Second Mem Copy////////////////////
    gettimeofday(&first, &tzp);
    cudaMemcpy(host_D, dev_x, n * n * sizeof (float), cudaMemcpyDeviceToHost);
    cudaMemcpy(host_Q, dev_qx, n * n * sizeof (int), cudaMemcpyDeviceToHost);
    gettimeofday(&second, &tzp);
    if (first.tv_usec > second.tv_usec) {
        second.tv_usec += 1000000;
        second.tv_sec--;
    }

    lapsed.tv_usec = second.tv_usec - first.tv_usec;
    lapsed.tv_sec = second.tv_sec - first.tv_sec;
    printf("Second Transfer GPU to CPU  Time elapsed: %lu, %lu s\n", lapsed.tv_sec, lapsed.tv_usec);
    printf("\n");
    //////////////////////////////////////////////////////////////////////

    gettimeofday(&fourth, &tzp2); //total time
    if (third.tv_usec > fourth.tv_usec) {
        fourth.tv_usec += 1000000;
        fourth.tv_sec--;
    }
    lapsed2.tv_usec = fourth.tv_usec - third.tv_usec;
    lapsed2.tv_sec = fourth.tv_sec - third.tv_sec;
    printf("TOTAL GPU + TRANSFERS  Time elapsed: %lu, %lu s\n", lapsed2.tv_sec, lapsed2.tv_usec);
    //////////////////////////////////////////////////////////////


    //CPU RUN 
    printf("\n");
    printf("\n");
    printf(" Now running on CPU... \n");
    printf("\n");
    gettimeofday(&first, &tzp);
    for (k = 0; k < n; k++) {
        for (i = 0; i < n; i++) {
            for (j = 0; j < n; j++) {

                if ((D[i * n + k] + D[k * n + j]) < D[i * n + j]) {
                    D[i * n + j] = D[i * n + k] + D[k * n + j];
                    Q[i * n + j] = k;
                }
                
            }
        }
    }
    /////////////////////////////////////////////////////////////////
    gettimeofday(&second, &tzp);
    if (first.tv_usec > second.tv_usec) {
        second.tv_usec += 1000000;
        second.tv_sec--;
    }
    lapsed.tv_usec = second.tv_usec - first.tv_usec;
    lapsed.tv_sec = second.tv_sec - first.tv_sec;
    printf("CPU Time elapsed: %lu,%06lu s\n", lapsed.tv_sec, lapsed.tv_usec);
    /////////////////////////////////////////////////////
    printf(" \n");
    printf(" \n");
    /////////////FROM HERE AND UNDER ARE VALIDATION RUNS

    printf("VALIDATING THAT D array from CPU and host_D array from GPU match... \n");
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            if (abs(D[i * n + j] - host_D[i * n + j]) > tolerance) {

                printf("ERROR MISMATCH in array D i %d j %d CPU SAYS %f and GPU SAYS %f \n", i, j, D[i * n + j], host_D[i * n + j]);
            }
        }
    }
    printf("OK \n");
   

    printf("ALL OK WE ARE DONE \n");
    return 0;
}


RUNNING WITH 512 VERTICES 

First Transfer CPU to GPU  Time elapsed: 0, 533 s
 
BLOCKS :   262144      GPU THREADS:     1 
 
GPU Calculation Time elapsed: 0, 227349 s

Second Transfer GPU to CPU  Time elapsed: 0, 719 s

TOTAL GPU + TRANSFERS  Time elapsed: 0, 228610 s


 Now running on CPU... 

CPU Time elapsed: 0,672676 s
 
 
VALIDATING THAT D array from CPU and host_D array from GPU match... 
OK 
ALL OK WE ARE DONE 

