In [None]:
# Remove any CUDA / NVIDIA packages already on the machine before installing
# a specific CUDA version.
!apt-get --purge remove cuda nvidia* libnvidia-*
# Purge every remaining dpkg package whose listing line contains "cuda-".
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
!apt-get remove cuda-*
# Drop packages that were only installed as dependencies of the removed ones.
!apt autoremove
# Refresh the package index.
!apt-get update

In [None]:
# Download the CUDA 9.2 local-repository package (Ubuntu 16.04, amd64) and
# save it with a .deb extension so dpkg can install it.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# Register the local repository with the package manager.
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# Trust the signing key shipped inside the local repository.
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
# Re-read package lists so the new repository is visible, then install CUDA.
!apt-get update
!apt-get install cuda-9.2

In [None]:
# Print the version of the nvcc compiler found on PATH to confirm the install.
!nvcc --version

In [None]:
# Install the nvcc4jupyter plugin straight from its GitHub repository.
# The repository is fetched over HTTPS: GitHub no longer serves the
# unauthenticated git:// protocol, so a git+git:// URL fails to clone.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

In [None]:
# Load the nvcc_plugin IPython extension installed above
# (presumably this is what registers the %%cu cell magic used by the cells below).
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [None]:
%%cu
#include<iostream>
#include<cstdlib>
using namespace std;

// Element-wise vector addition: result[i] = a[i] + b[i] for every i in [0, n).
//
// Launch layout: 1-D grid of 1-D blocks, no shared memory. Each thread starts
// at its global index and then advances by the total number of launched
// threads, so every element is processed no matter how many blocks are used.
// The loop bound is strictly `i < n`: index n is one past the end of the
// arrays and must never be touched.
__global__ void vectorAdd(int *a, int *b, int *result, int n) {
    int stride = gridDim.x * blockDim.x;
    for(int i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) {
        result[i] = a[i] + b[i];
    }
}

// Prints the first N entries of `a` on one line, each preceded by two spaces.
void print_array(int *a, int N) {
    for(int i=0; i<N; i++) {
        cout<<"  "<<a[i];
    }
    cout<<endl;
}

// Fills the first N entries of `a` with pseudo-random values in [1, 10000].
void init_array(int *a, int N) {
    for(int i=0; i<N; i++) {
        a[i] = rand()%10000 + 1;
    }
}

// Reports a failed CUDA runtime call on stderr; returns true when it succeeded.
static bool cudaOk(cudaError_t status, const char *what) {
    if(status == cudaSuccess) {
        return true;
    }
    cerr<<"CUDA error ("<<what<<") : "<<cudaGetErrorString(status)<<endl;
    return false;
}

// Adds two random vectors of n ints on the GPU, prints the inputs, the result
// and the kernel time measured with CUDA events. Returns 0 on success and 1 if
// any allocation or CUDA call failed.
int main() {
    int *a, *b, *c;
    int *a_dev = 0, *b_dev = 0, *c_dev = 0;
    int n = 10000;           //24

    // Byte size shared by the host and device buffers.
    int size = n * sizeof(int);

    a = (int*)malloc(size);
    b = (int*)malloc(size);
    c = (int*)malloc(size);
    if(a == 0 || b == 0 || c == 0) {
        cerr<<"Host allocation failed"<<endl;
        free(a);
        free(b);
        free(c);
        return 1;
    }

    init_array(a, n);
    init_array(b, n);

    print_array(a, n);
    print_array(b, n);

    cudaEvent_t start, end;
    bool haveStart = cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)");
    bool haveEnd = cudaOk(cudaEventCreate(&end), "cudaEventCreate(end)");

    // Short-circuit chain: the first failing call stops everything after it.
    bool ok = haveStart && haveEnd
        && cudaOk(cudaMalloc(&a_dev, size), "cudaMalloc(a_dev)")
        && cudaOk(cudaMalloc(&b_dev, size), "cudaMalloc(b_dev)")
        && cudaOk(cudaMalloc(&c_dev, size), "cudaMalloc(c_dev)")
        && cudaOk(cudaMemcpy(a_dev, a, size, cudaMemcpyHostToDevice), "copy a to device")
        && cudaOk(cudaMemcpy(b_dev, b, size, cudaMemcpyHostToDevice), "copy b to device");

    float time = 0.0f;
    if(ok) {
        // Enough blocks to give every element its own thread (ceil-division).
        int threads = 1024;
        int blocks = (n+threads-1)/threads;

        bool started = cudaOk(cudaEventRecord(start), "cudaEventRecord(start)");
        vectorAdd<<<blocks,threads>>>(a_dev, b_dev, c_dev, n);

        // A launch does not return a status; cudaGetLastError reports bad
        // launch configurations and the synchronize reports execution faults.
        ok = started
            && cudaOk(cudaGetLastError(), "vectorAdd launch")
            && cudaOk(cudaEventRecord(end), "cudaEventRecord(end)")
            && cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")
            && cudaOk(cudaEventElapsedTime(&time, start, end), "cudaEventElapsedTime")
            && cudaOk(cudaMemcpy(c, c_dev, size, cudaMemcpyDeviceToHost), "copy result to host");
    }

    if(ok) {
        cout<<"Results : "<<endl;
        print_array(c, n);

        cout<<"Time elapsed : "<<time<<endl;
    }

    // Release everything on every path; cudaFree and free accept null pointers.
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);
    if(haveStart) {
        cudaEventDestroy(start);
    }
    if(haveEnd) {
        cudaEventDestroy(end);
    }

    free(a);
    free(b);
    free(c);

    return ok ? 0 : 1;
}

  9384  887  2778  6916  7794  8336  5387  493  6650  1422  2363  28  8691  60  7764  3927  541  3427  9173  5737  5212  5369  2568  6430  5783  1531  2863  5124  4068  3136  3930  9803  4023  3059  3070  8168  1394  8457  5012  8043  6230  7374  4422  4920  3785  8538  5199  4325  8316  4371  6414  3527  6092  8981  9957  1874  6863  9171  6997  7282  2306  926  7085  6328  337  6506  847  1730  1314  5858  6125  3896  9583  546  8815  3368  5435  365  4044  3751  1088  6809  7277  7179  5789  3585  5404  2652  2755  2400  9933  5061  9677  3369  7740  13  6227  8587  8095  7540  796  571  1435  379  7468  6602  98  2903  3318  493  6653  757  7302  281  4287  9442  3866  9690  8445  6620  8441  4730  8032  8118  8098  5772  4482  676  710  8928  4568  7857  9498  2354  4587  6966  5307  4684  6220  8625  1529  2872  5733  8830  9504  20  8271  3369  9709  6716  6341  8150  7797  724  2619  2246  2847  3452  2922  3556  2380  7489  7765  8229  9842  2351  5194  1501  7035  7765  125  

In [None]:
%%cu
#include<iostream>

using namespace std;

// Vector-matrix product: result[c] = sum over i in [0, n) of vec[i] * mat[i*m + c]
// for every column c in [0, m).
//
// vec holds n entries, mat is indexed as n rows of m entries (row-major), and
// result holds m entries. Launch layout: 1-D grid of 1-D blocks with at least
// m threads in total; thread `tid` produces output column `tid`.
__global__
void matrixVector(int *vec, int *mat, int *result, int n, int m)
{
    int tid = blockIdx.x*blockDim.x + threadIdx.x;

    // The guard is on the number of output columns (m), not on the length of
    // the summation (n): tid selects a column of mat and a slot of result.
    if(tid < m) {
        int sum=0;
        for(int i=0; i<n; i++) {
            sum += vec[i]*mat[(i*m) + tid];
        }
        result[tid] = sum;
    }
}

// Populates a[0..n-1] with pseudo-random integers drawn from 1..n.
void init_array(int *a, int n) {
    int idx = 0;
    while(idx < n) {
        a[idx] = 1 + rand()%n;
        ++idx;
    }
}

// Populates an n-by-m row-major matrix with pseudo-random integers from 1..n.
// Elements are written in storage order, one rand() draw per element.
void init_matrix(int *a, int n, int m) {
    // Nothing to fill unless both dimensions are positive.
    if(n <= 0 || m <= 0) {
        return;
    }
    const int total = n*m;
    for(int idx=0; idx<total; ++idx) {
        a[idx] = 1 + rand()%n;
    }
}

// Writes a[0..n-1] to standard output on a single line; every value is
// preceded by two spaces and the line is terminated afterwards.
void print_array(int *a, int n) {
    int idx = 0;
    while(idx < n) {
        cout << "  " << a[idx];
        ++idx;
    }
    cout << endl;
}

// Writes an n-by-m row-major matrix to standard output, one row per line,
// with two spaces in front of every value.
void print_matrix(int *a, int n, int m) {
    int row = 0;
    while(row < n) {
        const int rowStart = row*m;
        for(int col=0; col<m; ++col) {
            cout << "  " << a[rowStart + col];
        }
        cout << endl;
        ++row;
    }
}

// Reports a failed CUDA runtime call on stderr; returns true when it succeeded.
static bool cudaOk(cudaError_t status, const char *what) {
    if(status == cudaSuccess) {
        return true;
    }
    cerr<<"CUDA error ("<<what<<") : "<<cudaGetErrorString(status)<<endl;
    return false;
}

// Multiplies a random n-vector by a random n-by-m matrix on the GPU, printing
// the inputs, the result and the kernel time measured with CUDA events.
// Returns 0 on success and 1 if any CUDA call failed.
int main() {
    int *a, *b, *c;
    int *a_dev = 0, *b_dev = 0, *c_dev = 0;

    int n = 100;
    int m = 200;

    a = new int[n];
    b = new int[n*m];
    // Value-initialised to zero: the buffer is printed before the kernel
    // fills it, and reading uninitialised ints would be undefined.
    c = new int[m]();

    init_array(a, n);
    init_matrix(b, n, m);

    cout<<"Initial array : "<<endl;
    print_array(a, n);
    cout<<"Initial matrix : "<<endl;
    print_matrix(b, n, m);
    cout<<"Initial resultant array : "<<endl;
    print_array(c, m);
    cout<<endl;

    cudaEvent_t start, end;
    bool haveStart = cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)");
    bool haveEnd = cudaOk(cudaEventCreate(&end), "cudaEventCreate(end)");

    // Short-circuit chain: the first failing call stops everything after it.
    bool ok = haveStart && haveEnd
        && cudaOk(cudaMalloc(&a_dev, sizeof(int)*n), "cudaMalloc(a_dev)")
        && cudaOk(cudaMalloc(&b_dev, sizeof(int)*n*m), "cudaMalloc(b_dev)")
        && cudaOk(cudaMalloc(&c_dev, sizeof(int)*m), "cudaMalloc(c_dev)")
        && cudaOk(cudaMemcpy(a_dev, a, sizeof(int)*n, cudaMemcpyHostToDevice), "copy vector to device")
        && cudaOk(cudaMemcpy(b_dev, b, sizeof(int)*n*m, cudaMemcpyHostToDevice), "copy matrix to device");

    float time = 0.0f;
    if(ok) {
        // One thread per output column, rounded up to whole blocks.
        int threads = 256;
        int blocks = (m+threads-1)/threads;

        bool started = cudaOk(cudaEventRecord(start), "cudaEventRecord(start)");
        matrixVector<<<blocks, threads>>>(a_dev, b_dev, c_dev, n, m);

        // A launch does not return a status; cudaGetLastError reports bad
        // launch configurations and the synchronize reports execution faults.
        ok = started
            && cudaOk(cudaGetLastError(), "matrixVector launch")
            && cudaOk(cudaEventRecord(end), "cudaEventRecord(end)")
            && cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")
            && cudaOk(cudaEventElapsedTime(&time, start, end), "cudaEventElapsedTime")
            && cudaOk(cudaMemcpy(c, c_dev, sizeof(int)*m, cudaMemcpyDeviceToHost), "copy result to host");
    }

    if(ok) {
        cout<<"Results : "<<endl;
        print_array(c, m);

        cout<<"Time elapsed : "<<time<<endl;
    }

    // Release everything on every path; cudaFree accepts a null pointer.
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);
    if(haveStart) {
        cudaEventDestroy(start);
    }
    if(haveEnd) {
        cudaEventDestroy(end);
    }

    delete[] a;
    delete[] b;
    delete[] c;

    return ok ? 0 : 1;
}

Initial array : 
  84  87  78  16  94  36  87  93  50  22  63  28  91  60  64  27  41  27  73  37  12  69  68  30  83  31  63  24  68  36  30  3  23  59  70  68  94  57  12  43  30  74  22  20  85  38  99  25  16  71  14  27  92  81  57  74  63  71  97  82  6  26  85  28  37  6  47  30  14  58  25  96  83  46  15  68  35  65  44  51  88  9  77  79  89  85  4  52  55  100  33  61  77  69  40  13  27  87  95  40
Initial matrix : 
  96  71  35  79  68  2  98  3  18  93  53  57  2  81  87  42  66  90  45  20  41  30  32  18  98  72  82  76  10  28  68  57  98  54  87  66  7  84  20  25  29  72  33  30  4  20  71  69  9  16  41  50  97  24  19  46  47  52  22  56  80  89  65  29  42  51  94  1  35  65  25  15  88  57  44  92  28  66  60  37  33  52  38  29  76  8  75  22  59  96  30  38  36  94  19  29  44  12  29  30  77  5  44  64  14  39  7  41  5  19  29  89  70  18  18  97  25  44  71  84  91  100  73  26  45  91  6  40  55  87  70  83  43  65  98  8  56  5  49  12  23  29  100  44  47

In [None]:
%%cu
#include<iostream>

using namespace std;

// Matrix product c = a * b, where a is indexed as m rows of n entries, b as
// n rows of k entries and c as m rows of k entries (all row-major).
//
// Launch layout: 2-D grid of 2-D blocks; the x dimension selects the output
// column and the y dimension the output row. Threads that fall outside the
// m-by-k output do nothing. No shared memory is used.
__global__
void matrixMultiplication(int *a, int *b, int *c, int m, int n, int k)
{
    const int outRow = blockIdx.y*blockDim.y + threadIdx.y;
    const int outCol = blockIdx.x*blockDim.x + threadIdx.x;

    // Threads beyond the edge of the output have no element to compute.
    if(outCol >= k || outRow >= m) {
        return;
    }

    int acc = 0;
    for(int t = 0; t < n; ++t) {
        const int lhs = a[outRow*n + t];
        const int rhs = b[t*k + outCol];
        acc += lhs * rhs;
    }
    c[outRow*k + outCol] = acc;
}

// Sets every element of an m-by-k row-major matrix to zero.
void init_result(int *a, int m, int k) {
    // Nothing to clear unless both dimensions are positive.
    if(m <= 0 || k <= 0) {
        return;
    }
    const int total = m*k;
    int idx = 0;
    while(idx < total) {
        a[idx] = 0;
        ++idx;
    }
}

// Fills an n-by-m row-major matrix with pseudo-random integers in [1, 100].
//
// The range is deliberately small: the matrices built here are multiplied
// with an int accumulator, and with entries up to 10000 a 200-term dot product
// (up to 10000 * 10000 * 200) exceeds what a 32-bit int can hold. With entries
// of at most 100 each product is at most 10000, so sums of a couple of hundred
// thousand terms still fit.
void init_matrix(int *a, int n, int m) {
    for(int i=0; i<n; i++) {
      for(int j=0; j<m; j++) {
        a[i*m + j] = rand()%100 + 1;
      }
    }
}

// Writes an n-by-m row-major matrix to standard output, one row per line with
// two spaces in front of every value, followed by one blank line.
void print_matrix(int *a, int n, int m) {
    int row = 0;
    while(row < n) {
        const int rowStart = row*m;
        for(int col=0; col<m; ++col) {
            cout << "  " << a[rowStart + col];
        }
        cout << endl;
        ++row;
    }
    cout << endl;
}

// Reports a failed CUDA runtime call on stderr; returns true when it succeeded.
static bool cudaOk(cudaError_t status, const char *what) {
    if(status == cudaSuccess) {
        return true;
    }
    cerr<<"CUDA error ("<<what<<") : "<<cudaGetErrorString(status)<<endl;
    return false;
}

// Multiplies a random m-by-n matrix with a random n-by-k matrix on the GPU,
// printing the inputs, the product and the kernel time measured with CUDA
// events. Returns 0 on success and 1 if any CUDA call failed.
int main()
{

    int *a,*b,*c;
    int *a_dev = 0, *b_dev = 0, *c_dev = 0;
    int m=100, n=200, k=300;

    a = new int[m*n];
    b = new int[n*k];
    c = new int[m*k];

    init_matrix(a, m, n);
    init_matrix(b, n ,k);
    init_result(c, m, k);

    cout<<"Initial matrix : "<<endl;

    print_matrix(a, m, n);
    print_matrix(b, n, k);
    print_matrix(c, m, k);

    cudaEvent_t start, end;
    bool haveStart = cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)");
    bool haveEnd = cudaOk(cudaEventCreate(&end), "cudaEventCreate(end)");

    // Short-circuit chain: the first failing call stops everything after it.
    bool ok = haveStart && haveEnd
        && cudaOk(cudaMalloc(&a_dev, sizeof(int)*m*n), "cudaMalloc(a_dev)")
        && cudaOk(cudaMalloc(&b_dev, sizeof(int)*n*k), "cudaMalloc(b_dev)")
        && cudaOk(cudaMalloc(&c_dev, sizeof(int)*m*k), "cudaMalloc(c_dev)")
        && cudaOk(cudaMemcpy(a_dev, a, sizeof(int)*m*n, cudaMemcpyHostToDevice), "copy a to device")
        && cudaOk(cudaMemcpy(b_dev, b, sizeof(int)*n*k, cudaMemcpyHostToDevice), "copy b to device");

    float time = 0.0f;
    if(ok) {
        // The kernel maps x to output columns (k of them) and y to output
        // rows (m of them), so the grid must tile the whole m-by-k result;
        // ceil-division rounds partial tiles up to a full block.
        dim3 dimBlock(16,16);
        dim3 dimGrid((k + dimBlock.x - 1)/dimBlock.x,
                     (m + dimBlock.y - 1)/dimBlock.y);

        bool started = cudaOk(cudaEventRecord(start), "cudaEventRecord(start)");
        matrixMultiplication<<<dimGrid, dimBlock>>>(a_dev,b_dev,c_dev, m, n, k);

        // A launch does not return a status; cudaGetLastError reports bad
        // launch configurations and the synchronize reports execution faults.
        ok = started
            && cudaOk(cudaGetLastError(), "matrixMultiplication launch")
            && cudaOk(cudaEventRecord(end), "cudaEventRecord(end)")
            && cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")
            && cudaOk(cudaEventElapsedTime(&time, start, end), "cudaEventElapsedTime")
            && cudaOk(cudaMemcpy(c, c_dev, sizeof(int)*m*k, cudaMemcpyDeviceToHost), "copy result to host");
    }

    if(ok) {
        cout<<"Result : "<<endl;
        print_matrix(c, m, k);

        cout<<"Time elapsed : "<<time<<endl;
    }

    // Release everything on every path; cudaFree accepts a null pointer.
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);
    if(haveStart) {
        cudaEventDestroy(start);
    }
    if(haveEnd) {
        cudaEventDestroy(end);
    }

    delete[] a;
    delete[] b;
    delete[] c;

    return ok ? 0 : 1;
}

Initial matrix : 
  9384  887  2778  6916  7794  8336  5387  493  6650  1422  2363  28  8691  60  7764  3927  541  3427  9173  5737  5212  5369  2568  6430  5783  1531  2863  5124  4068  3136  3930  9803  4023  3059  3070  8168  1394  8457  5012  8043  6230  7374  4422  4920  3785  8538  5199  4325  8316  4371  6414  3527  6092  8981  9957  1874  6863  9171  6997  7282  2306  926  7085  6328  337  6506  847  1730  1314  5858  6125  3896  9583  546  8815  3368  5435  365  4044  3751  1088  6809  7277  7179  5789  3585  5404  2652  2755  2400  9933  5061  9677  3369  7740  13  6227  8587  8095  7540  796  571  1435  379  7468  6602  98  2903  3318  493  6653  757  7302  281  4287  9442  3866  9690  8445  6620  8441  4730  8032  8118  8098  5772  4482  676  710  8928  4568  7857  9498  2354  4587  6966  5307  4684  6220  8625  1529  2872  5733  8830  9504  20  8271  3369  9709  6716  6341  8150  7797  724  2619  2246  2847  3452  2922  3556  2380  7489  7765  8229  9842  2351  5194  1501 