**ASSIGNMENT 2**

Design parallel algorithms to:
1. Add two large vectors 
2. Multiply Vector and Matrix 
3. Multiply two N × N arrays using n² processors

**1. Add two large vectors**

In [6]:
%%cu
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <math.h>
#include <chrono>
#include <random>
#include <bits/stdc++.h>
using namespace std;

// Draws one integer from the closed range [minimum, maximum] with a uniform
// distribution. The engine is thread_local and is seeded from the system
// clock when it is first constructed, so sequences differ between runs.
int random_in_range( int minimum, int maximum )
{
  thread_local std::ranlux48 engine(
    std::chrono::system_clock::now().time_since_epoch().count() );
  std::uniform_int_distribution <int> pick( minimum, maximum );
  return pick( engine );
}

// Element-wise sum kernel: the block with x index b stores i[b] + j[b] in k[b].
// Only blockIdx.x selects the element and there is no length argument, so the
// caller must not launch more blocks than the arrays have elements.
__global__ void add(int *i, int *j, int *k) {
    const int slot = blockIdx.x;
    const int lhs = i[slot];
    const int rhs = j[slot];
    k[slot] = lhs + rhs;
}

// Fills arr[0..n-1] with values obtained from random_in_range(100, 400).
void random_init(int *arr, int n) {
    int pos = 0;
    while (pos < n) {
        arr[pos] = random_in_range(100,400);
        ++pos;
    }
}

// Serial reference for the vector sum: k[p] = i[p] + j[p] for p in [0, n).
void add_cpu(int *i, int *j, int *k, int n) {
    int idx = 0;
    while (idx < n) {
        const int total = i[idx] + j[idx];
        k[idx] = total;
        ++idx;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << endl;
    return false;
}

// Prints label followed by the n values on a single space-separated line.
static void print_vector(const char *label, const int *values, int n) {
    cout << label;
    for (int idx = 0; idx < n; idx++) {
        cout << values[idx] << " ";
    }
    cout << endl;
}

// Adds two random vectors of n ints on the GPU and on the CPU, printing the
// inputs, both results and both elapsed times.
// Returns 0 on success and 1 if any CUDA call failed.
int main() {
    const int n = 20000;
    const size_t size = n * sizeof(int);

    // All host buffers live on the heap; the result buffer used to be a
    // variable-length stack array, which is not standard C++.
    int *a = new int[n];
    int *b = new int[n];
    int *c = new int[n];
    int *i = nullptr, *j = nullptr, *k = nullptr;

    random_init(a, n);
    random_init(b, n);

    print_vector("First: ", a, n);
    print_vector("Second: ", b, n);

    float gpu_elapsed_time = 0.0f;
    cudaEvent_t gpu_start = nullptr, gpu_stop = nullptr;

    // && short-circuits, so the first failing call stops the sequence.
    bool ok = cuda_ok(cudaMalloc((void **)&i, size), "cudaMalloc(i)")
           && cuda_ok(cudaMalloc((void **)&j, size), "cudaMalloc(j)")
           && cuda_ok(cudaMalloc((void **)&k, size), "cudaMalloc(k)")
           && cuda_ok(cudaMemcpy(i, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(a -> i)")
           && cuda_ok(cudaMemcpy(j, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(b -> j)")
           && cuda_ok(cudaEventCreate(&gpu_start), "cudaEventCreate(start)")
           && cuda_ok(cudaEventCreate(&gpu_stop), "cudaEventCreate(stop)")
           && cuda_ok(cudaEventRecord(gpu_start, 0), "cudaEventRecord(start)");

    if (ok) {
        // The kernel indexes by blockIdx.x only, so launch exactly n
        // single-thread blocks: one block per element.
        add<<<n,1>>>(i, j, k);
        ok = cuda_ok(cudaGetLastError(), "add kernel launch")
          && cuda_ok(cudaEventRecord(gpu_stop, 0), "cudaEventRecord(stop)")
          && cuda_ok(cudaEventSynchronize(gpu_stop), "cudaEventSynchronize(stop)")
          && cuda_ok(cudaEventElapsedTime(&gpu_elapsed_time, gpu_start, gpu_stop),
                     "cudaEventElapsedTime")
          && cuda_ok(cudaMemcpy(c, k, size, cudaMemcpyDeviceToHost), "cudaMemcpy(k -> c)");
    }

    if (gpu_start != nullptr) {
        cudaEventDestroy(gpu_start);
    }
    if (gpu_stop != nullptr) {
        cudaEventDestroy(gpu_stop);
    }

    if (ok) {
        cout<<endl;
        cout<<"GPU Elapsed time is: "<<gpu_elapsed_time<<" milliseconds";
        cout<<endl;
        print_vector("Parallel Result: ", c, n);
        cout<<endl;
    }

    // Host work is timed with a host clock rather than CUDA events.
    const auto cpu_start = std::chrono::steady_clock::now();
    add_cpu(a, b, c, n);
    const auto cpu_stop = std::chrono::steady_clock::now();
    const float cpu_elapsed_time =
        std::chrono::duration<float, std::milli>(cpu_stop - cpu_start).count();

    cout<<"CPU Elapsed time is: "<<cpu_elapsed_time<<" milliseconds";
    cout<<endl;

    print_vector("Serial Result: ", c, n);

    // cudaFree accepts a null pointer, so this is safe after a failed allocation.
    cudaFree(i);
    cudaFree(j);
    cudaFree(k);

    delete[] a;
    delete[] b;
    delete[] c;

    return ok ? 0 : 1;
}

First: 291 307 385 380 265 168 190 330 215 218 240 213 237 130 218 387 137 348 161 194 236 313 254 357 163 153 179 159 366 286 201 373 258 393 186 195 134 201 251 139 390 115 140 399 292 279 341 370 360 380 366 213 357 133 329 322 393 334 280 361 279 146 357 204 200 196 168 332 210 243 235 236 132 336 160 148 373 275 101 232 202 349 246 302 258 107 121 235 197 325 160 227 278 367 191 390 239 337 210 146 338 238 211 261 330 252 188 129 306 272 298 387 359 218 336 341 226 233 265 305 283 229 268 178 225 373 207 336 390 307 121 171 137 157 400 146 368 255 192 324 291 175 167 360 322 354 140 380 100 228 382 346 348 155 389 306 307 137 374 259 190 167 301 184 348 334 234 358 241 214 208 147 150 388 337 180 286 113 375 180 365 217 177 239 276 182 193 214 108 395 298 247 336 267 150 254 212 213 225 399 253 121 258 256 218 215 269 127 199 180 197 390 327 337 121 228 263 193 343 195 342 346 327 132 367 124 125 350 202 300 231 280 335 238 122 313 110 156 127 209 153 138 333 185 106 317 255 206 3

**3. Multiply two N × N arrays using n² processors**

In [8]:
%%cu
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <chrono>
#include <random>
using namespace std;


// Returns a uniformly distributed integer in the closed range
// [minimum, maximum]. A thread_local engine, seeded from the system clock on
// construction, supplies the randomness.
int random_in_range( int minimum, int maximum )
{
  thread_local std::ranlux48 generator(
    std::chrono::system_clock::now().time_since_epoch().count() );
  std::uniform_int_distribution <int> range( minimum, maximum );
  return range( generator );
}

// Matrix product kernel: c (m x k) = a (m x n) * b (n x k), all row-major.
// Each thread derives one (row, col) output coordinate from its 2D block and
// thread indices (y -> row, x -> col) and computes that single element.
// Threads whose coordinate lies outside the m x k result do nothing.
__global__
void matrixMultiplication(int *a, int *b, int *c, int m, int n, int k)
{
    const int row = blockIdx.y*blockDim.y + threadIdx.y;
    const int col = blockIdx.x*blockDim.x + threadIdx.x;

    if(row >= m || col >= k) {
        return;
    }

    int acc = 0;
    const int a_base = row*n;   // start of this thread's row in a
    for(int step = 0; step < n; ++step)
    {
        acc += a[a_base + step] * b[step*k + col];
    }
    c[row*k + col] = acc;
}

// Serial reference for the matrix product c = a * b with row-major storage.
//   a : m x n input  (element (i, p) at a[i*n + p])
//   b : n x k input  (element (p, j) at b[p*k + j])
//   c : m x k output (element (i, j) at c[i*k + j])
// The output has k columns and each dot product runs over the shared
// dimension n, so non-square shapes are handled as well as m == n == k.
void matrix_multiplication_cpu(int *a, int *b, int *c, int m, int n, int k) {
    for(int row = 0 ; row < m ; row++) {
        for(int col = 0 ; col < k ; col++) {
            // Accumulate in a wider type; the value is narrowed to int on store.
            long long dot = 0;
            for(int p = 0 ; p < n ; p++) {
                dot += static_cast<long long>(a[row*n + p]) * b[p*k + col];
            }
            c[row*k + col] = static_cast<int>(dot);
        }
    }
}

// Zeroes every element of the row-major m x k matrix stored in a.
void init_result(int *a, int m, int k) {
    for(int row = 0; row < m; ++row) {
      const int base = row*k;   // offset of the first element of this row
      for(int col = 0; col < k; ++col) {
        a[base + col] = 0;
      }
    }
}

// Fills the row-major n x m matrix stored in a, element by element in row
// order, with values obtained from random_in_range(10, 30).
void init_matrix(int *a, int n, int m) {
    for(int row = 0; row < n; ++row) {
      const int base = row*m;   // offset of the first element of this row
      for(int col = 0; col < m; ++col) {
        a[base + col] = random_in_range(10,30);
      }
    }
}

// Writes the row-major n x m matrix stored in a to standard output: one line
// per row, each value preceded by two spaces, followed by one blank line.
void print_matrix(int *a, int n, int m) {
    for(int row = 0; row < n; ++row) {
      const int *cells = a + row*m;
      for(int col = 0; col < m; ++col) {
        std::cout << "  " << cells[col];
      }
      std::cout << std::endl;
    }
    std::cout << std::endl;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << endl;
    return false;
}

// Multiplies two random matrices (m x n times n x k) on the GPU and on the
// CPU, printing the inputs, both results and both elapsed times.
// Returns 0 on success and 1 if any CUDA call failed.
int main()
{
    int *a,*b,*c;
    int *a_dev = nullptr, *b_dev = nullptr, *c_dev = nullptr;
    const int m=30, n=30, k=30;

    a = new int[m*n];
    b = new int[n*k];
    c = new int[m*k];

    init_matrix(a, m, n);
    init_matrix(b, n ,k);
    init_result(c, m, k);

    cout<<"First matrix : "<<endl;
    print_matrix(a, m, n);
    cout<<"Second matrix : "<<endl;
    print_matrix(b, n, k);

    // Tile the m x k output with fixed-size blocks and round the grid up.
    // A single n x n block exceeds the per-block thread limit once n > 32
    // and sized the launch by the wrong dimensions; the kernel's own bounds
    // check discards the surplus threads of the rounded-up grid.
    const int tile = 16;
    dim3 dimBlock(tile, tile);
    dim3 dimGrid((k + tile - 1) / tile, (m + tile - 1) / tile);

    float gpu_elapsed_time = 0.0f;
    cudaEvent_t gpu_start = nullptr, gpu_stop = nullptr;

    // && short-circuits, so the first failing call stops the sequence.
    bool ok = cuda_ok(cudaMalloc(&a_dev, sizeof(int)*m*n), "cudaMalloc(a_dev)")
           && cuda_ok(cudaMalloc(&b_dev, sizeof(int)*n*k), "cudaMalloc(b_dev)")
           && cuda_ok(cudaMalloc(&c_dev, sizeof(int)*m*k), "cudaMalloc(c_dev)")
           && cuda_ok(cudaMemcpy(a_dev, a, sizeof(int)*m*n, cudaMemcpyHostToDevice),
                      "cudaMemcpy(a -> a_dev)")
           && cuda_ok(cudaMemcpy(b_dev, b, sizeof(int)*n*k, cudaMemcpyHostToDevice),
                      "cudaMemcpy(b -> b_dev)")
           && cuda_ok(cudaEventCreate(&gpu_start), "cudaEventCreate(start)")
           && cuda_ok(cudaEventCreate(&gpu_stop), "cudaEventCreate(stop)")
           && cuda_ok(cudaEventRecord(gpu_start, 0), "cudaEventRecord(start)");

    if (ok) {
        matrixMultiplication<<<dimGrid, dimBlock>>>(a_dev,b_dev,c_dev, m, n, k);
        ok = cuda_ok(cudaGetLastError(), "matrixMultiplication kernel launch")
          && cuda_ok(cudaEventRecord(gpu_stop, 0), "cudaEventRecord(stop)")
          && cuda_ok(cudaEventSynchronize(gpu_stop), "cudaEventSynchronize(stop)")
          && cuda_ok(cudaEventElapsedTime(&gpu_elapsed_time, gpu_start, gpu_stop),
                     "cudaEventElapsedTime")
          && cuda_ok(cudaMemcpy(c, c_dev, sizeof(int)*m*k, cudaMemcpyDeviceToHost),
                     "cudaMemcpy(c_dev -> c)");
    }

    if (gpu_start != nullptr) {
        cudaEventDestroy(gpu_start);
    }
    if (gpu_stop != nullptr) {
        cudaEventDestroy(gpu_stop);
    }

    if (ok) {
        cout<<"GPU Elapsed time is: "<<gpu_elapsed_time<<" milliseconds"<<endl;

        cout<<"GPU Result : "<<endl;
        print_matrix(c, m, k);
        cout<<endl;
    }

    // Host work is timed with a host clock rather than CUDA events.
    const auto cpu_start = std::chrono::steady_clock::now();
    matrix_multiplication_cpu(a, b, c, m, n, k);
    const auto cpu_stop = std::chrono::steady_clock::now();
    const float cpu_elapsed_time =
        std::chrono::duration<float, std::milli>(cpu_stop - cpu_start).count();
    cout<<"CPU Elapsed time is: "<<cpu_elapsed_time<<" milliseconds"<<endl;

    cout<<"CPU Result : "<<endl;
    print_matrix(c, m, k);

    // cudaFree accepts a null pointer, so this is safe after a failed allocation.
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);

    delete[] a;
    delete[] b;
    delete[] c;

    return ok ? 0 : 1;
}

First matrix : 
  29  21  18  16  10  30  16  11  10  15  21  23  29  29  23  25  24  27  28  25  13  30  23  18  29  25  23  29  13  28
  27  10  12  20  15  18  10  14  16  15  26  12  23  23  14  30  22  26  13  26  13  20  28  27  22  12  29  27  18  18
  24  14  19  29  24  25  29  10  21  29  16  26  18  28  27  18  22  22  13  21  17  21  27  17  26  24  24  15  19  18
  12  28  29  16  12  17  17  18  24  29  26  10  24  23  18  14  17  11  22  22  20  13  25  22  22  29  21  26  12  25
  21  22  30  23  19  13  28  13  21  23  11  12  20  27  28  25  26  27  21  25  22  14  29  20  22  27  10  28  28  11
  23  30  20  14  10  17  25  29  24  18  29  10  22  10  24  29  30  11  11  24  23  19  30  14  24  11  19  17  27  14
  30  15  29  19  30  10  10  27  23  13  29  20  22  11  11  27  24  22  25  22  12  15  13  28  19  22  15  11  16  16
  26  27  29  18  24  11  21  30  13  21  13  29  17  18  30  14  13  12  22  15  14  12  13  11  25  12  20  21  17  25
  18  20  23  18

**2. Multiply Vector and Matrix**

In [9]:
%%cu
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <chrono>
#include <random>
using namespace std;


// Picks an integer uniformly from the closed range [minimum, maximum].
// The random engine is thread_local and takes its seed from the system clock
// when it is constructed.
int random_in_range( int minimum, int maximum )
{
  thread_local std::ranlux48 source(
    std::chrono::system_clock::now().time_since_epoch().count() );
  std::uniform_int_distribution <int> bounded( minimum, maximum );
  return bounded( source );
}

// Row-vector times matrix kernel: result (length m) = vec (length n) * mat,
// where mat is n x m in row-major order.
// Each thread computes one output column, selected by its global x index:
//   result[tid] = sum over i in [0, n) of vec[i] * mat[i*m + tid]
// Threads with tid >= m do nothing, so the launch may contain more threads
// than there are columns.
__global__
void matrixVector(int *vec, int *mat, int *result, int n, int m)
{
    int tid = blockIdx.x*blockDim.x + threadIdx.x;

    // tid indexes the m columns of mat and the m entries of result, so the
    // guard is on m; "tid <= n" allowed an out-of-range column.
    if(tid >= m) {
        return;
    }

    int sum=0;
    for(int i=0; i<n; i++) {
        sum += vec[i]*mat[(i*m) + tid];
    }
    result[tid] = sum;
}

// Serial reference for the row-vector times matrix product.
//   vec    : n input values
//   mat    : n x m input, row-major (element (j, i) at mat[j*m + i])
//   result : m output values, result[i] = sum over j of vec[j] * mat[j*m + i]
// The outer loop covers the m output columns and the inner loop the n rows,
// so n != m is handled without reading past vec or mat.
void maxtrixVector_cpu(int *vec, int *mat, int *result, int n, int m) {
    for(int col = 0 ; col < m ; col++) {
        // Accumulate in a wider type; the value is narrowed to int on store.
        long long sum = 0;
        for(int row = 0 ; row < n ; row++) {
            sum += static_cast<long long>(mat[row*m + col]) * vec[row];
        }
        result[col] = static_cast<int>(sum);
    }
}

// Fills a[0..n-1] with values obtained from random_in_range(10, 40).
void init_array(int *a, int n) {
    int pos = 0;
    while(pos < n) {
      a[pos] = random_in_range(10,40);
      ++pos;
    }
}

// Fills the row-major n x m matrix stored in a, element by element in row
// order, with values obtained from random_in_range(10, 40).
void init_matrix(int *a, int n, int m) {
    for(int row = 0; row < n; ++row) {
        const int base = row*m;   // offset of the first element of this row
        for(int col = 0; col < m; ++col) {
            a[base + col] = random_in_range(10, 40);
        }
    }
}

// Writes the n values of a to standard output on one line, each followed by
// a single space, then ends the line.
void print_array(int *a, int n) {
    int idx = 0;
    while(idx < n) {
        std::cout << a[idx] << " ";
        ++idx;
    }
    std::cout << std::endl;
}

// Writes the row-major n x m matrix stored in a to standard output: one line
// per row, each value preceded by two spaces.
void print_matrix(int *a, int n, int m) {
    for(int row = 0; row < n; ++row) {
        const int *cells = a + row*m;
        for(int col = 0; col < m; ++col) {
            std::cout << "  " << cells[col];
        }
        std::cout << std::endl;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << endl;
    return false;
}

// Multiplies a random vector (length n) by a random n x m matrix on the GPU
// and on the CPU, printing the inputs, both results and both elapsed times.
// Returns 0 on success and 1 if any CUDA call failed.
int main() {
    int *a, *b, *c;
    int *a_dev = nullptr, *b_dev = nullptr, *c_dev = nullptr;

    const int n = 100;
    const int m = 100;

    a = new int[n];
    b = new int[n*m];
    c = new int[m];

    init_array(a, n);
    init_matrix(b, n, m);

    cout<<"Initial vector array : "<<endl;
    print_array(a, n);
    cout<<endl;
    cout<<"Initial matrix : "<<endl;
    print_matrix(b, n, m);
    cout<<endl;

    float gpu_elapsed_time = 0.0f;
    cudaEvent_t gpu_start = nullptr, gpu_stop = nullptr;

    // && short-circuits, so the first failing call stops the sequence.
    bool ok = cuda_ok(cudaMalloc(&a_dev, sizeof(int)*n), "cudaMalloc(a_dev)")
           && cuda_ok(cudaMalloc(&b_dev, sizeof(int)*n*m), "cudaMalloc(b_dev)")
           && cuda_ok(cudaMalloc(&c_dev, sizeof(int)*m), "cudaMalloc(c_dev)")
           && cuda_ok(cudaMemcpy(a_dev, a, sizeof(int)*n, cudaMemcpyHostToDevice),
                      "cudaMemcpy(a -> a_dev)")
           && cuda_ok(cudaMemcpy(b_dev, b, sizeof(int)*n*m, cudaMemcpyHostToDevice),
                      "cudaMemcpy(b -> b_dev)")
           && cuda_ok(cudaEventCreate(&gpu_start), "cudaEventCreate(start)")
           && cuda_ok(cudaEventCreate(&gpu_stop), "cudaEventCreate(stop)")
           && cuda_ok(cudaEventRecord(gpu_start, 0), "cudaEventRecord(start)");

    if (ok) {
        // One single-thread block per output column: exactly m threads in total.
        matrixVector<<<m, 1>>>(a_dev, b_dev, c_dev, n, m);
        ok = cuda_ok(cudaGetLastError(), "matrixVector kernel launch")
          && cuda_ok(cudaEventRecord(gpu_stop, 0), "cudaEventRecord(stop)")
          && cuda_ok(cudaEventSynchronize(gpu_stop), "cudaEventSynchronize(stop)")
          && cuda_ok(cudaEventElapsedTime(&gpu_elapsed_time, gpu_start, gpu_stop),
                     "cudaEventElapsedTime")
          && cuda_ok(cudaMemcpy(c, c_dev, sizeof(int)*m, cudaMemcpyDeviceToHost),
                     "cudaMemcpy(c_dev -> c)");
    }

    if (gpu_start != nullptr) {
        cudaEventDestroy(gpu_start);
    }
    if (gpu_stop != nullptr) {
        cudaEventDestroy(gpu_stop);
    }

    if (ok) {
        cout<<"GPU Elapsed time is: "<<gpu_elapsed_time<<" milliseconds"<<endl;

        cout<<"GPU Resultant vector : ";
        print_array(c, m);
        cout<<endl;
    }

    // Host work is timed with a host clock rather than CUDA events.
    const auto cpu_start = std::chrono::steady_clock::now();
    maxtrixVector_cpu(a, b, c, n, m);
    const auto cpu_stop = std::chrono::steady_clock::now();
    const float cpu_elapsed_time =
        std::chrono::duration<float, std::milli>(cpu_stop - cpu_start).count();
    cout<<"CPU Elapsed time is: "<<cpu_elapsed_time<<" milliseconds"<<endl;

    // The result vector has m entries (c was allocated with m elements).
    cout<<"CPU Resultant vector : ";
    print_array(c, m);

    // cudaFree accepts a null pointer, so this is safe after a failed allocation.
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);

    delete[] a;
    delete[] b;
    delete[] c;

    return ok ? 0 : 1;
}


Initial vector array : 
19 13 23 33 39 36 13 36 29 17 31 16 10 36 37 27 39 22 12 24 24 11 12 14 40 40 27 17 21 34 33 30 14 16 18 34 34 37 32 25 15 27 20 32 39 34 24 31 39 15 27 14 36 27 26 39 13 24 26 18 14 23 36 24 16 18 25 22 36 16 16 40 17 38 35 23 21 17 11 25 32 39 27 30 21 30 18 39 26 14 23 30 15 35 39 15 40 21 36 12 

Initial matrix : 
  11  36  16  20  13  35  11  10  17  24  37  19  17  16  35  17  40  23  16  23  39  31  29  24  21  39  21  30  23  30  31  27  23  13  29  30  18  25  25  39  19  11  27  27  36  10  16  35  26  21  24  24  33  13  11  15  31  39  19  18  34  39  40  36  29  16  17  30  40  31  32  18  26  27  26  15  16  22  25  20  19  37  34  28  26  37  37  27  16  10  12  37  39  31  25  11  38  39  20  25
  36  25  14  40  34  24  18  13  39  22  36  28  33  22  32  31  15  19  34  38  40  29  29  24  14  35  14  12  28  14  38  24  24  39  24  20  35  26  13  31  14  35  13  40  25  18  13  13  29  14  16  34  36  14  40  34  31  36  17  11  16  32  39  3