## Matrix Addition operation on CPU & GPU

    To check performance. Using Cpp, CUDA, OpenAcc

In [7]:
%%writefile 1.cpp

#include <stdio.h>
#include <stdlib.h>
#include <chrono>

void matrix_addition(int *a, int *b, int *c, int n){
  for(int i=0; i<n; i++){
    for(int j=0; j<n; i++){
      c[i*n+j] = a[i*n+j] + b[i*n+j];
    }
  }
}

int main(){
  int sizes[] = {5,10,15};

  for(int s=0; s<3; s++){
    int n = sizes[s];
    size_t size = n*n*sizeof(int);

    int *a = (int*)malloc(size);
    int *b = (int*)malloc(size);
    int *c = (int*) malloc(size);

    for(int i=0; i<n*n; i++){
      a[i]=i+1;
      b[i]=i+1;
    }

    auto start = std::chrono::high_resolution_clock::now();
    matrix_addition(a,b,c,n);
    auto end = std::chrono::high_resolution_clock::now();

    double elapsed_time = std::chrono::duration<double, std::milli>(end - start).count();

    printf("sequential - size: %d, time: %.3f ms\n", n, elapsed_time);

    free(a);
    free(b);
    free(c);
  }

  return 0;
}


Writing 1.cpp


In [8]:
!g++ -o 1 1.cpp


In [9]:
!./1

In [3]:
%%writefile 1.cu

__global__ void matrix_addition(int *a, int *b, int *c, int n){
  int tid = blockIdx.x * blockGrid.x + threadIdx.x;

  if(tid < n){
    for(int i=0; i<n; i++){
      for(int j=0; j<n; i++){
      c[i*n+j] = a[i*n+j] + b[i*n+j];
      }
    }
  }
}

int main(){
  int sizes[] = {5,10,15};
  int n;

  for(int s=0; s<3; s++){
    n = sizes[s];
    size_t size = n*n*sizeof(int);

    int *h_a = (int*)malloc(size);
    int *h_b = (int*)malloc(size);
    int *h_c = (int*) malloc(size);

    for(int i=0; i<n*n; i++){
      h_a[i]=i+1;
      h_b[i]=i+1;
    }

    int *d_a, *d_b, *d_c, d_n;

    cudaMalloc(d_a, n * sizeof(int));
    cudaMalloc(d_b, n * sizeof(int));
    cudaMalloc(d_c, n * sizeof(int));
    cudaMalloc(d_n, sizeof(int));

    cudaMemcpy(d_a, h_a, n*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_a, n*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_n, n, sizeof(int), cudaMemcpyHostToDevice);

    int threadsPerBlock = 1024;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

    matrix_addition<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, d_n);

    cudaMemcpy(h_c, d_c, n*sizeof(int), cudaMemcpyDeviceToHost);

    for(int i=0; i<n; i++){
      printf("%d ", h_c[i]);
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFree(d_n);
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}
}

Overwriting 1.cu


In [4]:
!nvcc -o 1cu 1.cu

[01m[0m[01m1.cu(3)[0m: [01;31merror[0m: identifier "[01mblockGrid[0m" is undefined
    int tid = blockIdx.x * blockGrid.x + threadIdx.x;
                           ^

[01m[0m[01m1.cu(33)[0m: [01;31merror[0m: no instance of overloaded function [01m"cudaMalloc"[0m matches the argument list
            argument types are: (int *, unsigned long)
      cudaMalloc(d_a, n * sizeof(int));
      ^

[01m[0m[01m1.cu(34)[0m: [01;31merror[0m: no instance of overloaded function [01m"cudaMalloc"[0m matches the argument list
            argument types are: (int *, unsigned long)
      cudaMalloc(d_b, n * sizeof(int));
      ^

[01m[0m[01m1.cu(35)[0m: [01;31merror[0m: no instance of overloaded function [01m"cudaMalloc"[0m matches the argument list
            argument types are: (int *, unsigned long)
      cudaMalloc(d_c, n * sizeof(int));
      ^

[01m[0m[01m1.cu(36)[0m: [01;31merror[0m: no instance of overloaded function [01m"cudaMalloc"[0m matches the argumen

In [5]:
%%writefile 2.cu

#include <stdio.h>
#include <cuda.h>

__global__ void matrix_addition(int *a, int *b, int *c, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < n * n) {
        int i = tid / n;  // Row index
        int j = tid % n;  // Column index
        c[i * n + j] = a[i * n + j] + b[i * n + j];
    }
}

int main() {
    int sizes[] = {5, 10, 15};
    int n;

    for (int s = 0; s < 3; s++) {
        n = sizes[s];
        size_t size = n * n * sizeof(int);

        int *h_a = (int *)malloc(size);
        int *h_b = (int *)malloc(size);
        int *h_c = (int *)malloc(size);

        for (int i = 0; i < n * n; i++) {
            h_a[i] = i + 1;
            h_b[i] = i + 1;
        }

        int *d_a, *d_b, *d_c;

        cudaMalloc((void **)&d_a, size);
        cudaMalloc((void **)&d_b, size);
        cudaMalloc((void **)&d_c, size);

        cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

        int threadsPerBlock = 1024;
        int blocksPerGrid = (n * n + threadsPerBlock - 1) / threadsPerBlock;

        matrix_addition<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);

        cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

        printf("Matrix addition result for size %d:\n", n);
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < n; j++) {
                printf("%d ", h_c[i * n + j]);
            }
            printf("\n");
        }
        printf("\n");

        cudaFree(d_a);
        cudaFree(d_b);
        cudaFree(d_c);

        free(h_a);
        free(h_b);
        free(h_c);
    }

    return 0;
}

Writing 2.cu


In [6]:
!nvcc -o 2 2.cu

In [7]:
!./2

Matrix addition result for size 5:
2 4 6 8 10 
12 14 16 18 20 
22 24 26 28 30 
32 34 36 38 40 
42 44 46 48 50 

Matrix addition result for size 10:
2 4 6 8 10 12 14 16 18 20 
22 24 26 28 30 32 34 36 38 40 
42 44 46 48 50 52 54 56 58 60 
62 64 66 68 70 72 74 76 78 80 
82 84 86 88 90 92 94 96 98 100 
102 104 106 108 110 112 114 116 118 120 
122 124 126 128 130 132 134 136 138 140 
142 144 146 148 150 152 154 156 158 160 
162 164 166 168 170 172 174 176 178 180 
182 184 186 188 190 192 194 196 198 200 

Matrix addition result for size 15:
2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 
32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 
62 64 66 68 70 72 74 76 78 80 82 84 86 88 90 
92 94 96 98 100 102 104 106 108 110 112 114 116 118 120 
122 124 126 128 130 132 134 136 138 140 142 144 146 148 150 
152 154 156 158 160 162 164 166 168 170 172 174 176 178 180 
182 184 186 188 190 192 194 196 198 200 202 204 206 208 210 
212 214 216 218 220 222 224 226 228 230 232 234 236 238 240 
242 244 246 248 250 252

In [1]:
%%writefile 3.cpp

#include <stdio.h>
#include <stdlib.h>
#include <chrono>

void matrix_addition(int *a, int *b, int *c, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            c[i * n + j] = a[i * n + j] + b[i * n + j];
        }
    }
}

int main() {
    int sizes[] = {5, 10, 15};

    for (int s = 0; s < 3; s++) {
        int n = sizes[s];
        size_t size = n * n * sizeof(int);

        // Allocate memory
        int *a = (int *)malloc(size);
        int *b = (int *)malloc(size);
        int *c = (int *)malloc(size);

        // Initialize input matrices
        for (int i = 0; i < n * n; i++) {
            a[i] = i + 1;
            b[i] = i + 1;
        }

        // Measure execution time
        auto start = std::chrono::high_resolution_clock::now();
        matrix_addition(a, b, c, n);
        auto end = std::chrono::high_resolution_clock::now();

        // Calculate elapsed time
        double elapsed_time = std::chrono::duration<double, std::milli>(end - start).count();
        printf("Sequential - Size: %d, Time: %.3f ms\n", n, elapsed_time);

        // Free memory
        free(a);
        free(b);
        free(c);
    }

    return 0;
}


Writing 3.cpp


In [6]:
!gpp -o 3 3.cpp

stdio.h:28: error: Requested include file not found


In [7]:
!gpp --version

GPP 2.27
Copyright (C) 1996-2001 Denis Auroux
Copyright (C) 2003-2020 Tristan Miller
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.


In [4]:
!sudo apt install gpp

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  gpp
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 83.8 kB of archives.
After this operation, 195 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 gpp amd64 2.27-1 [83.8 kB]
Fetched 83.8 kB in 0s (928 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package gpp.
(Reading database ... 123630 files and directories currently installed.)

In [5]:
!gpp --version

GPP 2.27
Copyright (C) 1996-2001 Denis Auroux
Copyright (C) 2003-2020 Tristan Miller
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.


In [4]:
%%writefile 3.cu

#include <stdio.h>
#include <cuda.h>

__global__ void matrix_addition(int *a, int *b, int *c, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < n * n) {
        int i = tid / n;  // Row index
        int j = tid % n;  // Column index
        c[i * n + j] = a[i * n + j] + b[i * n + j];
    }
}

int main() {
    int sizes[] = {5, 10, 15};
    int n;

    for (int s = 0; s < 3; s++) {
        n = sizes[s];
        size_t size = n * n * sizeof(int);

        int *h_a = (int *)malloc(size);
        int *h_b = (int *)malloc(size);
        int *h_c = (int *)malloc(size);

        for (int i = 0; i < n * n; i++) {
            h_a[i] = i + 1;
            h_b[i] = i + 1;
        }

        int *d_a, *d_b, *d_c;

        cudaMalloc((void **)&d_a, size);
        cudaMalloc((void **)&d_b, size);
        cudaMalloc((void **)&d_c, size);

        cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

        int threadsPerBlock = 1024;
        int blocksPerGrid = (n * n + threadsPerBlock - 1) / threadsPerBlock;

        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        // Start timing
        cudaEventRecord(start, 0);

        matrix_addition<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);

        // Stop timing
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);

        // Calculate elapsed time
        float milliseconds = 0;
        cudaEventElapsedTime(&milliseconds, start, stop);

        // Copy result back to host
        cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

        printf("Matrix addition result for size %d:\n", n);
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < n; j++) {
                printf("%d ", h_c[i * n + j]);
            }
            printf("\n");
        }
        printf("\n");
        printf("Kernel execution time for size %d: %.3f ms\n\n", n, milliseconds);

        // Clean up CUDA events
        cudaEventDestroy(start);
        cudaEventDestroy(stop);

        // Free device memory
        cudaFree(d_a);
        cudaFree(d_b);
        cudaFree(d_c);

        // Free host memory
        free(h_a);
        free(h_b);
        free(h_c);
    }

    return 0;
}


Overwriting 3.cu


In [5]:
!nvcc -o 3cu 3.cu

In [6]:
!./3cu

Matrix addition result for size 5:
2 4 6 8 10 
12 14 16 18 20 
22 24 26 28 30 
32 34 36 38 40 
42 44 46 48 50 

Kernel execution time for size 5: 0.190 ms

Matrix addition result for size 10:
2 4 6 8 10 12 14 16 18 20 
22 24 26 28 30 32 34 36 38 40 
42 44 46 48 50 52 54 56 58 60 
62 64 66 68 70 72 74 76 78 80 
82 84 86 88 90 92 94 96 98 100 
102 104 106 108 110 112 114 116 118 120 
122 124 126 128 130 132 134 136 138 140 
142 144 146 148 150 152 154 156 158 160 
162 164 166 168 170 172 174 176 178 180 
182 184 186 188 190 192 194 196 198 200 

Kernel execution time for size 10: 0.011 ms

Matrix addition result for size 15:
2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 
32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 
62 64 66 68 70 72 74 76 78 80 82 84 86 88 90 
92 94 96 98 100 102 104 106 108 110 112 114 116 118 120 
122 124 126 128 130 132 134 136 138 140 142 144 146 148 150 
152 154 156 158 160 162 164 166 168 170 172 174 176 178 180 
182 184 186 188 190 192 194 196 198 200 202 204 206 208 

## Matrix Transpose Operation on CPU & GPU

In [None]:
%%writefile 4.cpp

#include <stdio.h>
#include <stdlib.h>
#include <chrono>

void matrix_transpose(){


}

int main(){
  int sizes[] = {5, 10, 15};

  for(int s=0; s<3; ++s){
    int n=sizes[s];
    size_t size = n*n*size(int);

  int *matrix = (int*)malloc(size);
  int *transpose = (int*)malloc(size);

  for(int i=0; i<n*n; i++){
    matrix[i]=i+1;
  }

  matrix_transpose(matrix, transpose, n);

  printf("Transpose - size: %d, time: %.3f ms \n", n, elapsed_time);

  free(matrix);
  free(transpose);

  }

  return 0;


}