## Matrix Multiplication On CPU & GPU

In [8]:
%%writefile 1.cpp

#include <stdio.h>
#include <stdlib.h>

#define N 2

void matrix_multiplication(int a[N][N], int b[N][N], int c[N][N]){
  for(int i=0; i<N; i++){
    for(int j=0; j<N; j++){
      c[i][j]=0;
      for(int k=0; k<N; k++){
        c[i][j] += a[i][k] * b[k][j];
      }
    }
  }
}

int main(){
  int a[N][N] = {{1,7}, {2,4}};
  int b[N][N] = {{3,3}, {5,2}};
  int c[N][N];

  matrix_multiplication(a,b,c);

  printf("Resultant Matrix: \n");
  for(int i=0; i<N; i++){
    for(int j=0; j<N; j++){
      printf("%d ", c[i][j]);
    }
    printf("\n");
  }

  return 0;
}

Overwriting 1.cpp


In [9]:
!g++ -o 1 1.cpp

In [10]:
!./1

Resultant Matrix: 
38 17 
26 14 


In [5]:
//Own written code
%%writefile 2.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <chrono>
#include <time.h>

#define N 2

__global__ void matrix_multiply_gpu(int a[N][N], int *b[N][N], int *c[N][N]){
  int id = threadIdx.x + blockIdx.x * blockDim.x;

  if(id < N * N){
    for(int i=0; i<N; i++){
      for(int j=0; j<N; j++){
        c[i][j]=0;
        for(int k=0; k<N; k++){
          c[i][j] += a[i][k] * b[k][j];
      }
    }
  }
  }
}

void matrix_multiplication(int a[N][N], int b[N][N], int c[N][N]){
  for(int i=0; i<N; i++){
    for(int j=0; j<N; j++){
      c[i][j]=0;
      for(int k=0; k<N; k++){
        c[i][j] += a[i][k] * b[k][j];
      }
    }
  }
}

int main(){
  int a[N][N] = {{1,7}, {2,4}};
  int b[N][N] = {{3,3}, {5,2}};
  int c[N][N];

  matrix_multiplication(a,b,c);

  printf("Resultant Matrix: \n");
  for(int i=0; i<N; i++){
    for(int j=0; j<N; j++){
      printf("%d ", c[i][j]);
    }
    printf("\n");
  }

  int *d_a, *d_b, *d_c;
  int h_c[N][N];

  cudaMalloc((void**)&d_a, N*N *sizeof(int));
  cudaMalloc((void**)&d_b, N*N * sizeof(int));
  cudaMalloc((void**)&d_c, N*N * sizeof(int));

  cudaMemcpy(d_a, a, N*N * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, N*N * sizeof(int), cudaMemcpyHostToDevice);

  matrix_multiply_gpu<<<1, 4>>>(d_a, d_b, d_c);

  cudaMemcpy(c, d_c, N*N * sizeof(int), cudaMemcpyDeviceToHost);

  printf("Resulatant Matrix from GPU: \n");
  for(int i; i<N, i++){
    for(int j=0; j<N; j++){
      printf("%d ", h_c[i][j]);
    }
    printf("\n");
  }

  cudaFree(d_a);
  cudafree(d_b);
  cudafree(d_c);

  return 0;
}

Overwriting 2.cu


In [6]:
!nvcc -o 2 2.cu

[01m[0m[01m2.cu(18)[0m: [01;31merror[0m: expression must have arithmetic or unscoped enum type
            c[i][j] += a[i][k] * b[k][j];
                                 ^

[01m[0m[01m2.cu(61)[0m: [01;31merror[0m: argument of type "int *" is incompatible with parameter of type "int (*)[2]"
    matrix_multiply_gpu<<<1, 4>>>(d_a, d_b, d_c);
                                  ^

[01m[0m[01m2.cu(61)[0m: [01;31merror[0m: argument of type "int *" is incompatible with parameter of type "int *(*)[2]"
    matrix_multiply_gpu<<<1, 4>>>(d_a, d_b, d_c);
                                       ^

[01m[0m[01m2.cu(61)[0m: [01;31merror[0m: argument of type "int *" is incompatible with parameter of type "int *(*)[2]"
    matrix_multiply_gpu<<<1, 4>>>(d_a, d_b, d_c);
                                            ^

    for(int i; i<2, i++){
               ^


    for(int i; i<2, i++){
               ^

[01m[0m[01m2.cu(66)[0m: [01;31merror[0m: expected a ";"
    for(int i; i<2, 

In [25]:
//corrected using AI bot
%%writefile 3.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <chrono>
#include <time.h>

#define N 2

__global__ void matrix_multiply_gpu(int *a, int *b, int *c) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < N && col < N) {
        int value = 0;
        for (int k = 0; k < N; k++) {
            value += a[row * N + k] * b[k * N + col];
        }
        c[row * N + col] = value;
    }
}

void matrix_multiplication(int a[N][N], int b[N][N], int c[N][N]) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            c[i][j] = 0;
            for (int k = 0; k < N; k++) {
                c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
}

int main() {
    int a[N][N] = {{1, 7}, {2, 4}};
    int b[N][N] = {{3, 3}, {5, 2}};
    int c[N][N] = {0};

    auto start = std::chrono::high_resolution_clock::now();
    matrix_multiplication(a, b, c);
    auto end = std::chrono::high_resolution_clock::now();

    double elapsed_time_cpu = std::chrono::duration<double, std::milli>(end - start).count();

    printf("Resultant Matrix (CPU): \n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d ", c[i][j]);
        }
        printf("\n");
    }

    printf("CPU Execution Time: %.3f ms\n", elapsed_time_cpu);

    int *d_a, *d_b, *d_c;
    int h_c[N][N] = {0};

    cudaMalloc((void**)&d_a, N * N * sizeof(int));
    cudaMalloc((void**)&d_b, N * N * sizeof(int));
    cudaMalloc((void**)&d_c, N * N * sizeof(int));

    cudaMemcpy(d_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(2, 2);
    dim3 blocksPerGrid(1, 1);

    cudaEvent_t start_event, stop_event;
    cudaEventCreate(&start_event);
    cudaEventCreate(&stop_event);

    cudaEventRecord(start_event, 0);
    matrix_multiply_gpu<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c);
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);

    float elapsed_time_gpu = 0;
    cudaEventElapsedTime(&elapsed_time_gpu, start_event, stop_event);

    cudaMemcpy(h_c, d_c, N * N * sizeof(int), cudaMemcpyDeviceToHost);

    printf("\nResultant Matrix (GPU): \n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d ", h_c[i][j]);
        }
        printf("\n");
    }

     printf("GPU Execution Time: %.3f ms\n", elapsed_time_gpu);


    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}


Overwriting 3.cu


In [26]:
!nvcc -o 3 3.cu

In [27]:
!./3

Resultant Matrix (CPU): 
38 17 
26 14 
CPU Execution Time: 0.000 ms

Resultant Matrix (GPU): 
38 17 
26 14 
GPU Execution Time: 0.165 ms


In [1]:
%%writefile 4.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <chrono>
#include <time.h>

#define N 2

__global__ void matrix_multiply_gpu(int *a, int *b, int *c) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < N && col < N) {
        int value = 0;
        for (int k = 0; k < N; k++) {
            value += a[row * N + k] * b[k * N + col];
        }
        c[row * N + col] = value;
    }
}

void matrix_multiplication(int a[N][N], int b[N][N], int c[N][N]) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            c[i][j] = 0;
            for (int k = 0; k < N; k++) {
                c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
}

int main() {
    int a[N][N] = {0};
    int b[N][N] = {0};
    int c[N][N] = {0};

    for(int i=0; i<N; i++){
      for(int j=0; i<N; j++){
        a[i][j];
        b[i][j];
      }
    }

    auto start = std::chrono::high_resolution_clock::now();
    matrix_multiplication(a, b, c);
    auto end = std::chrono::high_resolution_clock::now();

    double elapsed_time_cpu = std::chrono::duration<double, std::milli>(end - start).count();

    printf("Resultant Matrix (CPU): \n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d ", c[i][j]);
        }
        printf("\n");
    }

    printf("CPU Execution Time: %.3f ms\n", elapsed_time_cpu);

    int *d_a, *d_b, *d_c;
    int h_c[N][N] = {0};

    cudaMalloc((void**)&d_a, N * N * sizeof(int));
    cudaMalloc((void**)&d_b, N * N * sizeof(int));
    cudaMalloc((void**)&d_c, N * N * sizeof(int));

    cudaMemcpy(d_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(2, 2);
    dim3 blocksPerGrid(1, 1);

    cudaEvent_t start_event, stop_event;
    cudaEventCreate(&start_event);
    cudaEventCreate(&stop_event);

    cudaEventRecord(start_event, 0);
    matrix_multiply_gpu<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c);
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);

    float elapsed_time_gpu = 0;
    cudaEventElapsedTime(&elapsed_time_gpu, start_event, stop_event);

    cudaMemcpy(h_c, d_c, N * N * sizeof(int), cudaMemcpyDeviceToHost);

    printf("\nResultant Matrix (GPU): \n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d ", h_c[i][j]);
        }
        printf("\n");
    }

     printf("GPU Execution Time: %.3f ms\n", elapsed_time_gpu);


    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;

}


Writing 4.cu


In [2]:
!nvcc -o 4 4.cu

          a[i][j];
          ^


          b[i][j];
          ^

          a[i][j];
          ^


          b[i][j];
          ^



In [10]:
//Tried To increase matrix size

%%writefile 5.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <chrono>

#define N 10

// Kernel for matrix multiplication on the GPU
__global__ void matrix_multiply_gpu(int *a, int *b, int *c) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < N && col < N) {
        int value = 0;
        for (int k = 0; k < N; k++) {
            value += a[row * N + k] * b[k * N + col];
        }
        c[row * N + col] = value;
    }
}

// Function for matrix multiplication on the CPU
void matrix_multiplication(int a[N][N], int b[N][N], int c[N][N]) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            c[i][j] = 0;
            for (int k = 0; k < N; k++) {
                c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
}

int main() {
    int a[N][N] = {0};
    int b[N][N] = {0};
    int c[N][N] = {0};

    printf("Initializing matrices...\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a[i][j] = rand() % 10;
            b[i][j] = rand() % 10;
        }
    }

    // Perform matrix multiplication on CPU
    auto start = std::chrono::high_resolution_clock::now();
    matrix_multiplication(a, b, c);
    auto end = std::chrono::high_resolution_clock::now();

    double elapsed_time_cpu = std::chrono::duration<double, std::milli>(end - start).count();

    printf("Resultant Matrix (CPU):\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d ", c[i][j]);
        }
        printf("\n");
    }

    printf("CPU Execution Time: %.3f ms\n", elapsed_time_cpu);

    int *d_a, *d_b, *d_c;
    int h_c[N][N] = {0};

    cudaMalloc((void **)&d_a, N * N * sizeof(int));
    cudaMalloc((void **)&d_b, N * N * sizeof(int));
    cudaMalloc((void **)&d_c, N * N * sizeof(int));

    cudaMemcpy(d_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // GPU timing events
    cudaEvent_t start_event, stop_event;
    cudaEventCreate(&start_event);
    cudaEventCreate(&stop_event);

    cudaEventRecord(start_event, 0);
    matrix_multiply_gpu<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c);
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);

    float elapsed_time_gpu = 0;
    cudaEventElapsedTime(&elapsed_time_gpu, start_event, stop_event);

    // Copy result back to host
    cudaMemcpy(h_c, d_c, N * N * sizeof(int), cudaMemcpyDeviceToHost);

    printf("\nResultant Matrix (GPU):\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d ", h_c[i][j]);
        }
        printf("\n");
    }

    printf("GPU Execution Time: %.3f ms\n", elapsed_time_gpu);

    // Free GPU memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}


Overwriting 5.cu


In [11]:
!nvcc -o 5 5.cu

In [12]:
!./5

Initializing matrices...
Resultant Matrix (CPU):
190 278 145 132 190 136 200 169 161 167 
186 355 156 157 207 209 185 164 210 246 
191 335 233 179 196 257 220 227 174 232 
191 319 172 156 167 218 182 186 165 186 
276 433 239 205 229 305 251 252 193 257 
233 378 222 181 218 240 231 216 180 226 
232 430 221 155 255 274 187 203 193 328 
248 319 178 137 201 217 233 171 165 236 
267 379 184 141 231 276 259 247 218 301 
252 477 239 204 282 302 239 261 245 334 
CPU Execution Time: 0.007 ms

Resultant Matrix (GPU):
190 278 145 132 190 136 200 169 161 167 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 
GPU Execution Time: 38.644 ms
