<a href="https://colab.research.google.com/github/Ludvins/Practicas_PDGE/blob/master/CUDA/Multiplicacion_matrices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multiplicación en CPU

In [5]:
%%writefile mat_cpu.cu
#include <stdio.h>
const int N = 16;

// Computa la multiplicación de matrices en CPU
void matrixMultCPU(int a[N][N], int b[N][N], int c[N][N]) {
  int n,m;
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      int sum = 0;
      for (int k = 0; k < N; k++) {
        m = a[i][k];
        n = b[k][j];
        sum += m * n;
      }
      c[i][j] = sum;
    }
  }
}

int main() {
  int a[N][N], b[N][N], c[N][N];
  int cont,i,j;

  /* inicializando variables con datos*/
  for (i = 0; i < N; i++) {
    cont = 0;
    for (j = 0; j < N; j++) {
      a[i][j] = cont;
      b[i][j] = cont;
      cont++;
    }
  }

  matrixMultCPU(a, b, c);

  // imprimiendo
  for (int y = 0; y < N; y++) {
    for (int x = 0; x < N; x++) {
      printf("[%d][%d]=%d ", y, x, c[y][x]);
    }
    printf("\n");
  }
  return 0;
}

Overwriting mat_cpu.cu


In [8]:
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat_cpu.cu -o mat_cpu -lcudadevrt
!./mat_cpu

[0][0]=0 [0][1]=120 [0][2]=240 [0][3]=360 [0][4]=480 [0][5]=600 [0][6]=720 [0][7]=840 [0][8]=960 [0][9]=1080 [0][10]=1200 [0][11]=1320 [0][12]=1440 [0][13]=1560 [0][14]=1680 [0][15]=1800 
[1][0]=0 [1][1]=120 [1][2]=240 [1][3]=360 [1][4]=480 [1][5]=600 [1][6]=720 [1][7]=840 [1][8]=960 [1][9]=1080 [1][10]=1200 [1][11]=1320 [1][12]=1440 [1][13]=1560 [1][14]=1680 [1][15]=1800 
[2][0]=0 [2][1]=120 [2][2]=240 [2][3]=360 [2][4]=480 [2][5]=600 [2][6]=720 [2][7]=840 [2][8]=960 [2][9]=1080 [2][10]=1200 [2][11]=1320 [2][12]=1440 [2][13]=1560 [2][14]=1680 [2][15]=1800 
[3][0]=0 [3][1]=120 [3][2]=240 [3][3]=360 [3][4]=480 [3][5]=600 [3][6]=720 [3][7]=840 [3][8]=960 [3][9]=1080 [3][10]=1200 [3][11]=1320 [3][12]=1440 [3][13]=1560 [3][14]=1680 [3][15]=1800 
[4][0]=0 [4][1]=120 [4][2]=240 [4][3]=360 [4][4]=480 [4][5]=600 [4][6]=720 [4][7]=840 [4][8]=960 [4][9]=1080 [4][10]=1200 [4][11]=1320 [4][12]=1440 [4][13]=1560 [4][14]=1680 [4][15]=1800 
[5][0]=0 [5][1]=120 [5][2]=240 [5][3]=360 [5][4]=480 [5][5]=

# Multiplicación en GPU

## Memoria no compartida

In [9]:
%%writefile mat_gpu.cu
#include <stdio.h>
const int N = 16;

// Computa la multiplicación de matrices en GPU sin memoria compartida
__global__ void matrixMultGPU(int *a, int *b, int *c) {
  int k, sum = 0;
  int col = threadIdx.x + blockDim.x * blockIdx.x;
  int fil = threadIdx.y + blockDim.y * blockIdx.y;
  if (col < N && fil < N) {
    for (k = 0; k < N; k++) {
      sum += a[fil * N + k] * b[k * N + col];
    }
    c[fil * N + col] = sum;
  }
}

int main() {
  int a[N][N], b[N][N], c[N][N];
  int *dev_a, *dev_b, *dev_c;
  int cont,i,j;

  /* inicializando variables con datos*/
  for (i = 0; i < N; i++) {
    cont = 0;
    for (j = 0; j < N; j++) {
      a[i][j] = cont;
      b[i][j] = cont;
      cont++;
    }
  }

  int size = N * N * sizeof(int);
  cudaMalloc((void **) &dev_a, size);
  cudaMalloc((void **) &dev_b, size);
  cudaMalloc((void **) &dev_c, size);
  cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
  cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);

  // Tamaño de grid y bloque
  dim3 dimGrid(1, 1);
  dim3 dimBlock(N, N);

  matrixMultGPU<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);
  cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);


  cudaFree(dev_a);
  cudaFree(dev_b);
  cudaFree(dev_c);

  // imprimiendo
  for (int y = 0; y < N; y++) {
    for (int x = 0; x < N; x++) {
      printf("[%d][%d]=%d ", y, x, c[y][x]);
    }
    printf("\n");
  }
  return 0;
}

Writing mat_gpu.cu


In [10]:
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat_gpu.cu -o mat_gpu -lcudadevrt
!./mat_gpu

[0][0]=0 [0][1]=120 [0][2]=240 [0][3]=360 [0][4]=480 [0][5]=600 [0][6]=720 [0][7]=840 [0][8]=960 [0][9]=1080 [0][10]=1200 [0][11]=1320 [0][12]=1440 [0][13]=1560 [0][14]=1680 [0][15]=1800 
[1][0]=0 [1][1]=120 [1][2]=240 [1][3]=360 [1][4]=480 [1][5]=600 [1][6]=720 [1][7]=840 [1][8]=960 [1][9]=1080 [1][10]=1200 [1][11]=1320 [1][12]=1440 [1][13]=1560 [1][14]=1680 [1][15]=1800 
[2][0]=0 [2][1]=120 [2][2]=240 [2][3]=360 [2][4]=480 [2][5]=600 [2][6]=720 [2][7]=840 [2][8]=960 [2][9]=1080 [2][10]=1200 [2][11]=1320 [2][12]=1440 [2][13]=1560 [2][14]=1680 [2][15]=1800 
[3][0]=0 [3][1]=120 [3][2]=240 [3][3]=360 [3][4]=480 [3][5]=600 [3][6]=720 [3][7]=840 [3][8]=960 [3][9]=1080 [3][10]=1200 [3][11]=1320 [3][12]=1440 [3][13]=1560 [3][14]=1680 [3][15]=1800 
[4][0]=0 [4][1]=120 [4][2]=240 [4][3]=360 [4][4]=480 [4][5]=600 [4][6]=720 [4][7]=840 [4][8]=960 [4][9]=1080 [4][10]=1200 [4][11]=1320 [4][12]=1440 [4][13]=1560 [4][14]=1680 [4][15]=1800 
[5][0]=0 [5][1]=120 [5][2]=240 [5][3]=360 [5][4]=480 [5][5]=

## Memoria compartida

In [22]:
%%writefile mat_gpu2.cu
#include <stdio.h>
// Tamaño de matrices
const int N = 16;
// Tamaño de mosaico
const int M = 8;

// Computa la multiplicación de matrices en GPU con memoria compartida y B traspuesta
__global__ void matrixMultGPU2(int* A, int* B, int* C, int N) {
    
    int sum = 0;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int i = blockIdx.x * blockDim.x + tx;
    int j = blockIdx.y * blockDim.y + ty;

    __shared__ int As[M][M];
    __shared__ int Bs[M][M];

    // Recorre los mosaicos de A y B necesarios para computar la submatriz de C
    for (int tile = 0; tile < (N/M); tile++){
        // Carga los mosaicos (M x M) de A y B en paralelo (y de forma traspuesta)
        As[ty][tx] = A[(i * N) + (ty + (tile*M))];
        Bs[ty][tx] = B[((tx + (tile * M))*N) + j];

        __syncthreads();

        // Computa los resultados para la submatriz de C
        for (int k = 0; k < M; k++)
          sum += As[k][tx] * Bs[ty][k];

        __syncthreads();
    }
    // Escribe en paralelo los resultados obtenidos por el bloque
    C[i * N + j] = sum;
}

int main() {
  int a[N][N], b[N][N], c[N][N];
  int *dev_a, *dev_b, *dev_c;
  int cont,i,j;

  /* inicializando variables con datos*/
  for (i = 0; i < N; i++) {
    cont = 0;
    for (j = 0; j < N; j++) {
      a[i][j] = cont;
      b[i][j] = cont;
      cont++;
    }
  }

  int size = N * N * sizeof(int);
  cudaMalloc((void **) &dev_a, size);
  cudaMalloc((void **) &dev_b, size);
  cudaMalloc((void **) &dev_c, size);
  cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
  cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);

  // Tamaño de grid y bloque
  dim3 dimGrid(N/M, N/M);
  dim3 dimBlock(M, M);

  matrixMultGPU2<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c, N);

  cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);


  cudaFree(dev_a);
  cudaFree(dev_b);
  cudaFree(dev_c);

  // imprimiendo
  for (int y = 0; y < N; y++) {
    for (int x = 0; x < N; x++) {
      printf("[%d][%d]=%d ", y, x, c[y][x]);
    }
    printf("\n");
  }
  return 0;
}

Overwriting mat_gpu2.cu


In [23]:
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat_gpu2.cu -o mat_gpu2 -lcudadevrt
!./mat_gpu2

[0][0]=0 [0][1]=120 [0][2]=240 [0][3]=360 [0][4]=480 [0][5]=600 [0][6]=720 [0][7]=840 [0][8]=960 [0][9]=1080 [0][10]=1200 [0][11]=1320 [0][12]=1440 [0][13]=1560 [0][14]=1680 [0][15]=1800 
[1][0]=0 [1][1]=120 [1][2]=240 [1][3]=360 [1][4]=480 [1][5]=600 [1][6]=720 [1][7]=840 [1][8]=960 [1][9]=1080 [1][10]=1200 [1][11]=1320 [1][12]=1440 [1][13]=1560 [1][14]=1680 [1][15]=1800 
[2][0]=0 [2][1]=120 [2][2]=240 [2][3]=360 [2][4]=480 [2][5]=600 [2][6]=720 [2][7]=840 [2][8]=960 [2][9]=1080 [2][10]=1200 [2][11]=1320 [2][12]=1440 [2][13]=1560 [2][14]=1680 [2][15]=1800 
[3][0]=0 [3][1]=120 [3][2]=240 [3][3]=360 [3][4]=480 [3][5]=600 [3][6]=720 [3][7]=840 [3][8]=960 [3][9]=1080 [3][10]=1200 [3][11]=1320 [3][12]=1440 [3][13]=1560 [3][14]=1680 [3][15]=1800 
[4][0]=0 [4][1]=120 [4][2]=240 [4][3]=360 [4][4]=480 [4][5]=600 [4][6]=720 [4][7]=840 [4][8]=960 [4][9]=1080 [4][10]=1200 [4][11]=1320 [4][12]=1440 [4][13]=1560 [4][14]=1680 [4][15]=1800 
[5][0]=0 [5][1]=120 [5][2]=240 [5][3]=360 [5][4]=480 [5][5]=

# Medición de tiempos

## Tiempos en CPU

In [91]:
%%writefile mat_cpu.cu
#include <stdio.h>
#include <time.h>

const int N = 16;

// Computa la multiplicación de matrices en CPU
void matrixMultCPU(int a[N][N], int b[N][N], int c[N][N]) {
  int n,m;
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      int sum = 0;
      for (int k = 0; k < N; k++) {
        m = a[i][k];
        n = b[k][j];
        sum += m * n;
      }
      c[i][j] = sum;
    }
  }
}

int main() {
  int a[N][N], b[N][N], c[N][N];
  int cont,i,j;

  /* inicializando variables con datos*/
  for (i = 0; i < N; i++) {
    cont = 0;
    for (j = 0; j < N; j++) {
      a[i][j] = cont;
      b[i][j] = cont;
      cont++;
    }
  }

  clock_t start, end;
  double cpu_time_used;
  int nIter = 1000;

  start = clock();
  for (int i = 0; i < nIter; i++)
      matrixMultCPU(a, b, c);
  
  end = clock();
  cpu_time_used = (double) (end - start) / CLOCKS_PER_SEC;
  printf(cpu_time_used);

  return 0;
}

Overwriting mat_cpu.cu


In [92]:
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat_cpu.cu -o mat_cpu -lcudadevrt
!./mat_cpu

mat_cpu.cu(46): error: argument of type "double" is incompatible with parameter of type "const char *"

1 error detected in the compilation of "/tmp/tmpxft_0000087b_00000000-8_mat_cpu.cpp1.ii".
[0][0]=0 [0][1]=120 [0][2]=240 [0][3]=360 [0][4]=480 [0][5]=600 [0][6]=720 [0][7]=840 [0][8]=960 [0][9]=1080 [0][10]=1200 [0][11]=1320 [0][12]=1440 [0][13]=1560 [0][14]=1680 [0][15]=1800 
[1][0]=0 [1][1]=120 [1][2]=240 [1][3]=360 [1][4]=480 [1][5]=600 [1][6]=720 [1][7]=840 [1][8]=960 [1][9]=1080 [1][10]=1200 [1][11]=1320 [1][12]=1440 [1][13]=1560 [1][14]=1680 [1][15]=1800 
[2][0]=0 [2][1]=120 [2][2]=240 [2][3]=360 [2][4]=480 [2][5]=600 [2][6]=720 [2][7]=840 [2][8]=960 [2][9]=1080 [2][10]=1200 [2][11]=1320 [2][12]=1440 [2][13]=1560 [2][14]=1680 [2][15]=1800 
[3][0]=0 [3][1]=120 [3][2]=240 [3][3]=360 [3][4]=480 [3][5]=600 [3][6]=720 [3][7]=840 [3][8]=960 [3][9]=1080 [3][10]=1200 [3][11]=1320 [3][12]=1440 [3][13]=1560 [3][14]=1680 [3][15]=1800 
[4][0]=0 [4][1]=120 [4][2]=240 [4][3]=360 [4][4]=480 [

## Tiempos en GPU

In [67]:
%%writefile mat.cu
#include <stdio.h>
// Tamaño de matrices
const int N = 16;
// Tamaño de mosaico
const int M = 8;

// Computa la multiplicación de matrices en GPU sin memoria compartida
__global__ void matrixMultGPU(int *a, int *b, int *c) {
  int k, sum = 0;
  int col = threadIdx.x + blockDim.x * blockIdx.x;
  int fil = threadIdx.y + blockDim.y * blockIdx.y;
  if (col < N && fil < N) {
    for (k = 0; k < N; k++) {
      sum += a[fil * N + k] * b[k * N + col];
    }
    c[fil * N + col] = sum;
  }
}

// Computa la multiplicación de matrices en GPU con memoria compartida y B traspuesta
__global__ void matrixMultGPU2(int* A, int* B, int* C) {
    
    int sum = 0;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int i = blockIdx.x * blockDim.x + tx;
    int j = blockIdx.y * blockDim.y + ty;

    __shared__ int As[M][M];
    __shared__ int Bs[M][M];

    // Recorre los mosaicos de A y B necesarios para computar la submatriz de C
    for (int tile = 0; tile < (N/M); tile++){
        // Carga los mosaicos (M x M) de A y B en paralelo (y de forma traspuesta)
        As[ty][tx] = A[(i * N) + (ty + (tile*M))];
        Bs[ty][tx] = B[((tx + (tile * M))*N) + j];

        __syncthreads();

        // Computa los resultados para la submatriz de C
        for (int k = 0; k < M; k++)
          sum += As[k][tx] * Bs[ty][k];

        __syncthreads();
    }
    // Escribe en paralelo los resultados obtenidos por el bloque
    C[i * N + j] = sum;
}

int main() {

  int a[N][N], b[N][N];
  int *dev_a, *dev_b, *dev_c;
  int cont,i,j;

  /* inicializando variables con datos*/
  for (i = 0; i < N; i++) {
    cont = 0;
    for (j = 0; j < N; j++) {
      a[i][j] = cont;
      b[i][j] = cont;
      cont++;
    }
  }

  int size = N * N * sizeof(int);
  cudaMalloc((void **) &dev_a, size);
  cudaMalloc((void **) &dev_b, size);
  cudaMalloc((void **) &dev_c, size);
  cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
  cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);

  // Tamaño de grid y bloque
  //dim3 dimGrid(1, 1);
  //dim3 dimBlock(N, N);

  // Tamaño de grid y bloque
  dim3 dimGrid(N/M, N/M);
  dim3 dimBlock(M, M);

  cudaEvent_t start;
  cudaEventCreate(&start);

  cudaEvent_t stop;
  cudaEventCreate(&stop);

  cudaEventRecord(start, NULL);

  int nIter = 1000;
  for (int i = 0; i < nIter; i++){
      //matrixMultGPU<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);
      matrixMultGPU2<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);
  }

  cudaEventRecord(stop, NULL);

  cudaEventSynchronize(stop);

  float msecTotal = 0.0f;
  cudaEventElapsedTime(&msecTotal, start, stop);

  float msecPerKernelExecution = msecTotal / nIter;
  double flopsPerMMull = 2.0 * N * N * N;
  double gigaFlops = (flopsPerMMull * 1.0e-9f) / (msecPerKernelExecution / 1000.0f);

  printf("GFLOPS: %f", gigaFlops);

  cudaFree(dev_a);
  cudaFree(dev_b);
  cudaFree(dev_c);

  return 0;
}

Overwriting mat.cu


In [68]:
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat.cu -o mat -lcudadevrt

In [69]:
!./mat

GFLOPS: 2.734488

Sin memoria compartida:
- 16 - 2.620347
- 32 - 7.096351
- 64 - 3644.938416
- 128 - 29172.491339
- 512 - 6673514.212443

Con memoria compartida:
- 16 - 2.734488
- 32 - 11.243418
- 64 - 95.765257
- 128 - 418.859430
- 512 - 1030.160123