<a href="https://colab.research.google.com/github/Ludvins/Practicas_PDGE/blob/master/CUDA/Multiplicacion_matrices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multiplicación en CPU

In [46]:
%%writefile mat_cpu.cu
#include <stdio.h>
const int N = 16;

// Computa la multiplicación de matrices en CPU
void matrixMultCPU(int a[N][N], int b[N][N], int c[N][N]) {
  int n,m;
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      int sum = 0;
      for (int k = 0; k < N; k++) {
        m = a[i][k];
        n = b[k][j];
        sum += m * n;
      }
      c[i][j] = sum;
    }
  }
}

int main() {
  int a[N][N], b[N][N], c[N][N];

  /* inicializando variables con datos*/
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      a[i][j] = j;
      b[i][j] = j;
    }
  }


  matrixMultCPU(a, b, c);

  // Comprueba resultados
  int errores = 0;
  // imprimiendo
  for (int y = 0; y < N; y++) {
    for (int x = 0; x < N; x++) {
        if (c[y][x] != 120*x)
          errores++;
    }
  }
  if (errores == 0)
    printf("Resultado correcto");
  else
    printf("Errores: %d", errores);
  return 0;
}

Overwriting mat_cpu.cu


In [47]:
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat_cpu.cu -o mat_cpu -lcudadevrt
!./mat_cpu

Resultado correcto

# Multiplicación en GPU

## Memoria no compartida

In [50]:
%%writefile mat_gpu.cu
#include <stdio.h>
const int N = 16;

// Computa la multiplicación de matrices en GPU sin memoria compartida
__global__ void matrixMultGPU(int *a, int *b, int *c) {
  int k, sum = 0;
  int fil = threadIdx.x + blockDim.x * blockIdx.x;
  int col = threadIdx.y + blockDim.y * blockIdx.y;
  if (col < N && fil < N) {
    for (k = 0; k < N; k++) {
      sum += a[fil * N + k] * b[k * N + col];
    }
    c[fil * N + col] = sum;
  }
}

int main() {
  int *a, *b, *c;

  int size = N * N * sizeof(int);

  cudaMallocManaged(&a, size);
  cudaMallocManaged(&b, size);
  cudaMallocManaged(&c, size);

  /* inicializando variables con datos*/
  for (int i = 0; i < N; i++) {
    int cont = 0;
    for (int j = 0; j < N; j++) {
      a[i*N + j] = cont;
      b[i*N + j] = cont;
      cont++;
    }
  }

  // Tamaño de grid y bloque
  dim3 dimGrid(1, 1);
  dim3 dimBlock(N, N);

  matrixMultGPU<<<dimGrid, dimBlock>>>(a, b, c);
  
  cudaDeviceSynchronize();

  // Comprueba resultados
  int errores = 0;
  // imprimiendo
  for (int y = 0; y < N; y++) {
    for (int x = 0; x < N; x++) {
        if (c[y*N + x] != 120*x)
          errores++;
    }
  }
  if (errores == 0)
    printf("Resultado correcto");
  else
    printf("Errores: %d", errores);

  cudaFree(a);
  cudaFree(b);
  cudaFree(c);

  return 0;
}

Overwriting mat_gpu.cu


In [51]:
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat_gpu.cu -o mat_gpu -lcudadevrt
!./mat_gpu

Resultado correcto

## Memoria compartida

In [52]:
%%writefile mat_gpu2.cu
#include <stdio.h>
// Tamaño de matrices
const int N = 16;
// Tamaño de mosaico
const int M = 8;

// Computa la multiplicación de matrices en GPU con memoria compartida y B traspuesta
__global__ void matrixMultGPU2(int* A, int* B, int* C, int N) {
    
    int sum = 0;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int i = blockIdx.x * blockDim.x + tx;
    int j = blockIdx.y * blockDim.y + ty;

    __shared__ int As[M][M];
    __shared__ int Bs[M][M];

    // Recorre los mosaicos de A y B necesarios para computar la submatriz de C
    for (int tile = 0; tile < (N/M); tile++){
        // Carga los mosaicos (M x M) de A y B en paralelo (y de forma traspuesta)
        As[ty][tx] = A[(i * N) + (ty + (tile*M))];
        Bs[ty][tx] = B[((tx + (tile * M))*N) + j];

        __syncthreads();

        // Computa los resultados para la submatriz de C
        for (int k = 0; k < M; k++)
          sum += As[k][tx] * Bs[ty][k];

        __syncthreads();
    }
    // Escribe en paralelo los resultados obtenidos por el bloque
    C[i * N + j] = sum;
}

int main() {
  int *a, *b, *c;

  int size = N * N * sizeof(int);

  cudaMallocManaged(&a, size);
  cudaMallocManaged(&b, size);
  cudaMallocManaged(&c, size);

  /* inicializando variables con datos*/
  for (int i = 0; i < N; i++) {
    int cont = 0;
    for (int j = 0; j < N; j++) {
      a[i*N + j] = cont;
      b[i*N + j] = cont;
      cont++;
    }
  }

  // Tamaño de grid y bloque
  dim3 dimGrid(N/M, N/M);
  dim3 dimBlock(M, M);

  matrixMultGPU2<<<dimGrid, dimBlock>>>(a, b, c, N);

  cudaDeviceSynchronize();

  // Comprueba resultados
  int errores = 0;
  // imprimiendo
  for (int y = 0; y < N; y++) {
    for (int x = 0; x < N; x++) {
        if (c[y*N + x] != 120*x)
          errores++;
    }
  }
  if (errores == 0)
    printf("Resultado correcto");
  else
    printf("Errores: %d", errores);

  return 0;
}

Overwriting mat_gpu2.cu


In [59]:
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat_gpu2.cu -o mat_gpu2 -lcudadevrt
!./mat_gpu2
!nvprof ./mat_gpu2

Resultado correcto==1530== NVPROF is profiling process 1530, command: ./mat_gpu2
Resultado correcto==1530== Profiling application: ./mat_gpu2
==1530== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  459.03us         1  459.03us  459.03us  459.03us  matrixMultGPU2(int*, int*, int*, int)
      API calls:   99.47%  204.39ms         3  68.130ms  6.8310us  204.36ms  cudaMallocManaged
                    0.23%  465.62us         1  465.62us  465.62us  465.62us  cudaDeviceSynchronize
                    0.19%  388.56us         1  388.56us  388.56us  388.56us  cuDeviceTotalMem
                    0.08%  159.57us        97  1.6450us     153ns  73.804us  cuDeviceGetAttribute
                    0.02%  41.582us         1  41.582us  41.582us  41.582us  cudaLaunchKernel
                    0.01%  28.532us         1  28.532us  28.532us  28.532us  cuDeviceGetName
                    0.00%  3.2400us         1  3.2400us  3.24

# Medición de tiempos

## Tiempos en CPU

## Tiempos en GPU

In [2]:
%%writefile mat.cu
#include <stdio.h>
// Tamaño de matrices
const int N = 16;
// Tamaño de mosaico
const int M = 8;

// Computa la multiplicación de matrices en GPU sin memoria compartida
__global__ void matrixMultGPU(int *a, int *b, int *c) {
  int k, sum = 0;
  int col = threadIdx.x + blockDim.x * blockIdx.x;
  int fil = threadIdx.y + blockDim.y * blockIdx.y;
  if (col < N && fil < N) {
    for (k = 0; k < N; k++) {
      sum += a[fil * N + k] * b[k * N + col];
    }
    c[fil * N + col] = sum;
  }
}

// Computa la multiplicación de matrices en GPU con memoria compartida y B traspuesta
__global__ void matrixMultGPUComp(int* A, int* B, int* C) {
    
    int sum = 0;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int i = blockIdx.x * blockDim.x + tx;
    int j = blockIdx.y * blockDim.y + ty;

    __shared__ int As[M][M];
    __shared__ int Bs[M][M];

    // Recorre los mosaicos de A y B necesarios para computar la submatriz de C
    for (int tile = 0; tile < (N/M); tile++){
        // Carga los mosaicos (M x M) de A y B en paralelo (y de forma traspuesta)
        As[ty][tx] = A[(i * N) + (ty + (tile*M))];
        Bs[ty][tx] = B[((tx + (tile * M))*N) + j];

        __syncthreads();

        // Computa los resultados para la submatriz de C
        for (int k = 0; k < M; k++)
          sum += As[k][tx] * Bs[ty][k];

        __syncthreads();
    }
    // Escribe en paralelo los resultados obtenidos por el bloque
    C[i * N + j] = sum;
}

int main() {
  int *a, *b, *c;

  int size = N * N * sizeof(int);

  cudaMallocManaged(&a, size);
  cudaMallocManaged(&b, size);
  cudaMallocManaged(&c, size);

  /* inicializando variables con datos*/
  for (int i = 0; i < N; i++) {
    int cont = 0;
    for (int j = 0; j < N; j++) {
      a[i*N + j] = cont;
      b[i*N + j] = cont;
      cont++;
    }
  }

  // Tamaño de grid y bloque
  //dim3 dimGrid(1, 1);
  //dim3 dimBlock(N, N);

  // Tamaño de grid y bloque
  dim3 dimGrid(N/M, N/M);
  dim3 dimBlock(M, M);

  cudaEvent_t start;
  cudaEventCreate(&start);

  cudaEvent_t stop;
  cudaEventCreate(&stop);

  cudaEventRecord(start, NULL);

  int nIter = 1000;
  for (int i = 0; i < nIter; i++){
      matrixMultGPU<<<dimGrid, dimBlock>>>(a, b, c);
  }
  cudaDeviceSynchronize();

  cudaEventRecord(stop, NULL);
  cudaEventSynchronize(stop);

  float msecTotal = 0.0f;
  cudaEventElapsedTime(&msecTotal, start, stop);

  float msecPerKernelExecution = msecTotal / nIter;
  double flopsPerMMull = 2.0 * N * N * N;
  double gigaFlops = (flopsPerMMull * 1.0e-9f) / (msecPerKernelExecution / 1000.0f);

  // Comprueba resultados
  int errores = 0;
  // imprimiendo
  for (int y = 0; y < N; y++) {
    for (int x = 0; x < N; x++) {
        if (c[y*N + x] != N*(N-1)/2*x)
          errores++;
    }
  }
  if (errores != 0)
    printf("Errores: %d\n", errores);

  printf("GFLOPS: %f\n", gigaFlops);


  cudaFree(a);
  cudaFree(b);
  cudaFree(c);

  return 0;
}

Writing mat.cu


## Sin memoria compartida:

In [81]:
!sed -i '/const int N = /c\const int N = 16;' mat.cu
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat.cu -o mat -lcudadevrt
!nvprof ./mat

==2463== NVPROF is profiling process 2463, command: ./mat
GFLOPS: 1.216291
==2463== Profiling application: ./mat
==2463== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  3.0923ms      1000  3.0920us  2.5600us  478.71us  matrixMultGPU(int*, int*, int*)
      API calls:   96.75%  211.72ms         3  70.574ms  5.7390us  211.69ms  cudaMallocManaged
                    2.91%  6.3601ms      1000  6.3600us  4.8930us  29.878us  cudaLaunchKernel
                    0.17%  382.70us         1  382.70us  382.70us  382.70us  cuDeviceTotalMem
                    0.07%  147.56us        97  1.5210us     135ns  62.414us  cuDeviceGetAttribute
                    0.06%  122.00us         3  40.667us  12.327us  79.301us  cudaFree
                    0.01%  29.367us         1  29.367us  29.367us  29.367us  cuDeviceGetName
                    0.01%  20.080us         2  10.040us  1.2080us  18.872us  cudaEventCreate
                

In [82]:
!sed -i '/const int N = /c\const int N = 32;' mat.cu
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat.cu -o mat -lcudadevrt
!nvprof ./mat

==2512== NVPROF is profiling process 2512, command: ./mat
GFLOPS: 10.005422
==2512== Profiling application: ./mat
==2512== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  3.6924ms      1000  3.6920us  3.0710us  584.46us  matrixMultGPU(int*, int*, int*)
      API calls:   96.95%  222.44ms         3  74.147ms  6.0530us  222.41ms  cudaMallocManaged
                    2.69%  6.1803ms      1000  6.1800us  4.6400us  32.548us  cudaLaunchKernel
                    0.17%  391.89us         1  391.89us  391.89us  391.89us  cuDeviceTotalMem
                    0.08%  180.03us        97  1.8550us     154ns  67.870us  cuDeviceGetAttribute
                    0.06%  138.31us         3  46.102us  10.977us  81.254us  cudaFree
                    0.02%  44.720us         1  44.720us  44.720us  44.720us  cuDeviceGetName
                    0.01%  16.392us         2  8.1960us  1.2990us  15.093us  cudaEventCreate
               

In [83]:
!sed -i '/const int N = /c\const int N = 64;' mat.cu
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat.cu -o mat -lcudadevrt
!nvprof ./mat

==2561== NVPROF is profiling process 2561, command: ./mat
GFLOPS: 59.856129
==2561== Profiling application: ./mat
==2561== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  7.4704ms      1000  7.4700us  6.8800us  466.07us  matrixMultGPU(int*, int*, int*)
      API calls:   95.83%  211.19ms         3  70.398ms  6.3200us  211.17ms  cudaMallocManaged
                    3.00%  6.6017ms      1000  6.6010us  4.6550us  108.66us  cudaLaunchKernel
                    0.81%  1.7842ms         1  1.7842ms  1.7842ms  1.7842ms  cudaDeviceSynchronize
                    0.18%  396.58us         1  396.58us  396.58us  396.58us  cuDeviceTotalMem
                    0.10%  216.13us        97  2.2280us     160ns  113.98us  cuDeviceGetAttribute
                    0.05%  116.34us         3  38.779us  13.670us  69.261us  cudaFree
                    0.01%  28.172us         1  28.172us  28.172us  28.172us  cuDeviceGetName
         

In [84]:
!sed -i '/const int N = /c\const int N = 128;' mat.cu
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat.cu -o mat -lcudadevrt
!nvprof ./mat

==2612== NVPROF is profiling process 2612, command: ./mat
GFLOPS: 121.721129
==2612== Profiling application: ./mat
==2612== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  33.171ms      1000  33.170us  32.447us  566.86us  matrixMultGPU(int*, int*, int*)
      API calls:   86.00%  213.90ms         3  71.300ms  6.1230us  213.87ms  cudaMallocManaged
                   11.13%  27.679ms         1  27.679ms  27.679ms  27.679ms  cudaDeviceSynchronize
                    2.57%  6.4010ms      1000  6.4010us  4.6330us  160.25us  cudaLaunchKernel
                    0.15%  381.66us         1  381.66us  381.66us  381.66us  cuDeviceTotalMem
                    0.06%  149.40us        97  1.5400us     143ns  62.588us  cuDeviceGetAttribute
                    0.05%  129.95us         3  43.316us  12.012us  79.386us  cudaFree
                    0.01%  30.506us         1  30.506us  30.506us  30.506us  cuDeviceGetName
        

In [85]:
!sed -i '/const int N = /c\const int N = 512;' mat.cu
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat.cu -o mat -lcudadevrt
!nvprof ./mat

==2661== NVPROF is profiling process 2661, command: ./mat
GFLOPS: 324.711640
==2661== Profiling application: ./mat
==2661== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  825.86ms      1000  825.86us  668.55us  3.7685ms  matrixMultGPU(int*, int*, int*)
      API calls:   79.35%  817.51ms         1  817.51ms  817.51ms  817.51ms  cudaDeviceSynchronize
                   19.75%  203.52ms         3  67.840ms  18.862us  203.48ms  cudaMallocManaged
                    0.81%  8.3421ms      1000  8.3420us  4.7530us  99.959us  cudaLaunchKernel
                    0.04%  370.97us         1  370.97us  370.97us  370.97us  cuDeviceTotalMem
                    0.03%  329.19us         3  109.73us  50.810us  173.66us  cudaFree
                    0.01%  139.99us        97  1.4430us     145ns  58.257us  cuDeviceGetAttribute
                    0.00%  28.190us         1  28.190us  28.190us  28.190us  cuDeviceGetName
        

## Con memoria compartida

In [86]:
!sed -i '/matrixMultGPU<<<dimGrid, dimBlock>>>(a, b, c);/c\matrixMultGPUComp<<<dimGrid, dimBlock>>>(a, b, c);' mat.cu
!sed -i '/const int N = /c\const int N = 16;' mat.cu
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat.cu -o mat -lcudadevrt
!nvprof ./mat

==2711== NVPROF is profiling process 2711, command: ./mat
GFLOPS: 1.076970
==2711== Profiling application: ./mat
==2711== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  1.7038ms      1000  1.7030us  1.3110us  375.57us  matrixMultGPUComp(int*, int*, int*)
      API calls:   96.18%  199.51ms         3  66.502ms  6.3450us  199.48ms  cudaMallocManaged
                    3.45%  7.1567ms      1000  7.1560us  5.0270us  97.368us  cudaLaunchKernel
                    0.21%  443.23us         1  443.23us  443.23us  443.23us  cuDeviceTotalMem
                    0.07%  138.38us        97  1.4260us     142ns  57.884us  cuDeviceGetAttribute
                    0.05%  107.06us         3  35.685us  12.939us  64.390us  cudaFree
                    0.01%  28.485us         1  28.485us  28.485us  28.485us  cuDeviceGetName
                    0.01%  14.737us         2  7.3680us  3.9850us  10.752us  cudaEventRecord
            

In [87]:
!sed -i '/const int N = /c\const int N = 32;' mat.cu
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat.cu -o mat -lcudadevrt
!nvprof ./mat

==2760== NVPROF is profiling process 2760, command: ./mat
GFLOPS: 9.164951
==2760== Profiling application: ./mat
==2760== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  4.6596ms      1000  4.6590us  4.1600us  426.84us  matrixMultGPUComp(int*, int*, int*)
      API calls:   96.38%  199.36ms         3  66.452ms  6.5610us  199.33ms  cudaMallocManaged
                    3.26%  6.7452ms      1000  6.7450us  4.9150us  101.02us  cudaLaunchKernel
                    0.19%  392.28us         1  392.28us  392.28us  392.28us  cuDeviceTotalMem
                    0.07%  153.69us        97  1.5840us     146ns  58.239us  cuDeviceGetAttribute
                    0.06%  128.92us         3  42.972us  13.155us  72.183us  cudaFree
                    0.01%  23.871us         1  23.871us  23.871us  23.871us  cuDeviceGetName
                    0.01%  14.825us         2  7.4120us  4.0270us  10.798us  cudaEventRecord
            

In [88]:
!sed -i '/const int N = /c\const int N = 64;' mat.cu
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat.cu -o mat -lcudadevrt
!nvprof ./mat

==2809== NVPROF is profiling process 2809, command: ./mat
GFLOPS: 54.456322
==2809== Profiling application: ./mat
==2809== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  8.3443ms      1000  8.3440us  7.8390us  420.47us  matrixMultGPUComp(int*, int*, int*)
      API calls:   95.15%  196.51ms         3  65.503ms  6.1570us  196.48ms  cudaMallocManaged
                    3.28%  6.7798ms      1000  6.7790us  4.6480us  81.972us  cudaLaunchKernel
                    1.20%  2.4763ms         1  2.4763ms  2.4763ms  2.4763ms  cudaDeviceSynchronize
                    0.20%  407.47us         1  407.47us  407.47us  407.47us  cuDeviceTotalMem
                    0.08%  159.88us        97  1.6480us     137ns  76.533us  cuDeviceGetAttribute
                    0.06%  127.38us         3  42.459us  13.767us  82.510us  cudaFree
                    0.01%  29.874us         1  29.874us  29.874us  29.874us  cuDeviceGetName
     

In [89]:
!sed -i '/const int N = /c\const int N = 128;' mat.cu
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat.cu -o mat -lcudadevrt
!nvprof ./mat

==2858== NVPROF is profiling process 2858, command: ./mat
GFLOPS: 119.013755
==2858== Profiling application: ./mat
==2858== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  33.925ms      1000  33.925us  32.831us  685.61us  matrixMultGPUComp(int*, int*, int*)
      API calls:   85.15%  203.50ms         3  67.833ms  6.1030us  203.47ms  cudaMallocManaged
                   11.14%  26.611ms         1  26.611ms  26.611ms  26.611ms  cudaDeviceSynchronize
                    3.43%  8.1950ms      1000  8.1950us  4.5590us  663.98us  cudaLaunchKernel
                    0.15%  350.23us         1  350.23us  350.23us  350.23us  cuDeviceTotalMem
                    0.06%  137.87us        97  1.4210us     132ns  58.358us  cuDeviceGetAttribute
                    0.05%  116.76us         3  38.919us  12.012us  72.066us  cudaFree
                    0.01%  27.126us         1  27.126us  27.126us  27.126us  cuDeviceGetName
    

In [3]:
!sed -i '/const int N = /c\const int N = 512;' mat.cu
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true mat.cu -o mat -lcudadevrt
!nvprof ./mat

==247== NVPROF is profiling process 247, command: ./mat
GFLOPS: 374.523663
==247== Profiling application: ./mat
==247== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  715.92ms      1000  715.92us  685.99us  2.1665ms  matrixMultGPU(int*, int*, int*)
      API calls:   70.58%  710.04ms         1  710.04ms  710.04ms  710.04ms  cudaDeviceSynchronize
                   28.70%  288.73ms         3  96.243ms  18.365us  288.69ms  cudaMallocManaged
                    0.62%  6.2437ms      1000  6.2430us  4.1620us  109.35us  cudaLaunchKernel
                    0.04%  429.32us         1  429.32us  429.32us  429.32us  cuDeviceTotalMem
                    0.03%  271.15us         3  90.384us  50.055us  126.49us  cudaFree
                    0.02%  159.27us        97  1.6410us     135ns  71.547us  cuDeviceGetAttribute
                    0.00%  26.068us         2  13.034us  9.5050us  16.563us  cudaEventRecord
            