# **Exercício Programa 5**

Nome: Jhon Wislin Ribeiro Citron

RA: 776852

Professor: Hermes Senger


Abaixo é possível observar o código de multiplicação de matrizes. O objetivo é paralelizá-lo utilizando programação em CUDA, de forma que o cálculo seja dividido entre as threads da grid da GPU.

In [None]:
!lscpu

Architecture:            x86_64
  CPU op-mode(s):        32-bit, 64-bit
  Address sizes:         46 bits physical, 48 bits virtual
  Byte Order:            Little Endian
CPU(s):                  2
  On-line CPU(s) list:   0,1
Vendor ID:               GenuineIntel
  Model name:            Intel(R) Xeon(R) CPU @ 2.00GHz
    CPU family:          6
    Model:               85
    Thread(s) per core:  2
    Core(s) per socket:  1
    Socket(s):           1
    Stepping:            3
    BogoMIPS:            4000.41
    Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clf
                         lush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_
                         good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fm
                         a cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hyp
                         ervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd i

In [None]:
!nvidia-smi

Mon Feb  5 01:13:44 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
%%writefile ep5.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * Computes c = a * b for N x N int matrices stored row-major in flat arrays.
 *
 * Parameters:
 *   a, b - input matrices (N * N elements each), read only.
 *   c    - output matrix (N * N elements), every element is overwritten.
 *   N    - matrix dimension.
 *
 * Launch layout: a 2D grid of 2D blocks. Rows are distributed over the y
 * dimension and columns over the x dimension. Each thread starts at its own
 * global (row, col) and advances by the total number of threads launched in
 * the matching dimension, so any grid/block shape (square or not, smaller or
 * larger than N) covers every output element exactly once.
 * No shared memory is used.
 */
__global__ void matrixMulGPU(int *a, int *b, int *c, int N)
{
    // Global starting coordinates of this thread. Columns follow threadIdx.x,
    // so threads with adjacent threadIdx.x touch adjacent elements of b and c.
    int rowStart = blockIdx.y * blockDim.y + threadIdx.y;
    int colStart = blockIdx.x * blockDim.x + threadIdx.x;

    // Each stride must come from the SAME dimension as its start index;
    // mixing them silently skips rows/columns on non-square launches.
    int rowStride = blockDim.y * gridDim.y; // total threads along y
    int colStride = blockDim.x * gridDim.x; // total threads along x

    // Flat offsets are computed in size_t so row * N cannot overflow int.
    const size_t n = (size_t)N;

    for (int row = rowStart; row < N; row += rowStride)
    {
        const size_t rowBase = (size_t)row * n;
        for (int col = colStart; col < N; col += colStride)
        {
            int val = 0;
            for (int k = 0; k < N; ++k)
            {
                val += a[rowBase + k] * b[(size_t)k * n + col];
            }
            c[rowBase + col] = val;
        }
    }
}


/*
 * Reference (host-side) matrix product: writes c = a * b, where a, b and c
 * are N x N int matrices stored row-major in flat arrays of N * N elements.
 * The result is used as the expected answer when checking matrixMulGPU.
 */

void matrixMulCPU( int * a, int * b, int * c , int N)
{
  for (int i = 0; i < N; ++i)
  {
    // Offset of the first element of row i in the flat arrays.
    const int rowBase = i * N;

    for (int j = 0; j < N; ++j)
    {
      // Dot product of row i of a with column j of b.
      int acc = 0;
      int k = 0;
      while (k < N)
      {
        acc += a[rowBase + k] * b[k * N + j];
        ++k;
      }
      c[rowBase + j] = acc;
    }
  }
}

/* Prints a diagnostic and terminates the program when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess)
  {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

/*
 * Multiplies two N x N int matrices on the GPU and on the CPU, prints the
 * time taken by each, and checks that both results match.
 *
 * Usage: <program> <N>   (N must be a positive integer)
 *
 * Returns 0 when the GPU result matches the CPU reference, non-zero on bad
 * arguments or on a mismatch. Any CUDA runtime failure terminates the program.
 */
int main(int argc, char *argv[])
{
  if (argc != 2)
  {
    printf("Usage: %s <N>\n", argv[0]);
    return 1;
  }

  int N = atoi(argv[1]);
  if (N <= 0)
  {
    // atoi yields 0 for non-numeric input; an empty/negative matrix is meaningless here.
    fprintf(stderr, "N must be a positive integer\n");
    return 1;
  }

  // Number of bytes of an N x N matrix; computed in size_t so it cannot
  // be truncated to int for large N.
  size_t size = (size_t)N * (size_t)N * sizeof (int);

  // Solution matrices for both the CPU and the GPU operations.
  int *a, *b, *c_cpu, *c_gpu;

  // Allocate managed memory, accessible from both host and device.
  checkCuda(cudaMallocManaged (&a, size), "cudaMallocManaged a");
  checkCuda(cudaMallocManaged (&b, size), "cudaMallocManaged b");
  checkCuda(cudaMallocManaged (&c_cpu, size), "cudaMallocManaged c_cpu");
  checkCuda(cudaMallocManaged (&c_gpu, size), "cudaMallocManaged c_gpu");

  // Initialize memory; create 2D matrices
  for( int row = 0; row < N; ++row )
    for( int col = 0; col < N; ++col )
    {
      size_t idx = (size_t)row * N + col;
      a[idx] = row;
      b[idx] = col+2;
      c_cpu[idx] = 0;
      c_gpu[idx] = 0;
    }

  // Events used to time the kernel; created only after the arguments are
  // validated so that early returns do not leak them.
  cudaEvent_t start_gpu, stop_gpu;
  checkCuda(cudaEventCreate(&start_gpu), "cudaEventCreate start");
  checkCuda(cudaEventCreate(&stop_gpu), "cudaEventCreate stop");

  /*
   * 2D launch configuration for matrixMulGPU. The kernel strides over the
   * matrix, so this fixed grid works for any N.
   */
  dim3 threads_per_block(8,8);
  dim3 number_of_blocks(32,32);

  // NOTE(review): the matrices were last touched by the host, so the measured
  // interval presumably also includes moving managed memory to the device.
  checkCuda(cudaEventRecord(start_gpu), "cudaEventRecord start");

  matrixMulGPU <<< number_of_blocks, threads_per_block >>> ( a, b, c_gpu, N );
  checkCuda(cudaGetLastError(), "matrixMulGPU launch");        // invalid launch configuration
  checkCuda(cudaDeviceSynchronize(), "matrixMulGPU execution"); // faults raised while running

  checkCuda(cudaEventRecord(stop_gpu), "cudaEventRecord stop");
  checkCuda(cudaEventSynchronize(stop_gpu), "cudaEventSynchronize stop");

  // cudaEventElapsedTime reports MILLISECONDS; convert so that the value
  // matches the "seconds" label and is comparable with the CPU time below.
  float elapsed_ms = 0;
  checkCuda(cudaEventElapsedTime(&elapsed_ms, start_gpu, stop_gpu), "cudaEventElapsedTime");
  float duration_gpu = elapsed_ms / 1000.0f;
  printf("GPU Time: %lf seconds\n", duration_gpu);

  // Call the CPU version to check our work
  clock_t start_cpu = clock();

  matrixMulCPU( a, b, c_cpu, N );

  clock_t stop_cpu = clock();

  double duration_cpu = ((double)(stop_cpu - start_cpu)) / CLOCKS_PER_SEC;
  printf("CPU Time: %lf seconds\n", duration_cpu);

  // Compare the two answers to make sure they are equal; stop at the first mismatch.
  bool error = false;
  for( int row = 0; row < N && !error; ++row )
    for( int col = 0; col < N && !error; ++col )
      if (c_cpu[(size_t)row * N + col] != c_gpu[(size_t)row * N + col])
      {
        printf("FOUND ERROR at c[%d][%d]\n", row, col);
        error = true;
      }
  if (!error)
    printf("Success!\n");

  // Release the timing events and all allocated memory.
  cudaEventDestroy(start_gpu);
  cudaEventDestroy(stop_gpu);
  cudaFree(a); cudaFree(b);
  cudaFree( c_cpu ); cudaFree( c_gpu );

  return error ? 1 : 0;
}

Overwriting ep5.cu


In [None]:
!nvcc -arch=sm_75 -o ep5 ep5.cu && ./ep5 10

GPU Time: 0.529344 seconds
CPU Time: 0.000063 seconds
Success!


In [None]:
!nvcc -arch=sm_75 -o ep5 ep5.cu && ./ep5 100

GPU Time: 0.799072 seconds
CPU Time: 0.004822 seconds
Success!


In [None]:
!nvcc -arch=sm_75 -o ep5 ep5.cu && ./ep5 1000

GPU Time: 12.529856 seconds
CPU Time: 3.873425 seconds
Success!
