<a href="https://colab.research.google.com/github/Ludvins/Practicas_PDGE/blob/master/CUDA/Reduccion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

El objetivo de este ejercicio es familiarizarse con un tipo de operaciones
muy común en computación científica: las reducciones. Una reducción
es una combinación de todos los elementos de un vector en un valor
único, utilizando para ello algún tipo de operador asociativo. Las
implementaciones paralelas aprovechan esta asociatividad para calcular
operaciones en paralelo, calculando el resultado en $O(\log N)$ pasos sin
incrementar el número de operaciones realizadas. 

En este ejercicio se trata de comparar diferentes patrones de acceso a
los datos para ir asociando por pares los operandos de cada operación.
Esto afecta al rendimiento de la memoria, y también a la complejidad de
programación, ya que las expresiones que hay que crear para que los
hilos generen los índices de acceso a sus datos en cada paso difieren en
dificultad.

# Diferentes paradigmas de reducción

## Caso 1

Este esquema de reducciónfunciona segun el siguiente esquema: (poner foto)

- Se lanzan tantos threads como elementos hay en el vector.
- Cada thread carga un elemento en un array de memoria compartida.
- Aumentando el valor de traslación en el vector (stride) se hacen los siguientes pasos:
  - Si el identificador del thread es mejor que 2*stride, añade a su posición la de la poscicion trasladada.

  Esto resulta en hacer

In [8]:
%%writefile Reduction0.cu

#include <stdio.h>
#include <assert.h>

#define NUM_ELEMENTS 512

// **===------------------------------------------------------------------===**
//! @param g_idata  input data in global memory
//                  result is expected in index 0 of g_idata
//! @param n        input number of elements to scan from input data
// **===------------------------------------------------------------------===**
__global__ void reduction(float *g_data, int n){
  int stride;
  // Define shared memory
  __shared__ float scratch[NUM_ELEMENTS];
  // Load the shared memory
  scratch[threadIdx.x ] = g_data[threadIdx.x];
  __syncthreads();
  // Do sum reduction from shared memory
  for (stride = 1 ; stride < blockDim.x; stride *= 2) {
      __syncthreads();
      if (threadIdx.x % (2*stride) == 0)
              scratch[threadIdx.x] += scratch[threadIdx.x + stride];
      
  }
  // Store results back to global memory
  if(threadIdx.x == 0)
    g_data[0] = scratch[0];
  return;
}

float computeOnDevice(float* h_data, int num_elements)
{
  float* d_data = NULL;
  float result;
  // Memory allocation on device side
  cudaMalloc((void**)&d_data, num_elements*sizeof(float));
  // Copy from host memory to device memory
  cudaMemcpy(d_data, h_data, num_elements*sizeof(float), cudaMemcpyHostToDevice);

  int threads = num_elements;
  // Invoke the kernel
  reduction<<<1,threads>>>(d_data,num_elements);
  // Copy from device memory back to host memory
  cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(d_data);
  return result;
}


  
  void computeOnHost( float* reference, float* idata, const unsigned int len) 
{
  reference[0] = 0;
  double total_sum = 0;
  unsigned int i;
  for( i = 0; i < len; ++i) 
  {
      total_sum += idata[i];
  }
  reference[0] = total_sum;
}



////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main() {
    int num_elements = NUM_ELEMENTS;
    const unsigned int array_mem_size = sizeof( float) * num_elements;
    // allocate host memory to store the input data
    float* h_data = (float*) malloc( array_mem_size);
    // * No arguments: Randomly generate input data and compare against the host's 
    
            // initialize the input data on the host to be integer values
            // between 0 and 1000
            for( unsigned int i = 0; i < num_elements; ++i) 
            {
                //h_data[i] = floorf(1000*(rand()/(float)RAND_MAX));
                h_data[i] = i*1.0;
            }
       
        // compute reference solution
    float reference = 0.0f;  
    computeOnCPU(&reference , h_data, num_elements);
  
   
    float result = computeOnDevice(h_data, num_elements);
    // We can use an epsilon of 0 since values are integral and in a range 
    // that can be exactly represented
    float epsilon = 0.0f;
    unsigned int result_regtest = (abs(result - reference) <= epsilon);
    printf( "Test %s\n", (1 == result_regtest) ? "PASSED" : "FAILED");
    printf( "device: %f  host: %f\n", result, reference);
    // cleanup memory
    free( h_data);
}

Writing Reduction0.cu


In [9]:
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true Reduction0.cu -o red0 -lcudadevrt


Reduction0.cu(50): error: function "computeOnDevice" has already been defined

Reduction0.cu(102): error: identifier "computeOnCPU" is undefined

2 errors detected in the compilation of "/tmp/tmpxft_00000178_00000000-8_Reduction0.cpp1.ii".


In [3]:
!./red0

Test PASSED
device: 130816.000000  host: 130816.000000


In [42]:
!nvprof ./red0

==710== NVPROF is profiling process 710, command: ./red
Test PASSED
device: 130816.000000  host: 130816.000000
==710== Profiling application: ./red
==710== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   67.92%  8.0630us         1  8.0630us  8.0630us  8.0630us  reduction(float*, int)
                   16.44%  1.9520us         1  1.9520us  1.9520us  1.9520us  [CUDA memcpy HtoD]
                   15.63%  1.8560us         1  1.8560us  1.8560us  1.8560us  [CUDA memcpy DtoH]
      API calls:   99.65%  197.40ms         1  197.40ms  197.40ms  197.40ms  cudaMalloc
                    0.19%  371.29us         1  371.29us  371.29us  371.29us  cuDeviceTotalMem
                    0.07%  144.66us        97  1.4910us     133ns  61.010us  cuDeviceGetAttribute
                    0.04%  78.882us         1  78.882us  78.882us  78.882us  cudaFree
                    0.02%  37.087us         2  18.543us  15.710us  21.377us  cudaMemcp

## Caso 2

In [10]:
%%writefile Reduction1.cu
// includes, kernels
#include <stdio.h>
#include <assert.h>
#define NUM_ELEMENTS 512
// **===------------------------------------------------------------------===**
//! @param g_idata  input data in global memory
//                  result is expected in index 0 of g_idata
//! @param n        input number of elements to scan from input data
// **===------------------------------------------------------------------===**
__global__ void reduction(float *g_data, int n)
{
  int stride;
  // Define shared memory
  __shared__ float scratch[NUM_ELEMENTS];
  // Load the shared memory
  scratch[threadIdx.x ] = g_data[threadIdx.x];
  if(threadIdx.x + blockDim.x < n)
    scratch[threadIdx.x + blockDim.x] = g_data[threadIdx.x + blockDim.x];
  __syncthreads();
  // Do sum reduction from shared memory
 
  for (stride = NUM_ELEMENTS / 2; stride >= 1; stride >>= 1)
  {
      if(threadIdx.x < stride)
         scratch[threadIdx.x] += scratch[threadIdx.x + stride];
      __syncthreads();
  }
  // Store results back to global memory
  if(threadIdx.x == 0)
    g_data[0] = scratch[0];
  return;
}

  void computeOnHost( float* reference, float* idata, const unsigned int len) 
{
  reference[0] = 0;
  double total_sum = 0;
  unsigned int i;
  for( i = 0; i < len; ++i) 
  {
      total_sum += idata[i];
  }
  reference[0] = total_sum;
}

float computeOnDevice(float* h_data, int num_elements)
{
 float* d_data = NULL;
  float result;
  // Memory allocation on device side
  cudaMalloc((void**)&d_data, num_elements*sizeof(float));
  // Copy from host memory to device memory
  cudaMemcpy(d_data, h_data, num_elements*sizeof(float), cudaMemcpyHostToDevice);
  int threads = (num_elements/2) + num_elements%2;
  // Invoke the kernel
  reduction<<<1,threads>>>(d_data,num_elements);
  // Copy from device memory back to host memory
  cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(d_data);
  return result;
}
   


////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main() {
    int num_elements = NUM_ELEMENTS;
    const unsigned int array_mem_size = sizeof( float) * num_elements;
    // allocate host memory to store the input data
    float* h_data = (float*) malloc( array_mem_size);
    // * No arguments: Randomly generate input data and compare against the host's 
    
            // initialize the input data on the host to be integer values
            // between 0 and 1000
            for( unsigned int i = 0; i < num_elements; ++i) 
            {
                //h_data[i] = floorf(1000*(rand()/(float)RAND_MAX));
                h_data[i] = i*1.0;
            }
       
        // compute reference solution
    float reference = 0.0f;  
    computeOnHost(&reference , h_data, num_elements);
  
   
    float result = computeOnDevice(h_data, num_elements);
    // We can use an epsilon of 0 since values are integral and in a range 
    // that can be exactly represented
    float epsilon = 0.0f;
    unsigned int result_regtest = (abs(result - reference) <= epsilon);
    printf( "Test %s\n", (1 == result_regtest) ? "PASSED" : "FAILED");
    printf( "device: %f  host: %f\n", result, reference);
    // cleanup memory
    free( h_data);
}

Overwriting Reduction1.cu


In [11]:
!/usr/local/cuda/bin/nvcc -arch=sm_35 -rdc=true Reduction1.cu -o red1 -lcudadevrt
!./red1


Test PASSED
device: 130816.000000  host: 130816.000000
