In [1]:
import pyopencl as cl
import numpy as np
import pandas as pd


In [2]:
program_text="""
    __kernel void MatrixMul_kernel_localA_coallesced(int dim,__global float *A,__global float *B,__global float *C,__local float *lA)
{
 //Get the index of the work-item
 int iCol = get_global_id(0);
 int iRow = get_global_id(1);
 int localIdx = get_local_id(0);
 int localSizex = get_local_size(0);
 
 float result = 0.0f;
 int numElements = dim/localSizex;
 
 for(int i=0; i<numElements ; i++)
 {
    lA[i*localSizex + localIdx] = A[iRow*dim + i*localSizex +localIdx];
 }
 barrier(CLK_LOCAL_MEM_FENCE);
 
 for(int i=0;i< dim;++i)
 {
         result += lA[i]*B[i*dim + iCol];
 }
 C[iRow*dim + iCol] = result;
}
"""

In [3]:
MatrixMul_Local_Memory="""
__kernel void MatrixMul_Local_Memory(int N,__global float* A, __global float* B, __global float* C, __local float* sh_A, __local float* sh_B) {
    // Obtener la información de los índices
    int by = get_group_id(1);  // blockIdx.y
    int bx = get_group_id(0);  // blockIdx.x

    int ty = get_local_id(1);  // threadIdx.y
    int tx = get_local_id(0);  // threadIdx.x

    // Asumiendo TILE_WIDTH es el tamaño de grupo local (local workgroup size)
    int TILE_WIDTH = get_local_size(0);  // Debe ser igual a get_local_size(1)

    // C[i,j]
    int i = TILE_WIDTH * by + ty;
    int j = TILE_WIDTH * bx + tx;

    // Inicializar el valor de la celda de C
    float value = 0.0f;

    // Loop para la multiplicación de matrices en bloques
    for (int phase = 0; phase < N / TILE_WIDTH; phase++) {
        // Cargar los sub-bloques (tiles) de A y B en la memoria local
        sh_A[ty * TILE_WIDTH + tx] = A[i * N + (phase * TILE_WIDTH + tx)];
        sh_B[ty * TILE_WIDTH + tx] = B[(phase * TILE_WIDTH + ty) * N + j];
        
        // Sincronizar los hilos para asegurar que toda la memoria local esté cargada
        barrier(CLK_LOCAL_MEM_FENCE);

        // Calcular el producto punto de los sub-bloques
        for (int k = 0; k < TILE_WIDTH; k++) {
            value += sh_A[ty * TILE_WIDTH + k] * sh_B[k * TILE_WIDTH + tx];
        }

        // Sincronizar los hilos antes de cargar el siguiente bloque
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Asignar el valor calculado a la matriz C
    C[i * N + j] = value;
}


"""

In [4]:
def mult_mat_local_coallesced(dim:int,local_size:tuple,device_type,MatrixMul_Local_Memory,A,B):

  # Plataforma
  platform = cl.get_platforms()[0]

  # Dispositivo (GPU)
  device = platform.get_devices(device_type=device_type)[0]

  # Crear contexto con el dispositivo seleccionado
  context = cl.Context([device])

  # Crear una cola de comandos
  command_queue = cl.CommandQueue(context, device=device, properties=cl.command_queue_properties.PROFILING_ENABLE)

  # Crear el programa y compilarlo
  program = cl.Program(context, program_text)
  try:
       program.build()
  except Exception as e:
    print("Build log:")
    print(program.get_build_info(device, cl.program_build_info.LOG))
    raise e

  # Crear el kernel
  kernel = cl.Kernel(program, 'MatrixMul_kernel_localA_coallesced')

  # Inicializar matrices  C
  C = np.zeros((dim, dim), dtype=np.float32)

  # Crear buffers en el dispositivo
  mf = cl.mem_flags
  buffer_A = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=A)
  buffer_B = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=B)
  buffer_C = cl.Buffer(context, mf.WRITE_ONLY, C.nbytes)

  # Configurar argumentos del kernel
  # Tamaño de la memoria local (por ejemplo, para un bloque TILE_WIDTH x TILE_WIDTH)

  numElements = dim // local_size[0]
  local_mem_size = local_size[0] * numElements * np.dtype(np.float32).itemsize

  # Crear buffers de memoria local para sh_A y sh_B
  local_A = cl.LocalMemory(local_mem_size)
  #local_B = cl.LocalMemory(local_mem_size)

  kernel.set_arg(0, np.int32(dim))
  kernel.set_arg(1, buffer_A)
  kernel.set_arg(2, buffer_B)
  kernel.set_arg(3, buffer_C)
  kernel.set_arg(4, local_A)
  #kernel.set_arg(5,local_B)


  # Definir el tamaño global y local
  global_size = (dim, dim)

  # Ejecutar el kernel
  event = cl.enqueue_nd_range_kernel(command_queue, kernel, global_size, local_size)
  event.wait()

  # Medir tiempo de ejecución
  start_time = event.profile.start
  end_time = event.profile.end
  execution_time = (end_time - start_time) * 1e-9  # Convertir a segundos

  # Leer el resultado del buffer C
  cl.enqueue_copy(command_queue, C, buffer_C).wait()

  return execution_time, C







In [5]:
device_type=cl.device_type.GPU
local_size=(1,1)
dim=2
A = (np.random.rand(dim, dim) * 10).astype(np.float32)
B = (np.random.rand(dim, dim) * 10).astype(np.float32)
print(A,B)
exec_time,C=mult_mat_local_coallesced(dim,local_size,device_type,program_text,A,B)
print(exec_time)
print(C)


[[1.065934  8.285424 ]
 [5.3255353 9.461278 ]] [[8.040339  1.5503781]
 [9.938599  0.2142359]]
5.088e-06
[[ 90.91598    3.427636]
 [136.85095   10.283538]]


In [6]:
PYOPENCL_COMPILER_OUTPUT=1

def main():
  device_type=cl.device_type.GPU
  index = [(f"({2 ** i}/{2 ** i})" if i != 0 else "(1/1)") for i in range(0, 5)]
  columns = [2 ** i for i in range(1, 14)]  # 2^1 a 2^13 (de 2 a 8192)
  results_df = pd.DataFrame(index=index, columns=columns)

  i=1
  while i<=16:

    local_size=(i,i)
    dim=i

    while dim<=8192:

       A = (np.random.rand(dim, dim) * 10).astype(np.float32)
       B = (np.random.rand(dim, dim) * 10).astype(np.float32)

       exec_time,C=mult_mat_local_coallesced(dim,local_size,device_type,program_text,A,B)
    

       results_df.loc[f"({i}/{i})", dim] = exec_time if exec_time is not None else "NP"

       dim*=2

       del A,B

    i*=2

  #Guardar los resultados
  results_df=results_df.drop(columns=[1])
  

  return results_df


results_df=main()


In [7]:
dim=2
device_type=cl.device_type.GPU
local_size=(1,1)
A = (np.random.rand(dim, dim) * 10).astype(np.float32)
B = (np.random.rand(dim, dim) * 10).astype(np.float32)
exec_time,C=mult_mat_local_coallesced(dim,local_size,device_type,MatrixMul_Local_Memory,A,B)

In [8]:
display(results_df)

Unnamed: 0,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192
(1/1),5e-06,5e-06,4e-06,6e-06,6e-06,2e-05,0.000131,0.001007,0.007996,0.067242,0.488994,6.611627,119.117633
(2/2),5e-06,5e-06,6e-06,6e-06,5e-06,7e-06,2.6e-05,0.00017,0.0018,0.01074,0.230164,3.625132,32.03271
(4/4),,4e-06,4e-06,4e-06,5e-06,5e-06,1e-05,4.2e-05,0.000279,0.002176,0.023068,0.380702,5.431366
(8/8),,,4e-06,4e-06,5e-06,5e-06,6e-06,2.2e-05,0.000132,0.000986,0.008044,0.106959,1.276112
(16/16),,,,5e-06,5e-06,5e-06,7e-06,2.4e-05,0.000125,0.000921,0.00831,0.066806,0.537381
