In [2]:
#!sudo apt update
#!sudo apt purge *nvidia* -y
#!sudo apt install nvidia-driver-530 -y


#!pip install pyopencl
#!apt-get install -y pocl-opencl-icd ocl-icd-libopencl1

In [3]:
import os

import numpy as np
import pandas as pd
import pyopencl as cl


  warn("Unable to import recommended hash 'siphash24.siphash13', "


In [4]:
program_text="""
    __kernel void MatrixMul_kernel_localA(int dim, __global float *A, __global float *B, __global float *C, __local float *lA) {

        //Get the index of the work-item
             int iCol = get_global_id(0);
             int iRow = get_global_id(1);
             int localIdx = get_local_id(0);
             int localSizex = get_local_size(0);


             float result = 0.0f;
             int numElements = dim/localSizex;

             for(int i=0; i<numElements ; i++)
                 {
                  lA[localIdx*numElements + i] = A[iRow*dim + localIdx*numElements + i];
                  // Print para depuración
                   printf("voy bien", A[iRow*dim + localIdx*numElements + i],lA[localIdx*numElements + i]);

                   }

             barrier(CLK_LOCAL_MEM_FENCE);

             for(int i=0;i< dim;++i)
                   {
                       result += lA[i]*B[i*dim + iCol];
                     }
             C[iRow*dim + iCol] = result;
}
"""

In [5]:
def mult_mat_local(dim:int,local_size:tuple,device_type,program_text,A,B):

  # Plataforma
  platform = cl.get_platforms()[0]

  # Dispositivo (GPU)
  device = platform.get_devices(device_type=device_type)[0]

  # Crear contexto con el dispositivo seleccionado
  context = cl.Context([device])

  # Crear una cola de comandos
  command_queue = cl.CommandQueue(context, device=device, properties=cl.command_queue_properties.PROFILING_ENABLE)

  # Crear el programa y compilarlo
  program = cl.Program(context, program_text)
  try:
       program.build()
  except Exception as e:
    print("Build log:")
    print(program.get_build_info(device, cl.program_build_info.LOG))
    raise e

  # Crear el kernel
  kernel = cl.Kernel(program, 'MatrixMul_kernel_localA')

  # Inicializar matrices  C
  C = np.zeros((dim, dim), dtype=np.float32)

  # Crear buffers en el dispositivo
  mf = cl.mem_flags
  buffer_A = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=A)
  buffer_B = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=B)
  buffer_C = cl.Buffer(context, mf.WRITE_ONLY, C.nbytes)

    # Configurar argumentos del kernel
  numElements = dim // local_size[0]
  local_mem_size = local_size[0] * numElements * np.dtype(np.float32).itemsize
  kernel.set_arg(0, np.int32(dim))
  kernel.set_arg(1, buffer_A)
  kernel.set_arg(2, buffer_B)
  kernel.set_arg(3, buffer_C)
  kernel.set_arg(4, cl.LocalMemory(local_mem_size))

  # Definir el tamaño global y local
  global_size = (dim, dim)

  # Ejecutar el kernel
  event = cl.enqueue_nd_range_kernel(command_queue, kernel, global_size, local_size)
  event.wait()

  # Medir tiempo de ejecución
  start_time = event.profile.start
  end_time = event.profile.end
  execution_time = (end_time - start_time) * 1e-9  # Convertir a segundos

  # Leer el resultado del buffer C
  cl.enqueue_copy(command_queue, C, buffer_C).wait()

  return execution_time, C


In [6]:
PYOPENCL_COMPILER_OUTPUT=1
def main():

  device_type=cl.device_type.GPU
  index = [(f"({2 ** i}/{2 ** i})" if i != 0 else "(1/1)") for i in range(0, 5)]
  columns = [2 ** i for i in range(1, 14)]  # 2^1 a 2^13 (de 2 a 8192)
  results_df = pd.DataFrame(index=index, columns=columns)

  i=1
  while i<=16:

    local_size=(i,i)
    dim=i

    while dim<=512:

       A = (np.random.rand(dim, dim) * 10).astype(np.float32)
       B = (np.random.rand(dim, dim) * 10).astype(np.float32)
       
       exec_time,C=mult_mat_local(dim,local_size,device_type,program_text,A,B)
       print(A,B,C)
       results_df.loc[f"({i}/{i})", dim] = exec_time if exec_time is not None else "NP"

       dim*=2


       del A,B

    i*=2

  #Guardar los resultados
  results_df=results_df.drop(columns=[1])
  results_df.to_csv('C:/Users/Eevee/Documents/OPENCL_TFG/MULTIPLICACION DE MATRICES/Mult_Mat_Memoris_Local_GPU.csv',index=True)

  return results_df


# Run the benchmark sweep and keep the returned timing table for inspection.
results_df=main()


[[1.0272797]] [[6.8477817]] [[7.0345874]]
[[9.622467  9.478522 ]
 [5.700116  1.7390077]] [[7.1088467 2.402424 ]
 [8.311157  6.7820024]] [[147.18213  87.40061]
 [ 54.97442  25.48805]]
[[0.7949573 3.1848161 9.627438  3.876939 ]
 [8.0998    6.280698  8.11947   4.690803 ]
 [3.9733646 3.0848887 2.4830015 5.756375 ]
 [0.5597586 1.166825  6.267082  9.636275 ]] [[8.529339  2.375921  5.982067  5.751057 ]
 [7.7180824 8.825683  8.231365  1.9464469]
 [7.619036  3.00728   5.0006433 1.2065688]
 [2.8593848 7.443916  8.367194  4.5092945]] [[115.79859   87.808945 111.55335   39.869347]
 [192.83623  134.0114   180.0037    89.75637 ]
 [ 93.07737   86.983696 109.74309   57.80872 ]
 [ 89.08297  102.20647  124.921104  56.504837]]
[[5.1418347  9.712438   3.7822332  6.7312474  3.8925104  0.46568677
  0.3024679  7.8273597 ]
 [7.0759497  9.839875   2.4636424  0.74347854 2.1336555  3.3933575
  8.26441    8.303771  ]
 [3.899941   8.767117   6.8769217  4.174437   2.45269    8.8906555
  9.092098   9.697778  ]
 [9.7

RuntimeError: clWaitForEvents failed: OUT_OF_RESOURCES