In [None]:
import numpy as np
import pyopencl as cl

# Fixed seed so the host-side inputs are identical on every
# Restart Kernel -> Run All; change it to try different data.
SEED = 42
# Number of float32 elements in each input vector.
N_ELEMENTS = 50000

rng = np.random.default_rng(SEED)
a_np = rng.random(N_ELEMENTS, dtype=np.float32)
b_np = rng.random(N_ELEMENTS, dtype=np.float32)

# Let PyOpenCL pick a context and create a command queue on it.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Device-side input buffers, initialised from the host arrays at creation.
mf = cl.mem_flags
a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)

# Element-wise addition kernel: each work-item handles the index given by
# its global id along dimension 0.
prg = cl.Program(ctx, """
__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()

# Output buffer with the same byte size as one input vector.
res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
knl = prg.sum  # Use this Kernel object for repeated calls
# Global size is the shape of the input (one work-item per element);
# the local size is left as None.
knl(queue, a_np.shape, None, a_g, b_g, res_g)

# Copy the device result back into a host array shaped like the inputs.
res_np = np.empty_like(a_np)
cl.enqueue_copy(queue, res_np, res_g)

# Check on CPU with Numpy:
error_np = res_np - (a_np + b_np)
print(f"Error:\n{error_np}")
print(f"Norm: {np.linalg.norm(error_np):.16e}")
assert np.allclose(res_np, a_np + b_np)

  warn("Unable to import recommended hash 'siphash24.siphash13', "


Error:
[0. 0. 0. ... 0. 0. 0.]
Norm: 0.0000000000000000e+00


  prg.build(options_bytes, [devices[i] for i in to_be_built_indices])


In [5]:
import pyopencl as cl
# List every OpenCL platform PyOpenCL reports so the index used below can
# be checked against the printed output.
available_platforms = cl.get_platforms()
for entry in available_platforms:
    print(entry)

# Index 0 is hard-coded; adjust it if your AMD platform is listed elsewhere.
platform = available_platforms[0]
# Take the first device the chosen platform reports.
device = platform.get_devices()[0]

# Build a context restricted to that single device and a queue on it.
# NOTE(review): this rebinds `queue`, which the first cell also assigns
# (there it is created on `ctx`, not on `context`) - confirm that no later
# cell mixes this queue with buffers created on `ctx`.
context = cl.Context([device])
queue = cl.CommandQueue(context)

print("Selected device:", device.name)
print(
    "Device type:",
    cl.device_type.to_string(device.type),
    context.devices,
    queue,
)

<pyopencl.Platform 'AMD Accelerated Parallel Processing' at 0x7ff9de3c3000>
Selected device: gfx1103
Device type: ALL | GPU [<pyopencl.Device 'gfx1103' on 'AMD Accelerated Parallel Processing' at 0x1cb56296860>] <pyopencl._cl.CommandQueue object at 0x000001CB57179FD0>
