In [13]:
import pyopencl as cl
import numpy as np
import time

# Define the matrix multiplication kernel
kernel_code = """
__kernel void matrix_multiply(__global const float* A,
                              __global const float* B,
                              __global float* C,
                              const int M,
                              const int N,
                              const int K)
{
    int i = get_global_id(0);
    int j = get_global_id(1);
    
    float sum = 0.0f;
    for (int k = 0; k < K; k++) {
        sum += A[i * K + k] * B[k * N + j];
    }
    
    C[i * N + j] = sum;
}
"""

# Create the OpenCL context and command queue
platform = cl.get_platforms()[0]
device = platform.get_devices()[0]
context = cl.Context([device])
queue = cl.CommandQueue(context)

# Compile the kernel code
program = cl.Program(context, kernel_code).build()

# Define the matrix sizes
M = N = K = 32

# Create the input matrices A and B
A = np.random.rand(M, K).astype(np.float32)
B = np.random.rand(K, N).astype(np.float32)

# Create the output matrix C
C = np.zeros((M, N), dtype=np.float32)

# Create the OpenCL buffers for A, B, and C
A_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=A)
B_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=B)
C_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE, C.nbytes)

# Set the kernel arguments
program.matrix_multiply.set_args(A_buffer, B_buffer, C_buffer, np.int32(M), np.int32(N), np.int32(K))

# Enqueue the kernel for execution
global_size = (M, N) 
local_size = None
event = cl.enqueue_nd_range_kernel(queue, program.matrix_multiply, global_size, local_size)
event.wait()  # Wait for kernel execution to finish

# Read the result from the device to the host
cl.enqueue_copy(queue, C, C_buffer).wait()  # Wait for data transfer to finish


# Print the result
print(C)


[[0.8774165  0.54083735 0.5385052  ... 0.66323435 0.0642884  0.6868612 ]
 [0.3425441  0.2619046  0.47662753 ... 0.23575708 0.00763941 0.4732214 ]
 [0.10591334 0.36297083 0.02353597 ... 0.00368951 0.48476776 0.78555524]
 ...
 [0.12812269 0.5563572  0.9188701  ... 0.9884563  0.08244425 0.04523661]
 [0.5155259  0.18342745 0.29845127 ... 0.3607278  0.01924756 0.20378217]
 [0.9540632  0.8430501  0.42470145 ... 0.1976566  0.24102797 0.3303517 ]]
Error: Data transfer failed
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [3]:
cl.enqueue_copy(queue, C, C_buffer).wait()  # Wait for data transfer to finish
print(C)


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [252]:
import numpy as np
import pyopencl as cl

a_np = np.random.rand(50000).astype(np.float32)
b_np = np.random.rand(50000).astype(np.float32)

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

mf = cl.mem_flags
a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)

prg = cl.Program(ctx, """
__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()

res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
knl = prg.sum  # Use this Kernel object for repeated calls
knl(queue, a_np.shape, None, a_g, b_g, res_g)

res_np = np.empty_like(a_np)
cl.enqueue_copy(queue, res_np, res_g)

# Check on CPU with Numpy:
print(res_np - (a_np + b_np))
print(np.linalg.norm(res_np - (a_np + b_np)))
assert np.allclose(res_np, a_np + b_np)


[0. 0. 0. ... 0. 0. 0.]
0.0
