In [1]:
from time import perf_counter, time

import numpy
import pyopencl as cl

# OpenCL C source for the `mmul` kernel, compiled at runtime.
#
# Each work-item computes one element of C = A * B for N x N matrices that
# are stored flattened (element (i, j) lives at index i*N + j):
#   - i comes from global id dimension 0, j from global id dimension 1;
#   - the work-item accumulates the dot product of row i of A with
#     column j of B in a private float and writes it to C[i*N + j];
#   - work-items whose (i, j) falls outside the N x N range do nothing.
#
# Kernel arguments, in order: N (int), A, B, C (__global float buffers).
C_elem_KernelSource = '''
__kernel void mmul(
    const int N,
    __global float* A,
    __global float* B,
    __global float* C)
{
    int k;
    int i = get_global_id(0);
    int j = get_global_id(1);
    float tmp = 0;
    if ((i < N) && (j < N))
    {
        tmp = 0.0f;
        for (k=0; k<N; k++)
        {
            tmp += A[i*N + k] * B[k*N + j];
        }
        C[i*N + j] = tmp;
    }
}
'''

In [4]:
# Order of the square matrices A, B and C (each is ORDER x ORDER)
ORDER = 1024

# All elements of A are constant and equal to AVAL
AVAL = 3.0

# All elements of B are constant and equal to BVAL
BVAL = 5.0

# Tolerance used in floating point comparisons: results() reports an error
# when the summed squared error of C exceeds this value
TOL = 0.001

# Max dim for NDRange
# NOTE(review): DIM is not referenced by any of the cells visible here.
DIM = 2

# Number of times to do each multiplication
COUNT = 1

In [9]:
# Matrix order under the short name used by the cells below
N = ORDER

# Number of elements in each matrix; matrices are stored flattened (1-D),
# so element (i, j) lives at index i*N + j
size = N * N

# A matrix: every element equals AVAL. numpy.full creates the float32 array
# directly, avoiding the float64 scratch array that empty().astype() makes.
h_A = numpy.full(size, AVAL, dtype=numpy.float32)

# B matrix: every element equals BVAL
h_B = numpy.full(size, BVAL, dtype=numpy.float32)

# C matrix (result); zero-filled so its contents are well defined before
# any multiplication has written to it
h_C = numpy.zeros(size, dtype=numpy.float32)

# NOTE(review): this banner announces a sequential host run, but no
# sequential multiplication is performed in this cell.
print ("\n===== Sequential, matrix mult (dot prod), order", ORDER, "on host CPU ======\n")





In [10]:
# Set up OpenCL: a context and a command queue on it
context = cl.create_some_context()
queue = cl.CommandQueue(context)

# Reset host buffers - just to play it safe.
# numpy.full / numpy.zeros build the float32 arrays directly instead of
# allocating float64 arrays and converting them.
h_A = numpy.full(size, AVAL, dtype=numpy.float32)
h_B = numpy.full(size, BVAL, dtype=numpy.float32)
h_C = numpy.zeros(size, dtype=numpy.float32)

# Create OpenCL buffers: the inputs are read-only and initialised from the
# host arrays; the output is write-only and sized to hold h_C
d_a = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_A)
d_b = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_B)
d_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_C.nbytes)

# Build the kernel source and fetch the `mmul` kernel. The first kernel
# argument (N) is a scalar passed as int32; None marks the buffer arguments.
program = cl.Program(context, C_elem_KernelSource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, None, None, None])

print ("\n===== OpenCL, matrix mult, C(i,j) per work item, order", N, "======\n")





In [22]:
# Inspect the three host arrays.
# NOTE(review): the recorded output shows h_C already holding the product,
# and this cell's execution count (22) is later than the multiplication
# cell's (16) - it was run out of document order. On a fresh top-to-bottom
# run h_C has not been computed yet at this point; consider moving this cell
# below the multiplication.
h_A, h_B, h_C

(array([3., 3., 3., ..., 3., 3., 3.], dtype=float32),
 array([5., 5., 5., ..., 5., 5., 5.], dtype=float32),
 array([15360., 15360., 15360., ..., 15360., 15360., 15360.], dtype=float32))

In [15]:
#  Sequential matrix product: one dot product per element of C
def seq_mat_mul_sdot(Ndim, A, B, C):
    """Compute C = A * B on the host for Ndim x Ndim matrices.

    All three matrices are flat, indexable sequences in which element
    (row, col) is stored at index row*Ndim + col.

    Args:
        Ndim: order of the square matrices.
        A: left operand, read only.
        B: right operand, read only.
        C: output; its first Ndim*Ndim entries are overwritten in place.

    Returns:
        None. The result is written into C.
    """
    for row in range(Ndim):
        row_start = row * Ndim
        for col in range(Ndim):
            # Dot product of row `row` of A with column `col` of B
            acc = 0.0
            for inner in range(Ndim):
                acc += A[row_start + inner] * B[inner * Ndim + col]
            C[row_start + col] = acc

#  Sum of squared errors of the product matrix against the expected value
def error(Ndim, C):
    """Return the summed squared deviation of C from its expected value.

    Every element of the product is expected to equal Ndim * AVAL * BVAL
    (AVAL and BVAL are read from module scope).

    Args:
        Ndim: order of the square matrix.
        C: flat, indexable product matrix; element (row, col) is read from
            index row*Ndim + col.

    Returns:
        The sum over all Ndim*Ndim elements of (C[index] - expected)**2.
    """
    expected = float(Ndim) * AVAL * BVAL
    total_sq = 0.0
    for row in range(Ndim):
        row_start = row * Ndim
        for col in range(Ndim):
            diff = C[row_start + col] - expected
            total_sq += diff * diff
    return total_sq

# Function to analyze and output results
def results( Ndim, C, run_time):
    """Print the timing, throughput and (if any) error of a multiplication.

    Args:
        Ndim: order of the square matrices that were multiplied.
        C: flat product matrix, checked with error().
        run_time: elapsed time of the multiplication in seconds.

    Prints run_time and the MFLOPS rate, counting 2 * Ndim**3 floating
    point operations. A second line is printed when the summed squared
    error is NaN or exceeds TOL.
    """
    flop_count = 2.0 * (Ndim**(3))
    if run_time > 0:
        mflops = flop_count / (1000000.0 * run_time)
    else:
        # A timer with coarse resolution can report a zero interval for a
        # very fast run; report an unbounded rate instead of raising
        # ZeroDivisionError.
        mflops = float("inf")
    print (run_time, "seconds at", mflops, "MFLOPS")
    errsq = error( Ndim, C)
    if numpy.isnan(errsq) or errsq > TOL:
        print ("Errors in multiplication:", errsq)

In [16]:
# Launch geometry, identical for every repetition: one work-item per element
# of C. A local size of None leaves the work-group size to the OpenCL
# implementation.
globalrange = (N, N)
localrange = None

# Do the multiplication COUNT times
for _ in range(COUNT):
    h_C.fill(0.0)

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring short intervals; time() is wall-clock time and can be
    # adjusted while the measurement is running.
    start_time = perf_counter()
    mmul(queue, globalrange, localrange, N, d_a, d_b, d_c)
    queue.finish()  # wait for the kernel so the interval covers the whole run
    run_time = perf_counter() - start_time

    # Copy the result back to the host (outside the timed region) and report
    cl.enqueue_copy(queue, h_C, d_c)
    results(N, h_C, run_time)

0.12828779220581055 seconds at 16739.579121876366 MFLOPS


In [23]:
# Show the product copied back from the device. Since A and B are constant
# matrices, every element is expected to equal N * AVAL * BVAL.
h_C

array([15360., 15360., 15360., ..., 15360., 15360., 15360.], dtype=float32)