In [None]:
#
# Matrix Multiplication Driver
#
# This is a driver program to test various ways of computing
# the product:
#                 C = A * B
#
# A and B are constant matrices, square and the order is
# set as a constant, ORDER (see definitions.py). This is so
# we can make a quick test of the multiplication result.
#
# History:   C++ version written by Tim Mattson, August 2010 
#            Modified by Simon McIntosh-Smith, September 2011
#            Modified by Tom Deakin and Simon McIntosh-Smith, October 2012
#            Ported to Python by Tom Deakin, July 2013
#            Modified to assume square matrices by Simon McIntosh-Smith, Sep 2014
#

from helper import *
from definitions import *

import pyopencl as cl
import numpy
from time import time


# Order of the square matrices: A, B and C are each N x N.
N = ORDER

# Element count of one matrix (used to size the flat host arrays).
size = N * N

### 1. Defining platform (devices + context + queues)

In [20]:
## Host memory
# Each matrix is held on the host as a flat float32 array of `size` elements.
# The dtype is passed directly so no temporary float64 array is allocated
# and then converted.

# A matrix: every element is AVAL
h_A = numpy.full(size, AVAL, dtype=numpy.float32)

# B matrix: every element is BVAL
h_B = numpy.full(size, BVAL, dtype=numpy.float32)

# C matrix: result storage, deliberately left uninitialised here
h_C = numpy.empty(size, dtype=numpy.float32)


## Set up OpenCL: devices and compute context
# Index into cl.get_platforms() of the platform to run on.
# NOTE(review): 1 is the value this notebook has always used; the right
# index is machine specific -- confirm against cl.get_platforms().
PLATFORM_INDEX = 1

platforms = cl.get_platforms()
if PLATFORM_INDEX >= len(platforms):
    # Same exception type as plain indexing would raise, but with a message
    # that says what is actually wrong.
    raise IndexError(
        "OpenCL platform index %d requested but only %d platform(s) found"
        % (PLATFORM_INDEX, len(platforms)))
selected_platform = platforms[PLATFORM_INDEX]

context = cl.Context(
        dev_type=cl.device_type.ALL,
        properties=[(cl.context_properties.PLATFORM, selected_platform)])
# Despite the singular name this is the *list* of devices on the platform.
device = selected_platform.get_devices()

# Print out device info
#deviceinfo.output_device_info(context.devices[0])

# Create a command queue
queue = cl.CommandQueue(context)

### 2. Set up device memory buffers

In [22]:
# Allocate the OpenCL buffers in device memory.
#   d_a, d_b : read-only inputs, initialised from h_A / h_B at creation time
#   d_c      : write-only output, h_C.nbytes bytes long
input_flags = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
output_flags = cl.mem_flags.WRITE_ONLY

d_a = cl.Buffer(context, input_flags, hostbuf=h_A)
d_b = cl.Buffer(context, input_flags, hostbuf=h_B)
d_c = cl.Buffer(context, output_flags, size=h_C.nbytes)

### Test Simple Task

In [23]:
# Load the pre-compiled kernel binary; the with-block closes the file once
# its contents have been read.
with open("bin/Simple_task.aocx", mode='rb') as binary_file:
    kernelSource = binary_file.read()

# Create the program from the binary for the devices of the selected platform.
# NOTE(review): a single binary is supplied, so this presumably expects
# `device` to contain exactly one device -- confirm on the target machine.
program = cl.Program(context, device, [kernelSource]).build()

mmul = program.mmul
# First kernel argument is a 32-bit integer scalar; the remaining three are
# passed through as given (the buffers d_a, d_b, d_c below).
mmul.set_scalar_arg_dtypes([numpy.int32, None, None, None])

print("\n===== OpenCL, matrix mult, Simple Task, order", N, "======\n")

# A single work-item is launched and the work-group size is left to the
# runtime. These never change, so they are set once outside the timed loop.
globalrange = (1,)
localrange = None

# Do the multiplication COUNT times
for i in range(COUNT):
    h_C.fill(0.0)
    start_time = time()

    mmul(queue, globalrange, localrange, N, d_a, d_b, d_c)
    # Wait for the kernel to complete so the measured time covers execution.
    queue.finish()

    run_time = time() - start_time

    # Copy the product back to the host, then hand it to results() together
    # with the order and the elapsed time.
    cl.enqueue_copy(queue, h_C, d_c)
    results(N, h_C, run_time)


### 5. Transfer memory objects and execute the kernels

In [30]:
print("\n===== OpenCL, matrix mult, C row per work item, order", N, "======\n")

# Read the kernel source; the with-block closes the file after reading.
with open("Crow_matmul.cl") as source_file:
    kernelsource = source_file.read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
# First kernel argument is a 32-bit integer scalar; the other three are the
# buffers d_a, d_b, d_c.
mmul.set_scalar_arg_dtypes([numpy.int32, None, None, None])

# N work-items in total, in work-groups of ORDER // 16 work-items
# (64 per work-group when ORDER is 1024).
# NOTE(review): presumably ORDER must be a multiple of 16 so that the
# work-group size divides N -- confirm.
# Both ranges are loop invariant, so they are built outside the timed loop.
globalrange = (N, )
localrange = (ORDER // 16, )

# Do the multiplication COUNT times
for i in range(COUNT):
    h_C.fill(0.0)
    start_time = time()

    # N is converted to int32 via set_scalar_arg_dtypes above, so no explicit
    # numpy.int32(...) wrapper is needed.
    mmul(queue, globalrange, localrange, N, d_a, d_b, d_c)
    # Wait for the kernel to complete so the measured time covers execution.
    queue.finish()

    run_time = time() - start_time

    # Copy the product back to the host and pass it to results().
    cl.enqueue_copy(queue, h_C, d_c)
    results(N, h_C, run_time)
    
print("\n===== OpenCL, matrix mult, C row per work item with private memory, order", N, "======\n")

# Read the kernel source; the with-block closes the file after reading.
with open("Crow_matmul_priv_mem.cl") as source_file:
    kernelsource = source_file.read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
# First kernel argument is a 32-bit integer scalar; the other three are the
# buffers d_a, d_b, d_c.
mmul.set_scalar_arg_dtypes([numpy.int32, None, None, None])

# N work-items in total, in work-groups of ORDER // 16 work-items
# (64 per work-group when ORDER is 1024).
# NOTE(review): presumably ORDER must be a multiple of 16 so that the
# work-group size divides N -- confirm.
# Both ranges are loop invariant, so they are built outside the timed loop.
globalrange = (N, )
localrange = (ORDER // 16, )

# Do the multiplication COUNT times
for i in range(COUNT):
    h_C.fill(0.0)
    start_time = time()

    mmul(queue, globalrange, localrange, N, d_a, d_b, d_c)
    # Wait for the kernel to complete so the measured time covers execution.
    queue.finish()

    run_time = time() - start_time

    # Copy the product back to the host and pass it to results().
    cl.enqueue_copy(queue, h_C, d_c)
    results(N, h_C, run_time)

print("\n===== OpenCL, matrix mult, C row per work item, Row A in private memory & Col B in Local Memory, order", N, "======\n")

# Read the kernel source; the with-block closes the file after reading.
with open("Crow_matmul_priv&local-mem.cl") as source_file:
    kernelsource = source_file.read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
# First kernel argument is a 32-bit integer scalar; the other four are the
# buffers d_a, d_b, d_c and the local-memory scratch area.
mmul.set_scalar_arg_dtypes([numpy.int32, None, None, None, None])

# Local-memory scratch area big enough for N float32 values (size in bytes).
local_mem = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * N)

# N work-items in total, in work-groups of ORDER // 16 work-items
# (64 per work-group when ORDER is 1024).
# NOTE(review): presumably ORDER must be a multiple of 16 so that the
# work-group size divides N -- confirm.
# Both ranges are loop invariant, so they are built outside the timed loop.
globalrange = (N, )
localrange = (ORDER // 16, )

# Do the multiplication COUNT times
for i in range(COUNT):
    h_C.fill(0.0)
    start_time = time()

    mmul(queue, globalrange, localrange, N, d_a, d_b, d_c, local_mem)
    # Wait for the kernel to complete so the measured time covers execution.
    queue.finish()

    run_time = time() - start_time

    # Copy the product back to the host and pass it to results().
    cl.enqueue_copy(queue, h_C, d_c)
    results(N, h_C, run_time)
    
print("\n===== OpenCL, matrix mult, block form (matrices divided into tiles), each block in Local Memory, order", N, "======\n")

# Read the kernel source; the with-block closes the file after reading.
with open("C_block_form.cl") as source_file:
    kernelsource = source_file.read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
# First kernel argument is a 32-bit integer scalar; the other five are the
# buffers d_a, d_b, d_c and the two local-memory tiles.
mmul.set_scalar_arg_dtypes([numpy.int32, None, None, None, None, None])

# Edge length of one square tile, in elements.
blksz = 32

# One local-memory tile each for A and B: blksz * blksz float32 values
# (sizes are given in bytes).
tile_bytes = numpy.dtype(numpy.float32).itemsize * blksz * blksz
A_blk = cl.LocalMemory(tile_bytes)
B_blk = cl.LocalMemory(tile_bytes)

# 2-D launch: N x N work-items in work-groups of blksz x blksz
# (32 * 32 = 1024 work-items per work-group).
# NOTE(review): presumably N must be a multiple of blksz and the device must
# allow work-groups of 1024 work-items -- confirm for the target device.
# Both ranges are loop invariant, so they are built outside the timed loop.
globalrange = (N, N)
localrange = (blksz, blksz)

# Do the multiplication COUNT times
for i in range(COUNT):
    h_C.fill(0.0)
    start_time = time()

    mmul(queue, globalrange, localrange, N, d_a, d_b, d_c, A_blk, B_blk)
    # Wait for the kernel to complete so the measured time covers execution.
    queue.finish()

    run_time = time() - start_time

    # Copy the product back to the host and pass it to results().
    cl.enqueue_copy(queue, h_C, d_c)
    results(N, h_C, run_time)


The kernel ran in 0.00017642974853515625 seconds


### 6. Get results and validate them

C = A+B:  200 out of 200 results were correct.

global size obtained is:  8 

local size obtained is:  2 

