In [5]:
#
# Matrix Multiplication Driver
#
# This is a driver program to test various ways of computing
# the product:
#                 C = A * B
#
# A and B are constant matrices, square and the order is
# set as a constant, ORDER (see definitions.py). This is so
# we can make a quick test of the multiplication result.
#
# History:   C++ version written by Tim Mattson, August 2010 
#            Modified by Simon McIntosh-Smith, September 2011
#            Modified by Tom Deakin and Simon McIntosh-Smith, October 2012
#            Ported to Python by Tom Deakin, July 2013
#            Modified to assume square matrices by Simon McIntosh-Smith, Sep 2014
#

import sys
sys.path.append('..')
from helper import *
from definitions import *

import pyopencl as cl
import numpy
from time import time


# Order of the square matrices: A, B and C are each N x N
N = ORDER

# Total element count of one N x N matrix (used for the flat host arrays)
size = N * N

### Kernel compilation and environment variable setup for simulation

In [3]:
%%bash
# Compile the OpenCL kernel Simple_task.cl for the simulator target
# (-march=simulator) on board a10gx, writing the result to bin/Simple_task.aocx.
# NOTE(review): -v presumably enables verbose output and -ghdl presumably enables
# signal logging for the simulation; confirm both against the aoc documentation.
aoc -march=simulator -v -ghdl Simple_task.cl -o bin/Simple_task.aocx -board=a10gx

aoc: Environment checks completed successfully.
Quartus location: /home/joerock/intelFPGA_pro/21.1/quartus/bin/quartus_sh
aoc: Cached files in /var/tmp/aocl/joerock may be used to reduce compilation time
aoc: Selected target board package /home/joerock/intelFPGA_pro/21.1/hld/board/a10_ref
aoc: Selected target board a10gx
aoc: Running OpenCL parser....
aoc: OpenCL parser completed 
aoc: Linking Object files....
aoc: Optimizing and doing static analysis of code...
aoc: Linking with IP library ...
aoc: Checking if memory usage is larger than 100%...
aoc: Memory usage is not above 100.
aoc: First stage compilation completed successfully.
aoc: Compiling for Simulator.
Quartus location: /home/joerock/intelFPGA_pro/21.1/quartus/bin/quartus_sh
Creating simulation system...
Generating simulation system...
Compiling simulation...
aoc: Simulation generation done!
Simulator flow is successful.
To execute simulator, invoke host with 
	env CL_CONTEXT_MPSIM_DEVICE_INTELFPGA=1 <host_program>


In [8]:
%%bash
export CL_CONTEXT_EMULATOR_DEVICE_ALTERA=1
export CL_CONTEXT_COMPILER_MODE_INTELFPGA=3

#scl enable devtoolset-8 -- bash -> only for c++ compilation
#export LD_LIBRARY_PATH="/home/joerock/anaconda3/lib":$LD_LIBRARY_PATH -> only for c++ compilation (libstdc++.so)

### 1. Defining platform (devices + context + queues)

In [6]:
## Host memory
# The matrices live on the host as flat float32 arrays of N*N elements.
# They are allocated directly as float32 (no intermediate float64 copy).

# A matrix: every element set to AVAL
h_A = numpy.full(size, AVAL, dtype=numpy.float32)

# B matrix: every element set to BVAL
h_B = numpy.full(size, BVAL, dtype=numpy.float32)

# C matrix: result storage; contents are uninitialised until written later
h_C = numpy.empty(size, dtype=numpy.float32)


## Devices and compute context
# NOTE(review): the platform index 2 is hard-coded and machine-specific;
# check cl.get_platforms() when running on another host.
platforms = cl.get_platforms()
context = cl.Context(
        dev_type=cl.device_type.ALL,
        properties=[(cl.context_properties.PLATFORM, platforms[2])])
# List of devices on that platform; passed to cl.Program together with the
# pre-compiled binaries in the kernel cells.
device = platforms[2].get_devices()

# Print out device info
#deviceinfo.output_device_info(context.devices[0])

# Create a command queue
queue = cl.CommandQueue(context)


In [7]:
# Display the devices exposed by the platform at index 2 (the platform the context was created on)
platforms[2].get_devices()

[<pyopencl.Device 'SimulatorDevice : Multi-process Simulator (aclmsim0)' on 'Intel(R) FPGA SDK for OpenCL(TM)' at 0x7f71c57c30d8>]

### 2. Setup Memory Kernels

In [8]:
# Allocate the OpenCL buffers used by the kernels:
#   d_a, d_b - read-only inputs, initialised by copying the host arrays h_A / h_B
#   d_c      - write-only output, sized to hold h_C
mf = cl.mem_flags
input_flags = mf.READ_ONLY | mf.COPY_HOST_PTR

d_a = cl.Buffer(context, input_flags, hostbuf=h_A)
d_b = cl.Buffer(context, input_flags, hostbuf=h_B)
d_c = cl.Buffer(context, mf.WRITE_ONLY, size=h_C.nbytes)

### Test Simple Task

In [None]:
print("\n===== OpenCL, matrix mult, Simple Task, order", N, "======\n")

# Load the pre-compiled kernel binary. The `with` block closes the file
# handle as soon as the bytes have been read.
with open("bin/Simple_task.aocx", mode='rb') as aocx_file:
    kernelSource = aocx_file.read()

# Build the program from the binary for the selected device list.
program = cl.Program(context, device, [kernelSource]).build()
mmul = program.mmul
# First kernel argument is the int32 matrix order; the rest are buffers.
mmul.set_scalar_arg_dtypes([numpy.int32, None, None, None])

# The kernel is launched with a single work-item; the local size is left to
# the runtime. Both are loop-invariant, so they are set once outside the
# timed region.
globalrange = (1,)
localrange = None

# Do the multiplication COUNT times
for i in range(COUNT):
    h_C.fill(0.0)
    start_time = time()

    mmul(queue, globalrange, localrange, N, d_a, d_b, d_c)
    # Wait for the kernel to complete so the elapsed time covers its execution.
    queue.finish()

    run_time = time() - start_time

    # Copy the result back to the host (outside the timed region) and report it.
    cl.enqueue_copy(queue, h_C, d_c)
    results(N, h_C, run_time)







### block form (matrices divided into tiles), each block in Local Memory

In [12]:
print("\n===== OpenCL, matrix mult, block form (matrices divided into tiles), each block in Local Memory, order", N, "======\n")

# Load the pre-compiled kernel binary. The `with` block closes the file
# handle as soon as the bytes have been read.
# NOTE(review): this notebook only shows the compilation of Simple_task.cl;
# bin/C_block_form.aocx is presumably built elsewhere. Confirm it exists.
with open("bin/C_block_form.aocx", mode='rb') as aocx_file:
    kernelSource = aocx_file.read()

# Build the program from the binary for the selected device list.
program = cl.Program(context, device, [kernelSource]).build()

mmul = program.mmul
# First kernel argument is the int32 matrix order; the remaining five are
# the three global buffers and the two local-memory tiles.
mmul.set_scalar_arg_dtypes([numpy.int32, None, None, None, None, None])

# Tile edge length: each work-group is blksz x blksz work-items.
blksz = 32

# Local-memory scratch tiles for A and B; the size is given in bytes
# (one float32 per tile element).
tile_bytes = numpy.dtype(numpy.float32).itemsize * blksz * blksz
A_blk = cl.LocalMemory(tile_bytes)
B_blk = cl.LocalMemory(tile_bytes)

# Launch geometry is loop-invariant, so it is set once outside the timed region.
# NOTE(review): this presumably requires N to be a multiple of blksz; confirm.
globalrange = (N, N)
localrange = (blksz, blksz)  # blksz * blksz = 1024 work-items per work-group

# Do the multiplication COUNT times
for i in range(COUNT):
    h_C.fill(0.0)
    start_time = time()

    mmul(queue, globalrange, localrange, N, d_a, d_b, d_c, A_blk, B_blk)
    # Wait for the kernel to complete so the elapsed time covers its execution.
    queue.finish()

    run_time = time() - start_time

    # Copy the result back to the host (outside the timed region) and report it.
    cl.enqueue_copy(queue, h_C, d_c)
    results(N, h_C, run_time)




0.31950879096984863 seconds at 6721.2036372503235 MFLOPS
