In [1]:
# Import the Python OpenCL API
import pyopencl as cl
import numpy as np
from time import time

import sys
#sys.path.append('../../Python_common')
#import deviceinfo

# Notebook-wide settings used by the vadd example below
TOL = 1e-3      # tolerance used in floating point comparisons
LENGTH = 200    # number of elements in each of the vectors a, b and c

### 1. Defining platform (devices + context + queues)

In [7]:
## Host memory
# Two random float32 input vectors (a, then b) plus an uninitialised result
# vector c with the same shape/dtype; c receives a+b from the compute device.
h_a, h_b = (np.random.rand(LENGTH).astype(np.float32) for _ in range(2))
h_c = np.empty_like(h_a)

## Devices and compute context
# Ask the user to select a platform/device on the CLI
context = cl.create_some_context()

# Print out device info
#deviceinfo.output_device_info(context.devices[0])

# Command queue through which all work for this context is submitted
queue = cl.CommandQueue(context)

### 2. Create and build program (dynamic library for kernels)

In [8]:
##  Create the compute program from the source buffer and build it
#
# The OpenCL C source below defines one kernel, `vadd`:
#   * each work-item i with i < count writes c[i] = a[i] + b[i];
#   * work-item 0 additionally stores get_global_size(0) in g_size[0] and
#     get_local_size(0) in l_size[0], and calls printf("test").
# `.build()` compiles the program for `context`; the kernel is then available
# as an attribute of `prg` (used as `prg.vadd` further down).

prg = cl.Program(context, """
__kernel void vadd(
    __global const float *a, 
    __global const float *b, 
    __global float *c,
    const unsigned int count,
    __global int *g_size,
    __global int *l_size)
{
    int i = get_global_id(0);
    if(i < count)                                                       
        c[i] = a[i] + b[i];
    if(i == 0)
    {
        g_size[0] = get_global_size(0);
        l_size[0] = get_local_size(0);
        printf("test");
    }
}""").build()


# Leftover C host-API lines, kept commented out for reference only
# (they look like the C counterpart of obtaining the kernel object):
#// Create the compute kernel from the program
#ko_vadd = clCreateKernel(program, "vadd", &err);
#checkError(err, "Creating kernel");

### 3. Set up memory objects (host arrays and device buffers)

In [9]:
# Create the input (a, b) and output (c) arrays in device memory

mf = cl.mem_flags
# Inputs are read-only on the device and initialised by copying the host arrays.
d_a = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=h_a)
d_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=h_b)
# The output only needs storage; it has as many bytes as one input vector.
d_c = cl.Buffer(context, mf.WRITE_ONLY, h_a.nbytes)

# Host-side landing spots for the global/local work sizes reported by the kernel.
g_size = np.zeros(1, dtype=np.int32)
l_size = np.zeros(1, dtype=np.int32)

# Size the device buffers from the host arrays they are read back into.
# (sys.getsizeof(np.uint32) is the size of the Python *type object*, not the
# 4 bytes of a uint32 value, so it over-allocated these buffers.)
d_g_size = cl.Buffer(context, mf.WRITE_ONLY, g_size.nbytes)
d_l_size = cl.Buffer(context, mf.WRITE_ONLY, l_size.nbytes)

### 4. Define the kernel (attach arguments to kernel function) 

In [10]:
# Keep a single Kernel object around so it can be reused for repeated calls
knl = prg.vadd
# One entry per kernel argument, in order: buffer (pointer) arguments are
# declared as None, scalar arguments get their numpy dtype.
knl.set_scalar_arg_dtypes([
    None,       # a
    None,       # b
    None,       # c
    np.uint32,  # count
    None,       # g_size
    None,       # l_size
])


### 5. Transfer memory objects and execute the kernel

In [11]:
# Wall-clock timing of one kernel launch, including the wait for completion.
t_start = time()

# __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False, allow_empty_ndrange=False)
# local_size is None, so no work-group size is requested explicitly.
knl(
    queue, h_a.shape, None,
    d_a, d_b, d_c,
    np.uint32(LENGTH),
    d_g_size, d_l_size,
)

# Block until everything submitted to the queue has finished before stopping the clock
queue.finish()
rtime = time() - t_start
print("The kernel ran in {} seconds".format(rtime))


The kernel ran in 0.0003275871276855469 seconds


### 6. Get results and validate them

In [12]:
# Read back the results from the compute device

cl.enqueue_copy(queue, h_c, d_c)
cl.enqueue_copy(queue, g_size, d_g_size)
cl.enqueue_copy(queue, l_size, d_l_size)

# Test the results: count the elements whose squared deviation from the
# host-computed sum stays below TOL squared.
correct = 0

for i in range(LENGTH):
    expected = h_a[i] + h_b[i]      # element i of a+b computed on the host
    deviation = expected - h_c[i]   # difference between expected and device result

    if deviation * deviation < TOL * TOL:
        correct += 1
    else:
        # str.format() needs {} placeholders; the previous "%f" template was
        # printed verbatim and never showed the offending values.
        print(" tmp {:f} h_a {:f} h_b {:f} h_c {:f} \n".format(deviation, h_a[i], h_b[i], h_c[i]))

# summarise results

print("C = A+B:  {} out of {} results were correct.\n".format(correct, LENGTH))
print("global size obtained is:  {} \n".format(g_size[0]))
print("local size obtained is:  {} \n".format(l_size[0]))

C = A+B:  200 out of 200 results were correct.

global size obtained is:  200 

local size obtained is:  10 



## Hello World test

### Test to see if simulation/emulation platforms appear

In [1]:
%%bash
# Compile device/hello_world.cl into bin/hello_world.aocx for the simulator
# target (-march=simulator) on board a10gx, with verbose output (-v).
aoc -march=simulator \
    -v \
    -ghdl \
    device/hello_world.cl \
    -o bin/hello_world.aocx \
    -board=a10gx

aoc: Environment checks completed successfully.
Quartus location: /home/joerock/intelFPGA_pro/21.1/quartus/bin/quartus_sh
aoc: Cached files in /var/tmp/aocl/joerock may be used to reduce compilation time
aoc: Selected target board package /home/joerock/intelFPGA_pro/21.1/hld/board/a10_ref
aoc: Selected target board a10gx
aoc: Running OpenCL parser....
aoc: OpenCL parser completed 
aoc: Linking Object files....
aoc: Optimizing and doing static analysis of code...
aoc: Linking with IP library ...
aoc: Checking if memory usage is larger than 100%...
aoc: Memory usage is not above 100.
aoc: First stage compilation completed successfully.
aoc: Compiling for Simulator.
Quartus location: /home/joerock/intelFPGA_pro/21.1/quartus/bin/quartus_sh
Creating simulation system...
Generating simulation system...
Compiling simulation...
aoc: Simulation generation done!
Simulator flow is successful.
To execute simulator, invoke host with 
	env CL_CONTEXT_MPSIM_DEVICE_INTELFPGA=1 <host_program>


In [1]:
%%bash
# Print the simulator-device variable as seen by this cell's shell. The aoc
# simulator build above asks for CL_CONTEXT_MPSIM_DEVICE_INTELFPGA=1 when the
# host program is invoked.
# NOTE(review): an `export` inside a %%bash cell should only last for this
# cell's shell, so the variable presumably has to be set before Jupyter is
# started (or via %env) for the Python cells to see it -- confirm.
# The commented-out lines below are alternative settings kept for reference.
#export CL_CONTEXT_MPSIM_DEVICE_INTELFPGA=1
echo $CL_CONTEXT_MPSIM_DEVICE_INTELFPGA
#export CL_CONTEXT_EMULATOR_DEVICE_ALTERA=1
#export CL_CONTEXT_COMPILER_MODE_INTELFPGA=3
#scl enable devtoolset-8 -- bash # -> only for c++ compilation
#export LD_LIBRARY_PATH="/home/joerock/anaconda3/lib":$LD_LIBRARY_PATH # -> only for c++ compilation (libstdc++.so)

1


In [1]:
#!/usr/bin/env python

import numpy as np
import pyopencl as cl
import sys

# Substring identifying the OpenCL platform that exposes the FPGA simulator
# device. Selecting by name instead of a fixed index keeps the notebook working
# when the platforms are enumerated in a different order on another machine.
FPGA_PLATFORM_NAME = "FPGA SDK for OpenCL"

platforms = cl.get_platforms()
fpga_platforms = [p for p in platforms if FPGA_PLATFORM_NAME in p.name]
if not fpga_platforms:
    raise RuntimeError(
        "No OpenCL platform matching {!r} found; available platforms: {}".format(
            FPGA_PLATFORM_NAME, [p.name for p in platforms]))
fpga_platform = fpga_platforms[0]

# Context restricted to the selected platform, accepting any device type on it.
ctx = cl.Context(
        dev_type=cl.device_type.ALL,
        properties=[(cl.context_properties.PLATFORM, fpga_platform)])
# NOTE: despite the singular name this is the *list* of devices of the
# platform; it is passed as the device list when the program is created.
device = fpga_platform.get_devices()

In [2]:
# Show the devices of the selected platform (the list stored in `device`)
device

[<pyopencl.Device 'SimulatorDevice : Multi-process Simulator (aclmsim0)' on 'Intel(R) FPGA SDK for OpenCL(TM)' at 0x7f5e537ea0d8>]

In [3]:
# Command queue for the simulator context
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

# Host array that receives the id written by the kernel, and the matching
# write-only device buffer. The buffer is sized from the host array:
# sys.getsizeof(np.uint32) is the size of the Python type object, not the
# 4 bytes of one value, so it over-allocated the buffer.
id_HelloWorld = np.zeros(1, dtype=np.int32)
d_id_HelloWorld = cl.Buffer(ctx, mf.WRITE_ONLY, id_HelloWorld.nbytes)

In [4]:
# Load the pre-compiled AOCX binary; the context manager closes the file
# handle instead of leaving it open for the lifetime of the kernel.
with open("bin/hello_world.aocx", mode='rb') as aocx_file:
    kernelSource = aocx_file.read()

# Create the program from the binary (one binary per entry of `device`) and build it
prg = cl.Program(ctx, device, [kernelSource]).build()#("-cl-kernel-arg-info", device)
print("Using AOCX: hello_world.aocx\n")

# Describe the kernel arguments: argument 0 is a uint32 scalar,
# argument 1 is a buffer (declared as None)
knl = prg.hello_world
knl.set_scalar_arg_dtypes([np.uint32, None])


Using AOCX: hello_world.aocx



  warn("Non-empty compiler output encountered. Set the "


In [9]:
# Query the number of arguments the built kernel reports and its function name.
knl.get_info(cl.kernel_info.NUM_ARGS), knl.get_info(cl.kernel_info.FUNCTION_NAME)#,  knl.get_arg_info(0, cl.kernel_arg_info.NAME) 
# Author's note: a printf inside a compiled kernel adds 2 additional variables to the kernel.
# NOTE(review): the recorded output reports NUM_ARGS == 2 -- confirm which build this observation refers to.

(2, 'hello_world')

In [10]:
wi_2print = 5           # passed as kernel argument 0; presumably the work-item id that should report back
id_HelloWorld[0] = -1   # host-side sentinel, replaced by the value read back from the device below

print("\nKernel initialization is complete.\n")
print("Launching the kernel...\n")

# Global size 8, local size 8
knl(queue, (8,), (8,), wi_2print, d_id_HelloWorld)
# The call above only enqueues the kernel; wait for the queue to drain so the
# "complete" message is printed after the kernel has actually finished.
queue.finish()

print("\nKernel execution is complete.\n")

# Copy the id written by the kernel back into the host array
cl.enqueue_copy(queue, id_HelloWorld, d_id_HelloWorld)

print("Thread {}: Hello from the Intel FPGA OpenCL Compiler!\n".format(id_HelloWorld[0]))


Kernel initialization is complete.

Launching the kernel...


Kernel execution is complete.

Thread 5: Hello from the Intel FPGA OpenCL Compiler!

