# Module 3 - Doing much task wow (with OpenCL)

## Setup

### Library Import
Before doing anything else, we need to import [PyOpenCL](https://documen.tician.de/pyopencl/) and [NumPy](http://www.numpy.org/).

In [None]:
import pyopencl,numpy

### Setting up platforms, devices and context
We're going to set up the devices and contexts as explicit objects because we might want to interrogate their runtime information.

In [None]:
# Enumerate every OpenCL platform visible to this host.
platforms = pyopencl.get_platforms()

# Keep the first device of each platform. The unpacking expects exactly two
# platforms and binds them in enumeration order.
nvidia_device, intel_device = [
    found_platform.get_devices()[0] for found_platform in platforms
]

# One single-device context per device.
nvidia_context = pyopencl.Context(devices=[nvidia_device])
intel_context = pyopencl.Context(devices=[intel_device])

## Inspecting Device Properties
1. Selecting the properties of interest
2. Printing them out for each device

In [None]:
# Lookup tables: human-readable label -> pyopencl.device_info constant to query.

# Properties that identify a device.
name_properties = {
    "Device Name": pyopencl.device_info.NAME,
    "Device Platform": pyopencl.device_info.PLATFORM,
    "Device Type": pyopencl.device_info.TYPE,
}

# Properties that describe the processing resources of a device.
processing_properties = {
    "Available Compute Units": pyopencl.device_info.MAX_COMPUTE_UNITS,
    "Clockrate": pyopencl.device_info.MAX_CLOCK_FREQUENCY,
}

# Properties that describe the memory resources of a device.
memory_properties = {
    "Available Global Memory": pyopencl.device_info.GLOBAL_MEM_SIZE,
    "Available Constant Memory": pyopencl.device_info.MAX_CONSTANT_BUFFER_SIZE,
    "Available Local Memory": pyopencl.device_info.LOCAL_MEM_SIZE,
}

# Display names for the device-type constants reported for "Device Type".
device_types = {
    pyopencl.device_type.CPU: "CPU",
    pyopencl.device_type.GPU: "GPU",
}

In [None]:
# Print a short report for each device: identity, processing and memory properties.
for device in (nvidia_device, intel_device):
    # Name and platform are printed as returned by get_info; the type is
    # handled separately because it is translated through device_types.
    for property_name in sorted(name_properties.keys() - {"Device Type"}):
        property_string_args = (property_name, device.get_info(name_properties[property_name]))
        print("%s: %s" % property_string_args)

    # Fall back to the raw value instead of raising KeyError when the device
    # reports a type that is missing from device_types.
    device_type = device.get_info(name_properties["Device Type"])
    print("Device Types: %s" % device_types.get(device_type, device_type))

    # Processing properties first, then memory properties, each sorted by label.
    for properties in (processing_properties, memory_properties):
        for property_name in sorted(properties):
            property_string_args = (property_name, device.get_info(properties[property_name]))
            print("%s: %s" % property_string_args)

    print("\n")

## Task vs Data Parallelism
### Setting up the program
1. Create a program for an element-wise vector operation (`b/a + b*a - b%a`)
2. Compile the programs

In [None]:
# OpenCL C source shared by both devices. It defines the two kernels that
# compute_norm invokes: `operation` and `square`.
program_source = """
// b[i] <- b[i]/a[i] + b[i]*a[i] - b[i]%a[i], one work-item per element.
kernel void operation(global long *a,
                      global long *b)
{
  int gid = get_global_id(0);

  long a_temp = a[gid];
  long b_temp = b[gid];

  b[gid] = b_temp/a_temp + b_temp*a_temp - b_temp%a_temp;
}

// b[i] <- b[i]*b[i], one work-item per element.
kernel void square(global long *b)
{
  int gid = get_global_id(0);

  long b_temp = b[gid];

  b[gid] = b_temp*b_temp;
}
"""
# One (not yet built) Program object per context.
nvidia_program_source,intel_program_source = [pyopencl.Program(context,program_source)
                                              for context in (nvidia_context,intel_context)]

In [None]:
# Compile the kernel source once for each device's context.
nvidia_program = nvidia_program_source.build()
intel_program = intel_program_source.build()

### Creating the global memory resource
1. Defining source data parameters
2. Creating the source data
3. Creating the memory resources within the context

In [None]:
# Problem size: M rows, each a vector of N elements.
M = 10
N = 128 * 1000

# Element type of the host arrays, and its size in bytes.
dt = numpy.int64
dt_size = numpy.dtype(dt).itemsize

In [None]:
# a: random integers in [1, 10), so the kernel never divides by zero.
a = numpy.random.randint(1, 10, size=(M, N)).astype(dt)

# b: a random integer in [1, 1000) times a, i.e. always an exact multiple of a.
b = numpy.random.randint(1, 1000, size=(M, N)).astype(dt) * a

In [None]:
def create_buffers(context,a_size,b_size):
    """Allocate the pair of device buffers used for one row of input.

    Parameters
    ----------
    context : pyopencl.Context
        Context in which both buffers are created.
    a_size, b_size : int
        Sizes passed to ``pyopencl.Buffer`` for the first and second buffer.

    Returns
    -------
    tuple
        ``(a_buffer, b_buffer)``: the first is created READ_ONLY, the second
        READ_WRITE; both also carry ALLOC_HOST_PTR.
    """
    flags = pyopencl.mem_flags
    return (
        pyopencl.Buffer(context, flags=flags.READ_ONLY | flags.ALLOC_HOST_PTR, size=a_size),
        pyopencl.Buffer(context, flags=flags.READ_WRITE | flags.ALLOC_HOST_PTR, size=b_size),
    )

In [None]:
# Each buffer holds one row: N elements of dt_size bytes.
nvidia_a_buffer, nvidia_b_buffer = create_buffers(
    nvidia_context, a_size=N * dt_size, b_size=N * dt_size
)
intel_a_buffer, intel_b_buffer = create_buffers(
    intel_context, a_size=N * dt_size, b_size=N * dt_size
)

## Running the program
### Defining the host program

In [None]:
def _choose_local_size(global_size, work_groups, max_local_size):
    """Return the 1-D local size that splits ``global_size`` into ``work_groups`` groups.

    Returns ``None`` (meaning: let the OpenCL runtime choose) when the split
    is not usable: fewer than one work-group requested, a local size below 1
    or above ``max_local_size``, or a local size that does not divide
    ``global_size`` evenly.
    """
    if work_groups < 1:
        return None
    local_size = int(global_size // work_groups)
    if local_size < 1 or local_size > max_local_size or global_size % local_size:
        return None
    return (local_size,)


def compute_norm(queue,a,a_buffer,b,b_buffer,program,wgs):
    """Run the ``operation`` and ``square`` kernels row by row and reduce on the host.

    For every row pair of ``a`` and ``b`` the rows are copied to the device,
    ``program.operation`` and then ``program.square`` are run on them, and the
    result is copied back. The host sums all results and returns the square
    root of that sum.

    Parameters
    ----------
    queue : pyopencl.CommandQueue
        Queue on which all copies and kernels are enqueued.
    a, b : numpy.ndarray
        Two-dimensional host arrays, processed one row at a time.
    a_buffer, b_buffer : pyopencl.Buffer
        Device buffers that receive one row of ``a`` / ``b`` each.
    program : pyopencl.Program
        Built program; it must provide the kernels ``operation`` and ``square``.
    wgs : int
        Requested number of work-groups per kernel launch. The local size is
        the row length divided by ``wgs``; if that is not a valid local size
        for the device, the runtime chooses one instead.

    Returns
    -------
    float
        Square root of the sum of all elements copied back from the device.
    """
    c = numpy.empty_like(a)
    total = 0.0

    # The local size must be derived from the length of one row (the global
    # size of each launch), not from the number of rows.
    device = queue.get_info(pyopencl.command_queue_info.DEVICE)
    max_local_size = device.get_info(pyopencl.device_info.MAX_WORK_GROUP_SIZE)
    local_size = _choose_local_size(a.shape[1], wgs, max_local_size)

    for i,(a_row,b_row) in enumerate(zip(a,b)):
        #copying data onto device
        copyon_events = [
            pyopencl.enqueue_copy(queue,
                                  src=a_row,
                                  dest=a_buffer,
                                  is_blocking = False),
            pyopencl.enqueue_copy(queue,
                                  src=b_row,
                                  dest=b_buffer,
                                  is_blocking = False),
        ]

        #running program; ordering is expressed through wait_for so that it
        #also holds on a queue that does not execute commands in order
        kernel_event = program.operation(queue,
                                         a_row.shape, #global size
                                         local_size,  #local size
                                         a_buffer,b_buffer,
                                         wait_for = copyon_events)

        kernel_event2 = program.square(queue,
                                       b_row.shape, #global size
                                       local_size,  #local size
                                       b_buffer,
                                       wait_for = [kernel_event])

        #copying data off device
        copyoff_event = pyopencl.enqueue_copy(queue,
                                              src = b_buffer,
                                              dest = c[i],
                                              wait_for = [kernel_event2],
                                              is_blocking = False)

        #since we might as well do something useful while we wait
        if i > 0:
            total += c[i-1].sum()

        #wait for copy-off to finish before the buffers are reused
        copyoff_event.wait()

    # The last row has not been accumulated yet (skipped for empty input).
    if c.shape[0] > 0:
        total += c[-1].sum()

    return total**0.5

## Out-of-order Execution
Similar to before, but using out-of-order execution.

In [None]:
# One command queue per context, each created with out-of-order execution enabled.
nvidia_oo_queue, intel_oo_queue = [
    pyopencl.CommandQueue(
        context,
        properties=pyopencl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE,
    )
    for context in (nvidia_context, intel_context)
]

In [None]:
# Run the computation on each device, passing twice the device's number of
# compute units as the wgs argument.
nvidia_oo_norm = compute_norm(
    queue=nvidia_oo_queue,
    a=a, a_buffer=nvidia_a_buffer,
    b=b, b_buffer=nvidia_b_buffer,
    program=nvidia_program,
    wgs=2 * nvidia_device.get_info(pyopencl.device_info.MAX_COMPUTE_UNITS),
)

intel_oo_norm = compute_norm(
    queue=intel_oo_queue,
    a=a, a_buffer=intel_a_buffer,
    b=b, b_buffer=intel_b_buffer,
    program=intel_program,
    wgs=2 * intel_device.get_info(pyopencl.device_info.MAX_COMPUTE_UNITS),
)

In [None]:
# Host-side reference: NumPy norm of the same element-wise expression that
# the `operation` kernel evaluates.
reference_result = numpy.linalg.norm(
    (b / a) + (b * a) - (b % a)
)

In [None]:
# Compare in both directions and with a floating-point tolerance: a plain
# `reference - result > 0` test misses results that are too large and
# rejects results that differ only by rounding.
if not numpy.isclose(reference_result, nvidia_oo_norm):
    raise Exception("nvidia result does not match!")
if not numpy.isclose(reference_result, intel_oo_norm):
    raise Exception("intel result does not match!")

### Performance Comparison

In [None]:
import time

In [None]:
def evaluate_program(nvidia_queue,intel_queue,threads=None,n=10):
    """Measure the average wall-clock time of ``compute_norm`` on both devices.

    Parameters
    ----------
    nvidia_queue, intel_queue : pyopencl.CommandQueue
        Queues on which the computation is run for the respective device.
    threads : int, optional
        Value passed to ``compute_norm`` as ``wgs``. When omitted, twice the
        number of compute units of each queue's device is used.
    n : int
        Number of timed repetitions per device.

    Returns
    -------
    tuple of float
        ``(nvidia_time, intel_time)``: mean seconds per ``compute_norm`` call.
    """
    def resolve_wgs(queue):
        # Default mirrors the 2 * MAX_COMPUTE_UNITS setting of the direct calls.
        if threads is not None:
            return threads
        device = queue.get_info(pyopencl.command_queue_info.DEVICE)
        return 2 * device.get_info(pyopencl.device_info.MAX_COMPUTE_UNITS)

    # Use the queues that were passed in, not module-level names.
    runs = (
        (nvidia_queue, nvidia_a_buffer, nvidia_b_buffer, nvidia_program, resolve_wgs(nvidia_queue)),
        (intel_queue, intel_a_buffer, intel_b_buffer, intel_program, resolve_wgs(intel_queue)),
    )

    elapsed = [0.0, 0.0]
    for _ in range(n):
        # Alternate between the devices within each repetition.
        for slot, (queue, a_buffer, b_buffer, program, wgs) in enumerate(runs):
            start = time.perf_counter()
            compute_norm(queue, a, a_buffer, b, b_buffer, program, wgs)
            elapsed[slot] += time.perf_counter() - start

    return elapsed[0]/n, elapsed[1]/n

In [None]:
# Sweep the `threads` argument over powers of two (1, 2, 4, 8, 16) and print
# the two times returned for each setting.
for t in range(5):
    nvidia_time, intel_time = evaluate_program(
        nvidia_oo_queue, intel_oo_queue, 2 ** t, n=10
    )
    report = (2 ** t, nvidia_time, intel_time)
    print("t=%d: %.3f %.3f" % report)

In [None]:
# Time both out-of-order queues; as the last expression of the cell, the
# returned pair is displayed.
evaluate_program(nvidia_queue=nvidia_oo_queue, intel_queue=intel_oo_queue)

In [None]:
# Time the NumPy version of the same computation on the host, for comparison
# with the device timings.
%timeit -n 10 numpy.linalg.norm(b/a + b*a - b%a)

## Module Challenge
* Perform any BLAS operation, using a mixture of task and data parallelism
* Characterise the change in any of the values

*Hint: Take advantage of multiple indices.*