# Part 3 - Doing much task wow (with OpenCL)

## Setup

### Library Import
As before, we need to import [PyOpenCL](https://documen.tician.de/pyopencl/) and [NumPy](http://www.numpy.org/).

In [None]:
import pyopencl,numpy

### Setting up platforms, devices and context
We're going to set up the devices and contexts as explicit objects because we might want to interrogate their runtime information.

In [None]:
# Take the first device of every available platform. The unpacking assumes
# exactly two platforms are reported, in the order (NVIDIA, Intel).
platforms = pyopencl.get_platforms()
nvidia_device, intel_device = (
    each_platform.get_devices()[0] for each_platform in platforms
)

# One context per device, so each device can be driven independently.
nvidia_context, intel_context = (
    pyopencl.Context(devices=[each_device])
    for each_device in (nvidia_device, intel_device)
)

## Inspecting Device Properties

### Using the runtime API
1. Select the properties of interest
2. Print them out for each device

In [None]:
# Each table maps a human-readable label to the pyopencl.device_info key
# that is later passed to device.get_info().

# Properties that identify the device.
name_properties = {
    "Device Name": pyopencl.device_info.NAME,
    "Device Platform": pyopencl.device_info.PLATFORM,
    "Device Type": pyopencl.device_info.TYPE,
}

# Properties describing the compute resources.
processing_properties = {
    "Available Compute Units": pyopencl.device_info.MAX_COMPUTE_UNITS,
    "Clockrate": pyopencl.device_info.MAX_CLOCK_FREQUENCY,
}

# Properties describing the sizes of the memory spaces.
memory_properties = {
    "Available Global Memory": pyopencl.device_info.GLOBAL_MEM_SIZE,
    "Available Constant Memory": pyopencl.device_info.MAX_CONSTANT_BUFFER_SIZE,
    "Available Local Memory": pyopencl.device_info.LOCAL_MEM_SIZE,
}

# Translates the value reported for "Device Type" into a printable name.
device_types = {
    pyopencl.device_type.CPU: "CPU",
    pyopencl.device_type.GPU: "GPU",
}

In [None]:
# Print a short report for each device: name properties first, then the
# device type, then the processing and memory properties.
for device in (nvidia_device,intel_device):
    #print out all of the device name properties, except the device type
    for property_name in sorted(name_properties.keys() - {"Device Type"}):
        property_value = device.get_info(name_properties[property_name])
        print("%s: %s"%(property_name,property_value))

    #look up the device type; fall back to the raw value for any type that is
    #not listed in device_types instead of raising a KeyError
    device_type = device.get_info(name_properties["Device Type"])
    print("Device Types: %s"%device_types.get(device_type,"Unknown (%s)"%device_type))

    #print out all of the processing properties, then all of the memory properties
    for properties in (processing_properties,memory_properties):
        for property_name in sorted(properties):
            property_value = device.get_info(properties[property_name])
            print("%s: %s"%(property_name,property_value))

    print("\n")

## Using clinfo (external application)
Rather helpfully, Jupyter lets us run command line applications, including [clinfo](http://manpages.ubuntu.com/manpages/xenial/man1/clinfo.1.html), a command line utility for inspecting OpenCL devices.

In [None]:
# Run the external clinfo command line utility from within the notebook.
!clinfo

## Task vs Data Parallelism
### Setting up the program
1. Create a program for an expensive element-wise operation
2. Compile the programs

In [None]:
# Read the OpenCL source text once; the same text is used for both contexts.
with open("part_3.cl","r") as program_source_file:
    program_source = program_source_file.read()

# Create one (not yet built) program object per context.
nvidia_program_source, intel_program_source = (
    pyopencl.Program(each_context, program_source)
    for each_context in (nvidia_context, intel_context)
)

In [None]:
# Compile both programs with the same compiler option.
nvidia_program, intel_program = (
    unbuilt_program.build(options=["-cl-fast-relaxed-math"])
    for unbuilt_program in (nvidia_program_source, intel_program_source)
)

# One command queue per context.
nvidia_queue, intel_queue = (
    pyopencl.CommandQueue(each_context)
    for each_context in (nvidia_context, intel_context)
)

### Creating the global memory resource
1. Defining source data parameters
2. Creating the source data
3. Creating the memory resources within the context

In [None]:
def create_arrays(M=2**12,N=2**10,datatype=numpy.int64):
    """Create the two random input matrices.

    Parameters
    ----------
    M : int
        Number of rows of ``a`` and number of columns of ``b``.
    N : int
        Number of columns of ``a`` and number of rows of ``b``.
    datatype : numpy dtype
        Element type of both returned arrays.

    Returns
    -------
    tuple of numpy.ndarray
        ``(a, b)``: ``a`` has shape ``(M, N)``; ``b`` has shape ``(N, M)`` and
        is laid out in Fortran (column-major) order. Both hold random
        integers in ``[0, 10)`` converted to ``datatype``.
    """
    # Convert with the `datatype` argument rather than a notebook-level
    # global, so the function works on its own and honours its parameter.
    a = numpy.random.randint(0,10,size=(M,N)).astype(datatype)
    b = numpy.asfortranarray(numpy.random.randint(0,10,size=(N,M))).astype(datatype)

    return a,b

In [None]:
def create_buffers(context,datatype_size,M=2**12,N=2**10):
    """Allocate the buffers used by the dot-product kernels in one context.

    Parameters
    ----------
    context : pyopencl.Context
        Context in which the buffers are created.
    datatype_size : int
        Size in bytes of a single element.
    M, N : int
        Matrix dimensions; the two inputs hold ``M*N`` elements each and the
        output holds ``M*M`` elements.

    Returns
    -------
    tuple
        ``(a_buffer, b_buffer, c_buffer, local_b_buffer)``.
    """
    flags = pyopencl.mem_flags
    input_bytes = M*N*datatype_size
    output_bytes = M*M*datatype_size

    # Both inputs are read-only and have the same byte size.
    a_buffer = pyopencl.Buffer(context, flags=flags.READ_ONLY, size=input_bytes)
    b_buffer = pyopencl.Buffer(context, flags=flags.READ_ONLY, size=input_bytes)

    # Local memory sized to hold N elements (one column).
    local_b_buffer = pyopencl.LocalMemory(datatype_size*N)

    # The result is write-only.
    c_buffer = pyopencl.Buffer(context, flags=flags.WRITE_ONLY, size=output_bytes)

    return a_buffer, b_buffer, c_buffer, local_b_buffer

In [None]:
# Choose the element type, then create the host data and the device buffers.
dt = numpy.float32  # The datatype we're using - this should match up with the source code
datatype_size = numpy.dtype(dt).itemsize  # bytes per element

data_arrays = create_arrays(datatype=dt)

# One set of buffers per context, all sized for the same element type.
nvidia_buffers, intel_buffers = (
    create_buffers(each_context, datatype_size)
    for each_context in (nvidia_context, intel_context)
)

## Running the program
### Defining the host program
Similar to how we did it in module 2, but now we are *setting the workgroup size*.

In [None]:
def opencl_dot_product(data_arrays,buffers,queue,kernel,
                       M=2**12,N=2**10,max_wg_size=2**10):
    """Copy the inputs to a device, run ``kernel`` and return its output.

    Parameters
    ----------
    data_arrays : tuple of numpy.ndarray
        ``(a, b)`` host arrays to copy onto the device.
    buffers : tuple
        ``(a_buffer, b_buffer, c_buffer, local_b_buffer)`` as returned by
        ``create_buffers`` for the same context as ``queue``.
    queue : pyopencl.CommandQueue
        Queue on which the copies and the kernel are enqueued.
    kernel : pyopencl.Kernel
        Kernel called with ``(a_buffer, b_buffer, c_buffer, local_b_buffer, M, N)``.
    M, N : int
        Matrix dimensions, passed to the kernel as 32-bit integers.
    max_wg_size : int
        Upper bound for the work group size along the first dimension.

    Returns
    -------
    numpy.ndarray
        ``(M, M)`` array with the element type of ``a``, filled from ``c_buffer``.
    """
    a,b = data_arrays
    a_buffer,b_buffer,c_buffer,local_b_buffer = buffers
    M,N = numpy.int32(M),numpy.int32(N) #converting to the expected type for OpenCL

    #copying data onto device
    copyon_events = [
        pyopencl.enqueue_copy(queue,
                              src=a,
                              dest=a_buffer),
        pyopencl.enqueue_copy(queue,
                              src=b,
                              dest=b_buffer),
    ]

    # Finding the right work group size
    # We can't go above the GPU max (1024),
    # but we should use all compute units:
    # halve until there are at least 13 work groups along a dimension.
    # Integer halving with a floor of 1 keeps the size a valid positive
    # integer even when M is small (float halving could truncate to 0).
    # NOTE(review): the size is not forced to divide M - confirm the target
    # devices accept it when M is not a power of two.
    work_group_size = int(min(M,max_wg_size))
    while work_group_size > 1 and M/work_group_size < 13:
        work_group_size //= 2

    #running program
    kernel_event = kernel(queue,
                          (M,M), #global size
                          (work_group_size,1), #local size - all rows, just one column wg_size
                          a_buffer,b_buffer,c_buffer,local_b_buffer,M,N, #Kernel Arguments
                          wait_for = copyon_events)

    #copying data off device
    # Take the element type from the input instead of a notebook-level global,
    # so the function does not depend on state defined in another cell.
    c = numpy.empty((M,M),dtype=a.dtype)
    copyoff_event = pyopencl.enqueue_copy(queue,
                                          src = c_buffer,
                                          dest = c,
                                          wait_for = [kernel_event])
    copyoff_event.wait() #block until the result is on the host

    return c

In [None]:
# Create fresh input data with the element type the device buffers were sized for.
data_arrays = create_arrays(datatype=dt)

# Run the same kernel on both devices (the host function is opencl_dot_product).
nvidia_result = opencl_dot_product(data_arrays,nvidia_buffers,
                                   nvidia_queue,nvidia_program.dot_product_cc)
intel_result = opencl_dot_product(data_arrays,intel_buffers,
                                  intel_queue,intel_program.dot_product_cc)

# Compare both device results against NumPy's own matrix product.
a,b = data_arrays
ref_c = a.dot(b)

nvidia_error = numpy.abs(ref_c - nvidia_result).sum()
if nvidia_error: print("Error in NVIDIA result!",nvidia_error)

intel_error = numpy.abs(ref_c - intel_result).sum()
if intel_error: print("Error in Intel result!",intel_error)

In [None]:
# Time NumPy's matrix product on the host as the baseline.
%timeit a.dot(b)

In [None]:
%timeit dot_product(data_arrays,nvidia_buffers,nvidia_queue,nvidia_program.dot_product)
%timeit dot_product(data_arrays,nvidia_buffers,nvidia_queue,nvidia_program.dot_product_cc)

In [None]:
%timeit dot_product(data_arrays,intel_buffers,intel_queue,intel_program.dot_product)
%timeit dot_product(data_arrays,intel_buffers,intel_queue,intel_program.dot_product_cc)

## Module Challenge
Take the above code and characterise performance across:
* Different `M` and `N` values. 
* datatypes: `int, long, float, double`.

Convince yourself (and your partner!) as to what explains the different performance characteristics. 