# Part 1 - Programming Fancy Devices (with OpenCL) Solutions

## Setup

In [None]:
import pyopencl
import numpy

In [None]:
# Problem size and host-side arrays: two random float32 inputs plus an
# uninitialised output array with the same shape and dtype.
N = 10 ** 8
a, b = (numpy.random.rand(N).astype(numpy.float32) for _ in range(2))
c = numpy.empty(a.shape, dtype=a.dtype)

In [None]:
# OpenCL C source for the element-wise addition kernel `sum`.
# Each work-item reads its global id in dimension 0 and writes
# c[gid] = a[gid] + b[gid]. The kernel performs no bounds check, so the
# global size used at launch must not exceed the length of the buffers.
program_source = """
kernel void sum(global float *a, 
                global float *b, 
                global float *c)
{
  int gid = get_global_id(0);
  c[gid] = a[gid] + b[gid];
}
"""

In [None]:
# NVIDIA: platform -> devices -> context -> command queue
nvidia_platform = pyopencl.get_platforms()[0]  # assumes the NVIDIA platform is listed first -- TODO confirm
nvidia_devices = nvidia_platform.get_devices()
nvidia_context = pyopencl.Context(nvidia_devices)
nvidia_queue = pyopencl.CommandQueue(nvidia_context)

# NVIDIA: build the kernel source within this context
nvidia_program_source = pyopencl.Program(nvidia_context, program_source)
nvidia_program = nvidia_program_source.build()

# NVIDIA: device buffers sized to match the host arrays.
# The two inputs are read-only for the kernel; the output is write-only.
a_nvidia_buffer, b_nvidia_buffer = (
    pyopencl.Buffer(nvidia_context, pyopencl.mem_flags.READ_ONLY, size=host_array.nbytes)
    for host_array in (a, b)
)
c_nvidia_buffer = pyopencl.Buffer(nvidia_context, pyopencl.mem_flags.WRITE_ONLY, size=c.nbytes)

def run_gpu_program():
    """Add ``a`` and ``b`` using the NVIDIA context and store the result in ``c``.

    Copies both input arrays into their device buffers, launches the ``sum``
    kernel with the shape of ``a`` as the global size, then copies the output
    buffer back into the host array ``c`` and waits for that copy to finish.
    """
    # host -> device: upload the two inputs
    for host_array, device_buffer in ((a, a_nvidia_buffer), (b, b_nvidia_buffer)):
        pyopencl.enqueue_copy(nvidia_queue, dest=device_buffer, src=host_array)

    # kernel launch: global size is a.shape, no explicit local size
    nvidia_program.sum(nvidia_queue, a.shape, None,
                       a_nvidia_buffer, b_nvidia_buffer, c_nvidia_buffer)

    # device -> host: download the result and wait for the transfer
    pyopencl.enqueue_copy(nvidia_queue, dest=c, src=c_nvidia_buffer).wait()

## Module Challenge

Perform the vector addition example, as above, but using the Intel platform to program the instance's CPU:

In [None]:
# Intel: platform -> devices -> context -> command queue
intel_platform = pyopencl.get_platforms()[1]  # assumes the Intel platform is listed second -- TODO confirm
intel_devices = intel_platform.get_devices()
intel_context = pyopencl.Context(intel_devices)
intel_queue = pyopencl.CommandQueue(intel_context)

# Intel: build the same kernel source within this context
intel_program_source = pyopencl.Program(intel_context, program_source)
intel_program = intel_program_source.build()

# Intel: buffers sized to match the host arrays.
# The two inputs are read-only for the kernel; the output is write-only.
a_intel_buffer, b_intel_buffer = (
    pyopencl.Buffer(intel_context, pyopencl.mem_flags.READ_ONLY, size=host_array.nbytes)
    for host_array in (a, b)
)
c_intel_buffer = pyopencl.Buffer(intel_context, pyopencl.mem_flags.WRITE_ONLY, size=c.nbytes)

def run_cpu_program():
    """Add ``a`` and ``b`` using the Intel context and store the result in ``c``.

    Copies both input arrays into their Intel buffers, launches the ``sum``
    kernel with the shape of ``a`` as the global size, then copies the output
    buffer back into the host array ``c`` and waits for that copy to finish.
    """
    # upload the two inputs into the Intel buffers
    for host_array, device_buffer in ((a, a_intel_buffer), (b, b_intel_buffer)):
        pyopencl.enqueue_copy(intel_queue, dest=device_buffer, src=host_array)

    # kernel launch: global size is a.shape, no explicit local size
    intel_program.sum(intel_queue, a.shape, None,
                      a_intel_buffer, b_intel_buffer, c_intel_buffer)

    # download the result and wait for the transfer
    pyopencl.enqueue_copy(intel_queue, dest=c, src=c_intel_buffer).wait()

# Check the CPU result against the same addition performed by numpy on the host.
run_cpu_program()
# Compare element-wise. Summing the signed differences would let positive and
# negative errors cancel out, and a NaN sum compares False against 0.0, so
# either case could wrongly be reported as a match.
if numpy.allclose(c, a + b):
    print("result matches!")
else:
    print("result does not match")

### Bonus round

Compare the performance of the GPU and CPU versions using the `%timeit` magic function.

In [None]:
# Each timed call covers the full round trip: copying a and b into the
# buffers, running the kernel, and copying the result back into c.
%timeit run_gpu_program()
%timeit run_cpu_program()