# Part 2 - Moving Bits Around (with OpenCL)

## Setup

### Library Import
Before doing anything else, we need to import [PyOpenCL](https://documen.tician.de/pyopencl/) and [NumPy](http://www.numpy.org/).

In [None]:
import pyopencl
import numpy

We're also going to use Matplotlib to visualise performance improvements:

In [None]:
import time
import matplotlib.pyplot as plt
%matplotlib inline

### Setting up platforms, devices and context
We're going to set up the devices and contexts as explicit objects because we might want to interrogate their runtime information.

*NB* First, for the devices, we use ordinary index-based access. Then, I use Python's ability to unpack collections directly into variables. It takes a little getting used to, but this is the best example of dynamic programming languages' ability to support succinct yet clear coding.

In [None]:
# Every OpenCL platform that PyOpenCL can see
platforms = pyopencl.get_platforms()

# Devices: the first device reported by each platform
devices = [platform.get_devices()[0] for platform in platforms]
# NOTE(review): assumes platform 0 is NVIDIA and platform 1 is Intel -- confirm on your machine
nvidia_device = devices[0]
intel_device = devices[1]

# Contexts: one per device, unpacked straight into named variables
contexts = [pyopencl.Context(devices=[nvidia_device]),
            pyopencl.Context(devices=[intel_device])]
nvidia_context,intel_context = contexts

## Communicating between Host and Device
### Setting up the program
1. Using the OpenCL source in `part_2.cl`, create a new program.
2. Compile the programs and set up the command queues

In [None]:
# Read the OpenCL C source for this part from disk
with open("part_2.cl","r") as program_source_file:
    program_source = program_source_file.read()

# Wrap the same source in one (not yet built) program per context
nvidia_program_source = pyopencl.Program(nvidia_context,program_source)
intel_program_source = pyopencl.Program(intel_context,program_source)

In [None]:
# Build both programs
nvidia_program = nvidia_program_source.build()
intel_program = intel_program_source.build()

# One command queue per context
nvidia_queue,intel_queue = [pyopencl.CommandQueue(context)
                            for context
                            in (nvidia_context,intel_context)]

### Creating the global memory buffers
1. Generating the source data
2. Creating memory buffers of the right size within each context

In [None]:
def create_arrays(M=2**10,N=2**8,datatype=numpy.int32,b_colwise = False):
    """Creates the random source matrices for the matrix multiplication.
    
    Keyword arguments:
    M -- number of rows of a and number of columns of b
    N -- number of columns of a and number of rows of b
    datatype -- numpy dtype that both arrays are converted to
    b_colwise -- if True, b is returned in column-major (Fortran) layout
    
    Returns:
    a -- numpy.Array of shape (M,N) holding random integers in [0,10)
    b -- numpy.Array of shape (N,M) holding random integers in [0,10)
    """
    # Use the datatype that was asked for, not a module-level global
    a = numpy.random.randint(0,10,size=(M,N)).astype(datatype)
    b = numpy.random.randint(0,10,size=(N,M)).astype(datatype)
    if b_colwise:
        b = numpy.asfortranarray(b)
    
    return a,b

In [None]:
def create_buffers(context,datatype_size,M=2**10,N=2**8):
    """Creates the device buffers for the matrix multiplication.
    
    Keyword arguments:
    context -- pyopencl.Context that the buffers are created in
    datatype_size -- size in bytes of one matrix element
    M -- first matrix dimension
    N -- second matrix dimension
    
    Returns:
    a_buffer -- read-only pyopencl.Buffer of M*N elements
    b_buffer -- read-only pyopencl.Buffer of M*N elements
    c_buffer -- write-only pyopencl.Buffer of M*M elements
    """
    flags = pyopencl.mem_flags
    source_bytes = M*N*datatype_size
    result_bytes = M*M*datatype_size
    
    # Both inputs hold the same number of elements
    a_buffer = pyopencl.Buffer(context,flags=flags.READ_ONLY,size=source_bytes)
    b_buffer = pyopencl.Buffer(context,flags=flags.READ_ONLY,size=source_bytes)
    
    # The result is square
    c_buffer = pyopencl.Buffer(context,flags=flags.WRITE_ONLY,size=result_bytes)
    
    return a_buffer,b_buffer,c_buffer

In [None]:
# Element type shared by the host arrays and the device buffers
dt = numpy.int32
datatype_size = numpy.dtype(dt).itemsize

# Host-side source matrices
data_arrays = create_arrays(datatype=dt)

# One (a, b, c) buffer triple per context
nvidia_buffers,intel_buffers = [create_buffers(context,datatype_size)
                                for context
                                in (nvidia_context,intel_context)]

## Running the program
### Defining the host program
We want to perform a matrix multiplication:
$$\mathbf{C} = \mathbf{A} \cdot \mathbf{B}$$

We create a function for running the program that takes the source data, the memory buffers we've created, the command queue, and the program kernel we're going to use. 

Optional arguments let us change the size of the matrix.

In [None]:
def opencl_dot_product(data_arrays,buffers,queue,kernel,M=2**10,N=2**8):
    """Runs a matrix-multiplication kernel on an OpenCL device.
    
    Keyword arguments:
    data_arrays -- tuple (a,b) of the numpy source matrices
    buffers -- tuple (a_buffer,b_buffer,c_buffer) of pyopencl.Buffers
    queue -- pyopencl.CommandQueue that all work is enqueued on
    kernel -- kernel to launch; it is called with the three buffers, M and N
    M -- edge length of the square result; the kernel runs over (M,M)
    N -- second size argument handed to the kernel
    
    Returns:
    c -- numpy.Array of shape (M,M), with the dtype of a, copied from c_buffer
    """
    a,b = data_arrays
    a_buffer,b_buffer,c_buffer = buffers
    M,N = numpy.int32(M),numpy.int32(N) #converting to the expected type for OpenCL
    
    #copying data onto device
    copyon_events = [pyopencl.enqueue_copy(queue,
                                           src=a,
                                           dest=a_buffer),
                     pyopencl.enqueue_copy(queue,
                                           src=b,
                                           dest=b_buffer)]
        
    #running program, once both uploads have finished
    kernel_event = kernel(queue,
                          (M,M), #global size
                          None, #local size 
                          a_buffer,b_buffer,c_buffer,M,N, #Kernel Arguments
                          wait_for = copyon_events)
        
    #copying data off device
    #the result takes its element type from the input data, not from a global
    c = numpy.empty((M,M),dtype=a.dtype)
    copyoff_event = pyopencl.enqueue_copy(queue,
                                          src = c_buffer,
                                          dest = c,
                                          wait_for = [kernel_event])
    #block until the result has actually arrived in c
    copyoff_event.wait()
        
    return c

### Running the program

Running on both platforms and checking the result

In [None]:
# Fresh input data, multiplied on both devices with the unoptimised kernel
data_arrays = create_arrays()
nvidia_result = opencl_dot_product(data_arrays,nvidia_buffers,
                                   nvidia_queue,nvidia_program.dot_product)
intel_result = opencl_dot_product(data_arrays,intel_buffers,
                                  intel_queue,intel_program.dot_product)

# NumPy reference result
a,b = data_arrays
ref_c = a.dot(b)

# Any non-zero absolute difference means the device result is wrong
for device_result,message in ((nvidia_result,"Error in NVIDIA result!"),
                              (intel_result,"Error in Intel result!")):
    if numpy.abs(ref_c - device_result).sum():
        print(message)

Now, visualising the different optimisations:
1. Gathering timing data and calculating acceleration ($A$) : $A = \frac{L_{NumPy}}{L_{OpenCL}} $
2. Plotting the data

*N.B.* In the no-stride case, we use the column-wise layout for the B matrix.

In [None]:
data_arrays = create_arrays(datatype=dt)
a,b = data_arrays

# Getting reference result
# perf_counter is the clock intended for measuring short intervals
start = time.perf_counter()
ref_c = a.dot(b)
end = time.perf_counter()
ref_time = end - start

nvidia_times = []
intel_times = []

# (kernel name, whether b has to be stored column-wise).
# The layout is tied to the kernel *name*: attribute lookups on a program may
# hand back a fresh kernel object each time, so comparing kernel objects to
# decide the layout is not reliable.
kernel_variants = (("dot_product",False),
                   ("dot_product_cached",False),
                   ("dot_product_no_stride",True))

#Iterating over the kernels for the two different platforms
for kernel_name,b_colwise in kernel_variants:
    nvidia_kernel = getattr(nvidia_program,kernel_name)
    intel_kernel = getattr(intel_program,kernel_name)
    
    # Generating data
    data_arrays = create_arrays(datatype=dt,b_colwise = b_colwise)
    
    # Benchmarking NVIDIA
    nvidia_start = time.perf_counter()
    nvidia_result = opencl_dot_product(data_arrays,nvidia_buffers,
                                       nvidia_queue,nvidia_kernel)
    nvidia_end = time.perf_counter()
    
    nvidia_times += [nvidia_end - nvidia_start]
    
    #Benchmarking Intel
    intel_start = time.perf_counter()
    intel_result = opencl_dot_product(data_arrays,intel_buffers,
                                      intel_queue,intel_kernel)
    intel_end = time.perf_counter()
    
    intel_times += [intel_end - intel_start]
    
# Speedup A = L_NumPy / L_OpenCL for each kernel variant
nvidia_speedup = ref_time/numpy.array(nvidia_times)
intel_speedup = ref_time/numpy.array(intel_times)

In [None]:
fig, axes = plt.subplots(2)
ax1,ax2 = axes

# align='edge' pins each bar to [x, x+0.8] whatever Matplotlib's default
# alignment is, so the +0.4 tick offset below always lands on the bar centre
ax1.bar(range(nvidia_speedup.size),nvidia_speedup,width=0.8,color='g',alpha=0.5,align='edge')
ax2.bar(range(intel_speedup.size),intel_speedup,width=0.8,color='b',alpha=0.5,align='edge')

ax1.set_title("NVIDIA")
ax2.set_title("Intel")

for ax in axes:
    # Tick positions have to be fixed before the labels are attached to them
    ax.set_xticks(numpy.arange(intel_speedup.size)+0.4)
    ax.set_xticklabels(["Unoptimised","Cached Sum","Colwise Storage"])
    ax.set_ylabel("Speedup over NumPy")
    ax.grid(True)
    
fig.set_size_inches(10,6)

## Module Challenge
* Apply similar optimisations to the Java hash function (see helper code below)
* Measure the performance improvement over a Python implementation

### Source Data and Memory setup

In [None]:
def get_word_arrays_from_file(filename, size_limit = int(5e6)):
    """Reads a text file holding one word per line into numpy arrays.
    
    Keyword arguments:
    filename -- the file to read from
    size_limit -- currently unused; kept so that existing calls keep working
    
    Returns:
    data -- numpy.Array with a single entry: one ASCII byte string made of 
            every line's characters joined together, without the line 
            terminators. Non-ASCII characters are dropped.
    offsets -- numpy.Array containing an int32 element 
                for each word's offset into data
    lengths -- numpy.Array containing an int32 element 
                for each word's length in data
    
    Raises:
    ValueError -- if a word cannot be recovered from data using its 
                  offset and length
    """
    with open(filename,"r") as data_file:
        # Strip only the line terminator (the last line may not have one), and
        # encode straight away so that lengths and offsets are measured on the
        # bytes that really end up in the data array
        words = [line.rstrip("\n").encode("ascii","ignore") for line in data_file]
    
    # Making the words into one big byte string
    byte_data_string = b"".join(words)
    
    # Getting the lengths of each word
    lengths = [len(word) for word in words]
    
    # Getting the start of each word: one offset per word, none for an empty file
    offsets = []
    position = 0
    for length in lengths:
        offsets.append(position)
        position += length
    
    # Testing that the offsets and counts are correct
    for word,offset,length in zip(words,offsets,lengths):
        temp_word = byte_data_string[offset:offset+length]
        if word != temp_word:
            raise ValueError("Data mismatch! %r != %r" % (word,temp_word))
    
    # Converting data into numpy arrays
    data_array = numpy.array(byte_data_string)
    offsets_array = numpy.array(offsets,dtype=numpy.int32)
    lengths_array = numpy.array(lengths,dtype=numpy.int32)
    
    return data_array,offsets_array,lengths_array

In [None]:
def create_data_buffers(context,words,offsets,lengths):
    """Creates the PyOpenCL buffers for the word-hashing problem.
    
    Keyword arguments:
    context -- pyopencl.Context that the buffers will be created in.
    words -- numpy.Array of words data
    offsets -- numpy.Array of offsets data
    lengths -- numpy.Array of lengths data
    
    Returns:
    A list of four pyopencl.Buffers, in this order:
    words buffer -- read-only, sized to the words array
    offsets buffer -- read-only, sized to the offsets array
    lengths buffer -- read-only, sized to the lengths array
    hashes buffer -- write-only, sized to the offsets array
    """
    mem_flags = pyopencl.mem_flags
    
    # Source buffers
    source_flags = mem_flags.READ_ONLY | mem_flags.ALLOC_HOST_PTR
    buffers = []
    for array in (words,offsets,lengths):
        buffers.append(pyopencl.Buffer(context,
                                       flags=source_flags,
                                       size=array.nbytes))
    
    # Destination buffer: as many bytes as the offsets array (int per word)
    buffers.append(pyopencl.Buffer(context,
                                   flags=mem_flags.WRITE_ONLY,
                                   size=offsets.nbytes))
    
    return buffers

In [None]:
# Fetch the word-list text files with wget (shell commands; needs network access)
!wget https://s3-eu-west-1.amazonaws.com/word-count-test-bucket/english_1000_most_common.txt
!wget https://s3-eu-west-1.amazonaws.com/word-count-test-bucket/wiki-100k.txt
!wget https://s3-eu-west-1.amazonaws.com/word-count-test-bucket/shakespeare_words.txt

In [None]:
# Source data: keep the tuple around and also name its parts
arrays = get_word_arrays_from_file("shakespeare_words.txt")
words,offsets,lengths = arrays

# NVIDIA Buffers
nvidia_buffers = create_data_buffers(nvidia_context,*arrays)
(nvidia_words_buffer,nvidia_offesets_buffer,
 nvidia_lengths_buffer,nvidia_hashes_buffer) = nvidia_buffers

# Intel Buffers
intel_buffers = create_data_buffers(intel_context,*arrays)
(intel_words_buffer,intel_offesets_buffer,
 intel_lengths_buffer,intel_hashes_buffer) = intel_buffers

### Compiling and running the program

In [None]:
program_source_filename = "part_2_challenge.cl"

# Read the challenge kernel source from disk
with open(program_source_filename,"r") as program_source_file: 
    program_source = program_source_file.read()

# One program per context, created from the same source
nvidia_program_source = pyopencl.Program(nvidia_context,program_source)
intel_program_source = pyopencl.Program(intel_context,program_source)

# Build both programs
nvidia_program = nvidia_program_source.build()
intel_program = intel_program_source.build()

In [None]:
def hash_words(queue,program,data_arrays,data_buffers):
    """Hashes the words on an OpenCL device and returns the hashes.
    
    Copies the source arrays onto the device, runs program.java_hash 
    with a global size of offsets.size, then copies the hashes back.
    
    Keyword arguments:
    queue -- pyopencl.CommandQueue that all work is enqueued on
    program -- compiled pyopencl.Program providing the java_hash kernel
    data_arrays -- Source data arrays (words,offsets,lengths)
    data_buffers -- pyopencl.Buffers (words,offsets,lengths,hashes)
    
    Returns:
    hashes -- numpy.Array of int32, shaped like offsets, containing the results
    """
    words,offsets,lengths = data_arrays
    words_buffer,offsets_buffer,lengths_buffer,hashes_buffer = data_buffers
    
    # Copying data onto the device: words, then offsets, then lengths
    uploads = ((words,words_buffer),
               (offsets,offsets_buffer),
               (lengths,lengths_buffer))
    copyon_events = [pyopencl.enqueue_copy(queue,
                                           src=host_array,
                                           dest=device_buffer)
                     for host_array,device_buffer in uploads]
    
    # Hashing the words once every upload has finished
    global_size = (offsets.size,)
    local_size = None
    kernel_event = program.java_hash(queue,
                                     global_size,
                                     local_size,
                                     words_buffer,offsets_buffer,lengths_buffer,hashes_buffer,
                                     wait_for=copyon_events)
    
    # Copying data off the device, after the kernel has finished
    hashes = numpy.empty(offsets.shape,dtype=numpy.int32)
    pyopencl.enqueue_copy(queue,
                          src=hashes_buffer,
                          dest=hashes,
                          wait_for=[kernel_event]).wait()
    
    return hashes

In [None]:
# Fresh command queues for the challenge kernels
nvidia_queue = pyopencl.CommandQueue(nvidia_context)
intel_queue = pyopencl.CommandQueue(intel_context)

# Hash the same source arrays on each device
nvidia_hashes = hash_words(nvidia_queue,nvidia_program,arrays,nvidia_buffers)
intel_hashes = hash_words(intel_queue,intel_program,arrays,intel_buffers)

### Testing the result

In [None]:
def ref_java_hash(s):
    """Returns the Java-style hash of an ASCII string as a signed 32-bit integer"""
    mask = 0xFFFFFFFF
    value = 0
    for byte in s.encode("ascii"):
        # (value << 5) - value is 31 * value; the mask wraps to 32 bits
        value = ((value << 5) - value + byte) & mask
    
    # Reinterpret the unsigned 32-bit value as two's-complement signed
    if value >= 0x80000000:
        return value - 0x100000000
    return value

In [None]:
# presumably str(words) looks like b'...' here, so [2:-1] drops the b' prefix
# and the closing quote -- verify against how words is built
joined_words = str(words)[2:-1]
temp_words = [joined_words[start:start+count] for start,count in zip(offsets,lengths)]
ref_hashes = numpy.fromiter(map(ref_java_hash,temp_words),numpy.int32)

# Report every word whose hash differs from the reference on either device
for i,(intel_hash,nvidia_hash,ref_hash) in enumerate(zip(intel_hashes,nvidia_hashes,ref_hashes)):
    if not (nvidia_hash == ref_hash and intel_hash == ref_hash):
        print(i,"Problem!",nvidia_hash,intel_hash,ref_hash)

In [None]:
# Time hashing on each OpenCL device against the pure-Python reference hash
%timeit hash_words(nvidia_queue,nvidia_program,arrays,nvidia_buffers)
%timeit hash_words(intel_queue,intel_program,arrays,intel_buffers)
%timeit numpy.fromiter(map(ref_java_hash,temp_words),numpy.int32)