In [1]:
import numpy
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import *
import pycuda.gpuarray as gpuarray
import pycuda

### Python version (thanks, Jim!)

It is based on binary search, as used in Python's `bisect` module.

In [2]:
# Python version
def vectorized_search(offsets, content):
    """Binary-search ``offsets`` for the parent of every position in ``content``.

    All positions are searched in lockstep: position ``i`` is settled once its
    candidate parent ``p`` satisfies ``offsets[p] <= i < offsets[p + 1]``.
    The candidates tried at each step are printed along the way.

    Returns the integer array of candidates from the last step, one per
    element of ``content``.
    """
    n_items = len(content)
    print("on CUDA, this would be %d threads for as many particles" % n_items)

    last_slot = len(offsets) - 1
    print("expected convergence in %g steps" % numpy.log2(last_slot))

    # One "thread" per content position; each carries its own search window.
    index = numpy.arange(n_items, dtype=int)
    below = numpy.zeros(n_items, dtype=int)
    above = numpy.full(n_items, last_slot, dtype=int)

    step = 0          # counter for the progress print-outs only
    settled = False
    while not settled:
        middle = (below + above) // 2

        step += 1
        print("step %d: try parents = %s" % (step, str(middle)))

        # Candidate too small: its range ends at or before this position.
        raise_below = offsets[middle + 1] <= index
        # Candidate too large: its range starts after this position.
        lower_above = offsets[middle] > index

        if (raise_below | lower_above).any():
            # Narrow only the windows that still need it.
            below = numpy.where(raise_below, middle + 1, below)
            above = numpy.where(lower_above, middle - 1, above)
        else:
            settled = True

    print("done!")
    return middle

In [3]:
# Sample input: the search looks for the parent p with offsets[p] <= i < offsets[p + 1],
# so consecutive offsets delimit each parent's range of content positions
# (offsets[1] == offsets[2] leaves parent 1 with an empty range).
offsets = numpy.array([0, 3, 3, 5, 9])
# Nine content values, matching the last offset (9).
content = numpy.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9])

In [4]:
# Run the NumPy reference search on the sample data; the returned array is the cell's output.
vectorized_search(offsets,content)

on CUDA, this would be 9 threads for as many particles
expected convergence in 2 steps
step 1: try parents = [2 2 2 2 2 2 2 2 2]
step 2: try parents = [0 0 0 2 2 3 3 3 3]
done!


array([0, 0, 0, 2, 2, 3, 3, 3, 3], dtype=int32)

### (1) CUDA Loop version

The kernel assumes that `index` is less than `len(content)`; this assumption is easy to remove with a bounds check, as done many times earlier.

A similar thing is done in block and grid size determination.

In [5]:
mod = SourceModule('''
// Binary search executed by one thread per content element.
//
// Thread `index` looks for the parent p with
//     offsets[p] <= index < offsets[p + 1]
// and stores it in middle[index].
//
//   offsets      offsets array that is searched
//   middle       output: parent found for each content element
//   len_content  len_content[0] is the number of content elements
//   below/above  per-element search bounds; read on entry, final bounds
//                are written back on exit
__global__ void vectorized_search(int* offsets, int* middle,int* len_content,int* below,int* above)
{
    int index = blockIdx.x*blockDim.x + threadIdx.x;

    // Threads beyond the data (launch larger than the input) must not
    // read or write any of the per-element arrays.
    if (index >= len_content[0])
        return;

    // Each thread only ever touches its own slot, so the search state can
    // live in locals instead of being re-read from the arrays every step.
    int lo = below[index];
    int hi = above[index];
    int mid;

    while (1)
    {
        mid = (lo + hi)/2;

        bool raise_lo = offsets[mid+1] <= index;   // candidate range ends too early
        bool lower_hi = offsets[mid] > index;      // candidate range starts too late

        if (!raise_lo && !lower_hi)
            break;                                 // index lies inside parent `mid`

        if (raise_lo)
            lo = mid + 1;
        if (lower_hi)
            hi = mid - 1;
    }

    // Publish the result and the final bounds.
    middle[index] = mid;
    below[index] = lo;
    above[index] = hi;
}
''')

In [6]:
# Fetch the compiled kernel by name so it can be launched from Python.
func = mod.get_function("vectorized_search")

In [7]:
# The kernel declares every buffer as `int*`, i.e. 32-bit integers, so all
# device arrays are created as int32 explicitly. The `numpy.int` alias is gone
# from current NumPy releases and used to mean the platform's default integer
# (64-bit on most Linux builds), which does not match the kernel's element size.
g_offsets = gpuarray.to_gpu(numpy.asarray(offsets, dtype=numpy.int32))
g_len_content = gpuarray.to_gpu(numpy.array(len(content), dtype=numpy.int32))
# Initial search window for every element: [0, len(offsets) - 1].
below = gpuarray.zeros(len(content), dtype=numpy.int32)
above = gpuarray.to_gpu(numpy.full(len(content), len(offsets) - 1, dtype=numpy.int32))

In [8]:
# Output buffer for the parent indices; int32 to match the kernel's `int* middle`
# (the `numpy.int` alias no longer exists in current NumPy releases).
middle = gpuarray.empty(len(content), dtype=numpy.int32)

In [9]:
# Launch one block with one thread per content element; arguments follow the
# kernel signature (offsets, middle, len_content, below, above).
# NOTE(review): a single block limits len(content) to the device's maximum
# threads per block - confirm before using larger inputs.
func(g_offsets,middle, g_len_content, below, above, block=(len(content),1,1), grid=(1,1))

In [10]:
# Display the parent index the kernel stored for each content element
middle

array([0, 0, 0, 2, 2, 3, 3, 3, 3])

### (2) Python loop with gpuarrays

The loop here is in python. However, the arrays are gpuarray instances.
This is not very useful, as PyCUDA doesn't support array indexing, and implementing it would result in the same source module as above.
Currently this is done with a mix of elementwise and reduction kernels.

In [11]:
# define the function
def pycuda_where(mask, arr1, arr2):
    """Element-wise select on the GPU, mimicking ``numpy.where(mask, arr1, arr2)``.

    Parameters
    ----------
    mask : gpuarray
        Read by the kernel as ``bool*``; ``True`` picks from ``arr1``.
        Assumed to be at least as long as ``arr1`` (not checked here).
    arr1, arr2 : gpuarray
        Read by the kernel as ``int*`` (32-bit integers); must have equal length.

    Returns
    -------
    gpuarray
        New array allocated like ``arr1`` with
        ``out[i] = arr1[i] if mask[i] else arr2[i]``.

    Raises
    ------
    NotImplementedError
        If ``arr1`` and ``arr2`` differ in length (no broadcasting).
    """
    if len(arr1) != len(arr2):
        raise NotImplementedError("pycuda_where needs arr1 and arr2 of equal length")

    data_len = len(arr1)
    out = gpuarray.empty_like(arr1)
    if data_len == 0:
        # Nothing to select, and a zero-sized launch is not valid.
        return out

    mod = SourceModule('''
    __global__ void where(bool* mask,int* arr1,int* arr2,int* out,int n)
    {
        int index = blockIdx.x*blockDim.x + threadIdx.x;
        // The grid is rounded up, so the last block may hold surplus threads.
        if (index < n)
            out[index] = mask[index]?arr1[index]:arr2[index];
    }
    ''')
    py_where = mod.get_function("where")

    # One thread per element: a single exactly-sized block for small inputs,
    # otherwise 512-thread blocks with the block count rounded up so that
    # every element, including the last one, is covered.
    thread_size = min(data_len, 512)
    block_size = (data_len + thread_size - 1) // thread_size
    py_where(mask, arr1, arr2, out, numpy.int32(data_len),
             block=(thread_size,1,1), grid=(block_size,1,1))
    return out

def gpuarray_search(offsets, content):
    """Vectorized binary search with a Python loop over PyCUDA gpuarrays.

    For every position ``i`` of ``content`` this looks for the parent ``p``
    with ``offsets[p] <= i < offsets[p + 1]``. Each loop iteration runs
    element-wise kernels on the GPU, plus one reduction to test whether any
    element still has to move.

    Parameters
    ----------
    offsets : array-like of integers
        Offsets array to search; converted to int32 before upload.
    content : sequence
        Only its length is used (one search per element).

    Returns
    -------
    gpuarray
        int32 array holding the parent index found for every element.
    """
    n_items = len(content)

    # Every kernel below declares its integer buffers as `int*` (32-bit), so
    # all device arrays use int32. The `numpy.int` / `numpy.bool` aliases are
    # not available in all current NumPy releases and are avoided.
    below = gpuarray.zeros(n_items, dtype=numpy.int32)
    above = gpuarray.to_gpu(numpy.full(n_items, len(offsets) - 1, dtype=numpy.int32))
    # Additional gpuarrays needed
    middle = gpuarray.empty_like(below)
    g_offsets = gpuarray.to_gpu(numpy.asarray(offsets, dtype=numpy.int32))
    change_below = gpuarray.empty(n_items, dtype=numpy.bool_)
    change_above = gpuarray.empty(n_items, dtype=numpy.bool_)

    # Midpoint of the current search window (integer division).
    div_kern = pycuda.elementwise.ElementwiseKernel("int* a,int* b, int* out",
                                                   "out[i] = int((a[i]+b[i])/2)",
                                                   "mod_kern")
    # True where the candidate's range ends at or before position i.
    comp_kern1 = pycuda.elementwise.ElementwiseKernel("bool* out,int* offsets,int* middle",
                                                    "out[i] = offsets[middle[i]+1]<=i",
                                                    "comp_kern1")
    # True where the candidate's range starts after position i.
    comp_kern2 = pycuda.elementwise.ElementwiseKernel("bool* out,int* offsets,int* middle",
                                                    "out[i] = offsets[middle[i]]>i",
                                                    "comp_kern2")

    while True:
        div_kern(below,above,middle)
        comp_kern1(change_below,g_offsets,middle)
        comp_kern2(change_above,g_offsets,middle)

        # Stop once no element needs either bound moved.
        if not (gpuarray.sum(change_below+change_above).get()):
            break

        below = pycuda_where(change_below, middle+1, below)
        # Exclude the rejected candidate, as the NumPy reference and the
        # CUDA loop kernel do (middle - 1, not middle).
        above = pycuda_where(change_above, middle-1, above)

    return middle

In [12]:
# Run the gpuarray-based search on the same sample data as the versions above.
middle_2 = gpuarray_search(offsets, content)

In [13]:
# Display the parents found by the gpuarray-based search
middle_2

array([0, 0, 0, 2, 2, 3, 3, 3, 3])