In [80]:
import numpy as np
import pycuda.autoinit
from pycuda.compiler import *
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import numba

In [81]:
NUMEVENTS = 50
AVENUMPARTICLES = 10

numjets = np.random.poisson(AVENUMPARTICLES, NUMEVENTS)
stops = np.cumsum(numjets, dtype=np.int)
starts = np.zeros_like(stops)                              # Starts array
starts[1:] = stops[:-1]
offsets = np.zeros(len(numjets)+1, dtype=np.int)
offsets[1:] = stops
data = np.random.randint(low=0,high=10,size=stops[-1])

In [82]:
gpu_offsets = gpuarray.to_gpu_async(offsets)

In [83]:
@numba.jit()
def vectorized_search(offsets, content):
    index = np.arange(len(content), dtype=int)                     # threadIdx.x on CUDA
    below = np.zeros(len(content), dtype=int)                      # just below = 0 on CUDA
    above = np.ones(len(content), dtype=int) * (len(offsets) - 1)  # same for above
    while True:
        middle = (below + above) // 2

        change_below = offsets[middle + 1] <= index                   # which "belows" must we change?
        change_above = offsets[middle] > index                        # which "aboves"?

        if not np.bitwise_or(change_below, change_above).any():    # neither? great! we're done!
            break
        else:
            below = np.where(change_below, middle + 1, below)      # vectorized "if" statement
            above = np.where(change_above, middle - 1, above)      # this is the only branch

    return middle

In [84]:
parents = vectorized_search(offsets, data)

In [85]:
gpu_parents = gpuarray.to_gpu(parents)

In [86]:
mod = SourceModule('''
__global__ void seg_warp_reduce(float* arr, int* parents,int* arr_len)
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    
    if (tid > arr_len[0])
        return;
    
    int thread_id = threadIdx.x;
    int warp_size = 32;
    int lane = thread_id%warp_size;
    
    if (lane >=1 && parents[tid]==parents[tid-1])
        arr[tid] = max(arr[tid], arr[tid-1]);
    if (lane >=2 && parents[tid]==parents[tid-2])
        arr[tid] = max(arr[tid], arr[tid-2]);
    if (lane >=4 && parents[tid]==parents[tid-4])
        arr[tid] = max(arr[tid], arr[tid-4]);
    if (lane >=8 && parents[tid]==parents[tid-8])
        arr[tid] = max(arr[tid], arr[tid-8]);
    if (lane >=16 && parents[tid]==parents[tid-16])
        arr[tid] = max(arr[tid], arr[tid-16]);
    
}
''')

In [87]:
func = mod.get_function('seg_warp_reduce')

In [88]:
gpu_data = gpuarray.to_gpu(data)
num_threads_per_block = 32
num_blocks = int(np.ceil(stops[-1]/num_threads_per_block))
func(gpu_data,gpu_parents,cuda.In(len_arr), block=(num_threads_per_block,1,1), grid=(int(num_blocks),1))

In [89]:
kernel_data = gpu_data.get()
max_arr = kernel_data[stops-1]
max_arr

array([0, 4, 3, 1, 2, 1, 1, 4, 8, 8, 1, 3, 9, 9, 2, 9, 1, 7, 0, 9, 4, 4, 9,
       2, 5, 7, 8, 0, 3, 5, 2, 0, 1, 0, 0, 3, 6, 7, 3, 4, 2, 6, 1, 0, 0, 0,
       0, 2, 9, 8])

In [90]:
reduce_at_offsets = np.zeros_like(offsets)
reduce_at_offsets[1:] = stops-1

In [91]:
np_max = np.maximum.reduceat(data, reduce_at_offsets)

In [95]:
for i in range(len(starts)):
    print (" \narr{}: {}\nmax from gpu: {}   max from numpy: {} \n".format(i+1, data[starts[i]:stops[i]], max_arr[i],np_max[i] ))

 
arr1: [4 3 7 3 7 3 0 4 9 8 0 0]
max from gpu: 0   max from numpy: 9 

 
arr2: [3 4 1 5 7 9 8 4]
max from gpu: 4   max from numpy: 9 

 
arr3: [1 8 6 0 4 2 1 1 2 7 3]
max from gpu: 3   max from numpy: 8 

 
arr4: [7 7 2 8 0 2 2 5 1]
max from gpu: 1   max from numpy: 8 

 
arr5: [4 1 0 1 6 3 9 2]
max from gpu: 2   max from numpy: 9 

 
arr6: [0 7 1 1 5 4 5 1]
max from gpu: 1   max from numpy: 7 

 
arr7: [3 3 8 7 8 7 4 5 1]
max from gpu: 1   max from numpy: 8 

 
arr8: [3 6 5 0 3 0 5 6 3 7 8 8 3 5 6 2 4]
max from gpu: 4   max from numpy: 8 

 
arr9: [9 6 3 7 7 7 7 5 1 8]
max from gpu: 8   max from numpy: 9 

 
arr10: [5 0 8 0 6 0 9 8]
max from gpu: 8   max from numpy: 9 

 
arr11: [9 1 4 0 7 5 5 1 8 7 0 1]
max from gpu: 1   max from numpy: 9 

 
arr12: [7 1 0 9 6 3 5 7 8 3]
max from gpu: 3   max from numpy: 9 

 
arr13: [2 1 0 2 9 2 7 8 0 7 2 1 6 9 9]
max from gpu: 9   max from numpy: 9 

 
arr14: [4 8 3 8 1 9]
max from gpu: 9   max from numpy: 9 

 
arr15: [4 6 5 7 7 5 0 2 2 5 1 9 3 6