In [139]:
import numpy as np
import pycuda.autoinit
from pycuda.compiler import *
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import numba

In [140]:
NUMEVENTS = 50
AVENUMPARTICLES = 10

numjets = np.random.poisson(AVENUMPARTICLES, NUMEVENTS)
stops = np.cumsum(numjets, dtype=np.int)
starts = np.zeros_like(stops)                              # Starts array
starts[1:] = stops[:-1]
offsets = np.zeros(len(numjets)+1, dtype=np.int)
offsets[1:] = stops
data = np.random.randint(low=0,high=10,size=stops[-1], dtype=np.int)
len_arr = np.array([stops[-1]]).astype(np.int)

In [141]:
gpu_offsets = gpuarray.to_gpu_async(offsets)

In [142]:
@numba.jit()
def vectorized_search(offsets, content):
    index = np.arange(len(content), dtype=int)                     # threadIdx.x on CUDA
    below = np.zeros(len(content), dtype=int)                      # just below = 0 on CUDA
    above = np.ones(len(content), dtype=int) * (len(offsets) - 1)  # same for above
    while True:
        middle = (below + above) // 2

        change_below = offsets[middle + 1] <= index                   # which "belows" must we change?
        change_above = offsets[middle] > index                        # which "aboves"?

        if not np.bitwise_or(change_below, change_above).any():    # neither? great! we're done!
            break
        else:
            below = np.where(change_below, middle + 1, below)      # vectorized "if" statement
            above = np.where(change_above, middle - 1, above)      # this is the only branch

    return middle

In [143]:
parents = vectorized_search(offsets, data)

In [144]:
gpu_parents = gpuarray.to_gpu(parents)

In [145]:
mod = SourceModule('''
__global__ void seg_warp_reduce(int* arr, int* parents,int* arr_len)
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    
    if (tid > arr_len[0])
        return;
    
    int thread_id = threadIdx.x;
    int warp_size = 32;
    int lane = thread_id%warp_size;
    
    if (lane >=1 && parents[tid]==parents[tid-1])
      arr[tid] = max(arr[tid], arr[tid-1]);
     
    if (lane >=2 && parents[tid]==parents[tid-2])
     arr[tid] = max(arr[tid], arr[tid-2]);
     
    if (lane >=4 && parents[tid]==parents[tid-4])
        arr[tid] = max(arr[tid], arr[tid-4]);
      
    if (lane >=8 && parents[tid]==parents[tid-8])
        arr[tid] = max(arr[tid], arr[tid-8]);
      
    if (lane >=16 && parents[tid]==parents[tid-16])
        arr[tid] = max(arr[tid], arr[tid-16]);
       
    
}
''')

In [146]:
func = mod.get_function('seg_warp_reduce')

In [147]:
gpu_data = gpuarray.to_gpu(data)
num_threads_per_block = 32
num_blocks = int(np.ceil(stops[-1]/num_threads_per_block))
func(gpu_data,gpu_parents,cuda.In(len_arr), block=(num_threads_per_block,1,1), grid=(int(num_blocks),1))

In [148]:
kernel_data = gpu_data.get()
max_arr = kernel_data[stops-1]
max_arr

array([8, 8, 9, 8, 8, 9, 8, 9, 9, 9, 5, 8, 9, 7, 9, 8, 9, 8, 8, 8, 9, 9, 9,
       9, 9, 8, 9, 9, 9, 9, 9, 8, 7, 9, 8, 9, 8, 4, 9, 8, 9, 9, 8, 9, 9, 8,
       8, 9, 6, 9])

In [149]:
reduce_at_offsets = np.zeros_like(offsets)
reduce_at_offsets[1:] = stops-1

In [150]:
np_max = np.maximum.reduceat(data, reduce_at_offsets)

In [152]:
for i in range(len(starts)):
    print (" \narr{}: {}\nmax from gpu: {}   max from numpy: {} \n".format(i+1, data[starts[i]:stops[i]], max_arr[i],np_max[i] ))

 
arr1: [6 0 3 2 8 4 3 6 8 7]
max from gpu: 8   max from numpy: 8 

 
arr2: [7 5 3 8 6]
max from gpu: 8   max from numpy: 8 

 
arr3: [3 9 7 3 5 8 6 1]
max from gpu: 9   max from numpy: 9 

 
arr4: [9 0 6 3 0 6 0 3 5 6 4 8]
max from gpu: 8   max from numpy: 9 

 
arr5: [6 3 2 5 8]
max from gpu: 8   max from numpy: 8 

 
arr6: [8 9 4 3 2 8 2 9 8 0 6]
max from gpu: 9   max from numpy: 9 

 
arr7: [2 0 7 5 1 8 4 1 8 0]
max from gpu: 8   max from numpy: 8 

 
arr8: [8 3 5 2 3 2 9]
max from gpu: 9   max from numpy: 8 

 
arr9: [3 6 4 9 4 6 8 9 5 8 4 7 3]
max from gpu: 9   max from numpy: 9 

 
arr10: [7 7 4 1 3 1 0 9 1]
max from gpu: 9   max from numpy: 9 

 
arr11: [3 9 0 3 3 9 4 3 5 4]
max from gpu: 5   max from numpy: 9 

 
arr12: [6 5 8 8 1 7 1 3]
max from gpu: 8   max from numpy: 8 

 
arr13: [7 4 0 2 9]
max from gpu: 9   max from numpy: 7 

 
arr14: [4 7 1 7 4 0 4 5 4]
max from gpu: 7   max from numpy: 9 

 
arr15: [2 1 3 8 3 0 5 2 7 7 9 1 6]
max from gpu: 9   max from numpy: 9 

 
ar