In [18]:
import numpy as np
import pycuda.autoinit
from pycuda.compiler import *
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import numba

In [19]:
NUMEVENTS = 50
AVENUMPARTICLES = 10

# Every integer buffer below is explicitly 32-bit.  The CUDA kernel further
# down declares all of its parameters as `int*` (32-bit), whereas the old
# `np.int` alias is the platform C long (64-bit on Linux/macOS) and no longer
# exists in current NumPy releases.
numjets = np.random.poisson(AVENUMPARTICLES, NUMEVENTS)    # number of particles per event
stops = np.cumsum(numjets, dtype=np.int32)                 # one-past-the-end index of each event
starts = np.zeros_like(stops)                              # first index of each event
starts[1:] = stops[:-1]
offsets = np.zeros(len(numjets)+1, dtype=np.int32)         # starts and stops merged: [0, stops...]
offsets[1:] = stops
data = np.random.randint(low=0, high=10, size=stops[-1], dtype=np.int32)  # flat content, values 0..9
len_arr = np.array([stops[-1]], dtype=np.int32)            # total length as a 1-element array for the kernel

In [20]:
# Upload the offsets array to the device with PyCUDA's asynchronous copy.
# NOTE(review): gpu_offsets is not passed to the kernel launch in the cells
# shown here -- confirm whether it is still needed.
gpu_offsets = gpuarray.to_gpu_async(offsets)

In [21]:
@numba.jit()
def vectorized_search(offsets, content):
    """Find, for every position in ``content``, the segment that contains it.

    Runs one binary search per position, all of them in lock-step using
    whole-array operations (a host-side model of one GPU thread per element).

    Parameters
    ----------
    offsets : 1-D integer array
        Segment boundaries; segment ``k`` covers positions
        ``offsets[k] <= i < offsets[k + 1]``.
    content : sequence
        Only its length is used: positions ``0 .. len(content) - 1`` are
        searched for.

    Returns
    -------
    1-D integer array of length ``len(content)`` whose entry ``i`` is the
    segment index ``k`` satisfying ``offsets[k] <= i < offsets[k + 1]``.
    """
    n_items = len(content)
    position = np.arange(n_items, dtype=int)                 # plays the role of the thread index
    lo = np.zeros(n_items, dtype=int)                        # lower bracket of each search
    hi = np.full(n_items, len(offsets) - 1, dtype=int)       # upper bracket of each search

    while True:
        mid = (lo + hi) // 2

        too_low = offsets[mid + 1] <= position               # segment ends at or before the position
        too_high = offsets[mid] > position                   # segment starts after the position

        if (too_low | too_high).any():
            # At least one search has not converged: narrow only those brackets.
            lo = np.where(too_low, mid + 1, lo)
            hi = np.where(too_high, mid - 1, hi)
            continue

        # Every position lies inside its candidate segment.
        return mid

In [22]:
# parents[i] is the index of the event (segment of `offsets`) that element i
# of `data` belongs to; vectorized_search only uses the length of `data`.
parents = vectorized_search(offsets, data)

In [23]:
# The kernel declares `int* parents`, i.e. 32-bit elements, while `parents`
# comes back from vectorized_search with the platform default integer width
# (64-bit on most systems).  Cast explicitly so the device sees what it expects.
gpu_parents = gpuarray.to_gpu(parents.astype(np.int32))

In [24]:
# Segmented running maximum over `arr`, done in five look-back steps with
# strides 1, 2, 4, 8 and 16.  `parents[i]` names the event element i belongs
# to; an element only ever combines with elements of the same event.
#
# `arr` is updated in place and res1..res5 receive a snapshot of it after each
# step.  After the last step, the last element of an event holds the maximum of
# the part of that event that lies inside its own group of 32 consecutive
# lanes.  Events that straddle a multiple-of-32 boundary are therefore NOT
# fully reduced by this kernel; combining across lane groups needs another pass.
#
# Launch requirements: 1-D blocks whose size is a multiple of 32, and all
# buffers holding 32-bit ints.
mod = SourceModule('''
// One look-back step.  Returns the updated running maximum of element tid.
// Must be called by every thread of the block (it contains barriers).
__device__ int seg_max_step(int* arr, const int* parents, int* res,
                            int tid, int lane, int stride, bool active, int val)
{
    // Combine only with a neighbour in the same 32-lane group and same event.
    bool take = active && lane >= stride && parents[tid] == parents[tid - stride];
    int left = take ? arr[tid - stride] : val;

    // All reads of the previous step must finish before anyone overwrites arr.
    __syncthreads();

    if (take)
    {
        val = max(val, left);
        arr[tid] = val;
    }
    if (active)
        res[tid] = val;

    // Publish the writes of this step before the next step reads them.
    __syncthreads();

    return val;
}

__global__ void seg_warp_reduce(int* arr, int* parents,int* arr_len,int* res1,int* res2,int* res3,int* res4,int* res5)
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;

    // Valid indices are 0 .. arr_len[0]-1.  Out-of-range threads are kept
    // alive (no early return) so that every thread reaches the barriers,
    // but they never touch memory.
    bool active = tid < arr_len[0];

    const int warp_size = 32;
    int lane = threadIdx.x % warp_size;

    int val = active ? arr[tid] : 0;

    val = seg_max_step(arr, parents, res1, tid, lane,  1, active, val);
    val = seg_max_step(arr, parents, res2, tid, lane,  2, active, val);
    val = seg_max_step(arr, parents, res3, tid, lane,  4, active, val);
    val = seg_max_step(arr, parents, res4, tid, lane,  8, active, val);
    val = seg_max_step(arr, parents, res5, tid, lane, 16, active, val);
}
''')

In [25]:
# Handle to the compiled `seg_warp_reduce` kernel; called below with explicit
# block= and grid= launch dimensions.
func = mod.get_function('seg_warp_reduce')

In [26]:
# Every kernel parameter is a 32-bit `int*`, so upload int32 buffers no matter
# what the platform default integer width is.  The kernel overwrites gpu_data
# in place; `data` on the host keeps the original values.
gpu_data = gpuarray.to_gpu(data.astype(np.int32))

# One snapshot buffer per look-back step (strides 1, 2, 4, 8, 16).
res1 = gpuarray.empty_like(gpu_data)
res2 = gpuarray.empty_like(gpu_data)
res3 = gpuarray.empty_like(gpu_data)
res4 = gpuarray.empty_like(gpu_data)
res5 = gpuarray.empty_like(gpu_data)

# The kernel derives the lane from threadIdx.x % 32, so keep blocks a multiple of 32.
num_threads_per_block = 32
# Integer ceil-division: enough blocks to give every element one thread.
num_blocks = (int(stops[-1]) + num_threads_per_block - 1) // num_threads_per_block
func(gpu_data, gpu_parents, cuda.In(len_arr.astype(np.int32)),
     res1, res2, res3, res4, res5,
     block=(num_threads_per_block, 1, 1), grid=(num_blocks, 1))

In [27]:
# Copy the array the kernel modified in place back to the host.
kernel_data = gpu_data.get()
# stops-1 is the index of the last element of each event, which is where the
# kernel's running maximum for that event ends up.
max_arr = kernel_data[stops-1]
max_arr

array([9, 9, 9, 8, 9, 9, 9, 9, 8, 9, 7, 6, 9, 9, 5, 7, 9, 9, 7, 9, 9, 9, 9,
       9, 7, 8, 9, 9, 9, 7, 9, 8, 9, 9, 7, 8, 9, 8, 9, 9, 9, 8, 8, 7, 9, 7,
       9, 8, 9, 9])

In [28]:
# Segment boundaries for np.maximum.reduceat: a leading 0 followed by `stops`,
# stored with the same dtype as `offsets`.
reduce_at_offsets = np.concatenate(([0], stops)).astype(offsets.dtype)
# The final boundary equals len(data), which is not a valid index, so pull it
# back by one to point at the last element.
reduce_at_offsets[-1] -= 1

In [29]:
# Host-side reference: maximum of each segment delimited by reduce_at_offsets.
np_max = np.maximum.reduceat(data, reduce_at_offsets)
# reduce_at_offsets[-1] points AT the last element instead of one past it, so
# reduceat splits that element off into its own trailing segment and the last
# event's maximum (np_max[-2]) is computed without it.  Fold it back in; the
# array keeps its length so np_max[:-1] is still the per-event maxima.
np_max[-2] = max(np_max[-2], np_max[-1])

In [30]:
# Copy each device snapshot to the host once, instead of slicing the GPU
# arrays (and triggering a device-to-host copy) for every event and every line.
host_stages = [
    ("arr", data),
    ("res1", res1.get()),
    ("res2", res2.get()),
    ("res3", res3.get()),
    ("res4", res4.get()),
    ("res5", res5.get()),
]

# For every event show the input followed by the state after each kernel step.
for i, (start, stop) in enumerate(zip(starts, stops)):
    print(" \nEvent: {}\n".format(i+1))
    for label, values in host_stages:
        print(" {}: {}\n".format(label, values[start:stop]))

 
Event: 1

 arr: [2 9 2 6 5 5 8 6 3]

 res1: [2 9 9 6 6 5 8 8 6]

 res2: [2 9 9 9 9 6 8 8 8]

 res3: [2 9 9 9 9 9 9 9 9]

 res4: [2 9 9 9 9 9 9 9 9]

 res5: [2 9 9 9 9 9 9 9 9]

 
Event: 2

 arr: [9 6 9 6 8 7 2 7 8 8 5 6 0 6 3 1 9 5 0 6 8 3 4]

 res1: [9 9 9 9 8 8 7 7 8 8 8 6 6 6 6 3 9 9 5 6 8 8 4]

 res2: [9 9 9 9 9 9 8 8 8 8 8 8 8 6 6 6 9 9 9 9 8 8 8]

 res3: [9 9 9 9 9 9 9 9 9 9 8 8 8 8 8 8 9 9 9 9 9 9 9]

 res4: [9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9]

 res5: [9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9]

 
Event: 3

 arr: [1 5 4 7 1 9 5 1 6 6 0 1 8 0 8]

 res1: [1 5 5 7 7 9 9 5 6 6 6 1 8 8 8]

 res2: [1 5 5 7 7 9 9 9 9 6 6 6 8 8 8]

 res3: [1 5 5 7 7 9 9 9 9 9 9 9 9 8 8]

 res4: [1 5 5 7 7 9 9 9 9 9 9 9 9 9 9]

 res5: [1 5 5 7 7 9 9 9 9 9 9 9 9 9 9]

 
Event: 4

 arr: [8 2 3 8 5 6 0 6]

 res1: [8 8 3 8 8 6 6 6]

 res2: [8 8 8 8 8 8 8 6]

 res3: [8 8 8 8 8 8 8 8]

 res4: [8 8 8 8 8 8 8 8]

 res5: [8 8 8 8 8 8 8 8]

 
Event: 5

 arr: [0 8 5 3 4 8 3 1 4 9 0 9 3 1]

 res1: 

 res4: [1 1 1 1 1 9 9 9 9 9 9 9]

 res5: [1 1 1 1 1 9 9 9 9 9 9 9]

 
Event: 46

 arr: [2 1 2 5 5 7 7 2]

 res1: [2 2 2 5 5 7 7 7]

 res2: [2 2 2 5 5 7 7 7]

 res3: [2 2 2 5 5 7 7 7]

 res4: [2 2 2 5 5 7 7 7]

 res5: [2 2 2 5 5 7 7 7]

 
Event: 47

 arr: [5 2 6 1 8 9 8 3 6 9 8 4 4]

 res1: [5 5 6 6 8 9 9 8 6 9 9 8 4]

 res2: [5 5 6 6 8 9 9 9 9 9 9 9 9]

 res3: [5 5 6 6 8 9 9 9 9 9 9 9 9]

 res4: [5 5 6 6 8 9 9 9 9 9 9 9 9]

 res5: [5 5 6 6 8 9 9 9 9 9 9 9 9]

 
Event: 48

 arr: [3 2 4 2 4 7 8 7 3 1 0]

 res1: [3 3 4 4 4 7 8 8 7 3 1]

 res2: [3 3 4 4 4 7 8 8 8 8 7]

 res3: [3 3 4 4 4 7 8 8 8 8 8]

 res4: [3 3 4 4 4 7 8 8 8 8 8]

 res5: [3 3 4 4 4 7 8 8 8 8 8]

 
Event: 49

 arr: [6 7 5 2 1 1 6 9 8 1]

 res1: [6 7 7 5 2 1 6 9 9 8]

 res2: [6 7 7 7 7 5 6 9 9 9]

 res3: [6 7 7 7 7 7 7 9 9 9]

 res4: [6 7 7 7 7 7 7 9 9 9]

 res5: [6 7 7 7 7 7 7 9 9 9]

 
Event: 50

 arr: [4 3 8 3 9 7 2]

 res1: [4 4 8 8 9 9 7]

 res2: [4 4 8 8 9 9 9]

 res3: [4 4 8 8 9 9 9]

 res4: [4 4 8 8 9 9 9]

 res5: [

In [31]:
# Indices of the events whose GPU result differs from the NumPy reference.
# The kernel only combines elements inside one group of 32 consecutive lanes,
# so events that straddle a multiple-of-32 boundary in `data` can show up here.
poor_vals = np.nonzero(max_arr != np_max[:-1])
poor_vals

(array([11, 14, 34, 37], dtype=int64),)