In [125]:
import numpy as np
import pycuda.autoinit
from pycuda.compiler import *
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import numba

In [126]:
NUMEVENTS = 50
AVENUMPARTICLES = 10

numjets = np.random.poisson(AVENUMPARTICLES, NUMEVENTS)
stops = np.cumsum(numjets, dtype=np.int)
starts = np.zeros_like(stops)                              # Starts array
starts[1:] = stops[:-1]
offsets = np.zeros(len(numjets)+1, dtype=np.int)
offsets[1:] = stops
data = np.random.randint(low=0,high=10,size=stops[-1], dtype=np.int)

In [127]:
gpu_offsets = gpuarray.to_gpu_async(offsets)

In [128]:
@numba.jit()
def vectorized_search(offsets, content):
    index = np.arange(len(content), dtype=int)                     # threadIdx.x on CUDA
    below = np.zeros(len(content), dtype=int)                      # just below = 0 on CUDA
    above = np.ones(len(content), dtype=int) * (len(offsets) - 1)  # same for above
    while True:
        middle = (below + above) // 2

        change_below = offsets[middle + 1] <= index                   # which "belows" must we change?
        change_above = offsets[middle] > index                        # which "aboves"?

        if not np.bitwise_or(change_below, change_above).any():    # neither? great! we're done!
            break
        else:
            below = np.where(change_below, middle + 1, below)      # vectorized "if" statement
            above = np.where(change_above, middle - 1, above)      # this is the only branch

    return middle

In [129]:
parents = vectorized_search(offsets, data)

In [130]:
gpu_parents = gpuarray.to_gpu(parents)

In [131]:
mod = SourceModule('''
__global__ void seg_warp_reduce(int* arr, int* parents,int* arr_len)
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    
    if (tid > arr_len[0])
        return;
    
    int thread_id = threadIdx.x;
    int warp_size = 32;
    int lane = thread_id%warp_size;
    
    if (lane >=1 && parents[tid]==parents[tid-1])
      arr[tid] = max(arr[tid], arr[tid-1]);
     
    if (lane >=2 && parents[tid]==parents[tid-2])
     arr[tid] = max(arr[tid], arr[tid-2]);
     
    if (lane >=4 && parents[tid]==parents[tid-4])
        arr[tid] = max(arr[tid], arr[tid-4]);
      
    if (lane >=8 && parents[tid]==parents[tid-8])
        arr[tid] = max(arr[tid], arr[tid-8]);
      
    if (lane >=16 && parents[tid]==parents[tid-16])
        arr[tid] = max(arr[tid], arr[tid-16]);
       
    
}
''')

kernel.cu



In [132]:
func = mod.get_function('seg_warp_reduce')

In [133]:
gpu_data = gpuarray.to_gpu(data)
num_threads_per_block = 32
num_blocks = int(np.ceil(stops[-1]/num_threads_per_block))
func(gpu_data,gpu_parents,cuda.In(len_arr), block=(num_threads_per_block,1,1), grid=(int(num_blocks),1))

In [134]:
kernel_data = gpu_data.get()
max_arr = kernel_data[stops-1]
max_arr

array([9, 8, 5, 4, 1, 3, 6, 4, 6, 4, 3, 5, 2, 0, 6, 9, 5, 0, 9, 5, 1, 8, 2,
       5, 9, 1, 1, 2, 1, 3, 3, 7, 1, 0, 4, 9, 0, 3, 7, 3, 1, 2, 9, 0, 6, 6,
       5, 9, 6, 0])

In [135]:
reduce_at_offsets = np.zeros_like(offsets)
reduce_at_offsets[1:] = stops-1

In [136]:
np_max = np.maximum.reduceat(data, reduce_at_offsets)

In [137]:
for i in range(len(starts)):
    print (" \narr{}: {}\nmax from gpu: {}   max from numpy: {} \n".format(i+1, data[starts[i]:stops[i]], kernel_data[starts[i]:stops[i]],np_max[i] ))

 
arr1: [0 9 6 8]
max from gpu: [0 9 9 9]   max from numpy: 9 

 
arr2: [0 1 6 9 8 8 7 8]
max from gpu: [0 1 6 9 9 9 7 8]   max from numpy: 9 

 
arr3: [9 3 7 1 0 4 5 1 0 2 9 6 6 5]
max from gpu: [9 3 7 1 0 4 5 1 0 2 9 6 6 5]   max from numpy: 9 

 
arr4: [0 9 4 0 0 7 3 9 4]
max from gpu: [0 9 4 0 0 7 3 9 4]   max from numpy: 9 

 
arr5: [3 5 2 0 8 7 6 9 1 3 3 1 0 1]
max from gpu: [3 5 2 0 8 7 6 9 1 3 3 1 0 1]   max from numpy: 9 

 
arr6: [6 7 9 0 2 5 8 3 8 5 7 0 6 3]
max from gpu: [6 7 9 0 2 5 8 3 8 5 7 0 6 3]   max from numpy: 9 

 
arr7: [7 0 2 6]
max from gpu: [7 0 2 6]   max from numpy: 7 

 
arr8: [6 2 0 9 4 0 1 8 7 3 6 1 4]
max from gpu: [6 2 0 9 4 0 1 8 7 3 6 1 4]   max from numpy: 9 

 
arr9: [7 6 3 4 5 4 6]
max from gpu: [7 6 3 4 5 4 6]   max from numpy: 7 

 
arr10: [1 8 7 2 4 7 6 9 7 4]
max from gpu: [1 8 7 2 4 7 6 9 7 4]   max from numpy: 9 

 
arr11: [9 3 0 4 3 1 5 8 4 3]
max from gpu: [9 3 0 4 3 1 5 8 4 3]   max from numpy: 9 

 
arr12: [9 5 6 7 3 1 6 0 9 5]
max from gp