In [1]:
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import *
import pycuda.driver as cuda

In [2]:
# Synthetic jagged layout: NUMEVENTS events, each holding a Poisson-distributed
# number of jets (mean AVENUMJETS), stored back to back in one flat array.
NUMEVENTS = 500
AVENUMJETS = 100

numjets = np.random.poisson(AVENUMJETS, NUMEVENTS)

# Index arrays are 32-bit on purpose: the CUDA kernels declare their arguments as
# `int*`, and the `np.int` alias used previously no longer exists in NumPy >= 1.24.
# offsets[i]:offsets[i+1] is the slice of the flat array that belongs to event i.
offsets = np.zeros(NUMEVENTS + 1, dtype=np.int32)
offsets[1:] = np.cumsum(numjets)

# Independent copies, so that modifying one of them never aliases `offsets`.
starts = offsets[:-1].copy()   # first index of every event
stops = offsets[1:].copy()     # one-past-the-last index of every event

In [3]:
# One random value in [0, 10) per jet. Generated directly as int32 because the CUDA
# kernels take `int*`, and because the old `np.int` alias was removed in NumPy 1.24.
data = np.random.randint(low=0, high=10, size=stops[-1], dtype=np.int32)

In [4]:
# parents[j] is the index of the event that flat element j belongs to.
# Event i is repeated (offsets[i+1] - offsets[i]) times, which fills exactly the
# slice offsets[i]:offsets[i+1] with i -- the same result as looping over events.
# int32 matches the `int* parents` kernel argument (and `np.int` is gone in NumPy >= 1.24).
parents = np.repeat(
    np.arange(len(offsets) - 1, dtype=np.int32),
    offsets[1:] - offsets[:-1],
)

In [5]:
mod = SourceModule('''

// Both kernels perform an in-place, segmented, inclusive max-scan:
// one thread per element, and at every step a thread folds in the value `stride`
// positions to its left, provided that element has the same parent (same segment).
// Strides double each step: 1, 2, 4, ... while stride < num_particles[0].
//
// Arguments (identical for both kernels):
//   arr           values to scan; overwritten with the running per-segment maximum
//   offsets       not read by the kernels; kept so the launch signature is unchanged
//   parents       segment id of every element of arr
//   num_particles one-element array holding the number of elements in arr
//
// NOTE(review): __syncthreads() is a barrier for the threads of ONE block only.
// Nothing here orders the steps of different blocks against each other, so a
// segment that crosses a block boundary depends on how the blocks happen to be
// scheduled. A strict version would launch one kernel per stride (or use a
// grid-wide barrier); that changes the launch interface, so it is only flagged here.

__global__ void heele_max(int* arr,int* offsets,int* parents, int* num_particles)
{
    const unsigned int tid = threadIdx.x + blockIdx.x*blockDim.x;
    const int count = num_particles[0];
    const unsigned int n = count > 0 ? (unsigned int)count : 0u;

    // Valid indices are 0 .. n-1. Threads past the end must not touch memory, but
    // they still have to reach every __syncthreads() below, so they are masked
    // with a flag instead of returning early.
    const bool active = tid < n;

    // Step d uses stride 2^d. The stride is computed with an integer shift rather
    // than pow()/log2(), so it is exact and needs no double-precision math.
    for (unsigned int d = 0; (1u << d) < n; d++)
    {
        const unsigned int stride = 1u << d;
        if (active && tid >= stride && parents[tid] == parents[tid - stride])
        {
            arr[tid] = max(arr[tid], arr[tid - stride]);
        }
        // The loop bound depends only on n, so every thread of the block gets here.
        __syncthreads();
    }
}

__global__ void heele_max2(int* arr,int* offsets,int* parents, int* num_particles)
{
    const unsigned int tid = threadIdx.x + blockIdx.x*blockDim.x;
    const int count = num_particles[0];
    const unsigned int n = count > 0 ? (unsigned int)count : 0u;

    // Same masking as in heele_max: out-of-range threads stay in the loop so the
    // barrier is reached by the whole block, but they never access memory.
    const bool active = tid < n;

    // Same scan, with the stride itself as the loop variable (doubled every step).
    for (unsigned int stride = 1; stride < n; stride *= 2)
    {
        if (active && tid >= stride && parents[tid] == parents[tid - stride])
        {
            arr[tid] = max(arr[tid], arr[tid - stride]);
        }
        __syncthreads();
    }
}

''')

kernel.cu



In [6]:
# Fetch the callable handles of both compiled kernels by their __global__ names.
func1, func2 = (
    mod.get_function(kernel_name)
    for kernel_name in ('heele_max', 'heele_max2')
)

In [7]:

#len_arr = gpuarray.to_gpu(np.array([len(data)]).astype(np.int))
#gpu_data = gpuarray.to_gpu(data)
#gpu_parents = gpuarray.to_gpu(parents)
#gpu_offsets = gpuarray.to_gpu(offsets)
#numthreads = 512
#numblocks = int(np.ceil(stops[-1]/numthreads))


In [8]:
#%%timeit
#func1(gpu_data, gpu_offsets, gpu_parents, len_arr, block=(numthreads,1,1), grid=(numblocks,1))

In [9]:
# Upload everything the kernels need. Every device buffer is forced to int32 here,
# because the kernels declare all their arguments as `int*`; a wider host dtype
# would be misread element by element on the device. (`np.int`, used previously,
# was also removed in NumPy 1.24.) astype(copy=False) is a no-op for arrays that
# are already int32.
len_arr = gpuarray.to_gpu(np.array([len(data)], dtype=np.int32))
gpu_data = gpuarray.to_gpu(data.astype(np.int32, copy=False))
gpu_parents = gpuarray.to_gpu(parents.astype(np.int32, copy=False))
gpu_offsets = gpuarray.to_gpu(offsets.astype(np.int32, copy=False))

# Launch geometry: one thread per element, rounded up to whole blocks using integer
# ceil-division (no float round trip). At least one block, so the grid is never empty.
numthreads = 512
numblocks = max(1, (len(data) + numthreads - 1) // numthreads)

In [10]:
%%timeit
func2(gpu_data, gpu_offsets, gpu_parents, len_arr, block=(numthreads,1,1), grid=(numblocks,1))

114 µs ± 449 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [11]:
# Bring the array that the kernel updated in place back from the device, so the
# per-event results can be picked out with ordinary NumPy indexing below.
host_data = gpu_data.get()

In [12]:
# The kernel leaves a running maximum within each event, so the last slot of every
# event (index stops - 1) holds that event's maximum.
# NOTE: this assumes no event is empty; for an empty event stops - 1 would point at
# the last element of the previous event instead.
max_arr = host_data[stops - 1]
#max_arr

array([9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9,

In [13]:
# CPU reference: per-event maximum computed with NumPy.
# reduceat requires every index to be < len(data), so the final offset
# (which equals len(data)) is pulled back by one.
reduce_at_offsets = np.zeros_like(offsets)
reduce_at_offsets[1:] = stops
reduce_at_offsets[-1] -= 1
np_max = np.maximum.reduceat(data, reduce_at_offsets)

# Pulling the final index back removes the very last element of `data` from the
# last event's slice; it ends up alone in the extra trailing entry np_max[-1].
# Fold it back in so that np_max[:-1] really is the maximum of every event.
# The shape of np_max (one entry per event plus the trailing one) is unchanged.
np_max[-2] = max(np_max[-2], np_max[-1])

In [14]:
# Check: an empty index array means the GPU result matches the NumPy reference for every event.
#np.nonzero(np_max[:-1]-max_arr)

(array([], dtype=int64),)