In [1]:
import numpy
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.compiler import *

In [2]:
NUMEVENTS = 320   # Number of events to simulate.
AVENUMJETS = 32   # Mean of the Poisson distribution used for jets per event.

# Draw a jet count for every event.
numjets = numpy.random.poisson(AVENUMJETS, NUMEVENTS).astype(numpy.int32)

# Exclusive end offset of each event's jets in the flattened jet list.
jets_stops = numpy.cumsum(numjets).astype(numpy.int32)

# Start offset of each event: its stop offset minus its own jet count, i.e.
# 0 for the first event and the previous event's stop for every later one.
jets_starts = jets_stops - numjets

In [3]:
# Host-side output buffer with one int32 slot per particle. It is pre-filled
# with -1 (never a valid event index) instead of being left uninitialized, so
# any slot the GPU kernel fails to write is recognizable afterwards.
pointer_cuda = numpy.full(jets_stops[-1], -1, dtype=numpy.int32)

In [26]:
# Total number of particles (jets) across all events: the last entry of the
# cumulative `jets_stops` array.
NUMPARTICLES = jets_stops[-1]

### Working

Launch a 2D grid of threads: index `i` runs over all particles (their total is known, `jets_stops[-1]`) and index `j` runs over all events.

Each thread compares `i` with `starts[j]` and `stops[j]`; if `starts[j] <= i < stops[j]`, particle `i` belongs to event `j`, so `j` is written to `pointer[i]`.

In [27]:
# Compile the CUDA kernel `parents`.
#
# Each thread derives a 2D global index: `i` from the x-dimension (particle
# index) and `j` from the y-dimension (event index). Threads whose `i` or `j`
# is out of range (>= NUMPARTICLES[0] or >= NUMEVENTS[0]) do nothing. A thread
# writes `j` into `pointer[i]` only when `i` lies in the half-open range
# [starts[j], stops[j]).
#
# NUMEVENTS and NUMPARTICLES are read through int pointers (element 0), not
# passed as by-value scalars.
mod = SourceModule('''
__global__ void parents(int* pointer,int* starts,int* stops,int* NUMEVENTS,int* NUMPARTICLES)
{
    // i for particles, j for events.
    
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;
    
    if (i<NUMPARTICLES[0] && j<NUMEVENTS[0])
    {
        // There will be horrible warp divergence here. But I don't see anyway around it.
        if(starts[j]<=i && i<stops[j])
        {
            pointer[i] = j;
        }
    }
}
''')

In [28]:
# Fetch a callable handle to the compiled `parents` kernel from the module.
func = mod.get_function("parents")

In [29]:
# Scalar launch parameters, wrapped as one-element int32 arrays because the
# kernel reads them through int pointers.
arr_numevents = numpy.array([NUMEVENTS], dtype=numpy.int32)
arr_numparticles = numpy.array([NUMPARTICLES], dtype=numpy.int32)

# Upload the host arrays to device memory.
gpu_starts, gpu_stops, gpu_pointer = (
    gpuarray.to_gpu(host_array)
    for host_array in (jets_starts, jets_stops, pointer_cuda)
)

# Calculate the per-event counts array on the GPU.
gpu_counts = gpu_stops - gpu_starts

In [30]:
# Number of 32-wide blocks needed along each grid axis, rounded up with
# integer arithmetic so a partial final block still covers the remainder.
NUMBLOCKS_x = (int(NUMPARTICLES) + 31) // 32
NUMBLOCKS_y = (int(NUMEVENTS) + 31) // 32

In [31]:
# Launch the kernel on a 2D grid of 32x32-thread blocks: x covers particles,
# y covers events. The two size arrays are host arrays copied in via cuda.In.
func(
    gpu_pointer,
    gpu_starts,
    gpu_stops,
    cuda.In(arr_numevents),
    cuda.In(arr_numparticles),
    block=(32, 32, 1),
    grid=(NUMBLOCKS_x, NUMBLOCKS_y, 1),
)

In [33]:
# Reference output buffer for the CPU computation, one int32 slot per particle.
# It is pre-filled with -1 (never a valid event index) instead of being left
# uninitialized, so a slot that is never assigned stays recognizable.
pointer_seq = numpy.full(jets_stops[-1], -1, dtype=numpy.int32)
# Sequential (CPU) reference implementation
def parent(starts, stops, pointers):
    """Write each event's index into the slots of its particles.

    For every event index ``k``, the slice ``pointers[starts[k]:stops[k]]``
    is set to ``k``. ``pointers`` is modified in place; nothing is returned.
    """
    for event_index, (first, last) in enumerate(zip(starts, stops)):
        pointers[first:last] = event_index
# Fill pointer_seq in place with the event index of every particle (CPU reference result).
parent(jets_starts, jets_stops, pointer_seq)

In [34]:
# Compare the GPU result with the sequential reference.
# First copy the GPU output back to the host.
host_pointer_data = gpu_pointer.get()
# Element-wise comparison: silent when the arrays are equal, raises
# AssertionError (with a mismatch report) otherwise.
# Comparing `a.all() == b.all()` would only compare two booleans ("are all
# elements nonzero?") and therefore cannot detect differing contents.
numpy.testing.assert_array_equal(host_pointer_data, pointer_seq)