# Product with non-rejection code

Based on the original product index notation, amended to include a non-rejection mode so that the number of threads launched can be optimized. It now runs `num_particles*num_particles*num_events` threads, in contrast to `max(num_particles)*max(num_particles)*num_events` threads in the original code.

In [1]:
import numpy
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.compiler import *

In [2]:
NUMEVENTS = 100    # How many events to simulate.
AVENUMJETS = 32    # Poisson mean of the per-event jet count.

# Draw a jet multiplicity for every event.
numjets = numpy.random.poisson(AVENUMJETS, NUMEVENTS).astype(numpy.int32)

# Jagged-array bookkeeping: event e owns the flat index range
# [jets_starts[e], jets_stops[e]).
jets_stops = numpy.cumsum(numjets).astype(numpy.int32)
# Each event starts where the previous one stopped, i.e. stop minus own count.
jets_starts = jets_stops - numjets

In [3]:
# Upload the per-event index ranges to the device.
gpu_starts = gpuarray.to_gpu(jets_starts)
gpu_stops = gpuarray.to_gpu(jets_stops)
# Per-event jet counts, computed on the GPU from the uploaded arrays.
gpu_counts = gpu_stops - gpu_starts

# The last stop is the total number of jets across all events.
NUMPARTICLES = jets_stops[-1]

# Scalars are wrapped in one-element int32 arrays.
arr_numevents = numpy.array([NUMEVENTS], dtype=numpy.int32)
arr_numparticles = numpy.array([NUMPARTICLES], dtype=numpy.int32)

In [4]:
# An event with c jets contributes c*c ordered pairs.  pairs_offsets[e] is the
# running total of pairs before event e, with the grand total in the last slot.
pairs_offsets = numpy.concatenate(
    ([0], numpy.cumsum(numpy.square(jets_stops - jets_starts)))
).astype(numpy.int32)
gpu_pairs_offsets = gpuarray.to_gpu(pairs_offsets)

### Idea

Run:
- Thread i over x dimension, for all particles.
- Thread j over y dimension, over all particles.
- Thread k over the z dimension, over all events. (Potentially very risky, as only 64 blocks are allowed in the z dimension on my machine. So, for more events, it should be restricted, or multiple kernels need to be run.)

Alternatively, we can run two sets of threads, one for events and one for pairs_offsets. This, however, would require a modulus operation to determine the particle indices, so it is avoided here.

In [5]:
# CUDA kernel that writes, for every event, the full cartesian product of its
# particle indices into the flat `left` / `right` output arrays.
mod = SourceModule('''
// left, right  : flat output arrays, one slot per (i, j) pair of every event
// starts, stops: per-event half-open range [starts[k], stops[k]) of particle indices
// counts       : per-event particle count
// pairs        : per-event offset of the event's first pair in left/right
// NUMEVENTS, NUMPARTICLES: one-element arrays holding the totals
__global__ void product(int* left,int* right,int* starts,int* stops,int* counts,int* pairs,int* NUMEVENTS,int* NUMPARTICLES)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;   // candidate left particle index
    int j = blockIdx.y*blockDim.y + threadIdx.y;   // candidate right particle index
    int k = blockIdx.z*blockDim.z + threadIdx.z;   // event index

    // i and j index particles, k indexes events; each is checked against
    // its own limit.
    if (k<NUMEVENTS[0] && i<NUMPARTICLES[0] && j<NUMPARTICLES[0])
    {
        int start = starts[k];
        int stop = stops[k];

        // Only threads whose (i, j) both fall inside event k write a pair.
        if (start<=i && i<stop && start<=j && j<stop)
        {
            // Row-major position of (i, j) inside event k's block of pairs.
            int out = pairs[k] + (i-start)*counts[k] + (j-start);
            left[out] = i;
            right[out] = j;
        }
    }
}
''')

In [6]:
# Fetch a handle to the `product` kernel from the module built above.
func = mod.get_function("product")

In [7]:
# Host-side output buffers with one int32 slot per pair over all events.
# numpy.empty leaves the contents uninitialised.
left, right = (
    numpy.empty(pairs_offsets[-1], dtype=numpy.int32) for _ in range(2)
)

In [8]:
# Number of 8-wide blocks needed to cover every particle / every event
# (integer ceiling division).
NUMBLOCKS_x = (int(NUMPARTICLES) + 7) // 8
NUMBLOCKS_z = (int(NUMEVENTS) + 7) // 8

In [9]:
# Launch the kernel: x and y span particle indices, z spans events.
func(
    cuda.InOut(left),
    cuda.InOut(right),
    gpu_starts,
    gpu_stops,
    gpu_counts,
    gpu_pairs_offsets,
    cuda.In(arr_numevents),
    cuda.In(arr_numparticles),
    block=(8, 8, 8),
    grid=(NUMBLOCKS_x, NUMBLOCKS_x, NUMBLOCKS_z),
)

In [11]:
# Show the left/right pair indices of the first six events.
for i in range(6):
    # Event i owns the half-open range of pair slots between consecutive offsets.
    event_pairs = slice(pairs_offsets[i], pairs_offsets[i + 1])
    print("\nEvent {}\nLeft {}\nRight {}\n".format(i, left[event_pairs], right[event_pairs]))


Event 0
Left [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  2  2  2  2  2  2  2  2  2  2  2  2
  2  2  2  2  2  2  2  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3
  3  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  5  5  5  5  5
  5  5  5  5  5  5  5  5  5  5  5  5  5  5  6  6  6  6  6  6  6  6  6  6  6
  6  6  6  6  6  6  6  6  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
  7  7  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  9  9  9  9
  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
 11 11 11 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 13 13 13
 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 14 14 14 14 14 14 14 14 14
 14 14 14 14 14 14 14 14 14 14 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15
 15 15 15 15 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 17 1