# CUDA version of gen-reco matching

This is an unoptimized CUDA version of gen-reco matching. This is based on the sequential code developed earlier. 

#### Performance Issues I can think of in the CUDA code:
1. Warp size is not considered, so warp divergence is possible.
2. Each track (as David said in the problem statement) is assigned a separate dimension. So, if the tracks are large in number, we will run out of threads.
3. It contains a loop to find the index of the minimum. I don't know whether there is a reduction algorithm that finds the index of the minimum (and not the minimum itself).

In [57]:
import numpy
import pycuda.driver as cuda
from pycuda.compiler import *
import pycuda.autoinit

In [58]:
# ---- simulation parameters ----
NUMEVENTS = 100                          # exact number of events
AVENUMJETS = 10                          # Poisson mean of the number of jets per event
PHILOW, PHIHIGH = -numpy.pi, numpy.pi    # bounding box of phi (azimuthal angle)
ETALOW, ETAHIGH = -5, 5                  # bounding box of eta (~polar angle)
ERRPHI = 0.01                            # detector resolution in phi
ERRETA = 0.01                            # detector resolution in eta
RECOPROB = 0.95                          # probability that a real jet IS reconstructed (see recomask)
AVENUMFAKES = 1                          # Poisson mean of spurious (fake) reconstructions per event

# ---- generator-level jets ----
# Jets of all events are stored back to back; event i owns [genstarts[i], genstops[i]).
numgenjets = numpy.random.poisson(AVENUMJETS, NUMEVENTS)
genstops = numpy.cumsum(numgenjets).astype(numpy.int32)
genstarts = numpy.concatenate(([0], genstops[:-1])).astype(numpy.int32)
total_gen = genstops[-1]
genphi = numpy.random.uniform(PHILOW, PHIHIGH, total_gen).astype(numpy.float32)
geneta = numpy.random.uniform(ETALOW, ETAHIGH, total_gen).astype(numpy.float32)

# ---- mismeasurement: Gaussian smearing of phi and eta ----
phi_noise = numpy.random.normal(0, ERRPHI, total_gen).astype(numpy.float32)
phiwitherr = genphi + phi_noise
eta_noise = numpy.random.normal(0, ERRETA, total_gen).astype(numpy.float32)
etawitherr = geneta + eta_noise

# ---- reconstruction inefficiency: True marks a real jet that is kept ----
recomask = numpy.random.uniform(0, 1, total_gen) < RECOPROB

# ---- spurious (fake) jets, laid out per event like the generator jets ----
numfakes = numpy.random.poisson(AVENUMFAKES, NUMEVENTS)
fakestops = numpy.cumsum(numfakes).astype(numpy.int32)
fakestarts = numpy.concatenate(([0], fakestops[:-1])).astype(numpy.int32)
total_fakes = fakestops[-1]
fakephi = numpy.random.uniform(PHILOW, PHIHIGH, total_fakes).astype(numpy.float32)
fakeeta = numpy.random.uniform(ETALOW, ETAHIGH, total_fakes).astype(numpy.float32)

# ---- reconstructed-level containers (contents are filled by the event loop) ----
recostarts = numpy.empty(genstarts.shape, dtype=genstarts.dtype)
recostops = numpy.empty(genstops.shape, dtype=genstops.dtype)
total_reco = int(recomask.sum()) + int(numfakes.sum())   # kept real jets + fakes
recophi = numpy.empty(total_reco, dtype=genphi.dtype)
recoeta = numpy.empty_like(recophi)

truematches = []
recostart = recostop = 0

In [59]:
# Build the reconstructed-level arrays one event at a time: keep the real jets
# that survived recomask, append that event's fake jets, then shuffle the result.
for i, (genstart, genstop, fakestart, fakestop) in enumerate(
        zip(genstarts, genstops, fakestarts, fakestops)):
    mask = recomask[genstart:genstop]

    # smeared generator values of the jets that were actually reconstructed
    phi = phiwitherr[genstart:genstop][mask]
    eta = etawitherr[genstart:genstop][mask]

    # real jets first, fakes behind them
    holdphi = numpy.concatenate((phi, fakephi[fakestart:fakestop]))
    holdeta = numpy.concatenate((eta, fakeeta[fakestart:fakestop]))
    recostop += len(holdphi)

    # gen-level and reco-level data are unordered sets, so scatter this event's
    # jets to randomly permuted slots inside [recostart, recostop)
    order = numpy.random.permutation(recostop - recostart)
    recophi[recostart + order] = holdphi
    recoeta[recostart + order] = holdeta

    # the permutation doubles as a "true match" map (not known to physicist!);
    # that bookkeeping is currently disabled
    '''
    truematch = numpy.ones(genstop - genstart, dtype=numgenjets.dtype) * -1
    truematch[mask] = order[:mask.sum()]
    truematches.append(truematch)
    '''
    # record this event's reco range; the next event starts where this one stopped
    recostarts[i], recostops[i] = recostart, recostop
    recostart = recostop

In [60]:
# Number of generator-level and reconstructed jets in each event.
gen_len = numpy.subtract(genstops, genstarts).astype(numpy.int32)
reco_len = numpy.subtract(recostops, recostarts).astype(numpy.int32)

# Exclusive prefix sum of the per-event (gen x reco) pair counts:
# pairs_len[i] is where event i's pairs start, pairs_len[-1] is the total count.
pairs_len = numpy.concatenate(([0], numpy.cumsum(gen_len * reco_len))).astype(numpy.int32)

In [61]:
# CUDA source for deltaR.
# The kernel computes deltaR = hypot(delta eta, delta phi) for every
# (generator jet, reconstructed jet) pair that belongs to the same event.

mod = SourceModule('''
#include <cmath>        // Not needed, but added to check external includes in PyCUDA 
extern "C"{

// Difference in eta: a plain subtraction.
__device__ float deltaeta(float eta1,float eta2)
{
    return eta1-eta2;
}

// Difference in the azimuthal angle phi, wrapped into [-pi, pi].
// phi is periodic, so two jets on either side of the +-pi seam are close
// together even though their raw difference is almost 2*pi.
__device__ float deltaphi(float phi1, float phi2)
{
    const float TWO_PI = 6.283185307179586f;
    // remainderf(x, y) returns x - n*y with n the integer nearest to x/y,
    // so the result lies in [-y/2, y/2] = [-pi, pi].
    return remainderf(phi1 - phi2, TWO_PI);
}

// deltar   : output, one value per (gen, reco) pair, events stored back to back
// eta1/phi1: first collection (indexed through start1/lengths1)
// eta2/phi2: second collection (indexed through start2/lengths2)
// length   : length[0] is the number of events
// pairs_len: pairs_len[i] is the offset of event i's first pair inside deltar
__global__ void deltaR(float* deltar,float* eta1, float* eta2, float* phi1, float* phi2,int* length, int* start1, int* start2, int* pairs_len,int* lengths1, int* lengths2)
{
    // 3 dimensional indices: i = event, j = jet within collection 1,
    // k = jet within collection 2. These indices are passed on to the
    // eta and phi arrays.
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;
    int k = blockIdx.z*blockDim.z + threadIdx.z;

    if (i >= length[0]) return;

    // Read the per-event sizes once instead of on every use.
    const int n1 = lengths1[i];
    const int n2 = lengths2[i];
    if (j >= n1 || k >= n2) return;

    const int idx1 = start1[i] + j;
    const int idx2 = start2[i] + k;

    // Row-major layout inside the event: row j (collection 1), column k (collection 2).
    deltar[pairs_len[i] + j*n2 + k] = hypotf(deltaeta(eta1[idx1], eta2[idx2]),
                                             deltaphi(phi1[idx1], phi2[idx2]));
}
}
''', no_extern_c = True)

In [62]:
# Host-side output buffer: one float32 slot per (gen, reco) pair over all events.
deltar = numpy.empty((pairs_len[-1],), numpy.float32)
# The kernels read the event count from a one-element int32 array.
base_len = numpy.array([NUMEVENTS], dtype=numpy.int32)
# Handle to the compiled deltaR kernel.
deltar_func = mod.get_function("deltaR")

In [63]:
# Launch layout: x indexes events (one per block), y indexes generator jets,
# z indexes reconstructed jets.
deltar_block = (1, 8, 8)
# Size the grid from the largest event so that every (gen, reco) pair gets a
# thread no matter how many jets an event holds; a fixed grid would silently
# leave pairs uncomputed for large events and launch idle blocks for small ones.
# -(-a // b) is ceiling division; CUDA requires every grid dimension to be >= 1.
deltar_grid_y = max(1, -(-int(gen_len.max()) // deltar_block[1]))
deltar_grid_z = max(1, -(-int(reco_len.max()) // deltar_block[2]))

# deltar is write-only for the kernel, so cuda.Out avoids uploading the
# uninitialised host buffer before the launch.
deltar_func(cuda.Out(deltar), cuda.In(geneta), cuda.In(recoeta), cuda.In(genphi), cuda.In(recophi),
    cuda.In(base_len), cuda.In(genstarts), cuda.In(recostarts), cuda.In(pairs_len), cuda.In(gen_len), cuda.In(reco_len),
    block=deltar_block, grid=(len(numgenjets), deltar_grid_y, deltar_grid_z))

In [71]:
# Truematches calculation. It emulates the sequential code: for every jet of
# the first collection, store the global index of the jet of the second
# collection (same event) with the smallest deltaR.

mod2 = SourceModule('''

// deltar     : deltaR values, events back to back, row-major per event
// truematches: output, one entry per jet of the first collection
// start1/2   : per-event start offsets of the two collections
// pairs_len  : pairs_len[i] is the offset of event i's first pair inside deltar
// length     : length[0] is the number of events
// lengths1/2 : per-event sizes of the two collections
__global__ void truematches(float* deltar,int* truematches, int* start1,int* start2,int* pairs_len, int* length, int*lengths1, int* lengths2)
{
    // Global event index i, and index j of the jet within the first collection.
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;

    if (i >= length[0]) return;
    if (j >= lengths1[i]) return;

    const int n2 = lengths2[i];
    const int out = j + start1[i];

    // No candidate in this event: store the "no match" value -1 instead of
    // an index that belongs to another event.
    if (n2 <= 0)
    {
        truematches[out] = -1;
        return;
    }

    // Each thread scans its own row of deltaR values, so the running minimum
    // must be a per-thread local variable. (A __shared__ variable here would be
    // written concurrently by every thread of the block, and a __syncthreads()
    // inside this data-dependent branch would not be reached by all threads.)
    const float* row = deltar + pairs_len[i] + j*n2;
    int best_k = 0;
    float best_dr = row[0];
    for (int k = 1; k < n2; k++)
    {
        // Strict '<' keeps the first of several equal minima.
        if (row[k] < best_dr)
        {
            best_dr = row[k];
            best_k = k;
        }
    }

    // Convert the event-local column back to a global index.
    truematches[out] = best_k + start2[i];
}
''')

kernel.cu



In [72]:
# Look up the compiled "truematches" kernel in mod2 so it can be launched from Python.
truematch_func = mod2.get_function("truematches")

In [73]:
# One slot per generator-level jet, pre-filled with -1 (printed as '-' = no match).
truematches = numpy.full(genstops[-1], -1, dtype=numpy.int32)

In [74]:
# Launch layout: x indexes events (one per block), y indexes generator jets.
match_block = (1, 8, 1)
# Derive the number of y-blocks from the largest event so that every generator
# jet gets a thread; a fixed count would silently skip jets of large events.
# -(-a // b) is ceiling division; CUDA requires every grid dimension to be >= 1.
match_grid_y = max(1, -(-int(gen_len.max()) // match_block[1]))

truematch_func(cuda.In(deltar), cuda.InOut(truematches), cuda.In(genstarts), cuda.In(recostarts),
              cuda.In(pairs_len), cuda.In(base_len), cuda.In(gen_len), cuda.In(reco_len),
              block=match_block, grid=(len(numgenjets), match_grid_y, 1))

In [77]:
# Show the gen -> reco assignment found on the GPU for the first three events.
print("CUDA Output ")
for i in range(3):
    print("\n Event: {}\n".format(i))
    print(" Generator Level index              Reconstructed Level index\n")
    for j in range(genstarts[i], genstops[i]):
        # -1 marks a generator jet without a match; display it as a dash
        shown = truematches[j] if truematches[j] != -1 else '-'
        print("      {}                  ->                   {}".format(j, shown))

CUDA Output 

 Event: 0

 Generator Level index              Reconstructed Level index

      0                  ->                   7
      1                  ->                   5
      2                  ->                   3
      3                  ->                   8
      4                  ->                   10
      5                  ->                   6
      6                  ->                   13
      7                  ->                   12
      8                  ->                   11
      9                  ->                   9
      10                  ->                   4
      11                  ->                   2
      12                  ->                   3
      13                  ->                   0

 Event: 1

 Generator Level index              Reconstructed Level index

      14                  ->                   14
      15                  ->                   15
      16                  ->                   16

 Event

In [69]:
# Sequential version. Test it with CUDA version to see if the results are right.
# For each generator jet, walk its row of deltaR values (as produced by the
# deltaR kernel) and remember the global reco index of the smallest one.

truematches_seq = numpy.full(genstops[-1], -1, dtype=numpy.int32)
for i in range(NUMEVENTS):
    reco_first = recostarts[i]
    n_reco = reco_len[i]
    for row, j in enumerate(range(genstarts[i], genstops[i])):
        # offset of this generator jet's row inside the flat deltar array
        row_base = pairs_len[i] + row * n_reco
        # start from the event's first reco jet; strict '<' keeps the earliest minimum
        best = reco_first
        for offset in range(n_reco):
            if deltar[row_base + offset] < deltar[row_base + (best - reco_first)]:
                best = reco_first + offset
        truematches_seq[j] = best

In [78]:
# Show the gen -> reco assignment of the sequential reference for the first three events.
print("Sequential output")
for i in range(3):
    print("\n Event: {}\n".format(i))
    print(" Generator Level index              Reconstructed Level index\n")
    for j in range(genstarts[i], genstops[i]):
        # -1 marks a generator jet without a match; display it as a dash
        shown = truematches_seq[j] if truematches_seq[j] != -1 else '-'
        print("      {}                  ->                   {}".format(j, shown))

Sequential output

 Event: 0

 Generator Level index              Reconstructed Level index

      0                  ->                   7
      1                  ->                   5
      2                  ->                   3
      3                  ->                   8
      4                  ->                   10
      5                  ->                   6
      6                  ->                   13
      7                  ->                   12
      8                  ->                   11
      9                  ->                   9
      10                  ->                   4
      11                  ->                   2
      12                  ->                   3
      13                  ->                   0

 Event: 1

 Generator Level index              Reconstructed Level index

      14                  ->                   14
      15                  ->                   15
      16                  ->                   16

 