# CUDA version of gen-reco matching

This is priliminary, extremely buggy and unoptimized CUDA version of gen-reco matching. This is based on the sequential code developed earlier. 

In [1]:
import numpy
import pycuda.driver as cuda
from pycuda.compiler import *
import pycuda.autoinit

In [2]:
NUMEVENTS = 100      # exact number of events
AVENUMJETS = 10      # average number of jets per event
PHILOW = -numpy.pi   # bounding box of phi (azimuthal angle) and eta (~polar angle)
PHIHIGH = numpy.pi
ETALOW = -5
ETAHIGH = 5
ERRPHI = 0.01        # detector resolution
ERRETA = 0.01
RECOPROB = 0.95      # probability of not reconstructing a real jet
AVENUMFAKES = 1      # average number of spurious (fake) recontstructions

# simulate the generator-level jets
numgenjets = numpy.random.poisson(AVENUMJETS, NUMEVENTS)
genstops = numpy.cumsum(numgenjets).astype(numpy.int32)
genstarts = numpy.empty_like(genstops).astype(numpy.int32)
genstarts[0] = 0
genstarts[1:] = genstops[:-1]
genphi = numpy.random.uniform(PHILOW, PHIHIGH, genstops[-1]).astype(numpy.float32)
geneta = numpy.random.uniform(ETALOW, ETAHIGH, genstops[-1]).astype(numpy.float32)

# simulate mismeasurement (error in reconstructing phi and eta)
phiwitherr = genphi + numpy.random.normal(0, ERRPHI, genstops[-1]).astype(numpy.float32)
etawitherr = geneta + numpy.random.normal(0, ERRETA, genstops[-1]).astype(numpy.float32)

# simulate inefficiency in reconstruction (missing real jets)
recomask = (numpy.random.uniform(0, 1, genstops[-1]) < RECOPROB)

# simulate spurious (fake) jets per event
numfakes = numpy.random.poisson(AVENUMFAKES, NUMEVENTS)
fakestops = numpy.cumsum(numfakes).astype(numpy.int32)
fakestarts = numpy.empty_like(fakestops).astype(numpy.int32)
fakestarts[0] = 0
fakestarts[1:] = fakestops[:-1]
fakephi = numpy.random.uniform(PHILOW, PHIHIGH, fakestops[-1]).astype(numpy.float32)
fakeeta = numpy.random.uniform(ETALOW, ETAHIGH, fakestops[-1]).astype(numpy.float32)

# fill reconstructed data arrays
recostarts = numpy.empty_like(genstarts)
recostops = numpy.empty_like(genstops)
recophi = numpy.empty(recomask.sum() + numfakes.sum(), dtype=genphi.dtype)
recoeta = numpy.empty_like(recophi)

truematches = []
recostart, recostop = 0, 0

In [3]:
for i in range(NUMEVENTS):
    genstart, genstop = genstarts[i], genstops[i]
    fakestart, fakestop = fakestarts[i], fakestops[i]
    mask = recomask[genstart:genstop]

    phi = phiwitherr[genstart:genstop][mask]    # generated phi with error and mask
    eta = etawitherr[genstart:genstop][mask]    # generated eta with error and mask

    # concatenate the subset of real jets with some fake jets
    holdphi = numpy.concatenate((phi, fakephi[fakestart:fakestop]))
    holdeta = numpy.concatenate((eta, fakeeta[fakestart:fakestop]))
    recostop += len(holdphi)

    # gen-level and reco-level data are both unordered sets; randomly permute
    order = numpy.random.permutation(recostop - recostart)
    recophi[recostart:recostop][order] = holdphi
    recoeta[recostart:recostop][order] = holdeta

    # keep that permutation to use as a "true match" map (not known to physicist!)
    '''
    truematch = numpy.ones(genstop - genstart, dtype=numgenjets.dtype) * -1
    truematch[mask] = order[:mask.sum()]
    truematches.append(truematch)
    '''
    recostarts[i] = recostart
    recostops[i] = recostop
    recostart = recostop

In [4]:
gen_len = (genstops - genstarts).astype(numpy.int32)
reco_len = (recostops - recostarts).astype(numpy.int32)

pairs_len = numpy.zeros(NUMEVENTS+1, dtype = numpy.int32)
pairs_len[1:] = numpy.cumsum(gen_len*reco_len).astype(numpy.int32)

In [5]:
# CUDA source
mod = SourceModule('''
#include <cmath>        // Not needed, but added to check external includes in PyCUDA 
extern "C"{

__device__ float deltaeta(float eta1,float eta2)
{
    return eta1-eta2;       // It is simple subtraction.
}

__device__ float deltaphi(float phi1, float phi2)
{
    return phi1-phi2;      // Here, we need to consider the case that phi1-phi2 is constrained 
                           // between [-pi, pi].
}

__global__ void deltaR(float* deltar,float* eta1, float* eta2, float* phi1, float* phi2,int* length, int* start1, int* start2, int* pairs_len,int* lengths1, int* lengths2)
{
    // 3 dimensional indices
    // We will use hypotf() builtin CUDA function for the calculation of deltaR. 
    // The indices are generated in same way as combinations solved earlier. This indices are passeed onto
    // eta and phi arrays.
    
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;
    int k = blockIdx.z*blockDim.z + threadIdx.z;
    if (i <length[0]){
        if(j<lengths1[i] && k<lengths2[i]){
            deltar[pairs_len[i] + j*lengths2[i] + k] = hypotf(deltaeta(eta1[j+start1[i]], eta2[k+start2[i]]), deltaphi(phi1[j + start1[i]], phi2[k+start2[i]]));
        }
    }
}
}
''', no_extern_c = True)

In [6]:
deltar = numpy.empty(pairs_len[-1], dtype=numpy.float32)
# Number of events = base_len
base_len = numpy.array([NUMEVENTS]).astype(numpy.int32)
deltar_func = mod.get_function("deltaR")

In [7]:
deltar_func(cuda.InOut(deltar),cuda.In(geneta), cuda.In(recoeta), cuda.In(genphi), cuda.In(recophi), 
    cuda.In(base_len),cuda.In(genstarts), cuda.In(recostarts), cuda.In(pairs_len), cuda.In(gen_len), cuda.In(reco_len), block=(1,8,8),grid = (len(numgenjets), 20, 20))

In [8]:
# Print the values of deltaR
for i in range(3):
    print("Event:{} \n {}\n".format(i, deltar[pairs_len[i]:pairs_len[i+1]]))

Event:0 
 [  1.45053221e-02   4.28466749e+00   4.13513994e+00   2.85659313e+00
   3.81150079e+00   5.35130167e+00   3.36524320e+00   1.41364861e+00
   7.45716619e+00   8.19147396e+00   5.36781263e+00   3.97514868e+00
   1.56485677e+00   2.58731508e+00   4.36883163e+00   1.22029902e-02
   2.08710122e+00   4.17141104e+00   2.46515703e+00   2.83509970e+00
   3.38990140e+00   2.69973516e+00   1.49357820e+00   5.31340957e-01
   3.70838976e+00   2.05880141e+00   2.09596027e-02   2.11392832e+00
   4.08884907e+00   4.84721661e+00   4.67329121e+00   6.79580450e+00
   3.02902222e+00   4.13212585e+00   8.83964479e-01   4.38963938e+00
   4.16162539e+00   4.64199448e+00   6.78741693e+00   6.53875923e+00
   3.79568100e+00   6.24024153e+00   2.85679579e+00   3.58475232e+00
   9.98045504e-03   4.35947227e+00   3.71087337e+00   3.87054706e+00
   6.82675123e+00   6.78507519e+00   7.46255493e+00   4.66751194e+00
   4.02129889e+00   4.60060596e+00   6.81891060e+00   2.47262549e+00
   4.09998417e+00   6.11

In [9]:
mod2 = SourceModule('''

__global__ void truematches(float* deltar,int* truematches, int* start1,int* start2,int* pairs_len, int* length, int*lengths1, int* lengths2)
{
    // Calculate the global event index i, and 1st particle index j
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;
    
    if (i<length[0])
    {
        if (j<lengths1[i])
        {
            __shared__ int min_idx[512];
            // Calculate min index. Here is where the problem lies. Can't find an efficient reduction procedure
            min_idx[j] = start2[i];
            __syncthreads();
            for (int k=1; k<lengths2[i]; k++)
            {
                if (deltar[pairs_len[i] + j*lengths2[i]+k] < deltar[pairs_len[i]+j*lengths2[i]+min_idx[j]-start2[i]])
                {
                    min_idx[j] = k;
                }
            }
            __syncthreads();
            truematches[j+start1[i]] = min_idx[j] + start2[i];
            __syncthreads();
        }
    }
}
''')

In [10]:
truematch_func = mod2.get_function("truematches")

In [11]:
truematches = numpy.ones(genstops[-1], dtype=numpy.int32)*-1

In [12]:
truematch_func(cuda.In(deltar), cuda.InOut(truematches), cuda.In(genstarts), cuda.In(recostarts),
              cuda.In(pairs_len), cuda.In(base_len), cuda.In(gen_len), cuda.In(reco_len),
              block=(1,8,1), grid=(len(numgenjets), 20, 1))

In [13]:

for i in range(6):
    print("\n Event: {}\n".format(i))
    print(" Generator Level index              Reconstructed Level index\n")
    for j in range(genstarts[i], genstops[i]):
        if truematches[j] != -1 :
            print("      {}                  ->                   {}".format(j, truematches[j]))
        else:
            print("      {}                  ->                   {}".format(j, '-'))


 Event: 0

 Generator Level index              Reconstructed Level index

      0                  ->                   0
      1                  ->                   5
      2                  ->                   6
      3                  ->                   4
      4                  ->                   4
      5                  ->                   8
      6                  ->                   3
      7                  ->                   9
      8                  ->                   1
      9                  ->                   2

 Event: 1

 Generator Level index              Reconstructed Level index

      10                  ->                   17
      11                  ->                   17
      12                  ->                   20
      13                  ->                   11
      14                  ->                   14
      15                  ->                   18
      16                  ->                   16
      17            