In [1]:
# Display the SparkSession; `spark` is not created in this notebook, so it is
# presumably injected by the notebook environment -- TODO confirm.
spark

## Load ROOT files

In [2]:
# Branches selected from the ROOT files; everything else is dropped at load time.
requiredColumns = [
    "EFlowTrack",
    "MuonTight_size",
    "Electron_size",
    "EFlowNeutralHadron",
    "EFlowPhoton",
    "Electron",
    "MuonTight",
    "MissingET",
    "Jet",
]

In [3]:
# Load every file matching qcd*.root through the spark-root data source and
# keep only the columns listed in requiredColumns.
df = (
    spark.read.format("org.dianahep.sparkroot.experimental")
    .load("file:///afs/cern.ch/work/m/migliori/public/SparkPipeline/data/qcd*.root")
    .select(*requiredColumns)
)

In [4]:
# Print the (nested) schema of the selected columns.
df.printSchema()

root
 |-- EFlowTrack: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- fUniqueID: integer (nullable = true)
 |    |    |-- fBits: integer (nullable = true)
 |    |    |-- PID: integer (nullable = true)
 |    |    |-- Charge: integer (nullable = true)
 |    |    |-- PT: float (nullable = true)
 |    |    |-- Eta: float (nullable = true)
 |    |    |-- Phi: float (nullable = true)
 |    |    |-- EtaOuter: float (nullable = true)
 |    |    |-- PhiOuter: float (nullable = true)
 |    |    |-- X: float (nullable = true)
 |    |    |-- Y: float (nullable = true)
 |    |    |-- Z: float (nullable = true)
 |    |    |-- T: float (nullable = true)
 |    |    |-- XOuter: float (nullable = true)
 |    |    |-- YOuter: float (nullable = true)
 |    |    |-- ZOuter: float (nullable = true)
 |    |    |-- TOuter: float (nullable = true)
 |    |    |-- Dxy: float (nullable = true)
 |    |    |-- SDxy: float (nullable = true)
 |    |    |-- Xd: float (nullable = 

In [5]:
import numpy as np

### Filter muons and electrons

In [6]:
# Keep rows whose MuonTight_size or Electron_size is non-zero.
filtered = df.where((df.MuonTight_size != 0) | (df.Electron_size != 0))

In [7]:
# Number of rows loaded, before the lepton-size filter.
df.count()

35101

In [8]:
# Number of rows left after requiring a non-zero MuonTight_size or Electron_size.
filtered.count()

29780

## Filter tracks

In [9]:
# Bring the EFlowTrack column of a single row to the driver for inspection
# (take(1) returns a list holding one Row).
tracks = df.select('EFlowTrack').take(1)

In [10]:
# Number of entries in the EFlowTrack array of that row.
len(tracks[0].EFlowTrack)

390

In [11]:
# Show the fields of the first track of that row.
tracks[0].EFlowTrack[0]

Row(fUniqueID=10596, fBits=50331664, PID=-211, Charge=-1, PT=0.4648737907409668, Eta=-0.3981272280216217, Phi=-2.126897096633911, EtaOuter=-3.879678964614868, PhiOuter=-1.96621572971344, X=0.0, Y=0.0, Z=36.448448181152344, T=2.1116360049422411e-10, XOuter=-47.7633056640625, YOuter=-114.42939758300781, ZOuter=-3000.0, TOuter=2.8104524929517538e-08, Dxy=-0.0, SDxy=0.0, Xd=0.0, Yd=-0.0, Zd=36.448448181152344, EFlowTrack_Particle=Row(TObject=Row(fUniqueID=0, fBits=65536)))

Default code is

```Python
def ChPtMapp(DR, event):
    pTmap = []
    for h in event.EFlowTrack:
        if h.PT<= 0.5: continue
        pTmap.append([h.Eta, h.Phi, h.PT])
    return np.asarray(pTmap)
```

Hence we can use a Spark SQL higher-order function (HOF) to filter out all the particles in `EFlowTrack` with $p_T\leq0.5$

In [12]:
# Register the lepton-filtered rows as the SQL view "events" so the queries
# below can refer to it by name. (A stray trailing `b` after the call made
# this cell a SyntaxError; it has been removed.)
filtered.createOrReplaceTempView("events")

In [13]:
# For the first 5 rows, compare the number of tracks before and after keeping
# only the tracks with PT > 0.5 (FILTER is applied element-wise to the array).
spark.sql("""
SELECT cardinality(EFlowTrack),
    cardinality(FILTER(EFlowTrack,
        tracks -> tracks.PT > 0.5
    )) EFlowTrack_Filtered
FROM events
""").show(5)

+----------------+-------------------+
|size(EFlowTrack)|EFlowTrack_Filtered|
+----------------+-------------------+
|             390|                283|
|             491|                347|
|             398|                302|
|             388|                297|
|             136|                 96|
+----------------+-------------------+
only showing top 5 rows



We can do the same thing for the others, for example consider the photons

In [14]:
# Same comparison for photons: array size before and after keeping ET > 1.
spark.sql("""
SELECT cardinality(EFlowPhoton),
    cardinality(FILTER(EFlowPhoton,
        photon -> photon.ET > 1
    )) EFlowPhoton_Filtered
FROM events
""").show(5)

+-----------------+--------------------+
|size(EFlowPhoton)|EFlowPhoton_Filtered|
+-----------------+--------------------+
|              556|                  78|
|              635|                  89|
|              509|                  76|
|              555|                  68|
|              190|                  30|
+-----------------+--------------------+
only showing top 5 rows



and also neutral hadrons

In [15]:
# Same comparison for neutral hadrons: array size before and after keeping ET > 1.
spark.sql("""
SELECT cardinality(EFlowNeutralHadron),
    cardinality(FILTER(EFlowNeutralHadron,
        hadron -> hadron.ET > 1
    )) EFlowNeutralHadron_Filtered
FROM events
""").show(5)

+------------------------+---------------------------+
|size(EFlowNeutralHadron)|EFlowNeutralHadron_Filtered|
+------------------------+---------------------------+
|                     419|                        124|
|                     450|                        128|
|                     377|                        110|
|                     412|                        126|
|                     150|                         49|
+------------------------+---------------------------+
only showing top 5 rows



During the simulation of the trigger we require $p_T > 23\,\text{GeV}$ for muons and electrons

## EXAMPLE

In [3]:
# Load every *.root file in the data directory (not only qcd*.root) and
# (re)register the result as the SQL view "events".
events = (
    spark.read.format("org.dianahep.sparkroot.experimental")
    .load("file:///afs/cern.ch/work/m/migliori/public/SparkPipeline/data/*.root")
    .select(*requiredColumns)
)

events.createOrReplaceTempView("events")

In [4]:
# For rows with at least one electron or tight muon, compare the number of
# leptons before and after the PT > 23 requirement (first 5 rows only).
spark.sql("""
SELECT 
    cardinality(Electron),
    cardinality(MuonTight),
    cardinality(FILTER(Electron,
        electron -> electron.PT > 23
    )) Electron_Filtered,
    cardinality(FILTER(MuonTight,
        muon -> muon.PT > 23
    )) Muon_Filtered
FROM events
WHERE MuonTight_size > 0 OR  Electron_size > 0
""").show(5)

+--------------+---------------+-----------------+-------------+
|size(Electron)|size(MuonTight)|Electron_Filtered|Muon_Filtered|
+--------------+---------------+-----------------+-------------+
|             1|              0|                1|            0|
|             0|              1|                0|            1|
|             0|              1|                0|            0|
|             0|              1|                0|            1|
|             0|              1|                0|            1|
+--------------+---------------+-----------------+-------------+
only showing top 5 rows



In [None]:
# Lepton filter.
# Inner query: keep rows with a non-zero lepton size and replace the Electron /
# MuonTight arrays by their PT > 23 subsets.
# Outer query: drop the rows in which both filtered arrays ended up empty.
filteredLeptons = spark.sql("""
SELECT *
FROM (
    SELECT 
        EFlowTrack,
        EFlowNeutralHadron,
        EFlowPhoton ,
        MissingET,
        Jet,
        FILTER(Electron, 
            electron -> electron.PT > 23
        ) Electron,
        FILTER(MuonTight,
            muon -> muon.PT > 23
        ) MuonTight
    FROM events
    WHERE MuonTight_size > 0 OR  Electron_size > 0
) Filtered 
WHERE cardinality(Electron) > 0 
      OR cardinality(MuonTight) > 0
""")

In [6]:
# Register the lepton-filtered rows as the SQL view "filteredLeptons".
# createOrReplaceTempView (rather than createTempView) keeps the cell
# re-runnable: createTempView raises if the view name already exists.
filteredLeptons.createOrReplaceTempView("filteredLeptons")

In [7]:
# Reduce the number of objects per row:
#   tracks with PT > 0.5, neutral hadrons and photons with ET > 1.0,
#   jets with PT > 30.0 and |Eta| < 2.6 (exposed under the new name "Jets").
# MissingET and the already-filtered leptons are passed through unchanged.
filteredDF = spark.sql("""
SELECT 
    FILTER(EFlowTrack,
        tracks -> tracks.PT > 0.5
    ) EFlowTrack, 
    
    FILTER(EFlowNeutralHadron,
        hadron -> hadron.ET > 1.0
    ) EFlowNeutralHadron,
    
    FILTER(EFlowPhoton,
        photon -> photon.ET > 1.0
    ) EFlowPhoton,
    
    FILTER(Jet,
        jet -> ((jet.PT>30.0) AND (ABS(jet.Eta)<2.6)) 
    ) Jets,
    
    MissingET,
    Electron,
    MuonTight
FROM filteredLeptons
""")

In [8]:
# Register the object-filtered rows as the SQL view "filteredDF".
filteredDF.createOrReplaceTempView("filteredDF")

In [9]:
# Reduce the number of features per object: every element of each array is
# replaced by a map from field name to value that holds only the listed fields.
# All values of one map share a single type; the schema printed below shows
# them as float (so e.g. the integer PID / Charge fields end up as floats).
# Output columns are renamed to Tracks, Photons and NeutralHadrons.
reduced = spark.sql("""
SELECT
     TRANSFORM(EFlowTrack,
     track -> map_from_arrays(
        Array("PT", "Eta", "Phi", "PID", "X", "Y", "Z"),
        Array(track.PT, track.Eta, track.Phi, track.PID, track.X, track.Y, track.Z)
        )
     ) Tracks,
     
     TRANSFORM(EFlowPhoton,
     photon -> map_from_arrays(
        Array("ET", "Eta", "Phi"),
        Array(photon.ET, photon.Eta, photon.Phi)
        )
     ) Photons,
     
     TRANSFORM(EFlowNeutralHadron,
     hadron -> map_from_arrays(
        Array("ET", "Eta", "Phi"),
        Array(hadron.ET, hadron.Eta, hadron.Phi)
        )
     ) NeutralHadrons,
     
     TRANSFORM(MissingET,
     missingET -> map_from_arrays(
        Array("MET", "Phi"),
        Array(missingET.MET, missingET.Phi)
        )
     ) MissingET,
     
     TRANSFORM(Jets,
     jet -> map_from_arrays(
        Array("PT", "BTag"),
        Array(jet.PT, jet.BTag)
        )
     ) Jets,
     
     TRANSFORM(Electron,
     electron -> map_from_arrays(
        Array("PT", "Eta", "Phi", "Charge"),
        Array(electron.PT, electron.Eta, electron.Phi, electron.Charge)
        )
     ) Electron,
     
     TRANSFORM(MuonTight,
     muon -> map_from_arrays(
        Array("PT", "Eta", "Phi", "Charge"),
        Array(muon.PT, muon.Eta, muon.Phi,muon.Charge)
        )
     ) MuonTight
     
FROM filteredDF
""")

In [10]:
# Check the schema of the reduced frame (arrays of string -> float maps).
reduced.printSchema()

root
 |-- Tracks: array (nullable = true)
 |    |-- element: map (containsNull = false)
 |    |    |-- key: string
 |    |    |-- value: float (valueContainsNull = true)
 |-- Photons: array (nullable = true)
 |    |-- element: map (containsNull = false)
 |    |    |-- key: string
 |    |    |-- value: float (valueContainsNull = true)
 |-- NeutralHadrons: array (nullable = true)
 |    |-- element: map (containsNull = false)
 |    |    |-- key: string
 |    |    |-- value: float (valueContainsNull = true)
 |-- MissingET: array (nullable = true)
 |    |-- element: map (containsNull = false)
 |    |    |-- key: string
 |    |    |-- value: float (valueContainsNull = true)
 |-- Jets: array (nullable = true)
 |    |-- element: map (containsNull = false)
 |    |    |-- key: string
 |    |    |-- value: float (valueContainsNull = true)
 |-- Electron: array (nullable = true)
 |    |-- element: map (containsNull = false)
 |    |    |-- key: string
 |    |    |-- value: float (valueContainsNull =

In [19]:
from pyspark.sql import Row
from pyspark.sql.functions import pandas_udf, udf, col
from pyspark.sql.types import ArrayType, DoubleType
import numpy as np

def _pf_isolations(q, TrkPtMap, NeuPtMap, PhotonPtMap):
    """Return the (charged, neutral-hadron, photon) PFIso of ``q`` in a 0.3 cone."""
    return (PFIso(q, 0.3, TrkPtMap, True),
            PFIso(q, 0.3, NeuPtMap, False),
            PFIso(q, 0.3, PhotonPtMap, False))


def convert(event):
    """Build the high-level features (HLF) of one row of ``reduced``.

    Parameters
    ----------
    event : Row
        Row with the fields ``Tracks``, ``Photons``, ``NeutralHadrons``,
        ``MissingET``, ``Jets``, ``Electron`` and ``MuonTight``; each is a
        sequence of mappings from field name to value.

    Returns
    -------
    Row
        ``Row(hlf=[HT, MET, phiMET, MT, nJets, nBjets, LepPt, LepEta, LepPhi,
        LepIsoCh, LepIsoGamma, LepIsoNeu, LepCharge, LepIsEle])`` when
        ``selection`` finds an isolated lepton, otherwise an empty ``Row()``
        (callers drop those with ``len(row) > 0``).

    Notes
    -----
    A (801, 19) particle matrix is filled along the way: the selected lepton
    first, then up to 450 tracks, 150 photons and 200 neutral hadrons. Each row
    is ``[E, Px, Py, Pz, PT|ET, Eta, Phi, X, Y, Z, isoCh, isoGamma, isoNeu,
    isTrack, isNeutralHadron, isPhoton, isElectron, isMuon, charge]``.
    NOTE(review): the matrix is not part of the returned Row -- confirm whether
    it should be.
    """
    MAX_TRACKS, MAX_PHOTONS, MAX_NEUTRALS = 450, 150, 200

    q = LorentzVector()
    # One row for the lepton plus the per-category caps (1 + 450 + 150 + 200 = 801).
    particles = np.zeros((1 + MAX_TRACKS + MAX_PHOTONS + MAX_NEUTRALS, 19))
    index = 0

    TrkPtMap = ChPtMapp(event.Tracks)
    NeuPtMap = NeuPtMapp(event.NeutralHadrons)
    PhotonPtMap = PhotonPtMapp(event.Photons)

    ## Lepton filter
    selected, lep, _ = selection(event.Electron, event.MuonTight,
                                 TrkPtMap, NeuPtMap, PhotonPtMap)
    if not selected:
        return Row()
    particles[index] = lep
    # Advance past the lepton row. The original had a bare `index` expression
    # here, so the first track overwrote the lepton.
    index += 1
    lepMomentum = LorentzVector(lep[1], lep[2], lep[3], lep[0])

    ## Charged tracks (the cap counts only the tracks that are actually stored)
    nTrk = 0
    for h in event.Tracks:
        if nTrk >= MAX_TRACKS:
            break
        q.SetPtEtaPhiM(h["PT"], h["Eta"], h["Phi"], 0.)
        # Skip the track that coincides with the selected lepton.
        if lepMomentum.DeltaR(q) > 0.0001:
            pfisoCh, pfisoNeu, pfisoGamma = _pf_isolations(
                q, TrkPtMap, NeuPtMap, PhotonPtMap)
            particles[index] = [q.E(), q.Px(), q.Py(), q.Pz(),
                                h["PT"], h["Eta"], h["Phi"], h["X"], h["Y"], h["Z"],
                                pfisoCh, pfisoGamma, pfisoNeu,
                                1., 0., 0., 0., 0., np.sign(h["PID"])]
            nTrk += 1
            index += 1

    ## Photons (every photon is stored, so the cap is a plain slice)
    for h in event.Photons[:MAX_PHOTONS]:
        q.SetPtEtaPhiM(h["ET"], h["Eta"], h["Phi"], 0.)
        pfisoCh, pfisoNeu, pfisoGamma = _pf_isolations(
            q, TrkPtMap, NeuPtMap, PhotonPtMap)
        particles[index] = [q.E(), q.Px(), q.Py(), q.Pz(),
                            h["ET"], h["Eta"], h["Phi"], 0., 0., 0.,
                            pfisoCh, pfisoGamma, pfisoNeu,
                            0., 0., 1., 0., 0., 0.]
        index += 1

    ## Neutral hadrons
    for h in event.NeutralHadrons[:MAX_NEUTRALS]:
        q.SetPtEtaPhiM(h["ET"], h["Eta"], h["Phi"], 0.)
        pfisoCh, pfisoNeu, pfisoGamma = _pf_isolations(
            q, TrkPtMap, NeuPtMap, PhotonPtMap)
        particles[index] = [q.E(), q.Px(), q.Py(), q.Pz(),
                            h["ET"], h["Eta"], h["Phi"], 0., 0., 0.,
                            pfisoCh, pfisoGamma, pfisoNeu,
                            0., 1., 0., 0., 0., 0.]
        index += 1

    ## Build high level features
    myMET = event.MissingET[0]
    MET = myMET["MET"]
    phiMET = myMET["Phi"]
    # NOTE(review): no square root is taken here, so this looks like the
    # transverse mass squared -- confirm which one downstream code expects.
    MT = 2.*MET*lepMomentum.Pt()*(1-math.cos(lepMomentum.Phi()-phiMET))
    HT = 0.
    nJets = 0.
    nBjets = 0.
    for jet in event.Jets:
        nJets += 1
        HT += jet["PT"]
        if jet["BTag"] > 0:
            nBjets += 1

    # Lepton features, read back from the 19-entry lepton row.
    LepPt = lep[4]
    LepEta = lep[5]
    LepPhi = lep[6]
    LepIsoCh = lep[10]
    LepIsoGamma = lep[11]
    LepIsoNeu = lep[12]
    LepCharge = lep[18]
    LepIsEle = lep[16]

    hlf = [HT, MET, phiMET, MT, nJets, nBjets, LepPt, LepEta, LepPhi,
           LepIsoCh, LepIsoGamma, LepIsoNeu, LepCharge, LepIsEle]

    return Row(hlf=hlf)

In [20]:
# Run convert on every row, drop the empty Rows it returns for rows without a
# selected lepton, and collect the remaining HLF rows on the driver (timed).
%time collected = reduced.rdd.map(convert).filter(lambda row: len(row) > 0).toDF().collect()

CPU times: user 504 ms, sys: 130 ms, total: 634 ms
Wall time: 5min 44s


**Time reduced from 11 min to 5 min 44 s**

In [12]:
# Number of rows loaded from all *.root files.
events.count()

205097

In [13]:
# Number of rows left after the lepton and object filters.
reduced.count()

103539

## Functions

In [14]:
import math
def PFIso(p, DR, PtMap, subtractPt):
    """Relative isolation of ``p``: summed pT inside a cone, divided by ``p.Pt()``.

    Parameters
    ----------
    p : object exposing ``Pt()``, ``Eta()`` and ``Phi()``
        The candidate whose isolation is computed.
    DR : float
        Cone radius in the (eta, phi) plane.
    PtMap : numpy.ndarray of shape (n, 3)
        One row per particle with columns ``[eta, phi, pT]``.
    subtractPt : bool
        If true, 1 is subtracted from the ratio (i.e. ``p.Pt()/p.Pt()``).
        Presumably this removes the candidate's own entry from ``PtMap`` --
        verify against the caller.

    Returns
    -------
    float
        The isolation value, or ``0.`` when ``p.Pt() <= 0``.
    """
    pt = p.Pt()
    if pt <= 0.:
        return 0.
    DeltaEta = PtMap[:, 0] - p.Eta()
    # Wrap the azimuthal difference into [-pi, pi). The previous code only
    # wrapped differences beyond +-2*pi, which cannot occur for phi values in
    # [-pi, pi], so particles on the other side of the phi = +-pi seam were
    # never counted inside the cone.
    DeltaPhi = (PtMap[:, 1] - p.Phi() + math.pi) % (2. * math.pi) - math.pi
    isInCone = DeltaPhi*DeltaPhi + DeltaEta*DeltaEta < DR*DR
    Iso = PtMap[isInCone, 2].sum() / pt
    if subtractPt:
        Iso = Iso - 1
    return float(Iso)

# get the selected tracks
def ChPtMapp(Tracks):
    """Return an (n, 3) float array with one ``[Eta, Phi, PT]`` row per track."""
    n_tracks = len(Tracks)
    eta_phi_pt = np.zeros((n_tracks, 3))
    # Iterating a 2-D array yields row views, so writing into `row` fills the array.
    for row, trk in zip(eta_phi_pt, Tracks):
        row[:] = [trk["Eta"], trk["Phi"], trk["PT"]]
    return eta_phi_pt

# get the selected neutrals
def NeuPtMapp(NeutralHadrons):
    """Return an (n, 3) float array with one ``[Eta, Phi, ET]`` row per neutral hadron."""
    n_hadrons = len(NeutralHadrons)
    eta_phi_et = np.zeros((n_hadrons, 3))
    # Iterating a 2-D array yields row views, so writing into `row` fills the array.
    for row, hadron in zip(eta_phi_et, NeutralHadrons):
        row[:] = [hadron["Eta"], hadron["Phi"], hadron["ET"]]
    return eta_phi_et

# get the selected photons
def PhotonPtMapp(Photons):
    """Return an (n, 3) float array with one ``[Eta, Phi, ET]`` row per photon."""
    n_photons = len(Photons)
    eta_phi_et = np.zeros((n_photons, 3))
    # Iterating a 2-D array yields row views, so writing into `row` fills the array.
    for row, photon in zip(eta_phi_et, Photons):
        row[:] = [photon["Eta"], photon["Phi"], photon["ET"]]
    return eta_phi_et

In [15]:
def _first_isolated_lepton(leptons, flavour_flags, TrkPtMap, NeuPtMap, PhotonPtMap):
    """Return the 19-entry row of the first lepton with summed PFIso < 0.45, else None."""
    l = LorentzVector()
    for lepton in leptons:
        l.SetPtEtaPhiM(lepton["PT"], lepton["Eta"], lepton["Phi"], 0.)

        pfisoCh = PFIso(l, 0.3, TrkPtMap, True)
        pfisoNeu = PFIso(l, 0.3, NeuPtMap, False)
        pfisoGamma = PFIso(l, 0.3, PhotonPtMap, False)
        if (pfisoCh+pfisoNeu+pfisoGamma) < 0.45:
            # Only the first isolated candidate is ever used, so stop here
            # instead of computing isolations for the remaining leptons.
            return ([l.E(), l.Px(), l.Py(), l.Pz(), l.Pt(), l.Eta(), l.Phi(),
                     0., 0., 0., pfisoCh, pfisoGamma, pfisoNeu,
                     0., 0., 0.]
                    + flavour_flags
                    + [float(lepton["Charge"])])
    return None


def selection(Electron, MuonTight, TrkPtMap, NeuPtMap, PhotonPtMap):
    """Pick the isolated lepton(s) of an event.

    For each flavour, the first candidate whose charged + neutral-hadron +
    photon PFIso (0.3 cone) sums to less than 0.45 is kept.

    Parameters
    ----------
    Electron, MuonTight : sequence of mappings
        Candidates with the keys ``"PT"``, ``"Eta"``, ``"Phi"`` and ``"Charge"``.
    TrkPtMap, NeuPtMap, PhotonPtMap : numpy.ndarray
        ``[eta, phi, pT]`` maps passed straight to ``PFIso``.

    Returns
    -------
    tuple
        ``(True, lepton, other)`` when at least one flavour has an isolated
        candidate (``other`` is the candidate of the other flavour, or
        ``None``), and ``(False, None, None)`` otherwise. A lepton is the list
        ``[E, Px, Py, Pz, Pt, Eta, Phi, 0, 0, 0, isoCh, isoGamma, isoNeu,
        0, 0, 0, isElectron, isMuon, charge]``.
    """
    foundEle = _first_isolated_lepton(Electron, [1., 0.],
                                      TrkPtMap, NeuPtMap, PhotonPtMap)
    foundMuon = _first_isolated_lepton(MuonTight, [0., 1.],
                                       TrkPtMap, NeuPtMap, PhotonPtMap)

    if foundEle is not None and foundMuon is not None:
        # NOTE(review): index 5 is Eta, so the leading lepton is chosen by
        # pseudorapidity; index 4 (Pt) may have been intended -- confirm.
        if foundEle[5] > foundMuon[5]:
            return True, foundEle, foundMuon
        return True, foundMuon, foundEle
    if foundEle is not None:
        return True, foundEle, foundMuon
    if foundMuon is not None:
        return True, foundMuon, foundEle
    return False, None, None

In [16]:
def Phi_mpi_pi(x):
    """Wrap the angle ``x`` (radians) into the interval [-pi, pi).

    Uses ``math.pi`` instead of the truncated constant 3.1415: the truncated
    value shifted angles by 2*3.1415 rather than 2*pi (an error of ~1.9e-4 rad)
    and wrapped values in [3.1415, pi) that were already in range.
    """
    two_pi = 2.0 * math.pi
    while x >= math.pi:
        x -= two_pi
    while x < -math.pi:
        x += two_pi
    return x

In [17]:
class LorentzVector(object):
    """Minimal four-vector with the subset of accessors used in this notebook.

    Components are stored as ``x``, ``y``, ``z`` (momentum) and ``t`` (energy).
    The method names follow ROOT's TLorentzVector, which this class replaces.
    """

    def __init__(self, *args):
        """Create the vector from ``(x, y, z, t)``, or as the zero vector.

        With no arguments all components are initialised to 0.0, so the
        accessors are usable before any ``Set*`` call (previously the
        attributes were left undefined in that case).
        """
        self.x = 0.0
        self.y = 0.0
        self.z = 0.0
        self.t = 0.0
        if len(args) > 0:
            self.x = args[0]
            self.y = args[1]
            self.z = args[2]
            self.t = args[3]

    def SetPtEtaPhiM(self, pt, eta, phi, mass):
        """Set the components from transverse momentum, eta, phi and mass."""
        pt = abs(pt)
        self.SetXYZM(pt*math.cos(phi), pt*math.sin(phi), pt*math.sinh(eta), mass)

    def SetXYZM(self, x, y, z, m):
        """Set the momentum to (x, y, z) and derive the energy from the mass ``m``.

        A negative ``m`` is treated as a negative mass-squared; the energy is
        clamped at 0 instead of taking the square root of a negative number.
        """
        self.x = x
        self.y = y
        self.z = z
        if m >= 0:
            self.t = math.sqrt(x*x + y*y + z*z + m*m)
        else:
            self.t = math.sqrt(max(x*x + y*y + z*z - m*m, 0))

    def E(self):
        """Energy component."""
        return self.t

    def Px(self):
        """x component of the momentum."""
        return self.x

    def Py(self):
        """y component of the momentum."""
        return self.y

    def Pz(self):
        """z component of the momentum."""
        return self.z

    def Pt(self):
        """Transverse momentum, sqrt(x^2 + y^2)."""
        return math.sqrt(self.x*self.x + self.y*self.y)

    def Eta(self):
        """Pseudorapidity.

        Returns 0.0 for the zero vector and +-10e10 for a vector pointing
        exactly along +-z (the convention of ROOT's TVector3::PseudoRapidity).
        Previously the along-z case fell through and returned ``None``.
        """
        cosTheta = self.CosTheta()
        if cosTheta*cosTheta < 1:
            return -0.5*math.log((1.0 - cosTheta)/(1.0 + cosTheta))
        if self.z == 0:
            return 0.0
        return 10e10 if self.z > 0 else -10e10

    def mag(self):
        """Magnitude of the three-momentum."""
        return math.sqrt(self.x*self.x + self.y*self.y + self.z*self.z)

    def CosTheta(self):
        """Cosine of the polar angle; defined as 1.0 for the zero vector."""
        magnitude = self.mag()
        return 1.0 if magnitude == 0.0 else self.z/magnitude

    def Phi(self):
        """Azimuthal angle, atan2(y, x)."""
        return math.atan2(self.y, self.x)

    def DeltaR(self, other):
        """Distance to ``other`` in the (eta, phi) plane."""
        deta = self.Eta() - other.Eta()
        dphi = Phi_mpi_pi(self.Phi() - other.Phi())
        return math.sqrt(deta*deta + dphi*dphi)