# Notebook 1 - Simple Analyzer

This notebook takes CMS OpenData nanoAOD files, applies some selection and make few simple plots. 

Expected output: Histograms with the event selection.


Physics objects of interest: muons and jets. 

For more information: https://github.com/HEP-EPN/FourTopsCMSOpenData/wiki. 

To understand more about coffea (extremely useful): https://coffeateam.github.io/coffea/index.html. 

Let's first load the libraries:

In [39]:
import asyncio
import logging
import os
import time

import vector; vector.register_awkward() 
import awkward as ak
from coffea import processor
from coffea.nanoevents import transforms
from coffea.nanoevents.methods import base, vector
from coffea.nanoevents import NanoAODSchema
import hist
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import uproot

For future use, let's define some global configuration

In [3]:
#ifile = uproot.open("root://eosuser.cern.ch//eos/user/a/algomez/tmpFiles/opendata_files/SingleElectron/cmsopendata2015_Run2015D_SingleElectron_MINIAOD_08Jun2016-v1_21.root")
#ifile["Events"].keys()

#events = NanoEventsFactory.from_root(
#    "https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneCUETP8M1_13TeV-powheg-pythia8/cmsopendata2015_ttbar_19980_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext3-v1_00000_0000.root",
#    schemaclass=NanoAODSchema.v6,
#    metadata={"dataset": "TT"},
#).events()

In [40]:
DATA = "SingleMuon"  

# input files per process, set to e.g. 10 (smaller number = faster)
N_FILES_MAX_PER_SAMPLE = 10

### BENCHMARKING-SPECIFIC SETTINGS

# chunk size to use
CHUNKSIZE = 500_000

# metadata to propagate through to metrics
CORES_PER_WORKER = 2  # does not do anything, only used for metric gathering (set to 2 for distributed coffea-casa)

# scaling for local setups with FuturesExecutor
NUM_CORES = 4

NanoAOD datasets are stored in `data/ntuples_nanoaod.json` folder. This json file contains information about the number of events, process and systematic. The following function reads the json file and returns a dictionary with the process to run.

In [41]:
def construct_fileset(n_files_max_per_sample,
                      dataset="SingleMuon",
                      onlyNominal=False,
                      ntuples_json="ntuples_nanoaod.json"):
    # using https://atlas-groupdata.web.cern.ch/atlas-groupdata/dev/AnalysisTop/TopDataPreparation/XSection-MC15-13TeV.data
    # for reference
    # x-secs are in pb
    xsec_info = {
        "ttbar": 831., ###396.87 + 332.97, # nonallhad + allhad, keep same x-sec for all
       # "single_top_s_chan": 2.0268 + 1.2676,
       # "single_top_t_chan": (36.993 + 22.175)/0.252,  # scale from lepton filter to inclusive
       # "single_top_tW": 35.6 + 35.6, #37.936 + 37.906,
       # "wjets": 61526, ##61457 * 0.252,  # e/mu+nu final states
        "tttt" : 0.009, 
        "data": None
    }

    # list of files
    with open(ntuples_json) as f:
        file_info = json.load(f)
    
    # process into "fileset" summarizing all info
    fileset = {}
    for process in file_info.keys():
        if process == "data":
            file_list = file_info[process][dataset]["files"]
            if n_files_max_per_sample != -1:
                #file_list = file_list[:int(n_files_max_per_sample/10)]
                file_list = file_list[:]  # use all data

            file_paths = [f["path"] for f in file_list]
            metadata = {"process": "data", "xsec": 1}
            fileset.update({"data": {"files": file_paths, "metadata": metadata}})
            

        for variation in file_info[process].keys():
            if onlyNominal & ~variation.startswith("nominal"): continue
            #print(variation)
            file_list = file_info[process][variation]["files"]
            if n_files_max_per_sample != -1:
                file_list = file_list[:n_files_max_per_sample]  # use partial set of samples, first 10 files

            file_paths = [f["path"] for f in file_list]
            nevts_total = sum([f["nevts"] for f in file_list])
            metadata = {"process": process, "variation": variation, "nevts": nevts_total, "xsec": xsec_info[process]}
            fileset.update({f"{process}__{variation}": {"files": file_paths, "metadata": metadata}})
            

    return fileset


In [42]:
fileset = construct_fileset(N_FILES_MAX_PER_SAMPLE, dataset=DATA,
                            onlyNominal=True, ntuples_json='../Tesis/ntuples_nanoaod.json') 

print(fileset["ttbar__nominal"]["metadata"]["nevts"])
print(fileset["tttt__nominal"]["metadata"]["nevts"])
#print(fileset["wjets__nominal"]["metadata"]["nevts"])
#print(fileset["data"]["files"])
#print(f"\nexample of information in fileset:\n{{\n  'files': [{fileset['ttbar__nominal']['files'][0]}, ...],")
#print(f"  'metadata': {fileset['ttbar__nominal']['metadata']}\n}}")
#print(f"\nexample of data information in fileset:\n{{\n  'files': [{fileset['data']['files'][0]}, ...],")

11378043
1210521


In [5]:
data_files = fileset["data"]["files"]

# Loop over the files and count the number of events in each one
nevents = 0
for data_file in data_files:
    with uproot.open(data_file) as rootfile:
        # Get the number of events in the "Events" TTree
        tree = rootfile["Events"]
        #print(len(tree))
        nevents += len(tree)

print(f"Total number of events: {nevents}")

Total number of events: 32938


## Analyzer

Here is the main analyzer. Uses coffea/awkward to make the analysis.

Advice: to understand how the selection is working, print the different arrays before and after the selections are made.

In [117]:
class fourTopAnalysis(processor.ProcessorABC):
    def __init__(self, DATASET):
        
        self.DATASET = DATASET
        
        ### booking histograms
        ## define categories
        process_cat = hist.axis.StrCategory([], name="process", label="Process", growth=True)
        variation_cat  = hist.axis.StrCategory([], name="variation", label="Systematic variation", growth=True)
        
        ## define bins (axis)
        pt_axis = hist.axis.Regular( bins=500, start=0, stop=500, name="var")
        eta_axis = hist.axis.Regular( bins=40, start=-5, stop=5, name="var")
        num_axis = hist.axis.Regular( bins=20, start=0, stop=20, name="var")
        #Htb
        htb_axis=hist.axis.Regular( bins=100, start=0, stop=1000, name="var")
        #Htratio
        htrat_axis=hist.axis.Regular(bins=500,start=0, stop=1, name="var")
        #3rd-highest CSV
        csv_axis=hist.axis.Regular(bins=100,start=0, stop=1, name="var")
        
        
        ## define a dictionary of histograms
        self.hist_muon_dict = {
            'muon_pt'  : (hist.Hist(pt_axis, process_cat, variation_cat, storage=hist.storage.Weight())),
            'muon_eta' : (hist.Hist(eta_axis, process_cat, variation_cat, storage=hist.storage.Weight())),
            'nmuons'   : (hist.Hist(num_axis, process_cat, variation_cat, storage=hist.storage.Weight())),
            'jets_pt'  : (hist.Hist(pt_axis, process_cat, variation_cat, storage=hist.storage.Weight())),
            'jets_eta' : (hist.Hist(eta_axis, process_cat, variation_cat, storage=hist.storage.Weight())),
            'njets'    : (hist.Hist(num_axis, process_cat, variation_cat, storage=hist.storage.Weight())), 
            'nbjets'   : (hist.Hist(num_axis, process_cat, variation_cat, storage=hist.storage.Weight())),
            'htb'      : (hist.Hist(htb_axis, process_cat, variation_cat, storage=hist.storage.Weight())), #variable for bdt
            'htrat'    : (hist.Hist(htrat_axis, process_cat, variation_cat, storage=hist.storage.Weight())), #variable for bdt
            'third_highest_csv': (hist.Hist(csv_axis, process_cat, variation_cat, storage=hist.storage.Weight())) #variable for bdt

        }
        
        sumw_dict = {'sumw': processor.defaultdict_accumulator(float)
        }
        

    def process(self, events):

        hists = self.hist_muon_dict.copy()

        process = events.metadata["process"]  # "ttbar" etc.

    
        if process != "data":
            # normalization for MC
            x_sec = events.metadata["xsec"]
            nevts_total = events.metadata["nevts"]
            lumi = 2256.38 # /pb integrated luminosity
            xsec_weight = x_sec * lumi / nevts_total #L*cross-section/N
        else:
            xsec_weight = 1

        events["pt_nominal"] = 1.0

        ### OBJECT SELECTION
        
        # Object selection: Muon (Tight - muon id definition in nanoAOD does not work, have to define manual)
        
        muon_is_global= events.Muon.isGlobal == True
        muon_is_tracker= events.Muon.isTracker == True
        
        loose_muon_selection= (events.Muon.pt > 10) & (abs(events.Muon.eta)<2.5) & (events.Muon.pfRelIso04_all < 0.25) & (muon_is_global | muon_is_tracker)
        selected_muon_selection = (events.Muon.pt > 26) & (abs(events.Muon.eta)<2.1) & (events.Muon.nTrackerLayers > 5) & (events.Muon.nStations > 0) & (abs(events.Muon.dxy) < 0.2) & (abs(events.Muon.dz) < 0.5) & (events.Muon.pfRelIso04_all < .15) & (muon_is_global & muon_is_tracker)
                # [[muon 1], [muon 1, muon 2],..]
                
        selected_muons=events.Muon[(loose_muon_selection & selected_muon_selection)]
        selected_muon= ak.count(selected_muons.pt, axis=1)==1 
        
        veto_muons=events.Muon[(loose_muon_selection & ~selected_muon_selection)] #  veto additional lose muon
        veto_muon= ak.count(veto_muons.pt, axis=1)== 0
        
        #Object selection: Jets
        selected_leptons = selected_muons 
        jet_selection = (events.Jet.pt * events["pt_nominal"] > 30) & (abs(events.Jet.eta) < 2.5) & (events.Jet.jetId > 1)
        selected_jets = events.Jet[jet_selection]
        nearest_lepton = selected_jets.nearest(selected_leptons, threshold=.4)
        selected_jets = selected_jets[ ~ak.is_none(nearest_lepton) ]
        
        ## the results of these 2 lines should be equivalent to the 2 lines above
        #lepton_mask = ak.any(selected_jets.metric_table(selected_lepton, metric=lambda j, e: ak.local_index(j, axis=1) == e.jetIdx,), axis=2)
        #selected_jets = selected_jets[~lepton_mask]
        
        selected_bjets = events.Jet[jet_selection & ~ak.is_none(nearest_lepton) & (events.Jet.btagCSVV2 >=0.8)]
        selected_jets_nobjets = events.Jet[jet_selection & ~ak.is_none(nearest_lepton) & ~(events.Jet.btagCSVV2 >=0.8)]  ### this we might use it later
        
       ################
        #### Event Selection
        ################
        
        if self.DATASET.endswith("Muon"):
            # trigger
            event_filters = ( events.HLT.IsoMu18 == 1 )  #trigger selection (1 value per event)
            selected_lepton = selected_muon
            veto_lepton = veto_muon

        event_filters = event_filters & ( selected_lepton & veto_lepton )
        # at least four jets
        event_filters = event_filters & (ak.count(selected_jets.pt, axis=1) >= 4)
        # at least one b-tagged jet ("tag" means score above threshold)
        event_filters = event_filters & (ak.sum(selected_bjets) >= 1)
        #print(event_filters)
        
        # apply event filters
        selected_events = events[event_filters]
        selected_muons = selected_muons[event_filters]
        selected_jets = selected_jets[event_filters]
        selected_bjets = selected_bjets[event_filters]
        
        ##### VARIABLES FOR BDT ####
        
        
        #### Calculate HTb
        htb = ak.sum(selected_bjets.pt, axis=1)
    
        
        #### Calculate H_t^ratio
        selected_jets_sorted = ak.sort(selected_jets.pt, axis=1, ascending=False)
        
        four_leading_jets=selected_jets_sorted[:, :4]
        ht_leading_jets=ak.sum(four_leading_jets,axis=1)
        
        other_jets=selected_jets_sorted[:, 4:]
        ht_other_jets = ak.sum(other_jets,axis=1)
        
        htrat = ht_other_jets/ht_leading_jets
        print(htrat)
        
        #### Calculate Third-highest CSV
        
        #Sort jets by CSV values 
        sorted_jets= ak.argsort(selected_jets.btagCSVV2, axis=1)
        
        #Extract the third-highest CSV value
        third_highest_csv = sorted_jets[:, 2]
        
        for ivar in [ "pt", "eta" ]:
            hists[f'muon_{ivar}'].fill(
                        var=ak.flatten(getattr(selected_muons, ivar)), process=process, variation="nominal", weight=xsec_weight)
            hists[f'jets_{ivar}'].fill(
                        var=ak.flatten(getattr(selected_jets, ivar)), process=process, variation="nominal", weight=xsec_weight)
            hists['nmuons'].fill(var=ak.count(selected_muons.pt, axis=1), process=process, variation="nominal", weight=xsec_weight)
            hists['njets'].fill(var=ak.count(selected_jets.pt, axis=1), process=process, variation="nominal", weight=xsec_weight)
            hists['nbjets'].fill(var=ak.count(selected_bjets.pt, axis=1), process=process,variation="nominal", weight=xsec_weight)
            hists['htb'].fill(var=htb, process=process, variation="nominal", weight=xsec_weight)
            hists['htrat'].fill(var=htrat, process=process, variation="nominal", weight=xsec_weight)
            hists['third_highest_csv'].fill(var=third_highest_csv, process=process, variation="nominal", weight=xsec_weight)
     
            output = {"nevents": {events.metadata["dataset"]: len(events)}, "hists" : hists}

            return output

    def postprocess(self, accumulator):
        return accumulator

Let's make it run:

In [118]:
executor = processor.FuturesExecutor(workers=NUM_CORES)

run = processor.Runner(executor=executor, schema=NanoAODSchema, 
                       savemetrics=True, metadata_cache={}, chunksize=CHUNKSIZE)
t0 = time.monotonic()
all_histograms, metrics = run(fileset, "Events", processor_instance=fourTopAnalysis(DATASET=DATA))
exec_time = time.monotonic() - t0


Preprocessing:   0%|          | 0/56 [00:00<?, ?file/s]

Processing:   0%|          | 0/69 [00:00<?, ?chunk/s]



[0.15, 0, 0.109, 0.126, 0, 0.26, 0, 0, 0, ... 0.126, 0, 0, 0, 0.162, 0, 0.0996, 0]
[0, 0, 0.287, 0, 0.115, 0, 0.396, 0, 0, ... 0, 0, 0, 0, 0.273, 0, 0.0923, 0, 0]
[0, 0, 0.286, 0, 0, 0, 0.173, 0, 0.121, ... 0, 0.316, 0, 0.24, 0, 0.108, 0.465, 0]
[0, 0.0818, 0, 0, 0.144, 0, 0, 0, 0, ... 0, 0.115, 0, 0, 0.126, 0.146, 0, 0.302, 0]
[0.337, 0, 0, 0, 0, 0.0978, 0, 0, 0, 0.11, 0, ... 0, 0, 0, 0.308, 0, 0, 0, 0, 0, 0]
[0, 0, 0.166, 0, 0, 0, 0.106, 0, 0, ... 0, 0, 0, 0.0786, 0, 0, 0.124, 0.142, 0]
[0, 0, 0, 0, 0, 0, 0.118, 0, 0.237, 0, ... 0, 0.11, 0, 0.186, 0, 0, 0, 0, 0.0922, 0]
[0, 0.112, 0.159, 0, 0, 0, 0, 0.228, 0, ... 0.208, 0.117, 0, 0.412, 0, 0, 0, 0]
[0, 0.284, 0, 0.099, 0, 0, 0, 0, 0, ... 0.0907, 0.0589, 0, 0, 0.0897, 0, 0, 0, 0]
[0, 0.181, 0, 0, 0.15, 0.114, 0.272, 0, 0, ... 0.12, 0.33, 0, 0, 0, 0, 0, 0.169]
[0, 0, 0.293, 0.0962, 0, 0, 0, 0.242, ... 0, 0.43, 0.104, 0.178, 0, 0, 0.301, 0]
[0, 0.119, 0, 0, 0, 0, 0, 0, 0.26, 0.312, ... 0, 0, 0.117, 0, 0.111, 0.134, 0, 0, 0]
[0, 0, 0, 0.

In [54]:
tttt_nevts = fileset["tttt__nominal"]["metadata"]["nevts"]
print(tttt_nevts)

1210521


In [55]:
ttbar_nevts = fileset["ttbar__nominal"]["metadata"]["nevts"]
print(ttbar_nevts)

11378043


In [119]:
import pickle

with open("histograms.pkl", "wb") as f:
    pickle.dump(all_histograms["hists"], f, protocol=pickle.HIGHEST_PROTOCOL)

In [120]:
dataset_source = "/data" if fileset["ttbar__nominal"]["files"][0].startswith("/data") else "https://xrootd-local.unl.edu:1094" # TODO: xcache support
metrics.update({"walltime": exec_time, "num_workers": NUM_CORES, "dataset_source": dataset_source, 
                "n_files_max_per_sample": N_FILES_MAX_PER_SAMPLE, 
                "cores_per_worker": CORES_PER_WORKER, "chunksize": CHUNKSIZE})#

print(f"event rate per worker (full execution time divided by NUM_CORES={NUM_CORES}): {metrics['entries'] / NUM_CORES / exec_time / 1_000:.2f} kHz")
print(f"event rate per worker (pure processtime): {metrics['entries'] / metrics['processtime'] / 1_000:.2f} kHz")
print(f"amount of data read: {metrics['bytesread']/1000**2:.2f} MB")  # likely buggy: https://github.com/CoffeaTeam/coffea/issues/717


event rate per worker (full execution time divided by NUM_CORES=4): 2.71 kHz
event rate per worker (pure processtime): 3.08 kHz
amount of data read: 6709.93 MB
