In [None]:
import numpy as np
import awkward as ak
import uproot
import matplotlib.pyplot as plt
import hist
import hist.dask as hda
import dask
import coffea.processor as processor
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
import vector

# Switch off the schema-level `warn_missing_crossrefs` flag before any events are built.
# NOTE(review): presumably this silences warnings about cross-reference branches
# that are absent from the input files — confirm against the coffea documentation.
NanoAODSchema.warn_missing_crossrefs = False

In [None]:
# Datasets:
#/BulkGravToWW_narrow_M-*_13TeV-madgraph/*NanoAODv7*/NANOAODSIM
#/RSGravToWWToWlepWhad_width0p1_M-*_TuneCUETP8M1_13TeV-madgraph-pythia8/*NanoAODv3*/NANOAODSIM


# When there is a large number of files, it is useful to put the list in a separate file.
# import json

# with open("semileptonic_notebooks/samples.json", 'r') as sample_file:
#     fileset = json.load(sample_file)

# for sample in fileset:
#     print(sample)

# For now we are only testing on a limited number of files so they are just listed here
# Mapping: dataset name -> {'files': {file URL: tree name}, 'metadata': {...}}.
# The tree name ("Events") belongs only in the 'files' mapping; 'is_mc' is a
# boolean flag. Both samples below are simulated (NANOAODSIM), so it is True.
fileset = {
    'BulkGravToWW': {
        'files': {
            'root://cmsxrootd.fnal.gov//store/mc/RunIISummer16NanoAODv7/BulkGravToWW_narrow_M-1000_13TeV-madgraph/NANOAODSIM/PUMoriond17_Nano02Apr2020_102X_mcRun2_asymptotic_v8-v1/100000/D4404DCB-FBF8-C640-87B0-2DA1D5139083.root': "Events",
        },
        'metadata': {
            'is_mc': True,
        },
    },
    'RSGravToWW': {
        'files': {
            'root://cmsxrootd.fnal.gov//store/mc/RunIISummer16NanoAODv3/RSGravToWWToWlepWhad_width0p1_M-1200_TuneCUETP8M1_13TeV-madgraph-pythia8/NANOAODSIM/PUMoriond17_94X_mcRun2_asymptotic_v3-v1/60000/4442437A-52BB-E811-A8DA-90E2BACC5EEC.root': "Events",
        },
        'metadata': {
            'is_mc': True,
        },
    },
}

In [None]:
# Loading the events is the slow step of this notebook.
test_dataset = 'BulkGravToWW'

# Build the factory for the chosen sample, stopping at entry 1000 and
# reading eagerly (delayed=False) ...
events_factory = NanoEventsFactory.from_root(
    fileset[test_dataset]['files'],
    schemaclass=NanoAODSchema,
    metadata=fileset[test_dataset]['metadata'],
    entry_stop=1000,
    delayed=False,
)
# ... then materialise the events array used by every cell below.
events = events_factory.events()

### Selecting interesting events

We will need to discuss what this means. For instance, it could be events where both Ws decay to quarks, to enhance our NN training sample, or events where one W decays to leptons and one to quarks, which are more similar to what we will have in our final analysis. One could also select events where both Ws decay to leptons to compare with previous studies...

### TASK 1:
Let's refresh what is inside an event.

In [None]:
# Display the names of the top-level fields available on the loaded events
events.fields

### TASK 2:

You should already be familiar with some of the fields but you can try to understand more here https://cms-nanoaod-integration.web.cern.ch/.

The link above is very technical and used by experienced analyzers so I do not expect you to learn or understand everything in it (I also do not know all!!)

### TASK 3:

Let's apply basic selections on Muons and Electrons. Understand what the selections mean. You can get a better idea of their effect if you plot some variables, such as the pt, or if you print the number of electrons/muons in each event, or how many events pass the selections.

In [None]:
# Lepton collections taken from the loaded events
electrons = events.Electron
muons = events.Muon

# Muon Tight selection, assembled from one named mask per requirement
mu_pt = muons.pt
mu_abs_dxy = np.abs(muons.dxy)

# Kinematic acceptance: pt above 10 and |eta| below 2.4
mu_kinematics_ok = (mu_pt > 10) & (np.abs(muons.eta) < 2.4)

# pt-dependent |dxy| requirement: below 0.01 for pt < 20, below 0.02 for pt >= 20
mu_dxy_ok = (
    ((mu_pt < 20) & (mu_abs_dxy < 0.01)) |
    ((mu_pt >= 20) & (mu_abs_dxy < 0.02))
)

# |dz| requirement
mu_dz_ok = np.abs(muons.dz) < 0.1

# particle flow isolated: tight or greater
mu_isolated = muons.pfIsoId >= 4

muons_tight = muons[
    mu_kinematics_ok &
    muons.tightId &
    mu_dxy_ok &
    mu_dz_ok &
    mu_isolated
]



### TASK 4:

Try to add similar selections for the electrons as well



### TASK 5: 

Now require exactly one tightly identified electron or muon
and apply the following thresholds:
```
electron_pt_threshold = 35
muon_pt_threshold = 30
pt_miss_threshold = 30
```

In [None]:
# Looser lepton selections.
# Plotting variables for tight vs. loose leptons is a useful (optional) exercise.

# Loose collections: muons flagged looseId, electrons with cutBased >= 2
loose_muons = muons[muons.looseId]
loose_electrons = electrons[(electrons.cutBased >= 2)]

# Per-event number of loose leptons with pt above 10
loose_muon_count = ak.num(loose_muons[loose_muons.pt > 10])
loose_electron_count = ak.num(loose_electrons[loose_electrons.pt > 10])

# Mask for vetoing extra loose leptons: True when the event has exactly one in total
loose_lepton_total = loose_muon_count + loose_electron_count
loose_lepton_veto_mask = loose_lepton_total == 1

## We are mostly interested in Jets! 

### TASK 6:

Check the selections applied below on the jets. As before it is useful to plot some distributions like the jet mass or the momentum before and after applying the selections to check their effect and see how many events or jets pass a certain selection.

In [None]:
# FatJets cuts: pt above 200 and |eta| below 2.4
fatjet_passes = (events.FatJet.pt > 200) & (np.abs(events.FatJet.eta) < 2.4)
clean_fatJets = events.FatJet[fatjet_passes]

# Jets cuts: pt above 30 and |eta| below 4.7
jet_passes = (events.Jet.pt > 30) & (np.abs(events.Jet.eta) < 4.7)
clean_Jets = events.Jet[jet_passes]

### TASK 7:

To avoid double counting we need to remove Jets overlapping with FatJets. Check how this is done below.

In [None]:

# Removing AK4 (Jet) jets overlapping with AK8 (FatJet) jets.
#
# A jet must be kept only if it is farther than DeltaR = 0.8 from EVERY fatjet
# in its event. Filtering a flat list of (jet, fatjet) pairs does not do that:
# a jet overlapping one fatjet survives through its pair with another fatjet,
# jets are duplicated once per surviving pair, and events without any fatjet
# lose all their jets. Grouping the pairs per jet (nested=True) and reducing
# with ak.all over the fatjet axis avoids all three problems.

# Pairs grouped per jet: events x jets x fatjets
jets_fatjets = ak.cartesian({"x": clean_Jets, "y": clean_fatJets}, nested=True)

d_eta = jets_fatjets["x"].eta - jets_fatjets["y"].eta
# Wrap the azimuthal difference into [-pi, pi) so that jets on opposite sides
# of the phi = +/-pi boundary are not treated as far apart.
d_phi = (jets_fatjets["x"].phi - jets_fatjets["y"].phi + np.pi) % (2 * np.pi) - np.pi

# One boolean per jet: True when the jet is isolated from all fatjets.
# ak.all over an empty list is True, so events with no fatjet keep every jet.
jets_iso_f = ak.all(d_eta**2 + d_phi**2 > 0.8**2, axis=2)

# Redefine the jets (but not the fatjets)
jets = clean_Jets[jets_iso_f]
fj = clean_fatJets

### TASK 8:

1) Can you add the initial FatJet pt distribution below, and see how it changes after applying all selections?
2) Can you plot the second leading jet?

In [None]:
# Keep only events that contain at least one selected AK8 jet
AK8jets_candidates_mask = ak.num(clean_fatJets) >= 1
Wjets_candidates = clean_fatJets[AK8jets_candidates_mask]

# First AK8 jet of each remaining event, and its pt
leading_W_jet = Wjets_candidates[:, 0]
leading_W_jet_pt = leading_W_jet.pt

# Histogram axes: a growable dataset category and 80 regular pt bins on [0, 4000)
dataset_axis = hist.axis.StrCategory(
    name='dataset',
    label="Dataset",
    categories=[],
    growth=True,
)
leading_pt_axis = hist.axis.Regular(
    name='leading_AK8_pt',
    label='Leading AK8 p_T (GeV)',
    bins=80,
    start=0,
    stop=4000,
)

leading_W_pt_hist = hist.Hist(dataset_axis, leading_pt_axis)
leading_W_pt_hist.fill(dataset=test_dataset, leading_AK8_pt=leading_W_jet_pt)
leading_W_pt_hist.plot1d()

## Now that you are familiar with the physics behind what we are doing we move to the ML part


### TASK 9: 

To train the neural network it is useful to first select the interesting events (as we have done above). But now we need to save the events and variables in a more useful file format called h5. Check the example below to see how to add variables to be saved

In [None]:
# basic h5 creation
import pandas as pd

# Flat NumPy versions of the leading-jet pt and eta awkward arrays
leading_W_jet_pt_np = ak.to_numpy(leading_W_jet_pt)
leading_W_jet_eta_np = ak.to_numpy(leading_W_jet.eta)

# Optional: collect the columns in a labelled DataFrame
leading_W_jet_columns = {
    'leading_W_jet_pt': leading_W_jet_pt_np,
    'leading_W_jet_eta': leading_W_jet_eta_np,
}
df_leading_W_jet_pt = pd.DataFrame(leading_W_jet_columns)


In [None]:
# Save the DataFrame to 'leading_W_jet_pt.h5' in the current directory under the key 'df'.
# The file is opened with mode='w'.
df_leading_W_jet_pt.to_hdf('leading_W_jet_pt.h5', key='df', mode='w')


In [None]:
# Run this cell to see if the file has been created:
# it lists the contents of the current working directory.
import os
os.listdir('.')

In [None]:
# Now read the file back to check that it looks OK.
# No key is passed to read_hdf here.
pd.read_hdf('leading_W_jet_pt.h5')


### TASK 10: 

Now we need to add variables that will be useful for our polarization tagger. So cos theta is one of them for sure! You can recall the calculations from the other notebook. 

Are there more variables we want to add? If nothing else comes up we can 
1) add the full stats!!
2) move to the ML part! 