In [2]:
from glob import glob
import uproot as ur
import numpy as np
import json, os
import os.path as osp
from tqdm.auto import tqdm
from json import load
from phc import module_reload, plot_hist
module_reload('zhh')
from zhh import parse_sample_path, get_preselection_passes, ProcessIndex
version = 'v1'

REPO_ROOT = '/afs/desy.de/user/b/bliewert/public/MarlinWorkdirs/ZHH'
DATA_ROOT = f'/nfs/dust/ilc/user/bliewert/zhh/PreselectionFinal/{version}'
INDEX_DIR = '/nfs/dust/ilc/user/bliewert/zhh/CreateRawIndex/v1'

processes = np.load(f'{INDEX_DIR}/processes.npy')
samples = np.load(f'{INDEX_DIR}/samples.npy')
chunks = np.load(f'{DATA_ROOT}/../../CreatePreselectionChunks/v1/chunks.npy')

In [8]:
a = glob('/nfs/dust/ilc/user/bliewert/zhh/PreselectionFinal/v1/stdall_*.txt')
a.sort()

In [11]:
a

['/nfs/dust/ilc/user/bliewert/zhh/PreselectionFinal/v1/stdall_0To1.txt',
 '/nfs/dust/ilc/user/bliewert/zhh/PreselectionFinal/v1/stdall_10000To10001.txt',
 '/nfs/dust/ilc/user/bliewert/zhh/PreselectionFinal/v1/stdall_10001To10002.txt',
 '/nfs/dust/ilc/user/bliewert/zhh/PreselectionFinal/v1/stdall_10002To10003.txt',
 '/nfs/dust/ilc/user/bliewert/zhh/PreselectionFinal/v1/stdall_10003To10004.txt',
 '/nfs/dust/ilc/user/bliewert/zhh/PreselectionFinal/v1/stdall_10004To10005.txt',
 '/nfs/dust/ilc/user/bliewert/zhh/PreselectionFinal/v1/stdall_10005To10006.txt',
 '/nfs/dust/ilc/user/bliewert/zhh/PreselectionFinal/v1/stdall_10006To10007.txt',
 '/nfs/dust/ilc/user/bliewert/zhh/PreselectionFinal/v1/stdall_10007To10008.txt',
 '/nfs/dust/ilc/user/bliewert/zhh/PreselectionFinal/v1/stdall_10008To10009.txt',
 '/nfs/dust/ilc/user/bliewert/zhh/PreselectionFinal/v1/stdall_10009To10010.txt',
 '/nfs/dust/ilc/user/bliewert/zhh/PreselectionFinal/v1/stdall_1000To1001.txt',
 '/nfs/dust/ilc/user/bliewert/zhh/Pres

Validate chunks

In [5]:
for chunk in chunks:
    with open(f'{DATA_ROOT}/{chunk["branch"]}_FinalStateMeta.json') as metaf:
        meta = load(metaf)
        
        if meta['nEvtSum'] != chunk['chunk_size']:
            raise Exception(f"Chunk mismatch for branch <{chunk['branch']}> : {meta['nEvtSum']} vs {chunk['chunk_size']}")

Exception: Chunk mismatch for branch <0> : 3583 vs 3600

In [5]:
files = ['_PreSelection_llHH.root',
         '_PreSelection_vvHH.root',
         '_PreSelection_qqHH.root',
         '_FinalStates.root',
         '_FinalStateMeta.json',
         '.slcio',
         '_Source.txt']

sel_chunks = chunks[chunks['chunk_start'] == 0]
to_delete = []
for chunk in sel_chunks:
    for f in files:
        path = f'{DATA_ROOT}/{chunk["branch"]}{f}'
        if osp.isfile(path):
            to_delete.append(path)

for p in tqdm(to_delete):
    os.remove(p)

In [5]:
chunks[chunks['branch'] == 12902]

array([(12902, 'eeveev', 'eeveev_RR', '/pnfs/desy.de/ilc/prod/ilc/ild/copy/dst-merged/500-TDR_ws/6f_eeWW/ILD_l5_o1_v02/v02-00-01/rv02-00-01.sv02-00-01.mILD_l5_o1_v02.E500-TDR_ws.I108622.Peeveev.eR.pR.n001.d_dstm_10322_0.slcio', 0, 0, 1)],
      dtype=[('branch', '<i4'), ('process', '<U60'), ('proc_pol', '<U64'), ('location', '<U512'), ('n_chunks', '<i4'), ('chunk_start', '<i4'), ('chunk_size', '<i4')])

Introduce indices on samples and processes (included in future runs)

In [12]:
dtype_sample = ProcessIndex.dtype_sample
dtype_process = ProcessIndex.dtype_process

if not 'sid' in samples.dtype.names:
    samples_new = np.empty(len(samples), dtype=dtype_sample)
    samples_new['sid'] = np.arange(len(samples))
    for col in samples.dtype.names:
        samples_new[col] = samples[col]
        
    if len(samples) == len(samples_new):
        np.save(f'{INDEX_DIR}/samples.npy', samples_new)

if not 'pid' in processes.dtype.names:
    #processes_new = np.array([pids, *(processes[col] for col in processes.dtype.names)], dtype=dtype_process)
    processes_new = np.empty(len(processes), dtype=dtype_process)
    processes_new['pid'] = np.arange(len(processes))
    for col in processes.dtype.names:
        processes_new[col] = processes[col]

    if len(processes) == len(processes_new):
        np.save(f'{INDEX_DIR}/processes.npy', processes_new)
        
print('Conversion successful')

Conversion successful


Testing

In [43]:
rf = ur.open('/afs/desy.de/user/b/bliewert/public/MarlinWorkdirs/ZHH/output/zhh_PreSelection_llHH.root')

In [55]:
tree = rf['eventTree']
rf_keys = tree.keys()

rf['eventTree'][rf_keys[0]].array()

Prototyping

In [2]:
results = get_preselection_passes(DATA_ROOT)

  0%|          | 0/14449 [00:00<?, ?it/s]

In [3]:
np.save(f'{REPO_ROOT}/preselection.npy', results)

In [14]:
results = np.load(f'{REPO_ROOT}/preselection.npy')

In [15]:
from ast import literal_eval as make_tuple
import json

def test_meta_files(DATA_ROOT:str=DATA_ROOT)->bool:
    files = glob(f'{DATA_ROOT}/*_Source.txt')
    #branches = list(map(lambda x: x.split(f'{DATA_ROOT}/')[1].split('_Source.txt')[0], files))

    for f in tqdm(files):
        branch = f.split(f'{DATA_ROOT}/')[1].split('_Source.txt')[0]
        
        if osp.isfile(f'{DATA_ROOT}/{branch}_FinalStateMeta.json'):
            with open(f, 'r') as file:
                src_spec = file.read()
                if src_spec.startswith('('):
                    src_file, chunk_start, chunk_end = make_tuple(src_spec)
                else:
                    src_file = src_spec
            
            # Read metadata
            with open(f'{DATA_ROOT}/{branch}_FinalStateMeta.json', 'r') as file:
                meta = json.load(file)

            n_gen = meta['nEvtSum']
            proc = meta["processName"]
            
            if proc == '' or n_gen == 0:
                print(src_file)
                raise Exception(branch)
    
    return True

In [16]:
test_meta_files()

  0%|          | 0/14449 [00:00<?, ?it/s]

True

In [17]:
for entry in results:
    print(f'{entry["proc_pol"]} | {entry["n_gen"]} events | wt: {entry["weight"]} | {entry["n_pass_llhh"]} : {entry["n_pass_vvhh"]} : {entry["n_pass_qqhh"]}')
    
#np.save(f'{REPO_ROOT}/results.npy', results)

2f_z_bhabhag_LL | 74271 events | wt: 50.249691009521484 | 310 : 0 : 0
2f_z_bhabhag_LR | 169230 events | wt: 50.40285110473633 | 659 : 0 : 0
2f_z_bhabhag_RL | 9394 events | wt: 50.254844665527344 | 41 : 0 : 0
2f_z_bhabhag_RR | 15299 events | wt: 50.2515983581543 | 66 : 0 : 0
2f_z_bhabhang_LL | 2966452 events | wt: 52.631221771240234 | 3252 : 0 : 0
2f_z_bhabhang_LR | 2984564 events | wt: 104.33204650878906 | 3772 : 0 : 0
2f_z_bhabhang_RL | 362853 events | wt: 50.24860763549805 | 421 : 0 : 0
2f_z_bhabhang_RR | 641176 events | wt: 50.24874496459961 | 679 : 0 : 0
2f_z_h_LR | 1498273 events | wt: 50.71236801147461 | 94 : 492 : 2956
2f_z_h_RL | 50136 events | wt: 50.24848175048828 | 2 : 16 : 95
2f_z_l_LR | 259201 events | wt: 50.08212661743164 | 100 : 3 : 6
2f_z_l_RL | 12145 events | wt: 50.0848388671875 | 1 : 0 : 0
2f_z_nung_LR | 1313478 events | wt: 50.04869079589844 | 0 : 0 : 0
2f_z_nung_RL | 5416 events | wt: 50.085670471191406 | 0 : 0 : 0
4f_lowmee_sze_l_LL | 9 events | wt: 54.8708992004

Preselection Detailed

In [29]:
a = ur.open(f'{DATA_ROOT}/{branch}_PreSelection_{presel}.root:eventTree')

In [30]:
a.show()

name                 | typename                 | interpretation                
---------------------+--------------------------+-------------------------------
run                  | int32_t                  | AsDtype('>i4')
event                | int32_t                  | AsDtype('>i4')
nJets                | int32_t                  | AsDtype('>i4')
nIsoLeptons          | int32_t                  | AsDtype('>i4')
lepTypes             | std::vector<int32_t>     | AsJagged(AsDtype('>i4'), he...
missingPT            | float                    | AsDtype('>f4')
Evis                 | float                    | AsDtype('>f4')
thrust               | float                    | AsDtype('>f4')
dileptonMass         | float                    | AsDtype('>f4')
dileptonMassDiff     | float                    | AsDtype('>f4')
dijetChi2min         | float                    | AsDtype('>f4')
dijetMass            | std::vector<float>       | AsJagged(AsDtype('>f4'), he...
dijetMassDiff        | std

In [31]:
a['dileptonMass'].array()

In [32]:
a['lepTypes'].array()

In [25]:
from typing import Union

if False:
    DATA_ROOT = f'/nfs/dust/ilc/user/bliewert/zhh/PreselectionFinal/{version}'
    branch = 1
    presel = 'llHH'
else:
    DATA_ROOT = f'/afs/desy.de/user/b/bliewert/public/MarlinWorkdirs/ZHH/output'
    branch = 'zhh'
    presel = 'llHH'

def analyze_presel_file(DATA_ROOT:str, branch:int, presel):
    branch = str(branch)
    
    with ur.open(f'{DATA_ROOT}/{branch}_PreSelection_{presel}.root') as rf:
        if presel == 'llHH':
            nIsoLeptons = np.array(rf['eventTree']['nIsoLeptons'].array())
            print(nIsoLeptons)
            
            lepTypes = rf['eventTree']['lepTypes'].array()
            print(lepTypes)
            
            print(np.where(lepTypes != 0))
            
            dileptonMass = np.array(rf['eventTree']['dileptonMass'].array())
            print(dileptonMass)
        
    #plot_hist(dileptonMass, xlim=[0,500]);

analyze_presel_file(DATA_ROOT, branch, presel)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 1 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
[[], [], [], [], [], [], [], [], [], ..., [], [], [], [], [], [], [], [], []]


ValueError: cannot convert to RegularArray because subarray lengths are not regular (in compiled code: https://github.com/scikit-hep/awkward/blob/awkward-cpp-26/awkward-cpp/src/cpu-kernels/awkward_ListOffsetArray_toRegularArray.cpp#L22)

In [None]:
def get_preselection_per_channel(
    DATA_ROOT:str,
    channel:str,
    version:str='v1')->np.ndarray:
    
    dtype = [
        ('process', '<U60'),
        ('pol_e', 'i'),
        ('pol_p', 'i'),
        ('n_gen', 'i'),
        ('cross_sec', 'f'),
        ('cross_sec_err', 'f'),
        ('n_pass_llhh', 'i'),
        ('n_pass_vvhh', 'i'),
        ('n_pass_qqhh', 'i'),
        ('weight', 'f'),
        ('proc_pol', '<U64')]

    results = np.empty(0, dtype=dtype)
    finished = glob(f'{DATA_ROOT}/*_Source.txt')
    finished.sort()

    for f in (pbar := tqdm(finished)):
        branch = f.split(f'{version}/')[1].split('_Source.txt')[0]
        
        if osp.isfile(f'{DATA_ROOT}/{branch}_FinalStateMeta.json'):
            # Source file is written at the very end -> .root files exist
            
            with open(f, 'r') as file:
                src_spec = file.read()
                if src_spec.startswith('('):
                    src_file, chunk_start, chunk_end = make_tuple(src_spec)
                else:
                    src_file = src_spec
                
            # Read metadata
            with open(f'{DATA_ROOT}/{branch}_FinalStateMeta.json', 'r') as file:
                meta = json.load(file)
                
                # {'crossSection': 133070.796875, 'crossSectionError': 78.4000015258789, 'eventWeight': 1.0, 'nEvtSum': 995, 'polElectron': -1.0, 'polPositron': 1.0, 'processId': 250127, 'processName': '2f_z_bhabhang', 'run': 250127}
                
            n_gen = meta['nEvtSum']
            proc = meta["processName"]
            
            loc, pol = parse_sample_path(src_file)
            procpol = f'{proc}_{get_pol_key(*pol)}'
            
            pbar.set_description(f'Adding {n_gen} events from branch {branch} ({procpol})')
            
            if not procpol in results['proc_pol']:
                results = np.append(results, [np.array([
                    (proc, pol[0], pol[1], n_gen, meta['crossSection'], meta['crossSectionError'], 0, 0, 0, 0., procpol)
                ], dtype=dtype)])
            else:
                results['n_gen'][results['proc_pol'] == procpol] += n_gen
                
            for presel in ['llHH', 'vvHH', 'qqHH']:
                with ur.open(f'{DATA_ROOT}/{branch}_PreSelection_{presel}.root') as rf:
                    passed = rf['eventTree']['preselPassed'].array()
                    
                    n_items = len(passed)
                    if n_items != n_gen:
                        raise Exception('Constraint mismatch')
                    
                    n_passed = np.sum(passed)
                    results[f'n_pass_{presel.lower()}'][results['proc_pol'] == procpol] += n_passed
                    
    for entry in results:
        pol = (entry['pol_e'], entry['pol_p'])
        results['weight'][results['proc_pol'] == entry['proc_pol']] = sample_weight(entry['cross_sec'], pol, entry['n_gen'])

    results = results[np.argsort(results['proc_pol'])]
    
    return results

In [30]:
chunks[chunks['process'] == 'e1e1hh']

array([],
      dtype=[('branch', '<i4'), ('process', '<U60'), ('proc_pol', '<U64'), ('location', '<U512'), ('n_chunks', '<i4'), ('chunk_start', '<i4'), ('chunk_size', '<i4')])

In [33]:
list(filter(lambda x: 'hh' in x, np.unique(chunks['process'])))

['n1n1hh', 'n23n23hh', 'qqhh']