In [None]:
# imports

# external modules
import os
import sys
import json
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from fnmatch import fnmatch
from functools import partial
import importlib

# local modules
thisdir = os.getcwd()
topdir = os.path.abspath(os.path.join(thisdir, '../../../'))
sys.path.append(topdir)

import tools.iotools as iotools
import tools.dftools as dftools
import tools.patternfiltering as patternfiltering
from tools.dataloadertools import MEDataLoader
import studies.ecal_occupancy_2024.preprocessing.preprocessor
importlib.reload(studies.ecal_occupancy_2024.preprocessing.preprocessor)
from studies.ecal_occupancy_2024.preprocessing.preprocessor import PreProcessor, make_default_preprocessor
from studies.ecal_occupancy_2024.plotting.plot_cluster_occupancy import plot_cluster_occupancy

In [None]:
# settings

# location of the input parquet files and selection of the data to process
datadir = '/eos/user/l/llambrec/dialstools-output-test'
year = '2024'
eras = [
    'B-v1',
    'C-v1',
    'D-v1',
    'E-v1',
    'E-v2',
    'F-v1',
    'G-v1',
    'H-v1',
    'I-v1',
    'I-v2'
]
dataset = 'ZeroBias'
reco = 'PromptReco'

# monitoring elements to process (short label -> full ME name);
# the endcap entries are currently commented out
menames = {
    'EB': 'EcalBarrel-EBOccupancyTask-EBOT digi occupancy',
    #'EE+': 'EcalEndcap-EEOccupancyTask-EEOT digi occupancy EE +',
    #'EE-': 'EcalEndcap-EEOccupancyTask-EEOT digi occupancy EE -'
}

def _parquet_path(era, mename):
    """Build the path of the parquet file for a given era (e.g. 'C-v1') and full ME name."""
    mainera, version = era.split('-', 1)
    filename = f'{dataset}-Run{year}{mainera}-{reco}-{version}-DQMIO-{mename}.parquet'
    return os.path.join(datadir, filename)

# find files corresponding to settings
# (files[era][melabel] holds the path of the parquet file for that era and ME)
files = {
    era: {melabel: _parquet_path(era, mename) for melabel, mename in menames.items()}
    for era in eras
}

# existence check
missing = [
    path
    for files_this_era in files.values()
    for path in files_this_era.values()
    if not os.path.exists(path)
]
if missing:
    raise Exception(f'The following files do not exist: {missing}')

In [None]:
# set path to OMS files

# one json file per era is expected in the OMS data directory;
# a missing file only gives a warning here, the path is stored in any case
oms_basedir = '../omsdata'
omsfiles = {}
for era in eras:
    omsfiles[era] = os.path.join(oms_basedir, f'omsdata_Run{year}{era}.json')
    if not os.path.exists(omsfiles[era]):
        print(f'WARNING: OMS file {omsfiles[era]} does not exist.')

In [None]:
# set path to HLT rate files
# (just use the ones from pixel study as we're dealing with the same ZeroBias triggers)

# one json file per era is expected in the directory of the pixel study;
# a missing file only gives a warning here, the path is stored in any case
hlt_basedir = '../../pixel_clusters_2024/omsdata'
hltfiles = {}
for era in eras:
    hltfiles[era] = os.path.join(hlt_basedir, f'hltrate_Run{year}{era}.json')
    if not os.path.exists(hltfiles[era]):
        print(f'WARNING: HLT rate file {hltfiles[era]} does not exist.')

In [None]:
# load total number of lumisections per era

# the number of rows in the file of the first ME is used as the number of lumisections
# (only the run number column is read to keep this light)
melabel = list(menames)[0]
nlumis = {
    era: len(iotools.read_parquet(files[era][melabel], columns=['run_number']))
    for era in eras
}
# the sum is taken before the 'total' key is added, so it only runs over the eras
nlumis['total'] = sum(nlumis.values())

In [None]:
from studies.pixel_clusters_2024.nmf.nmf_testing_pattern import filter_dfs # make local equivalent later

# loop over mes

def select_norm_index(candidates, first_run):
    """Choose which local normalization file to use for a batch.

    Parameters:
    - candidates: sorted list of normalization file names; the last run covered by
      each file is parsed from the text after the final 'run' and the final '-'
      in the name (with the '.npy' extension removed).
    - first_run: run number of the first lumisection in the batch.

    Returns the index (into candidates) of the file just before the first file
    whose last run is >= first_run, i.e. the most recent file ending before the batch.
    If the batch starts within or before the first file, index 0 is returned.
    If the batch starts after all files, the last index is returned
    (the most recent file, rather than the oldest one).

    Raises an Exception if candidates is empty.
    """
    if len(candidates) == 0:
        raise Exception('No local normalization files found.')
    for candidate_idx, candidate in enumerate(candidates):
        last_run = int(candidate.split('run')[-1].split('-')[-1].replace('.npy', ''))
        if last_run >= first_run:
            return max(candidate_idx - 1, 0)
    return len(candidates) - 1

# settings that do not depend on the ME, era or batch
batch_size = 20000
prepend_size = 10 # must be larger than dynamic loss map and time correction size, maybe later determine automatically
normdir = '../preprocessing/normdata'

results = {}
for melabel, mename in menames.items():
    
    # initialize results
    results[melabel] = {}
    me_txt_results = []
    
    # get shape of current ME
    temp = PreProcessor(melabel)
    meshape = temp.required_dims
    
    # initialize loss maps to prepend to each batch
    # (for the dynamic loss map and time correction to not reset between batches)
    prepend_binary_loss_batch = np.zeros((prepend_size, meshape[0], meshape[1]))
    
    # make (static) loss mask
    # (does not depend on the era being processed, so load it only once per ME)
    loss_mask_era = 'C-v1'
    loss_mask_file = f'../preprocessing/normdata/zerofrac_Run2024{loss_mask_era}_{melabel}.npy'
    loss_mask = np.load(loss_mask_file)
    loss_mask = (loss_mask < 0.9).astype(bool)
    
    # find the available fine-grained local normalization files for this ME
    # (the directory listing does not change between batches, so do it only once per ME)
    norm_candidates = sorted([fname for fname in os.listdir(normdir) if fnmatch(fname, f'avgme_*_{melabel}_*')])
    
    # loop over eras
    for era in eras:
        print(f'Now processing me {mename}, era {era}...')
        
        # make dataloader
        dataloader = MEDataLoader([files[era][melabel]])
        preprocessor_era = 'C-v1'
        preprocessor = make_default_preprocessor(preprocessor_era, melabel,
                     global_normalization = 'avg',
                     local_normalization = 'avg')
        
        # other initializations
        flagged_run_numbers = []
        flagged_ls_numbers = []
        batch_filter_results = []
        batch_txt_results = []
        
        # load OMS data
        with open(omsfiles[era]) as f:
            oms_info = json.load(f)
            
        # load HLT rate data
        with open(hltfiles[era]) as f:
            hltrate_info = json.load(f)
        
        # run over batches
        batch_params = dataloader.prepare_sequential_batches(batch_size=batch_size)
        print(f'Will process {len(batch_params)} batches of size {batch_size}')
        for batch_idx, batch_param in enumerate(batch_params):
            batch = dataloader.read_sequential_batch(batch_param)
            # note: use a separate name for the actual size of this batch,
            #       so the batch_size setting is not overwritten
            batch_size_loaded = len(batch)
            print(f'  Loaded batch {batch_idx+1} / {len(batch_params)} with size {batch_size_loaded}')
            
            # filtering
            fdict = {melabel: batch}
            oms_filters = [
                ['beams_stable'],
                ['cms_active'],
                ['ebm_ready'],
                ['ebp_ready'],
                ['eem_ready'],
                ['eep_ready'],
                ['esm_ready'],
                ['esp_ready'],
                ['pileup', '>', 10]
            ]
            hltrate_filters = [
                ["HLT_ZeroBias_v*", '>', 25]
            ]
            mask, filter_results = filter_dfs(fdict,
                                     oms_info = oms_info,
                                     oms_filters = oms_filters,
                                     hltrate_info = hltrate_info,
                                     hltrate_filters = hltrate_filters
                                    )
            mask = mask.astype(bool)
            batch = batch[mask]
            batch_size_filtered = len(batch)
            print(f'  Found {batch_size_filtered} / {batch_size_loaded} instances passing filters.')
            
            # the filter results are stored for every batch,
            # also if no lumisections are left for further processing
            batch_filter_results.append(filter_results)
            if batch_size_filtered==0:
                # stop further processing
                continue
                
            # update, hard-coded for now, to be generalized later:
            # get a more fine-grained local normalization.
            # note: just a rough first version, taking the proper normalization for the first run in the batch
            #       and assume it is valid for the whole batch; this is fine in most cases on the condition
            #       that the evaluation batch size is the same or much smaller than the batches used to make the normalization.
            first_run = int(batch['run_number'].values[0])
            norm_idx = select_norm_index(norm_candidates, first_run)
            normfile = os.path.join(normdir, norm_candidates[norm_idx])
            normmap = np.load(normfile)
            # replace zeros in the normalization map by -1
            normmap[normmap==0] = -1
            preprocessor.local_norm = normmap
            
            # preprocessing
            mes_preprocessed = preprocessor.preprocess(batch)
            
            # flagging criterion: look for empty regions
            #patterns = {
            #    'EB': [
            #        np.zeros((2,2))
            #    ],
            #    'EE+': [np.zeros((2,2))
            #     ],
            #    'EE-': [
            #        np.zeros((2,2))
            #    ]
            #}
            #flag_empty_regions = patternfiltering.contains_any_pattern(mes_preprocessed, patterns[melabel], mask=loss_mask).astype(bool)
            
            # look for regions different from the reference
            # (for reference we use just 1 and for the difference we use the ratio, so in practice nothing needs to be done,
            #  just define thresholds on the preprocessed MEs)
            binary_loss = np.logical_or(mes_preprocessed < 0.1, mes_preprocessed > 5)
            
            # prepend losses from the previous batch
            # (and keep the last ones of the current batch to prepend to the next batch)
            binary_loss = np.concatenate((prepend_binary_loss_batch, binary_loss))
            prepend_binary_loss_batch = binary_loss[-prepend_size:, :, :]
            
            # define dynamic loss map
            # (i.e. mask out bins that have been lossy for the past n LS)
            half_window_size = 5
            window = np.concatenate((np.zeros(half_window_size+1), np.ones(half_window_size))).astype(float)
            window /= np.sum(window)
            dynamic_loss_mask = sp.ndimage.convolve1d(binary_loss.astype(float), window, axis=0)
            dynamic_loss_mask = (dynamic_loss_mask >= 0.999)
            total_loss_mask = ((loss_mask) & (~dynamic_loss_mask))
                
            # do time correction
            half_window_size = 3
            window = np.concatenate((np.zeros(half_window_size-1), np.ones(half_window_size))).astype(float)
            window /= np.sum(window)
            binary_loss_time_corrected = sp.ndimage.convolve1d(binary_loss.astype(float), window, axis=0)
            binary_loss_time_corrected = (binary_loss_time_corrected >= 0.999)
            
            # flag patterns
            patterns = {
                'EB': [
                    np.ones((1,1))
                ],
                'EE+': [
                    np.ones((1,1))
                ],
                'EE-': [
                    np.ones((1,1))
                ]
            }
            flag_extrema = patternfiltering.contains_any_pattern(binary_loss_time_corrected, patterns[melabel], mask=total_loss_mask).astype(bool)
            # remove the prepended part again, so the flags line up with the current batch
            flag_extrema = flag_extrema[prepend_size:]
            
            # optional: parse loss map to txt (e.g. for later processing with language models...)
            final_loss_map = np.multiply(binary_loss_time_corrected, total_loss_mask)[prepend_size:]
            ids = np.nonzero(final_loss_map)
            time_ids = ids[0]
            spatial_ids = (ids[1], ids[2])
            run_nbs = batch['run_number'].values[time_ids]
            ls_nbs = batch['ls_number'].values[time_ids]
            for idx in range(len(time_ids)):
                infostr = f'ME {melabel}, era {era}, run {run_nbs[idx]}, LS {ls_nbs[idx]}: bin {spatial_ids[0][idx]}, {spatial_ids[1][idx]}'
                batch_txt_results.append(infostr)
            
            # combine criteria
            flags = (
                #(flag_empty_regions) |
                (flag_extrema)
            )
            
            # store flagged lumisections
            flagged_run_numbers_batch = batch['run_number'].values[flags]
            flagged_ls_numbers_batch = batch['ls_number'].values[flags]
            n_unique_runs = len(np.unique(flagged_run_numbers_batch))
            print(f'    -> Found {np.sum(flags.astype(int))} flagged lumisections in {n_unique_runs} runs.')
            
            # add to results
            flagged_run_numbers.append(flagged_run_numbers_batch)
            flagged_ls_numbers.append(flagged_ls_numbers_batch)
            
        # concatenate the results from the batches
        filter_results = {}
        if len(batch_filter_results)>0:
            for key in batch_filter_results[0].keys():
                filter_results[key] = [el for batch_filter_result in batch_filter_results for el in batch_filter_result[key]]
        if len(flagged_run_numbers) > 0:
            flagged_run_numbers = np.concatenate(flagged_run_numbers)
            flagged_ls_numbers = np.concatenate(flagged_ls_numbers)
        else:
            flagged_run_numbers = np.array([])
            flagged_ls_numbers = np.array([])
        
        # add to dict
        results[melabel][era] = {
            'flagged_run_numbers': flagged_run_numbers,
            'flagged_ls_numbers': flagged_ls_numbers,
            'filter_results': filter_results
        }
        
        # optional: write txt output
        outputfile = f'temp_{melabel}_{era}_flagged_bins.txt'
        with open(outputfile, 'w') as f:
            for line in batch_txt_results:
                f.write(line+'\n')
        me_txt_results.extend(batch_txt_results)
                
    # optional: write combined txt output for all eras
    # (written directly from the lines collected above rather than by concatenating files on disk,
    #  so that leftover per-era files from previous runs cannot end up in the combined file)
    outputfile = f'temp_{melabel}_flagged_bins.txt'
    with open(outputfile, 'w') as f:
        for line in me_txt_results:
            f.write(line+'\n')

In [None]:
# print flagged lumisections

# nflags[melabel][era] holds the number of flagged lumisections per ME and era.
# For the totals, the unique (run, LS) pairs over all MEs are collected,
# so that a lumisection (or run) flagged in more than one ME is counted only once.
nflags = {}
flagged_runlumis = set()
for melabel, mename in menames.items():
    print(f'Results for {melabel}:')
    nflags[melabel] = {}
    for era in eras:
        flagged_run_numbers = results[melabel][era]['flagged_run_numbers']
        flagged_ls_numbers = results[melabel][era]['flagged_ls_numbers']
        print(f'  Flagged lumisections in era {era} ({len(flagged_run_numbers)}):')
        for run, ls in zip(flagged_run_numbers, flagged_ls_numbers):
            print(f'    Run {run}, LS {ls}')
            flagged_runlumis.add((int(run), int(ls)))
        nflags[melabel][era] = len(flagged_ls_numbers)
ntotal_ls = len(flagged_runlumis)
ntotal_runs = len({runlumi[0] for runlumi in flagged_runlumis})
print('---')
print('Results for total:')
print(f'  Flagged {ntotal_ls} lumisections in {ntotal_runs} runs.')

In [None]:
# make a plot of the filter results

do_plot_filter = True

# help functions for plotting
def abs_to_frac(x, tot=1):
    """Convert an absolute number to a fraction of tot."""
    return x / tot

def frac_to_abs(x, tot=1):
    """Convert a fraction of tot to an absolute number."""
    return x * tot

# make a table of the number of failing lumisections per era and per filter
# (always done, independent of do_plot_filter, since the table is needed
#  for the fraction of flagged lumisections printed further below)
melabel = list(menames.keys())[0]
nfiltered = {}
for era in eras:
    filter_results = results[melabel][era]['filter_results']

    # encode each (run, LS) pair as a single number run*10000 + LS
    # (assumes LS numbers stay below 10000, so that the encoding is unique)
    filter_results_arrays = {key: np.array([el[0]*10000+el[1] for el in val]) for key, val in filter_results.items()}
    nfiltered[era] = {key: len(val) for key, val in filter_results_arrays.items()}

    # count the unique lumisections failing at least one filter
    # (np.concatenate does not accept an empty list, so an era without filter results is handled separately)
    if len(filter_results_arrays) > 0:
        failed_ls = np.unique(np.concatenate(list(filter_results_arrays.values())))
        nfiltered[era]['total'] = len(failed_ls)
    else:
        nfiltered[era]['total'] = 0

    # make a figure
    #fig, ax = plt.subplots(figsize=(8, 4))
    #ax.bar(nfiltered[era].keys(), nfiltered[era].values())
    #ax.set_xticks(ax.get_xticks())
    #ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', fontsize=15)
    #ax.set_ylabel('Number of failing LS', fontsize=15)
    #ax.grid(which='both', axis='y', color='gray', linestyle='dashed')
    #ax.text(0, 1.03, f'Lumisection preselection for era {era}', transform=ax.transAxes, fontsize=15)
    #plt.show()

# combined
# (sum over the eras; a filter missing for some era simply does not contribute for that era)
nfiltered['total'] = {}
for era in eras:
    for key, val in nfiltered[era].items():
        nfiltered['total'][key] = nfiltered['total'].get(key, 0) + val

if do_plot_filter:
    
    # make a figure
    fig, ax = plt.subplots(figsize=(8, 4))
    labels = list(nfiltered['total'].keys())
    labels = [label.replace('>', '$>$') for label in labels]
    ax.bar(labels, list(nfiltered['total'].values()))
    ax.set_xticks(ax.get_xticks())
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', fontsize=15)
    ax.set_ylabel('Number of failing LS', fontsize=15)
    ax.grid(which='both', axis='y', color='gray', linestyle='dashed')
    ax.text(0, 1.03, f'Lumisection preselection', transform=ax.transAxes, fontsize=15)
    # secondary axis showing the same numbers as a fraction of all lumisections
    secyax = ax.secondary_yaxis('right', functions=(partial(abs_to_frac, tot=nlumis['total']), partial(frac_to_abs, tot=nlumis['total'])))
    secyax.set_ylabel('Fraction of failed LS', fontsize=15)
    plt.show()

In [None]:
# print fraction of flagged LS per era

def format_percentage(numerator, denominator):
    """Format numerator / denominator as a percentage, e.g. ' (12.34 %)'.

    Returns ' (n/a)' if the denominator is zero, e.g. for an era
    in which no lumisection passes the filters.
    """
    if denominator == 0:
        return ' (n/a)'
    return ' ({:.2f} %)'.format(numerator/denominator*100)

tot_nlumis = 0
tot_npass = 0
tot_nflags = 0
melabel = list(menames.keys())[0]
for era in eras:
    this_nlumis = nlumis[era]
    # lumisections passing the filters = all lumisections minus those failing at least one filter
    this_npass = this_nlumis - nfiltered[era]['total']
    this_nflags = nflags[melabel][era]
    tot_nlumis += this_nlumis
    tot_npass += this_npass
    tot_nflags += this_nflags
    print(f'Era {era}:')
    print(f'Flagged {this_nflags} out of {this_nlumis} lumisections in total' + format_percentage(this_nflags, this_nlumis))
    print(f'Flagged {this_nflags} out of {this_npass} lumisections that pass filters' + format_percentage(this_nflags, this_npass))
    
print('-----')
print('Total:')
print(f'Flagged {tot_nflags} out of {tot_nlumis} lumisections in total' + format_percentage(tot_nflags, tot_nlumis))
print(f'Flagged {tot_nflags} out of {tot_npass} lumisections that pass filters' + format_percentage(tot_nflags, tot_npass))

In [None]:
# find out why a given lumisection did not pass the selections

# lumisections to check, as (run number, lumisection number) pairs
test_runlumis = [(386795, lsnb) for lsnb in range(37, 44)]
for runlumi in test_runlumis:
    # collect the names of all filters (over all MEs and eras)
    # for which this lumisection is listed in the filter results
    failkeys = [
        key
        for melabel in menames
        for era in eras
        for key, values in results[melabel][era]['filter_results'].items()
        if tuple(runlumi) in values
    ]
    if failkeys:
        print(f'Lumisection {runlumi} failed the following selections:')
        print(failkeys)
    else:
        print(f'Lumisection {runlumi} not found in filter info, i.e. it did not seem to have failed any of the selections.')

In [None]:
# load data for plotting some random (or not random) examples

# random lumisections
nplot = 0
#random_ids = np.random.choice(len(available_run_numbers), size=min(nplot, len(available_run_numbers)), replace=False)
#selected_run_numbers = available_run_numbers[random_ids]
#selected_ls_numbers = available_ls_numbers[random_ids]
#random_ids = np.random.choice(len(flagged_run_numbers), size=min(nplot, len(flagged_run_numbers)), replace=False)
#selected_run_numbers = flagged_run_numbers[random_ids]
#selected_ls_numbers = flagged_ls_numbers[random_ids]

# alternative: specific selected lumisections
# (given as (run number, lumisection number) pairs, all from the era set here)
era = 'I-v2'
selected_runlumis = [(386851, 97)]
selected_run_numbers = [runlumi[0] for runlumi in selected_runlumis]
selected_ls_numbers = [runlumi[1] for runlumi in selected_runlumis]

if selected_run_numbers:
    
    # load the dataframe and the MEs of the selected lumisections for each ME type
    # note: runs and lumis are overwritten for each ME, so they correspond to the last ME
    print('Loading data...')
    dfs, mes = {}, {}
    for melabel, mename in menames.items():
        dfs[melabel] = iotools.read_lumisections(
            files[era][melabel],
            selected_run_numbers,
            selected_ls_numbers,
            mode='batched'
        )
        mes[melabel], runs, lumis = dftools.get_mes(
            dfs[melabel],
            xbinscolumn='x_bin',
            ybinscolumn='y_bin',
            runcolumn='run_number',
            lumicolumn='ls_number'
        )
    print('Done.')

In [None]:
# plot examples

import studies.ecal_occupancy_2024.plotting.plot_cluster_occupancy
importlib.reload(studies.ecal_occupancy_2024.plotting.plot_cluster_occupancy)
from studies.ecal_occupancy_2024.plotting.plot_cluster_occupancy import plot_cluster_occupancy

if len(selected_run_numbers) > 0:
    
    # preprocess and calculate losses, separately for each ME
    # note: the default local normalization of the preprocessor is used here,
    #       no finer-grained per-run normalization is set in this cell.
    print('Processing...')
    loss_masks = {}
    mes_preprocessed = {}
    binary_losses = {}
    binary_losses_static_masked = {}
    for melabel in mes.keys():
        
        # make (static) loss mask
        # (loaded per ME, since the mask file depends on the ME)
        loss_mask_era = 'C-v1'
        loss_mask_file = f'../preprocessing/normdata/zerofrac_Run2024{loss_mask_era}_{melabel}.npy'
        loss_mask = np.load(loss_mask_file)
        loss_masks[melabel] = (loss_mask < 0.9).astype(bool)
        
        # preprocess
        preprocessor_era = 'C-v1'
        preprocessor = make_default_preprocessor(preprocessor_era, melabel,
                         global_normalization = 'avg',
                         local_normalization = 'avg')
        mes_preprocessed[melabel] = preprocessor.preprocess(dfs[melabel])
        
        # calculate binary loss
        binary_losses[melabel] = np.logical_or(mes_preprocessed[melabel] < 0.1, mes_preprocessed[melabel] > 5)
        
        # do (static) loss masking
        binary_losses_static_masked[melabel] = np.multiply(binary_losses[melabel], loss_masks[melabel][np.newaxis, :, :])
        
    # plot settings shared between all panels
    common_kwargs = dict(
        titlesize=15,
        xaxtitlesize=15, yaxtitlesize=15,
        ticklabelsize=12, colorticklabelsize=12,
        docolorbar=True, caxtitlesize=15
    )
        
    # make the plots
    # (loop over the lumisections that were actually loaded rather than over the requested ones,
    #  presumably these can differ if a requested lumisection is not present in the file)
    print('Plotting...')
    for idx, (run, lumi) in enumerate(zip(runs, lumis)):
        for melabel, mename in menames.items():
            me_orig = mes[melabel][idx, :, :]
            me_preprocessed = mes_preprocessed[melabel][idx, :, :]
            binary_loss = binary_losses[melabel][idx, :, :]
            binary_loss_static_masked = binary_losses_static_masked[melabel][idx, :, :]
    
            # initialize figure
            fig, axs = plt.subplots(ncols=3, nrows=2, figsize=(18, 6), squeeze=False)
            
            # define the panels: position in the figure, data to plot, and panel-specific settings
            panels = [
                # raw data
                ((0, 0), me_orig, dict(
                    title='Raw', caxtitle='Digi occupancy',
                    caxtitleoffset=15)),
                # preprocessed
                ((0, 1), me_preprocessed, dict(
                    title='Input', caxtitle='Digi occupancy\n(normalized)',
                    caxrange=(1e-6,5), caxtitleoffset=30)),
                # binary loss
                ((0, 2), binary_loss, dict(
                    title='Binary loss', caxtitle='Loss',
                    caxrange=(1e-6, 1), caxtitleoffset=30)),
                # binary loss (with static loss mask)
                ((1, 0), binary_loss_static_masked, dict(
                    title='Binary loss (with mask applied)', caxtitle='Loss',
                    caxrange=(1e-6, 1), caxtitleoffset=30))
            ]
            for (row, col), me, panel_kwargs in panels:
                fig, axs[row, col] = plot_cluster_occupancy(me, fig=fig, ax=axs[row, col],
                                       **common_kwargs, **panel_kwargs)
                
            # plot aesthetics
            plt.subplots_adjust(hspace=0.5, wspace=-0.1)
            fig.tight_layout()
            title = f'Run {run}, LS {lumi}, ECAL {melabel}'
            axs[0, 0].text(0.01, 1.3, title, fontsize=15, transform=axs[0, 0].transAxes)
            plt.show()
            plt.close(fig)
            plt.close()