In [None]:
# imports

# external modules
import os
import sys
import json
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import importlib

# local modules
# Make the top-level directory of the project importable:
# `thisdir` is the directory the notebook is run from, and `topdir` is the
# directory three levels above it. Appending `topdir` to `sys.path` must happen
# before the local imports below (presumably this is where the `tools` and
# `studies` packages live -- TODO confirm).
# NOTE: this relies on the kernel's working directory being the notebook directory.
thisdir = os.getcwd()
topdir = os.path.abspath(os.path.join(thisdir, '../../../'))
sys.path.append(topdir)

import tools.iotools as iotools
import tools.dftools as dftools
import tools.patternfiltering as patternfiltering
from tools.dataloadertools import MEDataLoader
import studies.ecal_occupancy_2024.preprocessing.preprocessor
importlib.reload(studies.ecal_occupancy_2024.preprocessing.preprocessor)
from studies.ecal_occupancy_2024.preprocessing.preprocessor import make_default_preprocessor
from studies.ecal_occupancy_2024.plotting.plot_cluster_occupancy import plot_cluster_occupancy

In [None]:
# settings

# input location and data-taking period
datadir = '/eos/user/l/llambrec/dialstools-output-test'
year = '2024'

# eras to process, labelled as '<main era>-<version>'
# (commented-out entries are currently disabled)
eras = [
    'B-v1',
    'C-v1',
    'D-v1',
    'E-v1',
    'E-v2',
    #'F-v1',
    #'G-v1',
    #'H-v1',
    #'I-v1',
    #'I-v2'
]

# dataset and reconstruction type
dataset = 'ZeroBias'
reco = 'PromptReco'

# monitoring elements to process, as {short label: full name}
# (commented-out entries are currently disabled)
menames = {
    'EB': 'EcalBarrel-EBOccupancyTask-EBOT digi occupancy',
    #'EE+': 'EcalEndcap-EEOccupancyTask-EEOT digi occupancy EE +',
    #'EE-': 'EcalEndcap-EEOccupancyTask-EEOT digi occupancy EE -'
}

# find files corresponding to settings:
# files[era][melabel] holds the path of the parquet file for that combination;
# the era label is split on its first dash into the main era and the version.
files = {}
for era in eras:
    mainera, version = era.split('-', 1)
    files[era] = {
        melabel: os.path.join(
            datadir,
            f'{dataset}-Run{year}{mainera}-{reco}-{version}-DQMIO-{mename}.parquet'
        )
        for melabel, mename in menames.items()
    }

# existence check:
# collect every expected input file that is not present on disk
# and abort with the full list if there is at least one.
missing = [
    path
    for files_per_era in files.values()
    for path in files_per_era.values()
    if not os.path.exists(path)
]
if missing:
    raise Exception(f'The following files do not exist: {missing}')

In [None]:
# set path to OMS files

# omsfiles[era] holds the path of the OMS json file for that era.
# A missing file only triggers a warning here; the path is stored regardless.
# NOTE(review): the full era label (including the version, e.g. 'B-v1') ends up
# in the file name -- confirm that this matches the files on disk.
basedir = '../omsdata'
omsfiles = {}
for era in eras:
    omsfiles[era] = os.path.join(basedir, f'omsdata_Run{year}{era}.json')
    omsfile = omsfiles[era]
    if os.path.exists(omsfile):
        continue
    print(f'WARNING: OMS file {omsfile} does not exist.')

In [None]:
from studies.pixel_clusters_2024.nmf.nmf_testing_pattern import filter_dfs # make local equivalent later

# loop over eras and mes
#
# For every monitoring element (me) and era, the data are read in sequential
# batches, filtered (minimum number of entries + OMS flags), preprocessed, and
# checked against two flagging criteria:
#   1) empty regions in the preprocessed me,
#   2) regions that deviate strongly from the reference for several
#      consecutive instances.
# The run and lumisection numbers of flagged instances are stored in
# results[melabel][era], together with the concatenated filter results.

# selection settings (identical for all mes, eras and batches)
batch_size = 10000
min_entries = {'EB': 25e3, 'EE+': 2e3, 'EE-': 2e3}
oms_filters = [
    ['beams_stable'],
    ['cms_active'],
    ['ebm_ready'],
    ['ebp_ready'],
    ['eem_ready'],
    ['eep_ready'],
    ['esm_ready'],
    ['esp_ready'],
]

# reference era for the preprocessor and for the loss mask
preprocessor_era = 'C-v1'
loss_mask_era = 'C-v1'

# number of consecutive instances (along the first axis of the batch)
# that must all be flagged for the extrema criterion
window_size = 3

# patterns to look for: 2x2 blocks of zeros in the preprocessed mes ...
empty_region_patterns = {
    'EB': [np.zeros((2,2))],
    'EE+': [np.zeros((2,2))],
    'EE-': [np.zeros((2,2))],
}
# ... and 2x2 blocks of ones in the binary loss
extrema_patterns = {
    'EB': [np.ones((2,2))],
    'EE+': [np.ones((2,2))],
    'EE-': [np.ones((2,2))],
}

results = {}
for melabel, mename in menames.items():
    results[melabel] = {}

    # make loss mask
    # (depends only on the me, so it is loaded once per me instead of once per era;
    #  bins with a value of 0.9 or more in the zerofrac file are masked out --
    #  presumably this is the fraction of instances in which the bin is empty, TODO confirm)
    loss_mask_file = f'../preprocessing/normdata/zerofrac_Run2024{loss_mask_era}_{melabel}.npy'
    loss_mask = (np.load(loss_mask_file) < 0.9)

    for era in eras:
        print(f'Now processing me {mename}, era {era}...')

        # make dataloader and preprocessor
        # (the preprocessor is deliberately re-created per era, as before,
        #  since it is not visible here whether it keeps internal state)
        dataloader = MEDataLoader([files[era][melabel]])
        preprocessor = make_default_preprocessor(preprocessor_era, melabel,
                     global_normalization = 'avg',
                     local_normalization = 'avg')

        # other initializations
        flagged_run_numbers = []
        flagged_ls_numbers = []
        batch_filter_results = []

        # load OMS data
        with open(omsfiles[era]) as f:
            oms_info = json.load(f)

        # run over batches
        batch_params = dataloader.prepare_sequential_batches(batch_size=batch_size)
        print(f'Will process {len(batch_params)} batches of size {batch_size}')
        for batch_idx, batch_param in enumerate(batch_params):
            batch = dataloader.read_sequential_batch(batch_param)
            # note: use a separate name, so the configured batch_size is not overwritten
            n_loaded = len(batch)
            print(f'  Loaded batch {batch_idx+1} / {len(batch_params)} with size {n_loaded}')

            # filtering
            fdict = {melabel: batch}
            mask, filter_results = filter_dfs(fdict,
                                     min_entries_filter = min_entries,
                                     oms_info = oms_info,
                                     oms_filters = oms_filters)
            # keep the filter results of every batch,
            # also of batches in which no instance passes the filters
            batch_filter_results.append(filter_results)
            mask = mask.astype(bool)
            batch = batch[mask]
            batch_size_filtered = len(batch)
            print(f'  Found {batch_size_filtered} / {n_loaded} instances passing filters.')
            if batch_size_filtered==0: continue

            # preprocessing
            mes_preprocessed = preprocessor.preprocess(batch)

            # flagging criterion: look for empty regions
            flag_empty_regions = patternfiltering.contains_any_pattern(
                mes_preprocessed,
                empty_region_patterns[melabel],
                mask=loss_mask).astype(bool)

            # flagging criterion: look for regions different from the reference
            # (for reference we use just 1 and for the difference we use the ratio, so in practice nothing needs to be done,
            #  just define thresholds on the preprocessed MEs)
            binary_loss = np.logical_or(mes_preprocessed < 0.2, mes_preprocessed > 5)
            # do time correction: keep a bin only if it is flagged in all instances
            # of the window along the first axis. The number of flagged instances is
            # counted in integers, so the comparison is exact and does not depend on
            # floating-point rounding or on the dtype of the convolution output.
            n_flagged_in_window = sp.ndimage.convolve1d(
                binary_loss.astype(int),
                np.ones(window_size, dtype=int),
                axis=0)
            binary_loss = (n_flagged_in_window == window_size)
            flag_extrema = patternfiltering.contains_any_pattern(
                binary_loss,
                extrema_patterns[melabel],
                mask=loss_mask).astype(bool)

            # todo: add more

            # combine criteria
            flags = (
                (flag_empty_regions) |
                (flag_extrema)
            )

            # store flagged lumisections
            flagged_run_numbers_batch = batch['run_number'].values[flags]
            flagged_ls_numbers_batch = batch['ls_number'].values[flags]
            n_unique_runs = len(np.unique(flagged_run_numbers_batch))
            print(f'    -> Found {np.sum(flags.astype(int))} flagged lumisections in {n_unique_runs} runs.')

            # add to results
            flagged_run_numbers.append(flagged_run_numbers_batch)
            flagged_ls_numbers.append(flagged_ls_numbers_batch)

        # concatenate the results from the batches
        # (assumes that the filter results of each batch hold one list-like per key,
        #  as the list concatenation used here before did as well)
        filter_results = {}
        if len(batch_filter_results)>0:
            for key in batch_filter_results[0].keys():
                filter_results[key] = [
                    item
                    for batch_filter_result in batch_filter_results
                    for item in batch_filter_result[key]
                ]
        if len(flagged_run_numbers) > 0:
            flagged_run_numbers = np.concatenate(flagged_run_numbers)
            flagged_ls_numbers = np.concatenate(flagged_ls_numbers)
        else:
            # integer dtype, so empty results do not default to float arrays
            flagged_run_numbers = np.array([], dtype=int)
            flagged_ls_numbers = np.array([], dtype=int)

        # add to dict
        results[melabel][era] = {
            'flagged_run_numbers': flagged_run_numbers,
            'flagged_ls_numbers': flagged_ls_numbers,
            'filter_results': filter_results,
        }

In [None]:
# print flagged lumisections

# One header line per monitoring element, one summary line per era
# (with the number of flagged lumisections), followed by one line
# per flagged (run, lumisection) pair.
for melabel in menames.keys():
    print(f'Results for {melabel}:')
    for era in eras:
        era_results = results[melabel][era]
        flagged_run_numbers = era_results['flagged_run_numbers']
        flagged_ls_numbers = era_results['flagged_ls_numbers']
        n_flagged = len(flagged_run_numbers)
        print(f'  Flagged lumisections in era {era} ({n_flagged}):')
        for run, ls in zip(flagged_run_numbers, flagged_ls_numbers):
            print(f'    Run {run}, LS {ls}')

In [None]:
# load data for plotting some random (or not random) examples

# option 1 (currently disabled): random lumisections,
# drawn either from all available or from the flagged lumisections
nplot = 0
#random_ids = np.random.choice(len(available_run_numbers), size=min(nplot, len(available_run_numbers)), replace=False)
#selected_run_numbers = available_run_numbers[random_ids]
#selected_ls_numbers = available_ls_numbers[random_ids]
#random_ids = np.random.choice(len(flagged_run_numbers), size=min(nplot, len(flagged_run_numbers)), replace=False)
#selected_run_numbers = flagged_run_numbers[random_ids]
#selected_ls_numbers = flagged_ls_numbers[random_ids]

# option 2: specific selected lumisections, given as (run number, ls number) pairs
era = 'E-v1'
selected_runlumis = [(381151, 320)]
selected_run_numbers = [run_number for (run_number, ls_number) in selected_runlumis]
selected_ls_numbers = [ls_number for (run_number, ls_number) in selected_runlumis]

if selected_run_numbers:

    # read the selected lumisections for each monitoring element and
    # convert them to arrays; note that `runs` and `lumis` are overwritten
    # in every iteration, so they correspond to the last monitoring element.
    print('Loading data...')
    dfs = {}
    mes = {}
    for melabel, mename in menames.items():
        df_selected = iotools.read_lumisections(
            files[era][melabel],
            selected_run_numbers,
            selected_ls_numbers,
            mode='batched')
        dfs[melabel] = df_selected
        mes[melabel], runs, lumis = dftools.get_mes(
            df_selected,
            xbinscolumn='x_bin',
            ybinscolumn='y_bin',
            runcolumn='run_number',
            lumicolumn='ls_number')

In [None]:
# plot examples

import studies.ecal_occupancy_2024.plotting.plot_cluster_occupancy
importlib.reload(studies.ecal_occupancy_2024.plotting.plot_cluster_occupancy)
from studies.ecal_occupancy_2024.plotting.plot_cluster_occupancy import plot_cluster_occupancy

if len(selected_run_numbers) > 0:

    # reference era for the loss mask and for the preprocessor
    loss_mask_era = 'C-v1'
    preprocessor_era = 'C-v1'

    # preprocess and calculate the binary loss for each monitoring element
    print('Processing...')
    mes_preprocessed = {}
    binary_losses = {}
    for melabel in mes.keys():

        # make loss mask
        # (loaded inside the loop, so each monitoring element gets its own mask
        #  instead of the mask of whatever `melabel` was left over from an earlier cell)
        loss_mask_file = f'../preprocessing/normdata/zerofrac_Run2024{loss_mask_era}_{melabel}.npy'
        loss_mask = (np.load(loss_mask_file) < 0.9)

        # preprocess
        preprocessor = make_default_preprocessor(preprocessor_era, melabel,
                         global_normalization = 'avg',
                         local_normalization = 'avg')
        mes_preprocessed[melabel] = preprocessor.preprocess(dfs[melabel])

        # calculate binary loss: bins far below or far above the reference value,
        # restricted to the bins kept by the loss mask
        binary_loss = np.logical_or(mes_preprocessed[melabel] < 0.2, mes_preprocessed[melabel] > 5)
        binary_losses[melabel] = np.multiply(binary_loss, loss_mask[np.newaxis, :, :])

    # plot settings shared by all panels
    common_plot_args = dict(
        titlesize=15,
        xaxtitlesize=15, yaxtitlesize=15,
        ticklabelsize=12, colorticklabelsize=12,
        docolorbar=True,
        caxtitlesize=15)

    # make the plots
    # (loop over the lumisections that were actually loaded, rather than over the
    #  requested ones, so a lumisection that was not found cannot cause an IndexError)
    print('Plotting...')
    for idx, (run, lumi) in enumerate(zip(runs, lumis)):
        for melabel in menames.keys():

            # panels from left to right: raw data, preprocessed data, binary loss
            panels = [
                (mes[melabel][idx, :, :],
                 dict(title='Raw', caxtitle='Digi occupancy',
                      caxtitleoffset=15)),
                (mes_preprocessed[melabel][idx, :, :],
                 dict(title='Input', caxtitle='Digi occupancy\n(normalized)',
                      caxrange=(1e-6,2), caxtitleoffset=30)),
                (binary_losses[melabel][idx, :, :],
                 dict(title='Binary loss', caxtitle='Loss',
                      caxrange=(1e-6, 1), caxtitleoffset=30)),
            ]

            # initialize figure
            fig, axs = plt.subplots(ncols=len(panels), nrows=1, figsize=(18, 6), squeeze=False)

            # fill the panels
            for panel_idx, (hist, panel_args) in enumerate(panels):
                fig, axs[0, panel_idx] = plot_cluster_occupancy(hist,
                       fig=fig, ax=axs[0, panel_idx],
                       **common_plot_args, **panel_args)

            # plot aesthetics
            plt.subplots_adjust(wspace=0.55)
            title = f'Run {run}, LS {lumi}, ECAL {melabel}'
            axs[0, 0].text(0.01, 1.3, title, fontsize=15, transform=axs[0, 0].transAxes)
            plt.show()
            # close this specific figure to free its memory
            plt.close(fig)