In [None]:
# imports

import os
import sys
import json
import copy
import importlib
import numpy as np

# make the project importable: add the directory three levels above the
# current working directory to the module search path
# (presumably the repository root containing the `studies` package;
#  this relies on the notebook being run from its own directory)
thisdir = os.getcwd()
topdir = os.path.abspath(os.path.join(thisdir, '../../../'))
sys.path.append(topdir)

# project-local imports (must come after the sys.path modification above)
from studies.pixel_clusters_2024.preprocessing.preprocessor import get_metype
# NOTE(review): find_files is not used in the visible cells of this notebook
from studies.pixel_clusters_2024.nmf.nmf_training import find_files
import studies.pixel_clusters_2024.nmf.nmf_testing_pattern as evaltools
# re-import the module in case it was already loaded in this kernel
importlib.reload(evaltools)

In [None]:
# era and layer settings

# eras to process
# (uncomment the entries that should be included)
eras = [
    #'2024B-v1',
    #'2024C-v1',
    #'2024D-v1',
    #'2024E-v1',
    #'2024E-v2',
    #'2024F-v1',
    #'2024F-v1-part1',
    #'2024F-v1-part2',
    #'2024F-v1-part3',
    #'2024F-v1-part4',
    #'2024G-v1',
    #'2024G-v1-part1',
    #'2024G-v1-part2',
    #'2024G-v1-part3',
    #'2024G-v1-part4',
    #'2024H-v1',
    #'2024I-v1',
    #'2024I-v2',
    #'2025B-v1',
    '2025C-v1',
    #'2025C-v2',
    #'2025D-v1',
    #'2025E-v1',
    #'2025F-v1',
    #'2025F-v2',
    #'2025G-v1'
]

# layers to process (BPix1 up to and including BPix4)
layers = [f'BPix{number}' for number in (1, 2, 3, 4)]

# initialize the config with the era and layer selection;
# the following cells add their own settings to this dict
config = {'eras': eras, 'layers': layers}

In [None]:
# set path to input files

# how the input data is obtained:
# - 'filebased': read stored parquet files from disk
# - 'dials': build filters to retrieve the data on the fly from dials (experimental)
inputmode = 'dials'
#dataset = 'ZeroBias'
#reco = 'PromptReco'
dataset = 'StreamExpress'
reco = 'Express'
# monitoring element name; the placeholder is filled with the layer number
mebase = 'PixelPhase1-Phase1_MechanicalView-PXBarrel-clusters_per_SignedModuleCoord_per_SignedLadderCoord_PXLayer_{}'

# default case of using stored input files
# (resulting structure: input_files[era][layer] = [path])
if inputmode=='filebased':

    # settings
    datadir = '/eos/user/l/llambrec/dialstools-output'
    nparts = None # file-based input is not split in parts

    # find files corresponding to settings
    input_files = {}
    for era in eras:
        mainera, version = era.split('-', 1)
        input_files[era] = {}
        for layer in layers:
            # the last character of the layer name is the layer number
            me = mebase.format(layer[-1])
            f = f'{dataset}-Run{mainera}-{reco}-{version}-DQMIO-{me}.parquet'
            f = os.path.join(datadir, f)
            input_files[era][layer] = [f]

    # existence check
    missing = []
    present = []
    for era, values in input_files.items():
        for layer, files in values.items():
            for f in files:
                if not os.path.exists(f): missing.append(f)
                else: present.append(f)
    if len(missing) > 0:
        raise Exception(f'The following files do not exist: {missing}')
    else: print(f'Found {len(present)} files.')

# alternative mode of getting input on the fly from dials
# (experimental)
# (resulting structure: input_files[era][part][layer] = [filter dict])
elif inputmode=='dials':

    # settings
    nparts = 10

    # make dials filters corresponding to settings
    input_files = {}
    for era in eras:
        mainera, version = era.split('-', 1)
        input_files[era] = {}
        for part in range(nparts):
            input_files[era][part] = {}
            for layer in layers:
                me = mebase.format(layer[-1])
                dfilter = {
                    'dataset': f'/{dataset}/Run{mainera}-{reco}-{version}/DQMIO',
                    'me': me.replace('-', '/'),
                    'part': part,
                    'nparts': nparts
                }
                input_files[era][part][layer] = [dfilter]

# fail early on a typo in the input mode,
# instead of on an undefined variable further down
else:
    raise Exception(f'Input mode {inputmode} not recognized.')

# add to config
config['input_files'] = input_files
print(json.dumps(input_files, indent=2))

In [None]:
# set path to nmf model files

modeldir = 'output_20250714_consolidation/models'

# choose which model era is applied to each input era
# options:
# - current era for each input file (e.g. for offline application)
# - previous era for each input file (e.g. for online application)
# - one and the same era for each input file

# use current era
#model_eradict = {era: era for era in eras}

# use previous era
#model_eradict = {}
#for i in range(len(eras)): model_eradict[eras[i]] = eras[i-1]
#model_eradict[eras[0]] = eras[1]

# use a fixed era
# NOTE(review): 'C-v1' carries no year prefix, unlike the era names used elsewhere;
#               confirm that this matches the naming of the stored model files.
model_eradict = dict.fromkeys(eras, 'C-v1')

# build the model file path for each era and layer
nmf_files = {
    era: {
        layer: os.path.join(modeldir, f'nmf_model_{layer.upper()}_{model_eradict[era]}.pkl')
        for layer in layers
    }
    for era in eras
}

# make sure all model files are present before continuing
missing = [
    modelfile
    for era in eras
    for modelfile in nmf_files[era].values()
    if not os.path.exists(modelfile)
]
if missing:
    raise Exception(f'The following files do not exist: {missing}')

# add to config
config['nmf_files'] = nmf_files
print(json.dumps(nmf_files, indent=2))

In [None]:
# settings for preprocessing

# local normalization per era:
# 2024 eras use the 2024C-v1 average, all other eras the 2025B-v1 average
local_norm_eradict = {
    era: ('avg_era_2024C-v1' if '2024' in era else 'avg_era_2025B-v1')
    for era in eras
}
# alternative: same normalization for all eras
#local_norm_eradict = {era: 'avg_era_2024C-v1' for era in eras}

# add to config
config.update({
    'preprocessing_global_normalization': 'avg',
    'preprocessing_local_normalization': local_norm_eradict
})
print(json.dumps(local_norm_eradict, indent=2))

In [None]:
# settings for filtering

# min entries filter
# (0.5e6 for the first layer, divided by the layer number for the other layers)
min_entries_filter = {f'BPix{number}': 0.5e6/number for number in (1, 2, 3, 4)}

# set OMS filters
# (either just an attribute name, or an attribute name with an operator and a value)
oms_filters = [[flag] for flag in (
    'beams_stable',
    'cms_active',
    'bpix_ready',
    'fpix_ready',
    'tibtid_ready',
    'tob_ready',
    'tecp_ready',
    'tecm_ready'
)]
oms_filters.append(['pileup', '>', 25])

# set path to files with OMS info
oms_filter_files = {}
for era in eras:
    # eras split in parts use the info of the full era
    oms_era = era.split('-part')[0]
    #oms_filter_files[era] = f'/eos/user/l/llambrec/pixelae/studies/pixel_clusters_2024/omsdata/omsdata_Run{oms_era}.json'
    oms_filter_files[era] = None # retrieve on the fly
# existence check (only for eras where a file was specified)
for omsfile in oms_filter_files.values():
    if omsfile is not None and not os.path.exists(omsfile):
        raise Exception(f'File {omsfile} does not exist.')

# set HLT rate filters
hltrate_filters = [['HLT_ZeroBias_v*', '>', 5]]

# set path to files with HLT rate info
hltrate_filter_files = {}
for era in eras:
    # eras split in parts use the info of the full era
    hltrate_era = era.split('-part')[0]
    #hltrate_filter_files[era] = f'/eos/user/l/llambrec/pixelae/studies/pixel_clusters_2024/omsdata/hltrate_Run{hltrate_era}.json'
    hltrate_filter_files[era] = None # retrieve on the fly
# existence check (only for eras where a file was specified)
for hltratefile in hltrate_filter_files.values():
    if hltratefile is not None and not os.path.exists(hltratefile):
        raise Exception(f'File {hltratefile} does not exist.')

# add to config
config.update({
    'min_entries_filter': min_entries_filter,
    'oms_filter_files': oms_filter_files,
    'oms_filters': oms_filters,
    'hltrate_filter_files': hltrate_filter_files,
    'hltrate_filters': hltrate_filters
})

In [None]:
# set path to automask data

do_automasking = False

# the automask data file is only resolved (and required to exist)
# if automasking is switched on; otherwise it stays None
automask_data_file = None
if do_automasking:
    automask_data_dir = '/eos/user/l/llambrec/pixelae/automasking/data/automask_data'
    automask_data_file = os.path.join(automask_data_dir, 'automask_2024.json')
    if not os.path.exists(automask_data_file):
        raise Exception(f'The automask data file {automask_data_file} does not exist.')

# add to config
# (done once here for both cases, instead of also inside the branch above)
config['do_automasking'] = do_automasking
config['automask_data_file'] = automask_data_file

In [None]:
# set path to loss mask data

do_loss_masking = True
loss_masking_zero_frac_threshold = 0.9

# set up how to match eras
# options:
# - current era for each input file (e.g. for offline application)
# - previous era for each input file (e.g. for online application)
# - one and the same era for each input file

# use current era
#loss_mask_eradict = {era: era for era in eras}

# use previous era
#loss_mask_eradict = {}
#for i in range(len(eras)): loss_mask_eradict[eras[i]] = eras[i-1]
#loss_mask_eradict[eras[0]] = eras[1]

# use a fixed era
loss_mask_eradict = {}
for era in eras: loss_mask_eradict[era] = '2024C-v1' if '2024' in era else '2025B-v1'

# set path
# (stays None if loss masking is switched off)
loss_masking_zero_frac_files = None
if do_loss_masking:
    loss_masking_zero_frac_files = {}
    for era in eras:
        # the era dict is keyed by the full era name (including a potential '-part' suffix),
        # so only use the stripped name as key if it is actually present;
        # looking up the stripped name unconditionally gives a KeyError for eras split in parts.
        base_era = era.split('-part')[0]
        lookup_era = base_era if base_era in loss_mask_eradict else era
        # strip a potential '-part' suffix from the matched era as well,
        # as eras split in parts use the file of the full era
        loss_mask_era = loss_mask_eradict[lookup_era].split('-part')[0]
        loss_masking_zero_frac_files[era] = {}
        for layer in layers:
            zerofrac_file = f'/eos/user/l/llambrec/pixelae/studies/pixel_clusters_2024/preprocessing/normdata/zerofrac_Run{loss_mask_era}_{get_metype(layer)}.npy'
            loss_masking_zero_frac_files[era][layer] = zerofrac_file

# existence check
# (skipped if loss masking is switched off, as there are no files to check in that case)
missing = []
if loss_masking_zero_frac_files is not None:
    for era in eras:
        for layer, f in loss_masking_zero_frac_files[era].items():
            if not os.path.exists(f): missing.append(f)
if len(missing) > 0:
    raise Exception(f'The following files do not exist: {missing}')

# add to config
config['do_loss_masking'] = do_loss_masking
config['loss_masking_zero_frac_threshold'] = loss_masking_zero_frac_threshold
config['loss_masking_zero_frac_files'] = loss_masking_zero_frac_files
print(json.dumps(loss_masking_zero_frac_files, indent=2))

In [None]:
# settings for dynamic loss masking

# switch and window size for dynamic loss masking
do_dynamic_loss_masking = False
dynamic_loss_masking_window = 100

# add to config
config.update({
    'do_dynamic_loss_masking': do_dynamic_loss_masking,
    'dynamic_loss_masking_window': dynamic_loss_masking_window
})

In [None]:
# set evaluation parameters

# general
# batch_size: either an int (for fixed size batches) or "run" (for run-based batches)
batch_size = 'run'
# target_batch_size: approximate batch size for run-based batches
#                    (ignored for fixed size batches)
target_batch_size = 3000

# flagging
# (patterns are all-ones arrays of the given shapes)
flagging_patterns = [np.ones(shape) for shape in ((1, 8), (2, 4))]
flagging_threshold = 1e-3

# clustering
# (patterns are stored as nested lists so they can be written to json directly)
pattern_thresholds = [
    {"loss_threshold": 0.04, "pattern": np.ones((2, 16)).tolist(), "filter_threshold": 1.5},
    #{"loss_threshold": 0.01, "pattern": np.ones((2, 24)).tolist(), "filter_threshold": 1.5},
    #{"loss_threshold": 0.005, "pattern": np.ones((4, 24)).tolist(), "filter_threshold": 3.5},
    #{"loss_threshold": 0.002, "pattern": np.ones((8, 24)).tolist(), "filter_threshold": 3.5},
]

# add to config
# (numpy arrays are not json serializable, so convert the flagging patterns to lists)
config.update({
    'batch_size': batch_size,
    'target_batch_size': target_batch_size,
    'flagging_patterns': [pattern.tolist() for pattern in flagging_patterns],
    'flagging_threshold': flagging_threshold,
    'pattern_thresholds': pattern_thresholds
})

In [None]:
# set directory for this config

outputdir = 'output_test_part3'

# refuse to continue if the directory is already present
if os.path.exists(outputdir):
    raise Exception(f'Output directory {outputdir} already exists.')

In [None]:
# set output file

# the output file is put in the output directory,
# under a hard-coded absolute path to the nmf study directory.
# note: cannot use getcwd() to define this absolute path,
#       as it seems to give some kind of virtual directory that is ok in the notebook
#       but gives errors when used in the lxplus terminal or condor job.
outputfile = os.path.join(
    '/eos/user/l/llambrec/pixelae/studies/pixel_clusters_2024/nmf',
    outputdir,
    'flagged_lumisections.json'
)

# add to config
config['outputfile'] = outputfile

In [None]:
# make the configuration

# if split_per_era is True, one config file is written per era
# (and per part, for input that is split in parts);
# else a single config file with all eras is written.
split_per_era = True
basename = 'temp_config'

os.makedirs(outputdir, exist_ok=True)

if split_per_era:
    # split off the extension once, so that the era/part tag can be inserted in front of it
    outputbase, outputext = os.path.splitext(outputfile)
    for era in eras:
        # dials input is structured as era -> part -> layer,
        # while file-based input is structured as era -> layer (no parts);
        # use a single dummy part (None) in the latter case.
        if inputmode=='dials': parts = list(range(nparts))
        else: parts = [None]
        for part in parts:
            if part is None:
                tag = f'_{era}'
                this_input_files = input_files[era]
            else:
                tag = f'_{era}_part{part}'
                this_input_files = input_files[era][part]
            # restrict all era-dependent settings to the current era (and part)
            this_config = copy.deepcopy(config)
            this_config['eras'] = [era]
            this_config['input_files'] = {era: this_input_files}
            this_config['nmf_files'] = {era: nmf_files[era]}
            this_config['oms_filter_files'] = {era: oms_filter_files[era]}
            this_config['hltrate_filter_files'] = {era: hltrate_filter_files[era]}
            this_config['outputfile'] = f'{outputbase}{tag}{outputext}'
            if this_config['loss_masking_zero_frac_files'] is not None:
                this_config['loss_masking_zero_frac_files'] = {era: loss_masking_zero_frac_files[era]}
            configfile = os.path.join(outputdir, f'{basename}{tag}.json')
            with open(configfile, 'w') as f:
                json.dump(this_config, f, indent=2)

else:
    configfile = os.path.join(outputdir, f'{basename}.json')
    with open(configfile, 'w') as f:
        json.dump(config, f, indent=2)