In [None]:
# import external modules
import os
import sys
import json
import joblib
import numpy as np
import matplotlib.pyplot as plt
import importlib

# import PixelNMF
# (the 'mlserver-model' subdirectory of the current working directory is appended
#  to sys.path before the import; presumably pixelnmf.py is located there)
# note: thisdir is the current working directory, so the paths below depend on
#       where the notebook kernel was started.
thisdir = os.getcwd()
mlserverdir = os.path.join(thisdir, 'mlserver-model')
sys.path.append(mlserverdir)
from pixelnmf import PixelNMF

# import other tools (local)
# (topdir is four levels above the working directory; it is appended to sys.path
#  before importing from the 'tools' and 'studies' packages)
topdir = os.path.abspath(os.path.join(thisdir, '../../../..'))
sys.path.append(topdir)
from tools.iotools import read_parquet
from tools.dftools import get_mes
from studies.pixel_clusters_2024.plotting.plot_cluster_occupancy import plot_cluster_occupancy
from studies.pixel_clusters_2024.nmf.nmf_testing_pattern import filter_dfs

In [None]:
# set monitoring element names
# (these names are used as dictionary keys throughout the notebook,
#  and their last character is used as the layer number in file names)
# todo: better distinction between names and short tags

menames = [f'BPix{layer}' for layer in range(1, 5)]

In [None]:
# set path to nmf model files
# (relative path, resolved against the current working directory by os.path.abspath)

modeldir = '../output_20250714_consolidation/models'

# set path
# (one model file per monitoring element; the file name uses the upper-case element name)
nmf_files = {
    mename: os.path.abspath(os.path.join(modeldir, f'nmf_model_{mename.upper()}_C-v1.pkl'))
    for mename in menames
}

# existence check
# (fail early, with a specific exception type, if any model file is missing;
#  FileNotFoundError is still caught by a generic 'except Exception')
missing = [f for f in nmf_files.values() if not os.path.exists(f)]
if len(missing) > 0:
    raise FileNotFoundError(f'The following files do not exist: {missing}')
print(json.dumps(nmf_files, indent=2))

In [None]:
# make local norms
# (one .npy file per monitoring element, loaded into a dict keyed by element name)

normdata_dir = '../../preprocessing/normdata'

# set path
# (the file tag is 'PXLayer_' followed by the last character of the element name)
local_norm_files = {
    mename: os.path.join(normdata_dir, f'avgme_Run2024C-v1_PXLayer_{mename[-1]}.npy')
    for mename in menames
}
print(json.dumps(local_norm_files, indent=2))

# load
local_norms = {mename: np.load(local_norm_files[mename]) for mename in menames}

In [None]:
# make loss masks
# (one boolean array per monitoring element, derived from the 'zerofrac' .npy files)

# set path
# (same directory and tag convention as the local norm files)
loss_masking_files = {
    mename: os.path.join(normdata_dir, f'zerofrac_Run2024C-v1_PXLayer_{mename[-1]}.npy')
    for mename in menames
}
print(json.dumps(loss_masking_files, indent=2))

# load
# (the mask is True where the loaded value is below 0.9;
#  presumably the loaded value is a per-bin fraction of zeros -- to be confirmed)
loss_masks = {}
for mename in menames:
    zerofrac = np.load(loss_masking_files[mename])
    loss_masks[mename] = (zerofrac < 0.9).astype(bool)

In [None]:
# make a PixelNMF instance
# (built from the model file paths, with the local norms and loss masks
#  passed as dicts keyed by monitoring element name)

pnmf = PixelNMF(
    nmf_files,
    local_norms = local_norms,
    loss_masks = loss_masks
)

In [None]:
# load some dummy data
# (one dataframe per monitoring element, read from parquet files)

# settings
# note: datadir is a hard-coded absolute path
datadir = '/eos/user/l/llambrec/dialstools-output'
year = '2024'
dataset = 'ZeroBias'
reco = 'PromptReco'
era = 'B-v1'
mebase = 'PixelPhase1-Phase1_MechanicalView-PXBarrel-clusters_per_SignedModuleCoord_per_SignedLadderCoord_PXLayer_{}'

# find files corresponding to settings
# (the era is split into its main part and its version, e.g. 'B-v1' -> 'B' and 'v1';
#  the layer number is the last character of the monitoring element name)
X = {}
mainera, version = era.split('-', 1)
for mename in menames:
    me = mebase.format(mename[-1])
    filename = f'{dataset}-Run{year}{mainera}-{reco}-{version}-DQMIO-{me}.parquet'
    X[mename] = read_parquet(
        os.path.join(datadir, filename),
        verbose=False, batch_size=3000, batch_ids=[0]
    )

# print run numbers
# (taken from the dataframe of the first monitoring element)
first_df = X[menames[0]]
runs = np.unique(first_df['run_number'].values)
print(runs)

In [None]:
# filter data
# (not strictly needed but just to make valid comparison to earlier results)
# (note: not yet synchronized with earlier results, only use min_entries_filter for now)
# NOTE(review): the OMS and HLT rate filters are also passed to filter_dfs below;
#               confirm whether the note above is still accurate.

# min entries filter
# (threshold per monitoring element: the base value divided by the layer number)
min_entries_base = 0.5e6
min_entries_filter = {
    'BPix1': min_entries_base,
    'BPix2': min_entries_base/2,
    'BPix3': min_entries_base/3,
    'BPix4': min_entries_base/4
}

# OMS attribute filters
# (each filter is a list: a single attribute name, or [attribute, operator, value])
oms_info_file = f'/eos/user/l/llambrec/pixelae/studies/pixel_clusters_2024/omsdata/omsdata_Run2024{era}.json'
with open(oms_info_file, 'r') as infile:
    oms_info = json.load(infile)
oms_flags = [
    "beams_stable",
    "cms_active",
    "bpix_ready",
    "fpix_ready",
    "tibtid_ready",
    "tob_ready",
    "tecp_ready",
    "tecm_ready"
]
oms_filters = [[flag] for flag in oms_flags]
oms_filters.append(["pileup", '>', 25])

# HLT rate filter
hltrate_info_file = f'/eos/user/l/llambrec/pixelae/studies/pixel_clusters_2024/omsdata/hltrate_Run2024{era}.json'
with open(hltrate_info_file, 'r') as infile:
    hltrate_info = json.load(infile)
hltrate_filters = [
    ["HLT_ZeroBias_v*", '>', 5]
]

# apply the filters
# (the same mask is applied to the dataframe of every monitoring element;
#  note that X is overwritten in place, so the unfiltered dataframes are lost)
filter_kwargs = dict(
    min_entries_filter = min_entries_filter,
    oms_info = oms_info,
    oms_filters = oms_filters,
    hltrate_info = hltrate_info,
    hltrate_filters = hltrate_filters
)
ndf = len(X[menames[0]])
mask, _ = filter_dfs(X, **filter_kwargs)
for mename in menames:
    X[mename] = X[mename][mask]
ndfnew = len(X[menames[0]])
print(f'    Found {ndfnew} / {ndf} instances passing filters.')

In [None]:
# extract np arrays from dataframes
# (only the first of the three values returned by get_mes is kept)

# column names used to read the dataframes
column_kwargs = dict(
    xbinscolumn='x_bin', ybinscolumn='y_bin',
    runcolumn='run_number', lumicolumn='ls_number'
)

X_data = {}
for mename in menames:
    mes, _, _ = get_mes(X[mename], **column_kwargs)
    X_data[mename] = mes

In [None]:
# run the model

# temp for quick testing
# (reload the pixelnmf module and rebuild the instance,
#  so that edits to the module are picked up without restarting the kernel)
import pixelnmf
importlib.reload(pixelnmf)
from pixelnmf import PixelNMF
pnmf = PixelNMF(
    nmf_files,
    local_norms=local_norms,
    loss_masks=loss_masks
)

# predict and print the number of flags, followed by the sum of the flags cast to int
flags = pnmf.predict(X_data, verbose=True)
n_instances = len(flags)
n_flagged = np.sum(flags.astype(int))
print(n_instances)
print(n_flagged)

In [None]:
# run the model step by step and plot intermediate outputs
# (for debugging)

# temp for quick testing
# (reload the pixelnmf module and rebuild the instance,
#  so that edits to the module are picked up without restarting the kernel)
import pixelnmf
importlib.reload(pixelnmf)
from pixelnmf import PixelNMF
pnmf = PixelNMF(nmf_files, local_norms=local_norms, loss_masks=loss_masks)

# select small data (single instance)
# (an out-of-range iloc slice silently returns an empty dataframe,
#  so check explicitly that exactly one instance was selected
#  instead of failing later with an unrelated indexing error)
instance_idx = 1700
X_small = {mename: X[mename].iloc[instance_idx:instance_idx+1] for mename in menames}
for mename in menames:
    if len(X_small[mename]) != 1:
        msg = f'Instance index {instance_idx} is out of range for {mename}'
        msg += f' ({len(X[mename])} instances available).'
        raise IndexError(msg)

# index (in menames) of the monitoring element to print and plot
meidx = 2
print(X_small[menames[meidx]])

# convert from dataframe to np arrays
# (only the first of the three values returned by get_mes is kept)
small_column_kwargs = dict(
    xbinscolumn='x_bin', ybinscolumn='y_bin',
    runcolumn='run_number', lumicolumn='ls_number'
)
X_small_data = {}
for mename in menames:
    mes, _, _ = get_mes(X_small[mename], **small_column_kwargs)
    X_small_data[mename] = mes

# run the PixelNMF steps one by one on the selected instance
# (presumably the same chain that pnmf.predict runs internally -- to be confirmed)
# preprocessing of the input arrays, followed by the reconstruction
mes_preprocessed = pnmf.preprocess(X_small_data)
mes_reco = pnmf.infer(mes_preprocessed)
# loss between preprocessed input and reconstruction (keyed by monitoring element),
# once without and once with thresholding
losses = pnmf.loss(mes_preprocessed, mes_reco, do_thresholding=False)
losses_binary = pnmf.loss(mes_preprocessed, mes_reco, do_thresholding=True)
# combination of the thresholded losses,
# once with masking and thresholding disabled and once with both enabled
losses_combined = pnmf.combine(losses_binary, do_masking=False, do_thresholding=False)
losses_combined_binary = pnmf.combine(losses_binary, do_masking=True, do_thresholding=True)

# plot the intermediate outputs (first instance only):
# input, reconstruction and losses for the selected monitoring element,
# followed by the combined loss before masking, the loss mask,
# and the combined loss after masking.

# styling shared by all plots below
common_style = dict(
    titlesize=15,
    xaxtitlesize=15, yaxtitlesize=15,
    ticklabelsize=12, colorticklabelsize=12,
    docolorbar=True,
    caxtitlesize=15, caxtitleoffset=30
)

# key of the monitoring element to plot
me_key = menames[meidx]

plot_cluster_occupancy(mes_preprocessed[me_key][0],
                   title='Input',
                   caxtitle='Number of clusters\n(normalized)',
                   caxrange=(1e-6,2),
                   **common_style)
plot_cluster_occupancy(mes_reco[me_key][0],
                   title='Reconstructed',
                   caxtitle='Number of clusters\n(normalized)',
                   caxrange=(1e-6,2),
                   **common_style)
plot_cluster_occupancy(losses[me_key][0],
                   title='Loss',
                   caxtitle='Loss',
                   caxrange=(0, 0.1),
                   **common_style)
plot_cluster_occupancy(losses_binary[me_key][0],
                   title='Loss (binary)',
                   caxtitle='Loss',
                   caxrange=(0, 1),
                   **common_style)
plot_cluster_occupancy(losses_combined[0],
                   title='Combined loss (before masking)',
                   caxtitle='Loss',
                   caxrange=(0, 4),
                   **common_style)
plot_cluster_occupancy(pnmf.loss_mask[0],
                   title='Combined loss mask',
                   caxtitle='Value',
                   caxrange=(0, 4),
                   **common_style)
plot_cluster_occupancy(losses_combined_binary[0],
                   title='Combined loss (after masking)',
                   caxtitle='Loss',
                   caxrange=(0, 1),
                   **common_style)

In [None]:
# store the PixelNMF instance to a joblib file
# (relative path, so resolved against the current working directory)

joblib_file = 'mlserver-model/pixelnmf.joblib'
joblib.dump(pnmf, joblib_file)