In [None]:
# imports

import importlib
import json
import os
import sys

import joblib
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

thisdir = os.getcwd()
topdir = os.path.abspath(os.path.join(thisdir, '../../../'))
sys.path.append(topdir)

import tools.iotools as iotools
import tools.dftools as dftools
import tools.patternfiltering as patternfiltering
import tools.rebinning as rebinning

from studies.clusters_2024.preprocessing.preprocessor import make_default_preprocessor
from studies.clusters_2024.preprocessing.preprocessor import PreProcessor
from studies.clusters_2024.preprocessing.preprocessor import get_metype
from studies.clusters_2024.plotting.plot_cluster_occupancy import plot_cluster_occupancy
from studies.clusters_2024.nmf.modeldefs.nmf2d import NMF2D
from studies.clusters_2024.nmf.nmf_training import find_files
from studies.clusters_2024.nmf.nmf_testing_pattern import run_evaluation

In [None]:
# locate the input files for every layer

layers = ['BPix1', 'BPix2', 'BPix3', 'BPix4']
input_files = {}
for layer in layers:
    # find_files is called with the last character of the layer name
    input_files[layer] = find_files(layer[-1])

In [None]:
# optional: print out available runs for a given era


In [None]:
# settings for training and evaluation

# switch for the training step
# NOTE(review): not read by any of the visible cells - confirm it is still needed
do_training = True
# run numbers used for training (assigned again in the cell that defines the training runs)
training_runs = [382752]
# switch for the evaluation step
# NOTE(review): not read by any of the visible cells - confirm it is still needed
do_evaluation = True

In [None]:
# choose the era and the runs to use in training

era = 'F-v1'
reference_file = input_files[layers[0]][era]

# list the runs that have rows with more than 0.5e6 entries in the first layer
dftemp = iotools.read_parquet(reference_file, columns=['run_number', 'entries'])
has_enough_entries = dftemp['entries'] > 0.5e6
dftemp = dftemp[has_enough_entries]
available_runs = np.unique(dftemp['run_number'].values)
#print('Available runs:')
#print(available_runs)

# application run: 382769
training_runs = [382752]
print('Chosen training runs:')
print(training_runs)

# check: all chosen runs must be among the available ones
missing_runs = [run for run in training_runs if run not in available_runs]
if len(missing_runs) > 0:
    raise Exception(f'Run {missing_runs[0]} not in available runs.')

# count the lumisections of the chosen runs
df_temp = iotools.read_runs(reference_file, training_runs, columns=['run_number'])
n_lumisections = len(df_temp)
print(f'Found {n_lumisections} lumisections for training (before any filters).')

In [None]:
# build one preprocessor per layer for the chosen era

global_normalization = 'avg'
local_normalization = 'avg_previous_era'

# drop a '-part...' suffix from the era name, if there is one
preprocessor_era = era.split('-part')[0]

preprocessors = {
    layer: make_default_preprocessor(
        preprocessor_era, layer,
        global_normalization = global_normalization,
        local_normalization = local_normalization)
    for layer in layers
}

In [None]:
# read the training runs for every layer

dfs_training = {}
for layer in layers:
    print(f'Loading training data for layer {layer}...')
    training_file = input_files[layer][era]
    dfs_training[layer] = iotools.read_runs(training_file, training_runs, mode='batched', verbose=True)

# the instance count is taken from the first layer
first_layer = layers[0]
ndf = len(dfs_training[first_layer])
print(f'Found {ndf} instances.')

In [None]:
# do training
# (one NMF model per layer, fitted on random batches of preprocessed training data;
#  the fitted models are stored in nmfs, keyed by layer name)

nmfs = {}
batch_size = 300
nbatches = 10

do_plot_components = True

# minimum number of entries (per layer) for an instance to be used in training
min_entries_filter = {
    'BPix1': 0.5e6,
    'BPix2': 0.5e6/2,
    'BPix3': 0.5e6/3,
    'BPix4': 0.5e6/4
}

# loop over layers
for layer in layers:
    print(f'Now running on layer {layer}...')
    print(f'Will train on {nbatches} batches of size {batch_size}.')
    
    # make the NMF model for this layer
    nmf = NMF2D(n_components=5, forget_factor=1, batch_size=batch_size, verbose=True,
                tol=0.0, max_no_improvement=100, max_iter=1000,
                alpha_H=0.1)
    
    # load the data
    df = dfs_training[layer]
    # count the instances of this layer before filtering
    # (do not rely on the global ndf, which refers to the first layer only
    #  and is overwritten when the testing data is loaded)
    n_loaded = len(df)
    
    # filtering
    df = df[df['entries'] > min_entries_filter[layer]]
    print(f'  Found {len(df)} / {n_loaded} instances passing filters.')
    # note: a layer without any instance passing the filters gets no entry in nmfs
    if len(df)==0: continue
        
    # preprocessing
    mes_preprocessed = preprocessors[layer].preprocess(df)
        
    # experimental: set zero-occupancy to 1 (average expected value after preprocessing)
    mes_preprocessed[mes_preprocessed==0] = 1
    
    # sampling without replacement cannot draw more instances than available,
    # so the batch size is capped at the number of instances
    n_instances = len(mes_preprocessed)
    this_batch_size = min(batch_size, n_instances)
    if this_batch_size < batch_size:
        print(f'  WARNING: only {n_instances} instances available, using batches of size {this_batch_size}.')
    
    # loop over random batches
    for batchidx in range(nbatches):
        print(f'Now processing batch {batchidx+1} / {nbatches}...')

        # make random indices
        random_ids = np.random.choice(n_instances, size=this_batch_size, replace=False)
        batch = mes_preprocessed[random_ids, :, :]

        # fit NMF
        nmf.fit(batch)
        
    nmfs[layer] = nmf
        
    # plot components
    if do_plot_components:
        C = nmf.components
        for idx in range(len(C)):
            fig, ax = plot_cluster_occupancy(C[idx],
                   title=f'Component {idx+1}', titlesize=15,
                   xaxtitlesize=15, yaxtitlesize=15,
                   ticklabelsize=12, colorticklabelsize=12,
                   docolorbar=True, caxtitle='Number of clusters\n(normalized)',
                   caxrange=(1e-6, 2),
                   caxtitlesize=15, caxtitleoffset=35)
        plt.show()
        # close all component figures of this layer, not only the most recent one
        plt.close('all')

In [None]:
# save the models
# (one pickle file per trained layer, written to a local output directory)

dosave = False

if dosave:
    outputdir = 'models_local'
    # exist_ok makes a separate existence check unnecessary
    os.makedirs(outputdir, exist_ok=True)

    # loop over the trained models rather than over all layers:
    # layers without any instance passing the training filters have no entry in nmfs
    for layer, nmf_model in nmfs.items():
        outputfile = os.path.join(outputdir, f'nmf_model_{layer}.pkl')
        joblib.dump(nmf_model, outputfile)

In [None]:
# define runs/lumisections for testing

# for now, just pick one or multiple runs, later implement selection of specific LS range

# run numbers that are read when loading the testing data
testing_runs = [382769]

In [None]:
# filter settings for the evaluation

# min entries filter
# (0.5e6 for the first layer, divided by the layer number for the other layers)
min_entries_filter = {f'BPix{layer_number}': 0.5e6/layer_number for layer_number in (1, 2, 3, 4)}

# OMS attribute filters: read the file for this era and keep only the listed keys
oms_filter_file = f'../omsdata/omsdata_Run2024{era}.json'
with open(oms_filter_file, 'r') as f:
    oms_data = json.load(f)
filter_keys = [
    'run_number',
    'lumisection_number',
    "beams_stable",
    "cms_active",
    "bpix_ready",
    "fpix_ready",
    "tibtid_ready",
    "tob_ready",
    "tecp_ready",
    "tecm_ready"
]
oms_filters = {}
for key, val in oms_data.items():
    if key not in filter_keys: continue
    oms_filters[key] = val

In [None]:
# settings for automasking

# switch for the automasking branches of the plotting cell
# (the call to run_evaluation passes do_automasking = False explicitly instead of this variable)
do_automasking = False

In [None]:
# settings for loss masking
# (per-layer boolean masks derived from stored zero-fraction maps)

do_loss_masking = True
zero_frac_threshold = 0.9

# always define the containers: the evaluation cell reads loss_masks[era]
# and loss_mask_preprocessors regardless of do_loss_masking,
# which would otherwise fail with a NameError when loss masking is switched off
loss_masks = {era: {}}
loss_mask_preprocessors = {}

if do_loss_masking:
    # the zero-fraction files are named after the era without a '-part...' suffix
    loss_mask_era = era
    if '-part' in era: loss_mask_era = era.split('-part')[0]
    for layer in layers:
        zerofrac_file = f'../preprocessing/normdata/zerofrac_Run2024{loss_mask_era}_{get_metype(layer)}.npy'
        zerofrac = np.load(zerofrac_file)
        # the mask is True where the zero-fraction is below the threshold
        loss_mask = (zerofrac < zero_frac_threshold)
        loss_masks[era][layer] = loss_mask
        # NOTE(review): the file name above uses get_metype(layer), while the name
        # passed here is built as 'PXLayer_' + layer - confirm this is intended
        loss_mask_preprocessors[layer] = PreProcessor(f'PXLayer_{layer}')

In [None]:
# remaining evaluation settings

# general: threshold on the loss
loss_threshold = 0.1

# flagging: patterns of ones with the given shapes, and a threshold
flagging_threshold = 1e-3
flagging_patterns = [np.ones(pattern_shape) for pattern_shape in [(1,8), (2,4)]]

# cleaning: the same pattern shape is used for every layer
do_per_layer_cleaning = True
cleaning_threshold = 1.5
cleaning_patterns = {
    layer_name: [np.ones((2,16))] # two neighbouring modules
    for layer_name in ('BPix1', 'BPix2', 'BPix3', 'BPix4')
}

In [None]:
# read the testing runs for every layer

dfs_testing = {}
for layer in layers:
    print(f'Loading testing data for layer {layer}...')
    testing_file = input_files[layer][era]
    dfs_testing[layer] = iotools.read_runs(testing_file, testing_runs, mode='batched', verbose=True)

# the instance count is taken from the first layer
first_layer = layers[0]
ndf = len(dfs_testing[first_layer])
print(f'Found {ndf} instances.')

In [None]:
# run the evaluation on the testing data

# collect all settings first, then pass them on as keyword arguments
evaluation_settings = dict(
    preprocessors = preprocessors,
    min_entries_filter = min_entries_filter,
    oms_filters = oms_filters,
    loss_threshold = loss_threshold,
    flagging_patterns = flagging_patterns,
    do_per_layer_cleaning = do_per_layer_cleaning,
    cleaning_patterns = cleaning_patterns,
    cleaning_threshold = cleaning_threshold,
    # automasking is switched off explicitly here
    do_automasking = False,
    automask_reader = None,
    automask_map_preprocessors = None,
    do_loss_masking = do_loss_masking,
    loss_masks = loss_masks[era],
    loss_mask_preprocessors = loss_mask_preprocessors,
)
output = run_evaluation(dfs_testing, nmfs, **evaluation_settings)

# unpack the parts of the output that are used in the following cells
flagged_run_numbers = output['flagged_run_numbers']
flagged_ls_numbers = output['flagged_ls_numbers']
filter_results = output['filter_results']

In [None]:
# make a plot of the filter results
# (bar chart of the number of lumisections failing each filter, plus the total)

# make a table
# every (run, lumisection) pair is encoded as the single number run*10000 + ls,
# so that a lumisection failing several filters is counted only once in the total
# NOTE(review): this encoding assumes lumisection numbers below 10000 - confirm
filter_results_arrays = {key: np.array([el[0]*10000+el[1] for el in val]) for key, val in filter_results.items()}
encoded_arrays = list(filter_results_arrays.values())
# np.concatenate raises a ValueError on an empty sequence (i.e. when there are no filters)
failed_ls = np.unique(np.concatenate(encoded_arrays)) if len(encoded_arrays) > 0 else np.array([])
nfiltered = {key: len(val) for key, val in filter_results_arrays.items()}
nfiltered['total'] = len(failed_ls)
print(f'Found {nfiltered["total"]} lumisections in total that did not pass the filters.')

# make a figure
fig, ax = plt.subplots(figsize=(8, 4))
# pass plain lists rather than dict views to the plotting call
ax.bar(list(nfiltered.keys()), list(nfiltered.values()))
# fix the tick positions before restyling the tick labels
ax.set_xticks(ax.get_xticks())
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', fontsize=15)
ax.set_ylabel('Number of failing LS', fontsize=15)
ax.grid(which='both', axis='y', color='gray', linestyle='dashed')
ax.text(0, 1.03, 'Lumisection preselection', transform=ax.transAxes, fontsize=15)
plt.show()

In [None]:
# list the flagged lumisections

n_flagged = len(flagged_run_numbers)
print(f'Found {n_flagged} flagged lumisections:')
flagged_pairs = zip(flagged_run_numbers, flagged_ls_numbers)
for flagged_run, flagged_ls in flagged_pairs:
    print(f'  - Run {flagged_run}, LS {flagged_ls}')

In [None]:
# choose the lumisections to plot (random or not random)

# general settings
do_extended_loss_plots = True
do_combined_loss_plot = True

# random lumisections: draw at most nplot distinct flagged lumisections
nplot = 3
#random_ids = np.random.choice(len(available_run_numbers), size=min(nplot, len(available_run_numbers)), replace=False)
#selected_run_numbers = available_run_numbers[random_ids]
#selected_ls_numbers = available_ls_numbers[random_ids]
n_flagged = len(flagged_run_numbers)
n_selected = min(nplot, n_flagged)
random_ids = np.random.choice(n_flagged, size=n_selected, replace=False)
selected_run_numbers = flagged_run_numbers[random_ids]
selected_ls_numbers = flagged_ls_numbers[random_ids]
selected_runlumis = list(zip(selected_run_numbers, selected_ls_numbers))

# alternative: specific selected lumisections
#selected_runlumis = [(385443, 1566), (385443, 1578), (385443, 1579), (385443, 1592)]
#selected_run_numbers = [el[0] for el in selected_runlumis]
#selected_ls_numbers = [el[1] for el in selected_runlumis]

# for the selected lumisections: reconstruct with the NMF models, compute the loss maps,
# post-process them, and plot every step per layer plus the combined binary loss map
if len(selected_run_numbers) > 0:
    
    # keyword arguments shared by all occupancy plots below
    plot_style = dict(titlesize=15, xaxtitlesize=15, yaxtitlesize=15,
                      ticklabelsize=12, colorticklabelsize=12,
                      docolorbar=True, caxtitlesize=15)
    
    # select the chosen lumisections and extract the monitoring elements
    # (runs and lumis are taken from the last layer processed in this loop)
    print('Loading data...')
    dfs = {}
    mes = {}
    for layer in layers:
        dfs[layer] = dftools.select_runsls(dfs_testing[layer], selected_runlumis, runcolumn='run_number', lumicolumn='ls_number')
        mes[layer], runs, lumis = dftools.get_mes(dfs[layer], xbinscolumn='x_bin', ybinscolumn='y_bin', runcolumn='run_number', lumicolumn='ls_number')
    
    # preprocess and predict
    print('Processing...')
    mes_preprocessed = {}
    mes_pred = {}
    losses = {}
    losses_binary = {}
    for layer in layers:
        mes_preprocessed[layer] = preprocessors[layer].preprocess(dfs[layer])
        mes_pred[layer] = nmfs[layer].predict(mes_preprocessed[layer])
        # loss = squared difference between input and reconstruction
        losses[layer] = np.square(mes_preprocessed[layer] - mes_pred[layer])
        # binary loss = 1 where the loss exceeds the threshold, 0 elsewhere
        losses_binary[layer] = (losses[layer] > loss_threshold).astype(int)
    
    # automasking
    # NOTE(review): automask_reader and automask_map_preprocessors are not defined
    # in the visible cells; they must be created before switching do_automasking on
    if do_automasking:
        print('Applying automasks...')
        for layer in layers:
            # the layer names already carry the 'BPix' prefix
            subsystem = layer
            automask_maps = automask_reader.get_automask_maps_for_ls(selected_run_numbers, selected_ls_numbers, subsystem, invert=True)
            automask_maps = automask_map_preprocessors[layer].preprocess_mes(automask_maps, None, None)
            losses[layer] = np.multiply(losses[layer], automask_maps)
            losses_binary[layer] = np.multiply(losses_binary[layer], automask_maps)
            
    # manual masking
    # update: now applied after combining layers instead of per-layer,
    # in order to be able to find cases where one layer is masked but another is not.
            
    # cleaning
    if do_per_layer_cleaning:
        print('Cleaning loss maps')
        losses_binary_cleaned = {}
        for layer in layers:
            losses_binary_cleaned[layer] = patternfiltering.filter_any_pattern(losses_binary[layer], cleaning_patterns[layer], threshold=cleaning_threshold)
    else:
        # without cleaning, the following steps use the uncleaned binary loss maps
        losses_binary_cleaned = dict(losses_binary)
    
    # make rebinned and overlayed binary loss map
    # (all layers are rebinned to the shape of the first layer's loss map and summed)
    target_shape = losses[layers[0]].shape[1:3]
    losses_binary_rebinned = {}
    losses_binary_combined = np.zeros(losses[layers[0]].shape)
    for layer in layers:
        source = losses_binary_cleaned[layer]
        losses_binary_rebinned[layer] = rebinning.rebin_keep_clip(source, target_shape, 1, mode='cv2')
        losses_binary_combined += losses_binary_rebinned[layer]
        
    # optional: do loss masking
    loss_mask = np.zeros(losses_binary_combined.shape)
    if do_loss_masking:
        print('Applying loss mask...')
        loss_mask = np.zeros((1, target_shape[0], target_shape[1]))
        for layer in layers:
            this_loss_mask = loss_masks[era][layer]
            # preprocess
            this_loss_mask = np.expand_dims(this_loss_mask, 0)
            this_loss_mask = loss_mask_preprocessors[layer].preprocess_mes(this_loss_mask, None, None)
            # invert
            this_loss_mask = 1 - this_loss_mask
            # rescale
            this_loss_mask = rebinning.rebin_keep_clip(this_loss_mask, target_shape, 1, mode='cv2')
            # add to total
            loss_mask += this_loss_mask
        # repeat the mask once per selected lumisection
        loss_mask = np.repeat(loss_mask, len(losses_binary_combined), axis=0)
        
    # apply threshold on combined binary loss:
    # keep bins where the summed binary loss is at least 2 and exceeds the summed inverted mask
    # (presumably the number of layers in which the bin is masked - confirm)
    losses_binary_combined = ((losses_binary_combined >= 2) & (losses_binary_combined > loss_mask)).astype(int)
        
    # make the plots
    print('Plotting...')
    for idx in range(len(selected_run_numbers)):
        run = runs[idx]
        lumi = lumis[idx]
        for layer in layers:
            me_orig = mes[layer][idx, :, :]
            me_preprocessed = mes_preprocessed[layer][idx, :, :]
            me_pred = mes_pred[layer][idx, :, :]
            loss = losses[layer][idx, :, :]
            loss_binary = losses_binary[layer][idx, :, :]
            loss_binary_cleaned = losses_binary_cleaned[layer][idx, :, :]
            loss_binary_rebinned = losses_binary_rebinned[layer][idx, :, :]
    
            # initialize figure
            nrows = 1
            figheight = 6
            if do_extended_loss_plots:
                nrows = 2
                figheight = 12
            fig, axs = plt.subplots(ncols=4, nrows=nrows, figsize=(24, figheight), squeeze=False)
            
            # plot raw data
            fig, axs[0, 0] = plot_cluster_occupancy(me_orig, fig=fig, ax=axs[0, 0],
                   title='Raw', caxtitle='Number of clusters',
                   caxtitleoffset=15, **plot_style)
        
            # overlay automask
            if do_automasking:
                # the layer names already carry the 'BPix' prefix
                subsystem = layer
                automask_map = automask_reader.get_automask_map_for_ls(run, lumi, subsystem)
                ids = np.nonzero(automask_map.astype(int))
                # thinner outlines for layers 3 and 4
                # (the layer number is the last character of the layer name)
                linewidth = 1 if int(layer[-1])>=3 else 2
                for yidx, xidx in zip(ids[0], ids[1]):
                    patch = mpl.patches.Rectangle((xidx-0.5, yidx-0.5), 1, 1,
                                      edgecolor='red', linewidth=linewidth,
                                      facecolor='none')
                    axs[0, 0].add_patch(patch)
        
            # plot preprocessed, reconstructed and loss
            fig, axs[0, 1] = plot_cluster_occupancy(me_preprocessed, fig=fig, ax=axs[0, 1],
                   title='Input', caxtitle='Number of clusters\n(normalized)',
                   caxrange=(1e-6,2), caxtitleoffset=30, **plot_style)
            fig, axs[0, 2] = plot_cluster_occupancy(me_pred, fig=fig, ax=axs[0, 2],
                   title='Reconstructed', caxtitle='Number of clusters\n(normalized)',
                   caxrange=(1e-6,2), caxtitleoffset=30, **plot_style)
            fig, axs[0, 3] = plot_cluster_occupancy(loss, fig=fig, ax=axs[0, 3],
                   title='Loss', caxtitle='Loss',
                   caxrange=(0, 0.1), caxtitleoffset=30, **plot_style)
            
            # optional: plot more post-processing steps with the loss map
            if do_extended_loss_plots:
                # make clear in the title when the cleaning step was skipped
                cleaned_title = 'Cleaned loss' if do_per_layer_cleaning else 'Binary loss (no cleaning)'
                binary_panels = [
                    ('Binary loss', loss_binary),
                    (cleaned_title, loss_binary_cleaned),
                    ('Rebinned loss', loss_binary_rebinned)
                ]
                for colidx, (panel_title, panel_map) in enumerate(binary_panels):
                    fig, axs[1, colidx] = plot_cluster_occupancy(panel_map, fig=fig, ax=axs[1, colidx],
                       title=panel_title, caxtitle='Loss',
                       caxrange=(0, 1), caxtitleoffset=15, **plot_style)
                # the last panel of the second row is not used
                fig.delaxes(axs[1, 3])
            
            # plot aesthetics
            plt.subplots_adjust(wspace=0.55)
            if str(layer)=='BPix1': plt.subplots_adjust(hspace=-0.65)
            if str(layer)=='BPix2': plt.subplots_adjust(hspace=-0.35)
            title = f'Run {run}, LS {lumi}, layer {layer}'
            axs[0, 0].text(0.01, 1.3, title, fontsize=15, transform=axs[0, 0].transAxes)
            plt.show()
            plt.close()
            
        # plot the combined loss map
        if do_combined_loss_plot:
            loss_binary_combined = losses_binary_combined[idx, :, :]
            fig, ax = plt.subplots()
            fig, ax = plot_cluster_occupancy(loss_binary_combined, fig=fig, ax=ax,
                   title='Combined binary loss', caxtitle='Loss',
                   caxrange=(0, 1), caxtitleoffset=15, **plot_style)
            title = f'Run {run}, LS {lumi}'
            ax.text(0.01, 1.3, title, fontsize=15, transform=ax.transAxes)
            plt.show()
            plt.close()