In [None]:
# imports

import os
import sys
import json
import importlib
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

thisdir = os.getcwd()
topdir = os.path.abspath(os.path.join(thisdir, '../../../'))
sys.path.append(topdir)

import tools.iotools as iotools
import tools.dftools as dftools
import plotting.plottools as plottools

In [None]:
# set path to files

# occupancy is loaded from DQMIO files (one parquet file per era and per ME)

# MEs to process: short label -> full ME name as it appears in the file names
mes = {
    'PXLayer_1': 'PixelPhase1-Phase1_MechanicalView-PXBarrel-clusters_per_SignedModuleCoord_per_SignedLadderCoord_PXLayer_1',
    'PXLayer_2': 'PixelPhase1-Phase1_MechanicalView-PXBarrel-clusters_per_SignedModuleCoord_per_SignedLadderCoord_PXLayer_2',
    #'PXLayer_3': 'PixelPhase1-Phase1_MechanicalView-PXBarrel-clusters_per_SignedModuleCoord_per_SignedLadderCoord_PXLayer_3',
    #'PXLayer_4': 'PixelPhase1-Phase1_MechanicalView-PXBarrel-clusters_per_SignedModuleCoord_per_SignedLadderCoord_PXLayer_4',
}

# eras to process, in the format '<main era>-<version>'
eras = [
  #'Run2024A-v1', # only commissioning, no lumisections with physics flag set to True
  'Run2024B-v1',
  'Run2024C-v1',
  'Run2024D-v1',
  'Run2024E-v1',
  'Run2024E-v2',
  'Run2024F-v1',
  'Run2024G-v1',
  'Run2024H-v1',
  'Run2024I-v1',
  'Run2024I-v2',
  #'Run2024J-v1'  # pp reference run for heavy ion run; lower pileup and occupancy
]

datadir = '/eos/user/l/llambrec/dialstools-output'
year = '2024'
dataset = 'ZeroBias'
reco = 'PromptReco'

# build the expected file path for every (era, ME) combination;
# the result is a nested dict: files[era][melabel] -> path
files = {}
for era in eras:
    # the era name is split once per era (it does not depend on the ME)
    mainera, version = era.split('-')
    files[era] = {}
    for melabel, mename in mes.items():
        f = f'{dataset}-{mainera}-{reco}-{version}-DQMIO-{mename}.parquet'
        files[era][melabel] = os.path.join(datadir, f)

# existence check: fail early (before any heavy processing) if a file is missing
missing = [f for era in eras for f in files[era].values() if not os.path.exists(f)]
if len(missing) > 0:
    raise FileNotFoundError(f'The following files do not exist: {missing}')

# note: len(files) would only count the eras (outer keys),
#       so the per-era dicts are summed to get the actual number of files.
nfiles = sum(len(erafiles) for erafiles in files.values())
print(f'Found {nfiles} files.')

**Part 1: calculate norm**

In [None]:
# calculate average per era and per ME
#
# For each ME and era, the input file is read in batches. Per bin, the values
# exceeding `threshold` are summed and counted over all lumisections;
# the normalized ratio of both is plotted and/or stored as a .npy file.

outputdir = 'normdata'
os.makedirs(outputdir, exist_ok=True)

doplot = True
dosave = True
# set to True to additionally plot the intermediate normalized sum and per-bin counts
doplot_intermediate = False

# number of rows read from the file per batch (keeps memory usage bounded)
batch_size = 5000
# only per-bin values above this threshold enter the sum and the count
threshold = 1000
# normalized values below this are set to zero
min_norm_value = 0.05


def _plot_map(hist, title, caxrange):
    """Show one 2D map using the plot settings shared by all figures in this cell.

    Args:
        hist: 2D array passed on to plottools.plot_hist_2d.
        title: figure title.
        caxrange: (min, max) tuple for the color axis range.
    """
    fig, ax = plottools.plot_hist_2d(hist, figsize=(12,6), title=title, titlesize=15,
        xaxtitle=None, xaxtitlesize=None, yaxtitle=None, yaxtitlesize=None,
        ticklabelsize=12, colorticklabelsize=12, extent=None, aspect=None,
        docolorbar=True, caxtitle='Number of clusters', caxrange=caxrange,
        caxtitlesize=15, caxtitleoffset=15,
        origin='lower')
    plt.show()
    plt.close()


for melabel, mename in mes.items():
    for era in eras:
        print(f'Now running on era {era}, ME {melabel}...')
        f = files[era][melabel]

        # read number of instances in this era
        # (only a single column is read, to avoid loading the full file at once)
        dummy = iotools.read_parquet(f, columns=['run_number'])
        nlumis = len(dummy)
        del dummy
        print(f'Found {nlumis} lumisections in this era.')

        # split in batches
        # (integer ceiling division; gives 0 batches for an empty file)
        num_batches = (nlumis - 1) // batch_size + 1
        batch_sums = []
        batch_counts = []
        for batchidx in range(num_batches):
            print(f'  - Extracting batch {batchidx+1} / {num_batches}...')

            # get batch
            df = iotools.read_parquet(f, batch_size=batch_size, first_batch=batchidx, last_batch=batchidx)

            # do filtering
            df = df[df['entries'] > 0]
            print(f'    Found {len(df)} entries in batch passing filters.')
            if len(df)==0: continue

            me_array, _, _ = dftools.get_mes(df, xbinscolumn='x_bin', ybinscolumn='y_bin', runcolumn='run_number', lumicolumn='ls_number')
            print(f'    Found array of shape {me_array.shape}')

            # per-bin sum and count over the first axis,
            # restricted to values above the threshold
            above_threshold = (me_array > threshold)
            batch_sums.append(np.sum(me_array, axis=0, where=above_threshold))
            batch_counts.append(np.count_nonzero(above_threshold, axis=0))
            # historical notes (kept from earlier iterations of this notebook):
            # note: not sure anymore what the threshold of 1000 was intended for,
            #       but in any case it seems unsuitable for layer 2, 3 and 4 (maybe ok for layer 1?).
            #       now rerunning for layer 3 and 4 without this threshold, but not yet for layer 1 and 2,
            #       as I already have results for those and I first want to just add layer 3 and 4.
            #       maybe later also rerun for layer 1 and 2.
            # note: looking back at it, probably the threshold should have been applied to both the sum and the count
            #       (to avoid being impacted by transient zero- or low-occupancy bins),
            #       but it seems like it was only applied to the count, resulting in a distorted mean distribution.
            #       (the code above applies it to both the sum and the count.)

            # explicitly delete some variables for memory saving
            del df
            del me_array
            del above_threshold

        # guard: without any selected batch there is nothing to sum
        # (np.sum over an empty list would give a scalar and break the steps below)
        if len(batch_sums)==0:
            print(f'WARNING: no lumisections passing filters for era {era}, ME {melabel}; skipping.')
            continue

        # make total normalized sum over batches
        mesum = np.sum(np.array(batch_sums), axis=0)
        norm = np.mean(mesum)
        # guard: if no bin ever exceeded the threshold, the normalization
        # would be 0/0 and an all-NaN array would be stored
        if norm==0:
            print(f'WARNING: no bins above threshold for era {era}, ME {melabel}; skipping.')
            continue
        mesum = mesum / norm
        # set small but nonzero values to zero for stability
        mesum[mesum < min_norm_value] = 0

        # calculate total per-bin counts of values above threshold
        # (zero counts are replaced by one to avoid division by zero;
        #  the corresponding sums are zero as well)
        mecounts = np.sum(np.array(batch_counts), axis=0)
        mecounts = np.where(mecounts==0, 1, mecounts)

        # calculate normalized per-bin averaged value
        avgme = np.divide(mesum, mecounts)
        avgme = avgme / np.mean(avgme)
        # set small values to zero (so they will be automatically masked)
        avgme[avgme < min_norm_value] = 0

        # plot result
        if doplot:
            shortname = mename.split('-')[-1]
            if doplot_intermediate:
                _plot_map(mesum, shortname + '\n' + f'Era {era} normalized sum', (1e-6, 2))
                _plot_map(mecounts, shortname + '\n' + f'Era {era} nonzero counts', (1e-6, 10))
            _plot_map(avgme, shortname + '\n' + f'Era {era} normalized mean', (1e-6, 2))

        # save array
        if dosave:
            outputfile = f'avgme_{era}_{melabel}.npy'
            outputfile = os.path.join(outputdir, outputfile)
            np.save(outputfile, avgme)

**Part 2: plot result**

In [None]:
# Re-load the per-era, per-ME arrays stored in the output directory
# and display each of them as a 2D map.
outputdir = 'normdata'

# plot settings shared by all figures in this cell
plot_settings = dict(
    figsize=(12,6), titlesize=15,
    xaxtitle=None, xaxtitlesize=None, yaxtitle=None, yaxtitlesize=None,
    ticklabelsize=12, colorticklabelsize=12, extent=None, aspect=None,
    docolorbar=True, caxtitle='Number of clusters', caxrange=(1e-6, 2),
    caxtitlesize=15, caxtitleoffset=15,
    origin='lower',
)

for melabel, mename in mes.items():
    # the last dash-separated part of the ME name is used in the title
    shortname = mename.split('-')[-1]
    for era in eras:

        # read the stored array for this era and ME
        arrayfile = os.path.join(outputdir, f'avgme_{era}_{melabel}.npy')
        avgme = np.load(arrayfile)

        # draw and display the map, then release the figure
        fig, ax = plottools.plot_hist_2d(avgme, title=f'{shortname}\nEra {era} mean', **plot_settings)
        plt.show()
        plt.close()

**Part 3: some more advanced plotting**

In [None]:
# For each ME: reduce the stored per-era arrays to two 1D profiles
# (median over the 'far' rows and over the 'close' rows of the first array axis)
# and compare these profiles between eras in one figure per ME.
outputdir = 'normdata'

# res[era]['far' | 'close'] -> 1D profile; refilled for every ME
res = {}

for melabel, mename in mes.items():
    for era in eras:
        res[era] = {}

        # load array
        f = f'avgme_{era}_{melabel}.npy'
        f = os.path.join(outputdir, f)
        avgme = np.load(f)

        # average over y-axis
        # Rows (first array axis) are selected in pairs (i, i+1), alternating
        # between 'far' and 'close'. The two rows at index 2*nladders and
        # 2*nladders+1 belong to neither group.
        # NOTE(review): presumably 'far'/'close' refers to the two alternating
        #               ladder positions in the barrel layer - confirm.
        nrows = avgme.shape[0]
        nladders = (nrows - 2) // 4
        ids_far = np.concatenate((np.arange(start=0, stop=2*nladders, step=4), np.arange(start=2*nladders+2, stop=4*nladders+1, step=4)))
        ids_far = np.sort(np.concatenate((ids_far, ids_far+1)))
        mask_far = np.zeros(nrows, dtype=bool)
        mask_far[ids_far] = True
        ids_close = np.concatenate((np.arange(start=2, stop=2*nladders, step=4), np.arange(start=2*nladders+4, stop=4*nladders+1, step=4)))
        ids_close = np.sort(np.concatenate((ids_close, ids_close+1)))
        mask_close = np.zeros(nrows, dtype=bool)
        mask_close[ids_close] = True
        # approach 1, with mean
        #avgme_far = np.mean(avgme[mask_far, :], axis=0)
        #avgme_close = np.mean(avgme[mask_close, :], axis=0)
        # approach 2, with median
        avgme_far = np.quantile(avgme[mask_far, :], 0.5, axis=0)
        avgme_close = np.quantile(avgme[mask_close, :], 0.5, axis=0)
        res[era]['far'] = avgme_far
        res[era]['close'] = avgme_close

    # make a plot (one figure per ME):
    # left: far (blue) and close (red) for all eras,
    # middle / right: far / close separately, with one color per era
    fig, axs = plt.subplots(ncols=3, figsize=(15, 6))
    cmap = mpl.colormaps.get_cmap('jet')
    colors = [cmap(frac) for frac in np.linspace(0, 1, num=len(eras))]
    for idx, era in enumerate(eras):
        # only label the first pair of lines, to avoid one legend entry per era
        axs[0].plot(res[era]['far'], color='b', label='far' if idx==0 else '_nolegend_')
        axs[0].plot(res[era]['close'], color='r', label='close' if idx==0 else '_nolegend_')
        axs[1].plot(res[era]['far'], color=colors[idx], label=era)
        axs[2].plot(res[era]['close'], color=colors[idx], label=era)
    axs[1].set_ylim((0.7, 2.5))
    axs[2].set_ylim((0.7, 2.5))

    # identify the figure and its panels (one figure is produced per ME)
    fig.suptitle(mename.split('-')[-1], fontsize=15)
    axs[0].set_title('Far and close, all eras')
    axs[1].set_title('Far, per era')
    axs[2].set_title('Close, per era')
    for ax in axs:
        ax.set_xlabel('Bin index')
        ax.set_ylabel('Median of normalized mean')
        ax.legend(fontsize=8)
    plt.show()
    plt.close()