In [None]:
# imports

import os
import sys
import json
import importlib
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

thisdir = os.getcwd()
topdir = os.path.abspath(os.path.join(thisdir, '../../../'))
sys.path.append(topdir)

import tools.iotools as iotools
import tools.dftools as dftools
import plotting.plottools as plottools
from studies.ecal_occupancy_2024.plotting.plot_cluster_occupancy import plot_cluster_occupancy

In [None]:
# configuration of the input files

# monitoring elements (MEs) to process: short label -> full ME name as used in the file names
mes = ({
    'EB': 'EcalBarrel-EBOccupancyTask-EBOT digi occupancy',
    #'EE+': 'EcalEndcap-EEOccupancyTask-EEOT digi occupancy EE +',
    #'EE-': 'EcalEndcap-EEOccupancyTask-EEOT digi occupancy EE -'
})

# eras to process, each written as '<main era>-<version>'
eras = [
  #'Run2024A-v1', # only commissioning, no lumisections with physics flag set to True
  'Run2024B-v1',
  'Run2024C-v1',
  'Run2024D-v1',
  'Run2024E-v1',
  'Run2024E-v2',
  'Run2024F-v1',
  'Run2024G-v1',
  'Run2024H-v1',
  'Run2024I-v1',
  'Run2024I-v2',
  #'Run2024J-v1'  # pp reference run for heavy ion run; lower pileup and occupancy
]

# location and naming ingredients of the parquet files
datadir = '/eos/user/l/llambrec/dialstools-output-test'
year = '2024'
dataset = 'ZeroBias'
reco = 'PromptReco'

# build the nested lookup files[era][melabel] -> full path of the parquet file
files = {}
for era in eras:
    # the era string is split once per era into its main part and its version
    mainera, version = era.split('-')
    files[era] = {
        melabel: os.path.join(datadir, f'{dataset}-{mainera}-{reco}-{version}-DQMIO-{mename}.parquet')
        for melabel, mename in mes.items()
    }

# existence check: collect every expected input file that is not present on disk
missing = [
    files[era][melabel]
    for era in eras
    for melabel in mes.keys()
    if not os.path.exists(files[era][melabel])
]
if len(missing) > 0:
    # FileNotFoundError is a subclass of Exception, so generic handlers still catch it
    raise FileNotFoundError(f'The following files do not exist: {missing}')
else:
    # count the individual files (one per era and per ME);
    # len(files) alone would only count the eras
    nfiles = sum(len(era_files) for era_files in files.values())
    print(f'Found {nfiles} files.')

### Default approach: per era

In [None]:
# calculate the normalized average ME per era and per ME
#
# For every (ME, era) pair the input file is read in batches. Per batch, the bin
# values above a threshold are summed over axis 0 and the number of contributing
# entries per bin is counted. The per-batch results are then combined, normalized
# to unit mean, plotted (optional) and saved as a .npy file (optional).

outputdir = 'normdata'
# exist_ok=True replaces the separate existence check (no check-then-create race)
os.makedirs(outputdir, exist_ok=True)

doplot = True
dosave = True

# settings that are the same for every era and ME
batch_size = 5000   # batch size passed to iotools.read_parquet (presumably in rows - TODO confirm)
threshold = 50      # only bin values strictly above this enter the sum and the counts

for melabel, mename in mes.items():
    for era in eras:
        print(f'Now running on era {era}, ME {melabel}...')
        f = files[era][melabel]

        # read number of instances in this era
        # (only one light column is read, just to get the number of rows)
        dummy = iotools.read_parquet(f, columns=['run_number'])
        nlumis = len(dummy)
        print(f'Found {nlumis} lumisections in this era.')

        # split in batches (integer ceiling division)
        num_batches = (nlumis + batch_size - 1) // batch_size
        batch_sums = []
        batch_counts = []
        for batchidx in range(num_batches):
            print(f'  - Extracting batch {batchidx+1} / {num_batches}...')

            # get batch (first_batch == last_batch, so presumably exactly one batch is read)
            df = iotools.read_parquet(f, batch_size=batch_size, first_batch=batchidx, last_batch=batchidx)

            # do filtering: keep only rows with a positive 'entries' value
            df = df[df['entries'] > 0]
            print(f'    Found {len(df)} entries in batch passing filters.')
            if len(df)==0: continue

            # assumes axis 0 of me_array runs over the selected rows (lumisections) - TODO confirm
            me_array, _, _ = dftools.get_mes(df, xbinscolumn='x_bin', ybinscolumn='y_bin', runcolumn='run_number', lumicolumn='ls_number')
            print(f'    Found array of shape {me_array.shape}')

            # per-bin sum and per-bin number of contributions above threshold;
            # the mask is computed once and shared by both reductions
            above_threshold = (me_array > threshold)
            batch_sums.append(np.sum(me_array, axis=0, where=above_threshold))
            batch_counts.append(np.count_nonzero(above_threshold, axis=0))

            # explicitly delete some variables for memory saving
            del df
            del me_array
            del above_threshold

        # without any contributing batch, np.sum over the empty list would give a scalar
        # and the masked assignment below would fail, so skip this era/ME instead
        if len(batch_sums) == 0:
            print(f'WARNING: no batch passed the filters for era {era}, ME {melabel}; skipping.')
            continue

        # make total normalized sum over batches
        mesum = np.sum(np.array(batch_sums), axis=0)
        mesum_mean = np.mean(mesum)
        if mesum_mean == 0:
            # nothing above threshold anywhere: normalizing would give an all-NaN map
            print(f'WARNING: no bin values above threshold for era {era}, ME {melabel}; skipping.')
            continue
        mesum = mesum / mesum_mean
        # set small but nonzero values to zero for stability
        mesum[mesum < 0.05] = 0

        # calculate total per-bin nonzero counts
        # (zero counts are replaced by 1 to avoid dividing by zero below)
        mecounts = np.sum(np.array(batch_counts), axis=0)
        mecounts = np.where(mecounts==0, 1, mecounts)

        # calculate normalized per-bin averaged value
        avgme = np.divide(mesum, mecounts)
        avgme = avgme / np.mean(avgme)
        # set small values to zero (so they will be automatically masked)
        avgme[avgme < 0.05] = 0

        # plot result (only the normalized mean; mesum and mecounts can be
        # passed to plottools.plot_hist_2d in the same way for debugging)
        if doplot:
            # NOTE(review): the color axis title says 'Number of clusters' although the ME
            # is a digi occupancy - confirm whether this label is intended
            title = melabel + f', era {era} normalized mean'
            fig, ax = plottools.plot_hist_2d(avgme, figsize=(12,6), title=title, titlesize=15,
                xaxtitle=None, xaxtitlesize=None, yaxtitle=None, yaxtitlesize=None,
                ticklabelsize=12, colorticklabelsize=12, extent=None, aspect=None,
                docolorbar=True, caxtitle='Number of clusters', caxrange=(1e-6, 2), caxtitlesize=15, caxtitleoffset=15,
                origin='lower')
            plt.show()
            plt.close()

        # save array
        if dosave:
            outputfile = f'avgme_{era}_{melabel}.npy'
            outputfile = os.path.join(outputdir, outputfile)
            np.save(outputfile, avgme)

### Alternative approach: per batch

In [None]:
# calculate the normalized average ME per batch of runs, for each era and ME
#
# For every (ME, era) pair the rows of the input file are grouped into batches
# of complete runs. A batch boundary is only placed at a run-number change, and
# only if both the batch being closed and the remainder contain at least
# min_batch_size rows with entries > 0. Each batch is then normalized, plotted
# (optional) and saved as its own .npy file (optional).

outputdir = 'normdata'
# exist_ok=True replaces the separate existence check (no check-then-create race)
os.makedirs(outputdir, exist_ok=True)

doplot = True
dosave = True

# settings that are the same for every era and ME
min_batch_size = 20000  # minimum number of non-empty rows per batch
threshold = 50          # only bin values strictly above this enter the sum and the counts

for melabel, mename in mes.items():
    for era in eras:
        print(f'Now running on era {era}, ME {melabel}...')
        f = files[era][melabel]

        # read number of instances in this era
        metadata = iotools.read_parquet(f, columns=['run_number', 'entries'])
        nlumis = len(metadata)
        print(f'Found {nlumis} lumisections in this era.')
        if nlumis == 0:
            # nothing to split into batches (indexing run_nbs[0] below would fail)
            print(f'WARNING: no lumisections found for era {era}, ME {melabel}; skipping.')
            continue

        # split in batches based on run number and target batch size
        mask_nonempty = (metadata['entries'].values > 0)
        run_nbs = metadata['run_number'].values
        # indices of the first row of every new run (positions where the run number changes)
        run_nb_change_ids = np.nonzero(run_nbs[:-1] != run_nbs[1:])[0] + 1
        batch_ids = [0]
        for idx in run_nb_change_ids:
            if( np.sum(mask_nonempty[batch_ids[-1]:idx]) >= min_batch_size
                and np.sum(mask_nonempty[idx:]) >= min_batch_size ):
                batch_ids.append(idx)
        # close the last batch with an exclusive end index; using -1 here would make
        # the slices below stop one row early and drop the final lumisection
        batch_ids.append(len(run_nbs))
        batch_first_runs = [run_nbs[idx] for idx in batch_ids[:-1]]
        batch_last_runs = [run_nbs[idx-1] for idx in batch_ids[1:]]
        num_batches = len(batch_first_runs)
        # unique run numbers contained in each batch
        batch_runs = [np.unique(run_nbs[batch_ids[idx]:batch_ids[idx+1]]) for idx in range(num_batches)]

        # loop over batches
        for batchidx in range(num_batches):
            print(f'  - Extracting batch {batchidx+1} / {num_batches}...')

            # get batch (presumably all rows belonging to the given runs - TODO confirm)
            df = iotools.read_runs(f, batch_runs[batchidx])

            # do filtering: keep only rows with a positive 'entries' value
            df = df[df['entries'] > 0]
            print(f'    Found {len(df)} entries in batch passing filters.')
            if len(df)==0: continue

            # extract data
            # (first and last run are taken after filtering and are used in the output file name)
            first_run = df['run_number'].values[0]
            last_run = df['run_number'].values[-1]
            # assumes axis 0 of me_array runs over the selected rows (lumisections) - TODO confirm
            me_array, _, _ = dftools.get_mes(df, xbinscolumn='x_bin', ybinscolumn='y_bin', runcolumn='run_number', lumicolumn='ls_number')
            print(f'    Found array of shape {me_array.shape}')

            # per-bin sum and per-bin number of contributions above threshold;
            # the mask is computed once and shared by both reductions
            above_threshold = (me_array > threshold)
            mesum = np.sum(me_array, axis=0, where=above_threshold)
            mecounts = np.count_nonzero(above_threshold, axis=0)

            # explicitly delete some variables for memory saving
            del df
            del me_array
            del above_threshold

            # post-processing of mesum
            mesum_mean = np.mean(mesum)
            if mesum_mean == 0:
                # nothing above threshold anywhere: normalizing would give an all-NaN map
                print(f'    WARNING: no bin values above threshold in this batch; skipping.')
                continue
            mesum = mesum / mesum_mean
            # set small but nonzero values to zero for stability
            mesum[mesum < 0.05] = 0

            # post-processing of mecounts
            # (zero counts are replaced by 1 to avoid dividing by zero below)
            mecounts = np.where(mecounts==0, 1, mecounts)

            # calculate normalized per-bin averaged value
            avgme = np.divide(mesum, mecounts)
            avgme = avgme / np.mean(avgme)
            # set small values to zero (so they will be automatically masked)
            avgme[avgme < 0.05] = 0

            # plot result
            if doplot:

                # NOTE(review): the color axis title says 'Number of clusters' although the ME
                # is a digi occupancy - confirm whether this label is intended
                title = melabel + f', era {era} normalized mean'
                fig, ax = plottools.plot_hist_2d(avgme, figsize=(12,6), title=title, titlesize=15,
                  xaxtitle=None, xaxtitlesize=None, yaxtitle=None, yaxtitlesize=None,
                  ticklabelsize=12, colorticklabelsize=12, extent=None, aspect=None,
                  docolorbar=True, caxtitle='Number of clusters', caxrange=(1e-6, 2), caxtitlesize=15, caxtitleoffset=15,
                  origin='lower')
                plt.show()
                plt.close()

            # save array (one file per batch, tagged with batch index and run range)
            if dosave:
                outputfile = f'avgme_{era}_{melabel}_batch{batchidx}_run{first_run}-{last_run}.npy'
                outputfile = os.path.join(outputdir, outputfile)
                np.save(outputfile, avgme)

### Plot results

In [None]:
# directory from which the per-era arrays (avgme_<era>_<ME label>.npy) are loaded
outputdir = 'normdata'

# keyword arguments that are identical for every plot
plot_style = dict(
    figsize=(12,6), titlesize=15,
    xaxtitlesize=15, yaxtitlesize=15,
    ticklabelsize=12, colorticklabelsize=12,
    docolorbar=True, caxtitle='Digi occupancy',
    caxtitlesize=15, caxtitleoffset=15,
)

# one figure per ME and per era, in that order
for melabel in mes.keys():
    for era in eras:

        # load array
        f = os.path.join(outputdir, f'avgme_{era}_{melabel}.npy')
        avgme = np.load(f)

        # plot result
        title = melabel + f', era {era} normalized mean'
        fig, ax = plot_cluster_occupancy(avgme, title=title, **plot_style)
        plt.show()
        plt.close()