# Preparing Data for the Autoencoder (AE)

### Imports and function definitions

In [1]:
import pandas as pd
import numpy as np
import timeit, math, os, copy, yaml, ast
import runregistry
from omsapi import OMSAPI
import matplotlib.pyplot as plt
from scipy.stats import linregress
from scipy.optimize import curve_fit
#%load_ext memory_profiler

In [2]:
def Make_img(histo, Xbins, Xmin, Xmax, Ybins, Ymin, Ymax):
    """Copy a 2D histogram into a zero-padded float32 image.

    Parameters
    ----------
    histo : sequence of sequences
        Bin contents indexed as ``histo[y_bin][x_bin]``.
    Xbins, Ybins : number
        Number of bins to copy along x (columns) and y (rows); cast to int.
    Xmin, Xmax, Ymin, Ymax :
        Axis ranges. Accepted for interface compatibility; not used here.

    Returns
    -------
    numpy.ndarray
        float32 image of shape ``(max(100, Ybins), max(100, Xbins))``; cells
        outside the copied ``Ybins x Xbins`` region are zero.

    Raises
    ------
    IndexError
        If ``histo`` has fewer than ``Ybins`` rows or a row has fewer than
        ``Xbins`` entries.
    """
    n_cols = int(Xbins)
    n_rows = int(Ybins)
    # The canvas is never smaller than the historical 100x100, but it grows
    # when the histogram has more bins (a fixed size raised IndexError there).
    img = np.zeros((max(100, n_rows), max(100, n_cols)), dtype=np.float32)

    if n_cols > 0:
        for i in range(n_rows):
            # One vectorised assignment per row instead of one per bin.
            row = np.asarray(histo[i][:n_cols], dtype=np.float32)
            if row.shape[0] != n_cols:
                raise IndexError(
                    f"row {i} of histo has {row.shape[0]} entries, expected {n_cols}"
                )
            img[i, :n_cols] = row
    return img
    
def Show2Dimg(img, title='CSC occupancy'):
    """Display a 2D occupancy image with empty (zero) bins drawn in white.

    Parameters
    ----------
    img : array-like
        2D image; it is not modified (a float copy is plotted).
    title : str
        Title of the plot.
    """
    vmin = 0.0000000001
    # Float copy: NaN cannot be stored in an integer array, and the caller's
    # image must stay untouched.
    img_temp = np.array(img, dtype=float)
    # Work on a copy of the colormap so that set_under() does not alter the
    # `plt.cm.jet` object that other plots may also use.
    cmap = copy.copy(plt.cm.jet)
    cmap.set_under(color='white')
    max_ = np.max(img_temp)
    # Zero bins become NaN so that they are not coloured.
    img_temp[img_temp == 0] = np.nan
    # Keep vmax >= vmin: for an all-zero image max_ is 0, which is below vmin.
    plt.imshow(img_temp, cmap=cmap, vmin=vmin, vmax=max(max_, vmin))
    plt.colorbar()
    plt.gca().invert_yaxis()
    plt.title(title)
    plt.show()

In [3]:
def Show1Dimg(vx, vy, xfit=None, yfit=None, x=r"Lumi [10$^{33}$ cm$^{-2}$ s$^{-1}$]", y="Occupancy (Hits/LS)", eymin=4, eymax=4, marker='.', line=False):
    """Scatter-plot ``vy`` against ``vx`` and optionally overlay a fitted curve.

    Parameters
    ----------
    vx, vy : array-like
        Data points, drawn as markers without a connecting line.
    xfit, yfit : array-like, optional
        Fitted curve. It is drawn only when ``yfit`` is given; if ``xfit`` is
        omitted the curve is drawn against ``vx``.
    x, y : str
        Axis labels.
    eymin, eymax : int
        Passed as ``scilimits`` for the scientific notation of the y axis.
    marker : str
        Marker style of the data points.
    line : bool
        When true, draw a dashed red vertical line at x = 9.
    """
    plt.figure(figsize=(7, 4))
    if line:
        plt.axvline(x=9, color='red', linestyle='--')
    plt.plot(vx, vy, marker=marker, linestyle='', markersize=5, label='Data')
    if yfit is not None:
        # Fall back to the data abscissa when no dedicated fit abscissa is given.
        plt.plot(vx if xfit is None else xfit, yfit, color='red', label='Fit')
    plt.xlabel(x, size='14')
    plt.ylabel(y, size='14')
    plt.grid(True)
    plt.ticklabel_format(axis="y", style="sci", scilimits=(eymin,eymax))
    # plt.rc only changes the defaults used by ticks created afterwards; it is
    # kept so later figures behave as before, and tick_params applies the same
    # label size to the figure being drawn now.
    plt.rc('xtick', labelsize='12')
    plt.rc('ytick', labelsize='12')
    plt.tick_params(axis='both', labelsize=12)
    plt.title(r'$\mathbf{CMS}\ \mathit{Private\ work}$', x=0.24, y=1.0, size=14)
    # NOTE(review): the label says 2023 while the datasets loaded in this
    # notebook are Run2024D/E — confirm which year is intended.
    plt.title('2023 (13.6 TeV)',loc='right', size=14)
    # Single legend call, made after every labelled artist has been added.
    plt.legend()
    plt.show()

### Read data and add lumi info

In [4]:
# Name of the monitoring element (ME) handled by this notebook.
me = "hRHGlobalm3"
dirs = os.listdir(".")
# Keep the entries of the working directory that are directories and whose
# name contains the ME name.
me_dirs = list(filter(lambda entry: me in entry and os.path.isdir(entry), dirs))
print(me_dirs)

['hRHGlobalm3E', 'hRHGlobalm3D']


In [5]:
# Minimum file size in bytes for a file to be read.
# NOTE(review): presumably smaller files contain no usable rows — confirm the threshold.
MIN_FILE_SIZE_BYTES = 601

# Collect, from every ME directory, the files whose name contains the ME name.
files_all = []
for me_dir in me_dirs:  # not `dir`: that would shadow the builtin for the rest of the notebook
    files = os.listdir(me_dir)
    me_files = [me_dir+"/"+i for i in files if me in i]
    files_all.extend(me_files)
filtered_files = [file for file in files_all if os.path.exists(file) and os.path.getsize(file) >= MIN_FILE_SIZE_BYTES]

In [6]:
%%time
#monitoring_elements =pl.read_parquet(filtered_files).filter((pl.col('dataset').str.contains("StreamExpress"))).to_pandas()
monitoring_elements = pd.read_parquet(filtered_files)
monitoring_elements = monitoring_elements[monitoring_elements['dataset'].str.contains("StreamExpress")]

CPU times: user 17.4 s, sys: 5.06 s, total: 22.5 s
Wall time: 9.6 s


In [7]:
#%memit

In [8]:
# Show the loaded monitoring-element frame.
monitoring_elements

Unnamed: 0,dataset,me,dataset_id,file_id,run_number,ls_number,me_id,x_min,x_max,x_bin,y_min,y_max,y_bin,entries,data
0,/StreamExpress/Run2024E-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14964606,14895359997,381053,1,3,-800.0,800.0,100.0,-800.0,800.0,100.0,44,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,/StreamExpress/Run2024E-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14964606,14895359997,381053,2,3,-800.0,800.0,100.0,-800.0,800.0,100.0,24,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,/StreamExpress/Run2024E-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14964606,14895359997,381053,3,3,-800.0,800.0,100.0,-800.0,800.0,100.0,100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,/StreamExpress/Run2024E-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14964606,14895359997,381053,4,3,-800.0,800.0,100.0,-800.0,800.0,100.0,87,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,/StreamExpress/Run2024E-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14964606,14895359997,381053,5,3,-800.0,800.0,100.0,-800.0,800.0,100.0,74,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128361,/StreamExpress/Run2024D-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14957597,14540943117,380649,556,3,-800.0,800.0,100.0,-800.0,800.0,100.0,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
128362,/StreamExpress/Run2024D-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14957597,14540943117,380649,557,3,-800.0,800.0,100.0,-800.0,800.0,100.0,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
128363,/StreamExpress/Run2024D-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14957597,14540943117,380649,558,3,-800.0,800.0,100.0,-800.0,800.0,100.0,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
128364,/StreamExpress/Run2024D-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14957597,14540943117,380649,559,3,-800.0,800.0,100.0,-800.0,800.0,100.0,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [9]:
# Distinct dataset names present in the loaded frame.
dataset = pd.unique(monitoring_elements["dataset"])
print(dataset)

['/StreamExpress/Run2024E-Express-v1/DQMIO'
 '/StreamExpress/Run2024D-Express-v1/DQMIO']


In [10]:
#monitoring_elements = monitoring_elements[monitoring_elements['dataset'].str.contains("StreamExpress")]

In [11]:
# Sorted array of the distinct run numbers (np.unique both de-duplicates and sorts).
run_list = np.unique(monitoring_elements["run_number"].to_numpy())
print(run_list)

[380255 380306 380307 380308 380309 380310 380325 380328 380329 380346
 380348 380349 380357 380360 380377 380384 380385 380399 380400 380401
 380403 380428 380442 380443 380444 380446 380447 380466 380470 380480
 380481 380513 380516 380517 380531 380532 380533 380534 380535 380537
 380538 380564 380565 380567 380620 380623 380624 380625 380626 380627
 380647 380648 380649 380808 380809 380812 380817 380818 380842 380843
 380844 380845 380846 380847 380848 380854 380860 380878 380879 380883
 380884 380895 380924 380925 380932 380933 380934 380942 380943 380944
 380945 380946 380947 380956 380963 381012 381017 381022 381053 381065
 381067 381068 381069 381070 381075 381078 381079 381080 381094 381102
 381105 381113 381114 381115 381147 381148 381149 381150 381151 381152
 381164 381166 381189 381190 381191 381199 381212 381277 381286 381289
 381290 381291 381292 381294 381298 381304 381309 381338 381341 381351
 381358 381364 381365 381371 381379 381380 381384 381398 381417 381443
 38145

In [12]:
# Query the Run Registry once per run and summarise the CSC lumisection
# counts found under the "csc-csc" entry. A run is flagged BAD when its class
# does not contain "Collisions", when it has any STANDBY or BAD lumisections,
# or when it has no GOOD lumisections.
CSC_LS_STATES = ("GOOD", "STANDBY", "BAD", "EMPTY")
runreg_records = []
bad_runs = []
for r in run_list:
    run = runregistry.get_run(run_number=int(r))
    lumisections = run["lumisections"]
    # Runs without a "csc-csc" entry keep all four counters at 0.
    csc_counts = lumisections["csc-csc"] if 'csc-csc' in lumisections else {}
    # `run_info` rather than `dict`, so the builtin is not shadowed in the loop.
    run_info = {"run_number": int(r), "class": run["class"]}
    for state in CSC_LS_STATES:
        run_info["csc" + state] = csc_counts.get(state, 0)
    runreg_records.append(run_info)
    if ("Collisions" not in run_info["class"]) or run_info["cscSTANDBY"]!=0 or run_info["cscBAD"]!=0 or run_info["cscGOOD"]==0:
        print("Run :", r, "--> BAD")
        bad_runs.append(int(r))
    else:
        print("Run :", r, "--> GOOD")
    print(run_info)
# Build the frame once from the collected records instead of concatenating
# onto an initially empty frame on every iteration; the column order matches
# what the per-run concatenation produced.
runreg_df = pd.DataFrame(runreg_records, columns=["run_number", "cscGOOD", "cscSTANDBY", "cscBAD", "cscEMPTY", "class"])

Run : 380255 --> BAD
{'run_number': 380255, 'class': 'Commissioning24', 'cscGOOD': 0, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380306 --> GOOD
{'run_number': 380306, 'class': 'Collisions24', 'cscGOOD': 281, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380307 --> GOOD
{'run_number': 380307, 'class': 'Collisions24', 'cscGOOD': 29, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380308 --> GOOD
{'run_number': 380308, 'class': 'Collisions24', 'cscGOOD': 44, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380309 --> GOOD
{'run_number': 380309, 'class': 'Collisions24', 'cscGOOD': 356, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380310 --> GOOD
{'run_number': 380310, 'class': 'Collisions24', 'cscGOOD': 1213, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380325 --> BAD
{'run_number': 380325, 'class': 'Commissioning24', 'cscGOOD': 0, 'cscSTANDBY': 25, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380328 --> BAD
{'run_number': 380328, 'class': 'Commissioning24', 'cscGOOD': 

In [13]:
# Runs flagged BAD by the Run Registry check; they are removed from the frame next.
print(bad_runs)

[380255, 380325, 380328, 380329, 380357, 380428, 380442, 380443, 380480, 380620, 380808, 380809, 380812, 380817, 380818, 380842, 380843, 380848, 380854, 380895, 380924, 380925, 380956, 381012, 381017, 381094, 381102, 381105, 381166, 381277, 381289, 381290, 381291, 381292, 381294, 381298, 381304, 381338, 381341, 381351, 381398, 381458, 381464, 381465, 381484, 381542, 381593]


In [14]:
# Discard every row that belongs to a run listed in bad_runs.
monitoring_elements = monitoring_elements.loc[
    lambda frame: ~frame['run_number'].isin(bad_runs)
]

In [15]:
# Recompute the sorted distinct run numbers after the bad runs were removed.
run_list = np.unique(monitoring_elements["run_number"].to_numpy())
print(run_list)

[380306 380307 380308 380309 380310 380346 380348 380349 380360 380377
 380384 380385 380399 380400 380401 380403 380444 380446 380447 380466
 380470 380481 380513 380516 380517 380531 380532 380533 380534 380535
 380537 380538 380564 380565 380567 380623 380624 380625 380626 380627
 380647 380648 380649 380844 380845 380846 380847 380860 380878 380879
 380883 380884 380932 380933 380934 380942 380943 380944 380945 380946
 380947 380963 381022 381053 381065 381067 381068 381069 381070 381075
 381078 381079 381080 381113 381114 381115 381147 381148 381149 381150
 381151 381152 381164 381189 381190 381191 381199 381212 381286 381309
 381358 381364 381365 381371 381379 381380 381384 381417 381443 381477
 381478 381479 381480 381499 381500 381515 381516 381543 381544 381594]


In [16]:
# Load the notebook configuration (it holds the API client credentials used
# for the OMS authentication).
with open("config.yaml", 'r') as f:
    try:
        info = yaml.safe_load(f)
    except yaml.YAMLError as exc:
        print(f"Errore nella lettura del file: {exc}")
        # Re-raise: continuing would leave `info` undefined (or stale from a
        # previous run) and only fail later with a less helpful error.
        raise

In [17]:
# Create the OMS API client and authenticate through OIDC with the client
# ID/secret stored under "APIClient" in the loaded configuration.
# NOTE(review): cert_verify=False presumably disables TLS certificate
# verification for these requests — confirm that this is intended.
omsapi = OMSAPI("https://cmsoms.cern.ch/agg/api", "v1", cert_verify=False)
omsapi.auth_oidc(info["APIClient"]["client_ID"], info["APIClient"]["Client_Secret"])

In [18]:
# For every remaining run, fetch its lumisection records from OMS and store
# their "attributes" as one DataFrame per run.
lumi_dfs = []
for r in run_list:
    print("Run :", r)
    ls_query = omsapi.query("lumisections")
    ls_query.filter("run_number", r)
    # A single page with a very large page size, sorted by descending lumisection number.
    ls_query.sort("lumisection_number", asc=False).paginate(page=1, per_page=100000)
    response = ls_query.data().json()["data"]
    df = [entry["attributes"] for entry in response]
    lumi_dfs.append(pd.DataFrame(df))
    # Release the per-run temporaries before the next request.
    del df, response, ls_query

Run : 380306
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380306&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380307
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380307&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380308
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380308&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380309
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380309&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380310
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380310&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380346
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380346&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380348
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380348&sort=-lumisectio

In [19]:
# Default used to replace missing values, per column.
fill_defaults = {
    'castor_ready': False,
    'gem_ready': False,
    'zdc_ready': False,
    'prescale_index': -1,
    'prescale_name': "",
}
# The option opts in to the future pandas behaviour where fillna does not
# silently downcast object columns.
with pd.option_context('future.no_silent_downcasting', True):
    for lumi_df in lumi_dfs:
        for column, default in fill_defaults.items():
            lumi_df[column] = lumi_df[column].fillna(default)

In [20]:
# Stack the per-run frames row-wise; each frame keeps its own row labels, so
# labels repeat across runs in the result.
lumi_info = pd.concat(lumi_dfs, axis=0, ignore_index=False)

In [21]:
# Rename the lumisection column to match the monitoring-element frame and add
# the average of the initial and final luminosity of each lumisection.
lumi_info = (
    lumi_info
    .rename(columns={'lumisection_number': 'ls_number'})
    .assign(mean_lumi=lambda frame: (frame["init_lumi"] + frame["end_lumi"]) / 2)
)

In [22]:
# List the columns available in the combined lumisection frame.
print(lumi_info.columns)

Index(['beam1_present', 'bpix_ready', 'ho_ready', 'dtp_ready', 'tecm_ready',
       'delivered_lumi_per_lumisection', 'recorded_lumi_per_lumisection',
       'castor_ready', 'init_lumi', 'hbhea_ready', 'recorded_lumi',
       'prescale_name', 'dtm_ready', 'end_lumi', 'beams_stable', 'esm_ready',
       'gemm_ready', 'ebp_ready', 'cscm_ready', 'start_time', 'beam1_stable',
       'hbhec_ready', 'rp_time_ready', 'cscp_ready', 'physics_flag',
       'dt0_ready', 'gem_ready', 'ls_number', 'tibtid_ready', 'fpix_ready',
       'rpc_ready', 'rp_sect_56_ready', 'pileup', 'esp_ready', 'eep_ready',
       'ebm_ready', 'delivered_lumi', 'gemp_ready', 'eem_ready', 'fill_number',
       'beam_present', 'tecp_ready', 'end_time', 'hf_ready',
       'rp_sect_45_ready', 'cms_active', 'prescale_index', 'zdc_ready',
       'hbheb_ready', 'tob_ready', 'run_number', 'beam2_stable',
       'beam2_present', 'mean_lumi'],
      dtype='object')


In [23]:
# Attach the lumisection information to each histogram row; the left join
# keeps rows that have no matching lumisection record.
monitoring_elements = monitoring_elements.merge(
    lumi_info,
    how='left',
    on=['run_number', 'ls_number'],
)

In [24]:
#%memit

In [25]:
# Keep only the lumisections for which every listed flag is True.
required_flags = ["beams_stable", "cscm_ready", "cms_active", "beam_present", "physics_flag"]
monitoring_elements = monitoring_elements[
    monitoring_elements[required_flags].eq(True).all(axis=1)
]


In [26]:
# Order by run and lumisection and renumber the rows from 0, discarding the
# old row labels rather than keeping them as a column.
monitoring_elements = (
    monitoring_elements
    .sort_values(by=['run_number', 'ls_number'])
    .reset_index(drop=True)
)

In [27]:
# Show the filtered, sorted frame with the lumisection columns attached.
monitoring_elements

Unnamed: 0,dataset,me,dataset_id,file_id,run_number,ls_number,me_id,x_min,x_max,x_bin,...,hf_ready,rp_sect_45_ready,cms_active,prescale_index,zdc_ready,hbheb_ready,tob_ready,beam2_stable,beam2_present,mean_lumi
0,/StreamExpress/Run2024D-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14957597,14276336277,380306,28,3,-800.0,800.0,100.0,...,True,False,True,9.0,False,True,True,True,True,0.595781
1,/StreamExpress/Run2024D-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14957597,14276336277,380306,29,3,-800.0,800.0,100.0,...,True,False,True,9.0,False,True,True,True,True,2.181817
2,/StreamExpress/Run2024D-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14957597,14276336277,380306,30,3,-800.0,800.0,100.0,...,True,False,True,9.0,False,True,True,True,True,5.422793
3,/StreamExpress/Run2024D-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14957597,14276366957,380306,31,3,-800.0,800.0,100.0,...,True,False,True,9.0,False,True,True,True,True,9.724593
4,/StreamExpress/Run2024D-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14957597,14276367117,380306,32,3,-800.0,800.0,100.0,...,True,False,True,9.0,False,True,True,True,True,12.181449
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58206,/StreamExpress/Run2024E-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14964606,15114499157,381594,1009,3,-800.0,800.0,100.0,...,True,False,True,15.0,False,True,True,True,True,0.000000
58207,/StreamExpress/Run2024E-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14964606,15114499157,381594,1010,3,-800.0,800.0,100.0,...,True,False,True,15.0,False,True,True,True,True,0.000000
58208,/StreamExpress/Run2024E-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14964606,15114499157,381594,1011,3,-800.0,800.0,100.0,...,True,False,True,15.0,False,True,True,True,True,0.002853
58209,/StreamExpress/Run2024E-Express-v1/DQMIO,CSC/CSCOfflineMonitor/recHits/hRHGlobalm3,14964606,15114499157,381594,1012,3,-800.0,800.0,100.0,...,True,False,True,15.0,False,True,True,True,True,0.019838


In [29]:
# Write the prepared frame to "<ME name>_files_p1.parquet" without the row index.
monitoring_elements.to_parquet(f"{me}_files_p1.parquet", index=False)