# Preparing Data 4 the AE

### Import and function definition

In [1]:
import sys, os
sys.path.append(os.path.abspath('./oms-api-client'))

In [2]:
import pandas as pd
import numpy as np
import timeit, math, copy, yaml, ast
from omsapi import OMSAPI
import matplotlib.pyplot as plt
from scipy.stats import linregress
from scipy.optimize import curve_fit
import re
from drawing_utilities import *
import pyarrow as pa

In [3]:
try:
    import runregistry
except:
    #Change this according to !pip show run-registry
    sys.path.append('/eos/home-i03/m/mcrucian/.local/lib/python3.9/site-packages')
    import runregistry

### Read data and add lumi info

In [4]:
path = "/eos/cms/store/group/ml/AD4MVDHackathon/ML4DQM_MUON/MEs"
dirs = os.listdir(path)

# Regex pattern to capture the part up to the number
pattern = r'hRHGlobal[mp]\d+'

unique_matches = sorted({match for dir_name in os.listdir(path) for match in re.findall(pattern, dir_name)})

# Display the unique matches
print("Available MEs in dir:")
print('\n'.join(unique_matches))

Available MEs in dir:
hRHGlobalm2
hRHGlobalm3
hRHGlobalm4
hRHGlobalp2
hRHGlobalp3
hRHGlobalp4


In [5]:
out_label = "180924"
me = "hRHGlobalm2"
dirs = os.listdir(path)
me_dirs = [os.path.join(path, i) for i in dirs if me in i and os.path.isdir(os.path.join(path, i))]
print(me_dirs)

['/eos/cms/store/group/ml/AD4MVDHackathon/ML4DQM_MUON/MEs/hRHGlobalm2D', '/eos/cms/store/group/ml/AD4MVDHackathon/ML4DQM_MUON/MEs/hRHGlobalm2E', '/eos/cms/store/group/ml/AD4MVDHackathon/ML4DQM_MUON/MEs/hRHGlobalm2F', '/eos/cms/store/group/ml/AD4MVDHackathon/ML4DQM_MUON/MEs/hRHGlobalm2G']


In [6]:
files_all = []
for dir in me_dirs:
    files = os.listdir(dir)
    me_files = [dir+"/"+i for i in files if me in i]
    files_all = files_all + me_files
filtered_files = [file for file in files_all if os.path.exists(file) and os.path.getsize(file) >= 601]
filtered_files

['/eos/cms/store/group/ml/AD4MVDHackathon/ML4DQM_MUON/MEs/hRHGlobalm2D/CSC_CSCOfflineMonitor_recHits_hRHGlobalm2_None_380253_380287.parquet',
 '/eos/cms/store/group/ml/AD4MVDHackathon/ML4DQM_MUON/MEs/hRHGlobalm2D/CSC_CSCOfflineMonitor_recHits_hRHGlobalm2_None_380288_380322.parquet',
 '/eos/cms/store/group/ml/AD4MVDHackathon/ML4DQM_MUON/MEs/hRHGlobalm2D/CSC_CSCOfflineMonitor_recHits_hRHGlobalm2_None_380323_380357.parquet',
 '/eos/cms/store/group/ml/AD4MVDHackathon/ML4DQM_MUON/MEs/hRHGlobalm2D/CSC_CSCOfflineMonitor_recHits_hRHGlobalm2_None_380358_380392.parquet',
 '/eos/cms/store/group/ml/AD4MVDHackathon/ML4DQM_MUON/MEs/hRHGlobalm2D/CSC_CSCOfflineMonitor_recHits_hRHGlobalm2_None_380393_380427.parquet',
 '/eos/cms/store/group/ml/AD4MVDHackathon/ML4DQM_MUON/MEs/hRHGlobalm2D/CSC_CSCOfflineMonitor_recHits_hRHGlobalm2_None_380428_380462.parquet',
 '/eos/cms/store/group/ml/AD4MVDHackathon/ML4DQM_MUON/MEs/hRHGlobalm2D/CSC_CSCOfflineMonitor_recHits_hRHGlobalm2_None_380463_380497.parquet',
 '/eos

In [7]:
%%time
import dask.dataframe as dd
# Read the parquet file
monitoring_elements = dd.read_parquet(filtered_files)

# Filter rows
monitoring_elements = monitoring_elements[monitoring_elements['dataset'].str.contains("StreamExpress")]

CPU times: user 787 ms, sys: 237 ms, total: 1.02 s
Wall time: 2.82 s


In [8]:
monitoring_elements

Unnamed: 0_level_0,dataset,me,dataset_id,file_id,run_number,ls_number,me_id,x_min,x_max,x_bin,y_min,y_max,y_bin,entries,data
npartitions=65,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
,object,object,int64,int64,int64,int64,int64,float64,float64,float64,float64,float64,float64,int64,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [9]:
dataset = monitoring_elements["dataset"].unique()
print(dataset)

Dask Series Structure:
npartitions=1
    object
       ...
Name: dataset, dtype: object
Dask Name: unique-agg, 7 graph layers


In [10]:
run_list = np.sort(np.unique(monitoring_elements["run_number"].unique()))
print(run_list)

[380255 380306 380307 380308 380309 380310 380325 380328 380329 380346
 380348 380349 380357 380360 380377 380384 380385 380399 380400 380401
 380403 380428 380442 380443 380444 380446 380447 380466 380470 380480
 380481 380513 380516 380517 380531 380532 380533 380534 380535 380537
 380538 380564 380565 380567 380620 380623 380624 380625 380626 380627
 380647 380648 380649 380711 380712 380723 380725 380731 380736 380808
 380809 380812 380817 380818 380842 380843 380844 380845 380846 380847
 380848 380854 380860 380878 380879 380883 380884 380895 380924 380925
 380932 380933 380934 380942 380943 380944 380945 380946 380947 380956
 380963 381012 381017 381022 381053 381065 381067 381068 381069 381070
 381075 381078 381079 381080 381094 381102 381105 381113 381114 381115
 381147 381148 381149 381150 381151 381152 381164 381166 381189 381190
 381191 381199 381212 381277 381286 381289 381290 381291 381292 381294
 381298 381304 381309 381338 381341 381351 381358 381364 381365 381371
 38137

In [11]:
#if you get errors here, go through the readme and look for the instructions on how to get runregistry credentials
runreg_df = pd.DataFrame(columns=["run_number", "cscGOOD", "cscSTANDBY", "cscBAD", "cscEMPTY"])
bad_runs = []
for r in run_list:
    run = runregistry.get_run(run_number=int(r))
    try:
        dict = {"run_number": int(r), "class": run["class"], "cscGOOD": 0, "cscSTANDBY":0, "cscBAD":0, "cscEMPTY":0}
        if 'csc-csc' in run["lumisections"]:
            #print("Run :", r)
            data_dict = run["lumisections"]["csc-csc"]
            for key in data_dict.keys():
                if key == "GOOD":
                    dict["cscGOOD"] = data_dict["GOOD"]
                if key == "STANDBY":
                    dict["cscSTANDBY"] = data_dict["STANDBY"]
                if key == "BAD":
                    dict["cscBAD"] = data_dict["BAD"]
                if key == "EMPTY":
                    dict["cscEMPTY"] = data_dict["EMPTY"]
            del data_dict
        runreg_df = pd.concat([runreg_df, pd.DataFrame([dict])], ignore_index=True)
    except:
        dict = {"run_number": int(r), "class": "BAD", "cscGOOD": 0, "cscSTANDBY":0, "cscBAD":0, "cscEMPTY":0}
    if ("Collisions" not in dict["class"]) or dict["cscSTANDBY"]!=0 or dict["cscBAD"]!=0 or dict["cscGOOD"]==0:
        print("Run :", r, "--> BAD")
        bad_runs.append(int(r))
    else:
        print("Run :", r, "--> GOOD")
    print(dict)
    del dict
    del run

Run : 380255 --> BAD
{'run_number': 380255, 'class': 'Commissioning24', 'cscGOOD': 0, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380306 --> GOOD
{'run_number': 380306, 'class': 'Collisions24', 'cscGOOD': 281, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380307 --> GOOD
{'run_number': 380307, 'class': 'Collisions24', 'cscGOOD': 29, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380308 --> GOOD
{'run_number': 380308, 'class': 'Collisions24', 'cscGOOD': 44, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380309 --> GOOD
{'run_number': 380309, 'class': 'Collisions24', 'cscGOOD': 356, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380310 --> GOOD
{'run_number': 380310, 'class': 'Collisions24', 'cscGOOD': 1213, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380325 --> BAD
{'run_number': 380325, 'class': 'Commissioning24', 'cscGOOD': 0, 'cscSTANDBY': 25, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380328 --> BAD
{'run_number': 380328, 'class': 'Commissioning24', 'cscGOOD': 

Run : 380818 --> BAD
{'run_number': 380818, 'class': 'Commissioning24', 'cscGOOD': 89, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380842 --> BAD
{'run_number': 380842, 'class': 'Commissioning24', 'cscGOOD': 0, 'cscSTANDBY': 21, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380843 --> BAD
{'run_number': 380843, 'class': 'Collisions24Special', 'cscGOOD': 23, 'cscSTANDBY': 52, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380844 --> GOOD
{'run_number': 380844, 'class': 'Collisions24Special', 'cscGOOD': 17, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380845 --> GOOD
{'run_number': 380845, 'class': 'Collisions24Special', 'cscGOOD': 131, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380846 --> GOOD
{'run_number': 380846, 'class': 'Collisions24Special', 'cscGOOD': 390, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380847 --> GOOD
{'run_number': 380847, 'class': 'Collisions24Special', 'cscGOOD': 1054, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 380848 --> BAD
{'run_number': 380848, 'cl

Run : 381289 --> BAD
{'run_number': 381289, 'class': 'Commissioning24', 'cscGOOD': 0, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 381290 --> BAD
{'run_number': 381290, 'class': 'Commissioning24', 'cscGOOD': 0, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 381291 --> BAD
{'run_number': 381291, 'class': 'Commissioning24', 'cscGOOD': 0, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 381292 --> BAD
{'run_number': 381292, 'class': 'Commissioning24', 'cscGOOD': 0, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 381294 --> BAD
{'run_number': 381294, 'class': 'Commissioning24', 'cscGOOD': 29, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 381298 --> BAD
{'run_number': 381298, 'class': 'Commissioning24', 'cscGOOD': 58, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 381304 --> BAD
{'run_number': 381304, 'class': 'Commissioning24', 'cscGOOD': 56, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 381309 --> GOOD
{'run_number': 381309, 'class': 'Collisions24', 'cscGOOD':

Run : 382063 --> BAD
{'run_number': 382063, 'class': 'Commissioning24', 'cscGOOD': 148, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 382064 --> BAD
{'run_number': 382064, 'class': 'Commissioning24', 'cscGOOD': 40, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 382070 --> BAD
{'run_number': 382070, 'class': 'Commissioning24', 'cscGOOD': 276, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 382074 --> BAD
{'run_number': 382074, 'class': 'Commissioning24', 'cscGOOD': 218, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 382075 --> BAD
{'run_number': 382075, 'class': 'Commissioning24', 'cscGOOD': 117, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 382081 --> BAD
{'run_number': 382081, 'class': 'Commissioning24', 'cscGOOD': 35, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 382082 --> BAD
{'run_number': 382082, 'class': 'Commissioning24', 'cscGOOD': 22, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 382083 --> BAD
{'run_number': 382083, 'class': 'Commissioning24', 

Run : 382504 --> GOOD
{'run_number': 382504, 'class': 'Collisions24', 'cscGOOD': 413, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 382562 --> BAD
{'run_number': 382562, 'class': 'Commissioning24', 'cscGOOD': 0, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 382568 --> GOOD
{'run_number': 382568, 'class': 'Collisions24', 'cscGOOD': 462, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 382580 --> GOOD
{'run_number': 382580, 'class': 'Collisions24', 'cscGOOD': 1758, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 382593 --> BAD
{'run_number': 382593, 'class': 'Commissioning24', 'cscGOOD': 15, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 382594 --> GOOD
{'run_number': 382594, 'class': 'Collisions24', 'cscGOOD': 1434, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 382595 --> GOOD
{'run_number': 382595, 'class': 'Collisions24', 'cscGOOD': 867, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 382617 --> GOOD
{'run_number': 382617, 'class': 'Collisions24', 'cscGOOD':

Run : 383247 --> GOOD
{'run_number': 383247, 'class': 'Collisions24', 'cscGOOD': 167, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 383254 --> GOOD
{'run_number': 383254, 'class': 'Collisions24', 'cscGOOD': 1265, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 383255 --> GOOD
{'run_number': 383255, 'class': 'Collisions24', 'cscGOOD': 958, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 383275 --> GOOD
{'run_number': 383275, 'class': 'Collisions24', 'cscGOOD': 216, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 383276 --> GOOD
{'run_number': 383276, 'class': 'Collisions24', 'cscGOOD': 305, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 383277 --> GOOD
{'run_number': 383277, 'class': 'Collisions24', 'cscGOOD': 706, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 383322 --> BAD
{'run_number': 383322, 'class': 'Commissioning24', 'cscGOOD': 0, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 383323 --> GOOD
{'run_number': 383323, 'class': 'Collisions24', 'cscGOOD': 1

Run : 383743 --> GOOD
{'run_number': 383743, 'class': 'Collisions24', 'cscGOOD': 57, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 383756 --> GOOD
{'run_number': 383756, 'class': 'Collisions24', 'cscGOOD': 1456, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 383757 --> GOOD
{'run_number': 383757, 'class': 'Collisions24', 'cscGOOD': 19, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 383758 --> GOOD
{'run_number': 383758, 'class': 'Collisions24', 'cscGOOD': 139, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 383767 --> GOOD
{'run_number': 383767, 'class': 'Collisions24', 'cscGOOD': 1619, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 383779 --> GOOD
{'run_number': 383779, 'class': 'Collisions24', 'cscGOOD': 255, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 383811 --> GOOD
{'run_number': 383811, 'class': 'Collisions24', 'cscGOOD': 79, 'cscSTANDBY': 0, 'cscBAD': 0, 'cscEMPTY': 0}
Run : 383812 --> GOOD
{'run_number': 383812, 'class': 'Collisions24', 'cscGOOD': 347

In [12]:
print(bad_runs)

[380255, 380325, 380328, 380329, 380357, 380428, 380442, 380443, 380480, 380620, 380723, 380725, 380731, 380736, 380808, 380809, 380812, 380817, 380818, 380842, 380843, 380848, 380854, 380895, 380924, 380925, 380956, 381012, 381017, 381094, 381102, 381105, 381166, 381277, 381289, 381290, 381291, 381292, 381294, 381298, 381304, 381338, 381341, 381351, 381398, 381458, 381464, 381465, 381484, 381542, 381593, 381946, 381984, 381987, 382003, 382004, 382010, 382013, 382014, 382018, 382019, 382020, 382021, 382022, 382023, 382024, 382025, 382026, 382027, 382030, 382036, 382037, 382040, 382046, 382047, 382054, 382063, 382064, 382070, 382074, 382075, 382081, 382082, 382083, 382086, 382088, 382089, 382090, 382091, 382114, 382134, 382159, 382160, 382161, 382165, 382169, 382171, 382180, 382190, 382191, 382192, 382197, 382200, 382201, 382202, 382204, 382216, 382224, 382298, 382381, 382382, 382383, 382384, 382392, 382393, 382434, 382470, 382562, 382593, 382638, 382720, 382749, 382945, 382973, 383322,

In [13]:
monitoring_elements = monitoring_elements[~monitoring_elements['run_number'].isin(bad_runs)]

In [14]:
run_list = np.sort(np.unique(monitoring_elements["run_number"].unique()))
print(run_list)

[380306 380307 380308 380309 380310 380346 380348 380349 380360 380377
 380384 380385 380399 380400 380401 380403 380444 380446 380447 380466
 380470 380481 380513 380516 380517 380531 380532 380533 380534 380535
 380537 380538 380564 380565 380567 380623 380624 380625 380626 380627
 380647 380648 380649 380711 380712 380844 380845 380846 380847 380860
 380878 380879 380883 380884 380932 380933 380934 380942 380943 380944
 380945 380946 380947 380963 381022 381053 381065 381067 381068 381069
 381070 381075 381078 381079 381080 381113 381114 381115 381147 381148
 381149 381150 381151 381152 381164 381189 381190 381191 381199 381212
 381286 381309 381358 381364 381365 381371 381379 381380 381384 381417
 381443 381477 381478 381479 381480 381499 381500 381515 381516 381543
 381544 381594 382209 382213 382225 382226 382227 382229 382250 382251
 382255 382256 382257 382258 382260 382262 382299 382300 382313 382314
 382328 382329 382343 382344 382504 382568 382580 382594 382595 382617
 38262

In [15]:
with open("config.yaml", 'r') as f:
    try:
        info = yaml.safe_load(f)
    except yaml.YAMLError as exc:
        print(f"Cannot read the file: {exc}")

In [16]:
omsapi = OMSAPI("https://cmsoms.cern.ch/agg/api", "v1", cert_verify=False)
omsapi.auth_oidc(info["APIClient"]["client_ID"], info["APIClient"]["Client_Secret"])

In [18]:
lumi_dfs = []
for r in run_list:
    print("Run :", r)
    df = []
    ls_query = omsapi.query("lumisections")
    ls_query.filter("run_number", r)
    ls_query.sort("lumisection_number", asc=False).paginate(page=1, per_page=100000)
    response = ls_query.data().json()["data"];
    for i in range(len(response)):
        df.append(response[i]["attributes"])
    lumi_dfs.append(pd.DataFrame(df)) # Downcasting object dtype arrays on .fillna is deprecated
    del df
    del response
    del ls_query

Run : 380306
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380306&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380307
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380307&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380308
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380308&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380309
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380309&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380310
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380310&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380346
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380346&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380348
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380348&sort=-lumisectio

Run : 380933
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380933&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380934
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380934&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380942
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380942&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380943
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380943&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380944
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380944&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380945
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380945&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 380946
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=380946&sort=-lumisectio

Run : 381544
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=381544&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 381594
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=381594&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 382209
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=382209&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 382213
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=382213&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 382225
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=382225&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 382226
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=382226&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 382227
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=382227&sort=-lumisectio

Run : 382830
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=382830&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 382834
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=382834&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 382856
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=382856&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 382878
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=382878&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 382913
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=382913&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 382921
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=382921&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 382922
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=382922&sort=-lumisectio

Run : 383512
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=383512&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 383514
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=383514&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 383536
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=383536&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 383537
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=383537&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 383538
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=383538&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 383539
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=383539&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 383540
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=383540&sort=-lumisectio

Run : 384030
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=384030&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 384031
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=384031&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 384032
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=384032&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 384033
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=384033&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 384034
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=384034&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 384035
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=384035&sort=-lumisection_number&page[offset]=0&page[limit]=100000
Run : 384036
https://cmsoms.cern.ch/agg/api/v1/lumisections?filter[run_number][EQ]=384036&sort=-lumisectio

In [19]:
for i in range(len(lumi_dfs)):
    lumi_dfs[i]['castor_ready'] = lumi_dfs[i]['castor_ready'].fillna(False)
    lumi_dfs[i]['gem_ready'] = lumi_dfs[i]['gem_ready'].fillna(False)
    lumi_dfs[i]['zdc_ready'] = lumi_dfs[i]['zdc_ready'].fillna(False)
    lumi_dfs[i]['prescale_index'] = lumi_dfs[i]['prescale_index'].fillna(-1)
    lumi_dfs[i]['prescale_name'] = lumi_dfs[i]['prescale_name'].fillna("")

In [20]:
lumi_info = pd.concat(lumi_dfs)

In [21]:
lumi_info = lumi_info.rename(columns={'lumisection_number': 'ls_number'})
lumi_info["mean_lumi"]=(lumi_info["init_lumi"]+lumi_info["end_lumi"])/2

In [22]:
print(lumi_info.columns)

Index(['beam1_present', 'bpix_ready', 'ho_ready', 'dtp_ready', 'tecm_ready',
       'delivered_lumi_per_lumisection', 'recorded_lumi_per_lumisection',
       'castor_ready', 'init_lumi', 'hbhea_ready', 'recorded_lumi',
       'prescale_name', 'dtm_ready', 'end_lumi', 'beams_stable', 'esm_ready',
       'gemm_ready', 'ebp_ready', 'cscm_ready', 'start_time', 'beam1_stable',
       'hbhec_ready', 'rp_time_ready', 'cscp_ready', 'physics_flag',
       'dt0_ready', 'gem_ready', 'ls_number', 'tibtid_ready', 'fpix_ready',
       'rpc_ready', 'rp_sect_56_ready', 'pileup', 'esp_ready', 'eep_ready',
       'ebm_ready', 'delivered_lumi', 'gemp_ready', 'eem_ready', 'fill_number',
       'beam_present', 'tecp_ready', 'end_time', 'hf_ready',
       'rp_sect_45_ready', 'cms_active', 'prescale_index', 'zdc_ready',
       'hbheb_ready', 'tob_ready', 'run_number', 'beam2_stable',
       'beam2_present', 'mean_lumi'],
      dtype='object')


In [23]:
import dask.dataframe as dd

# Convert the pandas DataFrame to a Dask DataFrame
lumi_info_dask = dd.from_pandas(lumi_info, npartitions=1)

# Merge the Dask DataFrames
monitoring_elements = monitoring_elements.merge(lumi_info_dask, on=['run_number', 'ls_number'], how='left')

#monitoring_elements = pd.merge(monitoring_elements, lumi_info, on=['run_number', 'ls_number'], how='left') 

In [24]:
monitoring_elements = monitoring_elements[(monitoring_elements["beams_stable"]==True) & (monitoring_elements["cscm_ready"]==True) & (monitoring_elements["cms_active"]==True) & (monitoring_elements["beam_present"]==True) & (monitoring_elements["physics_flag"]==True)]


In [25]:
# monitoring_elements = monitoring_elements.sort_values(by=['run_number', 'ls_number']).reset_index()
# monitoring_elements = monitoring_elements.drop(columns=["index"])

# Create the 'run_and_lumi' column to sort by
monitoring_elements['run_and_lumi'] = monitoring_elements['run_number'].astype(str) + monitoring_elements['ls_number'].astype(str)
monitoring_elements['run_and_lumi'] = monitoring_elements['run_and_lumi'].astype(int)

# Sort each partition by 'run_and_lumi'
monitoring_elements = monitoring_elements.map_partitions(lambda df: df.sort_values('run_and_lumi'))

# Compute the minimum 'run_and_lumi' value in each partition
# to concatenate them in the correct order
min_values = monitoring_elements.map_partitions(lambda df: df['run_and_lumi'].min()).compute()



In [26]:
# Order the partitions by the minimum 'run_and_lumi' value
ordered_partitions = [part for _, part in sorted(zip(min_values, monitoring_elements.to_delayed()))]

# Concatenate the ordered partitions
monitoring_elements = dd.from_delayed(ordered_partitions)

# Drop the 'run_and_lumi' column
monitoring_elements = monitoring_elements.drop('run_and_lumi', axis=1)

In [27]:
monitoring_elements

Unnamed: 0_level_0,dataset,me,dataset_id,file_id,run_number,ls_number,me_id,x_min,x_max,x_bin,y_min,y_max,y_bin,entries,data,beam1_present,bpix_ready,ho_ready,dtp_ready,tecm_ready,delivered_lumi_per_lumisection,recorded_lumi_per_lumisection,castor_ready,init_lumi,hbhea_ready,recorded_lumi,prescale_name,dtm_ready,end_lumi,beams_stable,esm_ready,gemm_ready,ebp_ready,cscm_ready,start_time,beam1_stable,hbhec_ready,rp_time_ready,cscp_ready,physics_flag,dt0_ready,gem_ready,tibtid_ready,fpix_ready,rpc_ready,rp_sect_56_ready,pileup,esp_ready,eep_ready,ebm_ready,delivered_lumi,gemp_ready,eem_ready,fill_number,beam_present,tecp_ready,end_time,hf_ready,rp_sect_45_ready,cms_active,prescale_index,zdc_ready,hbheb_ready,tob_ready,beam2_stable,beam2_present,mean_lumi
npartitions=65,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1
,object,object,int64,int64,int64,int64,int64,float64,float64,float64,float64,float64,float64,int64,object,bool,bool,bool,bool,bool,float64,float64,bool,float64,bool,object,object,bool,float64,bool,bool,bool,bool,bool,object,bool,bool,object,bool,bool,bool,bool,bool,bool,bool,object,float64,bool,bool,bool,object,bool,bool,int64,bool,bool,object,bool,object,bool,float64,bool,bool,bool,bool,bool,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [None]:
#For some reason some types in the table are not well understood and thus it is needed to manually
#specify the type of some columns, using pyarrow types

schema = {
    "data":pa.list_(pa.list_(pa.float64())),
    "recorded_lumi": pa.float64(),
    "prescale_name": pa.string(),
    "start_time": pa.string(),
    "rp_sect_56_ready": pa.bool_(),
    "rp_time_ready": pa.bool_(),
    "delivered_lumi": pa.float64(),
    "end_time": pa.string(),
    "rp_sect_45_ready": pa.bool_()
}

monitoring_elements.to_parquet(me+'_'+out_label+'_s0.parquet', schema=schema)