# Gather HCAL histos

In [1]:
import os

import numpy as np
import pandas as pd

# import cmsdials
from cmsdials import Dials
from cmsdials.auth.bearer import Credentials
from cmsdials.filters import (
    FileIndexFilters,
    LumisectionHistogram1DFilters,
    LumisectionHistogram2DFilters,
    LumisectionFilters,
    RunFilters,
    MEFilters
)

In [2]:
# Build the DIALS credentials object from the credentials file.
# The log captured for this cell shows that, when no credentials file is found,
# the library falls back to the interactive device authentication flow.
creds = Credentials.from_creds_file()

# DIALS client bound to the "hcal" workspace; every query in this notebook goes through it.
# Author's note: this will load into a DQM workspace by default (i.e. Tracker, HCAL, ECAL, etc.)
dials = Dials(creds,workspace="hcal")

[2025-07-15 13:21:38,460] INFO: Credentials file not found, triggering device authentication flow...
[2025-07-15 13:21:39,652] INFO: This device will expire in 600 seconds.
[2025-07-15 13:21:39,653] INFO: Go to the following url and authenticate: https://auth.cern.ch/auth/realms/cern/device?user_code=VZXZ-LLRF
[2025-07-15 13:21:39,653] INFO: Checking authorization status every 5 seconds...
[2025-07-15 13:21:47,373] INFO: Device not authorized yet.
[2025-07-15 13:21:53,237] INFO: Device not authorized yet.
[2025-07-15 13:21:58,963] INFO: Device not authorized yet.
[2025-07-15 13:22:04,739] INFO: Device not authorized yet.
[2025-07-15 13:22:10,447] INFO: Device not authorized yet.
[2025-07-15 13:22:16,183] INFO: Device not authorized yet.
[2025-07-15 13:22:21,891] INFO: Device not authorized yet.
[2025-07-15 13:22:27,625] INFO: Device not authorized yet.
[2025-07-15 13:22:34,202] INFO: Device not authorized yet.
[2025-07-15 13:22:40,163] INFO: Device authorized, authentication finished s

In [5]:
# Filter values shared by the DIALS queries in this notebook. The same dictionary is
# unpacked into MEFilters, RunFilters and LumisectionHistogram2DFilters in other cells.
# NOTE(review): presumably each filter class tolerates the keys it does not use
# (e.g. `me__regex` for runs) -- confirm against the cmsdials filter definitions.
kwargs = {
    "status": "FINISHED",
    "dataset__regex": "ZeroBias/Run2024[A-Z]-PromptReco/*",
    "dim": 2,
    "page_size": 500,
    "me__regex": "OccupancyCut",
}

In [7]:
# List the monitoring elements that match the shared filters, then display them.
me_filters = MEFilters(**kwargs)
allMEs = dials.mes.list(me_filters)
allMEs

[MonitoringElement(me_id=102, me='Hcal/DigiTask/OccupancyCut/depth/depth1', count=1174143, dim=2),
 MonitoringElement(me_id=103, me='Hcal/DigiTask/OccupancyCut/depth/depth2', count=1174143, dim=2),
 MonitoringElement(me_id=104, me='Hcal/DigiTask/OccupancyCut/depth/depth3', count=1174143, dim=2),
 MonitoringElement(me_id=105, me='Hcal/DigiTask/OccupancyCut/depth/depth4', count=1174143, dim=2),
 MonitoringElement(me_id=106, me='Hcal/DigiTask/OccupancyCut/depth/depth5', count=1174143, dim=2),
 MonitoringElement(me_id=107, me='Hcal/DigiTask/OccupancyCut/depth/depth6', count=1174143, dim=2),
 MonitoringElement(me_id=108, me='Hcal/DigiTask/OccupancyCut/depth/depth7', count=1174143, dim=2),
 MonitoringElement(me_id=109, me='Hcal/DigiTask/OccupancyCut/depth/depthHO', count=1174143, dim=2)]

In [8]:
# Fetch every run matching the shared filters and turn the result into a DataFrame.
run_filters = RunFilters(**kwargs)
RunsDF = dials.run.list_all(run_filters, enable_progress=True).to_pandas()

# Keep only the runs whose `ls_count` exceeds 1000.
# reset_index() (without drop=True) stores the previous index in an "index" column,
# which the golden-run selection cell removes again.
ls_mask = RunsDF["ls_count"].gt(1000)
long_runs = RunsDF[ls_mask].reset_index()

# The unfiltered frame is not needed any more.
del RunsDF

Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Now select only the Golden JSON runs that have more than 1000 lumisections (LS).

In [37]:
import json

import requests

# Download the 2024 Golden JSON certification file.
url = "https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions24/Cert_Collisions2024_378981_386951_Golden.json"
# The timeout keeps the cell from hanging forever if the server does not answer.
res = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of trying to decode an error page as JSON.
res.raise_for_status()

golden2024json = json.loads(res.content)
# Iterating the decoded object yields its top-level entries, converted to int here.
# NOTE(review): presumably a dict keyed by run number (as strings) -- confirm.
golden_numbers = [int(i) for i in golden2024json]

In [45]:
# Restrict the long runs to those whose run number appears in the Golden JSON list,
# and drop the "index" column left behind by the earlier reset_index().
is_golden = long_runs["run_number"].isin(golden_numbers)
df_golden = long_runs.loc[is_golden].drop(columns="index")
df_golden

Unnamed: 0,dataset_id,dataset,run_number,ls_count
0,14944573,/ZeroBias/Run2024B-PromptReco-v1/DQMIO,379154,1075
1,14949731,/ZeroBias/Run2024C-PromptReco-v1/DQMIO,379456,1208
2,14949731,/ZeroBias/Run2024C-PromptReco-v1/DQMIO,379660,1563
3,14949731,/ZeroBias/Run2024C-PromptReco-v1/DQMIO,379729,1534
4,14949731,/ZeroBias/Run2024C-PromptReco-v1/DQMIO,379765,1728
...,...,...,...,...
126,15099576,/ZeroBias/Run2024I-PromptReco-v2/DQMIO,386694,1336
127,15099576,/ZeroBias/Run2024I-PromptReco-v2/DQMIO,386704,1098
130,15099576,/ZeroBias/Run2024I-PromptReco-v2/DQMIO,386864,1496
131,15099576,/ZeroBias/Run2024I-PromptReco-v2/DQMIO,386885,1409


In [46]:
# Count the distinct values in each column of the golden-run selection.
df_golden.nunique()

dataset_id     10
dataset        10
run_number    105
ls_count      100
dtype: int64

# Now downloading all the data and making the files
Let's make Parquet files of the Golden JSON runs.

In [54]:
# Display the shared filter dictionary that the download loop unpacks into its query.
kwargs

{'status': 'FINISHED',
 'dataset__regex': 'ZeroBias/Run2024[A-Z]-PromptReco/*',
 'dim': 2,
 'page_size': 500,
 'me__regex': 'OccupancyCut'}

In [50]:
from urllib3 import Retry

In [102]:
# Run numbers present in `finished_runs` but absent from the golden selection.
# NOTE(review): `finished_runs` is assigned in a cell that appears further down the
# notebook (In [88]); on a fresh kernel that cell has to run before this one.
to_remove = set(finished_runs).difference(df_golden.run_number)
# File names for those runs. The `:_` format groups digits with "_",
# e.g. 380601 -> "run-380_601.parquet".
remove_files = [f'run-{i:_}.parquet' for i in to_remove]

In [103]:
from glob import glob

In [108]:
# Dry run: print an "rm" line for each file in the download directory whose name is
# in `remove_files`. Nothing is deleted here.
for fname in os.listdir('../files/Ls_ge_1k/'):
    if fname not in remove_files:
        continue
    print(f'rm {fname}')

rm run-380_601.parquet
rm run-382_511.parquet
rm run-381_208.parquet
rm run-383_903.parquet
rm run-381_698.parquet
rm run-382_435.parquet
rm run-380_603.parquet
rm run-380_644.parquet
rm run-381_968.parquet
rm run-381_594.parquet
rm run-380_848.parquet
rm run-383_067.parquet
rm run-381_793.parquet
rm run-380_705.parquet
rm run-380_614.parquet
rm run-380_895.parquet
rm run-385_054.parquet
rm run-381_778.parquet
rm run-382_120.parquet
rm run-379_956.parquet
rm run-381_900.parquet
rm run-380_847.parquet
rm run-382_330.parquet
rm run-383_615.parquet


In [88]:
# Turn each file name in `finished` back into an integer run number:
# strip the "run-" prefix and ".parquet" suffix, then remove the "_" digit separators.
# NOTE(review): `finished` is assigned in a cell that appears further down the notebook.
finished_runs = []
for fname in finished:
    digits = fname.removeprefix("run-").removesuffix(".parquet").replace("_", "")
    finished_runs.append(int(digits))

# Golden run numbers that already have a file on disk.
df_golden.loc[df_golden["run_number"].isin(finished_runs), "run_number"]

0      379154
1      379456
2      379660
3      379729
4      379765
        ...  
100    385152
101    385168
102    385194
103    385281
104    385286
Name: run_number, Length: 81, dtype: int64

In [75]:
# Run numbers parsed from the file names in `finished`: strip the "run-" prefix and
# ".parquet" suffix, drop the "_" separators, convert to int.
# NOTE(review): same expression as the one assigned to `finished_runs` in another cell.
[int(i.removeprefix("run-").removesuffix(".parquet").replace("_","")) for i in finished]

[383712,
 385142,
 385094,
 383449,
 385194,
 380601,
 385168,
 380531,
 383767,
 384413,
 381443,
 382511,
 381208,
 381115,
 381417,
 383903,
 383814,
 384202,
 383756,
 384644,
 384935,
 385286,
 381698,
 383323,
 382684,
 380074,
 379456,
 380385,
 379660,
 379765,
 382435,
 384052,
 385127,
 380603,
 380644,
 381968,
 381594,
 380848,
 383996,
 380470,
 381191,
 382258,
 384492,
 383067,
 383512,
 380310,
 379729,
 383368,
 381793,
 384565,
 383174,
 384981,
 381484,
 384069,
 384188,
 383155,
 380567,
 381516,
 380005,
 384128,
 379154,
 381384,
 380513,
 385281,
 380705,
 384468,
 382654,
 383854,
 383162,
 380614,
 382343,
 383631,
 381164,
 381380,
 382769,
 380895,
 382580,
 385054,
 379866,
 381544,
 381778,
 384383,
 384239,
 384963,
 383254,
 382300,
 382594,
 381480,
 382120,
 384614,
 379956,
 380360,
 382921,
 381190,
 381900,
 383487,
 384291,
 380115,
 382913,
 380847,
 382330,
 380446,
 385152,
 383468,
 383615]

In [109]:
# Directory that holds one Parquet file per downloaded run; created on first use.
base_path = '../files/Ls_ge_1k'
os.makedirs(base_path, exist_ok=True)

# Only names of the form "run-<...>.parquet" count as finished downloads. The stricter
# filter keeps stray files that merely contain "run" out of the list, which other
# cells parse back into integers.
finished = [
    file
    for file in os.listdir(base_path)
    if file.startswith("run-") and file.endswith(".parquet")
]

# Report the golden runs that already have a file on disk. The exact file name is
# looked up in a set, so a run is reported at most once and a run number can never
# match as a substring of another file name.
finished_names = set(finished)
for run in df_golden.run_number:
    if f"run-{run:_}.parquet" in finished_names:
        print(f"skipping {run:_}")

finished


skipping 379_154
skipping 379_456
skipping 379_660
skipping 379_729
skipping 379_765
skipping 379_866
skipping 380_005
skipping 380_074
skipping 380_115
skipping 380_310
skipping 380_360
skipping 380_385
skipping 380_446
skipping 380_470
skipping 380_513
skipping 380_531
skipping 380_567
skipping 381_115
skipping 381_164
skipping 381_190
skipping 381_191
skipping 381_380
skipping 381_384
skipping 381_417
skipping 381_443
skipping 381_480
skipping 381_484
skipping 381_516
skipping 381_544
skipping 382_258
skipping 382_300
skipping 382_343
skipping 382_580
skipping 382_594
skipping 382_654
skipping 382_684
skipping 382_769
skipping 382_913
skipping 382_921
skipping 383_155
skipping 383_162
skipping 383_174
skipping 383_254
skipping 383_323
skipping 383_368
skipping 383_449
skipping 383_468
skipping 383_487
skipping 383_512
skipping 383_631
skipping 383_712
skipping 383_756
skipping 383_767
skipping 383_814
skipping 383_854
skipping 383_996
skipping 384_052
skipping 384_069
skipping 384_1

['run-383_712.parquet',
 'run-385_142.parquet',
 'run-385_094.parquet',
 'run-383_449.parquet',
 'run-385_194.parquet',
 'run-385_168.parquet',
 'run-380_531.parquet',
 'run-383_767.parquet',
 'run-384_413.parquet',
 'run-381_443.parquet',
 'run-381_115.parquet',
 'run-381_417.parquet',
 'run-383_814.parquet',
 'run-384_202.parquet',
 'run-383_756.parquet',
 'run-384_644.parquet',
 'run-384_935.parquet',
 'run-385_286.parquet',
 'run-383_323.parquet',
 'run-382_684.parquet',
 'run-380_074.parquet',
 'run-379_456.parquet',
 'run-380_385.parquet',
 'run-379_660.parquet',
 'run-379_765.parquet',
 'run-384_052.parquet',
 'run-385_127.parquet',
 'run-383_996.parquet',
 'run-380_470.parquet',
 'run-381_191.parquet',
 'run-382_258.parquet',
 'run-384_492.parquet',
 'run-383_512.parquet',
 'run-380_310.parquet',
 'run-379_729.parquet',
 'run-383_368.parquet',
 'run-384_565.parquet',
 'run-383_174.parquet',
 'run-384_981.parquet',
 'run-381_484.parquet',
 'run-384_069.parquet',
 'run-384_188.pa

In [110]:
# Number of entries currently in `finished`.
len(finished)

81

In [None]:
# Download the 2D lumisection histograms for every golden run that is not on disk yet,
# writing one Parquet file per run into `base_path`.
for run in df_golden.run_number:
    # `:_` groups the digits with "_" (e.g. 385324 -> "run-385_324.parquet").
    target_path = f"{base_path}/run-{run:_}.parquet"

    # Check the exact target file on disk rather than substring-matching a listing
    # taken in an earlier cell, so re-running this cell after an interruption resumes
    # where it stopped and never re-downloads a completed run.
    if os.path.exists(target_path):
        print(f"skipping {run:_}")
        continue

    print(f"Starting download of {run:_}")
    irun_df = dials.h2d.list_all(
        LumisectionHistogram2DFilters(**kwargs, run_number=run),
        enable_progress=True,
        # Retry failed requests up to 3 times with a short backoff.
        retries=Retry(total=3, backoff_factor=0.1),
    ).to_pandas()

    irun_df.to_parquet(target_path)
    print(f"Done with {run = :,}.")
    # Release the frame before fetching the next run.
    del irun_df

skipping 379_154
skipping 379_456
skipping 379_660
skipping 379_729
skipping 379_765
skipping 379_866
skipping 380_005
skipping 380_074
skipping 380_115
skipping 380_310
skipping 380_360
skipping 380_385
skipping 380_446
skipping 380_470
skipping 380_513
skipping 380_531
skipping 380_567
skipping 381_115
skipping 381_164
skipping 381_190
skipping 381_191
skipping 381_380
skipping 381_384
skipping 381_417
skipping 381_443
skipping 381_480
skipping 381_484
skipping 381_516
skipping 381_544
skipping 382_258
skipping 382_300
skipping 382_343
skipping 382_580
skipping 382_594
skipping 382_654
skipping 382_684
skipping 382_769
skipping 382_913
skipping 382_921
skipping 383_155
skipping 383_162
skipping 383_174
skipping 383_254
skipping 383_323
skipping 383_368
skipping 383_449
skipping 383_468
skipping 383_487
skipping 383_512
skipping 383_631
skipping 383_712
skipping 383_756
skipping 383_767
skipping 383_814
skipping 383_854
skipping 383_996
skipping 384_052
skipping 384_069
skipping 384_1

Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 385,324.
Starting download of 385_390


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 385,390.
Starting download of 385_422


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 385,422.
Starting download of 385_443


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 385,443.
Starting download of 385_515


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 385,515.
Starting download of 385_604


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 385,604.
Starting download of 385_620


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 385,620.
Starting download of 385_728


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 385,728.
Starting download of 385_738


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 385,738.
Starting download of 385_764


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 385,764.
Starting download of 385_842


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 385,842.
Starting download of 385_889


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 385,889.
Starting download of 385_934


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 385,934.
Starting download of 385_986


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 385,986.
Starting download of 386_025


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 386,025.
Starting download of 386_509


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 386,509.
Starting download of 386_554


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 386,554.
Starting download of 386_604


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with run = 386,604.
Starting download of 386_640


Progress:   0%|          | 0/1 [00:00<?, ?it/s]

In [60]:
# Report how many entries the download directory contains.
# NOTE(review): every directory entry is counted, not only run-*.parquet files.
n_entries = len(os.listdir(base_path))
print(f"{base_path} has been populated with {n_entries} runs.")

../files/Ls_ge_1k has been populated with 105 runs.
