# Gather HCAL histos

In [None]:
import pandas as pd
import numpy as np

# import cmsdials
from cmsdials import Dials
from cmsdials.auth.bearer import Credentials
from cmsdials.filters import (
    FileIndexFilters,
    LumisectionHistogram1DFilters,
    LumisectionHistogram2DFilters,
    LumisectionFilters,
    RunFilters,
    MEFilters
)

In [None]:
# Build the credentials object through the client's creds-file helper.
creds = Credentials.from_creds_file()

# Open the DIALS client on the "hcal" workspace explicitly.
# NOTE(review): per the original author's note, a DQM workspace
# (Tracker, HCAL, ECAL, ...) is picked by default when none is given -- confirm.
dials = Dials(
    creds,
    workspace="hcal",
)

In [None]:
# Shared keyword filters; this dictionary is unpacked into MEFilters,
# RunFilters and LumisectionHistogram2DFilters further down.
kwargs = {
    "status": "FINISHED",
    "dataset__regex": "ZeroBias/Run2024[A-Z]-PromptReco/*",
    "dim": 2,
    "page_size": 500,
    "me__regex": "OccupancyCut",
}

In [None]:
# Build the filter object first, then list the monitoring elements that match
# the shared filters; the bare name on the last line displays the result.
me_filters = MEFilters(**kwargs)
allMEs = dials.mes.list(me_filters)
allMEs

In [None]:
# Fetch every run matching the shared filters and convert it to a DataFrame.
run_listing = dials.run.list_all(RunFilters(**kwargs), enable_progress=True)
RunsDF = run_listing.to_pandas()

# Keep only runs whose "ls_count" column exceeds 1000.
ls_mask = RunsDF["ls_count"] > 1000
# reset_index() keeps the old index as an "index" column (dropped later on).
long_runs = RunsDF[ls_mask].reset_index()

# The full table is no longer needed once the long runs are extracted.
del run_listing, RunsDF

Now select only the Golden JSON runs that have more than 1000 lumisections (LS).

In [None]:
import json

import requests

# Fetch the 2024 Golden JSON certification file.
url = "https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions24/Cert_Collisions2024_378981_386951_Golden.json"
# A timeout keeps the cell from hanging forever on an unresponsive server.
res = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
res.raise_for_status()

golden2024json = json.loads(res.content)
# Iterating the parsed object yields its keys, which are converted to int so
# they can be compared against the `run_number` column later on.
golden_numbers = [int(run_key) for run_key in golden2024json]

In [None]:
# Keep the long runs whose run number appears in the Golden JSON list, and
# drop the leftover "index" column created by the earlier reset_index().
is_golden = long_runs["run_number"].isin(golden_numbers)
df_golden = long_runs.loc[is_golden].drop(columns="index")
df_golden

In [None]:
# Show the number of distinct values in each column of df_golden.
df_golden.nunique()

# Now download all the data and write the files
Let's make Parquet files of the Golden JSON runs.

In [None]:
# Display the shared filter dictionary again before it is reused for the
# histogram download below.
kwargs

In [None]:
# `os` is not imported anywhere else in the notebook; without this line the
# cell raises NameError on a fresh kernel.
import os

# Output directory for the per-run Parquet files (created if missing).
base_path = '../files/Ls_ge_1k'
os.makedirs(base_path, exist_ok=True)

# Files already present from earlier sessions; anything with "run" in its name.
finished = [file for file in os.listdir(base_path) if "run" in file]

# Preview which selected runs already have a file on disk.
for run in df_golden.run_number:
    run_tag = f"{run:_}"  # underscore-grouped run number, as in the file names
    if any(run_tag in fin for fin in finished):
        print(f"skipping {run_tag}")

# File names follow the pattern "run-378_239.parquet".
finished


In [None]:
# Number of entries in base_path whose names contain "run".
len(finished)

In [None]:
from urllib3 import Retry

In [None]:
# Download the 2D lumisection histograms of every selected run and store them
# as one Parquet file per run, skipping runs that already have a file.
for run in df_golden.run_number:
    run_tag = f"{run:_}"  # underscore-grouped run number, e.g. 378_239

    # A run counts as done when any known file name contains its tag.
    if any(run_tag in fin for fin in finished):
        print(f"skipping {run_tag}")
        continue

    print(f"Starting download of {run_tag}")
    irun_df = dials.h2d.list_all(
        LumisectionHistogram2DFilters(**kwargs, run_number=run),
        enable_progress=True,
        # urllib3 retry policy handed to the client for failed requests.
        retries=Retry(total=3, backoff_factor=0.1),
    ).to_pandas()

    out_name = f"run-{run_tag}.parquet"
    irun_df.to_parquet(f"{base_path}/{out_name}")
    # Record the new file so that re-running this cell (e.g. after a failure
    # part-way through) does not download the same run again.
    finished.append(out_name)
    print(f"Done with {run = :,}.")

    # Free the per-run frame before fetching the next one.
    del irun_df

In [None]:
# Count only the per-run files (same "run" name filter used when building
# `finished`), so stray directory entries do not inflate the reported number.
n_runs = sum("run" in entry for entry in os.listdir(base_path))
print(f"{base_path} has been populated with {n_runs} runs.")