# Finding and Downloading Root files for specific runs

In [1]:
import pandas as pd
import numpy as np

# import cmsdials
from cmsdials import Dials
from cmsdials.auth.bearer import Credentials
from cmsdials.filters import (
    FileIndexFilters,
    LumisectionHistogram1DFilters,
    LumisectionHistogram2DFilters,
    LumisectionFilters,
    RunFilters,
    MEFilters
)

In [2]:
creds = Credentials.from_creds_file()

dials = Dials(creds,workspace="hcal") # this will load into a DQM workspace by default (i.e. Tracker,HCAL,ECAL,etc.)

HTTPError: 400 Client Error: Bad Request for url: https://cmsdials-api.web.cern.ch/api/v1/auth/refresh-token/

In [None]:
kwargs= dict(
    status="FINISHED",
    dataset__regex="ZeroBias/Run2024[F-Z]-PromptReco/*",
    dim = 2,
    page_size = 500,
    me__regex="OccupancyCut"
            )

In [None]:
runs = dials.run.list_all(RunFilters(**kwargs),enable_progress=True).to_pandas()
lscount_mask = runs.ls_count > 1000
runs[lscount_mask]

In [None]:
# runs.to_csv("../files/TEMPall2024runs.csv",
#             index=False)

In [None]:
runs.info()

In [None]:
# range of run numbers in 2024
runs.run_number.iloc[0],runs.run_number.iloc[-1] 

In [None]:
runs[lscount_mask].nunique()

In [None]:
# run_numbers = runs[lscount_mask].run_number
files = dials.file_index.list_all(FileIndexFilters(**kwargs),enable_progress=True).to_pandas()

files

In [None]:
files.info()

In [None]:
files.to_csv("../files/Eras_F-J_files2024.csv",index=False)

In [None]:
! ls -alhtr ../files

# Now getting the list of files for a long run

In [None]:
runs[lscount_mask]

In [None]:
me = "Hcal/DigiTask/OccupancyCut/depth/depth1"
h2d_df = dials.h2d.list_all(LumisectionHistogram2DFilters(**kwargs,run_number=runs[lscount_mask].run_number.iloc[0],me=me),
                            enable_progress=True).to_pandas()
# h2d_df = dials.h2d.list(LumisectionHistogram2DFilters(**kwargs,run_number=run_numbers.iloc[0],me__regex="Hcal/DigiTask/OccupancyCut/*"),
#                        ).to_pandas()
h2d_df

# Putting it all together.

 We can use the info given by the h2d endpoint to give us the file.
 
 The run endpoint will give us the run numbers

In [None]:
files

# Finally getting the list of files for the long runs 

Each run *may* have many files attached to it so they are orgainized in their own csv file. I will later merge them all into one big pandas dataframe.

In [3]:
from glob import glob
downloaded_runs = [int(i.split("/")[-1].removesuffix(".csv")) for i in glob("../files/runcsvs/*")]
downloaded_runs


[383067,
 382511,
 384069,
 382913,
 384614,
 382435,
 383756,
 384565,
 385194,
 383162,
 383512,
 384413,
 384935,
 384981,
 383449,
 384963,
 382258,
 385142,
 383487,
 384644,
 384128,
 384188,
 385127,
 383615,
 384202,
 382769,
 384291,
 383468,
 383996,
 383174,
 382684,
 384239,
 383712,
 382654,
 385168,
 382330,
 382921,
 383854,
 384383,
 383368,
 382594,
 382120,
 384492,
 383767,
 383631,
 385054,
 382580,
 383814,
 383254,
 382343,
 382300,
 385094,
 384468,
 381968,
 383323,
 385152,
 383903,
 383155,
 384052]

In [None]:
# df_list = []
for run_number in runs[lscount_mask].run_number:
    
    if run_number in downloaded_runs:
        print(f"{run_number=} in the folder")
    else:
        print(f"Fetching files for {run_number=}")
        h2d_df = dials.h2d.list_all(LumisectionHistogram2DFilters(**kwargs,run_number=run_number,me=me),
                                    enable_progress=True).to_pandas()
        # df_list.append(h2d_df)
        files_list = []
        
        for i in h2d_df.file_id.unique():
            if i in files.file_id.unique():
                files_list.append(files[files.loc[:,"file_id"] == i])
        temp = pd.concat(files_list)
        temp.insert(0,"run_number",value = run_number)
        # temp.loc[:,"logical_file_name"] = temp.logical_file_name.apply(lambda x: redir+x)
        temp.to_csv(f"../files/runcsvs/{run_number}.csv",index=False)
        print(f"Finished {run_number=}")
        del temp
        del h2d_df




# Now lets concatenate all the files into 1 Dataframe

In [4]:
from glob import glob

In [48]:
df_list = [pd.read_csv(i) for i in glob("../files/runcsvs/*.csv") ]
df = pd.concat(df_list)
# df.index = df.index.rename("run_number_part")
# df = df.reset_index(drop=True)
df
# df = df.rename(columns={"Unnamed: 0": "old_index"})

Unnamed: 0,run_number,dataset_id,dataset,file_id,file_size,creation_date,last_modification_date,logical_file_name,status,err_trace
0,383067,14986300,/ZeroBias/Run2024F-PromptReco-v1/DQMIO,16870151637,981512899,2024-07-14 10:36:50+00:00,2024-07-14 10:36:50+00:00,/store/data/Run2024F/ZeroBias/DQMIO/PromptReco...,FINISHED,
1,383067,14986300,/ZeroBias/Run2024F-PromptReco-v1/DQMIO,16871126997,353625170,2024-07-14 13:20:59+00:00,2024-07-14 13:20:59+00:00,/store/data/Run2024F/ZeroBias/DQMIO/PromptReco...,FINISHED,
0,382511,14986300,/ZeroBias/Run2024F-PromptReco-v1/DQMIO,16339152997,939048235,2024-07-01 00:20:16+00:00,2024-07-01 00:20:16+00:00,/store/data/Run2024F/ZeroBias/DQMIO/PromptReco...,FINISHED,
0,384069,15042470,/ZeroBias/Run2024G-PromptReco-v1/DQMIO,17237452437,621348907,2024-08-09 22:42:38+00:00,2024-08-09 22:42:38+00:00,/store/data/Run2024G/ZeroBias/DQMIO/PromptReco...,FINISHED,
1,384069,15042470,/ZeroBias/Run2024G-PromptReco-v1/DQMIO,17237452677,326000139,2024-08-09 22:42:38+00:00,2024-08-09 22:42:38+00:00,/store/data/Run2024G/ZeroBias/DQMIO/PromptReco...,FINISHED,
...,...,...,...,...,...,...,...,...,...,...
9,384052,15042470,/ZeroBias/Run2024G-PromptReco-v1/DQMIO,17211568437,645990728,2024-08-08 19:12:53+00:00,2024-08-08 19:12:53+00:00,/store/data/Run2024G/ZeroBias/DQMIO/PromptReco...,FINISHED,
10,384052,15042470,/ZeroBias/Run2024G-PromptReco-v1/DQMIO,17211568597,508352855,2024-08-08 19:12:53+00:00,2024-08-08 19:12:53+00:00,/store/data/Run2024G/ZeroBias/DQMIO/PromptReco...,FINISHED,
11,384052,15042470,/ZeroBias/Run2024G-PromptReco-v1/DQMIO,17211568397,602417876,2024-08-08 19:12:53+00:00,2024-08-08 19:12:53+00:00,/store/data/Run2024G/ZeroBias/DQMIO/PromptReco...,FINISHED,
12,384052,15042470,/ZeroBias/Run2024G-PromptReco-v1/DQMIO,17211568557,638555974,2024-08-08 19:12:53+00:00,2024-08-08 19:12:53+00:00,/store/data/Run2024G/ZeroBias/DQMIO/PromptReco...,FINISHED,


In [46]:
df.nunique()

run_number                 59
dataset_id                  2
dataset                     2
file_id                   714
file_size                 714
creation_date             107
last_modification_date    107
logical_file_name         714
status                      1
err_trace                   0
dtype: int64

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 714 entries, 0 to 713
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   run_number              714 non-null    int64  
 1   dataset_id              714 non-null    int64  
 2   dataset                 714 non-null    object 
 3   file_id                 714 non-null    int64  
 4   file_size               714 non-null    int64  
 5   creation_date           714 non-null    object 
 6   last_modification_date  714 non-null    object 
 7   logical_file_name       714 non-null    object 
 8   status                  714 non-null    object 
 9   err_trace               0 non-null      float64
dtypes: float64(1), int64(4), object(5)
memory usage: 55.9+ KB


In [None]:
# df.to_csv("../files/Eras_F-J_runs2024.csv",index=False)