In [3]:
import glob, os, sys, argparse
import numpy as np
import pandas as pd
import subprocess
import datetime

# Idea of the workflow


The idea is that the user first generates a list of datasets that match a query for DQMIO files, and then follows these steps:
1. Make a file that contains the releases for each dataset in the list. You can use any das query that deposits a list of the datasets and (optionally) you can use the `get_datasets.py` to add the releases used by each dataset.
2. User then manually selects which datasets to keep
3. User runs the `list_dasfiles.py` script indicating the location of the `datasets.txt` file and the name of the output file (say `list_files.txt`). 
4. Additionally you can make a dictionary of the run numbers that are in the files listed with `get_runnb.py`.
5. User then runs the `copy_dasfiles.py` script to use `xrdcp` commands to copy the files over to the desired area.

# Das queries listing datasets

In [30]:
# Build (but do not execute) the shell one-liner that writes every candidate
# ZeroBias DQMIO dataset for 2022 and 2023 into all_datasets.txt.
# 2022 query: names containing "Prompt" or "pilot" are dropped with grep -v.
cmd1 = 'dasgoclient -query="dataset=/ZeroBias/Run2022*/DQMIO" | grep -v "Prompt" | grep -v "pilot"'
# 2023 query: kept unfiltered.
cmd2 = 'dasgoclient -query="dataset=/ZeroBias/Run2023*/DQMIO"'

# The first command truncates the output file (>), the second appends to it (>>).
CMD = ' ; '.join([cmd1 + ' > all_datasets.txt', cmd2 + ' >> all_datasets.txt'])
print(CMD)


dasgoclient -query="dataset=/ZeroBias/Run2022*/DQMIO" | grep -v "Prompt" | grep -v "pilot" > all_datasets.txt ; dasgoclient -query="dataset=/ZeroBias/Run2023*/DQMIO" >> all_datasets.txt


The above command will return a list of datasets of interest; we can store this list and update it whenever we want.

```shell
cat all_datasets.txt

/ZeroBias/Run2022A-10Dec2022-v1/DQMIO
/ZeroBias/Run2022A-19Jan2023-v2/DQMIO
/ZeroBias/Run2022A-23Feb2023-v1/DQMIO
/ZeroBias/Run2022B-10Dec2022-v1/DQMIO
/ZeroBias/Run2022B-19Jan2023-v2/DQMIO
/ZeroBias/Run2022B-23Feb2023-v1/DQMIO
/ZeroBias/Run2022C-02Nov22-v1/DQMIO
/ZeroBias/Run2022C-10Dec2022-v1/DQMIO
/ZeroBias/Run2022C-23Feb2023-v1/DQMIO
/ZeroBias/Run2022D-10Dec2022-v1/DQMIO
/ZeroBias/Run2022D-16Jun2023-v1/DQMIO
/ZeroBias/Run2022D-19Jan2023-v2/DQMIO
/ZeroBias/Run2022D-23Feb2023-v1/DQMIO
/ZeroBias/Run2022E-10Dec2022-v2/DQMIO
/ZeroBias/Run2022E-19Jan2023-v2/DQMIO
/ZeroBias/Run2022E-23Feb2023-v1/DQMIO
/ZeroBias/Run2022F-19Jan2023-v2/DQMIO
/ZeroBias/Run2022F-23Feb2023-v1/DQMIO
/ZeroBias/Run2022G-19Jan2023-v2/DQMIO
/ZeroBias/Run2022G-23Feb2023-v1/DQMIO
/ZeroBias/Run2023A-PromptReco-v1/DQMIO
/ZeroBias/Run2023A-PromptReco-v2/DQMIO
/ZeroBias/Run2023B-PromptReco-v1/DQMIO
/ZeroBias/Run2023C-PromptReco-v1/DQMIO
/ZeroBias/Run2023C-PromptReco-v2/DQMIO
/ZeroBias/Run2023C-PromptReco-v3/DQMIO
/ZeroBias/Run2023C-PromptReco-v4/DQMIO
/ZeroBias/Run2023D-PromptReco-v1/DQMIO
```

At this stage the user must take a look at the `all_datasets.txt` file and delete the lines that are not of interest. Below we show an example of the subset of datasets that we keep.

In [3]:
# Load the hand-curated dataset list: one dataset path per line.
# The context manager closes the handle even if reading fails, and blank
# lines are skipped so they cannot turn into empty dataset names downstream.
with open("../testing/all_datasets.txt") as datasets_file:
    datasets = [line.strip() for line in datasets_file if line.strip()]
datasets

['/ZeroBias/Run2022A-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2022B-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2022D-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2022E-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2022F-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2022G-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2023A-PromptReco-v1/DQMIO',
 '/ZeroBias/Run2023A-PromptReco-v2/DQMIO',
 '/ZeroBias/Run2023B-PromptReco-v1/DQMIO',
 '/ZeroBias/Run2023C-PromptReco-v1/DQMIO',
 '/ZeroBias/Run2023C-PromptReco-v2/DQMIO',
 '/ZeroBias/Run2023C-PromptReco-v3/DQMIO',
 '/ZeroBias/Run2023C-PromptReco-v4/DQMIO',
 '/ZeroBias/Run2023D-PromptReco-v1/DQMIO',
 '/ZeroBias/Run2023D-PromptReco-v2/DQMIO',
 '/ZeroBias/Run2023E-PromptReco-v1/DQMIO',
 '/ZeroBias/Run2023F-PromptReco-v1/DQMIO']

Making a script that will add the CMSSW release to each dataset

In [5]:
# Build a shell script with one line per dataset. Each line echoes
# "<dataset> - <output of the das release query>" into dataset_rel.txt.
script = []
for i, dataset in enumerate(datasets):
    # The first command truncates dataset_rel.txt (>); later ones append (>>).
    redirect = '>' if i == 0 else '>>'
    cmd = f'echo {dataset} - $(dasgoclient -query="release dataset={dataset}") {redirect} dataset_rel.txt'
    print(cmd)
    script.append(cmd + '\n')

# The context manager guarantees the script is flushed and closed even if
# writing raises.
with open('make_dataset_with_release.sh', 'w') as script_file:
    script_file.writelines(script)

echo /ZeroBias/Run2022A-27Jun2023-v2/DQMIO - $(dasgoclient -query="release dataset=/ZeroBias/Run2022A-27Jun2023-v2/DQMIO") > datset_rel.txt
echo /ZeroBias/Run2022B-27Jun2023-v1/DQMIO - $(dasgoclient -query="release dataset=/ZeroBias/Run2022B-27Jun2023-v1/DQMIO") >> datset_rel.txt
echo /ZeroBias/Run2022C-27Jun2023-v1/DQMIO - $(dasgoclient -query="release dataset=/ZeroBias/Run2022C-27Jun2023-v1/DQMIO") >> datset_rel.txt
echo /ZeroBias/Run2022D-27Jun2023-v1/DQMIO - $(dasgoclient -query="release dataset=/ZeroBias/Run2022D-27Jun2023-v1/DQMIO") >> datset_rel.txt
echo /ZeroBias/Run2022E-27Jun2023-v1/DQMIO - $(dasgoclient -query="release dataset=/ZeroBias/Run2022E-27Jun2023-v1/DQMIO") >> datset_rel.txt
echo /ZeroBias/Run2022F-23Feb2023-v1/DQMIO - $(dasgoclient -query="release dataset=/ZeroBias/Run2022F-23Feb2023-v1/DQMIO") >> datset_rel.txt
echo /ZeroBias/Run2022G-23Feb2023-v1/DQMIO - $(dasgoclient -query="release dataset=/ZeroBias/Run2022G-23Feb2023-v1/DQMIO") >> datset_rel.txt
echo /ZeroBias

## Making a script that chooses only the dataset name from a file.

In [7]:
# Read the "<dataset> - <release(s)>" records, one per line, without the
# trailing newlines. The context manager closes the file even on error.
with open("../testing/dataset_rel.txt") as rel_file:
    text = rel_file.read().splitlines()
print(text)

['/ZeroBias/Run2022A-19Jan2023-v2/DQMIO - CMSSW_12_4_12', '/ZeroBias/Run2022B-19Jan2023-v2/DQMIO - CMSSW_12_4_12', '/ZeroBias/Run2022D-19Jan2023-v2/DQMIO - CMSSW_12_4_12', '/ZeroBias/Run2022E-19Jan2023-v2/DQMIO - CMSSW_12_4_12', '/ZeroBias/Run2022F-19Jan2023-v2/DQMIO - CMSSW_12_4_12', '/ZeroBias/Run2022G-19Jan2023-v2/DQMIO - CMSSW_12_4_12', '/ZeroBias/Run2023A-PromptReco-v1/DQMIO - CMSSW_13_0_0', '/ZeroBias/Run2023A-PromptReco-v2/DQMIO - CMSSW_13_0_3', '/ZeroBias/Run2023B-PromptReco-v1/DQMIO - CMSSW_13_0_3 CMSSW_13_0_5_patch1', '/ZeroBias/Run2023C-PromptReco-v1/DQMIO - CMSSW_13_0_5_patch2', '/ZeroBias/Run2023C-PromptReco-v2/DQMIO - CMSSW_13_0_6', '/ZeroBias/Run2023C-PromptReco-v3/DQMIO - CMSSW_13_0_6', '/ZeroBias/Run2023C-PromptReco-v4/DQMIO - CMSSW_13_0_6 CMSSW_13_0_7 CMSSW_13_0_7_TOTEM', '/ZeroBias/Run2023D-PromptReco-v1/DQMIO - CMSSW_13_0_9', '/ZeroBias/Run2023D-PromptReco-v2/DQMIO - CMSSW_13_0_10', '/ZeroBias/Run2023E-PromptReco-v1/DQMIO - CMSSW_13_2_2', '/ZeroBias/Run2023F-PromptR

In [8]:
# The dataset name is the first whitespace-separated token of a record.
text[0].split()[0]

'/ZeroBias/Run2022A-19Jan2023-v2/DQMIO'

In [11]:
# Keep only the dataset name (first token) of every record.
[record.split()[0] for record in text]

['/ZeroBias/Run2022A-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2022B-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2022D-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2022E-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2022F-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2022G-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2023A-PromptReco-v1/DQMIO',
 '/ZeroBias/Run2023A-PromptReco-v2/DQMIO',
 '/ZeroBias/Run2023B-PromptReco-v1/DQMIO',
 '/ZeroBias/Run2023C-PromptReco-v1/DQMIO',
 '/ZeroBias/Run2023C-PromptReco-v2/DQMIO',
 '/ZeroBias/Run2023C-PromptReco-v3/DQMIO',
 '/ZeroBias/Run2023C-PromptReco-v4/DQMIO',
 '/ZeroBias/Run2023D-PromptReco-v1/DQMIO',
 '/ZeroBias/Run2023D-PromptReco-v2/DQMIO',
 '/ZeroBias/Run2023E-PromptReco-v1/DQMIO',
 '/ZeroBias/Run2023F-PromptReco-v1/DQMIO']

In [28]:
# Testing deletion of intermediate script
# The directory is listed before and after so the removal is visible in the output.
!ls
# os.remove raises FileNotFoundError if the script is already gone (e.g. when this cell is re-run).
os.remove('make_dataset_with_release.sh')
!ls

api testing.ipynb	      README.md
cronoutput		      scripts
make_dataset_with_release.sh  testing
open_files.ipynb	      test_list_and_copy_filesfromdas.ipynb
__pycache__
api testing.ipynb  __pycache__	testing
cronoutput	   README.md	test_list_and_copy_filesfromdas.ipynb
open_files.ipynb   scripts


# Listing files from datasets

In [13]:
# Re-display the selected datasets that the file-listing loop below iterates over.
datasets

['/ZeroBias/Run2022A-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2022B-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2022D-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2022E-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2022F-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2022G-19Jan2023-v2/DQMIO',
 '/ZeroBias/Run2023A-PromptReco-v1/DQMIO',
 '/ZeroBias/Run2023A-PromptReco-v2/DQMIO',
 '/ZeroBias/Run2023B-PromptReco-v1/DQMIO',
 '/ZeroBias/Run2023C-PromptReco-v1/DQMIO',
 '/ZeroBias/Run2023C-PromptReco-v2/DQMIO',
 '/ZeroBias/Run2023C-PromptReco-v3/DQMIO',
 '/ZeroBias/Run2023C-PromptReco-v4/DQMIO',
 '/ZeroBias/Run2023D-PromptReco-v1/DQMIO',
 '/ZeroBias/Run2023D-PromptReco-v2/DQMIO',
 '/ZeroBias/Run2023E-PromptReco-v1/DQMIO',
 '/ZeroBias/Run2023F-PromptReco-v1/DQMIO']

In [15]:
# IMPORTANT: these commands do not run from inside the SWAN environment.
# Run them from a terminal in the working directory instead.

for i, dataset in enumerate(datasets):
    # Truncate listfiles.txt for the first dataset, append for all the others.
    cmd = (
        f'dasgoclient -query="file dataset={dataset}" > listfiles.txt'
        if i == 0
        else f'dasgoclient -query="file dataset={dataset}" >> listfiles.txt'
    )
    # Dry run: the command is only printed. To execute it from Python use
    # subprocess.run(cmd, shell=True, check=False).
    print(cmd)

dasgoclient -query="file dataset=/ZeroBias/Run2022A-19Jan2023-v2/DQMIO" > listfiles.txt
dasgoclient -query="file dataset=/ZeroBias/Run2022B-19Jan2023-v2/DQMIO" >> listfiles.txt
dasgoclient -query="file dataset=/ZeroBias/Run2022D-19Jan2023-v2/DQMIO" >> listfiles.txt
dasgoclient -query="file dataset=/ZeroBias/Run2022E-19Jan2023-v2/DQMIO" >> listfiles.txt
dasgoclient -query="file dataset=/ZeroBias/Run2022F-19Jan2023-v2/DQMIO" >> listfiles.txt
dasgoclient -query="file dataset=/ZeroBias/Run2022G-19Jan2023-v2/DQMIO" >> listfiles.txt
dasgoclient -query="file dataset=/ZeroBias/Run2023A-PromptReco-v1/DQMIO" >> listfiles.txt
dasgoclient -query="file dataset=/ZeroBias/Run2023A-PromptReco-v2/DQMIO" >> listfiles.txt
dasgoclient -query="file dataset=/ZeroBias/Run2023B-PromptReco-v1/DQMIO" >> listfiles.txt
dasgoclient -query="file dataset=/ZeroBias/Run2023C-PromptReco-v1/DQMIO" >> listfiles.txt
dasgoclient -query="file dataset=/ZeroBias/Run2023C-PromptReco-v2/DQMIO" >> listfiles.txt
dasgoclient -quer

## Identifying run numbers in queried files

In [56]:
# Read the list of DAS file paths, one per line, without trailing newlines.
# The context manager closes the file even if reading fails.
# NOTE(review): other cells read from "../testing/"; confirm this relative path.
with open("testing/listfiles.txt") as list_file:
    lines = list_file.read().splitlines()

In [57]:
# Path of the dasgoclient executable, used as the command prefix when building the run-number queries.
dasgoclient = "/cvmfs/cms.cern.ch/common/dasgoclient"

In [None]:
# subprocess.run(dasgoclient + f'-query="run file={lines[1]}"',shell=True)

In [58]:
def get_runnb_query(path):
    """Build the dasgoclient command that asks for the run(s) of one file.

    Parameters
    ----------
    path : str
        File path inserted into the ``run file=`` query.

    Returns
    -------
    str
        The command line as text; nothing is executed here.
    """
    return f'{dasgoclient} -query="run file={path}"'


# One query command per listed file.
list(map(get_runnb_query, lines))

['/cvmfs/cms.cern.ch/common/dasgoclient -query="run file=/store/data/Run2022A/ZeroBias/DQMIO/27Jun2023-v2/60000/10F4724E-FAEE-42FA-B883-D52C960930D8.root"',
 '/cvmfs/cms.cern.ch/common/dasgoclient -query="run file=/store/data/Run2022A/ZeroBias/DQMIO/27Jun2023-v2/60000/16A6C958-77BB-4AF0-A529-21C153023EAE.root"',
 '/cvmfs/cms.cern.ch/common/dasgoclient -query="run file=/store/data/Run2022A/ZeroBias/DQMIO/27Jun2023-v2/60000/E74B3B9A-4F74-4E2E-9F52-2C39EE9E69AD.root"',
 '/cvmfs/cms.cern.ch/common/dasgoclient -query="run file=/store/data/Run2022A/ZeroBias/DQMIO/27Jun2023-v2/60000/85AC0438-F303-4D99-ADA1-C4052FAB5EAB.root"',
 '/cvmfs/cms.cern.ch/common/dasgoclient -query="run file=/store/data/Run2022A/ZeroBias/DQMIO/27Jun2023-v2/60000/E86DC5CA-A865-4D12-A824-C7A17966DC10.root"',
 '/cvmfs/cms.cern.ch/common/dasgoclient -query="run file=/store/data/Run2022A/ZeroBias/DQMIO/27Jun2023-v2/60000/6E72C627-A1C5-4A2E-9A3D-9A0B61350B53.root"',
 '/cvmfs/cms.cern.ch/common/dasgoclient -query="run file=/

## Making a diff between current and new list of runs to copy

In [7]:
ls -lhtr ../testing

total 3.2M
-rw-r--r--. 1 gfidalgo 1399  657 Oct  3 22:58 all_datasets.txt
-rw-r--r--. 1 gfidalgo 1399  978 Oct  3 23:01 dataset_rel.txt
-rw-r--r--. 1 gfidalgo 1399 185K Oct  3 23:50 file_info.txt
-rw-r--r--. 1 gfidalgo 1399 165K Oct 12 16:45 all_files.txt
-rw-r--r--. 1 gfidalgo 1399  147 Oct 31 23:52 HI_datasets.txt
-rw-r--r--. 1 gfidalgo 1399 1.4M Oct 31 23:54 HI_files.txt
drwxr-xr-x. 2 gfidalgo 1399 4.0K Oct 31 23:57 downloads/
-rw-r--r--. 1 gfidalgo 1399    0 Dec 12 18:36 list_dasfilesTue_12_Dec_2023_18_36.log
-rw-r--r--. 1 gfidalgo 1399 1.5M Dec 12 18:36 HI_files_new.txt


In [20]:
# Load the previously stored file list (file1) and the freshly queried one
# (file2) as lists of lines, newlines included. The context managers close
# both handles, which the inline open(...).readlines() calls never did.
with open("../testing/HI_files.txt") as old_list:
    file1 = old_list.readlines()
with open("../testing/HI_files_new.txt") as new_list:
    file2 = new_list.readlines()


In [26]:
# Number of lines vs. number of distinct lines in file1; equal values mean there are no duplicates.
len(file1),len(set(file1))

(11282, 11282)

In [27]:
# Number of lines vs. number of distinct lines in file2; equal values mean there are no duplicates.
len(file2),len(set(file2))

(11683, 11683)

In [36]:
# Exploratory: list the attributes of a set (e.g. difference, intersection) available for comparing the two lists.
dir(set(file1))

['__and__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__iand__',
 '__init__',
 '__init_subclass__',
 '__ior__',
 '__isub__',
 '__iter__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__or__',
 '__rand__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__ror__',
 '__rsub__',
 '__rxor__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__xor__',
 'add',
 'clear',
 'copy',
 'difference',
 'difference_update',
 'discard',
 'intersection',
 'intersection_update',
 'isdisjoint',
 'issubset',
 'issuperset',
 'pop',
 'remove',
 'symmetric_difference',
 'symmetric_difference_update',
 'union',
 'update']

In [41]:
# Set of the distinct lines of file1; presumably meant for set operations against the new list.
file1set= set(file1)


In [42]:
file1set.intersection?

[0;31mDocstring:[0m
Return the intersection of two sets as a new set.

(i.e. all elements that are in both sets.)
[0;31mType:[0m      builtin_function_or_method


In [None]:
def diff_files(file1,file2):
    """Return the entries of ``file2`` that are not present in ``file1``.

    Parameters
    ----------
    file1 : iterable of str
        Entries already known, e.g. the lines of the current file list.
    file2 : iterable of str
        Candidate entries, e.g. the lines of the newly queried file list.

    Returns
    -------
    list of str
        Entries of ``file2`` missing from ``file1``, in ``file2`` order.
        Entries are compared exactly, so both inputs should use the same
        newline convention.
    """
    # A set makes each membership check constant time on long lists.
    known = set(file1)
    diff = [entry for entry in file2 if entry not in known]
    return diff

# Copy area


In [16]:
def file_downloaded(filename,directory='./'):
    """Tell whether ``filename`` already exists inside ``directory``.

    Parameters
    ----------
    filename : str
        Name of the file to look for. It is always taken relative to
        ``directory``; leading slashes are ignored.
    directory : str, optional
        Directory to look in. Defaults to the current directory; an empty
        string also means the current directory.

    Returns
    -------
    bool
        True if the path exists, False otherwise.
    """
    # Plain string concatenation turned an empty directory into "/" + filename,
    # i.e. the filesystem root; fall back to the current directory instead.
    if not directory:
        directory = os.curdir
    return os.path.exists(os.path.join(directory, filename.lstrip('/')))

We need to read the file list line by line and copy each file.
Before copying, we need to check whether the file has already been copied to the area `/eos/project/m/mlplayground`.

In [91]:
outputdir = 'dasfiles'

# Create the destination on first use; otherwise just report that it is reused.
if not os.path.exists(outputdir):
    os.makedirs(outputdir)
else:
    print(f'Output directory {outputdir} already exists\nDownloading to {outputdir}')

TESTING = True

# In testing mode only the first `nfiles` entries are processed; None means all.
if not TESTING:
    nfiles = None
else:
    nfiles = int(input('Input number of files to copy: \t'))
    print(f"Downloading only subset of {nfiles} files into {outputdir}")

REDIR = 'root://cms-xrd-global.cern.ch/'
for file in files[:nfiles]:
    # Copy command for this file; it is only built here, never executed.
    cmd = f'xrdcp {REDIR}{file} {outputdir}'
    # Local name: the file path with surrounding slashes removed and the
    # remaining ones flattened to underscores.
    fname = file.strip('/').replace('/', '_')
    if not file_downloaded(fname, outputdir):
        print(f'Downloading {fname}')
    else:
        print(f'{fname} already present. Moving on the the next!!!')
              
                                                       

Output directory dasfiles already exists
Downloading to dasfiles
Input number of files to copy: 	42
Downloading only subset of 42 files into dasfiles
store_data_Run2022A_ZeroBias_DQMIO_19Jan2023-v2_30000_DD2CC4FA-EF58-49AF-9BFB-F0ABE41BC4F4.root already present. Moving on the the next!!!
Downloading store_data_Run2022A_ZeroBias_DQMIO_19Jan2023-v2_30000_23F53374-C49D-4BBA-BAB7-05412999556A.root
store_data_Run2022A_ZeroBias_DQMIO_19Jan2023-v2_30000_18BE9BCB-8882-4E60-B021-AE2866470FA8.root already present. Moving on the the next!!!
Downloading store_data_Run2022A_ZeroBias_DQMIO_19Jan2023-v2_30000_298E9DED-706B-4E7B-9A84-765AB5B70373.root
store_data_Run2022A_ZeroBias_DQMIO_19Jan2023-v2_2550000_B7C5D067-F538-42CE-9261-033D668E62D0.root already present. Moving on the the next!!!
Downloading store_data_Run2022B_ZeroBias_DQMIO_19Jan2023-v2_2550000_BA306EFA-20CD-4656-9442-DE5DB377E5E4.root
Downloading store_data_Run2022B_ZeroBias_DQMIO_19Jan2023-v2_2550000_9BE1219F-F76E-41AA-B2B6-70E683088D66.

In [2]:
# ! rm /eos/project/m/mlplayground/public/DQMIO/nanodqmio_from_das/*.root
! ls -alhtr /eos/project/m/mlplayground/public/DQMIO/nanodqmio_from_das/

# ! rm dasfiles/*.root

total 7.4G
drwxr-xr-x. 2 145910 2766 4.0K Apr  6 07:37 ..
-rw-r--r--. 1 145910 2766  92M Apr 24 02:08 store_data_Run2022A_ZeroBias_DQMIO_19Jan2023-v2_30000_DD2CC4FA-EF58-49AF-9BFB-F0ABE41BC4F4.root
-rw-r--r--. 1 145910 2766 243M Apr 24 02:08 store_data_Run2022A_ZeroBias_DQMIO_19Jan2023-v2_30000_23F53374-C49D-4BBA-BAB7-05412999556A.root
-rw-r--r--. 1 145910 2766 137M Apr 24 02:09 store_data_Run2022A_ZeroBias_DQMIO_19Jan2023-v2_30000_18BE9BCB-8882-4E60-B021-AE2866470FA8.root
-rw-r--r--. 1 145910 2766 157M Apr 24 02:09 store_data_Run2022A_ZeroBias_DQMIO_19Jan2023-v2_30000_298E9DED-706B-4E7B-9A84-765AB5B70373.root
-rw-r--r--. 1 145910 2766  21M Apr 24 02:09 store_data_Run2022A_ZeroBias_DQMIO_19Jan2023-v2_2550000_B7C5D067-F538-42CE-9261-033D668E62D0.root
-rw-r--r--. 1 145910 2766 893M Apr 24 02:10 store_data_Run2022B_ZeroBias_DQMIO_19Jan2023-v2_2550000_BA306EFA-20CD-4656-9442-DE5DB377E5E4.root
-rw-r--r--. 1 145910 2766 4.9M Apr 24 06:04 store_data_Run2022B_ZeroBias_DQMIO_19Jan2023-v2_255000

## Making a diffing function that will only download new files

In [3]:
# Flattening a DAS path with replace() alone turns the leading "/" into a leading "_";
# the copy loop calls strip('/') first, so its local names have no such prefix.
'/store/data/Run2023C/ZeroBias/DQMIO/PromptReco-v4/000/369/802/00000/57F5A56A-AF43-4A61-B292-0BFE8C3A749B.root'.replace("/","_")

'_store_data_Run2023C_ZeroBias_DQMIO_PromptReco-v4_000_369_802_00000_57F5A56A-AF43-4A61-B292-0BFE8C3A749B.root'