# A few example iRODS queries


In [20]:
# imports
import platform
import shlex
import shutil
import subprocess
from getpass import getpass
from pathlib import Path
import pandas as pd

from irods.collection import Collection
from irods.data_object import DataObject
from irods.column import Criterion
from irods.models import Resource, DataObjectMeta

from ibridges import Session
from ibridges.meta import MetaData
from ibridges.util import get_dataobject
from ibridges.path import IrodsPath

### Create a session
Depending on the IDE, the prompt to enter the password might appear at the top or the bottom of the screen.


In [None]:
# Create an iBridges session from the default iRODS environment file.
env_file = Path("~").expanduser().joinpath(".irods", 'irods_environment.json')
password = getpass('Your iRODS password')
session = Session(irods_env=env_file, password=password)
session
# Check user type
user_type, user_groups = session.get_user_info()
if 'rodsadmin' in user_type:
    print("You are a rodsadmin")

# Check if icommands are available.
# shutil.which searches PATH on every platform, so no external 'which'/'where'
# helper is needed (calling 'where' raises FileNotFoundError when it is absent).
is_linux = 'linux' in platform.platform().lower()
icomm = shutil.which('iinit') is not None
if icomm:
    # platform.system() yields "Linux" / "Windows", matching the original messages.
    print(f"icommands found on {platform.system()}")
else:
    print("icommands not found")

# Check if the default environment json is loaded.
# iinit is only run for rodsadmins using the default environment file.
if env_file.name == 'irods_environment.json' and icomm and 'rodsadmin' in user_type:
    # NOTE(review): the password is passed as a command-line argument and is
    # therefore visible in the process list while iinit runs.
    if subprocess.call(['iinit', password]) == 0:
        print("all set, icommands activated")
    else:
        # Only report success when iinit actually succeeded.
        icomm = False
        print("iinit failed, icommands not activated")
else:
    icomm = False
    print("icommands automatically loads the default environment file: irods_environment.json")



### Data objects status
All data objects have a status between 0 and 4 which can be used to detect problems: 
- '0': 'stale'
- '1': 'good'
- '2': 'intermediate'
- '3': 'read-locked'
- '4': 'write-locked'
Only the status of '1' indicates the file is correctly uploaded.

A query with the Python iRODS client (PRC) can check the status and list all files that are not 'good', but it cannot fix them.
Fixing them requires the icommands: the Python code below creates the icommand lines, which can be executed directly on the command line if the icommands are installed (and `iinit` has been run on the same server!).

In [None]:
# Query every replica whose status is not '1' (good) and show the hits as a dataframe.
listofmodrepl = []
not_good = Criterion('<>', DataObject.replica_status, '1')
query = session.irods_session.query(
    Collection.name, DataObject.name, DataObject.replica_number
).filter(not_good)
dict_list = [
    {
        "Collection": row[Collection.name],
        "DataObject": row[DataObject.name],
        "ReplicaNumber": row[DataObject.replica_number],
    }
    for row in query
]
files_df = pd.DataFrame(dict_list)
files_df

For RODSadmins: create the modrepl commands


In [None]:
# Build one 'iadmin modrepl' command per replica returned by the query.
# The list is rebuilt from scratch so re-running this cell does not append
# duplicates, and the logical path is shell-quoted so names containing spaces
# or shell metacharacters stay a single argument.
listofmodrepl = [
    'iadmin modrepl logical_path {} replica_number {} DATA_REPL_STATUS 1'.format(
        shlex.quote(f"{row[Collection.name]}/{row[DataObject.name]}"),
        row[DataObject.replica_number],
    )
    for row in query
]

for item in listofmodrepl:
    print(item)

If the icommands are initialized, execute the commands.

In [None]:
# Execute the prepared commands one by one and stop at the first failure.
for item in listofmodrepl:
    try:
        # subprocess.run (not subprocess.call) accepts capture_output and returns
        # a CompletedProcess carrying returncode and stderr. The command string is
        # split into an argument list so no shell is involved.
        result = subprocess.run(shlex.split(item), capture_output=True, text=True)
    except FileNotFoundError as err:
        # Raised when the iadmin executable cannot be found on PATH.
        print("Error executing command: ", err)
        break
    if result.returncode != 0:
        print("Error executing command: ", result.stderr)
        break

### Storage insight?
Data is moved around a lot: it starts on 'hot' storage and is later moved to a 'medium' or 'cold' resource such as a tape archive.  
However, over time data is also retrieved and moved back, and potentially even updated.   
During this process it is quite easy to forget a file somewhere. And before you know it the resource is full...  
Luckily, even regular users can list the resources they have access to, together with the free space and the files on them:  

In [None]:
# List the resource names, the free space and the child resources.
# Resource.children is part of the query so the value can be read from each row;
# formatting the bare Resource.children column would print the column object itself.
resc_query = session.irods_session.query(Resource.name, Resource.free_space, Resource.children)
for resc in resc_query.get_results():
    print(f"{resc[Resource.name]}: \t {resc[Resource.free_space]} \t {resc[Resource.children]}")

Query all files on a specific resource and return them as a pandas dataframe for further processing.

In [None]:
# List all data on a specific resource
hot_resc = 'hot_1'
query = session.irods_session.query(Collection.name, DataObject).filter(DataObject.resource_name == hot_resc).get_results()
total_size = 0
dict_list = []
for result in query:
    dict_list.append({"Collection": result[Collection.name],
                      "DataObject": result[DataObject.name],
                      "size": result[DataObject.size],
                      "checksum": result[DataObject.checksum]})
    total_size += result[DataObject.size]

files_df = pd.DataFrame(dict_list)
# Count the collected rows: len(result) would give the number of columns of the
# last row and fail with a NameError when the query returns nothing.
print(f"Total number of files: {len(dict_list)} with a size of {total_size} bytes")
# Take the head of the dataframe before printing; print() itself returns None.
print(files_df.head(5))

#### List only the files in a specific collection on a specific resource
The queries can be extended with filters to get more detailed results.  
As an example, this is a filter on the collection name. Below there are also filters on metadata.  
Note that these two filters give the same result:  
- 'filter(DataObject.replica_status != 1)'  
- 'filter(Criterion('<>',DataObject.replica_status,'1'))'   

The Criterion gives more freedom to add specific keywords as: 'like', 'not like', '='. The wildcard is '%' for both.

In [None]:
# List the files on one resource whose collection name matches a 'like' pattern.
hot_resc = 'hot_1'
collection_name_filter = Criterion('like', Collection.name , '/NPEC/%')
query = session.irods_session.query(Collection.name, DataObject).filter(DataObject.resource_name == hot_resc).filter(collection_name_filter).get_results()
total_size = 0
# Count rows while iterating: len(result) would give the number of columns of the
# last row and fail with a NameError when the query returns nothing.
n_files = 0
for result in query:
    print(f"{result[Collection.name]} {result[DataObject.name]} {result[DataObject.checksum]}")
    total_size += result[DataObject.size]
    n_files += 1

print(f"Total number of files: {n_files} with a size of {total_size} bytes")

#### Listing only the files retrieved from archive
This is an advanced query which depends on the specific setup: the instance I used applies duplication at the iRODS level for the tape archive and at the storage level for the 'hot' resources.  
As a result there are two replicas on tape visible in iRODS and one on hot.   

When data is moved to tape, replicas 1 & 2 become the tape archives. If a file is then retrieved, it becomes replica number 3. 
By adding an additional filter on the name of the hot resource, the user can be sure these are the files that have been retrieved from tape.  
This gives an overview that can be used to clean up the old retrieved data.

In [15]:
# Select the data objects with the given replica number on the hot resource
# and show them as a dataframe.
hot_resc = 'hot_1'
replica_number = 3
query = (
    session.irods_session.query(Collection.name, DataObject)
    .filter(DataObject.replica_number == replica_number)
    .filter(DataObject.resource_name == hot_resc)
    .get_results()
)
dict_list = [
    {
        "Collection": row[Collection.name],
        "DataObject": row[DataObject.name],
        "size": row[DataObject.size],
        "checksum": row[DataObject.checksum],
    }
    for row in query
]
files_df = pd.DataFrame(dict_list)
files_df

### Using queries to search through the metadata
Metadata is added in key, value, unit triples.
One way to search through these is to filter the results of a query:
- 'DataObjectMeta.name' is the key
- 'DataObjectMeta.value' is the value
- 'DataObjectMeta.units' is the unit


In [None]:
# Find the data objects carrying the metadata pair Crop / Potato
# and show them as a dataframe.
query = (
    session.irods_session.query(Collection.name, DataObject, case_sensitive=False)
    .filter(DataObjectMeta.name == 'Crop')
    .filter(DataObjectMeta.value == 'Potato')
    .get_results()
)
dict_list = [
    {
        "Collection": row[Collection.name],
        "DataObject": row[DataObject.name],
        "size": row[DataObject.size],
        "checksum": row[DataObject.checksum],
    }
    for row in query
]
files_df = pd.DataFrame(dict_list)
files_df

#### List the metadata of a specific object  
First get the data object, then access its metadata.  
Additionally new values can be added with: 'obj_meta.add('Test', None, None)'

In [None]:
# Fetch one data object by its iRODS path and print each of its metadata entries.
dataobject_path = '/NPEC/home/path to dataobject'
i_path = IrodsPath(session, dataobject_path)
do = get_dataobject(session, i_path)
obj_meta = MetaData(do)
metadata = obj_meta.to_dict()['metadata']
for entry in metadata:
    # Each entry is unpacked into its three parts: key, value and unit.
    key, val, unit = entry
    print(f"{key} {val} {unit}")