## Update OiDB metadata using JMMC's CatalogAPI

In [2]:
import pyvo
from a2p2.jmmc import Catalog

# Target environment: True -> production OiDB, False -> beta / pre-production instance.
# Flip this single flag to switch every endpoint below.
prod = True

# Main site and TAP endpoint of the selected OiDB instance.
if prod:
    oidbMainUrl = "http://oidb.jmmc.fr"
    oidbTapUrl = "http://tap.jmmc.fr/vollt/tap"
else:
    oidbMainUrl = "http://oidb-beta.jmmc.fr"
    oidbTapUrl = "http://tap-preprod.jmmc.fr/vollt/tap"

# TAP client used to query the granules.
oidbTap = pyvo.dal.TAPService(oidbTapUrl)
# Catalog API wrapper for the 'oidb' catalog, pointed at the same environment as the TAP client.
oidbCatalog = Catalog("oidb", prod=prod)

INFO - a2p2.jmmc.catalogs  - 2025-04-01 17:40:43,370 - catalogs.py:51 - Create catalog wrapper to access 'oidb' (production API at https://oidb.jmmc.fr/restxq/catalogs)


### Define helper functions to fix content

In [3]:
def _pendingChanges(granule, reference):
    """Return the {column: reference value} pairs of `reference` whose value differs from `granule`."""
    return {
        col: ref_value
        for col, ref_value in reference.items()
        if ref_value != granule[col]
    }


def fixGranule(granule, constraints=None):
    """Compute the update order that aligns one granule with the reference values.

    Three sources are checked in turn; for a same column a later source
    overrides an earlier one:
      1. the `collectionReference` entry matching the granule "obs_collection",
      2. the `granuleReferences` entry matching the granule "id",
      3. the optional `constraints`.
    Every source is compared with the granule's current values, not with the
    changes already pending.

    Both reference tables are read from module-level names, so they must be
    defined before this function is called.

    Args:
        granule: record indexable by column name; must provide "obs_collection",
            "id" and every column named by an applicable reference.
        constraints: optional dict {column: expected value} checked whatever the
            granule collection or id.

    Returns:
        dict: the columns to change with their new value, plus "id" holding the
        granule id as a string. Empty dict when the granule needs no change.
    """
    todo = {}

    # try by collection
    obs_collection = granule["obs_collection"]
    if obs_collection in collectionReference:
        todo.update(_pendingChanges(granule, collectionReference[obs_collection]))

    # try by granule
    granule_id = granule["id"]
    if granule_id in granuleReferences:
        print (f"have to check id={granule_id}")
        todo.update(_pendingChanges(granule, granuleReferences[granule_id]))

    # try by constraints if we have some
    if constraints:
        todo.update(_pendingChanges(granule, constraints))

    # the id is only attached when there is something to update, so that an
    # up-to-date granule yields an empty (falsy) dict
    if todo:
        todo["id"] = str(granule["id"])
    return todo
        
def fixRecords(query, todo=None):
    """Walk across the granules selected by the given query and return a list of dicts so we can update the oidb catalog using api.updateRows().

    Args:
        query: query string run through the module-level `oidbTap` service.
        todo: optional dict {column: expected value} forwarded as constraints
            to fixGranule() for every selected granule.

    Returns:
        list: one update order (dict) per granule that needs a change. The list
        is empty when nothing has to be fixed, so the result is always
        iterable and still falsy for callers testing `if result:`.
    """
    granules = oidbTap.search(query).to_table()
    print (f"have to check {len(granules)} granule(s) for '{query}'")

    # fixGranule() answers with an empty dict for a granule already up to date:
    # only keep the non-empty orders
    orders = (fixGranule(granule, todo) for granule in granules)
    return [order for order in orders if order]


### Fix some known cases

In [7]:
# Reference tables driving the fix cases. Each one maps a selector to the
# {column: expected value} pairs the selected granules must carry.
# Commented-out entries are presumably fixes from earlier runs kept for the
# record (TODO confirm); they all end with a comma so any of them can be
# re-enabled as is.

# obs_collection id -> expected values for every granule of that collection
collectionReference = {
    # "371c145f-7889-47d5-9c14-5f7889c7d510": {"datapi": "KRAUS"},
    # "c5a5d133-a178-4949-a5d1-33a1781949e5": {"datapi": "SOULAIN"},
    # "03a164ca-3110-446c-a164-ca3110746cdd": {"bib_reference": "2022Natur.602..403G"},
    # "12803c68-6124-40ad-803c-68612470ad3d": {"bib_reference": "2022A&A...665A..32D", "calib_level": 3},
    # "76120870-3b88-4dfc-9208-703b881dfc57": {"datapi": "LAMBERTS"},
    # "588ac90a-2e92-42f7-8ac9-0a2e92c2f701": {"datapi": "AUGEREAU"},
    # "743b1acc-7463-464a-bb1a-cc7463664a0c": {"datapi": "AUGEREAU"},
    # "a3e9ba2e-cd26-40f6-a9ba-2ecd2600f6b1": {"bib_reference": "2024A&A...681A..47V", "calib_level": 3},
    # "d259d56c-7304-4131-99d5-6c7304c13119": {"bib_reference": "2022MNRAS.510...82R"},
    "62aa7b9a-cd64-480e-aa7b-9acd64580eba": {"datapi": "BAINES"},
}

# granule id -> expected values for that single granule
granuleReferences = {
    # 1713641: {"instrument_name": "MYSTIC"},  # instrument name was not present in OIDB
}

# WHERE-clause condition -> expected values for every granule it selects
byConditionsReferences = {
    # "obs_collection='ddf733ce-758e-43c4-b733-ce758e33c4f3' AND em_min > 1.8e-6": {"instrument_name": "MYSTIC"},
    # "bib_reference='2024arXiv240507821P'": {"bib_reference": "2024A&A...687A.306P"},
    "bib_reference='2023arXiv230308892C'": {"bib_reference": "2023A&A...675A..46C"},
}


updates = []  # accumulates json update orders


def _queueUpdates(reason, query, constraints=None, hint=""):
    """Run one fix query, queue the update orders it yields and report the outcome.

    Args:
        reason: label quoted in the printed report.
        query: query string handed to fixRecords().
        constraints: optional dict {column: expected value} forwarded to fixRecords().
        hint: text appended to the report line when some records have to be updated.
    """
    found = fixRecords(query, constraints)
    # truthiness test on purpose: covers both "no result" answers (None or empty list)
    if found:
        updates.extend(found)
        print(f"{len(found)} record(s) to update through catalogAPI for '{reason}'{hint}")
    else:
        print(f"nothing to do for '{reason}'")


# Fix case 1 : granules of the referenced collections
reason = "Update reference collections"
for collection_id in collectionReference:
    query = f"SELECT ALL * FROM oidb AS t WHERE obs_collection='{collection_id}'"
    _queueUpdates(reason, query)

# Fix case 2 : individually referenced granules
reason = "Update by granule references"
for granule_id in granuleReferences:
    query = f"SELECT ALL * FROM oidb AS t WHERE id='{granule_id}'"
    _queueUpdates(reason, query)

# Fix case 3 : granules selected on a suspicious datapi value
reason = "Look for some records that may have bad metadata"
dataPiToFix = "Guillaume Mella"  # this guy can't be a datapi, he often does hesitate between RA and DEC
query = f"SELECT ALL * FROM oidb AS t WHERE ( t.datapi LIKE '%{dataPiToFix}%' )"
_queueUpdates(reason, query)

# Fix case 4 : granules selected by a free condition, checked against the values attached to it
reason = "Update by conditions references"
for condition, constraints in byConditionsReferences.items():
    query = f"SELECT ALL * FROM oidb AS t WHERE {condition}"
    _queueUpdates(reason, query, constraints, hint=", please check them and run next cell to apply changes.")

have to check 776 granule(s) for 'SELECT ALL * FROM oidb AS t WHERE obs_collection='62aa7b9a-cd64-480e-aa7b-9acd64580eba''
nothing to do for 'Update reference collections'
have to check 0 granule(s) for 'SELECT ALL * FROM oidb AS t WHERE ( t.datapi LIKE '%Guillaume Mella%' )'
nothing to do for 'Look for some records that may have bad metadata'
have to check 0 granule(s) for 'SELECT ALL * FROM oidb AS t WHERE bib_reference='2023arXiv230308892C''
nothing to do for 'Update by conditions references'


In [8]:
# Display the pending update orders so they can be checked before being applied.
updates

[]

In [9]:
# Safety switch: keep False to only review the pending orders,
# set to True (and re-run this cell) to send them to the catalog.
applyChanges = False

if updates and applyChanges:
    oidbCatalog.updateRows(updates)
elif updates:
    # nothing is written while the switch is off; just remind what is waiting
    print(f"{len(updates)} update order(s) pending, set applyChanges=True to apply them through catalogAPI")

# That's all folks !!!