In [9]:
import os
import shlex
from pathlib import Path

from siphon.catalog import TDSCatalog

This code generates bash scripts that, when run, use curl to download each file in the catalog and pipe it straight into the bucket on AWS (the data is streamed; no local copy is written).

To upload files that are already on disk, adapt it either to turn the links found in the catalog into local file paths or to scan the local directories instead of the catalog.

The most recent revision shows up twice, once under its revision number and once under the generic name "latest", so duplicate scripts are generated for it. Scripts are also generated for old revisions.

You only want to run the script for the most recent revision using the revision number (not using the generic "latest").


When uploading from a local copy, replace the curl command piped to aws with a direct copy of this form:

aws s3 cp PATH_TO_LOCAL_COPY_OF_FILE/btm_co3_ion.nwa.full.hcast.daily.raw.r20230520.199301-201912.nc s3://noaa-oar-cefi-regional-mom6-pds/northwest_atlantic/full_domain/hindcast/daily/raw/r20230520/btm_co3_ion.nwa.full.hcast.daily.raw.r20230520.199301-201912.nc

AWS authentication can be set up either with a config file in the .aws directory or by running the aws command and supplying the keys on the command line.

In [10]:
# Path segment that marks the portal root; everything up to and including it
# (plus the following separator) is stripped from URLs to build script names
# and bucket keys.
match = 'cefi_portal'
# Top-level THREDDS catalog to crawl.
catalog_root = 'https://psl.noaa.gov/thredds/catalog/Projects/CEFI/regional_mom6/cefi_portal/northwest_atlantic/full_domain/catalog.xml'
# Destination S3 bucket for the generated upload commands.
bucket = 's3://noaa-oar-cefi-regional-mom6-pds'

# Generated scripts are written into this directory.
os.makedirs("catalogs", exist_ok=True)

catalog_prefix_index = catalog_root.find(match)
if catalog_prefix_index < 0:
    # str.find returns -1 when the marker is absent, which would otherwise
    # slice out a meaningless prefix without any error.
    raise ValueError(f'{match!r} not found in catalog_root: {catalog_root}')
# Prefix of the catalog URL through the marker and the character after it.
catalog_replace = catalog_root[:catalog_prefix_index + len(match) + 1]
print(catalog_replace)

https://psl.noaa.gov/thredds/catalog/Projects/CEFI/regional_mom6/cefi_portal/


In [17]:
def get_datasets(root):
    """Write an upload script for ``root`` and recurse into its child catalogs.

    Prints ``root.catalog_url``. If the catalog lists any datasets, a bash
    script is written to ``catalogs/<name>``, where ``<name>`` is the catalog
    URL with the ``catalog_replace`` prefix removed, ``/`` replaced by ``_``
    and ``.xml`` replaced by ``.sh``. For every dataset the script echoes the
    ``HTTPServer`` access URL and pipes ``curl`` into ``aws s3 cp -``; the
    destination is ``bucket`` followed by the part of the URL after the
    ``match`` marker. Each entry in ``root.catalog_refs`` is then followed and
    processed the same way.

    Relies on the notebook-level names ``match``, ``catalog_replace`` and
    ``bucket``.

    Parameters
    ----------
    root
        Catalog object exposing ``catalog_url``, ``datasets`` and
        ``catalog_refs`` (the notebook passes a siphon ``TDSCatalog``).

    Raises
    ------
    ValueError
        If a dataset's access URL does not contain ``match``; no script is
        written for that catalog in this case.
    """
    print(root.catalog_url)
    if root.datasets:
        filename = root.catalog_url.replace(catalog_replace, '')
        filename = filename.replace('/', '_')
        filename = filename.replace('.xml', '.sh')
        file_path = Path(f'catalogs/{filename}')

        # Assemble the whole script in memory first so that a failure on one
        # dataset cannot leave a half-written script on disk.
        script_lines = [
            'eval "$(conda shell.bash hook)"\n',
            'mamba activate aws\n',
        ]
        for dataset in root.datasets:
            access_url = root.datasets[dataset].access_urls['HTTPServer']
            marker_at = access_url.find(match)
            if marker_at < 0:
                # find() returns -1 here; slicing with it would produce a
                # meaningless bucket key.
                raise ValueError(
                    f'{match!r} not found in access URL: {access_url}'
                )
            # Key inside the bucket: everything after the marker and the
            # separator that follows it.
            outfile = access_url[marker_at + len(match) + 1:]
            # Quote for the shell; strings made only of URL-safe characters
            # come back unchanged.
            source = shlex.quote(access_url)
            target = shlex.quote(f'{bucket}/{outfile}')
            script_lines.extend([
                'date\n',
                f'echo {source}\n',
                f'curl -s -L {source} | aws s3 cp - {target}\n',
                'date\n',
                'echo +=+=+=+=+=+=+=\n',
            ])
        script_lines.append('echo "==== Done ====="\n')

        # Create parent directories if they don't exist
        file_path.parent.mkdir(parents=True, exist_ok=True)
        with open(file_path, 'w') as fp:
            fp.writelines(script_lines)
    for catalog in root.catalog_refs:
        child = root.catalog_refs[catalog].follow()
        get_datasets(child)

In [18]:
# Open the root catalog and walk it. get_datasets prints the URL of every
# catalog it visits and writes the generated scripts under catalogs/.
cat = TDSCatalog(catalog_root)
get_datasets(cat)

https://psl.noaa.gov/thredds/catalog/Projects/CEFI/regional_mom6/cefi_portal/northwest_atlantic/full_domain/catalog.xml
https://psl.noaa.gov/thredds/catalog/Projects/CEFI/regional_mom6/cefi_portal/northwest_atlantic/full_domain/decadal_forecast/catalog.xml
https://psl.noaa.gov/thredds/catalog/Projects/CEFI/regional_mom6/cefi_portal/northwest_atlantic/full_domain/decadal_forecast/daily/catalog.xml
https://psl.noaa.gov/thredds/catalog/Projects/CEFI/regional_mom6/cefi_portal/northwest_atlantic/full_domain/decadal_forecast/daily/raw/catalog.xml
https://psl.noaa.gov/thredds/catalog/Projects/CEFI/regional_mom6/cefi_portal/northwest_atlantic/full_domain/decadal_forecast/daily/regrid/catalog.xml
https://psl.noaa.gov/thredds/catalog/Projects/CEFI/regional_mom6/cefi_portal/northwest_atlantic/full_domain/decadal_forecast/monthly/catalog.xml
https://psl.noaa.gov/thredds/catalog/Projects/CEFI/regional_mom6/cefi_portal/northwest_atlantic/full_domain/decadal_forecast/monthly/raw/catalog.xml
https://p