# Downloading from the ESGF archive

This notebook walks through the steps needed to download CORDEX data from the Earth System Grid Federation (ESGF) archive, starting with a single dataset and building up to multiple ensembles that span various models, variables and experiments. Before you start, read README.md and follow the steps in its 'Setup' section to make sure you are able to run the notebook.

In [5]:
import os
import ssl
import xarray as xr
import libs.downloader as dl
from pyesgf.logon import LogonManager
from pyesgf.search import SearchConnection

In [6]:
# define your query: one value per ESGF search facet. The dict is unpacked into
# conn.new_context() in the next cell, and its keys are also passed as `facets`.
query = {
    'project': 'CORDEX',
    'domain': 'EUR-11',
    'experiment': 'rcp85',
    'variable': 'tas',
    'time_frequency': 'mon',
    'ensemble': 'r1i1p1'
}

# ensure the following are saved as environment variables
# (a missing variable raises KeyError here). USERNAME and PASSWORD are used for
# the ESGF logon; DATA_PATH is the directory handed to the download and
# verification calls further down.
USERNAME = os.environ['ESGF_USERNAME']
PASSWORD = os.environ['ESGF_PASSWORD']
DATA_PATH = os.environ['DATA_HOME']

In [7]:
# check ESGF for number of datasets that satisfy query
conn = SearchConnection('http://esgf-data.dkrz.de/esg-search', distrib=True)
# `facets` is documented by pyesgf as a comma-separated string of facet names,
# so join the query keys rather than passing the dict_keys view itself.
context = conn.new_context(**query, facets=','.join(query))
# last expression of the cell: the number of matching datasets is displayed
context.hit_count

46

In [8]:
# login to ESGF and generate SSL context
myproxy_host = 'esgf-data.dkrz.de'

# log on with the credentials read from the environment variables above
lm = LogonManager()
lm.logon(username=USERNAME, password=PASSWORD, hostname=myproxy_host)

# Build the SSL context that is later passed to dl.download_ensemble(ssl=...).
# This must come after the logon: it trusts the CA certificates found in
# lm.esgf_certs_dir and presents the certificate chain in lm.esgf_credentials.
sslcontext = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH)
sslcontext.load_verify_locations(capath=lm.esgf_certs_dir)
sslcontext.load_cert_chain(lm.esgf_credentials)

# displayed as the cell output (True in the recorded run)
lm.is_logged_on()

True

In [9]:
# generate results and check an example dataset to verify all is working as expected
results = context.search()
# take the first hit; this assumes the query matched at least one dataset
example_dataset = results[0]
# displayed as the cell output
example_dataset.dataset_id

'cordex.output.EUR-11.CLMcom.MPI-M-MPI-ESM-LR.rcp85.r1i1p1.CCLM4-8-17.v1.mon.tas.v20140515|esgf1.dkrz.de'

In [64]:
# and now an example file within that dataset, including its http download link
# NOTE(review): ignore_facet_check=True presumably tells pyesgf to skip its
# facet lookup for this file search — confirm against the pyesgf documentation.
example_files = example_dataset.file_context().search(ignore_facet_check=True)
# first file only; its download URL is displayed as the cell output
example_file = example_files[0]
example_file.download_url

'http://esgf1.dkrz.de/thredds/fileServer/cordex/cordex/output/EUR-11/CLMcom/MPI-M-MPI-ESM-LR/rcp85/r1i1p1/CLMcom-CCLM4-8-17/v1/mon/tas/v20140515/tas_EUR-11_MPI-M-MPI-ESM-LR_rcp85_r1i1p1_CLMcom-CCLM4-8-17_v1_mon_200601-201012.nc'

In [2]:
# call download_ensemble on the context generated by your query
# (uses `context`, `DATA_PATH` and `sslcontext` from the cells above, so the
# query, search and login cells must have been run first)
downloads = dl.download_ensemble(context, DATA_PATH, ssl=sslcontext, verbose=True)




NB: If you're only downloading a handful of variables/experiments, the code above is all you need. The following code blocks are only for cases where you want to leave your code running for a very long time (e.g. overnight) and would like to queue everything up for one big execution.

In [10]:
# or make multiple queries

# Same facets as the single `query` above, but each facet maps to a list of
# values. The output of this cell lists one row, with its hit count, for every
# experiment/variable combination of these lists.
queries = {
    'project': ['CORDEX'],
    'domain': ['EUR-11'],
    'experiment': ['historical', 'rcp26', 'rcp85'],
    'variable': ['tas', 'pr'],
    'time_frequency': ['mon'],
    'ensemble': ['r1i1p1']
}

# reuses the search connection `conn` created earlier in the notebook
contexts = dl.make_multiple_queries(queries=queries, conn=conn)

querying ESGF...
found the following datasets matching your queries:

project        domain         experiment     variable       time_frequency ensemble        hit_count
CORDEX         EUR-11         historical     tas            mon            r1i1p1          48
CORDEX         EUR-11         historical     pr             mon            r1i1p1          48
CORDEX         EUR-11         rcp26          tas            mon            r1i1p1          23
CORDEX         EUR-11         rcp26          pr             mon            r1i1p1          23
CORDEX         EUR-11         rcp85          tas            mon            r1i1p1          46
CORDEX         EUR-11         rcp85          pr             mon            r1i1p1          46


In [13]:
# and then queue them to download one after the other

# NOTE(review): unlike the dl.download_ensemble call earlier in the notebook,
# this call is given neither DATA_PATH nor the SSL context — confirm that
# download_multiple_ensembles obtains both by itself. It is passed `queries`
# again rather than the `contexts` built in the previous cell, and its output
# repeats the same query table.
# pull the trigger
dl.download_multiple_ensembles(queries=queries, conn=conn)

querying ESGF...
found the following datasets matching your queries:

project        domain         experiment     variable       time_frequency ensemble        hit_count
CORDEX         EUR-11         historical     tas            mon            r1i1p1          48
CORDEX         EUR-11         historical     pr             mon            r1i1p1          48
CORDEX         EUR-11         rcp26          tas            mon            r1i1p1          23
CORDEX         EUR-11         rcp26          pr             mon            r1i1p1          23
CORDEX         EUR-11         rcp85          tas            mon            r1i1p1          46
CORDEX         EUR-11         rcp85          pr             mon            r1i1p1          46


In [12]:
# check all your files have downloaded properly and are able to be opened by xarray

def _list_subdirs(parent):
    '''
    Return the sorted names of the sub-directories of `parent`, skipping
    .DS_Store entries and any stray files.
    '''
    return sorted(
        name for name in os.listdir(parent)
        if '.DS_Store' not in name and os.path.isdir(os.path.join(parent, name))
    )


def check_for_corrupt(path_to_data, project='cordex', domain='EUR-11', ensemble='r1i1p1'):
    '''
    Verify that every downloaded dataset can be opened by xarray.

    Walks the directory tree

        <path_to_data>/<project>/<domain>/<variable>/<experiment>/<ensemble>/<gcm>/<rcm>

    and tries to open all files of each <rcm> directory together with
    xr.open_mfdataset. Every directory for which that raises is printed, one
    per line, in sorted order (a single blank line is printed when nothing
    failed). Because any exception counts, a directory is also reported when
    its files cannot be combined, or when it is empty.

    Parameters
    ----------
    path_to_data : str
        Root of the download directory.
    project, domain, ensemble : str, optional
        Names of the fixed directory levels. The defaults reproduce the
        previously hard-coded 'cordex' / 'EUR-11' / 'r1i1p1' layout.

    Raises
    ------
    FileNotFoundError
        If <path_to_data>/<project>/<domain> does not exist.
    '''
    root = os.path.join(path_to_data, project, domain)
    errors = []

    for variable in _list_subdirs(root):
        variable_dir = os.path.join(root, variable)
        for experiment in _list_subdirs(variable_dir):
            ensemble_dir = os.path.join(variable_dir, experiment, ensemble)
            if not os.path.isdir(ensemble_dir):
                # nothing was downloaded for this ensemble member, so there is
                # nothing to verify for this experiment
                continue
            for gcm in _list_subdirs(ensemble_dir):
                gcm_dir = os.path.join(ensemble_dir, gcm)
                for rcm in _list_subdirs(gcm_dir):
                    rcm_dir = os.path.join(gcm_dir, rcm)
                    filepaths = sorted(
                        os.path.join(rcm_dir, filename)
                        for filename in os.listdir(rcm_dir)
                        if '.DS_Store' not in filename
                    )
                    try:
                        # open as a context manager so the file handles are
                        # released again straight after the check
                        with xr.open_mfdataset(filepaths):
                            pass
                    except Exception:
                        errors.append(rcm_dir)

    print(*errors, sep='\n')

# run the check on the download directory; the directories that failed to open
# are printed one per line (only a blank line is printed when none failed)
check_for_corrupt(DATA_PATH)



