# Download a single file from remote data services

* Query CSW endpoints to fetch the thredds service
* download data using the following utilities
    - requests.get
    - urllib
    - wget
    - xarray.open_dataset()
---


- Authors: NCI Virtual Research Environment Team
- Keywords: CSW, geonetwork, data query, search
- Create Date: 2020-Jun
---

In [22]:
import os
import json
import rasterio
import requests
from owslib import fes
from owslib.fes import PropertyIsEqualTo, PropertyIsLike, BBox
from owslib.csw import CatalogueServiceWeb

import matplotlib.pyplot as plt
%matplotlib inline

In [29]:
def get_csw_records(csw, filter_list, pagesize=10, maxrecords=500):
    """Iterate `maxrecords`/`pagesize` times until the requested value in
    `maxrecords` is reached.
    """
    from owslib.fes import SortBy, SortProperty

    # Iterate over sorted results.
    sortby = SortBy([SortProperty("dc:title")])
    csw_records = {}
    startposition = 0
    nextrecord = getattr(csw, "results", 1)
    while nextrecord != 0:
        csw.getrecords2(
            constraints=filter_list,
            startposition=startposition,
            maxrecords=pagesize,
            sortby=sortby,
        )
        csw_records.update(csw.records)
        if csw.results["nextrecord"] == 0:
            break
        startposition += pagesize + 1  # Last one is included.
        if startposition >= maxrecords:
            break
    csw.records.update(csw_records)

In [28]:
from datetime import datetime
from owslib import fes
from owslib.fes import PropertyIsEqualTo, PropertyIsLike, BBox

# Region: Australia.
min_lon, max_lon = 110, 160
min_lat, max_lat = -45, -5

bbox = [min_lon, min_lat, max_lon, max_lat]

# Sea surface temperature CF names.
words = [
    "dataset",
    "geophysics",
    "GA"
]

kw = dict(wildCard="*", escapeChar="\\", singleChar="?", propertyname="apiso:AnyText")

or_filt = fes.And([fes.PropertyIsEqualTo('csw:AnyText',f'{val}') for val in words])

bbox = fes.BBox(bbox)

filter_list = [
    fes.And(
        [
            bbox,  # bounding box
            or_filt,  # or conditions (searching words)
        ]
    )
]

In [33]:
get_csw_records(csw, filter_list, pagesize=10, maxrecords=1000)

records = "\n".join(csw.records.keys())
print("Found {} records.\n".format(len(csw.records.keys())))
for key, value in list(csw.records.items()):
    print(u"[{}]\n{}\n".format(value.title, key))
    
record_list = [record for record in records]

Found 28 records.

[radmap v3 2015 ratio uranium over thorium grid]
221dcfd8-04fa-5083-e053-10a3070a64e3

[radmap v3 2015 ratio uranium squared over thorium grid]
221dcfd8-04fb-5083-e053-10a3070a64e3

[radmap v3 2015 unfiltered pct potassium grid]
221dcfd8-04fc-5083-e053-10a3070a64e3

[radmap v3 2015 unfiltered ppm thorium grid]
221dcfd8-04fd-5083-e053-10a3070a64e3

[radmap v3 2015 unfiltered ppm uranium grid]
221dcfd8-04fe-5083-e053-10a3070a64e3

[radmap v3 2015 unfiltered terrestrial dose rate grid]
221dcfd8-04ff-5083-e053-10a3070a64e3

[Total Magnetic Intensity (TMI) Grid of Australia 2015 - sixth edition]
221dcfd8-04ee-5083-e053-10a3070a64e3

[Total Magnetic Intensity (TMI) Grid of Australia with Variable Reduction to Pole (VRTP) - sixth edition]
221dcfd8-04ef-5083-e053-10a3070a64e3

[Australian National Geophysical Data Collection]
0a83ee36-d332-4669-a9fc-dfa7cec5c703

[Bouguer Gravity Anomaly Grid of Onshore Australia 2016]
d82dff4d-c1c6-4fd7-83c5-fc1bf39be0d7

[Broadband and lon

In [31]:
 def get_netcdf_urls(self, dataset_dict_generator):
        '''
        Generator to yield flattened dicts containing information for any netCDF distributions (file or OPeNDAP URL, file by preference)
        @param dataset_dict_generator: Generator yeilding dict objects containing information about each record including distributions
        '''
        for record_dict in dataset_dict_generator:
            distribution_dict = None
            for file_distribution in record_dict['distributions']:
                if 'file' in file_distribution['protocol'].lower():
                    match = re.match('(^file://)*(.*\.nc)$', file_distribution['url'])
                    try:
                        file_distribution['url'] = match.group(2) # Ignore any leading "file://"
                        if os.path.isfile(file_distribution['url']) and netCDF4.Dataset(file_distribution['url']): # Test for valid netCDF file
                            #logger.debug('file found')
                            distribution_dict = file_distribution
                            break
                    except:
                        logger.warning('Unable to open netCDF file {}'.format(file_distribution['url']))
                    
            if not distribution_dict:
                # Check for valid OPeNDAP endpoint if no valid file found
                for opendap_distribution in record_dict['distributions']:
                    if 'opendap' in opendap_distribution['protocol'].lower():
                        match = re.match('(.*\.nc)(\.html)*$', opendap_distribution['url'])
                        try:
                            opendap_distribution['url'] = match.group(1) # Ignore any trailing ".html"
                            #if netCDF4.Dataset(opendap_distribution['url']): # Test for valid OPeNDAP endpoint
                            #TODO: Make a better test for a valid OPeNDAP URL
                            response = requests.get(opendap_distribution['url'])
                            if response.status_code == 400: # Test for valid OPeNDAP endpoint
                                distribution_dict = opendap_distribution
                                break
                        except:
                            logger.warning('Unable to open OPeNDAP URL {}'.format(opendap_distribution['url']))
            
            if distribution_dict:
                yield self.flatten_distribution_dict(record_dict, distribution_dict)
                continue

In [34]:
netcdf_list = [distribution['url']
            for distribution in get_netcdf_urls(record_list)
            ]

print('{} NetCDF distributions found'.format(len(netcdf_list)))
    
return netcdf_list

TypeError: get_netcdf_urls() missing 1 required positional argument: 'dataset_dict_generator'

In [2]:
def download_file(in_filename, out_filename):
    if not os.path.exists(out_filename):
        print("Downloading", in_filename)
        response = requests.get(in_filename)
        with open(out_filename, 'wb') as f:
            f.write(response.content)

In [3]:
url = 'http://dapds00.nci.org.au/thredds/fileServer/rr2/national_geophysical_compilations/IR_gravity_anomaly_Australia_V1/IR_gravity_anomaly_Australia_V1.nc'

download_file(url, 'IR.nc')

Downloading http://dapds00.nci.org.au/thredds/fileServer/rr2/national_geophysical_compilations/IR_gravity_anomaly_Australia_V1/IR_gravity_anomaly_Australia_V1.nc


In [4]:
from urllib import request
request.urlretrieve(url,'IR1.nc')

('IR1.nc', <http.client.HTTPMessage at 0x1247d1278>)

In [6]:
!wget $url

--2020-05-20 21:28:24--  http://dapds00.nci.org.au/thredds/fileServer/rr2/national_geophysical_compilations/IR_gravity_anomaly_Australia_V1/IR_gravity_anomaly_Australia_V1.nc
Resolving dapds00.nci.org.au... 130.56.243.202
Connecting to dapds00.nci.org.au|130.56.243.202|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30489513 (29M) [application/x-netcdf]
Saving to: 'IR_gravity_anomaly_Australia_V1.nc'


2020-05-20 21:28:46 (1.31 MB/s) - 'IR_gravity_anomaly_Australia_V1.nc' saved [30489513/30489513]

