# Data Subsetting and Transformation Services in the Cloud

## Using NASA Earthdata Harmony

---

## Timing:
- Exercise: 45 minutes

---

## Summary

Harmony allows you to seamlessly analyze Earth observation data from different NASA data centers... 
https://harmony.earthdata.nasa.gov/

#### Benefits

- Consistent access patterns to EOSDIS holdings make cross-data center data access easier
- Data reduction services allow users to request only the data they want, in the format and projection they want
- Analysis Ready Data and cloud access will help reduce time-to-science
- Community Development helps reduce the barriers for re-use of code and sharing of domain knowledge

### Objectives

.
.
.

___

## Import Packages

In [3]:
pip install -U harmony-py

Collecting harmony-py
  Downloading harmony_py-0.3.0-py3-none-any.whl (21 kB)
Collecting python-dateutil~=2.7.5
  Using cached python_dateutil-2.7.5-py2.py3-none-any.whl (225 kB)
Collecting python-dotenv~=0.1
  Downloading python_dotenv-0.19.1-py2.py3-none-any.whl (17 kB)
Collecting sphinxcontrib-napoleon>=0.7
  Using cached sphinxcontrib_napoleon-0.7-py2.py3-none-any.whl (17 kB)
Collecting progressbar2~=3.5
  Downloading progressbar2-3.55.0-py2.py3-none-any.whl (26 kB)
Collecting curlify~=2.2
  Downloading curlify-2.2.1.tar.gz (3.0 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting python-utils>=2.3.0
  Using cached python_utils-2.5.6-py2.py3-none-any.whl (12 kB)
Collecting pockets>=0.3
  Using cached pockets-0.9.1-py2.py3-none-any.whl (26 kB)
Building wheels for collected packages: curlify
  Building wheel for curlify (setup.py) ... [?25ldone
[?25h  Created wheel for curlify: filename=curlify-2.2.1-py3-none-any.whl size=2672 sha256=958995c3e63f91c070cb6402ecf690d6a

In [102]:
from harmony import BBox, Client, Collection, Request, LinkType
from harmony.config import Environment
import requests
from pprint import pprint
import datetime as dt


## Discovery service options for a given data set

In [29]:
CMR_OPS = 'https://cmr.earthdata.nasa.gov/search'
url = f'{CMR_OPS}/{"collections"}'

response = requests.get(url, 
                        params={
                            'concept_id': 'C1940473819-POCLOUD',
                            },
                        headers={
                            'Accept': 'application/json'
                            }
                       )
response = response.json()
services = response['feed']['entry'][0]['associations']['services']
output_format = "umm_json"
service_url = "https://cmr.earthdata.nasa.gov/search/services"
for i in range(len(services)):
    response = requests.get(f"{service_url}.{output_format}?concept-id={services[i]}")
    response = response.json()
    pprint(response['items'][0]['umm']['Type'])
    pprint(response['items'][0]['umm']['ServiceOptions'])

'Harmony'
{'Subset': {'SpatialSubset': {'BoundingBox': {'AllowMultipleValues': False}},
            'TemporalSubset': {'AllowMultipleValues': False},
            'VariableSubset': {'AllowMultipleValues': True}},
 'SupportedReformattings': [{'SupportedInputFormat': 'HDF5',
                             'SupportedOutputFormats': ['NETCDF-4']},
                            {'SupportedInputFormat': 'NETCDF-4',
                             'SupportedOutputFormats': ['NETCDF-4']}]}
'OPeNDAP'
{'Subset': {'SpatialSubset': {'BoundingBox': {'AllowMultipleValues': False}},
            'TemporalSubset': {'AllowMultipleValues': False},
            'VariableSubset': {'AllowMultipleValues': True}},
 'SupportedReformattings': [{'SupportedInputFormat': 'NETCDF-4',
                             'SupportedOutputFormats': ['ASCII',
                                                        'CSV',
                                                        'NETCDF-3',
                                                

## Discover variable names

In [47]:
response = requests.get(url, 
                        params={
                            'concept_id': 'C1940473819-POCLOUD',
                            },
                        headers={
                            'Accept': 'application/json'
                            }
                       )
response = response.json()
variables = response['feed']['entry'][0]['associations']['variables']
output_format = "umm_json"
var_url = "https://cmr.earthdata.nasa.gov/search/variables"
for i in range(len(variables)):
    response = requests.get(f"{var_url}.{output_format}?concept-id={variables[i]}")
    response = response.json()
    # pprint(response['items'][0]['umm'])
    if 'Name' in response['items'][0]['umm']: pprint(response['items'][0]['umm']['Name'])

'sses_standard_deviation_4um'
'l2p_flags'
'time'
'dt_analysis'
'sses_standard_deviation'
'sst_dtime'
'sses_bias_4um'
'lat'
'sea_surface_temperature_4um'
'sses_bias'
'lon'
'sea_surface_temperature'
'quality_level'
'wind_speed'
'quality_level_4um'


In [35]:
variables = ['sea_surface_temperature','lat','lon']

In [34]:
# cmr_url = "https://"+"cmr.earthdata.nasa.gov"+"/search/granules.umm_json?collection_concept_id="+"C1940473819-POCLOUD"+"&sort_key=-start_date&bounding_box=-90,-45.75,90,-45"

# response = requests.get(cmr_url)

# gid=response.json()['items'][0]['meta']['concept-id']
# print(response.json()['items'][0])
# print(gid)

## Harmony-Py set up

In [6]:
harmony_client = Client()

In [48]:
request = Request(
    collection=Collection(id='MODIS_A-JPL-L2P-v2019.0'),
    spatial=BBox(-90,-45.75,90,-45), # bounding box example that can be used as an alternative to shapefile input
    temporal={
        'start': dt.datetime(2021, 11, 1),
        'stop': dt.datetime(2021, 11, 2),
    },
    # variables=variables,
)

### Valid request?

In [49]:
print(f"Request valid? {request.is_valid()}")
for m in request.error_messages():
    print(" * " + m)

Request valid? True


### Submit request

In [50]:
job_id = harmony_client.submit(request)
job_id

'31aba3ae-91d2-4c32-89e1-8f90c1cd9e63'

In [51]:
harmony_client.status(job_id)

{'status': 'running',
 'message': 'There were 2 collections that matched the provided short name MODIS_A-JPL-L2P-v2019.0. See https://cmr.earthdata.nasa.gov/concepts/C1940473819-POCLOUD for details on the selected collection. The version ID for the selected collection is 2019.0. To use a different collection submit a new request specifying the desired CMR concept ID instead of the collection short name.',
 'progress': 0,
 'created_at': datetime.datetime(2021, 11, 1, 23, 3, 27, 194000, tzinfo=tzlocal()),
 'updated_at': datetime.datetime(2021, 11, 1, 23, 3, 27, 194000, tzinfo=tzlocal()),
 'request': 'https://harmony.earthdata.nasa.gov/MODIS_A-JPL-L2P-v2019.0/ogc-api-coverages/1.0.0/collections/all/coverage/rangeset?forceAsync=true&subset=lat(-45.75%3A-45)&subset=lon(-90%3A90)&subset=time(%222021-11-01T00%3A00%3A00%22%3A%222021-11-02T00%3A00%3A00%22)',
 'num_input_granules': 19}

In [52]:
harmony_client.wait_for_processing(job_id, show_progress=True)


 [ Processing: 100% ] |###################################################| [|]


In [53]:
data = harmony_client.result_json(job_id)
pprint(data)

{'createdAt': '2021-11-01T23:03:27.194Z',
 'jobID': '31aba3ae-91d2-4c32-89e1-8f90c1cd9e63',
 'links': [{'href': 'https://harmony.earthdata.nasa.gov/stac/31aba3ae-91d2-4c32-89e1-8f90c1cd9e63/',
            'rel': 'stac-catalog-json',
            'title': 'STAC catalog',
            'type': 'application/json'},
           {'bbox': [-1.4, -45.7, 30.6, -45],
            'href': 'https://harmony.earthdata.nasa.gov/service-results/harmony-prod-staging/public/podaac/l2-subsetter/b76c2b8a-efde-4565-9fb2-aeba1064babf/20211101000000-JPL-L2P_GHRSST-SSTskin-MODIS_A-N-v02.0-fv01.0_subsetted.nc4',
            'rel': 'data',
            'temporal': {'end': '2021-11-01T00:04:58.000Z',
                         'start': '2021-11-01T00:00:00.000Z'},
            'title': '20211101000000-JPL-L2P_GHRSST-SSTskin-MODIS_A-N-v02.0-fv01.0_subsetted.nc4',
            'type': 'application/x-netcdf4'},
           {'bbox': [-26.1, -45.8, 5.8, -45],
            'href': 'https://harmony.earthdata.nasa.gov/service-resu

## Direct S3

In [103]:
results = harmony_client.result_urls(job_id, link_type=LinkType.s3)
print(results)
creds = harmony_client.aws_credentials()

<generator object Client.result_urls at 0x7ff58929e190>


In [104]:
import boto3

s3 = boto3.client('s3', **creds)
for url in results:
    bucket, obj, fn = s3_components(url)
    with open(fn, 'wb') as f:
        s3.download_fileobj(bucket, obj, f)

ModuleNotFoundError: No module named 'boto3'

In [56]:
stac_catalog_url = harmony_client.stac_catalog_url(job_id)
stac_catalog_url

'https://harmony.earthdata.nasa.gov/stac/31aba3ae-91d2-4c32-89e1-8f90c1cd9e63/?linktype=https'

In [97]:
from pystac import STAC_IO

def requests_read_method(uri):
    parsed = urlparse(uri)
    if parsed.hostname.startswith('harmony.'):
        return harmony_client.read_text(uri)
    else:
        return STAC_IO.default_read_text_method(uri)

STAC_IO.read_text_method = requests_read_method

ImportError: cannot import name 'STAC_IO' from 'pystac' (/srv/conda/envs/nsidc/lib/python3.9/site-packages/pystac/__init__.py)

In [100]:
from pystac_client import Client 

STAC_URL = 'https://harmony.earthdata.nasa.gov/stac/31aba3ae-91d2-4c32-89e1-8f90c1cd9e63/'

cat = Client.open(STAC_URL)

print(cat.title)
s3_links = []
for item in cat.get_items():
    print(item.datetime, [asset.href for asset in item.assets.values()])
    s3_links.append([asset.href for asset in item.assets.values()])

Harmony output for 31aba3ae-91d2-4c32-89e1-8f90c1cd9e63


NotImplementedError: ConformanceClasses.ITEM_SEARCH not supported

In [99]:
print(s3_links)

[]


<map at 0x7ff5891274f0>

In [81]:
# products = [c for c in cat.get_children()]


print(cat.title)

# search = cat.search()
# item_collection = search.get_all_items()
# list(item_collection)


# for item in cat.get_all_items():
#     print(item.datetime, [asset.href for asset in item.assets.values()])
#     s3_links.append([asset.href for asset in item.assets.values()])

print(cat.title)
cat.get_all_items()
# s3_links = []
# for item in cat.get_all_items():
#     print(item.datetime, [asset.href for asset in item.assets.values()])
#     s3_links.append([asset.href for asset in item.assets.values()])

Harmony output for 31aba3ae-91d2-4c32-89e1-8f90c1cd9e63
Harmony output for 31aba3ae-91d2-4c32-89e1-8f90c1cd9e63


<generator object Client.get_all_items at 0x7ff58b176350>

In [57]:
from pystac import Catalog

cat = Catalog.from_file(stac_catalog_url)

print(cat.title)
s3_links = []
for item in cat.get_all_items():
    print(item.datetime, [asset.href for asset in item.assets.values()])
    s3_links.append([asset.href for asset in item.assets.values()])

Exception: Could not read uri https://harmony.earthdata.nasa.gov/stac/31aba3ae-91d2-4c32-89e1-8f90c1cd9e63/?linktype=https

In [None]:
ds = xr.open_dataset('ogc_temp.nc')
ds

# Determine the lat/lon coordinate names
for coord_name, coord in ds.coords.items():
    if 'units' not in coord.attrs:
        continue
    if coord.attrs['units'] == 'degrees_north':
        lat_var = coord_name
    if coord.attrs['units'] == 'degrees_east':
        lon_var = coord_name

print(f'lat_var={lat_var}')
print(f'lon_var={lon_var}')