# Data Subsetting and Transformation Services in the Cloud

## Using NASA Earthdata Harmony

---

## Timing:
- Exercise: 45 minutes

---

## Summary

Harmony allows you to seamlessly analyze Earth observation data from different NASA data centers... 
https://harmony.earthdata.nasa.gov/

#### Benefits

- Consistent access patterns to EOSDIS holdings make cross-data center data access easier
- Data reduction services allow users to request only the data they want, in the format and projection they want
- Analysis Ready Data and cloud access will help reduce time-to-science
- Community Development helps reduce the barriers for re-use of code and sharing of domain knowledge

### Objectives

.
.
.

___

## Import Packages

In [1]:
pip install -U harmony-py

Note: you may need to restart the kernel to use updated packages.


In [2]:
from harmony import BBox, Client, Collection, Request, LinkType
from harmony.config import Environment
import requests
from pprint import pprint
import datetime as dt


## Discovery service options for a given data set

In [3]:
CMR_OPS = 'https://cmr.earthdata.nasa.gov/search'
url = f'{CMR_OPS}/{"collections"}'

response = requests.get(url, 
                        params={
                            'concept_id': 'C1940473819-POCLOUD',
                            },
                        headers={
                            'Accept': 'application/json'
                            }
                       )
response = response.json()
services = response['feed']['entry'][0]['associations']['services']
output_format = "umm_json"
service_url = "https://cmr.earthdata.nasa.gov/search/services"
for i in range(len(services)):
    response = requests.get(f"{service_url}.{output_format}?concept-id={services[i]}")
    response = response.json()
    pprint(response['items'][0]['umm']['Type'])
    pprint(response['items'][0]['umm']['ServiceOptions'])

'Harmony'
{'Subset': {'SpatialSubset': {'BoundingBox': {'AllowMultipleValues': False}},
            'TemporalSubset': {'AllowMultipleValues': False},
            'VariableSubset': {'AllowMultipleValues': True}},
 'SupportedReformattings': [{'SupportedInputFormat': 'HDF5',
                             'SupportedOutputFormats': ['NETCDF-4']},
                            {'SupportedInputFormat': 'NETCDF-4',
                             'SupportedOutputFormats': ['NETCDF-4']}]}
'OPeNDAP'
{'Subset': {'SpatialSubset': {'BoundingBox': {'AllowMultipleValues': False}},
            'TemporalSubset': {'AllowMultipleValues': False},
            'VariableSubset': {'AllowMultipleValues': True}},
 'SupportedReformattings': [{'SupportedInputFormat': 'NETCDF-4',
                             'SupportedOutputFormats': ['ASCII',
                                                        'CSV',
                                                        'NETCDF-3',
                                                

## Discover variable names

In [4]:
response = requests.get(url, 
                        params={
                            'concept_id': 'C1940473819-POCLOUD',
                            },
                        headers={
                            'Accept': 'application/json'
                            }
                       )
response = response.json()
variables = response['feed']['entry'][0]['associations']['variables']
output_format = "umm_json"
var_url = "https://cmr.earthdata.nasa.gov/search/variables"
for i in range(len(variables)):
    response = requests.get(f"{var_url}.{output_format}?concept-id={variables[i]}")
    response = response.json()
    # pprint(response['items'][0]['umm'])
    if 'Name' in response['items'][0]['umm']: pprint(response['items'][0]['umm']['Name'])

'sses_standard_deviation_4um'
'l2p_flags'
'time'
'dt_analysis'
'sses_standard_deviation'
'sst_dtime'
'sses_bias_4um'
'lat'
'sea_surface_temperature_4um'
'sses_bias'
'lon'
'sea_surface_temperature'
'quality_level'
'wind_speed'
'quality_level_4um'


In [5]:
variables = ['sea_surface_temperature','lat','lon']

In [6]:
# cmr_url = "https://"+"cmr.earthdata.nasa.gov"+"/search/granules.umm_json?collection_concept_id="+"C1940473819-POCLOUD"+"&sort_key=-start_date&bounding_box=-90,-45.75,90,-45"

# response = requests.get(cmr_url)

# gid=response.json()['items'][0]['meta']['concept-id']
# print(response.json()['items'][0])
# print(gid)

## Harmony-Py set up

In [7]:
harmony_client = Client()

In [15]:
request = Request(
    collection=Collection(id='MODIS_A-JPL-L2P-v2019.0'),
    spatial=BBox(60,-45.75,90,-45), # bounding box example that can be used as an alternative to shapefile input
    temporal={
        'start': dt.datetime(2021, 11, 1),
        'stop': dt.datetime(2021, 11, 2),
    },
    # variables=variables,
)

### Valid request?

In [16]:
print(f"Request valid? {request.is_valid()}")
for m in request.error_messages():
    print(" * " + m)

Request valid? True


### Submit request

In [17]:
job_id = harmony_client.submit(request)
job_id

'686999c5-8659-4bf7-a1f6-e637737292f4'

In [18]:
harmony_client.status(job_id)

{'status': 'running',
 'message': 'There were 2 collections that matched the provided short name MODIS_A-JPL-L2P-v2019.0. See https://cmr.earthdata.nasa.gov/concepts/C1940473819-POCLOUD for details on the selected collection. The version ID for the selected collection is 2019.0. To use a different collection submit a new request specifying the desired CMR concept ID instead of the collection short name.',
 'progress': 0,
 'created_at': datetime.datetime(2021, 11, 2, 15, 40, 41, 478000, tzinfo=tzlocal()),
 'updated_at': datetime.datetime(2021, 11, 2, 15, 40, 41, 478000, tzinfo=tzlocal()),
 'request': 'https://harmony.earthdata.nasa.gov/MODIS_A-JPL-L2P-v2019.0/ogc-api-coverages/1.0.0/collections/all/coverage/rangeset?forceAsync=true&subset=lat(-45.75%3A-45)&subset=lon(60%3A90)&subset=time(%222021-11-01T00%3A00%3A00%22%3A%222021-11-02T00%3A00%3A00%22)',
 'num_input_granules': 7}

In [19]:
harmony_client.wait_for_processing(job_id, show_progress=True)


 [ Processing: 100% ] |###################################################| [|]


In [20]:
data = harmony_client.result_json(job_id)
pprint(data)

{'createdAt': '2021-11-02T15:40:41.478Z',
 'jobID': '686999c5-8659-4bf7-a1f6-e637737292f4',
 'links': [{'href': 'https://harmony.earthdata.nasa.gov/stac/686999c5-8659-4bf7-a1f6-e637737292f4/',
            'rel': 'stac-catalog-json',
            'title': 'STAC catalog',
            'type': 'application/json'},
           {'bbox': [66.801, -66.443, 113.49, -44.231],
            'href': 'https://harmony.earthdata.nasa.gov/service-results/harmony-prod-staging/public/podaac/l2-subsetter/b4790bda-ed62-4303-9b17-4c2ff89655fc/20211101083501-JPL-L2P_GHRSST-SSTskin-MODIS_A-D-v02.0-fv01.0.nc4',
            'rel': 'data',
            'temporal': {'end': '2021-11-01T08:39:58.000Z',
                         'start': '2021-11-01T08:35:01.000Z'},
            'title': '20211101083501-JPL-L2P_GHRSST-SSTskin-MODIS_A-D-v02.0-fv01.0.nc4',
            'type': 'application/x-netcdf4'},
           {'bbox': [66.7, -45.7, 90, -45],
            'href': 'https://harmony.earthdata.nasa.gov/service-results/harmony-

## Direct S3 using STAC

In [21]:
results = harmony_client.result_urls(job_id, link_type=LinkType.s3)
print(results)
creds = harmony_client.aws_credentials()

<generator object Client.result_urls at 0x7fa918632900>


In [22]:
stac_catalog_url = harmony_client.stac_catalog_url(job_id)
stac_catalog_url

'https://harmony.earthdata.nasa.gov/stac/686999c5-8659-4bf7-a1f6-e637737292f4/?linktype=https'

### Following Aaron's steps from cmr-stac tutorial:

In [62]:
from pystac_client import Client  

In [63]:
catalog = Client.open(stac_catalog_url)

In [64]:
links = catalog.get_links()

In [65]:
# links[0].to_dict()
for link in range(len(links)): print(links[link].to_dict())

{'rel': <RelType.SELF: 'self'>, 'href': 'https://harmony.earthdata.nasa.gov/stac/686999c5-8659-4bf7-a1f6-e637737292f4/?linktype=https', 'type': <MediaType.JSON: 'application/json'>}
{'rel': <RelType.ROOT: 'root'>, 'href': './', 'type': <MediaType.JSON: 'application/json'>}
{'rel': 'item', 'href': './0?linkType=https', 'title': '20211101083501-JPL-L2P_GHRSST-SSTskin-MODIS_A-D-v02.0-fv01.0.nc4'}
{'rel': 'item', 'href': './1?linkType=https', 'title': '20211101084001-JPL-L2P_GHRSST-SSTskin-MODIS_A-D-v02.0-fv01.0_subsetted.nc4'}
{'rel': 'item', 'href': './2?linkType=https', 'title': '20211101101501-JPL-L2P_GHRSST-SSTskin-MODIS_A-D-v02.0-fv01.0_subsetted.nc4'}
{'rel': 'item', 'href': './3?linkType=https', 'title': '20211101180501-JPL-L2P_GHRSST-SSTskin-MODIS_A-N-v02.0-fv01.0.nc4'}
{'rel': 'item', 'href': './4?linkType=https', 'title': '20211101181001-JPL-L2P_GHRSST-SSTskin-MODIS_A-N-v02.0-fv01.0_subsetted.nc4'}
{'rel': 'item', 'href': './5?linkType=https', 'title': '20211101194501-JPL-L2P_GH

In [68]:
item_collection = catalog.get_all_items()

In [71]:
list(item_collection)

[]

In [69]:
links = []

for i in item_collection:
        for a in i.assets:
            links.append(i.assets[a].href)

NotImplementedError: ConformanceClasses.ITEM_SEARCH not supported

In [72]:
# Trying using Harmony example code


print(catalog.title)
s3_links = []
for item in catalog.get_all_items():
    print(item.datetime, [asset.href for asset in item.assets.values()])
    s3_links.append([asset.href for asset in item.assets.values()])

Harmony output for 686999c5-8659-4bf7-a1f6-e637737292f4


NotImplementedError: ConformanceClasses.ITEM_SEARCH not supported

In [None]:
## direct s3 using boto

# import boto3

# s3 = boto3.client('s3', **creds)
# for url in results:
#     bucket, obj, fn = s3_components(url)
#     with open(fn, 'wb') as f:
#         s3.download_fileobj(bucket, obj, f)

### Integrate with PO.DAAC l2ss tutorial to bring in STAC items into xarray directly:

In [None]:
# ds = xr.open_dataset('ogc_temp.nc')
# ds

# # Determine the lat/lon coordinate names
# for coord_name, coord in ds.coords.items():
#     if 'units' not in coord.attrs:
#         continue
#     if coord.attrs['units'] == 'degrees_north':
#         lat_var = coord_name
#     if coord.attrs['units'] == 'degrees_east':
#         lon_var = coord_name

# print(f'lat_var={lat_var}')
# print(f'lon_var={lon_var}')