In [1]:
'''
Query CMR STAC for HLS data given a point location and date range
Return a list of asset filenames for AWS or HTTPS access
Translate to local filenames and download
'''
from datetime import datetime, timedelta
import json
import os
import time
import pandas
import requests
import boto3
from botocore.exceptions import ClientError
from pystac_client import Client


In [39]:
# STAC collection IDs to search
collections = ['HLSL30.v2.0', 'HLSS30.v2.0']

# GeoJSON point (longitude, latitude) used to pick the HLS tile of interest.
# Alternative site -- Mount Rainier: [-121.812838, 46.802158]
pt = {'type': 'Point', 'coordinates': [-119.256, 37.901]}  # Dana meadow

# Query window: both ends are 2021-05-01, i.e. a single day.
start_date = datetime(2021, 5, 1)
end_date = datetime(2021, 5, 1)

In [40]:
def search_stac_for_HLS(pt, dt_min, dt_max, cloudcover_max=80, lim=100, url='https://cmr.earthdata.nasa.gov/stac/LPCLOUD', collections=['HLSL30.v2.0', 'HLSS30.v2.0']):
    '''
    Search a STAC catalog for items intersecting a geometry within a date
    range and return the hrefs of their assets.

    Parameters
    ----------
    pt : dict
        GeoJSON geometry passed to the search as the ``intersects`` filter.
    dt_min, dt_max : str
        Start and end of the query window; they are joined as
        ``dt_min/dt_max`` to form the ``datetime`` filter.
    cloudcover_max : number
        Items whose ``eo:cloud_cover`` property is greater than this value
        are dropped. Items without that property are dropped as well, because
        they cannot be checked against the threshold.
    lim : int
        Passed to the search as ``limit``.
    url : str
        Root URL of the STAC catalog to open.
    collections : list of str
        Collection IDs to search. The default list is only read, never
        modified, so sharing it between calls is safe.

    Returns
    -------
    list of str
        The href of every asset of every retained item, in the order the
        items are returned by the search. Empty when nothing matched.
    '''
    # open the catalog
    catalog = Client.open(url)

    # perform the search
    search = catalog.search(
        collections=collections,
        intersects=pt,
        datetime=dt_min + '/' + dt_max,
        limit=lim
    )

    # ask for the match count once and reuse it for both the test and the message
    n_matched = search.matched()
    if n_matched == 0:
        print('No granules found at point', pt, 'from', dt_min, 'to', dt_max)
        return []

    print('Found', n_matched, 'granules at point', pt, 'from', dt_min, 'to', dt_max)

    # prefer the newer item_collection() accessor when the installed
    # pystac_client offers it; otherwise fall back to get_all_items()
    fetch_items = getattr(search, 'item_collection', None) or search.get_all_items

    links = []
    for item in fetch_items():
        cloud_cover = item.properties.get('eo:cloud_cover')
        if cloud_cover is None or not cloud_cover <= cloudcover_max:
            continue
        # show the properties of the first retained item as a sanity check
        if not links:
            print(item.properties)
        links.extend(asset.href for asset in item.assets.values())

    return links

In [41]:
# Run the search for the configured point; both dates are formatted as YYYY-MM-DD strings.
date_fmt = "%Y-%m-%d"
hls_links = search_stac_for_HLS(
    pt,
    start_date.strftime(date_fmt),
    end_date.strftime(date_fmt),
)

Found 4 granules at point {'type': 'Point', 'coordinates': [-119.256, 37.901]} from 2021-05-01 to 2021-05-01
{'eo:cloud_cover': 4, 'datetime': '2021-05-01T18:53:43.424Z', 'start_datetime': '2021-05-01T18:53:43.424Z', 'end_datetime': '2021-05-01T18:53:43.424Z'}


In [42]:
# Preview the first ten asset links returned by the search.
print(hls_links[:10])

['https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.B05.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.B11.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.SAA.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.B08.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.VZA.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.SZA.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/H

In [43]:
# Swap the HTTPS host prefixes for the s3:// scheme, one link at a time.
https_prefixes = (
    'https://data.lpdaac.earthdatacloud.nasa.gov/',
    'https://cmr.earthdata.nasa.gov/',
)
s3_links = []
for href in hls_links:
    for https_prefix in https_prefixes:
        href = href.replace(https_prefix, 's3://')
    s3_links.append(href)
print(s3_links[:20])

['s3://lp-prod-protected/HLSS30.020/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.B05.tif', 's3://lp-prod-protected/HLSS30.020/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.B11.tif', 's3://lp-prod-protected/HLSS30.020/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.SAA.tif', 's3://lp-prod-protected/HLSS30.020/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.B08.tif', 's3://lp-prod-protected/HLSS30.020/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.VZA.tif', 's3://lp-prod-protected/HLSS30.020/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.SZA.tif', 's3://lp-prod-protected/HLSS30.020/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.Fmask.tif', 's3://lp-prod-protected/HLSS30.020/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.B03.tif', 's3://lp-prod-protected/HLSS30.020/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30

In [44]:
def fix_links(src_link, src_dirs, dst_dir, meta_dir, add_tile_dir=True):
    '''
    Translate a remote asset link into the local path it should be saved to.

    Links containing '.xml' are treated as metadata and mapped to
    ``meta_dir/<basename>``. Every other link must start with one of
    ``src_dirs``; the remainder is expected to look like
    ``<collection>/<granule>/<file>``. The collection directory is dropped and
    the result is ``dst_dir/<tile>/<granule>/<file>``, where ``<tile>`` is the
    third dot-separated field of the granule directory name.

    Parameters
    ----------
    src_link : str
        Remote link, e.g. 's3://bucket/collection/granule/file.tif'.
    src_dirs : str or list of str
        Accepted remote prefixes (e.g. 's3://lp-prod-protected'). A single
        string is treated as a one-element list.
    dst_dir : str
        Local root directory for data files; may have any number of path
        components.
    meta_dir : str
        Local directory for '.xml' metadata files.
    add_tile_dir : bool
        When True (default) a per-tile directory is inserted under
        ``dst_dir``; when False the granule directory sits directly under
        ``dst_dir``.

    Returns
    -------
    str
        The local path for ``src_link``.

    Raises
    ------
    ValueError
        If a non-metadata link does not start with any of ``src_dirs``, has no
        granule directory, or (with ``add_tile_dir``) the granule name has no
        tile field.
    '''
    # substring test kept on purpose: the download step uses the same check
    if '.xml' in src_link:
        return os.path.join(meta_dir, os.path.basename(src_link))

    if isinstance(src_dirs, str):
        src_dirs = [src_dirs]

    # strip the remote prefix; only a match at the start of the link counts
    rel_path = None
    for src_dir in src_dirs:
        prefix = src_dir.rstrip('/') + '/'
        if src_link.startswith(prefix):
            rel_path = src_link[len(prefix):]
            break
    if rel_path is None:
        raise ValueError('link %r does not start with any of %r' % (src_link, list(src_dirs)))

    # rel_path is <collection>/<granule>/<file...>; the collection level is not kept locally
    granule_parts = rel_path.split('/')[1:]
    if len(granule_parts) < 2:
        raise ValueError('link %r has no <collection>/<granule>/<file> structure' % (src_link,))

    out_parts = [dst_dir.rstrip('/')]
    if add_tile_dir:
        name_fields = granule_parts[0].split('.')
        if len(name_fields) < 3:
            raise ValueError('cannot read the tile ID from granule name %r' % (granule_parts[0],))
        out_parts.append(name_fields[2])
    out_parts.extend(granule_parts)

    return '/'.join(out_parts)

# Map every S3 link onto the local directory layout: data files under
# ./HLS_data, XML metadata under ./HLS_metadata.
local_links = []
for s3_link in s3_links:
    local_links.append(
        fix_links(
            src_link=s3_link,
            src_dirs=['s3://lp-prod-protected', 's3://lp-prod-public'],
            dst_dir='./HLS_data',
            meta_dir='./HLS_metadata',
        )
    )
print(local_links[:10])

['./HLS_data/T11SLC/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.B05.tif', './HLS_data/T11SLC/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.B11.tif', './HLS_data/T11SLC/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.SAA.tif', './HLS_data/T11SLC/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.B08.tif', './HLS_data/T11SLC/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.VZA.tif', './HLS_data/T11SLC/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.SZA.tif', './HLS_data/T11SLC/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.Fmask.tif', './HLS_data/T11SLC/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.B03.tif', './HLS_data/T11SLC/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.B10.tif', './HLS_data/T11SLC/HLS.S30.T11SLC.2021121T183919.v2.0/HLS.S30.T11SLC.2021121T183919.v2.0.B07.tif']


In [45]:
from getpass import getpass

bucket = 'lp-prod-protected'

# both values are read with getpass so neither is echoed into the notebook output
user = getpass(prompt='Enter your NASA Earthdata Login Username')
password = getpass(prompt='Enter your NASA Earthdata Login Password')

s3_cred_endpoint = 'https://data.lpdaac.earthdatacloud.nasa.gov/s3credentials'

# seconds to wait on each HTTP request before giving up instead of hanging the kernel
request_timeout = 60

# The endpoint answers with a redirect; it is not followed automatically so
# that the target URL can be requested explicitly with the login credentials.
redirect_response = requests.get(s3_cred_endpoint, allow_redirects=False, timeout=request_timeout)
redirect_response.raise_for_status()
url = redirect_response.headers.get('Location')
if url is None:
    raise RuntimeError('No redirect received from ' + s3_cred_endpoint +
                       ' (HTTP status ' + str(redirect_response.status_code) + ')')

cred_response = requests.get(url, auth=(user, password), timeout=request_timeout)
# fail here with the HTTP error (e.g. rejected login) rather than later with a JSON/key error
cred_response.raise_for_status()
creds = cred_response.json()

missing_keys = [key for key in ('accessKeyId', 'secretAccessKey', 'sessionToken') if key not in creds]
if missing_keys:
    raise RuntimeError('S3 credential response is missing: ' + ', '.join(missing_keys))

session = boto3.Session(aws_access_key_id=creds['accessKeyId'], 
                        aws_secret_access_key=creds['secretAccessKey'], 
                        aws_session_token=creds['sessionToken'], 
                        region_name='us-west-2')



Enter your NASA Earthdata Login Username ········
Enter your NASA Earthdata Login Password ········


In [46]:
# TODO: consider downloading only the assets that are actually needed
def make_dirs(dst_links):
    '''
    Create the parent directory of every path in ``dst_links``.

    Parameters
    ----------
    dst_links : iterable of str
        Local file paths whose parent directories should exist afterwards.

    Directories that already exist are left alone. Paths without a directory
    component (bare filenames) are skipped, since they need no directory.
    '''
    # many files share a directory, so collect each parent only once
    parent_dirs = {os.path.dirname(dst_link) for dst_link in dst_links}
    # dirname is '' for a bare filename and os.makedirs('') raises
    parent_dirs.discard('')
    for parent_dir in sorted(parent_dirs):
        os.makedirs(parent_dir, exist_ok=True)
        
# Create the parent directory of every local target path before downloading.
make_dirs(local_links)

In [47]:
%%time

# S3 client obtained from the boto3 session that was built with the Earthdata-issued credentials.
s3 = session.client('s3')

def download_data(s3_links, local_links, s3_session):
    '''
    Download each S3 object to its matching local path.

    Parameters
    ----------
    s3_links : list of str
        Links of the form 's3://<bucket>/<key>'.
    local_links : list of str
        Destination file paths, one per entry of ``s3_links``. The parent
        directories must already exist.
    s3_session : object
        Object exposing ``download_fileobj(bucket, key, fileobj)``, e.g. a
        boto3 S3 client. This is the client used for every download.

    Raises
    ------
    ValueError
        If the two lists differ in length.

    Any error raised while downloading is propagated after the partially
    written local file has been removed.
    '''
    if len(s3_links) != len(local_links):
        raise ValueError('s3_links and local_links must have the same length (%d != %d)'
                         % (len(s3_links), len(local_links)))

    for s3_link, local_link in zip(s3_links, local_links):
        # ignore XML files for now, figure out how to get them later because they contain useful information
        if '.xml' in local_link:
            continue

        # 's3://bucket/some/key' -> ('bucket', 'some/key'); only the first '/'
        # separates the bucket, so a key that repeats the bucket name stays intact
        s3_path = s3_link[len('s3://'):] if s3_link.startswith('s3://') else s3_link
        s3_bucket, _, s3_key = s3_path.partition('/')

        try:
            with open(local_link, 'wb') as f:
                s3_session.download_fileobj(s3_bucket, s3_key, f)
        except BaseException:
            # do not leave an empty or truncated file behind (also on interrupt)
            if os.path.exists(local_link):
                os.remove(local_link)
            raise


# Download every asset (XML links are skipped inside download_data) to its local path.
download_data(s3_links, local_links, s3)

CPU times: user 2.94 s, sys: 2.3 s, total: 5.25 s
Wall time: 26.6 s


In [49]:
# create a .csv file of S3 links, local links, and information about each file
hls_name_fields = ('sensor', 'tile', 'date', 'band')


def _parse_hls_name(link):
    '''
    Split the basename of an asset link into its dot-separated name fields.

    Returns a dict with the keys 'sensor', 'tile', 'date' and 'band', taken
    from fields 1, 2, 3 and 6 of the basename. All values are None for links
    containing '.xml' or '.png', and for basenames with fewer than seven
    dot-separated fields.
    '''
    blank = dict.fromkeys(hls_name_fields)
    if '.xml' in link or '.png' in link:
        return blank
    parts = os.path.basename(link).split('.')
    if len(parts) < 7:
        return blank
    return {'sensor': parts[1], 'tile': parts[2], 'date': parts[3], 'band': parts[6]}


stack_df = pandas.DataFrame({'S3_links': s3_links, 'local_links': local_links})

# add sensor, tile, dates, bands
# Whole columns are assigned at once, so all four always exist in the CSV,
# even when no link yields any name fields.
parsed_names = [_parse_hls_name(link) for link in s3_links]
for field in hls_name_fields:
    stack_df[field] = [parsed[field] for parsed in parsed_names]

# the output directory may not exist yet (e.g. when the search returned nothing)
os.makedirs('./HLS_data', exist_ok=True)
stack_df.to_csv('./HLS_data/stack_train.csv', index=False)