In [1]:
import cProfile
import earthaccess
import numpy as np
import xarray as xr

try:
    import h5coro
except:
    !mamba install -c conda-forge h5coro --yes
    import h5coro

from h5coro import h5coro, s3driver, filedriver
h5coro.config(errorChecking=True, verbose=False, enableAttributes=False)

# Benchmarks

These initial benchmarks compare different methods for accessing ICESat-2 ATL03 data stored as HDF5 files on S3.

# Setup credentials

In [2]:
# Authenticate with NASA Earthdata Login. The returned object is used in a
# later cell to request temporary S3 credentials.
auth = earthaccess.login()

EARTHDATA_USERNAME and EARTHDATA_PASSWORD are not set in the current environment, try setting them or use a different strategy (netrc, interactive)
No .netrc found in /home/jovyan


Enter your Earthdata Login username:  aimeeb
Enter your Earthdata password:  ········


You're now authenticated with NASA Earthdata Login
Using token with expiration date: 08/25/2023
Using user provided credentials for EDL


In [3]:
# Two handles on the same NSIDC cloud data:
#   * s3access - an s3fs filesystem session, used by the xarray benchmark
#   * s3_creds - raw temporary S3 credentials, passed to the h5coro S3 driver
# `provider` is documented as a provider-name string, not the Auth object, and
# is unnecessary when `daac` is supplied, so it is omitted here.
s3access = earthaccess.get_s3fs_session(daac="NSIDC")
s3_creds = auth.get_s3_credentials(daac="NSIDC")

# Search for data

In [4]:
# Search parameters for cloud-hosted ATL03 granules.
# NOTE(review): bounding_box is presumably (west, south, east, north) in
# degrees, and `count` presumably caps how many granules are returned -
# confirm against the earthaccess documentation.
search_params = dict(
    short_name='ATL03',
    cloud_hosted=True,
    bounding_box=(-10, 20, 10, 50),
    temporal=("2020-01", "2020-02"),
    count=10,
)
results = earthaccess.search_data(**search_params)

Granules found: 231


In [5]:
# Take the first data link of the first granule and turn it into an S3 URL.
# After splitting on '/', element 3 is used as the bucket name and everything
# from element 4 onward is the object key.
first_link = results[0].data_links()[0]
file_parts = first_link.split('/')
bucket = file_parts[3]
path_to_hdf5_file = '/'.join(file_parts[4:])
s3url = "s3://" + bucket + "/" + path_to_hdf5_file
s3url

's3://nsidc-cumulus-prod-protected/ATLAS/ATL03/005/2020/01/01/ATL03_20200101053635_00840606_005_01.h5'

# Benchmark time to get the photon height as a numpy array from one granule using different libraries.

Each benchmark cell returns the mean photon height as a quick check that both libraries read the same data.

## Xarray

In [9]:
%%time
#cProfile.run("xr.open_dataset(s3access.open(s3url, 'rb'), group='/gt2l/heights')")
ds = xr.open_dataset(s3access.open(s3url, 'rb'), group='/gt2l/heights')
ds['h_ph'].values.mean()

CPU times: user 18.9 s, sys: 4.86 s, total: 23.7 s
Wall time: 1min 4s


681.68933

## h5coro

In [7]:
%%time
h5obj = h5coro.H5Coro(f'{bucket}/{path_to_hdf5_file}', 
                      s3driver.S3Driver,
                      credentials={"aws_access_key_id": s3_creds["accessKeyId"],
                                 "aws_secret_access_key": s3_creds["secretAccessKey"],
                                 "aws_session_token": s3_creds["sessionToken"], })

dataset = '/gt2l/heights/h_ph'
output = h5obj.readDatasets(datasets=[dataset], block=True)
h5obj[dataset].values.mean()

CPU times: user 136 ms, sys: 57 ms, total: 193 ms
Wall time: 811 ms


681.68933

# Future

* Add in benchmarks from https://github.com/nsidc/cloud-optimized-icesat2/issues/2
* Subsetting to bounding box
* Merge data from different beams
* Creation of different Python data objects (xarray dataset, dataframe)

In [8]:
# Dataset paths for a few variables we may want, built for a single beam.
# `beams` holds the six beam group names (gt1l ... gt3r); `beam` selects the
# first one and every path below is relative to that group.
beams = [f"gt{beam}" for beam in ['1l', '1r', '2l', '2r', '3l', '3r']]
beam = beams[0]
delta_time = f'{beam}/heights/delta_time'
# The ATL03 along-track distance dataset is named `dist_ph_along`; the
# previous value `ds_ph_along` does not match a dataset in the heights group.
photo_along_track = f'{beam}/heights/dist_ph_along'
lat = f'{beam}/heights/lat_ph'
lon = f'{beam}/heights/lon_ph'
height = f'{beam}/heights/h_ph'