In [1]:
import cProfile
import earthaccess
import numpy as np
import s3fs
import xarray as xr
import h5py

try:
    import h5coro
except:
    !mamba install -c conda-forge h5coro --yes
    import h5coro

try:
    from gedi_subset.h5frame import H5DataFrame
except ImportError:
    !pip install git+https://github.com/MAAP-Project/gedi-subsetter.git@0.6.0
    from gedi_subset.h5frame import H5DataFrame

# From here on the name `h5coro` refers to the `h5coro` member of the package
# (this shadows the package-level import); the two drivers select S3 or
# local-file access.
from h5coro import filedriver, h5coro, s3driver

# Settings applied once for every h5coro read in this notebook.
h5coro.config(
    errorChecking=True,
    verbose=False,
    enableAttributes=False,
)

# Benchmark Original HDF5

This is the original format available from NSIDC, downloaded to the `nasa-cryo-scratch` bucket.

We will benchmark the following workflows:

1. Reading 1 full group (`h_ph`)
2. Spatially subsetting that group (tbd)

via the following libraries: h5py, h5coro and xarray

## Workflow 1 - Read full group

### Setup Steps

In [2]:
# List the original HDF5 files stored in the scratch bucket.
# If the AWS CLI is not installed, uncomment the next line to install it first:
# !mamba install -c conda-forge awscli -y
!aws s3 ls s3://nasa-cryo-scratch/h5cloud/original/

2023-08-08 23:45:34 7754735138 ATL03_20181120182818_08110112_006_02.h5
2023-08-08 23:47:04 6997123664 ATL03_20190219140808_08110212_006_02.h5
2023-08-08 23:47:04 6925710500 ATL03_20200217204710_08110612_006_01.h5
2023-08-08 23:47:04 8392279594 ATL03_20211114142614_08111312_006_01.h5
2023-08-08 23:47:04 7954039827 ATL03_20230211164520_08111812_006_01.h5


In [3]:
# Where the benchmark files live in S3 (bucket name and key prefix)
bucket, directory = 'nasa-cryo-scratch', 'h5cloud/original/'
# HDF5 group, and the dataset inside it, that every reader below extracts
group, variable = '/gt2l/heights', 'h_ph'

In [4]:
# Authenticated (non-anonymous) S3 filesystem shared by the readers below
s3 = s3fs.S3FileSystem(anon=False)

# Expand the wildcard to every object under the directory. The matches come
# back as "bucket/key" strings without the s3:// scheme (see the cell output).
s3path = f's3://{bucket}/{directory}*'
remote_files_no_protocol = s3.glob(s3path)

# The same paths with the s3:// scheme restored
remote_files = list(map(lambda path: f's3://{path}', remote_files_no_protocol))

# Display both lists for inspection
remote_files, remote_files_no_protocol

(['s3://nasa-cryo-scratch/h5cloud/original/ATL03_20181120182818_08110112_006_02.h5',
  's3://nasa-cryo-scratch/h5cloud/original/ATL03_20190219140808_08110212_006_02.h5',
  's3://nasa-cryo-scratch/h5cloud/original/ATL03_20200217204710_08110612_006_01.h5',
  's3://nasa-cryo-scratch/h5cloud/original/ATL03_20211114142614_08111312_006_01.h5',
  's3://nasa-cryo-scratch/h5cloud/original/ATL03_20230211164520_08111812_006_01.h5'],
 ['nasa-cryo-scratch/h5cloud/original/ATL03_20181120182818_08110112_006_02.h5',
  'nasa-cryo-scratch/h5cloud/original/ATL03_20190219140808_08110212_006_02.h5',
  'nasa-cryo-scratch/h5cloud/original/ATL03_20200217204710_08110612_006_01.h5',
  'nasa-cryo-scratch/h5cloud/original/ATL03_20211114142614_08111312_006_01.h5',
  'nasa-cryo-scratch/h5cloud/original/ATL03_20230211164520_08111812_006_01.h5'])

### Option 1a: Read the group with `h5py`

In [5]:
%%time
final_h5py_array = []
for file in remote_files:
    with h5py.File(s3.open(file, 'rb')) as f:
        data = f[f'{group}/{variable}'][:]
        # Need to test if using concatenate is faster
        final_h5py_array = np.insert(final_h5py_array, len(final_h5py_array), data, axis=None)

CPU times: user 14.6 s, sys: 3.59 s, total: 18.2 s
Wall time: 1min 2s


In [6]:
# Total number of values collected across all files by the h5py reader
len(final_h5py_array)

298271231

### Option 1b: Read the group with gedi subsetter (TBD)

### Option 2: Read the group with `xarray`

In [7]:
%%time
s3_fileset = [s3.open(file) for file in remote_files]
xrds = xr.open_mfdataset(s3_fileset, group=group, combine='by_coords', engine='h5netcdf')
final_xr_array = xrds['h_ph']

CPU times: user 2min, sys: 33.6 s, total: 2min 33s
Wall time: 7min 9s


In [8]:
# Number of values in the variable selected by the xarray reader
len(final_xr_array)

298271231

In [1]:
# Display the current local time; the saved output records when this cell ran.
from datetime import datetime
datetime.now()

datetime.datetime(2023, 8, 10, 20, 11, 58, 245306)

### Option 3: Read the group with `h5coro`

In [9]:
%load_ext autoreload
%autoreload 
from h5coro import h5coro, s3driver, filedriver

In [10]:
%%time
final_h5coro_array = []
for file in remote_files_no_protocol:
    h5obj = h5coro.H5Coro(file, s3driver.S3Driver)
    output = h5obj.readDatasets(datasets=[f'{group}/{variable}'], block=True)
    data = h5obj[f'{group}/{variable}'].values
    final_h5coro_array = np.insert(final_h5coro_array, len(final_h5coro_array), data, axis=None)    

CPU times: user 12 s, sys: 3.68 s, total: 15.7 s
Wall time: 1min 21s


In [11]:
# Total number of values collected across all files by the h5coro reader
len(final_h5coro_array)

298271231

# IGNORE - Code for looping through all fields we may want

In [13]:
# NOTE(review): running this cell raised KeyError: 'gt2l/heights/h_ph'.
# The h5coro cell reads f'{group}/{variable}', i.e. '/gt2l/heights/h_ph' with a
# leading slash, and reads only h_ph. Presumably the lookup key must match the
# path that was read, and lat_ph / lon_ph / delta_time must be read first —
# confirm before re-enabling this code.
# beams = [f"gt{beam}" for beam in ['1l', '1r', '2l', '2r', '3l', '3r']]
# fields = ['h_ph', 'lat_ph', 'lon_ph', 'delta_time']
# h_ph, lat_ph, lon_ph, delta_time = [h5obj[f'gt2l/heights/{field}'].values for field in fields]
# data_xr = xr.DataArray(h_ph, 
#                        coords={
#                            'lat': lat_ph,
#                            'lon': lon_ph,
#                            'delta_time': delta_time
#                        }, 
#                        dims=["lat_ph", "lon_ph", "delta_time"])

KeyError: 'gt2l/heights/h_ph'