In [1]:
import cProfile
import earthaccess
import h5py
import numpy as np
import pandas as pd
import s3fs
import tqdm
import xarray as xr

try:
    import h5coro
except:
    !pip install -U git+https://github.com/ICESat2-SlideRule/h5coro.git@8035f01c4b793313496e246870a53f40056407c8
    import h5coro

try:
    from gedi_subset.h5frame import H5DataFrame
except ImportError:
    !pip install git+https://github.com/MAAP-Project/gedi-subsetter.git@0.6.0
    from gedi_subset.h5frame import H5DataFrame

from h5coro import h5coro, s3driver, filedriver
h5coro.config(errorChecking=True, verbose=False, enableAttributes=False)

# Hi!

# Benchmarking h5repack Format

This is an HDF5 format that has been rechunked from the original format distributed by NSIDC;
the rechunking was done by Luis Lopez.

We will benchmark the following workflows:

1. Reading one full dataset (`h_ph`, inside the `/gt2l/heights` group)
2. Spatially subsetting that group (tbd)

via the following libraries: h5py (directly and through the gedi-subsetter `H5DataFrame`), xarray and h5coro

## Workflow 1 - Read full group

### Setup Steps

NOTE: below we use the `h5repack` directory; replace it if you are working with a different dataset.

In [2]:
# Check out the files
# (uncomment the next line if the `aws` CLI is not installed in this environment)
# !mamba install -c conda-forge awscli -y
# List the objects under the h5repack prefix.
# NOTE: the path is hard-coded here; keep it in sync with `bucket`/`directory`
# defined in the next cell.
!aws s3 ls s3://nasa-cryo-scratch/h5cloud/h5repack/

2023-08-09 05:30:04 7760000000 ATL03_20181120182818_08110112_006_02_repacked.h5
2023-08-09 05:30:04 7008000000 ATL03_20190219140808_08110212_006_02_repacked.h5
2023-08-09 05:30:04 6936000000 ATL03_20200217204710_08110612_006_01_repacked.h5
2023-08-09 05:30:04 8400000000 ATL03_20211114142614_08111312_006_01_repacked.h5
2023-08-09 05:30:04 7960000000 ATL03_20230211164520_08111812_006_01_repacked.h5


In [3]:
# Benchmark configuration.
# Where the files live on S3: `s3://{bucket}/{directory}`.
bucket, directory = "nasa-cryo-scratch", "h5cloud/h5repack/"
# Which HDF5 dataset is read from each file: `{group}/{variable}`.
group, variable = "/gt2l/heights", "h_ph"

In [4]:
# Authenticated (non-anonymous) S3 filesystem handle; the read cells below reuse it.
s3 = s3fs.S3FileSystem(anon=False)

# Glob every object under the configured prefix. The glob result holds the
# object paths without the "s3://" scheme (see the second list in the output).
s3path = f's3://{bucket}/{directory}*'
remote_files_no_protocol = s3.glob(s3path)
# The same paths with the scheme prepended.
remote_files = ['s3://' + key for key in remote_files_no_protocol]
remote_files, remote_files_no_protocol

(['s3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20181120182818_08110112_006_02_repacked.h5',
  's3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20190219140808_08110212_006_02_repacked.h5',
  's3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20200217204710_08110612_006_01_repacked.h5',
  's3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20211114142614_08111312_006_01_repacked.h5',
  's3://nasa-cryo-scratch/h5cloud/h5repack/ATL03_20230211164520_08111812_006_01_repacked.h5'],
 ['nasa-cryo-scratch/h5cloud/h5repack/ATL03_20181120182818_08110112_006_02_repacked.h5',
  'nasa-cryo-scratch/h5cloud/h5repack/ATL03_20190219140808_08110212_006_02_repacked.h5',
  'nasa-cryo-scratch/h5cloud/h5repack/ATL03_20200217204710_08110612_006_01_repacked.h5',
  'nasa-cryo-scratch/h5cloud/h5repack/ATL03_20211114142614_08111312_006_01_repacked.h5',
  'nasa-cryo-scratch/h5cloud/h5repack/ATL03_20230211164520_08111812_006_01_repacked.h5'])

### Option 1a: Read the group with `h5py`

In [5]:
%%time
final_h5py_array = []
for file in tqdm.tqdm(remote_files):
    with h5py.File(s3.open(file, 'rb')) as f:
        data = f[f'{group}/{variable}'][:]
        # Need to test if using concatenate is faster
        final_h5py_array = np.insert(final_h5py_array, len(final_h5py_array), data, axis=None)

100%|██████████| 5/5 [03:07<00:00, 37.52s/it]

CPU times: user 30.1 s, sys: 7.16 s, total: 37.2 s
Wall time: 3min 7s





In [6]:
# Sanity check: number of values read via h5py (compare with the other options).
len(final_h5py_array)

298271231

### Option 1b: Read the group with gedi subsetter

In [7]:
%%time
dataframes = []
for file in tqdm.tqdm(remote_files):
    with h5py.File(name=s3.open(file, 'rb')) as h5:
        df = H5DataFrame(h5[f"{group[1:]}"])
        dataframes.append(df[variable])
final_dataframe: pd.Series = pd.concat(objs=dataframes, axis="index")

100%|██████████| 5/5 [02:15<00:00, 27.02s/it]


CPU times: user 39.3 s, sys: 13.5 s, total: 52.8 s
Wall time: 2min 16s


In [8]:
# Sanity check: number of values read via gedi-subsetter (compare with the other options).
len(final_dataframe)

298271231

### Option 2: Read the group with `xarray`

In [9]:
%%time
s3_fileset = [s3.open(file) for file in remote_files]
xrds = xr.open_mfdataset(s3_fileset, group=group, combine='by_coords', engine='h5netcdf')
final_xr_array = xrds['h_ph']

CPU times: user 2min 22s, sys: 38.5 s, total: 3min 1s
Wall time: 7min 59s


In [10]:
# Sanity check: length of the xarray result (compare with the other options).
len(final_xr_array)

298271231

### Option 3: Read the group with `h5coro`

In [11]:
%%time
final_h5coro_array = []
for file in tqdm.tqdm(remote_files_no_protocol):
    h5obj = h5coro.H5Coro(file, s3driver.S3Driver)
    output = h5obj.readDatasets(datasets=[f'{group}/{variable}'], block=True)
    data = h5obj[f'{group}/{variable}'].values
    final_h5coro_array = np.insert(final_h5coro_array, len(final_h5coro_array), data, axis=None)

100%|██████████| 5/5 [01:12<00:00, 14.50s/it]

CPU times: user 11.4 s, sys: 3.94 s, total: 15.3 s
Wall time: 1min 12s





In [12]:
# Sanity check: number of values read via h5coro (compare with the other options).
len(final_h5coro_array)

298271231

## Workflow 2 - Spatially Subset

### Option 1: Spatially subset with `h5py`

### Option 2: Spatially subset with `xarray`

### Option 3: Spatially subset with `h5coro`