In [None]:
import cProfile

import earthaccess
import h5py
import numpy as np
import pandas as pd
import s3fs
import tqdm
import xarray as xr

try:
    import h5coro
except:
    !mamba install -c conda-forge h5coro --yes
    import h5coro

try:
    from gedi_subset.h5frame import H5DataFrame
except ImportError:
    !pip install git+https://github.com/MAAP-Project/gedi-subsetter.git@0.6.0
    from gedi_subset.h5frame import H5DataFrame

from h5coro import h5coro, s3driver, filedriver
h5coro.config(errorChecking=True, verbose=False, enableAttributes=False)

# Hi!

This is a template for benchmarking 2 workflows.

# Benchmarking XX Format

We will benchmark the following workflows using the XX format.

[ADD A BIT OF RELEVANT INFORMATION ABOUT THAT FORMAT]

1. Reading 1 full variable (`h_ph`) of a group (`/gt2l/heights`)
2. Spatially subsetting that group (tbd)

via the following libraries: h5py (directly and through the gedi-subsetter `H5DataFrame`), xarray and h5coro

## Workflow 1 - Read full group

### Setup Steps

NOTE: below we use the `original` directory; replace it if you are working with a different dataset.

In [None]:
# List the files stored under the `original` prefix of the benchmark bucket.
# If the `aws` command is missing, uncomment the next line to install the AWS CLI:
# !mamba install -c conda-forge awscli -y
!aws s3 ls s3://nasa-cryo-scratch/h5cloud/original/

In [None]:
# Benchmark configuration: where the files live and what to read from them.
bucket = "nasa-cryo-scratch"      # S3 bucket name
directory = "h5cloud/original/"   # key prefix inside the bucket (with trailing slash)
group = "/gt2l/heights"           # HDF5 group path inside each file
variable = "h_ph"                 # dataset to read from that group

In [None]:
# Build the list of S3 objects that every benchmark below iterates over.
s3 = s3fs.S3FileSystem(anon=False)

# Glob everything under the configured prefix. The glob results are kept as-is
# in `remote_files_no_protocol` (consumed by the h5coro cell), and a second list
# with an explicit `s3://` prefix is derived from them for the other readers.
s3path = f's3://{bucket}/{directory}*'
remote_files_no_protocol = s3.glob(s3path)
remote_files = ['s3://' + path for path in remote_files_no_protocol]
remote_files, remote_files_no_protocol

### Option 1a: Read the group with `h5py`

In [None]:
%%time
final_h5py_array = []
for file in tqdm.tqdm(remote_files):
    with h5py.File(s3.open(file, 'rb')) as f:
        data = f[f'{group}/{variable}'][:]
        # Need to test if using concatenate is faster
        final_h5py_array = np.insert(final_h5py_array, len(final_h5py_array), data, axis=None)

In [None]:
# Sanity check: total number of values collected by the h5py loop.
len(final_h5py_array)

### Option 1b: Read the group with gedi subsetter

In [None]:
%%time
dataframes = []
for file in tqdm.tqdm(remote_files):
    with h5py.File(name=s3.open(file, 'rb')) as h5:
        df = H5DataFrame(h5[f"{group[1:]}"])
        dataframes.append(df[variable])
final_dataframe: pd.Series = pd.concat(objs=dataframes, axis="index")

In [None]:
# Sanity check: number of entries in the concatenated Series.
len(final_dataframe)

### Option 2: Read the group with `xarray`

In [None]:
%%time
s3_fileset = [s3.open(file) for file in remote_files]
xrds = xr.open_mfdataset(s3_fileset, group=group, combine='by_coords', engine='h5netcdf')
final_xr_array = xrds['h_ph']

In [None]:
# Sanity check: length of the selected xarray variable.
len(final_xr_array)

### Option 3: Read the group with `h5coro`

In [None]:
%%time
final_h5coro_array = []
for file in tqdm.tqdm(remote_files_no_protocol):
    h5obj = h5coro.H5Coro(file, s3driver.S3Driver)
    output = h5obj.readDatasets(datasets=[f'{group}/{variable}'], block=True)
    data = h5obj[f'{group}/{variable}'].values
    final_h5coro_array = np.insert(final_h5coro_array, len(final_h5coro_array), data, axis=None)

In [None]:
# Sanity check: total number of values collected by the h5coro loop.
len(final_h5coro_array)

## Workflow 2 - Spatially Subset

### Option 1: Spatially subset with `h5py`

### Option 2: Spatially subset with `xarray`

### Option 3: Spatially subset with `h5coro`