**1. List the bucket contents**

In [1]:
import os
import sys
from typing import Union, Dict, List

import boto3

import geopandas as gpd
import numpy as np

current = os.path.abspath('..')
sys.path.append(current)

#from helpers.dataset_lists import BEAM_GROUP
from helpers.links import S3Links, glob_s3bucket
import helpers.dataset_lists as dsl


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
s3 = boto3.resource('s3')

bucket_name = "nasa-cryo-scratch"
bucket = s3.Bucket(dsl.S3BUCKET.split("/")[2])

In [3]:
for my_bucket_object in bucket.objects.all():
    print(my_bucket_object)

s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='dzdt/utiaqvik.parquet')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/benchmark_results/2023-08-10-201833_H5pyArrLen_original_results.csv')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/benchmark_results/2023-08-10-202230_H5pyArrLen_h5repack_results.csv')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/benchmark_results/2023-08-10-202931_XarrayArrLen_original_results.csv')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/benchmark_results/2023-08-10-204033_XarrayArrLen_h5repack_results.csv')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/benchmark_results/2023-08-10-204827_H5CoroArrLen_original_results.csv')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/benchmark_results/2023-08-10-205013_H5CoroArrLen_h5repack_results.csv')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/benchmark_results/2023-08-10-212257_H5DataFrameArrLe

**2. Create lists of default and repacked hdf5 keys**

In [4]:
s3_link = S3Links()
s3_link.formats

['geoparquet',
 'h5repack',
 'kerchunk-original',
 'kerchunk-repacked',
 'original']

In [5]:
default_key = s3_link.get_links_by_format('original')
default_key = [k.split('s3://')[1] for k in default_key]
default_key

['nasa-cryo-scratch/h5cloud/original/ATL03_20181120182818_08110112_006_02.h5',
 'nasa-cryo-scratch/h5cloud/original/ATL03_20190219140808_08110212_006_02.h5',
 'nasa-cryo-scratch/h5cloud/original/ATL03_20200217204710_08110612_006_01.h5',
 'nasa-cryo-scratch/h5cloud/original/ATL03_20211114142614_08111312_006_01.h5',
 'nasa-cryo-scratch/h5cloud/original/ATL03_20230211164520_08111812_006_01.h5']

In [6]:
repack_key = s3_link.get_links_by_format('h5repack')
repack_key = [k.split('s3://')[1] for k in repack_key]
repack_key

['nasa-cryo-scratch/h5cloud/h5repack/ATL03_20181120182818_08110112_006_02_repacked.h5',
 'nasa-cryo-scratch/h5cloud/h5repack/ATL03_20190219140808_08110212_006_02_repacked.h5',
 'nasa-cryo-scratch/h5cloud/h5repack/ATL03_20200217204710_08110612_006_01_repacked.h5',
 'nasa-cryo-scratch/h5cloud/h5repack/ATL03_20211114142614_08111312_006_01_repacked.h5',
 'nasa-cryo-scratch/h5cloud/h5repack/ATL03_20230211164520_08111812_006_01_repacked.h5']

**Access data with h5coro**

Code taken from : [https://github.com/ICESat2-SlideRule/h5coro/](https://github.com/ICESat2-SlideRule/h5coro/)

In [7]:
try: 
    from h5coro import h5coro, s3driver, filedriver
except:
    !mamba install -c conda-forge h5coro --yes
    from h5coro import h5coro, s3driver, filedriver

In [8]:
datasets = [dsl.ONE_BEAM_GROUP[3]]
datasets

['gt1l/heights/h_ph']

In [9]:
def h5coro_load_subset_aoi(s3_uri: str, datasets: List[Union[str, Dict]], bounds: List=None):
    """
    Parameters:
      s3_uri: string path to hdf5 dataset in an S3 bucket
      datasets: list of string dataset names or list of dataset Dicts
      bounds: (Optional) list of bounding box coordinates [llx, lly, urx, ury]
      
    Returns: list of value array lengths for read datasets (not including lat/lon unless passed in)
    """
    h5coro.config(errorChecking=True, verbose=False, enableAttributes=False)
    
    # find indices based on AOI bounds
    if bounds:
        h5obj = h5coro.H5Coro(s3_uri, s3driver.S3Driver)
        h5obj.readDatasets(datasets=[dsl.ONE_BEAM_GROUP[4], dsl.ONE_BEAM_GROUP[5]], block=True)
        lat = h5obj[dsl.ONE_BEAM_GROUP[4]].values
        lon = h5obj[dsl.ONE_BEAM_GROUP[5]].values

        ph_in_aoi = np.where((lat > bounds[1]) & (lat < bounds[3]) \
                             & (lon > bounds[0]) & (lon < bounds[2]))[0]

        idx_start = ph_in_aoi[0]
        idx_end = ph_in_aoi[-1]

        # access datasets
        datasets = [{"dataset": k, 'startrow': idx_start, 'numrows': idx_end} for k in datasets]
        
    h5obj = h5coro.H5Coro(s3_uri, s3driver.S3Driver)   
    h5obj.readDatasets(datasets=datasets, block=True)
    
    # return list of value array lengths for each dataset
    return [len(h5obj[ds].values) for ds in h5obj.keys()]

In [10]:
# read in the area of interest geojson
aoi = gpd.read_file('/home/jovyan/h5cloud/notebooks/antarctic_aoi.geojson', crs='EPSG:4326')
bounds = [v for v in aoi.bounds.values[0]] 


In [11]:
subset_test = h5coro_load_subset_aoi(default_key[0], datasets, bounds)
subset_test

[13227693]

In [12]:
no_subset_test = h5coro_load_subset_aoi(default_key[0], datasets)
no_subset_test

[46484912]

In [None]:
# default_benchmark = {}
# for key in default_key:
#     print(key)
#     default_benchmark[key] = %timeit -n2 -r2 -o h5coro_load(key, datasets)

In [None]:
# default_benchmark

In [None]:
# repack_benchmark = {}
# for key in repack_key:
#     print(key)
#     repack_benchmark[key] = %timeit -n2 -r2 -o h5coro_load(key, datasets)