**1. List the bucket contents**

In [1]:
import boto3
import links
from pprint import pprint

import dataset_lists as dsl

In [2]:
# Handle to the S3 service; credentials are resolved by boto3's default chain.
s3 = boto3.resource('s3')

# Single source of truth for the bucket name: take it from the shared
# `dsl.S3BUCKET` setting instead of repeating it as a separate literal that
# can drift out of sync.
# NOTE(review): assumes S3BUCKET has the form "s3://<bucket>/..." so that
# element 2 of the "/"-split is the bucket name — confirm in dataset_lists.
bucket_name = dsl.S3BUCKET.split("/")[2]
bucket = s3.Bucket(bucket_name)

In [3]:
# Walk every object in the bucket and print its ObjectSummary, one per line.
for obj_summary in bucket.objects.all():
    print(obj_summary)

s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='dzdt/utiaqvik.parquet')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/geoparquet/ATL03_20181120182818_08110112_006_02.h5.gpq')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/geoparquet/ATL03_20190219140808_08110212_006_02.h5.gpq')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/geoparquet/ATL03_20200217204710_08110612_006_01.h5.gpq')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/geoparquet/ATL03_20211114142614_08111312_006_01.h5.gpq')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/geoparquet/ATL03_20230211164520_08111812_006_01.h5.gpq')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/h5repack/ATL03_20181120182818_08110112_006_02_repacked.h5')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/h5repack/ATL03_20190219140808_08110212_006_02_repacked.h5')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/h5repack/ATL

**2. Create lists of default and repacked HDF5 keys**

In [4]:
# Instantiate the S3Links helper from the local `links` module. Despite the
# name, this one object is queried below for both the 'original' and the
# 'h5repack' keys.
repack_links = links.S3Links()
# Bare last expression so the notebook displays the `formats` attribute.
repack_links.formats

['h5repack', 'original']

In [5]:
# Keys of the unmodified files (format 'original'). Note that, despite the
# singular name, `default_key` is iterated as a collection of keys later on.
default_key = repack_links.get_links_by_format('original')
# Display the keys for inspection.
default_key

['h5cloud/original/ATL03_20181120182818_08110112_006_02.h5',
 'h5cloud/original/ATL03_20190219140808_08110212_006_02.h5',
 'h5cloud/original/ATL03_20200217204710_08110612_006_01.h5',
 'h5cloud/original/ATL03_20211114142614_08111312_006_01.h5',
 'h5cloud/original/ATL03_20230211164520_08111812_006_01.h5']

In [6]:
# Keys of the repacked files (format 'h5repack'). As with `default_key`,
# this is a collection of keys that the benchmark cell iterates over.
repack_key = repack_links.get_links_by_format('h5repack')
# Display the keys for inspection.
repack_key

['h5cloud/h5repack/ATL03_20181120182818_08110112_006_02_repacked.h5',
 'h5cloud/h5repack/ATL03_20190219140808_08110212_006_02_repacked.h5',
 'h5cloud/h5repack/ATL03_20200217204710_08110612_006_01_repacked.h5',
 'h5cloud/h5repack/ATL03_20211114142614_08111312_006_01_repacked.h5',
 'h5cloud/h5repack/ATL03_20230211164520_08111812_006_01_repacked.h5']

**3. Access data with h5coro**

Code taken from: [https://github.com/ICESat2-SlideRule/h5coro/](https://github.com/ICESat2-SlideRule/h5coro/)

In [7]:
# (1) import
try: 
    from h5coro import h5coro, s3driver, filedriver
except:
    !mamba install -c conda-forge h5coro --yes
    from h5coro import h5coro, s3driver, filedriver

# (2) configure
h5coro.config(errorChecking=True, verbose=False, enableAttributes=False)

In [8]:
# Preview the dataset path (entry 3 of dsl.ONE_BEAM_GROUP) that the benchmark
# cells pass to readDatasets.
dsl.ONE_BEAM_GROUP[3]

'gt1l/heights/h_ph'

In [9]:
# determine appropriate `n` and `r` for timeit

default_benchmarks = {}
for key in default_key:
    print(key)
    default_h5obj = h5coro.H5Coro(f'{dsl.S3BUCKET.split("/")[2]}/{key}', s3driver.S3Driver)
    default_benchmarks[key] = %timeit -n2 -r2 -o default_h5obj.readDatasets(datasets=[dsl.ONE_BEAM_GROUP[3]], block=True)

h5cloud/original/ATL03_20181120182818_08110112_006_02.h5
The slowest run took 6.14 times longer than the fastest. This could mean that an intermediate result is being cached.
4.44 s ± 3.2 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
h5cloud/original/ATL03_20190219140808_08110212_006_02.h5
The slowest run took 5.63 times longer than the fastest. This could mean that an intermediate result is being cached.
6.61 s ± 4.62 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
h5cloud/original/ATL03_20200217204710_08110612_006_01.h5
The slowest run took 4.97 times longer than the fastest. This could mean that an intermediate result is being cached.
3.58 s ± 2.38 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
h5cloud/original/ATL03_20211114142614_08111312_006_01.h5
The slowest run took 6.16 times longer than the fastest. This could mean that an intermediate result is being cached.
8.28 s ± 5.96 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
h5cloud/original/ATL03_202302

In [10]:
# Display the TimeitResult recorded for each original-file key.
default_benchmarks

{'h5cloud/original/ATL03_20181120182818_08110112_006_02.h5': <TimeitResult : 4.44 s ± 3.2 s per loop (mean ± std. dev. of 2 runs, 2 loops each)>,
 'h5cloud/original/ATL03_20190219140808_08110212_006_02.h5': <TimeitResult : 6.61 s ± 4.62 s per loop (mean ± std. dev. of 2 runs, 2 loops each)>,
 'h5cloud/original/ATL03_20200217204710_08110612_006_01.h5': <TimeitResult : 3.58 s ± 2.38 s per loop (mean ± std. dev. of 2 runs, 2 loops each)>,
 'h5cloud/original/ATL03_20211114142614_08111312_006_01.h5': <TimeitResult : 8.28 s ± 5.96 s per loop (mean ± std. dev. of 2 runs, 2 loops each)>,
 'h5cloud/original/ATL03_20230211164520_08111812_006_01.h5': <TimeitResult : 8.11 s ± 5.89 s per loop (mean ± std. dev. of 2 runs, 2 loops each)>}

In [11]:
# determine appropriate `n` and `r` for timeit

repack_benchmarks = {}
for key in repack_key:
    repack_h5obj = h5coro.H5Coro(f'{dsl.S3BUCKET.split("/")[2]}/{key}', s3driver.S3Driver)
    repack_benchmarks[key] = %timeit -n2 -r2 -o repack_h5obj.readDatasets(datasets=[dsl.ONE_BEAM_GROUP[3]], block=True)

The slowest run took 5.98 times longer than the fastest. This could mean that an intermediate result is being cached.
4.33 s ± 3.09 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
The slowest run took 6.03 times longer than the fastest. This could mean that an intermediate result is being cached.
6.97 s ± 4.99 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
The slowest run took 6.33 times longer than the fastest. This could mean that an intermediate result is being cached.
4.62 s ± 3.36 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
The slowest run took 5.96 times longer than the fastest. This could mean that an intermediate result is being cached.
8.06 s ± 5.74 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
The slowest run took 6.53 times longer than the fastest. This could mean that an intermediate result is being cached.
8.59 s ± 6.31 s per loop (mean ± std. dev. of 2 runs, 2 loops each)


In [12]:
# Display the TimeitResult recorded for each repacked-file key.
repack_benchmarks

{'h5cloud/h5repack/ATL03_20181120182818_08110112_006_02_repacked.h5': <TimeitResult : 4.33 s ± 3.09 s per loop (mean ± std. dev. of 2 runs, 2 loops each)>,
 'h5cloud/h5repack/ATL03_20190219140808_08110212_006_02_repacked.h5': <TimeitResult : 6.97 s ± 4.99 s per loop (mean ± std. dev. of 2 runs, 2 loops each)>,
 'h5cloud/h5repack/ATL03_20200217204710_08110612_006_01_repacked.h5': <TimeitResult : 4.62 s ± 3.36 s per loop (mean ± std. dev. of 2 runs, 2 loops each)>,
 'h5cloud/h5repack/ATL03_20211114142614_08111312_006_01_repacked.h5': <TimeitResult : 8.06 s ± 5.74 s per loop (mean ± std. dev. of 2 runs, 2 loops each)>,
 'h5cloud/h5repack/ATL03_20230211164520_08111812_006_01_repacked.h5': <TimeitResult : 8.59 s ± 6.31 s per loop (mean ± std. dev. of 2 runs, 2 loops each)>}