**1. List the bucket contents**

In [1]:
import boto3

import dataset_lists as dsl

In [2]:
s3 = boto3.resource('s3')

# Derive the bucket name from the shared dataset config instead of repeating
# it as a literal, so it cannot drift from the value the benchmark cells use.
# The name is the third "/"-separated field of dsl.S3BUCKET; the recorded
# listing output shows it resolves to "nasa-cryo-scratch".
bucket_name = dsl.S3BUCKET.split("/")[2]
bucket = s3.Bucket(bucket_name)

In [3]:
# Walk every object in the bucket and print its ObjectSummary repr.
for object_summary in bucket.objects.all():
    print(object_summary)

s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='dzdt/utiaqvik.parquet')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/h5repack/ATL03_20181120182818_08110112_006_02_repacked.h5')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/h5repack/ATL03_20190219140808_08110212_006_02_repacked.h5')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/h5repack/ATL03_20200217204710_08110612_006_01_repacked.h5')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/h5repack/ATL03_20211114142614_08111312_006_01_repacked.h5')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/h5repack/ATL03_20230211164520_08111812_006_01_repacked.h5')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/original/ATL03_20181120182818_08110112_006_02.h5')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/original/ATL03_20190219140808_08110212_006_02.h5')
s3.ObjectSummary(bucket_name='nasa-cryo-scratch', key='h5cloud/original/ATL03_

**2. Create lists of default and repacked HDF5 keys**

In [4]:
# Keys of the unmodified files: every object whose key contains 'original'.
default_key = [
    obj.key
    for obj in bucket.objects.all()
    if 'original' in obj.key
]
default_key

['h5cloud/original/ATL03_20181120182818_08110112_006_02.h5',
 'h5cloud/original/ATL03_20190219140808_08110212_006_02.h5',
 'h5cloud/original/ATL03_20200217204710_08110612_006_01.h5',
 'h5cloud/original/ATL03_20211114142614_08111312_006_01.h5',
 'h5cloud/original/ATL03_20230211164520_08111812_006_01.h5']

In [5]:
# Keys of the repacked files: every object whose key contains 'h5repack'.
repack_key = [
    obj.key
    for obj in bucket.objects.all()
    if 'h5repack' in obj.key
]
repack_key

['h5cloud/h5repack/ATL03_20181120182818_08110112_006_02_repacked.h5',
 'h5cloud/h5repack/ATL03_20190219140808_08110212_006_02_repacked.h5',
 'h5cloud/h5repack/ATL03_20200217204710_08110612_006_01_repacked.h5',
 'h5cloud/h5repack/ATL03_20211114142614_08111312_006_01_repacked.h5',
 'h5cloud/h5repack/ATL03_20230211164520_08111812_006_01_repacked.h5']

In [6]:
import links

**3. Access data with h5coro**

Code taken from: [https://github.com/ICESat2-SlideRule/h5coro/](https://github.com/ICESat2-SlideRule/h5coro/)

In [8]:
# (1) import
try: 
    from h5coro import h5coro, s3driver, filedriver
except:
    !mamba install -c conda-forge h5coro --yes
    from h5coro import h5coro, s3driver, filedriver

# (2) configure
h5coro.config(errorChecking=True, verbose=False, enableAttributes=False)

In [9]:
# Show the dataset path at index 3 of ONE_BEAM_GROUP; the benchmark cells
# below read this same entry.
dsl.ONE_BEAM_GROUP[3]

'gt1l/heights/h_ph'

In [18]:
# need to increase `n` and `r` in timeit

default_benchmarks = {}
for key in default_key:
    print(key)
    default_h5obj = h5coro.H5Coro(f'{dsl.S3BUCKET.split("/")[2]}/{key}', s3driver.S3Driver)
    default_benchmarks[key] = %timeit -n2 -r2 -o default_h5obj.readDatasets(datasets=[dsl.ONE_BEAM_GROUP[3]], block=True)

h5cloud/original/ATL03_20181120182818_08110112_006_02.h5
The slowest run took 5.57 times longer than the fastest. This could mean that an intermediate result is being cached.
4.14 s ± 2.88 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
h5cloud/original/ATL03_20190219140808_08110212_006_02.h5
The slowest run took 5.08 times longer than the fastest. This could mean that an intermediate result is being cached.
6.08 s ± 4.08 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
h5cloud/original/ATL03_20200217204710_08110612_006_01.h5
The slowest run took 5.68 times longer than the fastest. This could mean that an intermediate result is being cached.
4.02 s ± 2.82 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
h5cloud/original/ATL03_20211114142614_08111312_006_01.h5
The slowest run took 5.49 times longer than the fastest. This could mean that an intermediate result is being cached.
7.65 s ± 5.29 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
h5cloud/original/ATL03_20230

In [16]:
# Display the TimeitResult stored for each original-file key.
default_benchmarks

{'h5cloud/original/ATL03_20181120182818_08110112_006_02.h5': <TimeitResult : 4.19 s ± 2.95 s per loop (mean ± std. dev. of 2 runs, 2 loops each)>,
 'h5cloud/original/ATL03_20190219140808_08110212_006_02.h5': <TimeitResult : 6.34 s ± 4.35 s per loop (mean ± std. dev. of 2 runs, 2 loops each)>,
 'h5cloud/original/ATL03_20200217204710_08110612_006_01.h5': <TimeitResult : 4.42 s ± 3.22 s per loop (mean ± std. dev. of 2 runs, 2 loops each)>,
 'h5cloud/original/ATL03_20211114142614_08111312_006_01.h5': <TimeitResult : 7.99 s ± 5.63 s per loop (mean ± std. dev. of 2 runs, 2 loops each)>,
 'h5cloud/original/ATL03_20230211164520_08111812_006_01.h5': <TimeitResult : 7.55 s ± 5.29 s per loop (mean ± std. dev. of 2 runs, 2 loops each)>}

In [17]:
# need to increase `n` and `r` in timeit

repack_benchmarks = {}
for key in repack_key:
    repack_h5obj = h5coro.H5Coro(f'{dsl.S3BUCKET.split("/")[2]}/{key}', s3driver.S3Driver)
    repack_benchmarks[key] = %timeit -n2 -r2 -o repack_h5obj.readDatasets(datasets=[dsl.ONE_BEAM_GROUP[3]], block=True)

The slowest run took 5.91 times longer than the fastest. This could mean that an intermediate result is being cached.
4.29 s ± 3.05 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
The slowest run took 5.85 times longer than the fastest. This could mean that an intermediate result is being cached.
6.94 s ± 4.91 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
The slowest run took 5.73 times longer than the fastest. This could mean that an intermediate result is being cached.
4.09 s ± 2.87 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
The slowest run took 5.42 times longer than the fastest. This could mean that an intermediate result is being cached.
7.48 s ± 5.15 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
The slowest run took 6.12 times longer than the fastest. This could mean that an intermediate result is being cached.
7.96 s ± 5.72 s per loop (mean ± std. dev. of 2 runs, 2 loops each)


In [None]:
# Display the TimeitResult stored for each repacked-file key.
repack_benchmarks