# Compare `h5py` read with `xarray`

`xarray` does quite a lot under the hood.  When it reads an HDF5 dataset, it also reads the dimension and coordinate variables, and the attributes.  `h5py` can do this too, but each step needs to be coded explicitly.  Currently, `h5coro` reads just the data: none of the attributes or metadata associated with a dataset are read.

In this notebook, I benchmark an xarray-type read using `h5py` against `xarray.open_dataset`.  

**TODO:** I should probably add an `h5coro` example as well.

In [21]:
import earthaccess

import h5py
import xarray as xr

from dataset_lists import BEAM_GROUP, XARRAY_LIKE_ONE_DATASET

from pprint import pprint

In [None]:
# Authenticate through earthaccess; the returned object is kept in `auth`.
auth = earthaccess.login()

In [3]:
more s3files.txt

2023-08-09 05:30:04 7760000000 h5cloud/h5repack/ATL03_20181120182818_08110112_006_02_repacked.h5
2023-08-09 05:30:04 7008000000 h5cloud/h5repack/ATL03_20190219140808_08110212_006_02_repacked.h5
2023-08-09 05:30:04 6936000000 h5cloud/h5repack/ATL03_20200217204710_08110612_006_01_repacked.h5
2023-08-09 05:30:04 8400000000 h5cloud/h5repack/ATL03_20211114142614_08111312_006_01_repacked.h5
2023-08-09 05:30:04 7960000000 h5cloud/h5repack/ATL03_20230211164520_08111812_006_01_repacked.h5
2023-08-08 23:45:34 7754735138 h5cloud/original/ATL03_20181120182818_08110112_006_02.h5
2023-08-08 23:47:04 6997123664 h5cloud/original/ATL03_20190219140808_08110212_006_02.h5
2023-08-08 23:47:04 6925710500 h5cloud/original/ATL03_20200217204710_08110612_006_01.h5
2023-08-08 23:47:04 8392279594 h5cloud/original/ATL03_20211114142614_08111312_006_01.h5
2023-08-08 23:47:04 7954039827 h5cloud/original/ATL03_20230211164520_08111812_006_01.h5


In [106]:
import json
from pathlib import Path

# Location of the JSON file holding the S3 link table.  The path is relative,
# so it is resolved against the working directory the kernel runs in.
S3FILELINKS = Path("../s3filelinks.json")

class S3Links:
    """Lookup helper for the S3 links stored in the ``S3FILELINKS`` JSON file.

    The file is read once, at construction, into ``self.table``: a mapping
    whose keys are file formats and whose values are mappings whose values
    are the links.  Every lookup method prints a short message and returns
    ``None`` when the requested item does not exist, rather than raising.

    Attributes
    ----------
    json_file : the path the table was loaded from (``S3FILELINKS``).
    table     : the decoded JSON object.
    formats   : list of the top-level keys of ``table``.
    """

    def __init__(self):
        self.json_file = S3FILELINKS
        self.table = load_s3testfile(self.json_file)
        self.formats = list(self.table.keys())

    def get_links_by_format(self, file_format):
        """Return the list of links stored under ``file_format``.

        Returns ``None`` (after printing the known formats) when
        ``file_format`` is not a key of the table.
        """
        try:
            return list(self.table[file_format].values())
        except KeyError:
            print(f"Unknown file_format.  Expects one of {self.formats}")
            return None

    def get_link_by_name(self, name):
        """Return the link whose last path component equals ``name``.

        Returns ``None`` (after printing a message) when no link has that
        name.
        """
        try:
            return self.dict_by_name()[name]
        # Only a missing key means "not found"; anything else is a real error
        # and must not be silently reported as a missing name.
        except KeyError:
            print(f"{name} not found in self.table")
            return None

    def get_link_by_fileid(self, file_format, fileid):
        """Return the link at position ``fileid`` among those of ``file_format``.

        ``fileid`` indexes the list returned by ``get_links_by_format``, so
        negative indices count from the end.  Returns ``None`` when the format
        is unknown or the index is out of range.
        """
        links = self.get_links_by_format(file_format)
        if links is None:
            # Unknown format: get_links_by_format has already printed why.
            return None
        try:
            return links[fileid]
        except IndexError:
            print(f"fileid {fileid} is out of range for {file_format} ({len(links)} links)")
            return None

    def dict_by_name(self):
        """Return a ``{name: link}`` dict covering every format in the table.

        The dict is rebuilt on each call.  If two links share a name, the one
        encountered last wins.
        """
        return dict(
            make_entry(link)
            for links_of_format in self.table.values()
            for link in links_of_format.values()
        )
    
    
def load_s3testfile(file_format, filename=None, fileid=None):
    """Read the JSON file at ``S3FILELINKS`` and return the decoded object.

    NOTE(review): ``file_format``, ``filename`` and ``fileid`` are accepted
    but never used; the function always reads ``S3FILELINKS`` in full.
    Confirm whether they were meant to select a single entry.
    """
    with open(S3FILELINKS, 'r') as json_stream:
        return json.load(json_stream)

def make_entry(path):
    """Return ``(name, path)``, where ``name`` is the text after the last '/'.

    When ``path`` contains no '/', ``name`` is the whole string.
    """
    _, _, basename = path.rpartition('/')
    return basename, path

In [107]:
# Build the lookup helper; its constructor reads the JSON link table.
s3links = S3Links()

In [111]:
# Top-level keys of the link table, i.e. the available file formats.
s3links.formats

['h5repack', 'original']

In [108]:
# Every link stored under the 'h5repack' format.
s3links.get_links_by_format('h5repack')

['h5cloud/h5repack/ATL03_20181120182818_08110112_006_02_repacked.h5',
 'h5cloud/h5repack/ATL03_20190219140808_08110212_006_02_repacked.h5',
 'h5cloud/h5repack/ATL03_20200217204710_08110612_006_01_repacked.h5',
 'h5cloud/h5repack/ATL03_20211114142614_08111312_006_01_repacked.h5',
 'h5cloud/h5repack/ATL03_20230211164520_08111812_006_01_repacked.h5']

In [110]:
# Look a link up by its file name (the last component of the link path).
s3links.get_link_by_name('ATL03_20181120182818_08110112_006_02_repacked.h5')

'h5cloud/h5repack/ATL03_20181120182818_08110112_006_02_repacked.h5'

In [112]:
# Link at position 0 among the 'original' links.
s3links.get_link_by_fileid('original', 0)

'h5cloud/original/ATL03_20181120182818_08110112_006_02.h5'