# H5PY with Subsetting Benchmarking

Note: reading lat/lon currently uses the full photon-resolution data for simplicity. This could probably be improved by an order of magnitude by subsetting at the segment level, using the ref_photon_lat/lon or ph_index_beg parameters from ATL03.

In [None]:
import cProfile
import numpy as np
import s3fs
import xarray as xr
import h5py
import boto3
import os
import geopandas as gpd


In [None]:
# !mamba install -c conda-forge awscli -y

## Setting up Data

In [None]:
# Take a look at the files in our S3 bucket

s3_r = boto3.resource('s3')

bucket_name = "nasa-cryo-scratch"

bucket = s3_r.Bucket(bucket_name)

# Materialize the listing into a list before printing it
objects = list(bucket.objects.all())

for my_bucket_object in objects:
    print(my_bucket_object)

In [None]:
# NOTE: `s3` is rebound to an s3fs.S3FileSystem in a later cell; the read
# functions below call `s3.open(...)`, so they presumably rely on that later binding.
s3 = boto3.resource('s3')

In [None]:
# Checkout the files
!aws s3 ls s3://nasa-cryo-scratch/h5cloud/original/

In [None]:
# Set the variables
# NOTE: `bucket` was bound to a boto3 Bucket resource in an earlier cell; it is
# rebound to a plain string here.
bucket = 'nasa-cryo-scratch'
directory = 'h5cloud/original/'

# Create the s3fs filesystem used for globbing here and for opening files in the read functions
s3 = s3fs.S3FileSystem(anon=False)

# Glob the directory and prefix each result with 's3://' to get a list of URL strings
s3path_original = f's3://{bucket}/{directory}*'
remote_files_original = [f's3://{path}' for path in s3.glob(s3path_original)]

# NOTE(review): this pattern is built from the same `directory` as the original
# files above, so `remote_files_repacked` lists the same objects as
# `remote_files_original`. Presumably it should point at the directory holding
# the repacked files instead -- confirm the correct prefix.
s3path_repacked = f's3://{bucket}/{directory}*'
remote_files_repacked = [f's3://{path}' for path in s3.glob(s3path_repacked)]

remote_files_original

# Workflow 1 - Read Data H5PY

In [None]:
group = '/gt2l/heights'

# variables = [
#     "delta_time", "dist_ph_across", "dist_ph_along", "h_ph", "lat_ph", "lon_ph", "pce_mframe_cnt", "ph_id_channel","ph_id_count", "ph_id_pulse", "quality_ph","signal_conf_ph",
# ]

# Using the full list of variables takes a very long time to load for all files (~8 min on the hackweek JupyterHub),
# so for now only lat, lon, and heights are listed here.
# NOTE(review): the timing cells further down call the test functions without
# passing `group` or `variables`, so the functions' own defaults (the full
# variable list) are what actually gets benchmarked -- confirm this is intended.

variables = ['lat_ph', 'lon_ph', 'h_ph']


In [None]:
def read_h5py(file, group, variables, verbose=False):
    """Read whole datasets from one HDF5 file through the global ``s3`` filesystem.

    Parameters
    ----------
    file : str
        Path/URL handed to ``s3.open``.
    group : str
        HDF5 group that contains the datasets, e.g. ``'/gt2l/heights'``.
    variables : iterable of str
        Names of the datasets to read from ``group``.
    verbose : bool, optional
        Print a progress line for the file and for each variable.

    Returns
    -------
    list
        One fully-read dataset per entry in ``variables``, in the same order.
    """
    if verbose:
        print(f'opening {file}...')

    # The S3 file object gets its own context manager so it is closed
    # deterministically; closing the h5py.File alone is not relied on to
    # close the file-like object it was handed.
    with s3.open(file, 'rb') as s3_file, h5py.File(s3_file, 'r') as f:

        group_data = []

        for variable in variables:

            if verbose:
                print(f'... reading {variable}')

            # [:] reads the entire dataset into memory while the file is open
            group_data.append(f[f'{group}/{variable}'][:])

    return group_data

In [None]:
def test_h5py(files, group='/gt2l/heights', 
              variables=["delta_time", "dist_ph_across", "dist_ph_along",
                         "h_ph", "lat_ph", "lon_ph", 
                         "pce_mframe_cnt", "ph_id_channel",
                         "ph_id_count", "ph_id_pulse", 
                         "quality_ph","signal_conf_ph"],
             verbose=False):
    """Read the requested variables from every file in ``files``.

    Each file is passed to ``read_h5py`` together with ``group``,
    ``variables`` and ``verbose``.

    Returns
    -------
    list
        One ``read_h5py`` result per file, in the order of ``files``.
    """
    # The default `variables` list is only read here, never modified.
    return [read_h5py(path, group, variables, verbose) for path in files]


# Workflow 2 - Spatially Subset H5PY

In [None]:
# read in the area of interest geojson
# NOTE(review): the absolute path below presumably only resolves in the
# environment this notebook was written for -- adjust it when running elsewhere.
aoi = gpd.read_file('/home/jovyan/h5cloud/notebooks/antarctic_aoi.geojson', crs='EPSG:4326')
# Bounds of the first row of the AOI; test_h5py_subset reads them as
# [min_lon, min_lat, max_lon, max_lat].
bounds = aoi.bounds.values[0] 

In [None]:
def read_spatial_subset_h5py(file, group, subset_variables, min_lon, max_lon, min_lat, max_lat, verbose=False):
    """Read the index range of photons spanning a bounding box from one HDF5 file.

    ``lat_ph`` and ``lon_ph`` are read in full to find the first and last
    photon strictly inside the box. Every dataset is then sliced to that one
    contiguous index range, so photons lying between those two indices but
    outside the box are still included.

    Parameters
    ----------
    file : str
        Path/URL handed to the global ``s3`` filesystem's ``open``.
    group : str
        HDF5 group that contains ``lat_ph``, ``lon_ph`` and the datasets in
        ``subset_variables``.
    subset_variables : iterable of str
        Additional dataset names to slice from ``group``.
    min_lon, max_lon, min_lat, max_lat : float
        Bounding box; the comparisons are strict (edges excluded).
    verbose : bool, optional
        Print a progress line for the file and for each variable.

    Returns
    -------
    list
        ``[lat, lon, <one entry per subset variable>]``, all sliced to the
        same index range. Every entry is empty when no photon is inside the box.
    """
    if verbose:
        print(f'opening {file}...')

    # The S3 file object gets its own context manager so it is closed
    # deterministically; closing the h5py.File alone is not relied on to
    # close the file-like object it was handed.
    with s3.open(file, 'rb') as s3_file, h5py.File(s3_file, 'r') as f:

        # read in the photon data to use for indexing
        # may be sped up significantly by using a segment rate parameter instead
        lat = f[f'{group}/lat_ph'][:]
        lon = f[f'{group}/lon_ph'][:]

        ph_in_aoi = np.where((lat > min_lat) & (lat < max_lat)
                             & (lon > min_lon) & (lon < max_lon))[0]

        if ph_in_aoi.size:
            idx_start = int(ph_in_aoi[0])
            # Slice ends are exclusive, so step one past the last photon in
            # the box; otherwise that photon would be dropped.
            idx_end = int(ph_in_aoi[-1]) + 1
        else:
            # No photon in the box: use an empty range instead of raising
            # IndexError on ph_in_aoi[0].
            idx_start = idx_end = 0

        group_data = [lat[idx_start:idx_end], lon[idx_start:idx_end]]

        for variable in subset_variables:

            if verbose:
                print(f'... reading {variable}')

            # Only the selected index range is requested from the dataset
            group_data.append(f[f'{group}/{variable}'][idx_start:idx_end])

    return group_data

In [None]:
def test_h5py_subset(remote_files, bounds, group='/gt2l/heights', 
              variables=["delta_time", "dist_ph_across", "dist_ph_along",
                         "h_ph", "pce_mframe_cnt", "ph_id_channel",
                         "ph_id_count", "ph_id_pulse", 
                         "quality_ph","signal_conf_ph"], verbose=False):
    """Read a spatial subset of the requested variables from every file.

    ``bounds`` is indexed as ``[min_lon, min_lat, max_lon, max_lat]`` and each
    file is passed to ``read_spatial_subset_h5py`` with that bounding box.

    Returns
    -------
    list
        One ``read_spatial_subset_h5py`` result per file, in the order of
        ``remote_files``.
    """
    # bounding-box corners, pulled out of `bounds` by position
    min_lon, min_lat, max_lon, max_lat = bounds[0], bounds[1], bounds[2], bounds[3]

    # one subset read per remote file
    return [
        read_spatial_subset_h5py(path, group, variables,
                                 min_lon, max_lon, min_lat, max_lat, verbose)
        for path in remote_files
    ]
    

# Executing Workflows 1/2 for Original and Repacked H5 Files

Running the test functions for the lists of original and repacked h5 files, and saving the time results.

In [None]:
# Time the full read and the spatial-subset read on the original files.
# -n3 -r1: three loops, one repeat; -o returns the result object so the timings can be inspected later.
h5py_original_time = %timeit -n3 -r1 -o test_h5py(remote_files_original, verbose=True)
h5py_subset_timeit = %timeit -n3 -r1 -o test_h5py_subset(remote_files_original, bounds, verbose=True)

In [None]:
h5py_repacked_timeit = %timeit -n3 -r3 -o test_h5py(remote_files_repacked, verbose=True)
h5py_repacked_subset_timeit = %timeit -n3 -r3 -o test_h5py_subset(remote_files_repacked, verbose=True)

In [None]:
# NOTE(review): `plt` is not imported in any cell shown here; presumably
# `import matplotlib.pyplot as plt` needs to run first -- confirm.
# Plot the timings recorded for the full read of the original files
# (that run used -r1, so this is presumably a single point).
plt.plot(h5py_original_time.timings)
plt.ylabel('seconds')