In [79]:
from functools import reduce

import geopandas as gpd
import h5py
import numpy as np
import pandas as pd

In [80]:
# Input granule (a GEDI04_A file, going by its name) on the shared workspace.
path = "/projects/maap-documentation-examples/output/GEDI04_A_2019111040155_O02008_02_T04616_02_002_02_V002.h5"

# Names of the datasets to pull out of every beam group.
columns = [
    "agbd", "agbd_se",
    "l2_quality_flag", "l4_quality_flag",
    "lat_lowestmode", "lon_lowestmode",
    "sensitivity", "sensitivity_a2",
]

# Row filter: both quality flags must equal 1 and both sensitivities must
# exceed 0.95.  Adjacent literals are joined into a single string.
query = (
    "l2_quality_flag == 1 and l4_quality_flag == 1 "
    "and sensitivity > 0.95 and sensitivity_a2 > 0.95"
)

In [103]:
def _append_dataset(name, dataset, col_names, col_val):
    """Append the column(s) of one dataset to the parallel name/value lists."""
    if name.startswith("xvar"):
        # xvar datasets are 2-D; as before, the first four columns are split
        # out and suffixed _1.._4.  `[:, r]` selects column r of every row
        # (the previous `[(), r]` did not).
        for r in range(4):
            col_names.append(name + "_" + str(r + 1))
            col_val.append(dataset[:, r])
    else:
        col_names.append(name)
        # `[()]` reads the whole dataset into memory.
        col_val.append(dataset[()])


def subset_h5(path, filter_cols, query):
    """
    Extract the requested columns from every BEAM group of an HDF5 file.

    Each top-level group whose name starts with "BEAM" is scanned, together
    with its direct subgroups, for datasets whose name is in ``filter_cols``.
    The matches become the columns of a per-beam DataFrame, which is filtered
    with ``query`` and labelled with the beam name.  The per-beam frames are
    then stacked.

    Parameters
    ----------
    path : str or path-like
        Location of the HDF5 file; it is opened read-only.
    filter_cols : collection of str
        Names of the datasets to keep.  Datasets whose name starts with
        "xvar" are expanded into ``<name>_1`` ... ``<name>_4``.
    query : str
        Expression handed to ``DataFrame.query`` to select rows.  It is
        evaluated before the "BEAM" column is added, so it can only refer to
        the extracted columns.

    Returns
    -------
    pandas.DataFrame
        A "BEAM" column followed by the extracted columns, in the order the
        datasets were encountered.  Columns keep the dtype stored in the
        file.  Row labels are those of each beam's own frame (they are not
        renumbered).  Empty when the file contains no BEAM groups.
    """
    beam_frames = []

    with h5py.File(path, "r") as hf_in:
        for beam_name, beam in hf_in.items():
            if not beam_name.startswith("BEAM"):
                continue

            col_names = []
            col_val = []

            for key, value in beam.items():
                if isinstance(value, h5py.Group):
                    # Only direct subgroups are searched (one level deep).
                    for key2, value2 in value.items():
                        if key2 in filter_cols:
                            _append_dataset(key2, value2, col_names, col_val)
                elif key in filter_cols:
                    _append_dataset(key, value, col_names, col_val)

            # Build the frame column-wise.  Positional keys keep datasets that
            # share a name (e.g. in several subgroups) as separate columns;
            # the real names are attached afterwards.
            beam_df = pd.DataFrame(dict(enumerate(col_val)))
            beam_df.columns = col_names

            beam_df = beam_df.query(query)
            beam_df.insert(0, "BEAM", beam_name)
            beam_frames.append(beam_df)

    if not beam_frames:
        return pd.DataFrame()

    # A single concat avoids re-copying the accumulated rows for every beam.
    return pd.concat(beam_frames)

In [104]:
%%time

# Run the loop-based extractor with the path, columns and query defined above.
result = subset_h5(path, columns, query)

# Print the first rows, then the summary statistics of the result.
print(result.head())
print(result.describe())

       BEAM        agbd    agbd_se  sensitivity_a2  l2_quality_flag  \
0  BEAM0000  141.049683  17.123022        0.978424                1   
1  BEAM0000   95.756226  17.124018        0.979147                1   
2  BEAM0000   90.346252  17.123966        0.966114                1   
3  BEAM0000  113.583145  17.124689        0.977122                1   
4  BEAM0000   93.160324  17.123558        0.983254                1   

   l4_quality_flag  lat_lowestmode  lon_lowestmode  sensitivity  
0                1        0.097697        9.372999     0.988014  
1                1        0.098120        9.373297     0.986098  
2                1        0.098542        9.373594     0.966114  
3                1        0.098964        9.373892     0.983985  
4                1        0.099386        9.374189     0.990431  
                agbd        agbd_se  sensitivity_a2  l2_quality_flag  \
count  290661.000000  290661.000000   290661.000000         290661.0   
mean        8.946183       3.4487

In [110]:
def subset_beam(beam, columns, query):
    """
    Build a filtered DataFrame from the requested datasets of one beam group.

    Everything below ``beam`` is walked with ``beam.visititems``; an item is
    kept when the last component of its path is in ``columns``.  The kept
    items are placed side by side as columns, rows are selected with
    ``query``, and a leading "BEAM" column is added.

    Parameters
    ----------
    beam : object providing ``visititems(callback)`` and ``name``
        The beam group to read.
    columns : collection of str
        Leaf names of the items to keep.
    query : str
        Expression handed to ``DataFrame.query``; it is evaluated before the
        "BEAM" column exists.

    Returns
    -------
    pandas.DataFrame
        "BEAM" followed by the kept columns in visiting order.
    """
    collected = []

    def keep_if_requested(item_path, item):
        # Only the final path component decides whether the item is wanted.
        leaf = item_path.rsplit("/", 1)[-1]
        if leaf in columns:
            collected.append(pd.Series(item, name=leaf))

    beam.visititems(keep_if_requested)

    frame = pd.concat(collected, axis=1)
    frame.query(query, inplace=True)

    # The label is beam.name without its first five characters
    # (for a name like "/BEAM0000" this leaves "0000").
    labels = np.repeat(beam.name[5:], len(frame.index))
    frame.insert(0, "BEAM", labels)

    return frame


def subset_hdf5(path, columns, query):
    """
    Subset every BEAM group of an HDF5 file and stack the results.

    Each top-level group whose name starts with "BEAM" is passed to
    ``subset_beam`` together with ``columns`` and ``query``; the returned
    frames are concatenated row-wise in file order.

    Parameters
    ----------
    path : str or path-like
        Location of the HDF5 file; it is opened read-only.
    columns : collection of str
        Forwarded unchanged to ``subset_beam``.
    query : str
        Forwarded unchanged to ``subset_beam``.

    Returns
    -------
    pandas.DataFrame
        The per-beam frames stacked without renumbering the row labels.
        Empty when the file contains no BEAM groups.
    """
    # Explicit "r" so the file is never opened writable, whatever the
    # installed h5py version uses as its default mode.
    with h5py.File(path, "r") as hdf5:
        beam_dfs = [
            subset_beam(beam, columns, query)
            for name, beam in hdf5.items()
            if name.startswith("BEAM")
        ]

        if not beam_dfs:
            return pd.DataFrame()

        # One concat over all beams instead of a pairwise reduce, which
        # re-copied the accumulated frame for every additional beam.
        return pd.concat(beam_dfs)

In [111]:
%%time

# Run the visititems-based implementation on the same path, columns and query.
result = subset_hdf5(path, columns, query)

# Print the first rows, then the summary statistics of the result.
print(result.head())
print(result.describe())

   BEAM        agbd    agbd_se  sensitivity_a2  l2_quality_flag  \
0  0000  141.049683  17.123022        0.978424                1   
1  0000   95.756226  17.124018        0.979147                1   
2  0000   90.346252  17.123966        0.966114                1   
3  0000  113.583145  17.124689        0.977122                1   
4  0000   93.160324  17.123558        0.983254                1   

   l4_quality_flag  lat_lowestmode  lon_lowestmode  sensitivity  
0                1        0.097697        9.372999     0.988014  
1                1        0.098120        9.373297     0.986098  
2                1        0.098542        9.373594     0.966114  
3                1        0.098964        9.373892     0.983985  
4                1        0.099386        9.374189     0.990431  
                agbd        agbd_se  sensitivity_a2  l2_quality_flag  \
count  290661.000000  290661.000000   290661.000000         290661.0   
mean        8.946183       3.448750        0.986348      