## AB testing access time for ICESat-2 HDF5 files on the cloud.

This notebook requires two versions of the same file:
  * Original A: The original, unmodified file in an S3 location.
  * Test Case B: A modified version of the original file, used to test metadata consolidation, rechunking, and other strategies for speeding up access to the data in the file.


In [1]:
import contextlib
import csv
import functools
import logging
import os
import re
import time
from datetime import datetime
from io import StringIO

import boto3
import fsspec
import h5py
import numpy as np
import pandas as pd
import s3fs
import xarray as xr

class RegexFilter(logging.Filter):
    """
    Logging filter that drops every record whose message matches a regex.

    Records that do NOT match the pattern are let through.

    :param regex_pattern: Pattern (string or compiled) searched for in each
        record's formatted message.
    """

    def __init__(self, regex_pattern):
        super().__init__()
        self.regex_pattern = re.compile(regex_pattern)

    def filter(self, record):
        """
        Decide whether ``record`` should be kept.

        :param record: The ``logging.LogRecord`` under consideration.
        :return: ``False`` (drop) when the formatted message matches the
            pattern, ``True`` (keep) otherwise.
        """
        # getMessage() merges %-style arguments into the template and coerces
        # non-string messages to str. record.msg alone is only the raw
        # template, so "%s read: %i - %i" style calls would never match and a
        # non-string message would make the regex search raise TypeError.
        return self.regex_pattern.search(record.getMessage()) is None

    
def timer_decorator(func):
    """
    Decorate a benchmark method so each call is timed and optionally logged and stored.

    The wrapped method must be bound to an object that exposes ``logs_regex``
    and ``store_results``. When ``logs_regex`` is truthy the object must also
    expose ``data_format``; when ``store_results`` is truthy it must expose
    ``name``, ``data_format``, ``results_directory``, ``bucket`` and a
    ``store`` method.

    :param func: The method to wrap; it is called as ``func(self, *args, **kwargs)``.
    :return: A wrapper that returns the tuple ``(result, execution_time)``,
        with ``execution_time`` in seconds.
    """
    def __setup_logging(self, tstamp):
        # Route DEBUG records of the "fsspec" logger to a per-run log file.
        log_filename = f"logs/{self.data_format}-{tstamp}.log"
        # FileHandler does not create missing directories on its own.
        os.makedirs(os.path.dirname(log_filename), exist_ok=True)
        # Create the handler first so a failure here leaves nothing attached.
        self._file_handler = logging.FileHandler(log_filename)
        self._file_handler.setLevel(logging.DEBUG)
        logger = logging.getLogger("fsspec")
        logger.setLevel(logging.DEBUG)
        self.regex_filter = RegexFilter(self.logs_regex)
        # NOTE(review): a filter attached to a logger is only consulted for
        # records logged directly on that logger, so records propagated from
        # "fsspec" are presumably not affected by this root-logger filter.
        # Confirm whether it was meant to go on the file handler instead.
        logging.getLogger().addFilter(self.regex_filter)
        # Add the handler to the root logger
        logging.getLogger().addHandler(self._file_handler)

    def __turnoff_logging(self):
        # Undo everything __setup_logging attached and release the log file.
        logging.getLogger().removeFilter(self.regex_filter)
        logging.getLogger().removeHandler(self._file_handler)
        self._file_handler.close()

    @functools.wraps(func)  # keep the wrapped method's name and docstring
    def wrapper(self, *args, **kwargs):
        tstamp = datetime.now().strftime('%Y-%m-%d-%H%M%S')
        logging_enabled = bool(self.logs_regex)
        if logging_enabled:
            __setup_logging(self, tstamp)
        try:
            # perf_counter is monotonic, so the measured duration cannot be
            # skewed by system clock adjustments during the run.
            start_time = time.perf_counter()
            result = func(self, *args, **kwargs)
            execution_time = time.perf_counter() - start_time
        finally:
            # Always detach the handler, even when the benchmark raises;
            # otherwise it stays on the root logger and the file stays open.
            if logging_enabled:
                __turnoff_logging(self)
        # Call the store method here
        if self.store_results:
            results_key = f"{tstamp}_{self.name}_{self.data_format}_results.csv"
            s3_key = f"{self.results_directory}/{results_key}"
            self.store(run_time=execution_time, result=result, bucket=self.bucket, s3_key=s3_key)
        return result, execution_time
    return wrapper


    
class H5Test:
    """
    Base class for a timed HDF5 access benchmark against files on S3.

    Subclasses override ``run``; because ``run`` is wrapped by
    ``timer_decorator``, calling it returns ``(result, execution_time)``.
    """

    def __init__(self,
                 data_format: str,
                 files=None,
                 store_results=True,
                 logs_regex=None):
        """
        :param data_format: Label of the file flavour under test; used in log
            and result file names.
        :param files: Files to read. When falsy, the links are looked up by
            ``data_format``.
        :param store_results: When true, every timed run uploads a CSV to S3.
        :param logs_regex: Pattern handed to ``RegexFilter`` while a run is
            being logged; a falsy value disables per-run logging.
        """
        self.name = self.__class__.__name__
        self.data_format = data_format
        self.logs_regex = logs_regex
        if files:
            self.files = files
        else:
            # NOTE(review): S3Links is not defined in the visible cells;
            # presumably it is provided elsewhere. Confirm.
            self.files = S3Links().get_links_by_format(data_format)
        self.s3_client = boto3.client('s3')  # Ensure AWS credentials are configured
        self.s3_fs = s3fs.S3FileSystem(anon=False)
        self.store_results = store_results
        self.bucket = "nasa-cryo-persistent"
        self.results_directory = "h5cloud/benchmark_results"

    @timer_decorator
    def run(self, io_params):
        """
        Execute the benchmark; must be overridden by subclasses.

        :param io_params: I/O settings for the run.
        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("The run method has not been implemented")

    def store(self, run_time: float, result: str, bucket: str, s3_key: str):
        """
        Store test results to an S3 bucket as a CSV file.

        :param run_time: The runtime of the test
        :param result: The result of the test
        :param bucket: The name of the S3 bucket where the CSV will be uploaded
        :param s3_key: The S3 key (filename) where the CSV will be stored
        """
        # Create a CSV in-memory. csv and StringIO come from the file-level
        # imports; without them this method raised NameError.
        csv_buffer = StringIO()
        csv_writer = csv.writer(csv_buffer)
        csv_writer.writerow(['Name', 'Data Format', 'Run Time', 'Result'])  # Headers
        csv_writer.writerow([self.name, self.data_format, run_time, result])

        # Upload the CSV to S3. getvalue() returns the whole buffer regardless
        # of the current stream position, so no seek is needed.
        self.s3_client.put_object(Bucket=bucket, Key=s3_key, Body=csv_buffer.getvalue())

# Report the versions of the core I/O libraries used by the benchmarks.
benchmark_libraries = (xr, h5py, fsspec)
for library in benchmark_libraries:
    print(f'{library.__name__} v{library.__version__}')

xarray v2023.12.0
h5py v3.10.0
fsspec v2023.6.0


In [11]:
class H5pyArrMean(H5Test):
    """Benchmark: read one variable from every file with h5py over s3fs and average it."""

    @timer_decorator
    def run(self, io_params):
        """
        Read ``/gt1l/heights/h_ph`` from each file and return the overall mean.

        :param io_params: Dict with the keys ``"fsspec_params"`` (keyword
            arguments for ``self.s3_fs.open``) and ``"h5py_params"`` (keyword
            arguments for ``h5py.File``).
        :return: Mean over the values read from all files, computed in float64.
        :raises KeyError: If either key is missing from ``io_params``.
        """
        # TODO: Do we need to make this configurable or consistent?
        group = '/gt1l/heights'
        variable = 'h_ph'
        fsspec_params = io_params["fsspec_params"]
        h5py_params = io_params["h5py_params"]
        # Collect per-file arrays and join them once at the end; growing one
        # array file by file recopies everything read so far each time.
        # The empty float64 seed keeps the no-files case returning nan.
        chunks = [np.empty(0, dtype=np.float64)]
        for file in self.files:
            with self.s3_fs.open(file, mode="rb", **fsspec_params) as fo:
                with h5py.File(fo, **h5py_params) as f:
                    data = f[f"{group}/{variable}"][:]
            # Cast to float64 so the mean is accumulated at the same precision
            # regardless of the on-disk dtype.
            chunks.append(np.asarray(data, dtype=np.float64).ravel())
        return np.mean(np.concatenate(chunks))

In [None]:
class H5pyROS3ArrMean(H5Test):
    """
    Benchmark: read one variable from every file with h5py's ``ros3`` driver
    and average it.

    This will only work for public buckets for now
    """

    @timer_decorator
    def run(self, io_params):
        """
        Read ``/gt1l/heights/h_ph`` from each file and return the overall mean.

        :param io_params: Dict with the key ``"h5py_params"`` (extra keyword
            arguments for ``h5py.File``).
        :return: Mean over the values read from all files, computed in float64.
        :raises KeyError: If ``"h5py_params"`` is missing from ``io_params``.
        """
        # TODO: Do we need to make this configurable or consistent?
        group = '/gt1l/heights'
        variable = 'h_ph'
        h5py_params = io_params["h5py_params"]
        # Collect per-file arrays and join them once at the end; growing one
        # array file by file recopies everything read so far each time.
        # The empty float64 seed keeps the no-files case returning nan.
        chunks = [np.empty(0, dtype=np.float64)]
        for file in self.files:
            with h5py.File(file, driver="ros3", **h5py_params) as f:
                data = f[f"{group}/{variable}"][:]
            # Cast to float64 so the mean is accumulated at the same precision
            # regardless of the on-disk dtype.
            chunks.append(np.asarray(data, dtype=np.float64).ravel())
        return np.mean(np.concatenate(chunks))

In [3]:
class XarrayArrMean(H5Test):
    """Benchmark: read ``h_ph`` from every file through xarray and average it."""

    def open_reference_ds(self, file):
        """
        Open a reference file as a zarr-backed xarray dataset.

        :param file: Reference passed as ``fo`` to the fsspec ``reference``
            filesystem.
        :return: The ``gt1l/heights`` group as an ``xarray.Dataset``.
        """
        fs = fsspec.filesystem(
            'reference',
            fo=file,
            remote_protocol='s3',
            remote_options=dict(anon=False),
            skip_instance_cache=True
        )
        return xr.open_dataset(fs.get_mapper(""), engine='zarr', consolidated=False, group='gt1l/heights')

    @timer_decorator
    def run(self, io_params):
        """
        Compute the mean of ``h_ph`` over all files.

        Formats containing ``"kerchunk"`` are read through reference files;
        everything else is opened with ``xr.open_mfdataset`` and ``h5netcdf``.

        :param io_params: Currently unused; the I/O settings are chosen from
            ``self.data_format``.
        :return: The mean of ``h_ph`` over all files.
        """
        group = '/gt1l/heights'
        variable = 'h_ph'

        if 'kerchunk' in self.data_format:
            # Collect per-file arrays and join them once; repeated np.append
            # recopies everything read so far on every file. The empty float64
            # seed keeps the no-files case returning nan.
            chunks = [np.empty(0, dtype=np.float64)]
            for file in self.files:
                # .values loads the data eagerly, so the dataset can be closed
                # as soon as the array has been read.
                with self.open_reference_ds(file) as dataset:
                    chunks.append(np.asarray(dataset[variable].values, dtype=np.float64).ravel())
            return np.mean(np.concatenate(chunks))

        # Defaults for formats that are neither kerchunk nor repacked; without
        # them the names below were unbound and the run raised
        # UnboundLocalError.
        fsspec_params = {}
        h5py_params = {}
        if "repacked" in self.data_format:
            fsspec_params = {
                # "skip_instance_cache": True
                "cache_type": "first",
                "block_size": 16*1024*1024
            }
            h5py_params = {
                "driver_kwds": {
                    "page_buf_size": 32*1024*1024,
                    "rdcc_nbytes": 8*1024*1024
                }
            }
        # ExitStack closes every opened S3 file, including on errors part-way
        # through opening the set.
        with contextlib.ExitStack() as stack:
            s3_fileset = [
                stack.enter_context(self.s3_fs.open(file, **fsspec_params))
                for file in self.files
            ]
            with xr.open_mfdataset(s3_fileset, group=group, combine='by_coords',
                                   engine='h5netcdf', **h5py_params) as xrds:
                h_ph_values = xrds[variable]
                # The value is materialised here, before the files are closed.
                return float(np.mean(h_ph_values).values)

In [57]:
# Test case B: the repacked copies of the two ATL03 granules.
repacked_granules = [
    "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20181120182818_08110112_006_02_repacked.h5",
    "s3://nasa-cryo-persistent/h5cloud/atl03/big/repacked/ATL03_20190219140808_08110212_006_02_repacked.h5",
]
# Results are not uploaded to S3 for these interactive runs.
test_cloud = H5pyArrMean(
    data_format='atl03-bigsize-repacked',
    files=repacked_granules,
    store_results=False,
)

In [12]:

# Original A: the unmodified copies of the two ATL03 granules.
original_granules = [
    "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20181120182818_08110112_006_02.h5",
    "s3://nasa-cryo-persistent/h5cloud/atl03/big/original/ATL03_20190219140808_08110212_006_02.h5",
]

# NOTE(review): logs_regex is defined here but not passed to the constructor
# below, so per-run logging stays disabled. Confirm this is intended.
logs_regex = r"<File-like object S3FileSystem, .*?>\s*(read: \d+ - \d+)"

# Results are not uploaded to S3 for these interactive runs.
test_original = H5pyArrMean(
    data_format='atl03-bigsize-original',
    files=original_granules,
    store_results=False,
)

In [14]:
# logger = logging.getLogger()
# logger.setLevel(logging.DEBUG)

# I/O settings for the runs against the original files; the commented-out
# entries are alternative settings kept for reference.
io_params = dict(
    fsspec_params={
        "skip_instance_cache": True,
        # "cache_type": "blockcache",
        # "block_size": 4*1024*1024
    },
    h5py_params={
        # "rdcc_nbytes": 2*1024*1024
    },
)
# Each run prints the tuple (mean, seconds elapsed).
for runs in range(5):
    print(test_original.run(io_params))

(1032.9840463639412, 12.149354219436646)
(1032.9840463639412, 12.194729566574097)
(1032.9840463639412, 12.10885739326477)
(1032.9840463639412, 11.940461874008179)
(1032.9840463639412, 12.063915252685547)


In [None]:
# I/O settings for the runs against the repacked files; every option is
# currently commented out, so both parameter dicts are empty.
io_params = dict(
    fsspec_params={
        # "skip_instance_cache": True
        # "cache_type": "blockcache",
        # "block_size": 4*1024*1024
    },
    h5py_params={
        # "page_buf_size": 32*1024*1024,
        # "rdcc_nbytes": 2*1024*1024
    },
)
# Each run prints the tuple (mean, seconds elapsed).
for runs in range(5):
    print(test_cloud.run(io_params))

(1032.9840463639412, 34.1772985458374)
(1032.9840463639412, 30.96499228477478)
(1032.9840463639412, 31.00865602493286)
(1032.9840463639412, 31.207276821136475)


In [20]:
# List the HDF5 file drivers that this h5py build reports as registered.
print(
    f'Registered drivers: {h5py.registered_drivers()}'
)

Registered drivers: frozenset({'mpio', 'family', 'ros3', 'split', 'core', 'sec2', 'fileobj', 'direct', 'stdio'})


In [None]:
import matplotlib.pyplot as plt

# NOTE(review): `benchmarks` is not defined in the visible cells; presumably it
# holds 'tool', 'dataset', 'format' and 'time' columns. Confirm.
df = pd.DataFrame.from_dict(benchmarks)

# Build the whole chart inside the xkcd context. The style is only picked up
# by figures and artists created while the context is active; previously the
# context wrapped nothing but a new, empty figure.
with plt.xkcd():
    fig, ax = plt.subplots(figsize=(10, 6))

    # One bar per (tool, dataset, format) combination, labelled with its mean time.
    for name, group in df.groupby(['tool', 'dataset', 'format']):
        tool, dataset, formated = name
        x = f'{tool}, {dataset}, {formated}'
        y = group['time'].mean()
        ax.bar(x, y, label=x, align='center')
        ax.text(x, y + 0.05, f'{y:.2f}', ha='center', va='bottom', color='black', fontsize=8)

    # Set labels and title
    ax.set_xlabel('Combination')
    ax.set_ylabel('Time in Seconds')
    ax.set_title('mean() on photon data for a single IS2 track, less is better')

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=45, ha='right')

    # # Show legend
    # ax.legend()

    # Show the plot
    plt.show()



In [None]:
# Display the benchmark DataFrame as this cell's output.
df

In [None]:
import matplotlib.pyplot as plt

df = pd.DataFrame.from_dict(benchmarks)
fig, ax = plt.subplots(figsize=(10, 6))

# One bar per (tool, dataset, format) combination, annotated with its mean time.
for name, group in df.groupby(['tool', 'dataset', 'format']):
    tool, dataset, formated = name
    x = f'{tool}, {dataset}, {formated}'
    y = group['time'].mean()
    ax.bar(x, y, label=x, align='center')
    ax.text(x, y + 0.05, f'{y:.2f}', ha='center', va='bottom', color='black', fontsize=8)

# Axis labels and title
ax.set_xlabel('Combination')
ax.set_ylabel('Time in Seconds')
ax.set_title('mean() on photon data for a single IS2 track, less is better')

# Tilt the x tick labels so the long combination names stay readable
plt.xticks(rotation=45, ha='right')

# # Show legend
# ax.legend()

# Render the figure
plt.show()


In [None]:
# Display the benchmark DataFrame as this cell's output.
df

In [None]:
# def normalize_log(log_file):
#     with open(log_file, 'r') as input_file:
#     # Open the output file in write mode
#         with open(f'{log_file.replace(".log", "-ros-compatible.log")}', 'w') as output_file:
#             # Iterate through each line in the input file
#             for line in input_file:
#                 # Strip leading and trailing whitespaces from the line
#                 stripped_line = line.strip()

#                 # Write the stripped line to the output file
#                 output_file.write(stripped_line + '\n') 

In [None]:
    # "ATL08": {
    #     "links": {
    #         "original": "s3://nasa-cryo-persistent/h5cloud/atl08/original/ATL08_20200404075919_01340707_006_03.h5",
    #         "optimized": "s3://nasa-cryo-persistent/h5cloud/atl08/repacked/ATL08_20200404075919_01340707_006_03_repacked.h5",
    #     },
    #     "group": "/gt1l/signal_photons",
    #     "variable": "ph_h",
    #     "processing": [
    #         "h5repack -S PAGE -G 4000000"
    #     ]
    # },
    # "ATL03": {
    #     "links": {
    #         "original": "s3://nasa-cryo-persistent/h5cloud/atl03/average/original/ATL03_20230618223036_13681901_006_01.h5",
    #         "optimized": "s3://nasa-cryo-persistent/h5cloud/atl03/average/repacked/ATL03_20230618223036_13681901_006_01.h5"
    #     },
    #     "group": "/gt1l/heights",
    #     "variable": "h_ph",
    #     "processing": [
    #         "h5repack -S PAGE -G 4000000"
    #     ]
    # },