# (Optional) Decompress and Read Hierarchical Data Format (HDF) files
The CWAM data has been compressed and saved as HDF arrays because the files are large.
Although it may not be necessary to interact with these files for the competition, I wanted to provide a few quick steps to extract the data from these files and read their content.
It may be useful for visualization or future modeling. 

It's particularly challenging to interact with these files, as PySpark does not have a native library to automatically read them.

# Interacting with the h5.bz2 files
## Decompressing the bz2 files
## Reading the hdf5 files

!pip install h5py

In [0]:
import bz2
from bz2 import decompress
import os
import h5py

In [0]:
# List the entries under /tmp/, e.g. to check that the .h5.bz2 archive is there.
dbutils.fs.ls("/tmp/")

[FileInfo(path='dbfs:/tmp/2022_09_01_23_45_GMT.Forecast.h5.CWAM.h5.bz2', name='2022_09_01_23_45_GMT.Forecast.h5.CWAM.h5.bz2', size=7822871, modificationTime=1730953550000),
 FileInfo(path='dbfs:/tmp/2022_09_01_23_45_GMT.Forecast.h5.CWAM.h5.compressed', name='2022_09_01_23_45_GMT.Forecast.h5.CWAM.h5.compressed', size=50616880, modificationTime=1730953860000)]

In [0]:
# Stream-decompress the bz2 archive into a sibling file on disk.
filepath = "/dbfs/tmp/2022_09_01_23_45_GMT.Forecast.h5.CWAM.h5.bz2"
# Swap only the trailing extension: str.replace would rewrite every ".bz2"
# occurrence anywhere in the path, and would leave the path unchanged (so the
# output would truncate the input) if ".bz2" were absent.
# NOTE: despite its ".compressed" suffix, the output file holds the
# decompressed bytes read from the archive.
newfilepath = os.path.splitext(filepath)[0] + ".compressed"
with open(newfilepath, 'wb') as new_file, bz2.BZ2File(filepath, 'rb') as file:
    # Copy in 100 KiB chunks so the full payload is never held in memory;
    # iter() stops at the empty bytes object that read() returns at EOF.
    for data in iter(lambda: file.read(100 * 1024), b''):
        new_file.write(data)

In [0]:
def traverse_datasets(hdf_file):
    """Lazily yield the path of every dataset reachable from *hdf_file*.

    Groups are descended into recursively. Each yielded path is built by
    joining the member names from the root down with '/', so it always
    starts with a leading '/'. Members that are neither an ``h5py.Dataset``
    nor an ``h5py.Group`` are skipped.
    """

    def _walk(group, parent=''):
        # Depth-first walk; `parent` is the path accumulated so far.
        for name in group:
            node = group[name]
            node_path = f'{parent}/{name}'
            if isinstance(node, h5py.Dataset):
                # Leaf: report its path.
                yield node_path
            elif isinstance(node, h5py.Group):
                # Container: recurse, extending the path prefix.
                yield from _walk(node, node_path)

    yield from _walk(hdf_file)

In [0]:
# Open the decompressed file read-only and report each dataset's path,
# shape and dtype.
with h5py.File(newfilepath, mode='r') as f:
    for dset in traverse_datasets(f):
        # Look the dataset up once and reuse the handle for both attributes.
        dataset = f[dset]
        print('Path:', dset)
        print('Shape:', dataset.shape)
        print('Data type:', dataset.dtype)