In [1]:
#default_exp io_utils

In [2]:
#export

from alphapept.__main__ import VERSION_NO

# Storing and accessing MS data

As MS hardware has continued to improve over the years, MS data has become more complex. To deal with this complexity, the MS community has already used many different [data formats](https://onlinelibrary.wiley.com/doi/full/10.1002/mas.21522) to store and access this data. [HDF](https://www.hdfgroup.org/solutions/hdf5/) containers are one option, but they have not yet gained widespread support.

## HDF containers
In general, an HDF container can be viewed as a compressed folder with metadata (i.e. attributes) associated to each single subfolder or file (i.e. data arrays of various types and sizes) within this container. A container might for instance have contents that look like:
```
HDF_Container
    {
        meta_data_1: "Some string",
        meta_data_2: 1234567890,
        ...
    }
    array_1
        {
            meta_data_of_array1_1: "Some other string",
            ...
        },
        100x2 int8
    array_2
        1000x2 float64
    subfolder_1
        {
            meta_data_of_subfolder_1_1: "Really any string of any length",
            ...
        }
        array_1_of_subfolder_1
        subfolder_1_1
        ...
    subfolder_n
    ...
```


A few of the advantages of HDF are e.g.:

* It has no upper limit with regards to file size
* It can be used on disk without consuming RAM
* It is capable of fast IO operations
* It allows data to be structured very transparently, while still providing a flexible way to store metadata
* ...

For these reasons, HDF containers have gained popularity in several scientific fields, including (astro)physics and geology. It is therefore no surprise that python has excellent support for HDF containers. The two most used packages are `h5py` and `tables`, where the former has a generic API and the latter is frequently used with `pandas` dataframes.

### Using HDF containers for MS data
We will use `h5py` to store MS data in HDF containers, inspired by the [ion_networks](https://github.com/swillems/ion_networks) repository.

* First we define a generic class that will serve as an API for HDF containers. To ensure full transparency, we will include immutable metadata such as `creation_time`, `original_file_name` and `version`.
* The constructor of an HDF_File will be passed the `file_name` of an HDF container, an `is_read_only` flag and an `is_new_file` flag.
* To compare HDF_Files, several (magic) functions need to be defined.
* Traceability and reproducibility are ensured by storing a `last_updated` timestamp and providing a `check` function to warn users about potential compatibility issues.

In [3]:
#export

import h5py
import os
import time


class HDF_File(object):
    '''
    A generic class to store and retrieve on-disk
    data with an HDF container.

    The root attributes `creation_time`, `original_file_name`, `version`
    and `last_updated` are written when a new file is created and are
    exposed as read-only properties. These properties rely on a `read`
    method that is attached to this class elsewhere in the file.
    '''

    @property
    def original_file_name(self):
        '''Absolute file name that was stored when the container was created.'''
        return self.read(
            attr_name="original_file_name"
        )  # See below for function definition

    @property
    def file_name(self):
        '''Absolute path of the HDF container on disk.'''
        return self.__file_name

    @property
    def directory(self):
        '''Directory that contains the HDF container.'''
        return os.path.dirname(self.file_name)

    @property
    def creation_time(self):
        '''Creation timestamp stored in the root attributes.'''
        return self.read(
            attr_name="creation_time"
        )  # See below for function definition

    @property
    def last_updated(self):
        '''Timestamp of the last update stored in the root attributes.'''
        return self.read(
            attr_name="last_updated"
        )  # See below for function definition

    @property
    def version(self):
        '''Version number stored when the container was created.'''
        return self.read(
            attr_name="version"
        )  # See below for function definition

    @property
    def is_read_only(self):
        '''Whether this handle refuses write operations.'''
        return self.__is_read_only

    def __init__(
        self,
        file_name:str,
        is_read_only:bool=True,
        is_new_file:bool=False,
    ):
        '''
        Open an existing HDF container or create (truncate) a new one.

        Args:
            file_name: Path of the container; stored as an absolute path.
            is_read_only: If True, the handle is flagged as read-only.
                Ignored (forced to False) when `is_new_file` is True.
            is_new_file: If True, (re)create the file, creating missing
                parent directories, and initialize the root metadata.

        Raises:
            OSError: If `is_new_file` is False and the file cannot be
                opened for reading (e.g. it does not exist).
        '''
        self.__file_name = os.path.abspath(file_name)
        if is_new_file:
            is_read_only = False
            # exist_ok avoids a race between checking and creating.
            os.makedirs(self.directory, exist_ok=True)
            with h5py.File(self.file_name, "w") as hdf_file:
                current_time = time.asctime()
                hdf_file.attrs["creation_time"] = current_time
                hdf_file.attrs["original_file_name"] = self.__file_name
                hdf_file.attrs["version"] = VERSION_NO
                hdf_file.attrs["last_updated"] = current_time
        else:
            # Opening the file first makes a missing or unreadable file
            # fail before any metadata is compared.
            with h5py.File(self.file_name, "r"):
                self.check()
        self.__is_read_only = is_read_only

    def __eq__(self, other):
        '''Two HDF_Files are equal when they point to the same file name.'''
        if not isinstance(other, HDF_File):
            # Let Python fall back to its default comparison instead of
            # failing on a missing `file_name` attribute.
            return NotImplemented
        return self.file_name == other.file_name

    def __hash__(self):
        return hash(self.file_name)

    def __str__(self):
        return f"<HDF_File {self.file_name}>"

    def __repr__(self):
        return str(self)

    def check(
        self,
        version:bool=True,
        file_name:bool=True,
    ):
        '''
        Check if the `version` or `file_name` of this HDF_File have changed.
        This requires to define a global LOGGER and VERSION_NO variable.

        Args:
            version: If True, compare the stored version with the global
                VERSION_NO.
            file_name: If True, compare the current file name with the
                stored original file name.

        Any mismatch is reported as a warning through `LOGGER.warning`;
        if no global LOGGER is defined, `print` is used instead.
        '''
        warning_messages = []
        if version:
            current_version = VERSION_NO
            creation_version = self.version
            if creation_version != current_version:
                warning_messages.append(
                    f"{self} was created with version "
                    f"{creation_version} instead of {current_version}."
                )
        if file_name:
            original_file_name = self.original_file_name
            if self.file_name != original_file_name:
                warning_messages.append(
                    f"The file name of {self} has been changed from "
                    f"{original_file_name} to {self.file_name}."
                )
        if warning_messages:
            try:
                printer = LOGGER.warning
            except NameError:
                printer = print
                warning_messages.append(
                    "No LOGGER has been defined, using normal print instead."
                )
            printer("\n".join(warning_messages))

Contents of HDF containers come in three variants:

1. `Groups`: folders
2. `Datasets`: arrays
3. `Attributes`: metadata associated to individual datasets or groups (with the root folder also considered as a normal group)

These contents can be accessed with `read` and `write` functions.

In [22]:
#export

import numpy as np
import pandas as pd
from fastcore.foundation import patch


@patch
def read(
    self:HDF_File,
    group_name:str=None,
    dataset_name:str=None,
    attr_name:str=None,
    return_dataset_shape:bool=False,
    return_dataset_dtype:bool=False,
    return_dataset_slice:slice=slice(None),
):
    '''
    Read the contents of an HDF_File. If no `group_name` has been provided,
    read directly from the root group. If no `dataset_name` has been provided,
    read directly from the group. If `attr_name` is not None,
    read the attribute value instead of the contents of a group or dataset.
    If `attr_name` == "", read all attributes as a dict.
    The options `return_dataset_shape`, `return_dataset_dtype` and
    `return_dataset_slice` allow to minimize IO and RAM usage by reading
    datasets only partially.

    Returns:
        * a sorted list of member names when only a group is addressed,
        * an attribute value (or a dict of all attributes for
          `attr_name` == ""),
        * the shape, the dtype or the (sliced) contents of a dataset.

    Raises:
        KeyError: If the requested group, dataset or attribute
            does not exist.
        NotImplementedError: If `dataset_name` refers to something
            other than a dataset and no `attr_name` is given.
    '''
    with h5py.File(self.file_name, "r") as hdf_file:
        if group_name is None:
            group = hdf_file
            group_name = "/"
        else:
            try:
                group = hdf_file[group_name]
            except KeyError:
                raise KeyError(
                    f"Group {group_name} does not exist in {self}."
                )
        if dataset_name is None:
            if attr_name is None:
                # List the members of the group.
                return sorted(group)
            elif attr_name != "":
                try:
                    return group.attrs[attr_name]
                except KeyError:
                    # Previously misspelled as `keyError`, which raised a
                    # NameError instead of the intended KeyError.
                    raise KeyError(
                        f"Attribute {attr_name} does not exist for "
                        f"group {group_name} of {self}."
                    )
            else:
                return dict(group.attrs)
        else:
            try:
                dataset = group[dataset_name]
            except KeyError:
                raise KeyError(
                    f"Dataset {dataset_name} does not exist for "
                    f"group {group_name} of {self}."
                )
            if attr_name is None:
                if isinstance(dataset, h5py.Dataset):
                    # Shape takes precedence over dtype, which takes
                    # precedence over the actual contents.
                    if return_dataset_shape:
                        return dataset.shape
                    elif return_dataset_dtype:
                        return dataset.dtype
                    else:
                        return dataset[return_dataset_slice]
                else:
                    raise NotImplementedError(
                        "Use group as pandas dataframe container?"
                    )
            elif attr_name != "":
                try:
                    return dataset.attrs[attr_name]
                except KeyError:
                    raise KeyError(
                        f"Attribute {attr_name} does not exist for "
                        f"dataset {dataset_name} of group "
                        f"{group_name} of {self}."
                    )
            else:
                return dict(dataset.attrs)


@patch
def write(
    self:HDF_File,
    value,
    group_name:str=None,
    dataset_name:str=None,
    attr_name:str=None,
    overwrite:bool=False,
    dataset_compression=None
):
    '''
    Write a `value` to an HDF_File. If an `attr_name` is provided,
    `value` will be stored for this attribute.
    If no `group_name` is provided, write directly to the root group.
    If no `dataset_name` is provided, create a new group with `value`
    as name. If a `dataset_name` is provided, a `dataset_compression`
    can be defined to minimize disk usage, at the cost of slower IO.
    If the `overwrite` flag is True, overwrite the given attribute
    or dataset and truncate groups.

    Dataset values that have no `dtype` (e.g. plain lists) are converted
    with `np.asarray` first. String and object arrays are stored with the
    h5py string dtype. Attribute values that h5py rejects with a TypeError
    are stored as their `str` representation.
    After every successful write, the root attribute `last_updated`
    is refreshed.

    Raises:
        IOError: If this HDF_File is read-only.
        KeyError: If the requested group or dataset does not exist.
        ValueError: If the target already exists and `overwrite` is False.
        NotImplementedError: If `value` is a pandas DataFrame to be stored
            as a dataset.
    '''
    if self.is_read_only:
        raise IOError(
            f"Trying to write to {self}, which is read_only."
        )
    with h5py.File(self.file_name, "a") as hdf_file:
        if group_name is None:
            group = hdf_file
            group_name = "/"
        else:
            try:
                group = hdf_file[group_name]
            except KeyError:
                raise KeyError(
                    f"Group {group_name} does not exist in {self}."
                )
        if dataset_name is None:
            if attr_name is None:
                # `value` is the name of a new (sub)group.
                if value in group:
                    if overwrite:
                        del group[value]
                    else:
                        raise ValueError(
                            f"New group {value} already exists in group "
                            f"{group_name} of {self}."
                        )
                group.create_group(value)
            else:
                if (attr_name in group.attrs) and not overwrite:
                    raise ValueError(
                        f"Attribute {attr_name} already exists in group "
                        f"{group_name} of {self}."
                    )
                try:
                    group.attrs[attr_name] = value
                except TypeError:
                    group.attrs[attr_name] = str(value)
        else:
            if attr_name is None:
                if (dataset_name in group) and not overwrite:
                    raise ValueError(
                        f"Dataset {dataset_name} already exists in group "
                        f"{group_name} of {self}."
                    )
                # Reject unsupported values before deleting anything, so a
                # failed overwrite does not destroy the existing dataset.
                if isinstance(value, pd.core.frame.DataFrame):
                    raise NotImplementedError(
                        "Use group as pandas dataframe container?"
                    )
                if not hasattr(value, "dtype"):
                    # Plain lists/scalars have no dtype to inspect.
                    value = np.asarray(value)
                if value.dtype.type == np.str_:
                    value = value.astype(np.dtype('O'))
                create_options = {
                    "data": value,
                    "compression": dataset_compression,
                }
                if value.dtype == np.dtype('O'):
                    create_options["dtype"] = h5py.string_dtype()
                if dataset_name in group:
                    del group[dataset_name]
                group.create_dataset(dataset_name, **create_options)
            else:
                try:
                    dataset = group[dataset_name]
                except KeyError:
                    raise KeyError(
                        f"Dataset {dataset_name} does not exist for "
                        f"group {group_name} of {self}."
                    )
                if (attr_name in dataset.attrs) and not overwrite:
                    raise ValueError(
                        f"Attribute {attr_name} already exists in "
                        f"dataset {dataset_name} of group "
                        f"{group_name} of {self}."
                    )
                try:
                    dataset.attrs[attr_name] = value
                except TypeError:
                    dataset.attrs[attr_name] = str(value) # e.g. dicts
        hdf_file.attrs["last_updated"] = time.asctime()

Unit tests for this generic HDF class include:

* Creation and truncation of files with various access modes.
* Writing and reading data from the container.

In [60]:
#hide

import numpy as np
import unittest


def define_new_test_files(test_folder):
    '''
    Return the absolute paths of the three test containers
    (`test0.hdf`, `test1.hdf`, `test2.hdf`) inside `test_folder`,
    removing any of them that already exist as files on disk.
    '''
    fresh_paths = []
    for stem in ("test0", "test1", "test2"):
        full_path = os.path.abspath(
            os.path.join(test_folder, f"{stem}.hdf")
        )
        # Start every test run from a clean slate.
        if os.path.isfile(full_path):
            os.remove(full_path)
        fresh_paths.append(full_path)
    return fresh_paths

def test_hdf_file_creation(test_folder):
    '''
    Test creation, re-opening, renaming and comparison of HDF_Files
    that live inside `test_folder`.
    '''
    first_name, second_name, _ = define_new_test_files(test_folder)

    # Opening a file that does not exist yet has to fail.
    missing_file_raised = False
    try:
        f0 = HDF_File(first_name)
    except OSError:
        missing_file_raised = True
    assert missing_file_raised, "Non-existing file should raise an error"

    # A new file is always writable.
    f0 = HDF_File(first_name, is_new_file=True)
    assert not f0.is_read_only, "New files should never be read-only"
    del f0

    # Re-opening the created file defaults to read-only access.
    try:
        f0 = HDF_File(first_name)
    except OSError:
        assert False, "Newly created file should exist on disk"
    assert f0.is_read_only, "Existing files should be read-only"
    assert f0.file_name == first_name, "File name should match given file name"
    assert f0.original_file_name == first_name, "Original file name should match given file name"
    assert f0.version == VERSION_NO, "Versions should match"
    assert str(f0) == f"<HDF_File {first_name}>", "File name should match"
    del f0

    # A renamed file keeps its original file name as metadata.
    os.rename(first_name, second_name)
    f1 = HDF_File(second_name)
    assert f1.original_file_name != second_name, "Original file name should not match given file name"
    assert f1.directory == os.path.abspath(test_folder), "Directory should match"

    # Equality is determined by file name only.
    f0 = HDF_File(first_name, is_new_file=True)
    assert f0 != f1, "Different file names should be different HDF_Files"
    del f0
    f1_copy = HDF_File(second_name, is_read_only=False)
    assert f1 == f1_copy, "Same file names should be same HDF_Files"
    assert not f1_copy.is_read_only, "File should not be read-only"


def test_hdf_file_read_and_write(test_folder):
    '''
    Test writing and reading of groups, datasets and attributes
    with an HDF_File that lives inside `test_folder`.
    '''
    new_file_name = define_new_test_files(test_folder)[0]
    f0 = HDF_File(new_file_name, is_new_file=True)

    # Groups: reading fails until the group has been written.
    subgroup_missing = False
    try:
        f0.read(group_name="subgroup")
    except KeyError:
        subgroup_missing = True
    assert subgroup_missing, "subgroup should not exist"
    f0.write("subgroup")
    try:
        f0.read(group_name="subgroup")
    except KeyError:
        assert False, "Subgroup should exist"

    # Datasets: shape, dtype and contents must survive a round trip.
    z = np.random.random((100, 4))
    target = dict(group_name="subgroup", dataset_name="random")
    f0.write(z, **target)
    f0_copy = HDF_File(new_file_name)
    stored_shape = f0_copy.read(return_dataset_shape=True, **target)
    assert stored_shape == z.shape, "Shape of dataset is not correct"
    stored_dtype = f0_copy.read(return_dataset_dtype=True, **target)
    assert stored_dtype == z.dtype, "Type of dataset is not correct"
    stored_values = f0_copy.read(**target)
    assert np.all(stored_values == z), "Contents of dataset are not correct"

    # Datasets: overwriting requires the explicit flag.
    overwrite_refused = False
    try:
        f0.write(z, **target)
    except ValueError:
        overwrite_refused = True
    assert overwrite_refused, "Should not overwrite dataset"
    try:
        f0.write(z, overwrite=True, **target)
    except ValueError:
        assert False, "Should be able to overwrite dataset"

    # Attributes: numeric, string and list values.
    f0.write(4, attr_name="numeric_attr", **target)
    numeric_attr = f0.read(attr_name="numeric_attr", **target)
    assert numeric_attr == 4, "Attr shoud match"
    f0.write("test", attr_name="string_attr", **target)
    string_attr = f0.read(attr_name="string_attr", **target)
    assert string_attr == "test", "String attr shoud match"
    int_list = list(range(5))
    f0.write(int_list, attr_name="int_list_attr", **target)
    int_list_attr = f0.read(attr_name="int_list_attr", **target)
    assert np.all(int_list_attr == int_list), "Attr shoud match"

    # Attributes: overwriting requires the explicit flag.
    mixed_list =  ["test", "mixed", 2, 4.9]
    attr_overwrite_refused = False
    try:
        f0.write(mixed_list, attr_name="int_list_attr", **target)
    except ValueError:
        attr_overwrite_refused = True
    assert attr_overwrite_refused, "Should not be able to overwrite attr"
    f0.write(mixed_list, attr_name="int_list_attr", overwrite=True, **target)
    mixed_list_attr = f0.read(attr_name="int_list_attr", **target)
    assert np.all(mixed_list_attr == mixed_list), "Attr of mixed list should match"

    # Attributes: dicts are read back as their string representation.
    f0.write({"t": 1}, attr_name="dict_attr", overwrite=True, **target)
    dict_attr = f0.read(attr_name="dict_attr", **target)
    assert np.all(dict_attr == str({"t": 1})), "Attr shoud match"
    

    
# Run both HDF_File tests against the local "tmp" folder.
for run_test in (test_hdf_file_creation, test_hdf_file_read_and_write):
    run_test(test_folder="tmp")

The file name of <HDF_File /Users/swillems/Documents/software/alphapept/nbs/tmp/test1.hdf> has been changed from/Users/swillems/Documents/software/alphapept/nbs/tmp/test0.hdf to /Users/swillems/Documents/software/alphapept/nbs/tmp/test1.hdf.
No LOGGER has been defined, using normal print instead.
The file name of <HDF_File /Users/swillems/Documents/software/alphapept/nbs/tmp/test1.hdf> has been changed from/Users/swillems/Documents/software/alphapept/nbs/tmp/test0.hdf to /Users/swillems/Documents/software/alphapept/nbs/tmp/test1.hdf.
No LOGGER has been defined, using normal print instead.


In [None]:
#hide
# Export all cells marked with `#export` to the library module.
# Import only the name that is used instead of a star import,
# which would pollute the notebook namespace.
from nbdev.export import notebook2script
notebook2script()