# HDF5 structure

Every AlphaRaw HDF5 file should adhere to the following structure:

* /
    * *alpharaw_version*: str
    * *compression*: str
    * *chunk_size*: int
    * *creation_timestamp*: date
    * **raw**
        * *vendor*: str
        * *instrument_name*: str
        * *acquisition_mode*: str
        * *sample_name*: str
        * *original_raw_data_file_name*: path
        * *acquisition_timestamp*: date
        * *acquisition_software*: str
        * **reference_dimensions**
            * *number_of_dimensions*: int
            * rt_values: float64[:]
                * *dimension_index*: int8
                * *cyclic*: bool
            * mobility_values: float64[:]
                * *dimension_index*: int8
                * *cyclic*: bool
            * quad_mz_values: float64[:, 2]
                * *dimension_index*: int8
                * *cyclic*: bool   
        * **indices**
            * spectrum_offsets: int64[:]
            * dimension_order: int8[:]
        * **measurements**
            * *detector_event_count*: int64
            * intensity_indices: uint16[:]
                * *min*: float64
                * *max*: float64
                * *scale*: str 
            * mz_indices: uint32[:]
                * *min*: float64
                * *max*: float64
                * *scale*: str 
    * **centroided**
        * *parameters*: ?
    * **features**
        * *parameters*: ?
        
    

In [1]:
from typing import Callable, Optional

import numba
import numpy as np

def mz_index_to_value_factory(
    min_mz: float = 100.0,
    max_mz: float = 1700.0,
    length: int = 2**32,
    scale: str = "exponential",
    custom_array: Optional[np.ndarray] = None,
) -> Callable:
    """Build a vectorized function that maps m/z indices to m/z values.

    Index ``0`` maps to ``min_mz`` and index ``length - 1`` maps to
    ``max_mz``; indices in between are placed according to ``scale``.

    Parameters
    ----------
    min_mz : float
        Value returned for index ``0``.
    max_mz : float
        Value returned for index ``length - 1``.
    length : int
        Number of distinct indices. Must be at least 2 so that
        ``length - 1`` can be used as a divisor.
    scale : str
        Spacing of the values between ``min_mz`` and ``max_mz``:

        * ``"exponential"``, ``"exp"``, ``"expx"``: geometric spacing,
          ``min_mz * (max_mz / min_mz) ** (index / (length - 1))``.
        * ``"lin"``, ``"linear"``, ``"x"``: equidistant spacing.
        * ``"quadratic"``, ``"x2"``: equidistant spacing of the square
          roots, squared afterwards.
    custom_array : np.ndarray, optional
        Lookup table of values. When given, an index is resolved as
        ``custom_array[index]`` and all other arguments are ignored
        (``scale`` is not even validated).

    Returns
    -------
    Callable
        The mapping wrapped with ``numba.vectorize(nopython=True)``.
        It is called with a single index or an array of indices.

    Raises
    ------
    ValueError
        If ``scale`` is not one of the names above, if ``length`` is
        smaller than 2, or if the exponential scale is requested with a
        non-positive ``min_mz`` or ``max_mz``.
    """
    exponential_names = ("exponential", "exp", "expx")
    linear_names = ("lin", "linear", "x")
    quadratic_names = ("quadratic", "x2")

    if custom_array is not None:
        # A lookup table takes precedence over every analytic scale.
        def func(mz_index):
            return custom_array[mz_index]
    else:
        if scale not in exponential_names + linear_names + quadratic_names:
            raise ValueError(f"Scale {scale} is invalid")
        if length < 2:
            # Fail here instead of dividing by zero inside the compiled kernel.
            raise ValueError(f"Length {length} is invalid, it must be at least 2")
        # Largest valid index; computed once and shared by all closures.
        max_index = length - 1

        if scale in exponential_names:
            if min_mz <= 0 or max_mz <= 0:
                # The ratio max_mz / min_mz is raised to a fractional power.
                raise ValueError(
                    "Exponential scale requires positive min_mz and max_mz, "
                    f"got {min_mz} and {max_mz}"
                )

            def func(mz_index):
                relative_mz_index = mz_index / max_index
                return min_mz * (max_mz / min_mz)**(relative_mz_index)
        elif scale in linear_names:
            def func(mz_index):
                relative_mz_index = mz_index / max_index
                return min_mz + (relative_mz_index) * (max_mz - min_mz)
        else:
            def func(mz_index):
                relative_mz_index = mz_index / max_index
                return (
                    np.sqrt(min_mz)
                    + (np.sqrt(max_mz) - np.sqrt(min_mz)) * relative_mz_index
                )**2
    return numba.vectorize(nopython=True)(func)

In [2]:
# Linear scale with the factory defaults: evaluate the two lowest indices
# (0 and 1) and show both values together with their difference relative to
# their mean, multiplied by 10**6 (i.e. the spacing in ppm).
mz_index_to_value = mz_index_to_value_factory(scale="linear")
a = mz_index_to_value(0)
b = mz_index_to_value(1)
a, b, 2 * (b - a) / (a + b) * 10**6

(100.0, 100.00000037252903, 0.00372529029152302)

In [3]:
# Quadratic scale with the factory defaults: evaluate the two lowest indices
# (0 and 1) and show both values together with their difference relative to
# their mean, multiplied by 10**6 (i.e. the spacing in ppm).
mz_index_to_value = mz_index_to_value_factory(scale="quadratic")
a = mz_index_to_value(0)
b = mz_index_to_value(1)
a, b, 2 * (b - a) / (a + b) * 10**6

(100.0, 100.00000014543095, 0.0014543094540270048)

In [4]:
# Exponential scale with the factory defaults: evaluate the two lowest
# indices (0 and 1) and show both values together with their difference
# relative to their mean, multiplied by 10**6 (i.e. the spacing in ppm).
mz_index_to_value = mz_index_to_value_factory(scale="exponential")
a = mz_index_to_value(0)
b = mz_index_to_value(1)
a, b, 2 * (b - a) / (a + b) * 10**6

(100.0, 100.0000000659659, 0.0006596590649321786)

In [5]:
# Lookup-table variant: the values come from an explicit array of `size`
# evenly spaced points between 100 and 1700 instead of an analytic scale.
size = 400000
custom_array = np.linspace(100, 1700, size)
mz_index_to_value = mz_index_to_value_factory(custom_array=custom_array)
# Compare the first two table entries; the last element of the displayed
# tuple is their difference relative to their mean, multiplied by 10**6 (ppm).
a = mz_index_to_value(0)
b = mz_index_to_value(1)
a, b, 2 * (b - a) / (a + b) * 10**6

(100.0, 100.00400001000003, 39.999300012270105)

In [6]:
# Linear scale with the factory defaults: evaluate the two highest indices of
# the default 2**32-long range, passed as one uint32 array, and show both
# values together with their spacing in ppm (difference over mean times 10**6).
mz_index_to_value = mz_index_to_value_factory(scale="linear")
a, b = mz_index_to_value(np.array([2**32 - 2, 2**32 - 1], dtype=np.uint32))
a, b, 2 * (b - a) / (a + b) * 10**6

(1699.999999627471, 1700.0, 0.00021913472346294616)

In [7]:
# Quadratic scale with the factory defaults: evaluate the two highest indices
# of the default 2**32-long range, passed as one uint32 array, and show both
# values together with their spacing in ppm (difference over mean times 10**6).
mz_index_to_value = mz_index_to_value_factory(scale="quadratic")
a, b = mz_index_to_value(np.array([2**32 - 2, 2**32 - 1], dtype=np.uint32))
a, b, 2 * (b - a) / (a + b) * 10**6

(1699.999999400373, 1700.0000000000002, 0.0003527218402944335)

In [8]:
# Exponential scale with the factory defaults: evaluate the two highest
# indices of the default 2**32-long range, passed as one uint32 array, and
# show both values together with their spacing in ppm (difference over mean
# times 10**6).
mz_index_to_value = mz_index_to_value_factory(scale="exponential")
a, b = mz_index_to_value(np.array([2**32 - 2, 2**32 - 1], dtype=np.uint32))
a, b, 2 * (b - a) / (a + b) * 10**6

(1699.9999988785798, 1700.0, 0.0006596589148994551)

In [9]:
# Lookup-table variant: the values come from an explicit array of `size`
# evenly spaced points between 100 and 1700 instead of an analytic scale.
size = 400000
custom_array = np.linspace(100, 1700, size)
mz_index_to_value = mz_index_to_value_factory(custom_array=custom_array)
# Compare the last two table entries (indices size - 2 and size - 1, passed as
# one uint32 array); the last element of the displayed tuple is their
# difference relative to their mean, multiplied by 10**6 (ppm).
a, b = mz_index_to_value(np.array([size - 2, size - 1], dtype=np.uint32))
a, b, 2 * (b - a) / (a + b) * 10**6

(1699.99599999, 1700.0, 2.352949827022618)