In [None]:
# default_exp io

# Input / Output

> Functions related to input and output

This notebook contains all functions related to importing and exporting files. To access proprietary data formats, we have import functions to access `Bruker` and `Thermo` data.

# Storing and accessing MS data

As MS hardware has continued to improve over the years, MS data has become more complex. To deal with this complexity, the MS community has already used many different [data formats](https://onlinelibrary.wiley.com/doi/full/10.1002/mas.21522) to store and access this data. [HDF](https://www.hdfgroup.org/solutions/hdf5/) containers are one option, but they have not yet gained widespread support.

## HDF containers
In general, an HDF container can be viewed as a compressed folder with metadata (i.e. attributes) associated with each subfolder or file (i.e. data arrays of various types and sizes) within this container. A container might, for instance, have contents that look like this:
```
HDF_Container
    {
        meta_data_1: "Some string",
        meta_data_2: 1234567890,
        ...
    }
    array_1
        {
            meta_data_of_array1_1: "Some other string",
            ...
        },
        100x2 int8
    array_2
        1000x2 float64
    subfolder_1
        {
            meta_data_of_subfolder_1_1: "Really any string of any length",
            ...
        }
        array_1_of_subfolder_1
        subfolder_1_1
        ...
    subfolder_n
    ...
```


A few of the advantages of HDF are e.g.:

* It has no upper limit with regards to file size
* It can be used on disk without consuming RAM memory
* It is fully portable on different machines and systems
* It is capable of fast IO operations
* It allows data to be structured very transparently, while still providing a flexible way to store metadata
* ...

For these reasons, HDF containers have gained popularity in several scientific fields, including (astro)physics and geology. It is therefore no surprise that Python has excellent support for HDF containers. The two most used packages are `h5py` and `tables`, where the former has a generic API and the latter is frequently used with `pandas` dataframes. An excellent viewer for HDF files is [HDF Compass](https://support.hdfgroup.org/projects/compass/).

### Using HDF containers for MS data
We will use `h5py` to store MS data in HDF containers, inspired by the [ion_networks](https://github.com/swillems/ion_networks) repository.

* First we define a generic class that will serve as an API for HDF containers. To ensure full transparency, we will include immutable metadata such as `creation_time`, `original_file_name` and `version`.
* The constructor of an HDF_File will be passed the `file_name` of an HDF container, an `is_read_only` flag and `is_new_file` flag.
* To compare HDF_Files, several (magic) functions need to be defined.
* Traceability and reproducibility are ensured by storing a `last_updated` attribute and by providing a `check` function that warns users about potential compatibility issues.

In [None]:
#export

import h5py
import os
import time
from alphapept.__main__ import VERSION_NO


class HDF_File(object):
    '''
    A generic class to store and retrieve on-disk
    data with an HDF container.

    The root group of every container created through this class carries
    the attributes `creation_time`, `original_file_name`, `version` and
    `last_updated`. They are exposed as read-only properties that go
    through `read`, which is attached to this class further down in the
    file.
    '''

    @property
    def original_file_name(self):
        # Absolute path the container had when it was created.
        return self.read(
            attr_name="original_file_name"
        )  # See below for function definition

    @property
    def file_name(self):
        # Absolute path this object was opened with.
        return self.__file_name

    @property
    def directory(self):
        return os.path.dirname(self.file_name)

    @property
    def creation_time(self):
        return self.read(
            attr_name="creation_time"
        )  # See below for function definition

    @property
    def last_updated(self):
        return self.read(
            attr_name="last_updated"
        )  # See below for function definition

    @property
    def version(self):
        return self.read(
            attr_name="version"
        )  # See below for function definition

    @property
    def is_read_only(self):
        return self.__is_read_only

    def __init__(
        self,
        file_name:str,
        is_read_only:bool=True,
        is_new_file:bool=False,
    ):
        '''
        Open an existing HDF container or create a new one.

        If `is_new_file` is True, the file is created (truncating any
        existing file of that name), missing parent directories are made
        and `is_read_only` is forced to False. Otherwise the file has to
        exist (h5py raises an OSError if it cannot be opened) and `check`
        is run to warn about version or file name changes.
        '''
        self.__file_name = os.path.abspath(file_name)
        if is_new_file:
            is_read_only = False
            if not os.path.exists(self.directory):
                os.makedirs(self.directory)
            with h5py.File(self.file_name, "w") as hdf_file:
                current_time = time.asctime()
                hdf_file.attrs["creation_time"] = current_time
                hdf_file.attrs["original_file_name"] = self.__file_name
                hdf_file.attrs["version"] = VERSION_NO
                hdf_file.attrs["last_updated"] = current_time
        else:
            # Opening in read mode first makes a missing or unreadable file
            # fail here, before any metadata is inspected.
            with h5py.File(self.file_name, "r") as hdf_file:
                self.check()
        self.__is_read_only = is_read_only

    def __eq__(self, other):
        # Two HDF_Files are equal when they point to the same absolute path.
        # Returning NotImplemented for foreign types lets Python fall back to
        # its default comparison instead of raising an AttributeError.
        if not isinstance(other, HDF_File):
            return NotImplemented
        return self.file_name == other.file_name

    def __hash__(self):
        return hash(self.file_name)

    def __str__(self):
        return f"<HDF_File {self.file_name}>"

    def __repr__(self):
        return str(self)

    def check(
        self,
        version:bool=True,
        file_name:bool=True,
    ):
        '''
        Check if the `version` or `file_name` of this HDF_File have changed
        since the container was created, and emit a warning if so.

        `version` and `file_name` toggle the respective checks. The version
        check compares against the global VERSION_NO. Warnings are sent to
        a global LOGGER if one is defined; otherwise they are printed,
        together with a note that no LOGGER was found.
        '''
        warning_messages = []
        if version:
            current_version = VERSION_NO
            creation_version = self.version
            if creation_version != current_version:
                warning_messages.append(
                    f"{self} was created with version "
                    f"{creation_version} instead of {current_version}."
                )
        if file_name:
            if self.file_name != self.original_file_name:
                warning_messages.append(
                    f"The file name of {self} has been changed from "
                    f"{self.original_file_name} to {self.file_name}."
                )
        if len(warning_messages) != 0:
            try:
                printer = LOGGER.warning
            except NameError:
                printer = print
                warning_messages.append(
                    "No LOGGER has been defined, using normal print instead."
                )
            printer("\n".join(warning_messages))

Contents of HDF containers come in three variants:

1. `Groups`: folders
2. `Datasets`: arrays
3. `Attributes`: metadata associated to individual datasets or groups (with the root folder also considered as a normal group)

These contents can be accessed with `read` and `write` functions.

In [None]:
#export

import pandas as pd
from fastcore.foundation import patch


@patch
def read(
    self:HDF_File,
    group_name:str=None,
    dataset_name:str=None,
    attr_name:str=None,
    return_dataset_shape:bool=False,
    return_dataset_dtype:bool=False,
    return_dataset_slice:slice=slice(None),
):
    '''
    Read the contents of an HDF_File. If no `group_name` has been provided,
    read directly from the root group. If no `dataset_name` has been provided,
    read directly from the group. If `attr_name` is not None,
    read the attribute value instead of the contents of a group or dataset.
    If `attr_name` == "", read all attributes as a dict.
    The options `return_dataset_shape`, `return_dataset_dtype` and
    `return_dataset_slice` allow to minimize IO and RAM usage by reading
    datasets only partially.

    Returns, depending on the arguments: the sorted member names of a group,
    a single attribute value, a dict of all attributes, or the shape, dtype
    or (sliced) contents of a dataset.

    Raises a KeyError if the requested group, dataset or attribute does not
    exist, and a NotImplementedError if `dataset_name` refers to something
    that is not a dataset.
    '''
    with h5py.File(self.file_name, "r") as hdf_file:
        if group_name is None:
            group = hdf_file
            group_name = "/"
        else:
            try:
                group = hdf_file[group_name]
            except KeyError:
                raise KeyError(
                    f"Group {group_name} does not exist in {self}."
                )
        if dataset_name is None:
            if attr_name is None:
                # No dataset and no attribute: list the members of the group.
                return sorted(group)
            elif attr_name != "":
                try:
                    return group.attrs[attr_name]
                except KeyError:
                    raise KeyError(
                        f"Attribute {attr_name} does not exist for "
                        f"group {group_name} of {self}."
                    )
            else:
                return dict(group.attrs)
        else:
            try:
                dataset = group[dataset_name]
            except KeyError:
                raise KeyError(
                    f"Dataset {dataset_name} does not exist for "
                    f"group {group_name} of {self}."
                )
            if attr_name is None:
                if isinstance(dataset, h5py.Dataset):
                    # Shape takes precedence over dtype, which takes
                    # precedence over reading the actual values.
                    if return_dataset_shape:
                        return dataset.shape
                    elif return_dataset_dtype:
                        return dataset.dtype
                    else:
                        return dataset[return_dataset_slice]
                else:
                    raise NotImplementedError(
                        "Use group as pandas dataframe container?"
                    )
            elif attr_name != "":
                try:
                    return dataset.attrs[attr_name]
                except KeyError:
                    raise KeyError(
                        f"Attribute {attr_name} does not exist for "
                        f"dataset {dataset_name} of group "
                        f"{group_name} of {self}."
                    )
            else:
                return dict(dataset.attrs)


@patch
def write(
    self:HDF_File,
    value,
    group_name:str=None,
    dataset_name:str=None,
    attr_name:str=None,
    overwrite:bool=False,
    dataset_compression=None
):
    '''
    Write a `value` to an HDF_File. If an `attr_name` is provided,
    `value` will be stored for this attribute.
    If no `group_name` is provided, write directly to the root group.
    If no `dataset_name` is provided, create a new group with `value`
    as name. If a `dataset_name` is provided, a `dataset_compression`
    can be defined to minimize disk usage, at the cost of slower IO.
    If the `overwrite` flag is True, overwrite the given attribute
    or dataset and truncate groups.

    Attribute values that h5py rejects with a TypeError are stored as
    their `str` representation. Arrays of strings are stored with a
    variable-length string dtype. After a successful write, the root
    attribute `last_updated` is refreshed.

    Raises an IOError if this HDF_File is read-only, a KeyError if the
    target group or dataset does not exist, a ValueError if the target
    already exists and `overwrite` is False, and a NotImplementedError
    if `value` is a pandas DataFrame that should be stored as a dataset.
    '''
    if self.is_read_only:
        raise IOError(
            f"Trying to write to {self}, which is read_only."
        )
    with h5py.File(self.file_name, "a") as hdf_file:
        if group_name is None:
            group = hdf_file
            group_name = "/"
        else:
            try:
                group = hdf_file[group_name]
            except KeyError:
                raise KeyError(
                    f"Group {group_name} does not exist in {self}."
                )
        if dataset_name is None:
            if attr_name is None:
                if value in group:
                    if overwrite:
                        del group[value]
                    else:
                        raise ValueError(
                            f"New group {value} already exists in group "
                            f"{group_name} of {self}."
                        )
                group.create_group(value)
            else:
                if (attr_name in group.attrs) and not overwrite:
                    raise ValueError(
                        f"Attribute {attr_name} already exists in group "
                        f"{group_name} of {self}."
                    )
                try:
                    group.attrs[attr_name] = value
                except TypeError:
                    group.attrs[attr_name] = str(value)
        else:
            if attr_name is None:
                dataset_exists = dataset_name in group
                if dataset_exists and not overwrite:
                    raise ValueError(
                        f"Dataset {dataset_name} already exists in group "
                        f"{group_name} of {self}."
                    )
                # Validate and prepare the new value BEFORE deleting the old
                # dataset, so that an unsupported value cannot destroy
                # existing data when `overwrite` is True.
                if isinstance(value, pd.core.frame.DataFrame):
                    raise NotImplementedError(
                        "Use group as pandas dataframe container?"
                    )
                if value.dtype.type == np.str_:
                    value = value.astype(np.dtype('O'))
                create_kwargs = {}
                if value.dtype == np.dtype('O'):
                    # Object arrays are stored as variable-length strings.
                    create_kwargs["dtype"] = h5py.string_dtype()
                if dataset_exists:
                    del group[dataset_name]
                group.create_dataset(
                    dataset_name,
                    data=value,
                    compression=dataset_compression,
                    **create_kwargs
                )
            else:
                try:
                    dataset = group[dataset_name]
                except KeyError:
                    raise KeyError(
                        f"Dataset {dataset_name} does not exist for "
                        f"group {group_name} of {self}."
                    )
                if (attr_name in dataset.attrs) and not overwrite:
                    raise ValueError(
                        f"Attribute {attr_name} already exists in "
                        f"dataset {dataset_name} of group "
                        f"{group_name} of {self}."
                    )
                try:
                    dataset.attrs[attr_name] = value
                except TypeError:
                    dataset.attrs[attr_name] = str(value) # e.g. dicts
        hdf_file.attrs["last_updated"] = time.asctime()

Unit tests for this generic HDF class include:

* Creation and truncation of file with various access.
* Writing and reading data from the container.

In [None]:
#hide

import numpy as np
import unittest


def define_new_test_files(test_folder):
    '''
    Return the absolute paths of the three HDF test files
    (`test0.hdf`, `test1.hdf`, `test2.hdf`) inside `test_folder`,
    removing any of them that already exist on disk.
    '''
    test_file_names = []
    for stem in ("test0", "test1", "test2"):
        relative_path = os.path.join(test_folder, f"{stem}.hdf")
        test_file_names.append(os.path.abspath(relative_path))
    # Start every test run from a clean slate.
    for stale_file in filter(os.path.isfile, test_file_names):
        os.remove(stale_file)
    return test_file_names

def test_hdf_file_creation(test_folder):
    '''
    Test creating, re-opening, renaming and comparing HDF_Files
    located in `test_folder`.
    '''
    path0, path1, _ = define_new_test_files(test_folder)

    # A file that does not exist yet cannot be opened.
    missing_file_rejected = False
    try:
        HDF_File(path0)
    except OSError:
        missing_file_rejected = True
    assert missing_file_rejected, "Non-existing file should raise an error"

    f0 = HDF_File(path0, is_new_file=True)
    assert f0.is_read_only == False, "New files should never be read-only"
    del f0

    # Once created, the file can be re-opened and its metadata is intact.
    reopened = None
    try:
        reopened = HDF_File(path0)
    except OSError:
        pass
    assert reopened is not None, "Newly created file should exist on disk"
    assert reopened.is_read_only == True, "Existing files should be read-only"
    assert reopened.file_name == path0, "File name should match given file name"
    assert reopened.original_file_name == path0, "Original file name should match given file name"
    assert reopened.version == VERSION_NO, "Versions should match"
    assert str(reopened) == f"<HDF_File {path0}>", "File name should match"
    del reopened

    # Renaming on disk keeps the original name stored inside the container.
    os.rename(path0, path1)
    f1 = HDF_File(path1)
    assert f1.original_file_name != path1, "Original file name should not match given file name"
    assert f1.directory == os.path.abspath(test_folder), "Directory should match"

    # Equality is decided by file name.
    f0 = HDF_File(path0, is_new_file=True)
    assert f0 != f1, "Different file names should be different HDF_Files"
    del f0
    f1_copy = HDF_File(path1, is_read_only=False)
    assert f1 == f1_copy, "Same file names should be same HDF_Files"
    assert f1_copy.is_read_only == False, "File should not be read-only"


def test_hdf_file_read_and_write(test_folder):
    '''
    Test writing groups, datasets and attributes to an HDF_File
    located in `test_folder`, and reading them back.
    '''
    container_path = define_new_test_files(test_folder)[0]
    f0 = HDF_File(container_path, is_new_file=True)
    # Keyword arguments that address the dataset used throughout this test.
    target = dict(group_name="subgroup", dataset_name="random")

    # Groups
    subgroup_missing = False
    try:
        f0.read(group_name="subgroup")
    except KeyError:
        subgroup_missing = True
    assert subgroup_missing, "subgroup should not exist"
    f0.write("subgroup")
    subgroup_found = True
    try:
        f0.read(group_name="subgroup")
    except KeyError:
        subgroup_found = False
    assert subgroup_found, "Subgroup should exist"

    # Datasets
    z = np.random.random((100, 4))
    f0.write(z, **target)
    f0_copy = HDF_File(container_path)
    stored_shape = f0_copy.read(return_dataset_shape=True, **target)
    assert stored_shape == z.shape, "Shape of dataset is not correct"
    stored_dtype = f0_copy.read(return_dataset_dtype=True, **target)
    assert stored_dtype == z.dtype, "Type of dataset is not correct"
    assert np.all(
        f0_copy.read(**target) == z
    ), "Contents of dataset are not correct"
    overwrite_refused = False
    try:
        f0.write(z, **target)
    except ValueError:
        overwrite_refused = True
    assert overwrite_refused, "Should not overwrite dataset"
    overwrite_allowed = True
    try:
        f0.write(z, overwrite=True, **target)
    except ValueError:
        overwrite_allowed = False
    assert overwrite_allowed, "Should be able to overwrite dataset"

    # Attributes of a dataset
    f0.write(4, attr_name="numeric_attr", **target)
    assert f0.read(attr_name="numeric_attr", **target) == 4, "Attr shoud match"
    f0.write("test", attr_name="string_attr", **target)
    assert f0.read(
        attr_name="string_attr", **target
    ) == "test", "String attr shoud match"
    f0.write(list(range(5)), attr_name="int_list_attr", **target)
    assert np.all(
        f0.read(attr_name="int_list_attr", **target) == list(range(5))
    ), "Attr shoud match"
    mixed_list =  ["test", "mixed", 2, 4.9]
    attr_overwrite_refused = False
    try:
        f0.write(mixed_list, attr_name="int_list_attr", **target)
    except ValueError:
        attr_overwrite_refused = True
    assert attr_overwrite_refused, "Should not be able to overwrite attr"
    f0.write(mixed_list, attr_name="int_list_attr", overwrite=True, **target)
    assert np.all(
        f0.read(attr_name="int_list_attr", **target) == mixed_list
    ), "Attr of mixed list should match"
    f0.write({"t": 1}, attr_name="dict_attr", overwrite=True, **target)
    assert np.all(
        f0.read(attr_name="dict_attr", **target) == str({"t": 1})
    ), "Attr shoud match"
    

    
# Run the HDF unit tests; the test files are created in a "tmp" folder
# relative to the current working directory.
test_hdf_file_creation(test_folder="tmp")
test_hdf_file_read_and_write(test_folder="tmp")

The file name of <HDF_File /Users/swillems/Documents/software/alphapept/nbs/tmp/test1.hdf> has been changed from/Users/swillems/Documents/software/alphapept/nbs/tmp/test0.hdf to /Users/swillems/Documents/software/alphapept/nbs/tmp/test1.hdf.
No LOGGER has been defined, using normal print instead.
The file name of <HDF_File /Users/swillems/Documents/software/alphapept/nbs/tmp/test1.hdf> has been changed from/Users/swillems/Documents/software/alphapept/nbs/tmp/test0.hdf to /Users/swillems/Documents/software/alphapept/nbs/tmp/test1.hdf.
No LOGGER has been defined, using normal print instead.


## Conversion functions

`get_most_abundant`: In order to save spectra in a more memory efficient form, we only keep the n most abundant peaks. This allows us to save data in a fast accessible matrix format. 

In [None]:
#export
from alphapept.chem import calculate_mass

  @jitclass(spec)
  @jitclass(spec)


In [None]:
#export
from tqdm import tqdm
import numpy as np
from numba.typed import List
from numba import njit
import gzip
import sys
import os
import logging


def get_most_abundant(mass, intensity, n_max):
    """
    Keep only the `n_max` most intense peaks of a spectrum.

    `mass` and `intensity` are parallel arrays. If the spectrum has fewer
    than `n_max` peaks, both are returned unchanged. Otherwise the peaks
    with the highest intensities are selected and returned in their
    original order.
    """
    if len(mass) >= n_max:
        # Rank peaks by descending intensity, keep the top n_max, then
        # restore the original peak order by sorting the kept indices.
        keep = np.sort(np.argsort(intensity)[::-1][:n_max])
        return mass[keep], intensity[keep]

    return mass, intensity

## Reading Thermo Files

> The current implementation uses a lot of lists and fills them with list comprehensions. This creates a lot of variables but seems to work reasonably fast. This code could be refactored as all variables end up in a dictionary-type container anyhow.

In [None]:
#export
def load_thermo_raw(raw_file, most_abundant, callback=None, **kwargs):
    """
    Load a Thermo raw file with `RawFileReader` and extract its spectra.

    Every scan between the first and last spectrum number is read. MS2
    scans are reduced to their `most_abundant` most intense peaks. If a
    `callback` is given, it is called after each scan with the fraction
    of scans processed so far. Additional `kwargs` are accepted but unused.

    Returns a dictionary with MS1 entries (`*_ms1`) and MS2 entries
    (`*_ms2`, `prec_mass_list2`, `mono_mzs2`, `charge2`).

    NOTE(review): `np.array` is applied to the lists of per-scan MS1 arrays.
    If scans differ in length this relies on NumPy's ragged-array handling,
    which recent NumPy releases reject unless `dtype=object` is given -
    confirm against the supported NumPy version.
    """

    from .pyrawfilereader import RawFileReader
    rawfile = RawFileReader(raw_file)

    spec_indices = np.array(
        range(rawfile.FirstSpectrumNumber, rawfile.LastSpectrumNumber + 1)
    )
    n_spectra = len(spec_indices)

    scan_list, rt_list, mass_list, int_list = [], [], [], []
    ms_list, mono_mzs_list, charge_list = [], [], []
    prec_mzs_list = []  # collected, but not part of the returned dictionary

    for position, scan_number in enumerate(spec_indices, start=1):
        ms_order = rawfile.GetMSOrderForScanNum(scan_number)
        rt = rawfile.RTFromScanNum(scan_number)

        prec_mz = rawfile.GetPrecursorMassForScanNum(scan_number, 0)

        trailer_extra = rawfile.GetTrailerExtraForScanNum(scan_number)
        mono_mz = float(trailer_extra["Monoisotopic M/Z:"])
        charge = int(trailer_extra["Charge State:"])

        # TODO: centroid data for MS2 and profile data for MS1 may be better.
        masses, intensity = rawfile.GetCentroidMassListFromScanNum(scan_number)

        if ms_order == 2:
            masses, intensity = get_most_abundant(masses, intensity, most_abundant)

        scan_list.append(scan_number)
        rt_list.append(rt)
        mass_list.append(np.array(masses))
        int_list.append(np.array(intensity, dtype=np.int64))
        ms_list.append(ms_order)
        prec_mzs_list.append(prec_mz)
        mono_mzs_list.append(mono_mz)
        charge_list.append(charge)

        if callback:
            callback(position / n_spectra)

    # Positions of the MS1 and MS2 scans within the per-scan lists.
    ms1_positions = [pos for pos, order in enumerate(ms_list) if order == 1]
    ms2_positions = [pos for pos, order in enumerate(ms_list) if order == 2]

    def select(values, positions):
        return [values[pos] for pos in positions]

    scan_list_ms1 = select(scan_list, ms1_positions)
    rt_list_ms1 = select(rt_list, ms1_positions)
    mass_list_ms1 = select(mass_list, ms1_positions)
    int_list_ms1 = select(int_list, ms1_positions)
    ms_list_ms1 = select(ms_list, ms1_positions)

    scan_list_ms2 = select(scan_list, ms2_positions)
    rt_list_ms2 = select(rt_list, ms2_positions)
    mass_list_ms2 = select(mass_list, ms2_positions)
    int_list_ms2 = select(int_list, ms2_positions)
    ms_list_ms2 = select(ms_list, ms2_positions)
    mono_mzs2 = select(mono_mzs_list, ms2_positions)
    charge2 = select(charge_list, ms2_positions)

    prec_mass_list2 = [
        calculate_mass(mono_mzs_list[pos], charge_list[pos])
        for pos in ms2_positions
    ]

    check_sanity(mass_list)

    # The MS2 peak lists stay plain Python lists of arrays.
    query_data = {
        "scan_list_ms1": np.array(scan_list_ms1),
        "rt_list_ms1": np.array(rt_list_ms1),
        "mass_list_ms1": np.array(mass_list_ms1),
        "int_list_ms1": np.array(int_list_ms1),
        "ms_list_ms1": np.array(ms_list_ms1),
        "scan_list_ms2": np.array(scan_list_ms2),
        "rt_list_ms2": np.array(rt_list_ms2),
        "mass_list_ms2": mass_list_ms2,
        "int_list_ms2": int_list_ms2,
        "ms_list_ms2": np.array(ms_list_ms2),
        "prec_mass_list2": np.array(prec_mass_list2),
        "mono_mzs2": np.array(mono_mzs2),
        "charge2": np.array(charge2),
    }

    return query_data

In [None]:
#hide
# TODO: Is there any really small (<20MB) raw files that we can upload and test, instead of our local raw files?

In [None]:
#export
def load_thermo_raw_MSFileReader(raw_file, most_abundant, callback=None, **kwargs):
    """
    Load a Thermo raw file with `pymsfilereader.MSFileReader` and extract
    its spectra.

    For each scan the label data is used when available; otherwise the
    plain mass list is read. MS2 scans are reduced to their `most_abundant`
    most intense peaks. If a `callback` is given, it is called after each
    scan with the fraction of scans processed so far. Additional `kwargs`
    are accepted but unused.

    Returns a dictionary with MS1 entries (`*_ms1`) and MS2 entries
    (`*_ms2`, `prec_mass_list2`, `mono_mzs2`, `charge2`).
    """

    from pymsfilereader import MSFileReader
    rawfile = MSFileReader(raw_file)

    spec_indices = np.array(
        range(rawfile.FirstSpectrumNumber, rawfile.LastSpectrumNumber + 1)
    )
    total_scans = len(spec_indices)

    # One entry per scan, filled in the loop below.
    per_scan = {
        "scan": [], "rt": [], "mass": [], "int": [],
        "ms": [], "prec_mz": [], "mono_mz": [], "charge": [],
    }

    for done, scan_no in enumerate(spec_indices, start=1):
        ms_order = rawfile.GetMSOrderForScanNum(scan_no)
        rt = rawfile.RTFromScanNum(scan_no)

        prec_mz = rawfile.GetPrecursorMassForScanNum(scan_no, 2)

        trailer_extra = rawfile.GetTrailerExtraForScanNum(scan_no)
        mono_mz = trailer_extra["Monoisotopic M/Z"]
        charge = trailer_extra["Charge State"]

        label_data = rawfile.GetLabelData(scan_no)

        # Fall back to the mass list when no label data is available.
        # Todo: check for centroided or not
        if label_data[0][0] == ():
            mlist = rawfile.GetMassListFromScanNum(scan_no)
            masses = np.array(mlist[0][0])
            intensity = np.array(mlist[0][1])
        else:
            intensity = np.array(label_data[0][1])
            masses = np.array(label_data[0][0])

        if ms_order == 2:
            masses, intensity = get_most_abundant(masses, intensity, most_abundant)

        per_scan["scan"].append(scan_no)
        per_scan["rt"].append(rt)
        per_scan["mass"].append(np.array(masses))
        per_scan["int"].append(np.array(intensity, dtype=np.int64))
        per_scan["ms"].append(ms_order)
        per_scan["prec_mz"].append(prec_mz)
        per_scan["mono_mz"].append(mono_mz)
        per_scan["charge"].append(charge)

        if callback:
            callback(done / total_scans)

    ms_list = per_scan["ms"]
    mass_list = per_scan["mass"]

    def of_order(field, wanted_order):
        # Entries of `field` belonging to scans of the requested MS order.
        values = per_scan[field]
        return [
            values[pos] for pos, order in enumerate(ms_list)
            if order == wanted_order
        ]

    scan_list_ms1 = of_order("scan", 1)
    rt_list_ms1 = of_order("rt", 1)
    mass_list_ms1 = of_order("mass", 1)
    int_list_ms1 = of_order("int", 1)
    ms_list_ms1 = of_order("ms", 1)

    scan_list_ms2 = of_order("scan", 2)
    rt_list_ms2 = of_order("rt", 2)
    mass_list_ms2 = of_order("mass", 2)
    int_list_ms2 = of_order("int", 2)
    ms_list_ms2 = of_order("ms", 2)
    mono_mzs2 = of_order("mono_mz", 2)
    charge2 = of_order("charge", 2)

    prec_mass_list2 = [
        calculate_mass(mono_mz, charge)
        for mono_mz, charge in zip(mono_mzs2, charge2)
    ]

    check_sanity(mass_list)

    query_data = {}

    # MS1 entries
    query_data["scan_list_ms1"] = np.array(scan_list_ms1)
    query_data["rt_list_ms1"] = np.array(rt_list_ms1)
    query_data["mass_list_ms1"] = np.array(mass_list_ms1)
    query_data["int_list_ms1"] = np.array(int_list_ms1)
    query_data["ms_list_ms1"] = np.array(ms_list_ms1)

    # MS2 entries; the peak lists stay plain Python lists of arrays.
    query_data["scan_list_ms2"] = np.array(scan_list_ms2)
    query_data["rt_list_ms2"] = np.array(rt_list_ms2)
    query_data["mass_list_ms2"] = mass_list_ms2
    query_data["int_list_ms2"] = int_list_ms2
    query_data["ms_list_ms2"] = np.array(ms_list_ms2)
    query_data["prec_mass_list2"] = np.array(prec_mass_list2)
    query_data["mono_mzs2"] = np.array(mono_mzs2)
    query_data["charge2"] = np.array(charge2)

    return query_data


## Wrapper

We use a `multiprocessing` `Pool` to be able to convert multiple raw files to `npz` in parallel.

In [None]:
#export

def raw_to_npz(to_process, callback = None):
    """
    Convert a single raw file to npz.

    `to_process` is a `(path, settings)` tuple. Files ending in `.raw`
    are read as Thermo data and files ending in `.d` as Bruker data,
    using the options in `settings['raw']`. The result is saved next to
    the input with the extension replaced by `.npz`.

    Raises a NotImplementedError for any other file extension.
    """

    path, settings = to_process

    base, ext = os.path.splitext(path)
    normalized_ext = ext.lower()

    if normalized_ext not in ('.raw', '.d'):
        raise NotImplementedError('File extension {} not understood.'.format(ext))

    if normalized_ext == '.raw':
        logging.info('File {} has extension {} - converting from Thermo.'.format(base, ext))
        query_data = load_thermo_raw(path, callback=callback, **settings['raw'])
    else:
        logging.info('File {} has extension {} - converting from Bruker.'.format(base, ext))
        query_data = load_bruker_raw(path, callback=callback, **settings['raw'])

    n_precursors = len(query_data['prec_mass_list2'])
    logging.info('File conversion complete. Extracted {:,} precursors.'.format(n_precursors))

    save_path = base + ".npz"
    save_query_as_npz(save_path, query_data)
    logging.info('Converted file saved to {}'.format(save_path))
    

from multiprocessing import Pool

def raw_to_npz_parallel(path_list, settings, callback=None):
    """
    Convert all files in `path_list` to npz.

    A single file is converted in the current process and `callback` is
    handed to the conversion itself. Otherwise the files are distributed
    over a pool of `settings['general']['n_processes']` workers and
    `callback`, if given, is called with the fraction of finished files.
    """

    n_processes = settings['general']['n_processes']

    jobs = [(path, settings) for path in path_list]
    n_jobs = len(jobs)

    if n_jobs == 1:
        raw_to_npz(jobs[0], callback=callback)
        return

    with Pool(n_processes) as pool:
        finished_jobs = enumerate(pool.imap_unordered(raw_to_npz, jobs), start=1)
        for n_finished, _ in finished_jobs:
            if callback:
                callback(n_finished / n_jobs)

## Bruker

For accessing Bruker files, we rely on the external `timsdata` library. 
For `ccs` values, we need some functions from this library. As the live feature-finder might not be able to determine some charge values, it is intended to perform this calculation at a later stage once we have charge values from the post-processing feature finder. 

In [None]:
#export
def load_bruker_raw(raw_file, most_abundant, callback=None, **kwargs):
    """
    Load a Bruker raw folder and extract its MS2 spectra.

    Precursor and frame tables are read from `analysis.tdf` inside
    `raw_file`; the spectra themselves are read through `timsdata`.
    Precursors are processed in order of ascending mass, and every
    spectrum is reduced to its `most_abundant` most intense peaks.
    If a `callback` is given, it is called after each precursor with
    the fraction processed so far. Additional `kwargs` are accepted
    but unused.

    Returns a dictionary with precursor-level arrays and the MS2 peak lists.
    """
    import sqlalchemy as db
    import pandas as pd

    from alphapept.ext.bruker import timsdata

    tdf_path = os.path.join(raw_file, 'analysis.tdf')
    engine = db.create_engine('sqlite:///{}'.format(tdf_path))
    prec_data = pd.read_sql_table('Precursors', engine)
    frame_data = pd.read_sql_table('Frames', engine)
    frame_data = frame_data.set_index('Id')

    from alphapept.constants import mass_dict

    tims_reader = timsdata.TimsData(raw_file)

    M_PROTON = mass_dict['Proton']

    # Neutral mass from monoisotopic m/z and charge.
    prec_data['Mass'] = prec_data['MonoisotopicMz'].values * prec_data['Charge'].values - prec_data['Charge'].values*M_PROTON

    from alphapept.io import list_to_numpy_f32, get_most_abundant

    prec_data = prec_data.sort_values(by='Mass', ascending=True)
    precursor_ids = prec_data['Id'].tolist()
    n_precursors = len(precursor_ids)

    mass_list_ms2, int_list_ms2 = [], []
    scan_list_ms2 = []  # precursor ids; not part of the returned dictionary

    for n_done, precursor_id in enumerate(precursor_ids, start=1):
        spectra = tims_reader.readPasefMsMs([precursor_id])
        masses, intensity = spectra[precursor_id]
        masses, intensity = get_most_abundant(
            np.array(masses), np.array(intensity), most_abundant
        )

        mass_list_ms2.append(masses)
        int_list_ms2.append(intensity)
        scan_list_ms2.append(precursor_id)

        if callback:
            callback(n_done / n_precursors)

    check_sanity(mass_list_ms2)

    query_data = {
        'prec_mass_list2': prec_data['Mass'].values,
        'prec_id': prec_data['Id'].values,
        'mono_mzs2': prec_data['MonoisotopicMz'].values,
        # convert to minutes
        'rt_list_ms2': frame_data.loc[prec_data['Parent'].values]['Time'].values / 60,
        'scan_list_ms2': prec_data['Parent'].values,
        'charge2': prec_data['Charge'].values,
        # check if its okay to always use first frame
        'mobility': tims_reader.scanNumToOneOverK0(1, prec_data['ScanNumber'].to_list()),
        "mass_list_ms2": mass_list_ms2,
        "int_list_ms2": int_list_ms2,
    }

    return query_data

def one_over_k0_to_CCS(one_over_k0s, charges, mzs):
    """
    Convert 1/K0 values to CCS values.

    Every (1/K0, charge, m/z) triple is handed to `timsdata.oneOverK0ToCCSforMz`;
    the charge is cast to int first. Entries for which the conversion raises a
    ValueError are left as NaN.

    Returns a float array with one entry per element of `one_over_k0s`.
    """
    from alphapept.ext.bruker import timsdata

    # Start from all-NaN so that failed conversions stay marked as missing.
    ccs_values = np.full(len(one_over_k0s), np.nan)

    triples = zip(one_over_k0s, charges, mzs)
    for position, (inverse_k0, charge, mz) in enumerate(triples):
        try:
            ccs_values[position] = timsdata.oneOverK0ToCCSforMz(inverse_k0, int(charge), mz)
        except ValueError:
            # Best effort: keep the NaN placeholder for this entry.
            continue
    return ccs_values


In [None]:
#hide
#Test if timsdata can be called
def test_one_over_k0_to_CCS():
    """Smoke test: the CCS conversion can be called with a single unit-valued entry."""
    one_over_k0s, charges, mzs = [1], [1], [1]
    one_over_k0_to_CCS(one_over_k0s, charges, mzs)

## MZML 

To access mzML and mzXML files, we rely on the `pyteomics` package.

In [None]:
#export

def check_sanity(mass_list):
    """
    Sanity check for a mass list to make sure the masses are sorted.

    Only the first spectrum (`mass_list[0]`) is inspected; its masses have to
    be in non-decreasing order. An empty `mass_list` has nothing to check and
    passes silently.

    Args:
        mass_list: Sequence of spectra, each a sequence of masses.

    Raises:
        ValueError: If the masses of the first spectrum are not sorted.
    """
    # Without spectra there is no first entry; indexing [0] would raise IndexError.
    if len(mass_list) == 0:
        return

    first_spectrum = mass_list[0]
    is_sorted = all(
        left <= right for left, right in zip(first_spectrum[:-1], first_spectrum[1:])
    )
    if not is_sorted:
        raise ValueError("Masses are not sorted.")
        
        
def extract_mzml_info(input_dict):
    """
    Extract the fields of interest from one mzML spectrum dictionary.

    Returns the tuple `(rt, masses, intensities, ms_order, prec_mass)`.
    `prec_mass` is 0 unless the spectrum has MS level 2, in which case it is
    `calculate_mass` of the first selected ion's m/z (rounded to four decimals)
    and charge state.
    """
    first_scan = input_dict.get('scanList').get('scan')[0]
    rt = float(first_scan.get('scan start time'))  # rt_list_ms1/2
    masses = input_dict.get('m/z array')
    intensities = input_dict.get('intensity array')
    ms_order = input_dict.get('ms level')  # ms_list_ms1/2

    if ms_order != 2:
        return rt, masses, intensities, ms_order, 0

    # Precursor information is only read for MS2 spectra.
    first_precursor = input_dict.get('precursorList').get('precursor')[0]
    selected_ion = first_precursor.get('selectedIonList').get('selectedIon')[0]
    charge = int(selected_ion.get('charge state'))
    mono_mz = round(selected_ion.get('selected ion m/z'), 4)
    return rt, masses, intensities, ms_order, calculate_mass(mono_mz, charge)


def extract_mzxml_info(input_dict):
    """
    Extract the fields of interest from one mzXML spectrum dictionary.

    Returns the tuple `(rt, masses, intensities, ms_order, prec_mass)`.
    `prec_mass` is 0 unless the spectrum has MS level 2, in which case it is
    `calculate_mass` of the first precursor's m/z (rounded to four decimals)
    and charge.
    """
    rt = float(input_dict.get('retentionTime'))
    masses = input_dict.get('m/z array')
    intensities = input_dict.get('intensity array')
    ms_order = input_dict.get('msLevel')  # ms_list_ms1/2

    if ms_order != 2:
        return rt, masses, intensities, ms_order, 0

    # Precursor information is only read for MS2 spectra.
    first_precursor = input_dict.get('precursorMz')[0]
    charge = int(first_precursor.get('precursorCharge'))
    mono_mz = round(first_precursor.get('precursorMz'), 4)
    return rt, masses, intensities, ms_order, calculate_mass(mono_mz, charge)


def _mzml_precursor_info(spec):
    """
    Return `(mono_mz, charge)` of the first selected ion of an MS2 mzML spectrum.

    The m/z is rounded to four decimals, the same rounding `extract_mzml_info`
    applies before computing the precursor mass.
    """
    first_precursor = spec.get('precursorList').get('precursor')[0]
    selected_ion = first_precursor.get('selectedIonList').get('selectedIon')[0]
    return round(selected_ion.get('selected ion m/z'), 4), int(selected_ion.get('charge state'))


def read_mzML(filename, most_abundant):
    """
    Read spectral data from an mzML file and return the MS1 and MS2 data separately.

    Args:
        filename: Path to the mzML file; a path ending in '.gz' is opened through gzip.
        most_abundant: Forwarded to `get_most_abundant` to trim the peaks of every MS2 spectrum.

    Returns:
        dict: `query_data` with the keys `scan_list_ms1`, `rt_list_ms1`, `mass_list_ms1`,
        `int_list_ms1`, `ms_list_ms1` for the MS1 spectra and `scan_list_ms2`, `rt_list_ms2`,
        `mass_list_ms2`, `int_list_ms2`, `ms_list_ms2`, `prec_mass_list2`, `mono_mzs2`,
        `charge2` for the MS2 spectra. Scan numbers are the 1-based positions of the
        spectra in the file.

    Note:
        If the file cannot be opened (OSError), a message is logged and the process
        exits via `sys.exit(1)`.
    """
    from pyteomics import mzml

    # NOTE(review): `logging` is called like a function below; if it is the stdlib
    # module this has to become `logging.info(...)` - confirm against the file's imports.
    try:
        if os.path.splitext(filename)[1] == '.gz':
            reader = mzml.read(gzip.open(filename), use_index=True)
        else:
            reader = mzml.read(filename, use_index=True)
        spec_indices = np.array(range(1, len(reader) + 1))

    except OSError:
        logging('Could not open the file. Please, specify the correct path to the file.')
        sys.exit(1)

    scan_list = []
    rt_list = []
    mass_list = []
    int_list = []
    ms_list = []
    prec_mzs_list = []
    # Only filled for MS2 spectra, so they already line up with the *_ms2 lists.
    mono_mzs2 = []
    charge2 = []

    logging('Start reading mzML file...')
    if reader:
        for i in tqdm(spec_indices):
            spec = next(reader)
            scan_list.append(i)
            rt, masses, intensities, ms_order, prec_mass = extract_mzml_info(spec)
            if ms_order == 2:
                masses, intensities = get_most_abundant(masses, intensities, most_abundant)
                mono_mz, charge = _mzml_precursor_info(spec)
                mono_mzs2.append(mono_mz)
                charge2.append(charge)
            rt_list.append(rt)
            mass_list.append(masses)
            int_list.append(intensities)
            ms_list.append(ms_order)
            prec_mzs_list.append(prec_mass)

    # A file without spectra has nothing to check.
    if mass_list:
        check_sanity(mass_list)

    def of_level(values, level):
        """Keep the entries of `values` whose spectrum has the given MS level."""
        return [value for value, ms_level in zip(values, ms_list) if ms_level == level]

    query_data = {}

    # NOTE(review): spectra usually differ in length, so `mass_list_ms1`/`int_list_ms1`
    # become ragged arrays; recent NumPy versions require dtype=object for that - confirm.
    query_data["scan_list_ms1"] = np.array(of_level(scan_list, 1))
    query_data["rt_list_ms1"] = np.array(of_level(rt_list, 1))
    query_data["mass_list_ms1"] = np.array(of_level(mass_list, 1))
    query_data["int_list_ms1"] = np.array(of_level(int_list, 1))
    query_data["ms_list_ms1"] = np.array(of_level(ms_list, 1))

    query_data["scan_list_ms2"] = np.array(of_level(scan_list, 2))
    query_data["rt_list_ms2"] = np.array(of_level(rt_list, 2))
    query_data["mass_list_ms2"] = of_level(mass_list, 2)
    query_data["int_list_ms2"] = of_level(int_list, 2)
    query_data["ms_list_ms2"] = np.array(of_level(ms_list, 2))
    query_data["prec_mass_list2"] = np.array(of_level(prec_mzs_list, 2))
    query_data["mono_mzs2"] = np.array(mono_mzs2)
    query_data["charge2"] = np.array(charge2)

    return query_data


def _mzxml_precursor_info(spec):
    """
    Return `(mono_mz, charge)` of the first precursor of an MS2 mzXML spectrum.

    The m/z is rounded to four decimals, the same rounding `extract_mzxml_info`
    applies before computing the precursor mass.
    """
    first_precursor = spec.get('precursorMz')[0]
    return round(first_precursor.get('precursorMz'), 4), int(first_precursor.get('precursorCharge'))


def read_mzXML(filename, most_abundant):
    """
    Read spectral data from an mzXML file and return the MS1 and MS2 data separately.

    Args:
        filename: Path to the mzXML file; a path ending in '.gz' is opened through gzip.
        most_abundant: Forwarded to `get_most_abundant` to trim the peaks of every MS2 spectrum.

    Returns:
        dict: `query_data` with the keys `scan_list_ms1`, `rt_list_ms1`, `mass_list_ms1`,
        `int_list_ms1`, `ms_list_ms1` for the MS1 spectra and `scan_list_ms2`, `rt_list_ms2`,
        `mass_list_ms2`, `int_list_ms2`, `ms_list_ms2`, `prec_mass_list2`, `mono_mzs2`,
        `charge2` for the MS2 spectra. Scan numbers are the 1-based positions of the
        spectra in the file.

    Note:
        If the file cannot be opened (OSError), a message is printed and the process
        exits via `sys.exit(1)`.
    """
    # Imported locally so the reader does not depend on another function's import.
    from pyteomics import mzxml

    try:
        if os.path.splitext(filename)[1] == '.gz':
            reader = mzxml.read(gzip.open(filename), use_index=True)
        else:
            reader = mzxml.read(filename, use_index=True)
        spec_indices = np.array(range(1, len(reader) + 1))

    except OSError:
        print('Could not open the file. Please, specify the correct path to the file.')
        sys.exit(1)

    scan_list = []
    rt_list = []
    mass_list = []
    int_list = []
    ms_list = []
    prec_mzs_list = []
    # Only filled for MS2 spectra, so they already line up with the *_ms2 lists.
    mono_mzs2 = []
    charge2 = []

    print('Start reading mzXML file...')
    if reader:
        for i in tqdm(spec_indices):
            spec = next(reader)
            scan_list.append(i)
            rt, masses, intensities, ms_order, prec_mass = extract_mzxml_info(spec)
            if ms_order == 2:
                masses, intensities = get_most_abundant(masses, intensities, most_abundant)
                mono_mz, charge = _mzxml_precursor_info(spec)
                mono_mzs2.append(mono_mz)
                charge2.append(charge)
            rt_list.append(rt)
            mass_list.append(masses)
            int_list.append(intensities)
            ms_list.append(ms_order)
            prec_mzs_list.append(prec_mass)

    # A file without spectra has nothing to check.
    if mass_list:
        check_sanity(mass_list)

    def of_level(values, level):
        """Keep the entries of `values` whose spectrum has the given MS level."""
        return [value for value, ms_level in zip(values, ms_list) if ms_level == level]

    query_data = {}

    # NOTE(review): spectra usually differ in length, so `mass_list_ms1`/`int_list_ms1`
    # become ragged arrays; recent NumPy versions require dtype=object for that - confirm.
    query_data["scan_list_ms1"] = np.array(of_level(scan_list, 1))
    query_data["rt_list_ms1"] = np.array(of_level(rt_list, 1))
    query_data["mass_list_ms1"] = np.array(of_level(mass_list, 1))
    query_data["int_list_ms1"] = np.array(of_level(int_list, 1))
    query_data["ms_list_ms1"] = np.array(of_level(ms_list, 1))

    query_data["scan_list_ms2"] = np.array(of_level(scan_list, 2))
    query_data["rt_list_ms2"] = np.array(of_level(rt_list, 2))
    query_data["mass_list_ms2"] = of_level(mass_list, 2)
    query_data["int_list_ms2"] = of_level(int_list, 2)
    query_data["ms_list_ms2"] = np.array(of_level(ms_list, 2))
    query_data["prec_mass_list2"] = np.array(of_level(prec_mzs_list, 2))
    query_data["mono_mzs2"] = np.array(mono_mzs2)
    query_data["charge2"] = np.array(charge2)

    return query_data

## Saving

For saving, we are currently relying on the NumPy-native npz-container. It offers reasonable speed, dictionary-type access, and does not need individual type definitions.

While we could, in principle, store the mz and int arrays as lists of variable length, this would come at the cost of performance. We, therefore, use the function `list_to_numpy_f32` to create an array whose dimensions are the number of the n most abundant peaks and the number of spectra, and fill the unoccupied cells with `-1`. This allows faster access at the cost of additional disk space.

Implementation Note: For large files (e.g., when choosing a large number of peaks that should be kept), saving the npz array can fail and trigger a ZIP64 error. This is supposed to be fixed in a later NumPy version.

In [None]:
#export
def list_to_numpy_f32(long_list):
    """
    Convert a list of variable-length sequences to a padded float32 array.

    Column `i` of the result holds `long_list[i]`; cells below the end of a
    sequence are filled with -1.

    Args:
        long_list: List of sequences of numbers.

    Returns:
        np.ndarray: float32 array of shape (longest sequence, number of sequences).
        An empty `long_list` yields an empty array of shape (0, 0).
    """
    # max() of an empty sequence raises ValueError, so handle "no entries" explicitly.
    if len(long_list) == 0:
        return np.zeros((0, 0), dtype=np.float32)

    n_rows = max(len(entry) for entry in long_list)
    np_array = np.full((n_rows, len(long_list)), -1, dtype=np.float32)

    for column, entry in enumerate(long_list):
        np_array[:len(entry), column] = entry

    return np_array

        
def save_query_as_npz(raw_file_npz, query_data):
    """
    Write `query_data` to an npz container and return `raw_file_npz`.

    The entries `mass_list_ms2` and `int_list_ms2` are converted with
    `list_to_numpy_f32`; every other entry is stored unchanged. An additional
    `bounds` entry holds, per column of the converted `mass_list_ms2`, the
    number of cells that are >= 0.
    """
    padded_keys = ['mass_list_ms2', 'int_list_ms2']

    to_save = {
        name: list_to_numpy_f32(values) if name in padded_keys else values
        for name, values in query_data.items()
    }

    occupied = to_save['mass_list_ms2'] >= 0
    to_save["bounds"] = np.sum(occupied, axis=0).astype(np.int64)

    np.savez(raw_file_npz, **to_save)

    return raw_file_npz

## Parsing other Files

Benchmarking proteomics software against each other is not straightforward as various naming conventions exist, and different algorithms are implemented. In this section, we define some helper functions that allow us to facilitate the comparison of different tools.

### Reading MaxQuant xml settings file

In [None]:
#export
import xml.etree.ElementTree as ET

def extract_nested(child):
    """
    Recursively turn an XML element into plain Python values.

    An element without children yields its text, with the literal strings
    'True' and 'False' mapped to booleans. An element with children yields a
    dictionary from child tag to the extracted child.
    """
    if len(child) == 0:
        # Leaf element: only the exact strings 'True'/'False' become booleans.
        boolean_literals = {'True': True, 'False': False}
        return boolean_literals.get(child.text, child.text)

    return {sub_element.tag: extract_nested(sub_element) for sub_element in child}

def extract_mq_settings(path):
    """
    Parse a MaxQuant xml settings file into a dictionary.

    Every child of the xml root becomes one entry, keyed by its tag and
    converted with `extract_nested`.

    Raises a ValueError if `path` does not end with '.xml'.
    """
    if not path.endswith('.xml'):
        raise ValueError("Path {} is not a valid xml file.".format(path))

    root = ET.parse(path).getroot()

    return {entry.tag: extract_nested(entry) for entry in root}

In [None]:
# Example: parse the MaxQuant test settings file and display its 'fastaFiles' entry.
mq_dict = extract_mq_settings('../testfiles/test_mqpar.xml')
mq_dict['fastaFiles']

{'FastaFileInfo': {'fastaFilePath': 'testfile.fasta',
  'identifierParseRule': '>([^\\s]*)',
  'descriptionParseRule': '>(.*)',
  'taxonomyParseRule': None,
  'variationParseRule': None,
  'modificationParseRule': None,
  'taxonomyId': None}}

In [None]:
#export
def parse_mq_seq(peptide):
    """
    Translate a peptide sequence from MaxQuant notation to alphapept notation.

    The first and last character (the enclosing underscores) are dropped, then
    the known modification tokens are replaced in a fixed order.
    ToDo: include more sequences
    """
    # Applied in order; 'C' -> 'cC' comes last. The cysteine modification is
    # fixed and not indicated in MaxQuant.
    replacements = (
        ('(Acetyl (Protein N-term))', 'a'),
        ('M(Oxidation (M))', 'oxM'),
        ('C', 'cC'),
    )

    converted = peptide[1:-1]  # Remove _
    for mq_token, alphapept_token in replacements:
        converted = converted.replace(mq_token, alphapept_token)

    return converted

In [None]:
# Example: the MaxQuant oxidation annotation is rewritten to the 'oxM' notation.
parse_mq_seq('_AFQPFFVELTM(Oxidation (M))PYSVIR_')

'AFQPFFVELToxMPYSVIR'

In [None]:
#hide
# Run the nbdev notebook export; it prints one "Converted ..." line per notebook.
from nbdev.export import *
notebook2script()

Converted 00_settings.ipynb.
Converted 01_chem.ipynb.
Converted 02_io.ipynb.
Converted 03_fasta.ipynb.
Converted 04_feature_finding.ipynb.
Converted 05_search.ipynb.
Converted 06_score.ipynb.
Converted 07_recalibration.ipynb.
Converted 08_quantification.ipynb.
Converted 09_matching.ipynb.
Converted 10_constants.ipynb.
Converted index.ipynb.
Converted io_utils.ipynb.
