In [None]:
# default_exp io

# Input / Output

> Functions related to input and output

This notebook contains all functions related to importing and exporting files. Raw data is currently stored in the NumPy-native `*.npz` container, which allows dictionary-type access and provides reasonable access speeds.

To access proprietary data formats, we have import functions to access `Bruker` and `Thermo` data.

In [None]:
#hide
from nbdev.showdoc import *

## Conversion functions

`get_most_abundant`: In order to save spectra in a more memory-efficient form, we only keep the n most abundant peaks. This allows us to save the data in a matrix format that can be accessed quickly.

In [None]:
#export
from alphapept.chem import calculate_mass

In [None]:
#export
from tqdm import tqdm
import numpy as np
from numba.typed import List
from numba import njit
from pyteomics import mzml, mzxml
import gzip
import sys
import os
import logging


def get_most_abundant(mass, intensity, n_max):
    """
    Reduce a spectrum to its n_max most intense peaks.

    Spectra holding fewer than n_max peaks are handed back unchanged.
    Otherwise the peaks are ranked by intensity, the n_max strongest are
    selected, and they are returned in their original index order.
    """
    if len(mass) >= n_max:
        # Rank all peaks from most to least intense, keep the leading n_max
        # and restore ascending index order so the peak order is preserved.
        ranked = np.argsort(intensity)[::-1]
        keep = np.sort(ranked[:n_max])
        return mass[keep], intensity[keep]

    return mass, intensity

## Reading Thermo Files

> This implementation is based on `pymsfilereader`. It requires that MSFileReader from Thermo is installed.

> The current implementation uses a lot of lists and fills them with list comprehensions. This creates a lot of variables but seems to work reasonably fast. This code could be refactored as all variables end up in a dictionary-type container anyhow.

In [None]:
#export
def load_thermo_raw(raw_file, most_abundant, callback=None, **kwargs):
    """
    Load thermo raw file and extract spectra

    Args:
        raw_file: Path of the Thermo raw file; passed to pymsfilereader's MSFileReader.
        most_abundant: Maximum number of peaks kept per MS2 spectrum (see get_most_abundant).
        callback: Optional callable; after each scan it receives the fraction of
            scans processed so far.
        **kwargs: Accepted and ignored, so a larger settings dict can be unpacked
            into the call.

    Returns:
        dict: query data. MS1 entries are "scan_list_ms1", "rt_list_ms1",
        "mass_list_ms1", "int_list_ms1" and "ms_list_ms1". MS2 entries are
        "scan_list_ms2", "rt_list_ms2", "mass_list_ms2", "int_list_ms2",
        "ms_list_ms2", "prec_mass_list2", "mono_mzs2" and "charge2".
        "mass_list_ms2" and "int_list_ms2" stay plain lists of arrays; all other
        entries are converted with np.array.

    Raises:
        ValueError: From check_sanity if the masses of the first spectrum are not sorted.
    """

    from pymsfilereader import MSFileReader
    rawfile = MSFileReader(raw_file)

    scan_list = []
    rt_list = []
    mass_list = []
    int_list = []
    ms_list = []
    mono_mzs_list = []
    charge_list = []

    try:
        spec_indices = np.array(
            range(rawfile.FirstSpectrumNumber, rawfile.LastSpectrumNumber + 1)
        )
        n_spectra = len(spec_indices)

        for idx, i in enumerate(spec_indices):
            ms_order = rawfile.GetMSOrderForScanNum(i)
            rt = rawfile.RTFromScanNum(i)

            trailer_extra = rawfile.GetTrailerExtraForScanNum(i)
            mono_mz = trailer_extra["Monoisotopic M/Z"]
            charge = trailer_extra["Charge State"]

            label_data = rawfile.GetLabelData(i)

            # If labeled data is not available, fall back to the plain mass list.
            # Todo: check for centroided or not
            if label_data[0][0] == ():
                mlist = rawfile.GetMassListFromScanNum(i)
                masses = np.array(mlist[0][0])
                intensity = np.array(mlist[0][1])
            else:
                intensity = np.array(label_data[0][1])
                masses = np.array(label_data[0][0])

            # Only MS2 spectra are truncated to the most abundant peaks.
            if ms_order == 2:
                masses, intensity = get_most_abundant(masses, intensity, most_abundant)

            scan_list.append(i)
            rt_list.append(rt)
            mass_list.append(np.array(masses))
            int_list.append(np.array(intensity, dtype=np.int64))
            ms_list.append(ms_order)
            mono_mzs_list.append(mono_mz)
            charge_list.append(charge)

            if callback:
                callback((idx + 1) / n_spectra)
    finally:
        # Release the raw-file handle even when extraction fails half-way.
        rawfile.Close()

    def _select(values, level):
        """Return the entries of values that belong to scans of the given MS level."""
        return [value for value, order in zip(values, ms_list) if order == level]

    mono_mzs2 = _select(mono_mzs_list, 2)
    charge2 = _select(charge_list, 2)
    prec_mass_list2 = [calculate_mass(mz, z) for mz, z in zip(mono_mzs2, charge2)]

    # check_sanity indexes the first spectrum, so skip it for files without scans.
    if mass_list:
        check_sanity(mass_list)

    # NOTE(review): np.array over spectra of differing length builds a ragged
    # array; confirm this is supported by the NumPy version in use.
    query_data = {}

    query_data["scan_list_ms1"] = np.array(_select(scan_list, 1))
    query_data["rt_list_ms1"] = np.array(_select(rt_list, 1))
    query_data["mass_list_ms1"] = np.array(_select(mass_list, 1))
    query_data["int_list_ms1"] = np.array(_select(int_list, 1))
    query_data["ms_list_ms1"] = np.array(_select(ms_list, 1))

    query_data["scan_list_ms2"] = np.array(_select(scan_list, 2))
    query_data["rt_list_ms2"] = np.array(_select(rt_list, 2))
    query_data["mass_list_ms2"] = _select(mass_list, 2)
    query_data["int_list_ms2"] = _select(int_list, 2)
    query_data["ms_list_ms2"] = np.array(_select(ms_list, 2))
    query_data["prec_mass_list2"] = np.array(prec_mass_list2)
    query_data["mono_mzs2"] = np.array(mono_mzs2)
    query_data["charge2"] = np.array(charge2)

    return query_data


## Wrapper

We use `multiprocessing.Pool` to convert multiple raw files to npz in parallel.

In [None]:
#export

def raw_to_npz(to_process, callback = None):
    """
    Convert one vendor raw file into an npz container.

    to_process is a (path, settings) tuple. The extension of path selects the
    loader ('.raw' for Thermo, '.d' for Bruker); settings['raw'] is unpacked
    into that loader as keyword arguments. The result is written next to the
    input, with the extension swapped for '.npz'.

    Raises NotImplementedError for any other file extension.
    """

    path, settings = to_process
    base, ext = os.path.splitext(path)
    extension = ext.lower()

    # Reject unsupported formats before any conversion work starts.
    if extension not in ('.raw', '.d'):
        raise NotImplementedError('File extension {} not understood.'.format(ext))

    if extension == '.raw':
        logging.info('File {} has extension {} - converting from Thermo.'.format(base, ext))
        query_data = load_thermo_raw(path, callback=callback, **settings['raw'])
    else:
        logging.info('File {} has extension {} - converting from Bruker.'.format(base, ext))
        query_data = load_bruker_raw(path, callback=callback, **settings['raw'])

    n_precursors = len(query_data['prec_mass_list2'])
    logging.info('File conversion complete. Extracted {:,} precursors.'.format(n_precursors))

    save_path = base + ".npz"
    save_query_as_npz(save_path, query_data)
    logging.info('Converted file saved to {}'.format(save_path))
    

from multiprocessing import Pool

def raw_to_npz_parallel(path_list, settings, callback=None):
    """
    Convert several raw files to npz, using a process pool when there is more than one.

    A single file is converted in the current process and callback is passed
    on to raw_to_npz. For any other number of files, a Pool with
    settings['general']['n_processes'] workers is used and callback, if given,
    receives the fraction of files finished so far.
    """
    n_processes = settings['general']['n_processes']

    jobs = [(path, settings) for path in path_list]
    n_jobs = len(jobs)

    if n_jobs == 1:
        raw_to_npz(jobs[0], callback=callback)
        return

    with Pool(n_processes) as pool:
        finished = 0
        # Results arrive in completion order; only the count matters here.
        for _ in pool.imap_unordered(raw_to_npz, jobs):
            finished += 1
            if callback:
                callback(finished / n_jobs)

## Bruker

For accessing Bruker files, we rely on the external `timsdata` library. 
For `ccs` values, we need some functions from this library. As the live feature-finder might not be able to determine some charge values, it is intended to perform this calculation at a later stage once we have charge values from the post-processing feature finder. 

In [None]:
#export
def load_bruker_raw(raw_file, most_abundant, callback=None, **kwargs):
    """
    Load bruker raw file and extract spectra

    Reads the 'Precursors' and 'Frames' tables from the analysis.tdf file inside
    raw_file, derives a mass for every precursor, and reads the MS2 peaks of
    each precursor through timsdata.

    Args:
        raw_file: Path of the Bruker folder; must contain 'analysis.tdf'.
        most_abundant: Maximum number of peaks kept per MS2 spectrum (see get_most_abundant).
        callback: Optional callable; after each precursor it receives the
            fraction of precursors processed so far.
        **kwargs: Accepted and not used in this function.

    Returns:
        dict with the keys 'prec_mass_list2', 'prec_id', 'mono_mzs2',
        'rt_list_ms2', 'scan_list_ms2', 'charge2', 'mobility',
        'mass_list_ms2' and 'int_list_ms2'. All entries follow the
        mass-sorted precursor order.
    """
    import sqlalchemy as db
    import pandas as pd
    from ext.bruker import timsdata

    # `tdf` first holds the path of the sqlite metadata file ...
    tdf = os.path.join(raw_file, 'analysis.tdf')
    engine = db.create_engine('sqlite:///{}'.format(tdf))
    prec_data = pd.read_sql_table('Precursors', engine)
    frame_data = pd.read_sql_table('Frames', engine)
    # Index frames by their Id so they can be looked up with .loc below.
    frame_data = frame_data.set_index('Id')
    
    from alphapept.constants import mass_dict

    # ... and is rebound here to the TimsData reader object.
    tdf = timsdata.TimsData(raw_file)

    M_PROTON = mass_dict['Proton']

    # Mass = m/z * charge - charge * proton mass
    prec_data['Mass'] = prec_data['MonoisotopicMz'].values * prec_data['Charge'].values - prec_data['Charge'].values*M_PROTON

    # NOTE(review): list_to_numpy_f32 is imported but not used in this function.
    from alphapept.io import list_to_numpy_f32, get_most_abundant

    mass_list_ms2 = []
    int_list_ms2 = []
    scan_list_ms2 = []
    
    # Sort before extracting the ids so the spectra read below line up with the
    # sorted prec_data columns placed into query_data.
    prec_data = prec_data.sort_values(by='Mass', ascending=True)
    
    precursor_ids = prec_data['Id'].tolist()

    for idx, key in enumerate(precursor_ids):

        # One precursor per call; the result is indexed by the precursor id.
        ms2_data = tdf.readPasefMsMs([key])
        masses, intensity = ms2_data[key]

        masses, intensity = get_most_abundant(np.array(masses), np.array(intensity), most_abundant)

        mass_list_ms2.append(masses)
        int_list_ms2.append(intensity)
        # NOTE(review): this local list is filled but never returned; the
        # 'scan_list_ms2' entry below is taken from the 'Parent' column instead.
        scan_list_ms2.append(key)
        
        if callback:
            callback((idx+1)/len(precursor_ids))
            

    # Raises ValueError if the masses of the first spectrum are not sorted.
    check_sanity(mass_list_ms2)
                               
    query_data = {}

    query_data['prec_mass_list2'] = prec_data['Mass'].values
    query_data['prec_id'] = prec_data['Id'].values
    query_data['mono_mzs2'] = prec_data['MonoisotopicMz'].values
    query_data['rt_list_ms2'] = frame_data.loc[prec_data['Parent'].values]['Time'].values / 60 #convert to minutes
    query_data['scan_list_ms2'] = prec_data['Parent'].values
    query_data['charge2'] = prec_data['Charge'].values
    query_data['mobility'] = tdf.scanNumToOneOverK0(1, prec_data['ScanNumber'].to_list()) #check if its okay to always use first frame
    query_data["mass_list_ms2"] = mass_list_ms2
    query_data["int_list_ms2"] = int_list_ms2
    
    
    return query_data

def one_over_k0_to_CCS(one_over_k0s, charges, mzs):
    """
    Convert 1/K0 values to CCS, element by element.

    The three inputs are iterated in parallel. Entries for which the
    conversion raises a ValueError stay NaN in the returned array.
    """
    from ext.bruker import timsdata

    # Start from all-NaN so failed conversions remain marked as missing.
    ccs = np.full(len(one_over_k0s), np.nan)

    for position, values in enumerate(zip(one_over_k0s, charges, mzs)):
        one_over, charge, mz = values
        try:
            ccs[position] = timsdata.oneOverK0ToCCSforMz(one_over, int(charge), mz)
        except ValueError:
            continue

    return ccs

## MZML 

To access mzml files, we rely on the pyteomics package.

In [None]:
#export

def check_sanity(mass_list):
    """
    Sanity check for mass list to make sure the masses are sorted

    Only the first spectrum (mass_list[0]) is inspected; it serves as a
    sample for the whole run. An empty mass_list passes the check.

    Args:
        mass_list: Sequence of spectra, each a sequence of masses.

    Raises:
        ValueError: If the masses of the first spectrum are not in ascending order.
    """

    # Nothing to verify for a run without spectra; indexing [0] would fail.
    if len(mass_list) == 0:
        return

    first_spectrum = mass_list[0]

    if not all(
        first_spectrum[i] <= first_spectrum[i + 1]
        for i in range(len(first_spectrum) - 1)
    ):
        raise ValueError("Masses are not sorted.")
        
        
def extract_mzml_info(input_dict):
    """
    Pull retention time, peak arrays, MS level and precursor mass out of one
    mzML spectrum dictionary.

    For MS2 spectra the precursor mass is computed with calculate_mass from the
    first selected ion's m/z (rounded to 4 decimals) and charge state; for
    every other MS level it is 0.

    Returns a tuple (rt, masses, intensities, ms_order, prec_mass).
    """
    first_scan = input_dict.get('scanList').get('scan')[0]
    rt = float(first_scan.get('scan start time'))  # rt_list_ms1/2
    ms_order = input_dict.get('ms level')  # ms_list_ms1/2

    if ms_order == 2:
        first_precursor = input_dict.get('precursorList').get('precursor')[0]
        selected_ion = first_precursor.get('selectedIonList').get('selectedIon')[0]
        charge = int(selected_ion.get('charge state'))
        mono_mz = round(selected_ion.get('selected ion m/z'), 4)
        prec_mass = calculate_mass(mono_mz, charge)
    else:
        prec_mass = 0

    return (
        rt,
        input_dict.get('m/z array'),
        input_dict.get('intensity array'),
        ms_order,
        prec_mass,
    )


def extract_mzxml_info(input_dict):
    """
    Pull retention time, peak arrays, MS level and precursor mass out of one
    mzXML scan dictionary.

    For MS2 scans the precursor mass is computed with calculate_mass from the
    first 'precursorMz' entry (m/z rounded to 4 decimals, 'precursorCharge' as
    charge); for every other MS level it is 0.

    Returns a tuple (rt, masses, intensities, ms_order, prec_mass).
    """
    rt = float(input_dict.get('retentionTime'))
    ms_order = input_dict.get('msLevel')  # ms_list_ms1/2

    if ms_order == 2:
        first_precursor = input_dict.get('precursorMz')[0]
        charge = int(first_precursor.get('precursorCharge'))
        mono_mz = round(first_precursor.get('precursorMz'), 4)
        prec_mass = calculate_mass(mono_mz, charge)
    else:
        prec_mass = 0

    return (
        rt,
        input_dict.get('m/z array'),
        input_dict.get('intensity array'),
        ms_order,
        prec_mass,
    )


def read_mzML(filename, most_abundant):
    """
    Read spectral data from an mzML file and return various lists separately for ms1 and ms2 data.

    Args:
        filename: Path of the mzML file; a '.gz' extension is opened through gzip.
        most_abundant: Maximum number of peaks kept per MS2 spectrum (see get_most_abundant).

    Returns:
        dict: query data. MS1 entries are "scan_list_ms1", "rt_list_ms1",
        "mass_list_ms1", "int_list_ms1" and "ms_list_ms1". MS2 entries are
        "scan_list_ms2", "rt_list_ms2", "mass_list_ms2", "int_list_ms2",
        "ms_list_ms2", "prec_mass_list2", "mono_mzs2" and "charge2".

    Raises:
        SystemExit: If the file cannot be opened (OSError); the problem is logged first.
        ValueError: From check_sanity if the masses of the first spectrum are not sorted.
    """

    def _precursor_info(spec):
        """Return (mono_mz, charge) of the first selected ion, read the same way as extract_mzml_info."""
        first_precursor = spec.get('precursorList').get('precursor')[0]
        selected_ion = first_precursor.get('selectedIonList').get('selectedIon')[0]
        return round(selected_ion.get('selected ion m/z'), 4), int(selected_ion.get('charge state'))

    gz_handle = None
    try:
        if os.path.splitext(filename)[1] == '.gz':
            gz_handle = gzip.open(filename)
            reader = mzml.read(gz_handle, use_index=True)
        else:
            reader = mzml.read(filename, use_index=True)
        spec_indices = np.array(range(1, len(reader) + 1))

    except OSError:
        if gz_handle is not None:
            gz_handle.close()
        logging.error('Could not open the file. Please, specify the correct path to the file.')
        sys.exit(1)

    scan_list = []
    rt_list = []
    mass_list = []
    int_list = []
    ms_list = []
    prec_mzs_list = []
    mono_mzs2 = []
    charge2 = []

    logging.info('Start reading mzML file...')
    try:
        # The reader is used as a context manager so its source is released afterwards.
        with reader:
            if reader:
                for i in tqdm(spec_indices):
                    spec = next(reader)
                    scan_list.append(i)
                    rt, masses, intensities, ms_order, prec_mass = extract_mzml_info(spec)
                    if ms_order == 2:
                        masses, intensities = get_most_abundant(masses, intensities, most_abundant)
                        mono_mz, charge = _precursor_info(spec)
                        mono_mzs2.append(mono_mz)
                        charge2.append(charge)
                    rt_list.append(rt)
                    mass_list.append(masses)
                    int_list.append(intensities)
                    ms_list.append(ms_order)
                    prec_mzs_list.append(prec_mass)
    finally:
        # The gzip handle was opened here, so it is closed here as well.
        if gz_handle is not None:
            gz_handle.close()

    # check_sanity indexes the first spectrum, so skip it for files without spectra.
    if mass_list:
        check_sanity(mass_list)

    def _select(values, level):
        """Return the entries of values that belong to spectra of the given MS level."""
        return [value for value, order in zip(values, ms_list) if order == level]

    query_data = {}

    query_data["scan_list_ms1"] = np.array(_select(scan_list, 1))
    query_data["rt_list_ms1"] = np.array(_select(rt_list, 1))
    query_data["mass_list_ms1"] = np.array(_select(mass_list, 1))
    query_data["int_list_ms1"] = np.array(_select(int_list, 1))
    query_data["ms_list_ms1"] = np.array(_select(ms_list, 1))

    query_data["scan_list_ms2"] = np.array(_select(scan_list, 2))
    query_data["rt_list_ms2"] = np.array(_select(rt_list, 2))
    query_data["mass_list_ms2"] = _select(mass_list, 2)
    query_data["int_list_ms2"] = _select(int_list, 2)
    query_data["ms_list_ms2"] = np.array(_select(ms_list, 2))
    query_data["prec_mass_list2"] = np.array(_select(prec_mzs_list, 2))
    query_data["mono_mzs2"] = np.array(mono_mzs2)
    query_data["charge2"] = np.array(charge2)

    return query_data


def read_mzXML(filename, most_abundant):
    """
    Read spectral data from an mzXML file and return various lists separately for ms1 and ms2 data.

    Args:
        filename: Path of the mzXML file; a '.gz' extension is opened through gzip.
        most_abundant: Maximum number of peaks kept per MS2 spectrum (see get_most_abundant).

    Returns:
        dict: query data. MS1 entries are "scan_list_ms1", "rt_list_ms1",
        "mass_list_ms1", "int_list_ms1" and "ms_list_ms1". MS2 entries are
        "scan_list_ms2", "rt_list_ms2", "mass_list_ms2", "int_list_ms2",
        "ms_list_ms2", "prec_mass_list2", "mono_mzs2" and "charge2".

    Raises:
        SystemExit: If the file cannot be opened (OSError); the problem is logged first.
        ValueError: From check_sanity if the masses of the first spectrum are not sorted.
    """

    def _precursor_info(spec):
        """Return (mono_mz, charge) of the first precursor, read the same way as extract_mzxml_info."""
        first_precursor = spec.get('precursorMz')[0]
        return round(first_precursor.get('precursorMz'), 4), int(first_precursor.get('precursorCharge'))

    gz_handle = None
    try:
        if os.path.splitext(filename)[1] == '.gz':
            gz_handle = gzip.open(filename)
            reader = mzxml.read(gz_handle, use_index=True)
        else:
            reader = mzxml.read(filename, use_index=True)
        spec_indices = np.array(range(1, len(reader) + 1))

    except OSError:
        if gz_handle is not None:
            gz_handle.close()
        logging.error('Could not open the file. Please, specify the correct path to the file.')
        sys.exit(1)

    scan_list = []
    rt_list = []
    mass_list = []
    int_list = []
    ms_list = []
    prec_mzs_list = []
    mono_mzs2 = []
    charge2 = []

    logging.info('Start reading mzXML file...')
    try:
        # The reader is used as a context manager so its source is released afterwards.
        with reader:
            if reader:
                for i in tqdm(spec_indices):
                    spec = next(reader)
                    scan_list.append(i)
                    rt, masses, intensities, ms_order, prec_mass = extract_mzxml_info(spec)
                    if ms_order == 2:
                        masses, intensities = get_most_abundant(masses, intensities, most_abundant)
                        mono_mz, charge = _precursor_info(spec)
                        mono_mzs2.append(mono_mz)
                        charge2.append(charge)
                    rt_list.append(rt)
                    mass_list.append(masses)
                    int_list.append(intensities)
                    ms_list.append(ms_order)
                    prec_mzs_list.append(prec_mass)
    finally:
        # The gzip handle was opened here, so it is closed here as well.
        if gz_handle is not None:
            gz_handle.close()

    # check_sanity indexes the first spectrum, so skip it for files without scans.
    if mass_list:
        check_sanity(mass_list)

    def _select(values, level):
        """Return the entries of values that belong to scans of the given MS level."""
        return [value for value, order in zip(values, ms_list) if order == level]

    query_data = {}

    query_data["scan_list_ms1"] = np.array(_select(scan_list, 1))
    query_data["rt_list_ms1"] = np.array(_select(rt_list, 1))
    query_data["mass_list_ms1"] = np.array(_select(mass_list, 1))
    query_data["int_list_ms1"] = np.array(_select(int_list, 1))
    query_data["ms_list_ms1"] = np.array(_select(ms_list, 1))

    query_data["scan_list_ms2"] = np.array(_select(scan_list, 2))
    query_data["rt_list_ms2"] = np.array(_select(rt_list, 2))
    query_data["mass_list_ms2"] = _select(mass_list, 2)
    query_data["int_list_ms2"] = _select(int_list, 2)
    query_data["ms_list_ms2"] = np.array(_select(ms_list, 2))
    query_data["prec_mass_list2"] = np.array(_select(prec_mzs_list, 2))
    query_data["mono_mzs2"] = np.array(mono_mzs2)
    query_data["charge2"] = np.array(charge2)

    return query_data

## Saving

For saving, we are currently relying on the NumPy-native npz-container. It offers reasonable speed, dictionary-type access, and does not need individual type definitions.

While we could, in principle, store the mz and int arrays as a list of variable length, this would come at a performance cost. We, therefore, create an array whose dimensions are the n most abundant peaks and the number of spectra with the function `list_to_numpy_f32` and fill the unoccupied cells with `-1`. This allows faster access at the cost of additional disk space.

Implementation Note: For large files (e.g., when choosing a large number of peaks that should be kept), saving the npz array can fail and trigger a ZIP64 error. This is supposed to be fixed in a later NumPy version.

In [None]:
#export
def list_to_numpy_f32(long_list):
    """
    Function to convert a list to float32 array

    Every entry of long_list becomes one column. Columns shorter than the
    longest entry are padded with -1.

    Args:
        long_list: Sequence of sequences of numbers, possibly of different lengths.

    Returns:
        np.ndarray: float32 array of shape (longest entry, number of entries).
        An empty long_list yields an array of shape (0, 0).
    """
    # default=0 keeps an empty input from raising inside max().
    n_rows = max((len(entry) for entry in long_list), default=0)

    np_array = np.full((n_rows, len(long_list)), -1, dtype=np.float32)

    for column, entry in enumerate(long_list):
        np_array[: len(entry), column] = entry

    return np_array

        
def save_query_as_npz(raw_file_npz, query_data):
    """
    Write query_data to an npz container and return the target path.

    The 'mass_list_ms2' and 'int_list_ms2' entries are converted to padded
    float32 matrices with list_to_numpy_f32; every other entry is stored as
    is. A "bounds" entry is added that counts, per column of the padded mass
    matrix, the cells with a value >= 0.
    """
    padded_keys = ('mass_list_ms2', 'int_list_ms2')

    to_save = {
        key: list_to_numpy_f32(value) if key in padded_keys else value
        for key, value in query_data.items()
    }

    # Padding cells hold -1, so counting non-negative cells gives the peak count.
    occupied = to_save['mass_list_ms2'] >= 0
    to_save["bounds"] = np.sum(occupied, axis=0).astype(np.int64)

    np.savez(raw_file_npz, **to_save)

    return raw_file_npz

## Parsing other Files

Benchmarking proteomics software against each other is not straightforward as various naming conventions exist, and different algorithms are implemented. In this section, we define some helper functions that allow us to facilitate the comparison of different tools.

### Reading MaxQuant xml settings file

In [None]:
#export
import xml.etree.ElementTree as ET

def extract_nested(child):
    """
    Recursively turn an XML element into plain Python values.

    An element with sub-elements becomes a dict mapping each sub-element's tag
    to its extracted value. A leaf element yields its text, with the strings
    'True' and 'False' converted to booleans.
    """
    if len(child) > 0:
        return {sub.tag: extract_nested(sub) for sub in child}

    text = child.text
    if text == 'True':
        return True
    if text == 'False':
        return False
    return text

def extract_mq_settings(path):
    """
    Read a MaxQuant xml settings file into a dictionary.

    Each child of the XML root becomes one key (its tag); the value is
    produced by extract_nested. Raises ValueError if path does not end
    with '.xml'.
    """
    if path.endswith('.xml'):
        root = ET.parse(path).getroot()
        return {child.tag: extract_nested(child) for child in root}

    raise ValueError("Path {} is not a valid xml file.".format(path))

In [None]:
# Example: read the test mqpar file and display its 'fastaFiles' entry
mq_dict = extract_mq_settings('../testfiles/test_mqpar.xml')
mq_dict['fastaFiles']

{'FastaFileInfo': {'fastaFilePath': 'testfile.fasta',
  'identifierParseRule': '>([^\\s]*)',
  'descriptionParseRule': '>(.*)',
  'taxonomyParseRule': None,
  'variationParseRule': None,
  'modificationParseRule': None,
  'taxonomyId': None}}

In [None]:
#export
def parse_mq_seq(peptide):
    """
    Translate a MaxQuant peptide string to the alphapept convention.

    The first and last character (the flanking '_') are dropped, then the
    known modification tokens are replaced one after another.
    ToDo: include more sequences
    """
    replacements = (
        ('(Acetyl (Protein N-term))', 'a'),
        ('M(Oxidation (M))', 'oxM'),
        ('C', 'cC'),  # This is fixed and not indicated in MaxQuant
    )

    parsed = peptide[1:-1]  # Remove _
    for mq_token, alphapept_token in replacements:
        parsed = parsed.replace(mq_token, alphapept_token)

    return parsed

In [None]:
# Example: the oxidized methionine is rewritten to 'oxM' and the flanking '_' are removed
parse_mq_seq('_AFQPFFVELTM(Oxidation (M))PYSVIR_')

'AFQPFFVELToxMPYSVIR'

In [None]:
#hide
from nbdev.export import *
# Run the nbdev export; the cell output lists each notebook that was converted.
notebook2script()

Converted 00_settings.ipynb.
Converted 01_chem.ipynb.
Converted 02_io.ipynb.
Converted 03_fasta.ipynb.
Converted 04_feature_finding.ipynb.
Converted 05_search.ipynb.
Converted 06_score.ipynb.
Converted 07_recalibration.ipynb.
Converted 08_quantification.ipynb.
Converted 09_matching.ipynb.
Converted 10_constants.ipynb.
Converted index.ipynb.
