In [None]:
#| default_exp match.psm_match

# Match

Peak-matching functionality: match the theoretical fragment m/z values of PSMs against the peaks of MS2 spectra.

In [None]:
#| export

import numpy as np
import numba
import pandas as pd
import tqdm
from typing import Union, Tuple

from alphabase.peptide.fragment import (
    create_fragment_mz_dataframe, 
    get_charged_frag_types
)

from alpharaw.ms_data_base import (
    MSData_Base, ms_reader_provider
)

from alpharaw.match.match_utils import (
    match_closest_peaks, match_highest_peaks, 
)
from alpharaw.utils.ms_path_utils import parse_ms_files_to_dict

In [None]:
#| export

@numba.njit
def match_one_raw_with_numba(
    spec_idxes, frag_start_idxes, frag_end_idxes,
    all_frag_mzs,
    all_spec_mzs, all_spec_intensities, 
    peak_start_idxes, peak_end_idxes,
    matched_intensities, matched_mz_errs,
    use_ppm, tol, centroid_mode=True,
):
    """ 
    Internal function to match fragment m/z values to spectrum m/z values
    for all PSMs of one raw file.

    Results are written in place into `matched_intensities` and
    `matched_mz_errs`; nothing is returned.
    matched_mz_errs[i] = np.inf if no peaks are matched.

    Parameters
    ----------
    spec_idxes, frag_start_idxes, frag_end_idxes
        Parallel sequences, one entry per PSM. `spec_idx` indexes
        `peak_start_idxes`/`peak_end_idxes`; `frag_start:frag_end` selects
        the PSM's rows in `all_frag_mzs` and in both output arrays.

    all_frag_mzs
        2D array of fragment m/z values (rows are sliced per PSM).

    all_spec_mzs, all_spec_intensities
        Flat peak arrays; the peaks of one spectrum are the slice
        `peak_start_idxes[spec_idx]:peak_end_idxes[spec_idx]`.

    peak_start_idxes, peak_end_idxes
        Per-spectrum start/end offsets into the flat peak arrays.

    matched_intensities, matched_mz_errs
        Output arrays indexed like `all_frag_mzs`. Rows of PSMs whose
        spectrum has no peaks are left untouched.

    use_ppm
        If True, the tolerance per fragment is `frag_mz*tol*1e-6`;
        otherwise `tol` is used as an absolute tolerance.

    tol
        Tolerance value (ppm or absolute, see `use_ppm`).

    centroid_mode
        If True use `match_closest_peaks`, otherwise `match_highest_peaks`.
    """
    for spec_idx, frag_start, frag_end in zip(
        spec_idxes, frag_start_idxes, frag_end_idxes
    ):
        peak_start = peak_start_idxes[spec_idx]
        peak_end = peak_end_idxes[spec_idx]
        # Empty spectrum: nothing to match, keep the pre-filled output values.
        if peak_end == peak_start: continue
        spec_mzs = all_spec_mzs[peak_start:peak_end]
        spec_intens = all_spec_intensities[peak_start:peak_end]

        frag_mzs = all_frag_mzs[frag_start:frag_end,:].copy()
        
        # Per-fragment tolerance window, same shape as frag_mzs.
        if use_ppm:
            frag_mz_tols = frag_mzs*tol*1e-6
        else:
            frag_mz_tols = np.full_like(frag_mzs, tol)
        
        # Flattened peak indices into spec_mzs; the code below treats
        # -1 as "no peak matched".
        if centroid_mode:
            matched_idxes = match_closest_peaks(
                spec_mzs, spec_intens, 
                frag_mzs, frag_mz_tols
            ).reshape(-1)
        else:
            matched_idxes = match_highest_peaks(
                spec_mzs, spec_intens, 
                frag_mzs, frag_mz_tols
            ).reshape(-1)

        # Indexing with -1 picks the last peak, so unmatched entries
        # must be overwritten explicitly afterwards.
        matched_intens = spec_intens[matched_idxes]
        matched_intens[matched_idxes==-1] = 0

        # Absolute m/z deviation between matched peak and fragment.
        matched_mass_errs = np.abs(
            spec_mzs[
                matched_idxes.reshape(-1)
            ]-frag_mzs.reshape(-1)
        )
        matched_mass_errs[matched_idxes==-1] = np.inf

        # Write the results back into this PSM's rows of the output arrays.
        matched_intensities[
            frag_start:frag_end,:
        ] = matched_intens.reshape(frag_mzs.shape)

        matched_mz_errs[
            frag_start:frag_end,:
        ] = matched_mass_errs.reshape(frag_mzs.shape)


In [None]:
#| export

def load_ms_data(
    ms_file:Union[str, MSData_Base],
    ms_file_type:str='alpharaw_hdf',
):
    """Return an MS data object for `ms_file`.

    Parameters
    ----------
    ms_file : str | MSData_Base
        ms2 file path, or an already loaded `MSData_Base` object,
        which is returned unchanged.

    ms_file_type : str, optional
        ms2 file type, could be 
        ["alpharaw_hdf","thermo","sciex","alphapept_hdf","mgf"].
        Only used when `ms_file` is not an `MSData_Base`.
        Default to 'alpharaw_hdf'

    Returns
    -------
    The given `MSData_Base` object, or the reader obtained from
    `ms_reader_provider` after `import_raw(ms_file)` has been called on it.
    """
    # Already loaded data is passed straight through.
    if isinstance(ms_file, MSData_Base):
        return ms_file

    reader = ms_reader_provider.get_reader(ms_file_type)
    reader.import_raw(ms_file)
    return reader

In [None]:
#| export

class PepSpecMatch:
    """
    Extract fragment ions from MS2 data.

    Parameters
    ----------
    charged_frag_types : list, optional
        fragment types with charge states, 
        e.g. ['b_z1', 'y_z2', 'b_modloss_z1', 'y_H2O_z2'].
        By default `get_charged_frag_types(['b','y','b_modloss','y_modloss'], 2)`.
        The list is copied, so later changes to the passed list
        do not affect this object.

    centroid_mode : bool, optional
        if True, match the closest peak for a m/z;
        if False, match the highest peak for a m/z in the tolerance range.
        By default True

    use_ppm : bool, optional
        If True, `tol_value` is a ppm tolerance, otherwise an absolute 
        m/z tolerance. By default True
        
    tol_value : float, optional
        tolerance value, by default 20.0
    """
    def __init__(self,
        charged_frag_types:list = get_charged_frag_types(
            ['b','y','b_modloss','y_modloss'], 2
        ), 
        centroid_mode:bool=True,
        use_ppm:bool = True,
        tol_value:float = 20.0
    ):
        # The default list is created once, when this method is defined.
        # Storing it by reference would share one list between all instances,
        # so mutating `self.charged_frag_types` on one matcher would silently
        # change every other matcher. Keep a private copy instead.
        self.charged_frag_types = list(charged_frag_types)
        self.centroid_mode = centroid_mode
        self.use_ppm = use_ppm
        self.tol = tol_value

    def _preprocess_psms(self, psm_df):
        """Hook called before matching; does nothing in this class."""
        pass

    def get_fragment_mz_df(self, psm_df):
        """Create the fragment m/z dataframe of `psm_df` 
        for `self.charged_frag_types`."""
        return create_fragment_mz_dataframe(
            psm_df, self.charged_frag_types
        )

    def _add_missing_columns_to_psm_df(self,
        psm_df:pd.DataFrame, raw_data=None
    ):
        """Add 'rt' and/or 'mobility' from the spectrum table if missing.

        Parameters
        ----------
        psm_df : pd.DataFrame
            PSM dataframe with a 'spec_idx' column.

        raw_data : optional
            MS data object with a `spectrum_df` attribute.
            Defaults to `self.raw_data`.

        Returns
        -------
        pd.DataFrame
            `psm_df` itself if nothing was missing, otherwise a merged
            copy that keeps the original index values. 
            'rt_norm' (rt divided by the maximal rt of the raw file)
            is added whenever 'rt' had to be added.
        """
        if raw_data is None:
            raw_data = self.raw_data

        add_spec_info_list = []
        if 'rt' not in psm_df.columns:
            add_spec_info_list.append('rt')

        if (
            'mobility' not in psm_df.columns and 
            'mobility' in raw_data.spectrum_df.columns
        ):
            add_spec_info_list.append('mobility')

        if len(add_spec_info_list) > 0:
            # pfind does not report RT in the result file
            # NOTE(review): reset_index()/set_index('index') assumes the
            # index of psm_df is unnamed and that psm_df has no 'index' column.
            psm_df = psm_df.reset_index().merge(
                raw_data.spectrum_df[
                    ['spec_idx']+add_spec_info_list
                ],
                how='left',
                on='spec_idx',
            ).set_index('index')

            if 'rt' in add_spec_info_list:
                psm_df['rt_norm'] = (
                    psm_df.rt/raw_data.spectrum_df.rt.max()
                )
        return psm_df

    def _prepare_matching_dfs(self, psm_df):
        """Create the fragment m/z dataframe and the two result dataframes.

        Returns
        -------
        tuple:
            pd.DataFrame: fragment mz dataframe.

            pd.DataFrame: matched intensity dataframe, filled with 0.

            pd.DataFrame: matched mass error dataframe, filled with np.inf.
        """
        fragment_mz_df = self.get_fragment_mz_df(psm_df)
        
        matched_intensity_df = pd.DataFrame(
            np.zeros_like(
                fragment_mz_df.values, dtype=np.float64
            ), 
            columns=fragment_mz_df.columns
        )

        matched_mz_err_df = pd.DataFrame(
            np.full_like(
                fragment_mz_df.values, np.inf, 
                dtype=np.float64
            ), 
            columns=fragment_mz_df.columns
        )
        return (
            fragment_mz_df, matched_intensity_df, 
            matched_mz_err_df
        )

    def load_ms_data(self,
        ms_file:Union[str, MSData_Base],
        ms_file_type:str='alpharaw_hdf',
    ):
        """Load MS files and store the result in `self.raw_data`.

        Parameters
        ----------
        ms_file : str | MSData_Base
            ms2 file path

        ms_file_type : str, optional
            ms2 file type, could be 
            ["alpharaw_hdf","thermo","sciex","alphapept_hdf","mgf"].
            Default to 'alpharaw_hdf'
        """
        self.raw_data = load_ms_data(ms_file, ms_file_type)

    def get_peaks(self,
        spec_idx:int,
        **kwargs
    ):
        """Return the peaks of spectrum `spec_idx` from `self.raw_data`.
        Extra keyword arguments are accepted but not used here.
        """
        return self.raw_data.get_peaks(spec_idx)

    def _match_one_psm(self,
        spec_mzs:np.ndarray, spec_intens:np.ndarray,
        fragment_mz_df:pd.DataFrame, 
        matched_intensity_df:pd.DataFrame,
        matched_mz_err_df:pd.DataFrame,
        frag_start_idx:int, frag_end_idx:int,
    ):
        """Match the fragments of one PSM against one spectrum.

        Rows `frag_start_idx:frag_end_idx` of `matched_intensity_df` and 
        `matched_mz_err_df` are overwritten in place. 
        Nothing is done for an empty spectrum.
        """
        if len(spec_mzs)==0: return

        frag_mzs = fragment_mz_df.values[
            frag_start_idx:frag_end_idx,:
        ]

        if self.use_ppm:
            mz_tols = frag_mzs*self.tol*1e-6
        else:
            mz_tols = np.full_like(frag_mzs, self.tol)

        # Peak indices into spec_mzs; -1 is treated as "not matched" below.
        if self.centroid_mode:
            matched_idxes = match_closest_peaks(
                spec_mzs, spec_intens, frag_mzs, mz_tols
            )
        else:
            matched_idxes = match_highest_peaks(
                spec_mzs, spec_intens, frag_mzs, mz_tols,
            )
        
        # Indexing with -1 selects the last peak, 
        # so unmatched entries are reset explicitly.
        matched_intens = spec_intens[matched_idxes]
        matched_intens[matched_idxes==-1] = 0

        matched_mz_errs = np.abs(
            spec_mzs[matched_idxes]-frag_mzs
        )
        matched_mz_errs[matched_idxes==-1] = np.inf

        matched_intensity_df.values[
            frag_start_idx:frag_end_idx,:
        ] = matched_intens

        matched_mz_err_df.values[
            frag_start_idx:frag_end_idx,:
        ] = matched_mz_errs

    def match_ms2_one_raw(self, 
        psm_df_one_raw: pd.DataFrame,
    )->tuple:
        """
        Matching psm_df_one_raw against self.raw_data 
        after `self.load_ms_data()`

        Parameters
        ----------
        psm_df_one_raw : pd.DataFrame
            psm dataframe 
            that contains only one raw file

        Returns
        -------
        tuple:
            pd.DataFrame: psm dataframe with fragment index information.
            
            pd.DataFrame: fragment mz dataframe.
            
            pd.DataFrame: matched intensity dataframe.
            
            pd.DataFrame: matched mass error dataframe. 
            np.inf if a fragment is not matched.
            
        """
        self._preprocess_psms(psm_df_one_raw)

        psm_df_one_raw = self._add_missing_columns_to_psm_df(
            psm_df_one_raw
        )

        (
            fragment_mz_df, 
            matched_intensity_df,
            matched_mz_err_df,
        ) = self._prepare_matching_dfs(psm_df_one_raw)
        
        for (
            spec_idx, frag_start_idx, frag_end_idx
        ) in psm_df_one_raw[[
            'spec_idx', 'frag_start_idx', 
            'frag_end_idx'
        ]].values:
            (
                spec_mzs, spec_intens
            ) = self.get_peaks(spec_idx)

            self._match_one_psm(
                spec_mzs, spec_intens,
                fragment_mz_df, 
                matched_intensity_df,
                matched_mz_err_df,
                frag_start_idx, frag_end_idx,
            )

        return (
            psm_df_one_raw, fragment_mz_df, 
            matched_intensity_df, matched_mz_err_df
        )

    def _match_ms2_one_raw_numba(self, raw_name, df_group):
        """Match the PSMs of one raw file with the numba implementation.

        Matched values are written in place into 
        `self.matched_intensity_df` and `self.matched_mz_err_df`.

        Returns
        -------
        pd.DataFrame
            `df_group` with the missing spectrum columns added
            (see `_add_missing_columns_to_psm_df`), or `df_group` unchanged
            if `raw_name` is not in `self._ms_file_dict`.
        """
        if raw_name not in self._ms_file_dict:
            return df_group

        raw_data = load_ms_data(
            self._ms_file_dict[raw_name], self._ms_file_type
        )

        df_group = self._add_missing_columns_to_psm_df(
            df_group, raw_data
        )

        match_one_raw_with_numba(
            df_group.spec_idx.values,
            df_group.frag_start_idx.values,
            df_group.frag_end_idx.values,
            self.fragment_mz_df.values,
            raw_data.peak_df.mz.values, 
            raw_data.peak_df.intensity.values,
            raw_data.spectrum_df.peak_start_idx.values,
            raw_data.spectrum_df.peak_end_idx.values,
            self.matched_intensity_df.values,
            self.matched_mz_err_df.values,
            self.use_ppm, self.tol, 
            self.centroid_mode
        )
        # Hand the augmented group back; it used to be dropped here, 
        # which lost the added 'rt'/'mobility'/'rt_norm' columns.
        return df_group
    
    def match_ms2_multi_raw(self,
        psm_df: pd.DataFrame,
        ms_files: Union[dict,list],
        ms_file_type:str = 'alpharaw_hdf',
    ):
        """Matching PSM dataframe against the ms2 files in ms_files
        This method will store matched values as attributes:
        - self.psm_df
        - self.fragment_mz_df
        - self.matched_intensity_df
        - self.matched_mz_err_df

        Parameters
        ----------
        psm_df : pd.DataFrame
            PSM dataframe

        ms_files : dict | list
            if dict: {raw_name: ms2 path}
            if list: [ms2 path1, ms2 path2]

        ms_file_type : str, optional
            Could be 'alpharaw_hdf', 'mgf' or 'thermo', 'sciex', 'alphapept_hdf'. 
            Defaults to 'alpharaw_hdf'.
            
        Returns
        -------
        tuple:
            pd.DataFrame: psm dataframe with fragment index information.
            PSMs of raw files found in `ms_files` additionally get the 
            missing 'rt'/'mobility'/'rt_norm' columns; 
            PSMs of other raw files are kept without matches.
            
            pd.DataFrame: fragment mz dataframe.
            
            pd.DataFrame: matched intensity dataframe.
            
            pd.DataFrame: matched mass error dataframe. 
            np.inf if a fragment is not matched.

        """
        self._preprocess_psms(psm_df)
        self.psm_df = psm_df
        
        (
            self.fragment_mz_df, 
            self.matched_intensity_df,
            self.matched_mz_err_df,
        ) = self._prepare_matching_dfs(psm_df)
        
        if isinstance(ms_files, dict):
            self._ms_file_dict = ms_files
        else:
            self._ms_file_dict = parse_ms_files_to_dict(ms_files)

        self._ms_file_type = ms_file_type

        psm_df_list = []
        for raw_name, df_group in tqdm.tqdm(
            self.psm_df.groupby('raw_name')
        ):
            matched_group = self._match_ms2_one_raw_numba(
                raw_name, df_group
            )
            # An override written against the old contract returns None;
            # fall back to the unmodified group in that case.
            psm_df_list.append(
                df_group if matched_group is None else matched_group
            )

        # groupby() skips rows without a raw_name; keep them in the result.
        unnamed_psm_df = psm_df[psm_df['raw_name'].isna()]
        if len(unnamed_psm_df) > 0:
            psm_df_list.append(unnamed_psm_df)

        if len(psm_df_list) > 0:
            matched_psm_df = pd.concat(psm_df_list)
            # Concatenating the groups orders rows by raw_name. 
            # Restore the caller's row order when the index labels 
            # identify rows unambiguously.
            if (
                psm_df.index.is_unique and
                matched_psm_df.index.is_unique and
                len(matched_psm_df) == len(psm_df)
            ):
                matched_psm_df = matched_psm_df.loc[psm_df.index]
            self.psm_df = matched_psm_df

        return (
            self.psm_df, self.fragment_mz_df, 
            self.matched_intensity_df, self.matched_mz_err_df
        )

In [None]:
#| hide
from nbdev.showdoc import show_doc

In [None]:
show_doc(PepSpecMatch.match_ms2_multi_raw)

---

[source](https://github.com/MannLabs/alpharaw/blob/main/alpharaw/match/psm_match.py#L366){target="_blank" style="float:right; font-size:smaller"}

### PepSpecMatch.match_ms2_multi_raw

>      PepSpecMatch.match_ms2_multi_raw (psm_df:pandas.core.frame.DataFrame,
>                                        ms_files:Union[dict,list],
>                                        ms_file_type:str='alpharaw_hdf')

Matching PSM dataframe against the ms2 files in ms_files
This method will store matched values as attributes:
- self.psm_df
- self.fragment_mz_df
- self.matched_intensity_df
- self.matched_mz_err_df

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| psm_df | DataFrame |  | PSM dataframe |
| ms_files | typing.Union[dict, list] |  | if dict: {raw_name: ms2 path}<br>if list: [ms2 path1, ms2 path2] |
| ms_file_type | str | alpharaw_hdf | Could be 'alpharaw_hdf', 'mgf' or 'thermo', 'sciex', 'alphapept_hdf'. <br>Defaults to 'alpharaw_hdf'. |
| **Returns** | **tuple:** |  | **pd.DataFrame: psm dataframe with fragment index information.<br><br>pd.DataFrame: fragment mz dataframe.<br><br>pd.DataFrame: matched intensity dataframe.<br><br>pd.DataFrame: matched mass error dataframe. <br>np.inf if a fragment is not matched.** |

In [None]:
show_doc(PepSpecMatch.match_ms2_one_raw)

---

[source](https://github.com/MannLabs/alpharaw/blob/main/alpharaw/match/psm_match.py#L280){target="_blank" style="float:right; font-size:smaller"}

### PepSpecMatch.match_ms2_one_raw

>      PepSpecMatch.match_ms2_one_raw
>                                      (psm_df_one_raw:pandas.core.frame.DataFra
>                                      me)

Matching psm_df_one_raw against self.raw_data 
after `self.load_ms_data()`

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| psm_df_one_raw | DataFrame | psm dataframe <br>that contains only one raw file |
| **Returns** | **tuple** | **pd.DataFrame: psm dataframe with fragment index information.<br><br>pd.DataFrame: fragment mz dataframe.<br><br>pd.DataFrame: matched intensity dataframe.<br><br>pd.DataFrame: matched mass error dataframe. <br>np.inf if a fragment is not matched.** |

In [None]:
#| hide
import io
import copy

import peptdeep.psm_frag_reader.psmlabel_reader #to register psmlabel_reader
from peptdeep.psm_frag_reader.psm_frag_reader import psm_w_frag_reader_provider
from alphabase.peptide.fragment import create_fragment_mz_dataframe_by_sort_precursor
from alpharaw.legacy_msdata.mgf import MGFReader

In [None]:
#| hide
#unittest
mgf = io.StringIO("""
BEGIN IONS
TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.31809.31809.3.0.dta
CHARGE=3+
RTINSECONDS=0.5418930
PEPMASS=272.276336
103.92207 5457.3
104.20045 5051.4
108.70090 5891.7
113.94175 6442.6
116.92975 40506.3
116.93716 8945.5
128.37773 6427.8
131.95308 288352.6
133.93259 7344.6
138.44611 7326.1
139.00072 41556.8
140.00319 16738.8
140.99719 9493.8
145.93156 10209.3
145.94897 10497.8
147.94559 8206.3
147.96396 30552.8
148.95543 14654.7
149.96338 234207.8
150.95096 8306.0
157.01089 84638.9
158.01357 27925.7
159.00627 16084.7
163.94281 24751.1
163.95915 32203.3
165.95605 44458.0
165.97186 11530.2
166.99500 26432.2
167.97302 9216.7
181.95230 13858.8
191.95448 66152.7
192.95538 8408.9
193.07185 9092.8
193.95313 660574.9
194.95674 23452.8
194.99008 143940.9
200.00568 19510.8
200.99942 23678.7
204.30894 9406.1
209.96466 21853.6
211.96245 65351.0
218.90355 9149.6
223.91072 11300.2
238.89684 12108.8
243.93825 10150.2
243.97040 10987.7
244.94121 8744.2
246.90314 11556.3
271.93225 29430.0
271.99219 51184.4
272.19150 31960.4
272.98602 35844.1
273.94431 11031.8
284.47998 8191.3
290.00125 66212.4
290.99539 54064.7
293.89490 10005.0
407.06372 10838.2
464.36697 9715.4
633.40036 633.40036
698.81390 9711.7
707.301117 707.301117
END IONS
BEGIN IONS
TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.23862.23862.2.0.dta
CHARGE=2+
RTINSECONDS=0.6455220
PEPMASS=287.427959
103.34669 5304.0
104.66884 5639.7
113.42419 6258.3
118.84039 5837.5
119.93203 13977.3
130.69589 6876.2
133.94824 43094.3
134.30524 7671.5
135.96359 9031.3
138.99994 8329.7
146.95573 31143.9
147.96323 12176.5
150.95151 65859.3
151.95818 24384.2
157.01105 19241.5
157.34985 7532.5
161.08838 7843.9
161.94234 20119.7
162.95146 60110.4
163.95877 183305.5
164.96657 13647.5
174.95139 150331.9
175.95258 21393.4
178.94460 11433.1
179.95316 13650.5
180.96204 15353.5
190.94572 30418.9
191.95422 61914.1
192.61461 8642.1
192.94395 12331.4
192.96207 132342.5
193.96318 19303.0
209.04164 25149.6
209.96368 154185.0
209.98361 12353.5
213.86244 11541.3
224.93071 12903.0
228.92879 8773.6
241.86043 135357.5
242.86113 20805.2
242.94327 26679.4
243.95219 29569.9
244.92361 12153.5
246.90300 16650.3
252.96521 73484.3
253.96646 11527.5
286.85858 10166.4
287.94186 18763.2
303.87665 39189.3
304.88116 11976.0
321.89087 97122.5
322.88867 28020.8
370.28696 9008.2
389.82578 13277.0
407.83545 12220.4
425.84872 13236.5
482.54852 10940.2
END IONS
BEGIN IONS
TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.23431.23431.2.0.dta
CHARGE=2+
RTINSECONDS=0.6455220
PEPMASS=287.427959
103.34669 5304.0
104.66884 5639.7
END IONS
BEGIN IONS
TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.32733.32733.2.0.dta
CHARGE=2+
RTINSECONDS=0.6455220
PEPMASS=287.427959
103.34669 5304.0
104.66884 5639.7
402.705571 402.705571
END IONS
BEGIN IONS
TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.23669.23669.2.0.dta
CHARGE=2+
RTINSECONDS=0.6455220
PEPMASS=287.427959
END IONS
""")

# Two raw names backed by independent deep copies of the same in-memory MGF
# text (presumably so that reading one stream does not consume the other).
ms_file_dict = {
    'raw': copy.deepcopy(mgf),
    'raw1': copy.deepcopy(mgf),
}

psmlabel_str = '''spec	peptide	modinfo	b	b-NH3	b-H2O	b-ModLoss	y	y-NH3	y-H2O	y-ModLoss
raw.31809.31809.2.0.dta	PSTDLLMLK	2,Phospho[S];7,Oxidation[M];	b2+1,11394796;b3+1,1242152.8;b4+1,3736963.3;b4+2,169730.9;b5+1,1963146.4;b6+1,1264694.9;b6+2,265013.9;b7+1,1253226.5;b7+2,909294.6;b8+1,720161.7;		b2-H2O+1,1392711.1;b3-H2O+1,2807275.5;b4-H2O+1,656366;b5-H2O+1,341585;b6-H2O+1,209442.1;	b7-ModLoss+1,473386.4;b8-ModLoss+1,208994.1;	y8+1,22006548;y8+2,256042.3;y7+1,19231634;y7+2,213004.9;y6+1,6696723;y5+1,5890172;y4+1,4885660.5;y3+1,3570823.5;y2+1,1857323.8;y1+1,1636183.8;	y8-NH3+1,567207.4;y1-NH3+1,531551.1;	y8-H2O+1,1416820.1;y8-H2O+2,256081;y7-H2O+1,900931.1;y7-H2O+2,2961118.5;y3-H2O+1,184890.4;y2-H2O+1,306988.6;y1-H2O+1,1126237.5;	y8-ModLoss+1,4600049;y7-ModLoss+1,3840026.3;y6-ModLoss+1,1045096.9;y5-ModLoss+1,868705.3;y4-ModLoss+1,573257.7;y3-ModLoss+1,518627;
raw.23862.23862.2.0.dta	HTAYSDFLSDK		b1+1,299364.8;b2+1,3488062;b3+1,308160.7;b4+1,233294.5;b5+1,55810.8;b6+1,650653.9;b7+1,485245;b8+1,328604.8;b9+1,160565.1;b10+1,376348.6;	b7-NH3+1,63030.5;b10-NH3+1,129601.2;	b2-H2O+1,176123.1;b3-H2O+1,114956.5;b4-H2O+1,59385.5;b5-H2O+1,41324.8;b6-H2O+1,527812.9;b7-H2O+1,275831.8;b8-H2O+1,365457.2;b9-H2O+1,227540.1;b9-H2O+2,59055.5;b10-H2O+1,265041.1;b10-H2O+2,55810.8;		y10+1,2513661;y9+1,3651241.3;y8+1,989975.4;y7+1,594356.4;y6+1,155207.8;y5+1,1266161.9;y4+1,321580;y3+1,1227822.8;y2+1,636557.6;y1+1,697604.3;	y10-NH3+1,75562.7;y7-NH3+1,102006.4;y1-NH3+1,185766.1;	y10-H2O+1,189888.1;y9-H2O+1,73236.7;y4-H2O+1,56329.2;y3-H2O+1,91522.7;y2-H2O+1,98231.2;y1-H2O+1,375849.7;	
raw.23431.23431.2.0.dta	HTAYSDFLSDK		b1+1,45976.2;b2+1,568759.5;b3+1,49093.1;b4+1,49601;b5+1,23729.4;b6+1,141218;b7+1,104082.9;b8+1,115693.4;b9+1,60744.1;b10+1,98634.1;	b5-NH3+1,12496.8;b8-NH3+1,33514.1;b9-NH3+1,34818.7;	b2-H2O+1,13616.9;b3-H2O+1,9902.4;b4-H2O+1,29442.6;b5-H2O+1,13391.7;b6-H2O+1,54826.9;b7-H2O+1,62953.9;b8-H2O+1,69100.3;b9-H2O+1,60146.4;b10-H2O+1,50907.2;b10-H2O+2,23729.4;		y10+1,361255.9;y9+1,552602.6;y8+1,160028.2;y7+1,102606.7;y6+1,22479.1;y5+1,167033.7;y4+1,76430.6;y3+1,273281.6;y2+1,165234.1;y1+1,142589;	y7-NH3+1,22439.1;y1-NH3+1,37364.8;	y10-H2O+1,29709;y9-H2O+1,16514.8;y3-H2O+1,36499.1;y2-H2O+1,17987.4;y1-H2O+1,96955.6;	
raw.32733.32733.2.0.dta	HFALFSTDVTK		b1+1,27135.7;b2+1,361137.4;b3+1,68835.3;b4+1,70138.3;b5+1,45754.8;b7+1,11576.6;b8+1,91503.8;b9+1,64331.7;b10+1,27626.7;b10+2,25667;		b3-H2O+1,48033;b9-H2O+1,14316.2;b10-H2O+1,11975.8;		y10+1,219460.2;y10+2,13433.4;y9+1,442455.6;y8+1,97392.2;y7+1,108960.5;y6+1,60849.7;y5+1,26771.3;y4+1,17036.4;y3+1,45523.9;y2+1,103608.1;y1+1,62643;	y6-NH3+2,11445.5;y1-NH3+1,18111.4;	y2-H2O+1,15362.3;y1-H2O+1,34004.8;	
raw.23669.23669.2.0.dta	HTAYSDFLSDK		b1+1,262855;b2+1,3235572.3;b3+1,268667.7;b4+1,237506.8;b5+1,80077.3;b6+1,557696.8;b7+1,336325.9;b7+2,31299.9;b8+1,247175;b8+2,28601.6;b9+1,116897.4;b9+2,18714.8;b10+1,275498.9;	b2-NH3+1,19037.2;	b2-H2O+1,141344.2;b3-H2O+1,92893.6;b4-H2O+1,56392;b5-H2O+1,46386.1;b6-H2O+1,404526;b7-H2O+1,203047.2;b7-H2O+2,13485.6;b8-H2O+1,231333.9;b8-H2O+2,30468.7;b9-H2O+1,151952.4;b9-H2O+2,53914;b10-H2O+1,172398.7;b10-H2O+2,80077.3;		y10+1,1652851.5;y10+2,31706.2;y9+1,2379192.5;y8+1,664060.9;y8+2,26944.2;y7+1,418105.1;y6+1,118890.7;y5+1,1026599.5;y4+1,309265.2;y3+1,1084321;y2+1,608127.8;y1+1,617369.5;	y10-NH3+1,41452.9;y7-NH3+1,61761.1;y2-NH3+1,32386.8;y1-NH3+1,199112.3;	y10-H2O+1,127643.4;y9-H2O+1,49576.6;y8-H2O+1,26233.2;y6-H2O+1,13648.5;y5-H2O+1,34467.8;y4-H2O+1,28410.1;y3-H2O+1,75421.2;y2-H2O+1,106013.4;y1-H2O+1,351150.3;	
raw1.31809.31809.2.0.dta	PSTDLLMLK	2,Phospho[S];7,Oxidation[M];	b2+1,11394796;b3+1,1242152.8;b4+1,3736963.3;b4+2,169730.9;b5+1,1963146.4;b6+1,1264694.9;b6+2,265013.9;b7+1,1253226.5;b7+2,909294.6;b8+1,720161.7;		b2-H2O+1,1392711.1;b3-H2O+1,2807275.5;b4-H2O+1,656366;b5-H2O+1,341585;b6-H2O+1,209442.1;	b7-ModLoss+1,473386.4;b8-ModLoss+1,208994.1;	y8+1,22006548;y8+2,256042.3;y7+1,19231634;y7+2,213004.9;y6+1,6696723;y5+1,5890172;y4+1,4885660.5;y3+1,3570823.5;y2+1,1857323.8;y1+1,1636183.8;	y8-NH3+1,567207.4;y1-NH3+1,531551.1;	y8-H2O+1,1416820.1;y8-H2O+2,256081;y7-H2O+1,900931.1;y7-H2O+2,2961118.5;y3-H2O+1,184890.4;y2-H2O+1,306988.6;y1-H2O+1,1126237.5;	y8-ModLoss+1,4600049;y7-ModLoss+1,3840026.3;y6-ModLoss+1,1045096.9;y5-ModLoss+1,868705.3;y4-ModLoss+1,573257.7;y3-ModLoss+1,518627;
raw1.23862.23862.2.0.dta	HTAYSDFLSDK		b1+1,299364.8;b2+1,3488062;b3+1,308160.7;b4+1,233294.5;b5+1,55810.8;b6+1,650653.9;b7+1,485245;b8+1,328604.8;b9+1,160565.1;b10+1,376348.6;	b7-NH3+1,63030.5;b10-NH3+1,129601.2;	b2-H2O+1,176123.1;b3-H2O+1,114956.5;b4-H2O+1,59385.5;b5-H2O+1,41324.8;b6-H2O+1,527812.9;b7-H2O+1,275831.8;b8-H2O+1,365457.2;b9-H2O+1,227540.1;b9-H2O+2,59055.5;b10-H2O+1,265041.1;b10-H2O+2,55810.8;		y10+1,2513661;y9+1,3651241.3;y8+1,989975.4;y7+1,594356.4;y6+1,155207.8;y5+1,1266161.9;y4+1,321580;y3+1,1227822.8;y2+1,636557.6;y1+1,697604.3;	y10-NH3+1,75562.7;y7-NH3+1,102006.4;y1-NH3+1,185766.1;	y10-H2O+1,189888.1;y9-H2O+1,73236.7;y4-H2O+1,56329.2;y3-H2O+1,91522.7;y2-H2O+1,98231.2;y1-H2O+1,375849.7;	
raw1.23431.23431.2.0.dta	HTAYSDFLSDK		b1+1,45976.2;b2+1,568759.5;b3+1,49093.1;b4+1,49601;b5+1,23729.4;b6+1,141218;b7+1,104082.9;b8+1,115693.4;b9+1,60744.1;b10+1,98634.1;	b5-NH3+1,12496.8;b8-NH3+1,33514.1;b9-NH3+1,34818.7;	b2-H2O+1,13616.9;b3-H2O+1,9902.4;b4-H2O+1,29442.6;b5-H2O+1,13391.7;b6-H2O+1,54826.9;b7-H2O+1,62953.9;b8-H2O+1,69100.3;b9-H2O+1,60146.4;b10-H2O+1,50907.2;b10-H2O+2,23729.4;		y10+1,361255.9;y9+1,552602.6;y8+1,160028.2;y7+1,102606.7;y6+1,22479.1;y5+1,167033.7;y4+1,76430.6;y3+1,273281.6;y2+1,165234.1;y1+1,142589;	y7-NH3+1,22439.1;y1-NH3+1,37364.8;	y10-H2O+1,29709;y9-H2O+1,16514.8;y3-H2O+1,36499.1;y2-H2O+1,17987.4;y1-H2O+1,96955.6;	
raw1.32733.32733.2.0.dta	HFALFSTDVTK		b1+1,27135.7;b2+1,361137.4;b3+1,68835.3;b4+1,70138.3;b5+1,45754.8;b7+1,11576.6;b8+1,91503.8;b9+1,64331.7;b10+1,27626.7;b10+2,25667;		b3-H2O+1,48033;b9-H2O+1,14316.2;b10-H2O+1,11975.8;		y10+1,219460.2;y10+2,13433.4;y9+1,442455.6;y8+1,97392.2;y7+1,108960.5;y6+1,60849.7;y5+1,26771.3;y4+1,17036.4;y3+1,45523.9;y2+1,103608.1;y1+1,62643;	y6-NH3+2,11445.5;y1-NH3+1,18111.4;	y2-H2O+1,15362.3;y1-H2O+1,34004.8;	
raw1.23669.23669.2.0.dta	HTAYSDFLSDK		b1+1,262855;b2+1,3235572.3;b3+1,268667.7;b4+1,237506.8;b5+1,80077.3;b6+1,557696.8;b7+1,336325.9;b7+2,31299.9;b8+1,247175;b8+2,28601.6;b9+1,116897.4;b9+2,18714.8;b10+1,275498.9;	b2-NH3+1,19037.2;	b2-H2O+1,141344.2;b3-H2O+1,92893.6;b4-H2O+1,56392;b5-H2O+1,46386.1;b6-H2O+1,404526;b7-H2O+1,203047.2;b7-H2O+2,13485.6;b8-H2O+1,231333.9;b8-H2O+2,30468.7;b9-H2O+1,151952.4;b9-H2O+2,53914;b10-H2O+1,172398.7;b10-H2O+2,80077.3;		y10+1,1652851.5;y10+2,31706.2;y9+1,2379192.5;y8+1,664060.9;y8+2,26944.2;y7+1,418105.1;y6+1,118890.7;y5+1,1026599.5;y4+1,309265.2;y3+1,1084321;y2+1,608127.8;y1+1,617369.5;	y10-NH3+1,41452.9;y7-NH3+1,61761.1;y2-NH3+1,32386.8;y1-NH3+1,199112.3;	y10-H2O+1,127643.4;y9-H2O+1,49576.6;y8-H2O+1,26233.2;y6-H2O+1,13648.5;y5-H2O+1,34467.8;y4-H2O+1,28410.1;y3-H2O+1,75421.2;y2-H2O+1,106013.4;y1-H2O+1,351150.3;	
'''
# Parse the psmlabel text above into a PSM dataframe.
reader = psm_w_frag_reader_provider.get_reader('psmlabel')
reader.import_file(io.StringIO(psmlabel_str))
psm_df = reader.psm_df
# Match all PSMs with the default matcher settings against both MGF copies.
matching = PepSpecMatch()
matching.match_ms2_multi_raw(psm_df, ms_file_dict, 'mgf')
merrs = matching.matched_mz_err_df.values
#np.sum(matching.matched_intensity_df.values!=0,axis=1)
# Exactly six fragments are expected to be matched over the two raw files:
# six finite mass errors and six non-zero intensities.
# NOTE(review): presumably these are the three peaks in the MGF text whose
# intensity equals their m/z, counted once per raw file; confirm.
assert len(merrs[~np.isinf(merrs)])==6
assert np.count_nonzero(matching.matched_intensity_df.values)==6

100%|██████████| 2/2 [00:03<00:00,  1.66s/it]


In [None]:
#| hide
# Same check as the previous cell, but ms_file_dict now holds already
# loaded reader objects instead of MGF streams.
# Relies on `mgf`, `psm_df` and `matching` from the previous cell.
mgf_reader = ms_reader_provider.get_reader('mgf')
mgf_reader.import_raw(copy.deepcopy(mgf))
ms_file_dict = {'raw': mgf_reader}
mgf_reader1 = ms_reader_provider.get_reader('mgf')
mgf_reader1.import_raw(copy.deepcopy(mgf))
ms_file_dict['raw1'] = mgf_reader1
matching.match_ms2_multi_raw(psm_df, ms_file_dict, 'mgf')
merrs = matching.matched_mz_err_df.values
# The number of matched fragments must be the same as when matching from streams.
assert np.count_nonzero(matching.matched_intensity_df.values) == 6
assert len(merrs[~np.isinf(merrs)]) == 6

100%|██████████| 2/2 [00:00<00:00, 101.41it/s]


In [None]:
#| hide
# Consistency check: the numba-based multi-raw matching and the per-PSM
# `match_ms2_one_raw` path must produce identical result tables.
ms_file_dict = {
    'raw': copy.deepcopy(mgf),
}
reader = psm_w_frag_reader_provider.get_reader('psmlabel')
reader.import_file(io.StringIO(psmlabel_str))
psm_df = reader.psm_df
# Keep only the PSMs whose raw_name does not start with 'raw1'.
psm_df = psm_df[~psm_df.raw_name.str.startswith('raw1')].copy()
matching = PepSpecMatch()
# Path 1: multi-raw matching; results are stored on `matching`.
matching.match_ms2_multi_raw(psm_df, ms_file_dict, 'mgf')
# Path 2: load the same MGF explicitly and match PSM by PSM.
matching.load_ms_data(copy.deepcopy(mgf), 'mgf')
df, frag_mz_df, frag_inten_df, frag_merr_df = matching.match_ms2_one_raw(
    psm_df
)
# Element-wise equality of fragment m/z, matched intensities and mass errors.
assert (matching.fragment_mz_df==frag_mz_df).values.all()
assert (matching.matched_intensity_df==frag_inten_df).values.all()
assert (matching.matched_mz_err_df==frag_merr_df).values.all()

100%|██████████| 1/1 [00:00<00:00, 71.69it/s]
