In [None]:
#default_exp peptide.fragment

In [None]:
#hide
%reload_ext autoreload
%autoreload 2

# `alphabase/peptide/fragment.py`: processing fragment-level dataframes

### First, it is worth mentioning that, in AlphaBase:
 1. peptide N-term modification site is 0
 2. C-term modification site is -1 
 3. other modifications sites are integers from 1 to nAA

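This distinction is needed in case a peptide carries two modifications at the same terminus: one on the peptide N-term itself (site 0) and the other on the side chain of the N-terminal amino acid (site 1). The same applies to the C-term (site -1 vs. site nAA).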

In [None]:
#export
import numpy as np
import pandas as pd
from typing import List, Union, Tuple
import warnings

from alphabase.peptide.mass_calc import *
from alphabase.constants.modification import (
    calc_modloss_mass
)
from alphabase.constants.element import (
    MASS_H2O, MASS_PROTON, 
    MASS_NH3, CHEM_MONO_MASS
)

def get_charged_frag_types(
    frag_types:List[str], 
    max_frag_charge:int = 2
)->List[str]:
    '''
    Expand every fragment type into one name per charge state.

    Args:
        frag_types (List[str]): uncharged fragment types, 
            e.g. ['b','y','b_modloss','y_modloss']
        max_frag_charge (int): highest charge to generate; charges run 
            from 1 to `max_frag_charge` inclusive. (default: 2)
    Returns:
        List[str]: names formatted as `{frag_type}_z{charge}`, ordered by 
        fragment type first and charge second. For 
        `frag_types=['b','y','b_modloss','y_modloss']` and `max_frag_charge=2` 
        this is `['b_z1','b_z2','y_z1','y_z2','b_modloss_z1','b_modloss_z2','y_modloss_z1','y_modloss_z2']`.
    '''
    return [
        f"{frag_type}_z{charge}"
        for frag_type in frag_types
        for charge in range(1, max_frag_charge+1)
    ]

def parse_charged_frag_type(
    charged_frag_type: str
)->Tuple[str,int]:
    '''
    Opposite of `get_charged_frag_types`: split a charged fragment 
    name into its fragment type and its charge.

    The name is cut at the last underscore; the part after it is 
    expected to be one prefix character (e.g. 'z') followed by the 
    integer charge.
    Args:
        charged_frag_type (str): e.g. 'y_z1', 'b_modloss_z1'
    Returns:
        str: fragment type, e.g. 'b','y','b_modloss'
        int: charge state, can be a negative value
    '''
    frag_type, _, charge_token = charged_frag_type.rpartition('_')
    # drop the leading 'z' and parse the remainder as the charge
    charge = int(charge_token[1:])
    return frag_type, charge

In [None]:
get_charged_frag_types(['b','b_modloss'],2)

['b_z1', 'b_z2', 'b_modloss_z1', 'b_modloss_z2']

In [None]:
parse_charged_frag_type('b_z2'), parse_charged_frag_type('b_modloss_z2')

(('b', 2), ('b_modloss', 2))

# Fragment dataframe processing

In the AlphaX ecosystem, library fragments are stored in a dataframe whose columns are charged_frag_types (`['b_z1','b_z2','y_z1','y_z2','b_modloss_z1','y_H2O_z1'...]`) and whose rows are the corresponding fragment positions (starting from the peptide N-term). The library precursor/peptide dataframe must contain the `frag_start_idx` and `frag_end_idx` columns, which tell us where the fragments of each precursor/peptide are located in the fragment dataframe.

We provide different ways to initialize fragment dataframes, see below:

In [None]:
#export
def init_zero_fragment_dataframe(
    peplen_array:np.array,
    charged_frag_types:List[str]
)->Tuple[pd.DataFrame, np.array, np.array]: 
    '''
    Allocate a zero-filled fragment dataframe for peptides of the given 
    lengths. Each peptide of length `n` occupies `n-1` consecutive rows.

    Args:
        peplen_array (np.array): peptide lengths for the fragment dataframe
        charged_frag_types (List[str]): column names, e.g. 
            `['b_z1','b_z2','y_z1','y_z2','b_modloss_z1','y_H2O_z1'...]`
    Returns:
        pd.DataFrame: `fragment_df` with zero values
        np.array (int64): the start indices point to the `fragment_df` for each peptide
        np.array (int64): the end (exclusive) indices point to the `fragment_df` for each peptide
    '''
    # frag_counts[0] stays 0 so that the cumulative sum starts at row 0
    frag_counts = np.zeros(len(peplen_array)+1, dtype=np.int64)
    frag_counts[1:] = peplen_array-1
    boundaries = frag_counts.cumsum()

    zero_values = np.zeros((boundaries[-1], len(charged_frag_types)))
    fragment_df = pd.DataFrame(zero_values, columns=charged_frag_types)

    start_indices, end_indices = boundaries[:-1], boundaries[1:]
    return fragment_df, start_indices, end_indices

def init_fragment_dataframe_from_other(
    reference_fragment_df: pd.DataFrame
):
    '''
    Create a zero-filled fragment dataframe with the same shape, value 
    dtype and columns as `reference_fragment_df`.

    The index of the reference is not carried over: the result gets a 
    fresh default index. The reference itself is not modified.
    '''
    zero_values = np.zeros_like(reference_fragment_df.values)
    column_names = reference_fragment_df.columns
    return pd.DataFrame(zero_values, columns=column_names)

def init_fragment_by_precursor_dataframe(
    precursor_df,
    charged_frag_types: List[str],
    reference_fragment_df: pd.DataFrame = None
):
    '''
    Init zero fragment dataframe for the `precursor_df`. If 
    the `reference_fragment_df` is provided, it will generate 
    the dataframe based on the reference. Otherwise it 
    generates the dataframe from scratch.

    Note that when `precursor_df` has no 'frag_start_idx' column, 
    this function adds the 'frag_start_idx' and 'frag_end_idx' 
    columns to `precursor_df` in place.
    Args:
        precursor_df (pd.DataFrame): precursors to generate fragment masses,
            if `precursor_df` contains the 'frag_start_idx' column, 
            it is better to provide `reference_fragment_df` as 
            `precursor_df.frag_start_idx` and `precursor.frag_end_idx` 
            point to the indices in `reference_fragment_df`. Without 
            'frag_start_idx', the 'nAA' column is required.
        charged_frag_types (List): 
            `['b_z1','b_z2','y_z1','y_z2','b_modloss_z1','y_H2O_z1'...]`
        reference_fragment_df (pd.DataFrame): generate fragment_mz_df based
            on this reference (default: None). It is only used when 
            `precursor_df` already contains 'frag_start_idx'.
    Returns:
        pd.DataFrame: zero `fragment_df` with given `charged_frag_types`
    '''
    if 'frag_start_idx' not in precursor_df.columns:
        # No fragment layout yet: build one from the peptide lengths and
        # record it on the precursor dataframe.
        fragment_df, start_indices, end_indices = init_zero_fragment_dataframe(
            precursor_df.nAA.values,
            charged_frag_types
        )
        precursor_df['frag_start_idx'] = start_indices
        precursor_df['frag_end_idx'] = end_indices
        return fragment_df

    if reference_fragment_df is not None:
        return init_fragment_dataframe_from_other(
            reference_fragment_df[charged_frag_types]
        )

    warnings.warn(
        "`precursor_df` contains the 'frag_start_idx' column, "\
        "it is better to provide `reference_fragment_df`", RuntimeWarning
    )
    # Without a reference, the largest end index is the only hint for the
    # number of fragment rows. `max()` of an empty column is NaN, which
    # cannot be used as an array dimension, so map "no precursors" to 0 rows.
    if len(precursor_df) > 0:
        n_frag_rows = precursor_df.frag_end_idx.max()
    else:
        n_frag_rows = 0
    return pd.DataFrame(
        np.zeros((n_frag_rows, len(charged_frag_types))),
        columns = charged_frag_types
    )

For a subset of the precursor dataframe, we need to set or get the fragment values of the corresponding slices (given by `frag_start_idx` and `frag_end_idx` in `precursor_df`) of the fragment dataframe. We use `update_sliced_fragment_dataframe()` to set the values, and `get_sliced_fragment_dataframe()` to get the values.

In [None]:
#export
def update_sliced_fragment_dataframe(
    fragment_df: pd.DataFrame,
    values: np.array,
    frag_start_end_list: List[Tuple[int,int]],
    charged_frag_types: List[str],
)->pd.DataFrame:
    '''
    Write `values` into the rows of `fragment_df` covered by the slices 
    `frag_start_end_list=[(start,end),(start,end),...]`. `fragment_df` 
    is modified in place.
    Args:
        fragment_df (pd.DataFrame): fragment dataframe to be set
        values (np.array): values to assign to the selected rows and the 
            `charged_frag_types` columns
        frag_start_end_list (List[Tuple[int,int]]): e.g. `[(start,end),(start,end),...]`, 
            `end` is exclusive
        charged_frag_types (List[str]): e.g. `['b_z1','b_z2','y_z1','y_z2']`
    Returns:
        pd.DataFrame: the same `fragment_df` object after the values are set
    '''
    # np.r_ expands every slice(start, end) into start..end-1 and
    # concatenates them into one flat array of row labels
    row_labels = np.r_[tuple(
        slice(frag_start, frag_end)
        for frag_start, frag_end in frag_start_end_list
    )]
    fragment_df.loc[row_labels, charged_frag_types] = values
    return fragment_df

def get_sliced_fragment_dataframe(
    fragment_df: pd.DataFrame,
    frag_start_end_list:Union[List,np.array],
    charged_frag_types:List = None,
)->pd.DataFrame:
    '''
    Select the rows of `fragment_df` covered by the slices 
    `frag_start_end_list=[(start,end),(start,end),...]`.
    Args:
        fragment_df (pd.DataFrame): fragment dataframe to read from
        frag_start_end_list (List[Tuple[int,int]]): e.g. `[(start,end),(start,end),...]`, 
            `end` is exclusive
        charged_frag_types (List[str]): columns to return, 
            e.g. `['b_z1','b_z2','y_z1','y_z2']` (default: None)
    Returns:
        pd.DataFrame: sliced fragment_df. If `charged_frag_types` is None 
        or empty, all columns are returned
    '''
    # np.r_ expands every slice(start, end) into start..end-1 and
    # concatenates them into one flat array of row labels
    row_labels = np.r_[tuple(
        slice(frag_start, frag_end)
        for frag_start, frag_end in frag_start_end_list
    )]
    use_all_columns = (
        charged_frag_types is None or len(charged_frag_types)==0
    )
    column_selector = slice(None) if use_all_columns else charged_frag_types
    return fragment_df.loc[row_labels, column_selector]

Some search engines report separate result files for different raw files. After loading them separately, we concatenate `precursor_df_list` and `fragment_df_list` into single dataframes respectively. The main processing here is to accumulate the `frag_start_idx` and `frag_end_idx` offsets across the different `precursor_df`s.

In [None]:
#export
def concat_precursor_fragment_dataframes(
    precursor_df_list: List[pd.DataFrame],
    fragment_df_list: List[pd.DataFrame],
    *other_fragment_df_lists
)->Tuple[pd.DataFrame,...]:
    '''
    Since fragment_df is indexed by precursor_df, when we concatenate multiple 
    fragment_df, the indexed positions will change in the precursor_dfs. 
    This function keeps the correct indexed positions of precursor_df when 
    concatenating multiple fragment_df dataframes.

    The i-th precursor dataframe has its 'frag_start_idx' and 'frag_end_idx' 
    shifted by the total number of rows of the fragment dataframes that 
    precede it. The shift is applied to new dataframes: the dataframes in 
    `precursor_df_list` are left unchanged, so calling this function 
    repeatedly on the same inputs gives the same result.
    Args:
        precursor_df_list (List[pd.DataFrame]): precursor dataframe list to concatenate
        fragment_df_list (List[pd.DataFrame]): fragment dataframe list to concatenate
        *other_fragment_df_lists: arbitrary other fragment dataframe lists to concatenate, 
            e.g. fragment_mass_df, fragment_inten_df, ...
    Returns:
        Tuple[pd.DataFrame,...]: concatenated precursor_df, fragment_df, *other_fragment_df ...
        All of them get a fresh default index.
    '''
    # cum_frag_df_lens[i] is the number of fragment rows that precede
    # the (i+1)-th fragment dataframe after concatenation
    cum_frag_df_lens = np.cumsum(
        [len(fragment_df) for fragment_df in fragment_df_list]
    )

    # The first precursor dataframe needs no shift and is passed through as is
    shifted_precursor_dfs = list(precursor_df_list[:1])
    for i, precursor_df in enumerate(precursor_df_list[1:]):
        offset = cum_frag_df_lens[i]
        # `assign` returns a new dataframe, the caller's one is not touched
        shifted_precursor_dfs.append(precursor_df.assign(
            frag_start_idx=precursor_df['frag_start_idx'] + offset,
            frag_end_idx=precursor_df['frag_end_idx'] + offset,
        ))

    return (
        pd.concat(shifted_precursor_dfs).reset_index(drop=True),
        pd.concat(fragment_df_list).reset_index(drop=True),
        *[pd.concat(other_list).reset_index(drop=True) 
            for other_list in other_fragment_df_lists
        ]
    )

# Create fragment mz dataframe
 This is one of the most important functions in alphabase. For a given `precursor_df`, it calculates the fragment ion m/z dataframe, and also sets the `frag_start_idx` and `frag_end_idx` column values to connect the `precursor_df` and `fragment_mz_df`.

In [None]:
#export
def create_fragment_mz_dataframe(
    precursor_df: pd.DataFrame,
    charged_frag_types:List,
    reference_fragment_df: pd.DataFrame = None,
)->Tuple[pd.DataFrame, pd.DataFrame]:
    '''
    Generate the fragment m/z dataframe for the precursor_df. If 
    the `reference_fragment_df` is provided, it will generate 
    the mz dataframe based on the reference. Otherwise it 
    generates the mz dataframe from scratch.

    Precursors are processed in groups of equal peptide length ('nAA'); 
    the 'nAA' column is added to `precursor_df` in place if it is missing. 
    Supported fragment types are 'b', 'y', 'b_modloss', 'y_modloss', 
    'b_H2O', 'y_H2O', 'b_NH3', 'y_NH3', 'c' and 'z'.
    Args:
        precursor_df (pd.DataFrame): precursors to generate fragment masses. 
            The 'sequence', 'mods' and 'mod_sites' columns are read; 
            'mod_deltas' and 'mod_delta_sites' are read if 'mod_deltas' 
            is present. If `precursor_df` contains the 'frag_start_idx' 
            column, `reference_fragment_df` must be provided
        charged_frag_types (List): 
            `['b_z1','b_z2','y_z1','y_z2','b_modloss_z1','y_H2O_z1'...]`
        reference_fragment_df (pd.DataFrame): generate fragment_mz_df based
            on this reference, as `precursor_df.frag_start_idx` and 
            `precursor.frag_end_idx` point to the indices in 
            `reference_fragment_df`
    Returns:
        pd.DataFrame: `precursor_df`, rebuilt by concatenating the 'nAA' 
        groups. If `precursor_df` contains the 'charge' column but no 
        'precursor_mz' column, this function will automatically assign 
        the 'precursor_mz'. Without a reference, 'frag_start_idx' and 
        'frag_end_idx' are assigned and the index is reset; with a 
        reference, the original index labels are kept.
        pd.DataFrame: `fragment_mz_df` with given `charged_frag_types`
    Raises:
        ValueError: when 1. `precursor_df` contains 'frag_start_idx' but 
        `reference_fragment_df` is None; or 2. `reference_fragment_df` 
        is not None but `precursor_df` does not contain 'frag_start_idx'
        NotImplementedError: when a fragment type in `charged_frag_types` 
        is not one of the supported types
    '''
    # `reference_fragment_df` and the 'frag_start_idx' column must be
    # either both present or both absent
    if reference_fragment_df is None:
        if 'frag_start_idx' in precursor_df.columns:
            raise ValueError(
                "`precursor_df` contains 'frag_start_idx' column, "\
                "please provide `reference_fragment_df` argument"
            )
    else:
        if 'frag_start_idx' not in precursor_df.columns:
            raise ValueError(
                "No column 'frag_start_idx' in `precursor_df` "\
                "to slice the `reference_fragment_df`"
            )
    if 'nAA' not in precursor_df.columns:
        precursor_df['nAA'] = precursor_df.sequence.str.len()
    if reference_fragment_df is not None:
        # one zero-filled output with the layout of the reference;
        # each nAA group fills its own slices below
        fragment_mz_df = init_fragment_dataframe_from_other(
            reference_fragment_df[charged_frag_types]
        )
    else:
        # one fragment dataframe per nAA group, concatenated at the end
        fragment_df_list = []

    precursor_df_list = []

    _grouped = precursor_df.groupby('nAA')
    for nAA, df_group in _grouped:
        # ';'-separated strings -> per-peptide lists, empty items dropped
        mod_list = df_group.mods.str.split(';').apply(
            lambda x: [m for m in x if len(m)>0]
        ).values
        site_list = df_group.mod_sites.str.split(';').apply(
            lambda x: [int(s) for s in x if len(s)>0]
        ).values

        if 'mod_deltas' in df_group.columns:
            mod_delta_list = df_group.mod_deltas.str.split(';').apply(
                lambda x: [float(m) for m in x if len(m)>0]
            ).values
            mod_delta_site_list = df_group.mod_delta_sites.str.split(';').apply(
                lambda x: [int(s) for s in x if len(s)>0]
            ).values
        else:
            mod_delta_list = None
            mod_delta_site_list = None
        (
            b_mass, y_mass, pepmass
        ) = calc_b_y_and_peptide_masses_for_same_len_seqs(
            df_group.sequence.values.astype('U'), 
            mod_list, site_list,
            mod_delta_list,
            mod_delta_site_list
        )
        # flatten to 1-D so the values line up with the fragment rows
        # NOTE(review): presumably one row per peptide before flattening -
        # confirm against `mass_calc`
        b_mass = b_mass.reshape(-1)
        y_mass = y_mass.reshape(-1)

        if (
            'charge' in df_group.columns and 
            'precursor_mz' not in df_group.columns
        ):
            df_group['precursor_mz'] = pepmass/df_group[
                'charge'
            ].values + MASS_PROTON

        # modloss masses are only computed when at least one requested
        # fragment type needs them
        for charged_frag_type in charged_frag_types:
            if charged_frag_type.startswith('b_modloss'):
                b_modloss = np.concatenate([
                    calc_modloss_mass(nAA, mods, sites, True)
                    for mods, sites in zip(mod_list, site_list)
                ])
                break
        for charged_frag_type in charged_frag_types:
            if charged_frag_type.startswith('y_modloss'):
                y_modloss = np.concatenate([
                    calc_modloss_mass(nAA, mods, sites, False)
                    for mods, sites in zip(mod_list, site_list)
                ])
                break

        # one m/z array per requested column, in `charged_frag_types` order;
        # every m/z is computed as mass/charge + MASS_PROTON
        set_values = []
        add_proton = MASS_PROTON
        for charged_frag_type in charged_frag_types:
            frag_type, charge = parse_charged_frag_type(charged_frag_type)
            if frag_type =='b':
                set_values.append(b_mass/charge + add_proton)
            elif frag_type == 'y':
                set_values.append(y_mass/charge + add_proton)
            elif frag_type == 'b_modloss':
                _mass = (b_mass-b_modloss)/charge + add_proton
                # a modloss of 0 is reported as m/z 0
                _mass[b_modloss == 0] = 0
                set_values.append(_mass)
            elif frag_type == 'y_modloss':
                _mass = (y_mass-y_modloss)/charge + add_proton
                # a modloss of 0 is reported as m/z 0
                _mass[y_modloss == 0] = 0
                set_values.append(_mass)
            elif frag_type == 'b_H2O':
                _mass = (b_mass-MASS_H2O)/charge + add_proton
                set_values.append(_mass)
            elif frag_type == 'y_H2O':
                _mass = (y_mass-MASS_H2O)/charge + add_proton
                set_values.append(_mass)
            elif frag_type == 'b_NH3':
                _mass = (b_mass-MASS_NH3)/charge + add_proton
                set_values.append(_mass)
            elif frag_type == 'y_NH3':
                _mass = (y_mass-MASS_NH3)/charge + add_proton
                set_values.append(_mass)
            elif frag_type == 'c':
                _mass = (b_mass+MASS_NH3)/charge + add_proton
                set_values.append(_mass)
            elif frag_type == 'z':
                _mass = (
                    y_mass-(MASS_NH3-CHEM_MONO_MASS['H'])
                )/charge + add_proton
                set_values.append(_mass)
            else:
                raise NotImplementedError(
                    f'Fragment type "{frag_type}" is not in fragment_mz_df.'
                )
        
        # `np.array(set_values).T` has one row per fragment position and
        # one column per charged fragment type
        if reference_fragment_df is not None:
            # write into the slices given by frag_start_idx/frag_end_idx
            update_sliced_fragment_dataframe(
                fragment_mz_df, np.array(set_values).T, 
                df_group[['frag_start_idx','frag_end_idx']].values, 
                charged_frag_types,
            )
        else:
            # build a group-local fragment dataframe; this also assigns
            # group-local frag_start_idx/frag_end_idx to `df_group`
            _fragment_mz_df = init_fragment_by_precursor_dataframe(
                df_group,
                charged_frag_types
            )
            _fragment_mz_df[:] = np.array(set_values).T
            fragment_df_list.append(_fragment_mz_df)
        precursor_df_list.append(df_group)

    if reference_fragment_df is not None:
        return pd.concat(precursor_df_list), fragment_mz_df
    else:
        # shifts the group-local fragment indices to global ones
        return concat_precursor_fragment_dataframes(
            precursor_df_list, fragment_df_list
        )


In [None]:
#export
def update_precursor_mz(
    precursor_df: pd.DataFrame
)->pd.DataFrame:
    """
    Calculate precursor_mz for the precursor_df. The dataframe is 
    modified in place: 'precursor_mz' is (re)assigned for every row, 
    and 'nAA' is added if it is missing.
    Args:
        precursor_df (pd.DataFrame): 
          precursor_df with the 'sequence', 'mods' and 'charge' columns. 
          'mod_deltas' is used if present.

    Returns:
        pd.DataFrame: the same precursor_df object with 'precursor_mz'
    """

    if 'nAA' not in precursor_df:
        precursor_df['nAA'] = precursor_df.sequence.str.len()
    # Initialize as float64: the column is filled with float m/z values
    # below, and assigning floats into an integer column relies on
    # silent upcasting, which recent pandas versions deprecate.
    precursor_df['precursor_mz'] = np.zeros(
        len(precursor_df), dtype=np.float64
    )
    # the mass calculation works on batches of equal-length sequences
    _grouped = precursor_df.groupby('nAA')
    for nAA, df_group in _grouped:

        pep_masses = calc_peptide_masses_for_same_len_seqs(
            df_group.sequence.values.astype('U'),
            df_group.mods.values,
            df_group.mod_deltas.values if 'mod_deltas' in df_group.columns else None
        )
        
        precursor_df.loc[
            df_group.index, 'precursor_mz'
        ] = pep_masses/df_group.charge + MASS_PROTON
    return precursor_df

### Examples:

In [None]:
# Toy library: `repeat` copies each of a 9-residue and a 14-residue peptide,
# all carrying the same three modifications at sites 0 (peptide N-term), 4 and 8.
repeat = 100
peptides = [
    seq for seq in ('AGHCEWQMK', 'AGHCEWQMKAADER') for _ in range(repeat)
]
mods = ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M'] * (2*repeat)
sites = ['0;4;8'] * (2*repeat)

precursor_df = pd.DataFrame(
    {'sequence': peptides, 'mods': mods, 'mod_sites': sites}
).assign(
    nAA=lambda df: df['sequence'].str.len(),
    charge=2,
)
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge
0,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
1,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
2,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
3,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
4,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
...,...,...,...,...,...
195,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2
196,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2
197,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2
198,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2


In [None]:
precursor_df = update_precursor_mz(precursor_df)
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz
0,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333
1,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333
2,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333
3,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333
4,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333
...,...,...,...,...,...,...
195,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869770
196,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869770
197,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869770
198,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869770


In [None]:
precursor_df, fragment_mz_df = create_fragment_mz_dataframe(
    precursor_df,
    get_charged_frag_types(['b','y','b_modloss','y_modloss'],2)
)
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz,frag_start_idx,frag_end_idx
0,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,0,8
1,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,8,16
2,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,16,24
3,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,24,32
4,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,32,40
...,...,...,...,...,...,...,...,...
195,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869770,2035,2048
196,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869770,2048,2061
197,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869770,2061,2074
198,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869770,2074,2087


In [None]:
fragment_mz_df

Unnamed: 0,b_z1,b_z2,y_z1,y_z2,b_modloss_z1,b_modloss_z2,y_modloss_z1,y_modloss_z2
0,114.054954,57.531115,1091.439711,546.223494,0.000000,0.000000,1027.441425,514.224351
1,171.076418,86.041847,1034.418247,517.712762,0.000000,0.000000,970.419962,485.713619
2,308.135330,154.571303,897.359336,449.183306,0.000000,0.000000,833.361050,417.184163
3,468.165979,234.586627,737.328687,369.167981,0.000000,0.000000,673.330401,337.168839
4,597.208572,299.107924,608.286094,304.646685,0.000000,0.000000,544.287808,272.647542
...,...,...,...,...,...,...,...,...
2095,1186.476825,593.742050,561.262715,281.134996,1122.478539,561.742908,0.000000,0.000000
2096,1257.513939,629.260607,490.225602,245.616439,1193.515653,597.261464,0.000000,0.000000
2097,1328.551053,664.779164,419.188488,210.097882,1264.552767,632.780021,0.000000,0.000000
2098,1443.577996,722.292636,304.161545,152.584410,1379.579710,690.293493,0.000000,0.000000


In [None]:
# `_reference_frag_df` is another name for the same object as `fragment_mz_df`
# (no copy is made), so zeroing it clears the m/z values computed above.
_reference_frag_df = fragment_mz_df
_reference_frag_df[:] = 0
# `precursor_df` now carries 'frag_start_idx', so a reference must be given.
# Only the requested columns are calculated; the result is a new dataframe
# and the reference stays all-zero (see the next cell).
precursor_df, fragment_mz_df = create_fragment_mz_dataframe(
    precursor_df,
    ['b_z1','y_z1'],
    reference_fragment_df=_reference_frag_df
)
fragment_mz_df

Unnamed: 0,b_z1,y_z1
0,114.054954,1091.439711
1,171.076418,1034.418247
2,308.135330,897.359336
3,468.165979,737.328687
4,597.208572,608.286094
...,...,...
2095,1186.476825,561.262715
2096,1257.513939,490.225602
2097,1328.551053,419.188488
2098,1443.577996,304.161545


In [None]:
_reference_frag_df

Unnamed: 0,b_z1,b_z2,y_z1,y_z2,b_modloss_z1,b_modloss_z2,y_modloss_z1,y_modloss_z2
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
2095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Note that `df.loc[start:end]` is label-based and also includes the `end` row, so `precursor_df.loc[:1]` below selects the first two precursors.

In [None]:
frag_types = ['y_z1']
frags = get_sliced_fragment_dataframe(
    fragment_mz_df, 
    precursor_df.loc[:1, ['frag_start_idx','frag_end_idx']].values,
    frag_types
)
frags

Unnamed: 0,y_z1
0,1091.439711
1,1034.418247
2,897.359336
3,737.328687
4,608.286094
5,422.206781
6,294.148203
7,147.112804
8,1091.439711
9,1034.418247


## Test mod deltas

In [None]:
# Two identical precursors; only the first one gets extra mass deltas,
# so the two rows can be compared directly.
repeat = 1
peptides = ['AGHCEWQMK']*repeat
mods = ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat
sites = ['0;4;8']*repeat
peptides += ['AGHCEWQMK']*repeat
mods += ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat
sites += ['0;4;8']*repeat

precursor_df = pd.DataFrame({
    'sequence': peptides,
    'mods': mods,
    'mod_sites': sites
})
precursor_df['nAA'] = precursor_df['sequence'].str.len()
precursor_df['charge'] = 2
# empty strings mean "no mass delta" for a precursor
mod_deltas = ['']*len(precursor_df)
mod_delta_sites = ['']*len(precursor_df)
# first precursor: +100 at site 0 (peptide N-term) and +200 at site -1 (C-term)
mod_deltas[0],mod_delta_sites[0] = '100;200','0;-1'
precursor_df['mod_deltas'] = mod_deltas
precursor_df['mod_delta_sites'] = mod_delta_sites
# the return value is not needed: `precursor_df` is updated in place
update_precursor_mz(precursor_df)
# NOTE: the fragment types are taken from the `fragment_mz_df` left over by
# an earlier cell, so this cell depends on the cells above having been run
precursor_df, fragment_mz_df = create_fragment_mz_dataframe(precursor_df, charged_frag_types=fragment_mz_df.columns.values)
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,mod_deltas,mod_delta_sites,precursor_mz,frag_start_idx,frag_end_idx
0,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,100;200,0;-1,752.747333,0,8
1,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,,,602.747333,8,16


In [None]:
fragment_mz_df

Unnamed: 0,b_z1,y_z1
0,214.054954,1291.439711
1,271.076418,1234.418247
2,408.13533,1097.359336
3,568.165979,937.328687
4,697.208572,808.286094
5,883.287885,622.206781
6,1011.346462,494.148203
7,1158.381862,347.112804
8,114.054954,1091.439711
9,171.076418,1034.418247


# Speed Test

In [None]:
# Benchmark fixture: 2 x `repeat` precursors (two different peptide lengths),
# all carrying the same three modifications.
repeat = 1000000
peptides = ['AGHCEWQMKADFEWGDGGGGGGGGGGGGK']*repeat
mods = ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat
sites = ['0;4;8']*repeat
peptides += ['AGHCEWQMKAADERADFEWGDGGGGGGGGGGGGR']*repeat
mods += ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat
sites += ['0;4;8']*repeat

precursor_df = pd.DataFrame({
    'sequence': peptides,
    'mods': mods,
    'mod_sites': sites
})
precursor_df['nAA'] = precursor_df['sequence'].str.len()
precursor_df['charge'] = 2
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge
0,AGHCEWQMKADFEWGDGGGGGGGGGGGGK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,29,2
1,AGHCEWQMKADFEWGDGGGGGGGGGGGGK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,29,2
2,AGHCEWQMKADFEWGDGGGGGGGGGGGGK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,29,2
3,AGHCEWQMKADFEWGDGGGGGGGGGGGGK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,29,2
4,AGHCEWQMKADFEWGDGGGGGGGGGGGGK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,29,2
...,...,...,...,...,...
1999995,AGHCEWQMKAADERADFEWGDGGGGGGGGGGGGR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,34,2
1999996,AGHCEWQMKAADERADFEWGDGGGGGGGGGGGGR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,34,2
1999997,AGHCEWQMKAADERADFEWGDGGGGGGGGGGGGR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,34,2
1999998,AGHCEWQMKAADERADFEWGDGGGGGGGGGGGGR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,34,2


### Timing: b/y ions without modloss

In [None]:
%timeit -n 1 -r 1 create_fragment_mz_dataframe(precursor_df,get_charged_frag_types(['b','y'], 2))

16.4 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### Timing: b/y ions with modloss
Generating `modloss` ions is much slower than generating b/y ions alone. As `modloss` is only important for phospho-peptide identification, we can ignore `modloss` for non-phosphoproteomics.

Still, it is already quite fast: generating b/y/b_modloss/y_modloss ions for 2M peptides takes about 50 seconds (see the timing below).

In [None]:
%timeit -n 1 -r 1 create_fragment_mz_dataframe(precursor_df,get_charged_frag_types(['b','y','b_modloss','y_modloss'], 2))

49.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
