In [None]:
#default_exp peptide.fragment

In [None]:
#hide
%reload_ext autoreload
%autoreload 2

In [None]:
#export
import numpy as np
import pandas as pd
from typing import List, Union, Tuple, Iterable
import warnings

from alphabase.constants.aa import \
    get_sequence_mass, \
    get_AA_masses_for_same_len_seqs,\
    get_sequence_masses_for_same_len_seqs
from alphabase.constants.modification import \
    get_modification_mass, get_modloss_mass,\
    get_modification_mass_sum
from alphabase.constants.element import \
    MASS_H2O, MASS_PROTON, MASS_NH3, CHEM_MONO_MASS

def get_charged_frag_types(
    frag_types:List[str], 
    max_frag_charge:int = 2
)->List[str]:
    '''
    Expand fragment types into charged fragment type names of the 
    form `{frag_type}_{charge}`, with charges running from 1 up to 
    and including `max_frag_charge`.
    Args:
        frag_types (List[str]): e.g. ['b','y','b_modloss','y_modloss']
        max_frag_charge (int): max fragment charge. (default: 2)
    Returns:
        List[str]: grouped by fragment type, charges ascending within each type. 
        For `frag_types=['b','y','b_modloss','y_modloss']` and `max_frag_charge=2`, 
        return `['b_1','b_2','y_1','y_2','b_modloss_1','b_modloss_2','y_modloss_1','y_modloss_2']`.
    '''
    charges = range(1, max_frag_charge+1)
    return [
        f"{frag_type}_{charge}"
        for frag_type in frag_types
        for charge in charges
    ]

def parse_charged_frag_type(
    charged_frag_type: str
)->Tuple[str,int]:
    '''
    Split a charged fragment type into its fragment type and charge. 
    The charge is the text after the last '_'; if that text does not 
    end with a digit (e.g. '1+'), its last character is dropped 
    before the conversion to int.
    Args:
        charged_frag_type (str): e.g. 'y_1', 'b_modloss_1', 'b_1+'
    Returns:
        str: fragment type, e.g. 'b','y'
        int: charge state, can be a negative value (e.g. for 'b_-1')
    '''
    frag_type, _, charge_str = charged_frag_type.rpartition('_')
    if charge_str[-1].isdigit():
        return frag_type, int(charge_str)
    # Trailing sign/marker character such as '+': strip it before parsing.
    return frag_type, int(charge_str[:-1])

def get_by_and_peptide_mass(
    sequence: str,
    mod_names: List[str],
    mod_sites: List[int]
)->Tuple[np.array,np.array,float]:
    '''
    Neutral b/y fragment masses and neutral peptide mass of a single peptide.
    It is highly recommended to use `get_by_and_peptide_mass_for_same_len_seqs()` 
    instead, as it is much faster.
    Args:
        sequence (str): peptide sequence
        mod_names (List[str]): modification names, e.g. `['Carbamidomethyl@C','Oxidation@M']`
        mod_sites (List[int]): modification sites corresponding to `mod_names`, e.g. `[4,8]`
    Returns:
        np.array: neutral b masses, i.e. cumulative residue (+modification) masses 
        without the last (full-length) entry
        np.array: neutral y masses, i.e. peptide mass minus the b masses
        float: neutral peptide mass (full-length cumulative mass plus H2O)
    '''
    aa_masses = get_sequence_mass(sequence)
    # Modification masses are added in place onto the residue masses.
    aa_masses += get_modification_mass(len(sequence), mod_names, mod_sites)

    prefix_masses = np.cumsum(aa_masses)
    pepmass = prefix_masses[-1] + MASS_H2O
    b_masses = prefix_masses[:-1]
    return b_masses, pepmass - b_masses, pepmass

def get_peptide_mass_for_same_len_seqs(
    sequences: np.array,
    mod_list: Iterable[Union[str, List[str]]],
)->np.array:
    '''
    Neutral peptide masses for peptides that all have the same length.
    Args:
        sequences (np.array of str): peptide sequences with the same length.
        mod_list (Iterable[Union[str, List[str]]]): modifications of each peptide, 
            one entry per sequence. Each entry is either a ';'-joined string, 
            e.g. `'Oxidation@M;Phospho@S'`, or a list of modification names, 
            e.g. `['Oxidation@M','Phospho@S']`. An empty string, empty list 
            or None means the peptide is unmodified.
    Returns:
        np.array: peptide masses (1-D array, H2O already added)
    '''
    seq_masses = get_sequence_masses_for_same_len_seqs(
        sequences
    )
    mod_masses = np.zeros_like(seq_masses)
    for i, mods in enumerate(mod_list):
        # Normalize every entry to a list of modification names so that both
        # the ';'-joined form and the list form are accepted.
        if mods is None:
            mod_names = []
        elif isinstance(mods, str):
            mod_names = mods.split(';') if mods else []
        else:
            mod_names = list(mods)
        if mod_names:
            mod_masses[i] = get_modification_mass_sum(mod_names)
    return seq_masses+mod_masses
    

def get_by_and_peptide_mass_for_same_len_seqs(
    sequences: np.array,
    mod_list: Iterable[List[str]],
    site_list: Iterable[List[int]]
)->Tuple[np.array,np.array,np.array]:
    '''
    Neutral b/y fragment masses and neutral peptide masses for a batch of 
    peptides that all have the same length.
    Args:
        sequences (np.array of str): np.array of peptide sequences with same length.
        mod_list (Iterable[List[str]]): Iterable of modifications (list, array, ...), 
            e.g. `[['Oxidation@M','Phospho@S'],['Phospho@S','Deamidated@N']]`. 
            An empty list means the peptide is unmodified.
        site_list (Iterable[List[int]]): Iterable of modification sites
            corresponding to `mod_list`, e.g. `[[3,6],[4,17]]`
    Returns:
        np.array: neutral b fragment masses (2-D array, one row per peptide)
        np.array: neutral y fragment masses (2-D array, one row per peptide)
        np.array: neutral peptide masses (1-D array)
    '''
    aa_masses = get_AA_masses_for_same_len_seqs(sequences)
    mod_masses = np.zeros_like(aa_masses)
    # All sequences share one length, so the first one defines it.
    nAA = len(sequences[0])
    for row, (mod_names, mod_sites) in enumerate(zip(mod_list, site_list)):
        if not mod_names:
            continue
        mod_masses[row,:] = get_modification_mass(nAA, mod_names, mod_sites)
    aa_masses += mod_masses

    # Row-wise cumulative masses: the last column is the full peptide 
    # (without H2O), all columns before it are the b fragments.
    prefix_masses = np.cumsum(aa_masses, axis=1)
    b_masses = prefix_masses[:,:-1]
    pepmass = prefix_masses[:,-1:] + MASS_H2O
    y_masses = pepmass - b_masses
    return b_masses, y_masses, pepmass.reshape(-1)

In [None]:
# Example peptide: Carbamidomethyl on C (4th residue) and Oxidation on M (8th residue).
seq, mods, mod_sites = 'AGHCEWQMK', ['Carbamidomethyl@C', 'Oxidation@M'], [4, 8]

In [None]:
# Single-peptide version: returns (b masses, y masses, peptide mass).
get_by_and_peptide_mass(seq, mods, mod_sites)

(array([  71.0371103,  128.0585714,  265.1174773,  425.1481187,
         554.1907064,  740.2700129,  868.3285843, 1015.3639771]),
 array([1090.4323858, 1033.4109247,  896.3520188,  736.3213774,
         607.2787897,  421.1994832,  293.1409118,  146.105519 ]),
 1161.4694961)

In [None]:
# Batch version on the same sequence twice: first modified, second unmodified (empty lists).
get_by_and_peptide_mass_for_same_len_seqs([seq]*2, [mods,[]], [mod_sites,[]])

(array([[  71.0371103,  128.0585714,  265.1174773,  425.1481187,
          554.1907064,  740.2700129,  868.3285843, 1015.3639771],
        [  71.0371103,  128.0585714,  265.1174773,  368.1266576,
          497.1692453,  683.2485518,  811.3071232,  942.3476019]]),
 array([[1090.4323858, 1033.4109247,  896.3520188,  736.3213774,
          607.2787897,  421.1994832,  293.1409118,  146.105519 ],
        [1017.4160106,  960.3945495,  823.3356436,  720.3264633,
          591.2838756,  405.2045691,  277.1459977,  146.105519 ]]),
 array([1161.4694961, 1088.4531209]))

In [None]:
# Modifications are passed as ';'-joined strings here; "" means unmodified.
get_peptide_mass_for_same_len_seqs([seq]*2, [';'.join(mods),""])

array([1161.4694961, 1088.4531209])

In [None]:
#export
def init_zero_fragment_dataframe(
    peplen_array:np.array,
    charged_frag_types:List[str]
)->Tuple[pd.DataFrame, np.array, np.array]: 
    '''
    Create an all-zero fragment dataframe with `peplen-1` rows per peptide, 
    together with the row ranges that belong to each peptide.
    Args:
        peplen_array (np.array): peptide lengths for the fragment dataframe
        charged_frag_types (List[str]): 
            `['b_1','b_2','y_1','y_2','b_modloss_1','y_H2O_1'...]`
    Returns:
        pd.DataFrame: `fragment_df` with zero values, one column per charged fragment type
        np.array (int64): the start indices point to the `fragment_df` for each peptide
        np.array (int64): the end indices (exclusive) point to the `fragment_df` for each peptide
    '''
    # Leading 0 followed by the number of fragment rows (peplen-1) of each 
    # peptide; the running sum then gives the row boundaries between peptides.
    n_frags_per_pep = np.zeros(len(peplen_array)+1, dtype=np.int64)
    n_frags_per_pep[1:] = peplen_array-1
    frag_bounds = n_frags_per_pep.cumsum()

    zero_values = np.zeros((frag_bounds[-1], len(charged_frag_types)))
    return (
        pd.DataFrame(zero_values, columns=charged_frag_types),
        frag_bounds[:-1],
        frag_bounds[1:],
    )

def init_fragment_dataframe_from_other(
    reference_fragment_df: pd.DataFrame
):
    '''
    Init zero fragment dataframe from the `reference_fragment_df`.
    The result has the same shape, dtype and columns as the reference; 
    the reference index is not copied (a default index is used) and 
    the reference itself is not modified.
    '''
    zero_values = np.zeros_like(reference_fragment_df.values)
    return pd.DataFrame(zero_values, columns=reference_fragment_df.columns)

def init_fragment_by_precursor_dataframe(
    precursor_df,
    charged_frag_types: List[str],
    reference_fragment_df: pd.DataFrame = None
)->pd.DataFrame:
    '''
    Init zero fragment dataframe for the `precursor_df`. If 
    the `reference_fragment_df` is provided, it will generate 
    the dataframe based on the reference. Otherwise it 
    generates the dataframe from scratch.
    Args:
        precursor_df (pd.DataFrame): precursors to generate fragment masses. 
            If it has no 'frag_start_idx' column, the columns 'frag_start_idx' 
            and 'frag_end_idx' are added to `precursor_df` in place (computed 
            from its 'nAA' column). If `precursor_df` already contains the 
            'frag_start_idx' column, it is better to provide 
            `reference_fragment_df` as `precursor_df.frag_start_idx` and 
            `precursor.frag_end_idx` point to the indices in 
            `reference_fragment_df`
        charged_frag_types (List): 
            `['b_1+','b_2+','y_1+','y_2+','b_modloss_1+','y_H2O_1+'...]`
        reference_fragment_df (pd.DataFrame): generate fragment_mass_df based
            on this reference (default: None)
    Returns:
        pd.DataFrame: zero `fragment_df` with given `charged_frag_types`
    Warns:
        RuntimeWarning: when `precursor_df` contains 'frag_start_idx' but 
        `reference_fragment_df` is None. The result then has 
        `precursor_df.frag_end_idx.max()` rows.
    '''
    if 'frag_start_idx' not in precursor_df.columns:
        # No fragment indices yet: lay the peptides out consecutively and 
        # record the resulting row ranges on the precursor dataframe.
        fragment_df, start_indices, end_indices = init_zero_fragment_dataframe(
            precursor_df.nAA.values,
            charged_frag_types
        )
        precursor_df['frag_start_idx'] = start_indices
        precursor_df['frag_end_idx'] = end_indices
        return fragment_df

    if reference_fragment_df is not None:
        return init_fragment_dataframe_from_other(
            reference_fragment_df[charged_frag_types]
        )

    warnings.warn(
        "`precursor_df` contains the 'frag_start_idx' column, "\
        "it is better to provide `reference_fragment_df`", RuntimeWarning
    )
    # `max()` of an empty column is NaN, which cannot be used as a shape,
    # so an empty `precursor_df` yields an empty fragment dataframe.
    if len(precursor_df) > 0:
        n_rows = int(precursor_df.frag_end_idx.max())
    else:
        n_rows = 0
    return pd.DataFrame(
        np.zeros((n_rows, len(charged_frag_types))),
        columns = charged_frag_types
    )

In [None]:
#export
def set_sliced_fragment_dataframe(
    fragment_df: pd.DataFrame,
    values: np.array,
    frag_start_end_list: List[Tuple[int,int]],
    charged_frag_types: List[str],
)->pd.DataFrame:
    '''
    Set the values of the slices `frag_start_end_list=[(start,end),(start,end),...]` of fragment_df.
    The `(start,end)` ranges exclude `end`. Rows are addressed with `.loc`, 
    so `fragment_df` is expected to have the default 0..N-1 index.
    Args:
        fragment_df (pd.DataFrame): fragment dataframe to be set (modified in place)
        values (np.array): values to assign, one row per selected fragment row 
            (in the order of the slices) and one column per entry of `charged_frag_types`
        frag_start_end_list (List[Tuple[int,int]]): e.g. `[(start,end),(start,end),...]`
        charged_frag_types (List[str]): e.g. `['b_1','b_2','y_1','y_2']`
    Returns:
        pd.DataFrame: fragment_df after the values are set
    '''
    # Nothing selected: nothing to assign.
    if len(frag_start_end_list) == 0:
        return fragment_df
    # Expand every (start, end) pair into its row numbers and join them.
    frag_rows = np.r_[
        tuple(slice(start,end) for start,end in frag_start_end_list)
    ]
    fragment_df.loc[frag_rows, charged_frag_types] = values
    return fragment_df

def get_sliced_fragment_dataframe(
    fragment_df: pd.DataFrame,
    frag_start_end_list:Union[List,np.array],
    charged_frag_types:List = None,
)->pd.DataFrame:
    '''
    Get the sliced fragment_df from `frag_start_end_list=[(start,end),(start,end),...]`.
    Args:
        fragment_df (pd.DataFrame): fragment dataframe to be set
        frag_start_end_list (List[Tuple[int,int]]): e.g. `[(start,end),(start,end),...]`
        charged_frag_types (List[str]): e.g. `['b_1','b_2','y_1','y_2']` (default: None)
    Returns:
        pd.DataFrame: sliced fragment_df. If `charged_frag_types` is None, 
        return fragment_df with all columns
    '''
    frag_slice_list = [slice(start,end) for start,end in frag_start_end_list]
    frag_slices = np.r_[tuple(frag_slice_list)]
    if not charged_frag_types:
        charged_frag_types = slice(None)
    return fragment_df.loc[frag_slices, charged_frag_types]

In [None]:
#export
def concat_precursor_fragment_dataframes(
    precursor_df_list: List[pd.DataFrame],
    fragment_df_list: List[pd.DataFrame],
    *other_fragment_df_lists
)->Tuple[pd.DataFrame,...]:
    '''
    Since fragment_df is indexed by precursor_df, when we concatenate multiple 
    fragment_df, the indexed positions will change in the precursor_dfs. 
    This function keeps the correct indexed positions of precursor_df when 
    concatenating multiple fragment_df dataframes: 'frag_start_idx' and 
    'frag_end_idx' of the k-th precursor_df are shifted by the total number 
    of rows of all fragment_df before it. The input dataframes are not modified.
    Args:
        precursor_df_list (List[pd.DataFrame]): precursor dataframe list to concatenate
        fragment_df_list (List[pd.DataFrame]): fragment dataframe list to concatenate
        *other_fragment_df_lists: arbitrary other fragment dataframe lists to concatenate, 
            e.g. fragment_mass_df, fragment_inten_df, ...
    Returns:
        Tuple[pd.DataFrame,...]: concatenated precursor_df, fragment_df, *other_fragment_df ..., 
        each with a fresh 0..N-1 index
    '''
    idx_columns = ['frag_start_idx','frag_end_idx']
    # Cumulative fragment row counts: entry i is the offset of the (i+1)-th dataframe.
    frag_offsets = np.cumsum(
        [len(fragment_df) for fragment_df in fragment_df_list]
    )
    # The first precursor_df keeps its indices; later ones are shifted on a copy
    # so that the caller's dataframes stay untouched and repeated calls are safe.
    shifted_precursor_dfs = list(precursor_df_list[:1])
    for i, precursor_df in enumerate(precursor_df_list[1:]):
        shifted_df = precursor_df.copy()
        shifted_df[idx_columns] = shifted_df[idx_columns] + frag_offsets[i]
        shifted_precursor_dfs.append(shifted_df)
    return (
        pd.concat(shifted_precursor_dfs).reset_index(drop=True),
        pd.concat(fragment_df_list).reset_index(drop=True),
        *[
            pd.concat(other_list).reset_index(drop=True)
            for other_list in other_fragment_df_lists
        ],
    )

In [None]:
#export
def get_fragment_mass_dataframe(
    precursor_df: pd.DataFrame,
    charged_frag_types:List,
    reference_fragment_df: pd.DataFrame = None,
)->Tuple[pd.DataFrame, pd.DataFrame]:
    '''
    Generate fragment mass dataframe for the precursor_df. If 
    the `reference_fragment_df` is provided, it will generate 
    the mass dataframe based on the reference. Otherwise it 
    generates the mass dataframe from scratch. The stored values 
    are `neutral_mass/charge + MASS_PROTON` for every charged fragment 
    type; modloss fragments without a modloss are set to 0.
    Args:
        precursor_df (pd.DataFrame): precursors to generate fragment masses, 
            with the columns 'sequence', 'mods', 'mod_sites' and 'nAA' 
            ('mods' and 'mod_sites' are ';'-joined strings). 
            If `precursor_df` contains the 'frag_start_idx' column, 
            `reference_fragment_df` must be provided
        charged_frag_types (List): 
            `['b_1','b_2','y_1','y_2','b_modloss_1','y_H2O_1'...]`
        reference_fragment_df (pd.DataFrame): generate fragment_mass_df based
            on this reference, as `precursor_df.frag_start_idx` and 
            `precursor.frag_end_idx` point to the indices in 
            `reference_fragment_df`
    Returns:
        pd.DataFrame: `precursor_df`, regrouped by 'nAA'. If `precursor_df` 
        contains the 'charge' column but no 'precursor_mz' column, 
        this function will automatically assign the 'precursor_mz'. Without 
        `reference_fragment_df`, the index is reset and the columns 
        'frag_start_idx' and 'frag_end_idx' are added.
        pd.DataFrame: `fragment_mass_df` with given `charged_frag_types`
    Raises:
        ValueError: when 1. `precursor_df` contains 'frag_start_idx' but 
        `reference_fragment_df` is None; or 2. `reference_fragment_df` 
        is not None but `precursor_df` does not contain 'frag_start_idx'
        NotImplementedError: when a fragment type in `charged_frag_types` 
        is not supported
    '''
    has_frag_idx = 'frag_start_idx' in precursor_df.columns
    if reference_fragment_df is None:
        if has_frag_idx:
            raise ValueError(
                "`precursor_df` contains 'frag_start_idx' column, "\
                "please provide `reference_fragment_df` argument"
            )
        fragment_df_list = []
    else:
        if not has_frag_idx:
            raise ValueError(
                "No column 'frag_start_idx' in `precursor_df` "\
                "to slice the `reference_fragment_df`"
            )
        fragment_mass_df = init_fragment_dataframe_from_other(
            reference_fragment_df[charged_frag_types]
        )

    # Modloss masses are only computed when a modloss column is requested.
    need_b_modloss = any(
        _type.startswith('b_modloss') for _type in charged_frag_types
    )
    need_y_modloss = any(
        _type.startswith('y_modloss') for _type in charged_frag_types
    )

    precursor_df_list = []

    # Peptides are processed per length so that the batch mass functions apply.
    for nAA, df_group in precursor_df.groupby('nAA'):
        # Work on a copy: columns are added to the group below, which must not 
        # be done on a slice of the caller's dataframe.
        df_group = df_group.copy()

        mod_list = []
        site_list = []
        for mod_names, mod_sites in df_group[
            ['mods', 'mod_sites']
        ].values:
            if mod_names:
                mod_list.append(mod_names.split(';'))
                site_list.append(
                    [int(_site) for _site in mod_sites.split(';')]
                )
            else:
                mod_list.append([])
                site_list.append([])

        (
            b_mass, y_mass, pepmass
        ) = get_by_and_peptide_mass_for_same_len_seqs(
            df_group.sequence.values.astype('U'), mod_list, site_list
        )
        # Flatten to one value per fragment row, peptide after peptide.
        b_mass = b_mass.reshape(-1)
        y_mass = y_mass.reshape(-1)

        if (
            'charge' in df_group.columns and 
            'precursor_mz' not in df_group.columns
        ):
            df_group['precursor_mz'] = pepmass/df_group[
                'charge'
            ].values + MASS_PROTON

        if need_b_modloss:
            b_modloss = np.concatenate([
                get_modloss_mass(nAA, mods, sites, True)
                for mods, sites in zip(mod_list, site_list)
            ])
        if need_y_modloss:
            # The last argument selects the fragment direction. The original
            # code passed True for y ions as well, which made y_modloss 
            # identical to b_modloss.
            # NOTE(review): assumes the 4th argument of `get_modloss_mass` 
            # means "for N-term fragments" -- confirm against its signature.
            y_modloss = np.concatenate([
                get_modloss_mass(nAA, mods, sites, False)
                for mods, sites in zip(mod_list, site_list)
            ])

        set_values = []
        for charged_frag_type in charged_frag_types:
            frag_type, charge = parse_charged_frag_type(charged_frag_type)
            # `modloss` is only set for modloss fragment types; it marks 
            # the rows without any modloss, which are zeroed afterwards.
            modloss = None
            if frag_type == 'b':
                neutral_mass = b_mass
            elif frag_type == 'y':
                neutral_mass = y_mass
            elif frag_type == 'b_modloss':
                modloss = b_modloss
                neutral_mass = b_mass-b_modloss
            elif frag_type == 'y_modloss':
                modloss = y_modloss
                neutral_mass = y_mass-y_modloss
            elif frag_type == 'b_H2O':
                neutral_mass = b_mass-MASS_H2O
            elif frag_type == 'y_H2O':
                neutral_mass = y_mass-MASS_H2O
            elif frag_type == 'b_NH3':
                neutral_mass = b_mass-MASS_NH3
            elif frag_type == 'y_NH3':
                neutral_mass = y_mass-MASS_NH3
            elif frag_type == 'c':
                neutral_mass = b_mass+MASS_NH3
            elif frag_type == 'z':
                neutral_mass = y_mass-(MASS_NH3-CHEM_MONO_MASS['H'])
            else:
                raise NotImplementedError(
                    f'Fragment type "{frag_type}" is not in fragment_mass_df.'
                )
            frag_mz = neutral_mass/charge + MASS_PROTON
            if modloss is not None:
                frag_mz[modloss == 0] = 0
            set_values.append(frag_mz)

        if reference_fragment_df is not None:
            set_sliced_fragment_dataframe(
                fragment_mass_df, np.array(set_values).T, 
                df_group[['frag_start_idx','frag_end_idx']].values, 
                charged_frag_types,
            )
        else:
            # Adds 'frag_start_idx'/'frag_end_idx' (relative to this group) 
            # to `df_group`; they are shifted when the groups are concatenated.
            _fragment_mass_df = init_fragment_by_precursor_dataframe(
                df_group,
                charged_frag_types
            )
            _fragment_mass_df[:] = np.array(set_values).T
            fragment_df_list.append(_fragment_mass_df)
        precursor_df_list.append(df_group)

    if not precursor_df_list:
        # No precursors: `pd.concat` cannot handle an empty list.
        if reference_fragment_df is not None:
            return precursor_df, fragment_mass_df
        return precursor_df, pd.DataFrame(
            columns=charged_frag_types, dtype=np.float64
        )

    if reference_fragment_df is not None:
        return pd.concat(precursor_df_list), fragment_mass_df
    else:
        return concat_precursor_fragment_dataframes(
            precursor_df_list, fragment_df_list
        )


In [None]:
# Small example: 100 copies each of a 9-residue and a 14-residue peptide,
# all carrying the same three modifications.
repeat = 100
peptides = ['AGHCEWQMK']*repeat
mods = ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat
sites = ['0;4;8']*repeat
peptides += ['AGHCEWQMKAADER']*repeat
mods += ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat
sites += ['0;4;8']*repeat

# 'mods' and 'mod_sites' are stored as ';'-joined strings, one entry per precursor.
precursor_df = pd.DataFrame({
    'sequence': peptides,
    'mods': mods,
    'mod_sites': sites
})
# 'nAA' (peptide length) is the grouping key used by the functions below.
precursor_df['nAA'] = precursor_df['sequence'].str.len()
precursor_df['charge'] = 2
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge
0,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
1,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
2,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
3,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
4,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
...,...,...,...,...,...
195,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2
196,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2
197,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2
198,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2


In [None]:
#export
def set_precursor_mz(
    precursor_df: pd.DataFrame
)->pd.DataFrame:
    """
    Calculate precursor_mz for the precursor_df as 
    `peptide_mass/charge + MASS_PROTON`. The 'precursor_mz' column 
    is (re)assigned on `precursor_df` in place.
    Args:
        precursor_df (pd.DataFrame): 
          precursor_df with the 'sequence', 'mods' (';'-joined strings), 
          'nAA' and 'charge' columns

    Returns:
        pd.DataFrame: precursor_df with 'precursor_mz'
    """

    # Initialize as float so that the m/z values assigned below 
    # do not have to change the dtype of the column.
    precursor_df['precursor_mz'] = 0.0
    # Peptide masses are computed per peptide length.
    for nAA, df_group in precursor_df.groupby('nAA'):

        pepmass = get_peptide_mass_for_same_len_seqs(
            df_group.sequence.values.astype('U'),
            df_group.mods.values
        )
        
        # Plain arrays on the right-hand side: the values are already in 
        # the row order of `df_group`, no index alignment is needed.
        precursor_df.loc[
            df_group.index, 'precursor_mz'
        ] = pepmass/df_group.charge.values + MASS_PROTON
    return precursor_df

In [None]:
# Adds the 'precursor_mz' column (the same dataframe object is returned).
precursor_df = set_precursor_mz(precursor_df)
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz
0,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747306
1,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747306
2,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747306
3,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747306
4,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747306
...,...,...,...,...,...,...
195,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869730
196,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869730
197,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869730
198,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869730


In [None]:
# No reference dataframe: the fragment dataframe is built from scratch and the
# returned precursor_df gains the 'frag_start_idx'/'frag_end_idx' columns.
precursor_df, fragment_mass_df = get_fragment_mass_dataframe(
    precursor_df,
    get_charged_frag_types(['b','y','b_modloss','y_modloss'],2)
)
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz,frag_start_idx,frag_end_idx
0,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747306,0,8
1,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747306,8,16
2,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747306,16,24
3,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747306,24,32
4,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747306,32,40
...,...,...,...,...,...,...,...,...
195,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869730,2035,2048
196,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869730,2048,2061
197,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869730,2061,2074
198,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869730,2074,2087


In [None]:
# precursor_df already has 'frag_start_idx' and no reference is given, so this
# call takes the RuntimeWarning branch and sizes the result by frag_end_idx.max().
init_fragment_by_precursor_dataframe(
    precursor_df, 
    get_charged_frag_types(['b','y','b_modloss','y_modloss'],2),
)



Unnamed: 0,b_1,b_2,y_1,y_2,b_modloss_1,b_modloss_2,y_modloss_1,y_modloss_2
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
2095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Fragment m/z values; the modloss columns are 0 here, presumably because none
# of the example modifications has a modloss -- TODO confirm.
fragment_mass_df

Unnamed: 0,b_1,b_2,y_1,y_2,b_modloss_1,b_modloss_2,y_modloss_1,y_modloss_2
0,114.054950,57.531113,1091.439662,546.223469,0.0,0.0,0.0,0.0
1,171.076411,86.041843,1034.418201,517.712738,0.0,0.0,0.0,0.0
2,308.135317,154.571296,897.359295,449.183285,0.0,0.0,0.0,0.0
3,468.165958,234.586617,737.328653,369.167965,0.0,0.0,0.0,0.0
4,597.208546,299.107911,608.286066,304.646671,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
2095,1186.476772,593.742024,561.262688,281.134982,0.0,0.0,0.0,0.0
2096,1257.513882,629.260579,490.225578,245.616427,0.0,0.0,0.0,0.0
2097,1328.550993,664.779134,419.188468,210.097872,0.0,0.0,0.0,0.0
2098,1443.577931,722.292604,304.161529,152.584403,0.0,0.0,0.0,0.0


In [None]:
# `_reference_frag_df` is the same object as `fragment_mass_df` (no copy is made),
# so zeroing it also clears the values computed above.
_reference_frag_df = fragment_mass_df
_reference_frag_df[:] = 0
# With a reference, only the requested columns are generated, using the
# existing 'frag_start_idx'/'frag_end_idx' of precursor_df.
precursor_df, fragment_mass_df = get_fragment_mass_dataframe(
    precursor_df,
    ['b_1','y_1'],
    reference_fragment_df=_reference_frag_df
)
fragment_mass_df

Unnamed: 0,b_1,y_1
0,114.054950,1091.439662
1,171.076411,1034.418201
2,308.135317,897.359295
3,468.165958,737.328653
4,597.208546,608.286066
...,...,...
2095,1186.476772,561.262688
2096,1257.513882,490.225578
2097,1328.550993,419.188468
2098,1443.577931,304.161529


In [None]:
# The reference is still all zeros: the masses were written to a new dataframe.
_reference_frag_df

Unnamed: 0,b_1,b_2,y_1,y_2,b_modloss_1,b_modloss_2,y_modloss_1,y_modloss_2
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
2095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Note that label-based slicing with `df.loc[start:end]` also includes the `end` label, so `precursor_df.loc[:1]` below selects two precursors (rows 0 and 1).

In [None]:
# Fetch the 'y_1' fragments of the first two precursors via their (start, end) row ranges.
frag_types = ['y_1']
frags = get_sliced_fragment_dataframe(
    fragment_mass_df, 
    precursor_df.loc[:1, ['frag_start_idx','frag_end_idx']].values,
    frag_types
)
frags

Unnamed: 0,y_1
0,1091.439662
1,1034.418201
2,897.359295
3,737.328653
4,608.286066
5,422.206759
6,294.148188
7,147.112795
8,1091.439662
9,1034.418201


In [None]:
#hide
# 8 = nAA-1 fragment rows of the 9-residue peptide: (precursor, fragment position, fragment type).
frags.values.reshape((-1, 8, len(frag_types)))

array([[[1091.4396618],
        [1034.4182007],
        [ 897.3592948],
        [ 737.3286534],
        [ 608.2860657],
        [ 422.2067592],
        [ 294.1481878],
        [ 147.112795 ]],

       [[1091.4396618],
        [1034.4182007],
        [ 897.3592948],
        [ 737.3286534],
        [ 608.2860657],
        [ 422.2067592],
        [ 294.1481878],
        [ 147.112795 ]]])

# Speed Test

In [None]:
# Same construction as the small example above, scaled to 1M copies of each
# peptide (2M precursors) for timing.
repeat = 1000000
peptides = ['AGHCEWQMK']*repeat
mods = ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat
sites = ['0;4;8']*repeat
peptides += ['AGHCEWQMKAADER']*repeat
mods += ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat
sites += ['0;4;8']*repeat

# A fresh dataframe without 'frag_start_idx', so that
# get_fragment_mass_dataframe can be called without a reference.
precursor_df = pd.DataFrame({
    'sequence': peptides,
    'mods': mods,
    'mod_sites': sites
})
precursor_df['nAA'] = precursor_df['sequence'].str.len()
precursor_df['charge'] = 2
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge
0,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
1,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
2,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
3,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
4,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
...,...,...,...,...,...
1999995,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2
1999996,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2
1999997,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2
1999998,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2


### b/y ions without modloss

In [None]:
# Timed once only (-n 1 -r 1): a single pass over 2M precursors takes several seconds.
%timeit -n 1 -r 1 get_fragment_mass_dataframe(precursor_df,get_charged_frag_types(['b','y'], 2))

12.2 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### b/y ions with modloss
Generating fragments with `modloss` is much slower than generating them without. As `modloss` is only important for phosphopeptide identification, we can ignore `modloss` for non-phosphoproteomics.

But it is already quite fast to generate b/y/b_modloss/y_modloss ions for 2M peptides within ~70 seconds.

In [None]:
# Timed once only (-n 1 -r 1): same 2M precursors, now with the modloss columns as well.
%timeit -n 1 -r 1 get_fragment_mass_dataframe(precursor_df,get_charged_frag_types(['b','y','b_modloss','y_modloss'], 2))

1min 3s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
