In [None]:
#default_exp peptide.precursor

In [None]:
#export
import pandas as pd
import numpy as np

from alphabase.constants.element import (
    MASS_PROTON
)
from alphabase.peptide.mass_calc import (
    calc_peptide_masses_for_same_len_seqs
)

def refine_precursor_df(
    df:pd.DataFrame,
    drop_frag_idx=True,
)->pd.DataFrame:
    """
    Refine df inplace for faster precursor/fragment calculation.

    The rows are sorted by the 'nAA' column and the index is reset to
    0..len(df)-1. Both operations modify `df` in place.

    Args:
        df (pd.DataFrame): precursor dataframe with the 'nAA' column.
        drop_frag_idx (bool): if True, remove the 'frag_start_idx' and
          'frag_end_idx' columns (whichever of them are present).
          Defaults to True.

    Returns:
        pd.DataFrame: the same `df` object, refined in place.
    """
    # A stable sort keeps the relative order of rows with the same nAA,
    # so an already sorted dataframe is left exactly as it is.
    df.sort_values('nAA', kind='mergesort', inplace=True)
    df.reset_index(drop=True, inplace=True)

    if drop_frag_idx:
        # Only drop the columns that exist: a dataframe holding just one
        # of the two must not raise a KeyError.
        frag_idx_columns = [
            col for col in ('frag_start_idx', 'frag_end_idx')
            if col in df.columns
        ]
        if frag_idx_columns:
            df.drop(columns=frag_idx_columns, inplace=True)

    return df

reset_precursor_df = refine_precursor_df

def is_precursor_refined(precursor_df: pd.DataFrame):
    """
    Check whether `precursor_df` is already in the refined layout.

    A dataframe counts as refined when it is empty, or when its index
    is exactly 0, 1, 2, ... and its 'nAA' column is non-decreasing.

    Args:
        precursor_df (pd.DataFrame): dataframe with the 'nAA' column.

    Returns:
        bool: True if the dataframe is refined, otherwise False.
    """
    if len(precursor_df) == 0:
        return True

    index_values = precursor_df.index.values
    return bool(
        (index_values[0] == 0) and
        # `Series.is_monotonic` was removed in pandas 2.0;
        # `is_monotonic_increasing` is the equivalent, long-standing name.
        precursor_df.nAA.is_monotonic_increasing and
        np.all(np.diff(index_values) == 1)
    )

is_precursor_sorted = is_precursor_refined

def update_precursor_mz(
    precursor_df: pd.DataFrame,
    batch_size = 500000,
)->pd.DataFrame:
    """
    Calculate precursor_mz for the precursor_df.

    The peptide masses are calculated group-wise for peptides of the same
    length ('nAA') in batches of at most `batch_size` rows, and converted
    to m/z as `mass/charge + MASS_PROTON`.

    Args:
        precursor_df (pd.DataFrame):
          precursor_df with the 'sequence', 'mods' and 'charge' columns.
          If a 'mod_deltas' column exists, it is passed to the mass
          calculation as well. If the 'nAA' column is missing, it is
          created from the sequence lengths and the dataframe is sorted
          by 'nAA' with a reset index (in place).
        batch_size (int): maximal number of peptides per mass
          calculation call. Defaults to 500000.

    Returns:
        pd.DataFrame: the same precursor_df, with the 'precursor_mz'
        column added or overwritten.

    Raises:
        ValueError: if `batch_size` is not positive.
    """
    if batch_size <= 0:
        # A negative step would silently skip every batch and leave all
        # m/z values at zero.
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size}"
        )

    if 'nAA' not in precursor_df:
        precursor_df['nAA'] = precursor_df.sequence.str.len()
        reset_precursor_df(precursor_df)

    sequences = precursor_df['sequence'].values
    mods = precursor_df['mods'].values
    charges = precursor_df['charge'].values
    mod_deltas = (
        precursor_df['mod_deltas'].values
        if 'mod_deltas' in precursor_df.columns else None
    )

    # Results are collected by positional row number in a plain numpy
    # array and assigned to the dataframe once at the end. This neither
    # relies on `.loc[...].values` being a writable view of the dataframe
    # (not true with pandas Copy-on-Write) nor on unique index labels.
    precursor_mzs = np.zeros(len(precursor_df), dtype=np.float64)

    # `.indices` maps each nAA value to the positional row numbers of
    # the peptides with that length.
    for row_positions in precursor_df.groupby('nAA').indices.values():
        for i in range(0, len(row_positions), batch_size):
            batch_rows = row_positions[i:i+batch_size]

            pep_masses = calc_peptide_masses_for_same_len_seqs(
                # all sequences in the batch have the same length
                sequences[batch_rows].astype('U'),
                mods[batch_rows],
                mod_deltas[batch_rows] if mod_deltas is not None else None
            )
            precursor_mzs[batch_rows] = (
                pep_masses/charges[batch_rows] + MASS_PROTON
            )

    precursor_df['precursor_mz'] = precursor_mzs
    return precursor_df

calc_precursor_mz = update_precursor_mz

Test `update_precursor_mz()`

In [None]:
# Two modified 14-mers followed by two unmodified 9-mers, all with charge 2.
repeat = 2
peptides = ['AGHCEWQMKAADER']*repeat + ['AGHCEWQMK']*repeat
mods = (
    ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat
    + ['']*repeat
)
sites = ['0;4;8']*repeat + ['']*repeat

precursor_df = pd.DataFrame({
    'sequence': peptides,
    'mods': mods,
    'mod_sites': sites
}).assign(
    nAA=lambda frame: frame['sequence'].str.len(),
    charge=2,
)
precursor_df = update_precursor_mz(precursor_df)

# Expected m/z values follow the row order: 14-mers first, then 9-mers.
assert np.allclose(
    precursor_df['precursor_mz'].values,
    [873.86977, 873.86977, 545.233862, 545.233862],
)
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz
0,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869771
1,AGHCEWQMKAADER,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,14,2,873.869771
2,AGHCEWQMK,,,9,2,545.233862
3,AGHCEWQMK,,,9,2,545.233862
