In [None]:
#default_exp spec_lib.translate

In [None]:
#export
import pandas as pd
import numpy as np
import tqdm
import typing
import itertools

from alphabase.spectrum_library.library_base import SpecLibBase
from peptdeep.model.ccs import ccs_to_mobility_pred_df

In [None]:
#export
#@numba.njit #(cannot use numba for pd.Series)
def create_modified_sequence(
    df_items:typing.Tuple, # must be ('sequence','mods','mod_sites')
    translate_mod_dict:dict=None,
    mod_sep='[]'
):
    '''
    Translate `(sequence, mods, mod_sites)` into a modified sequence. Used by `df.apply()`.
    For example, `('ABCDEFG','Mod1@A;Mod2@E','1;5')`->`_A[Mod1@A]BCDE[Mod2@E]FG_`.

    Site `0` is written right after the leading `_`, site `-1` right after the
    trailing `_`. Any other site is an insertion position in the unmodified
    sequence (negative values count from the end), so the result does not
    depend on the order in which the sites are listed.
    Args:
        df_items (Tuple): must be `(sequence, mods, mod_sites)`; `mods` and
            `mod_sites` are ';'-separated strings. A `pd.Series` row also works.
        translate_mod_dict (dict): A dict to map alpha modification names to other software.
            Default: None (names are written unchanged)
        mod_sep (str): opening and closing characters put around each
            modification, '[]' or '()', default '[]'
    Returns:
        str: the modified sequence wrapped in `_..._`
    Raises:
        KeyError: if `translate_mod_dict` is given and a modification is missing from it
    '''
    # tuple() reads the three fields by position for tuples, lists and Series rows
    items = tuple(df_items)
    sequence = items[0]
    nterm = '_'
    cterm = '_'
    if not items[1]:
        return nterm + sequence + cterm

    mods = items[1].split(';')[::-1]
    mod_sites = items[2].split(';')[::-1]
    if translate_mod_dict is not None:
        mods = [translate_mod_dict[mod] for mod in mods]

    seq_len = len(sequence)
    insertions = [] # (position in the unmodified sequence, tag)
    for site, mod in zip(mod_sites, mods):
        _site = int(site)
        tag = mod_sep[0]+mod+mod_sep[1]
        if _site == 0:
            nterm += tag
        elif _site == -1:
            cterm += tag
        elif _site > 0:
            insertions.append((_site, tag))
        else:
            # negative site: resolve against the unmodified sequence
            insertions.append((max(seq_len+_site, 0), tag))

    # Insert right-to-left so earlier insertions never shift later positions.
    # sorted() is stable, so tags sharing a site keep their listed order.
    mod_seq = sequence
    for pos, tag in sorted(insertions, key=lambda item: item[0], reverse=True):
        mod_seq = mod_seq[:pos] + tag + mod_seq[pos:]
    return nterm + mod_seq + cterm

In [None]:
# Ten identical rows, each carrying an N-term, a C-term and two residue modifications
df = pd.DataFrame({
    'sequence': ['ACDEFGHIK']*10,
    'mods': ['ModNterm;ModCterm;ModA@C;ModB@G']*10,
    'mod_sites': ['0;-1;2;6']*10,
})
df[['sequence','mods','mod_sites']].apply(create_modified_sequence, axis=1)

0    _[ModNterm]AC[ModA@C]DEFG[ModB@G]HIK_[ModCterm]
1    _[ModNterm]AC[ModA@C]DEFG[ModB@G]HIK_[ModCterm]
2    _[ModNterm]AC[ModA@C]DEFG[ModB@G]HIK_[ModCterm]
3    _[ModNterm]AC[ModA@C]DEFG[ModB@G]HIK_[ModCterm]
4    _[ModNterm]AC[ModA@C]DEFG[ModB@G]HIK_[ModCterm]
5    _[ModNterm]AC[ModA@C]DEFG[ModB@G]HIK_[ModCterm]
6    _[ModNterm]AC[ModA@C]DEFG[ModB@G]HIK_[ModCterm]
7    _[ModNterm]AC[ModA@C]DEFG[ModB@G]HIK_[ModCterm]
8    _[ModNterm]AC[ModA@C]DEFG[ModB@G]HIK_[ModCterm]
9    _[ModNterm]AC[ModA@C]DEFG[ModB@G]HIK_[ModCterm]
dtype: object

In [None]:
# Same peptide as above, passed as a plain tuple with the default '[]' separators
create_modified_sequence(
    df_items=('ACDEFGHIK','ModNterm;ModCterm;ModA@C;ModB@G','0;-1;2;6')
)

'_[ModNterm]AC[ModA@C]DEFG[ModB@G]HIK_[ModCterm]'

In [None]:
# Translate every modification name and wrap it in '()' instead of '[]'
create_modified_sequence(
    df_items=('ACDEFGHIK','ModNterm;ModCterm;ModA@C;ModB@G','0;-1;2;6'),
    translate_mod_dict={
        'ModNterm':'Mod(Nterm)',
        'ModCterm':'Mod(Cterm)',
        'ModA@C':'ModA(C)',
        'ModB@G':'ModB(G)',
    },
    mod_sep='()',
)

'_(Mod(Nterm))AC(ModA(C))DEFG(ModB(G))HIK_(Mod(Cterm))'

In [None]:
#export

def _get_frag_info_from_column_name(column:str):
    '''
    Only used when convert alphabase libraries into other libraries
    '''
    idx = column.rfind('_')
    frag_type = column[:idx]
    ch_str = column[idx+2:]
    charge = int(ch_str)
    if len(frag_type)==1:
        loss_type = 'noloss'
    else:
        idx = frag_type.find('_')
        loss_type = frag_type[idx+1:]
        frag_type = frag_type[0]
    return frag_type, loss_type, charge

def _get_frag_num(columns, rows, frag_len):
    frag_nums = []
    for r,c in zip(rows, columns):
        if c[0] in 'xyz':
            frag_nums.append(frag_len-r)
        else:
            frag_nums.append(r+1)
    return frag_nums

def _flatten(list_of_lists):
    '''
    Flatten a list of lists
    '''
    return list(
        itertools.chain.from_iterable(list_of_lists)
    )

def merge_precursor_fragment_df(
    precursor_df:pd.DataFrame, 
    fragment_mz_df:pd.DataFrame, 
    fragment_inten_df:pd.DataFrame, 
    top_n_inten:int,
    frag_type_head:str='FragmentType',
    frag_mass_head:str='FragmentMz',
    frag_inten_head:str='RelativeIntensity',
    frag_charge_head:str='FragmentCharge',
    frag_loss_head:str='FragmentLossType',
    frag_num_head:str='FragmentNumber'
):
    '''
    Convert alphabase library into a single dataframe. 
    This method is not important, as it will be only 
    used by DiaNN, or spectronaut, or others

    For every precursor the `top_n_inten` highest values of its
    `frag_start_idx:frag_end_idx` slice of `fragment_inten_df` are selected,
    and the precursor row is repeated once per selected fragment.
    Args:
        precursor_df (pd.DataFrame): must contain `frag_start_idx` and
            `frag_end_idx`; it is copied, not modified
        fragment_mz_df (pd.DataFrame): fragment m/z values; its column names
            (e.g. `b_z1`, `y_modloss_z1`) give type, loss and charge
        fragment_inten_df (pd.DataFrame): fragment intensities, laid out
            like `fragment_mz_df`
        top_n_inten (int): number of highest-intensity fragments kept per precursor
        frag_type_head (str): output column for the fragment type
        frag_mass_head (str): output column for the fragment m/z
        frag_inten_head (str): output column for the fragment intensity
        frag_charge_head (str): output column for the fragment charge
        frag_loss_head (str): output column for the fragment loss type
        frag_num_head (str): output column for the fragment number
    Returns:
        pd.DataFrame: one row per (precursor, selected fragment)
    '''
    df = precursor_df.copy()
    frag_columns = fragment_mz_df.columns.values.astype('U')
    # Extract the numpy arrays once instead of slicing the DataFrames per precursor
    all_intens = fragment_inten_df.values
    all_masses = fragment_mz_df.values
    frag_type_list = []
    frag_loss_list = []
    frag_charge_list = []
    frag_mass_list = []
    frag_inten_list = []
    frag_num_list = []
    for start, end in tqdm.tqdm(df[['frag_start_idx','frag_end_idx']].values):
        intens = all_intens[start:end,:]
        masses = all_masses[start:end,:]
        # flat indices of the highest intensities, strongest first
        sorted_idx = np.argsort(intens.reshape(-1))[-top_n_inten:][::-1]
        # back to (row, column) positions within this precursor's slice
        idx_in_df = np.unravel_index(sorted_idx, masses.shape)

        frag_len = end-start
        rows = np.arange(frag_len, dtype=np.int32)[idx_in_df[0]]
        columns = frag_columns[idx_in_df[1]]

        frag_types, loss_types, charges = zip(
            *[_get_frag_info_from_column_name(_) for _ in columns]
        )

        frag_nums = _get_frag_num(columns, rows, frag_len)

        frag_type_list.append(frag_types)
        frag_loss_list.append(loss_types)
        frag_charge_list.append(charges)
        frag_mass_list.append(masses[idx_in_df])
        frag_inten_list.append(intens[idx_in_df])
        frag_num_list.append(frag_nums)
    
    try:
        df[frag_type_head] = frag_type_list
        df[frag_mass_head] = frag_mass_list
        df[frag_inten_head] = frag_inten_list
        df[frag_charge_head] = frag_charge_list
        df[frag_loss_head] = frag_loss_list
        df[frag_num_head] = frag_num_list
        return df.explode([
            frag_type_head,
            frag_mass_head,
            frag_inten_head,
            frag_charge_head,
            frag_loss_head,
            frag_num_head
        ])
    except ValueError:
        # df.explode does not allow multiple columns before pandas 1.3:
        # explode one column, then overwrite the others with the flattened values.
        df[frag_type_head] = frag_type_list
        df = df.explode(frag_type_head)

        df[frag_mass_head] = _flatten(frag_mass_list)
        df[frag_inten_head] = _flatten(frag_inten_list)
        df[frag_charge_head] = _flatten(frag_charge_list)
        df[frag_loss_head] = _flatten(frag_loss_list)
        df[frag_num_head] = _flatten(frag_num_list)
        return df

# alphabase modification name -> name written to the exported library
# (passed as `translate_mod_dict` by `speclib_to_swath_df`)
mod_to_other_mod_dict = {
    "Carbamidomethyl@C": "Carbamidomethyl (C)",
    "Oxidation@M": "Oxidation (M)",
    # phosphorylation on S, T and Y shares one target name
    **{f"Phospho@{aa}": "Phospho (STY)" for aa in "STY"},
    "GlyGly@K": "GlyGly (K)",
    "Acetyl@Protein N-term": "Acetyl (Protein N-term)",
}

from alphabase.constants.modification import MOD_DF
# modification name -> "UniMod:<id>", built from the `name` and `unimod_id` columns of MOD_DF
mod_to_unimod_dict = {
    mod_name: f"UniMod:{unimod_id}"
    for mod_name, unimod_id in MOD_DF[['name','unimod_id']].values
}

def mask_fragment_intensity_by_mz_(
    fragment_mz_df:pd.DataFrame, 
    fragment_intensity_df:pd.DataFrame,
    min_frag_mz, max_frag_mz
):
    '''
    Set, in place, the intensity of every fragment whose m/z lies outside
    `[min_frag_mz, max_frag_mz]` to 0.
    Args:
        fragment_mz_df (pd.DataFrame): fragment m/z values
        fragment_intensity_df (pd.DataFrame): fragment intensities laid out
            like `fragment_mz_df`; modified in place
        min_frag_mz: lowest m/z that keeps its intensity
        max_frag_mz: highest m/z that keeps its intensity
    Returns:
        None
    '''
    # "below min OR above max": with `&` the condition can never hold for min < max
    out_of_range = (fragment_mz_df<min_frag_mz)|(fragment_mz_df>max_frag_mz)
    fragment_intensity_df.mask(out_of_range, 0, inplace=True)

def speclib_to_single_df(
    speclib:SpecLibBase,
    *,
    translate_mod_dict:dict = None,
    keep_k_highest_intensity:int=12,
    min_frag_mz = 200,
    max_frag_mz = 2000,
    min_frag_intensity = 0.02,
    modloss='H3PO4',
    frag_type_head:str='FragmentType',
    frag_mass_head:str='FragmentMz',
    frag_inten_head:str='RelativeIntensity',
    frag_charge_head:str='FragmentCharge',
    frag_loss_head:str='FragmentLossType',
    frag_num_head:str='FragmentNumber',
):
    '''
    Convert alphabase library to diann (or Spectronaut) library dataframe
    This method is not important, as it will be only 
    used by DiaNN, or spectronaut, or others

    Side effects: `speclib.calc_precursor_mz()` is called when the precursor
    table has no `precursor_mz` column, and `speclib._fragment_intensity_df`
    is masked in place by `mask_fragment_intensity_by_mz_`.
    Args:
        speclib (SpecLibBase): library providing `_precursor_df`,
            `_fragment_mz_df` and `_fragment_intensity_df`
        translate_mod_dict (dict): a dict map modifications from alphabase to other software.
            Default: None (modification names are written unchanged)
        keep_k_highest_intensity (int): only keep highest fragment intensities for each precursor. Default: 12
        min_frag_mz: lower m/z bound passed to `mask_fragment_intensity_by_mz_`. Default: 200
        max_frag_mz: upper m/z bound passed to `mask_fragment_intensity_by_mz_`. Default: 2000
        min_frag_intensity: only fragments with an intensity strictly above
            this value are kept. Default: 0.02
        modloss (str): value written in place of 'modloss' in the loss-type column. Default: 'H3PO4'
        frag_type_head (str): output column for the fragment type
        frag_mass_head (str): output column for the fragment m/z
        frag_inten_head (str): output column for the fragment intensity
        frag_charge_head (str): output column for the fragment charge
        frag_loss_head (str): output column for the fragment loss type
        frag_num_head (str): output column for the fragment number
    Returns:
        pd.DataFrame: a single-file dataframe which contains precursors and fragments
    Raises:
        ValueError: if the precursor table has none of the
            `irt_pred`, `rt_pred`, `rt_norm` columns
    '''
    df = pd.DataFrame()
    df['ModifiedPeptide'] = speclib._precursor_df[
        ['sequence','mods','mod_sites']
    ].apply(
        create_modified_sequence, 
        axis=1,
        translate_mod_dict=translate_mod_dict,
        mod_sep='[]'
    )

    # kept only until the fragments are merged in; dropped before returning
    df['frag_start_idx'] = speclib._precursor_df['frag_start_idx']
    df['frag_end_idx'] = speclib._precursor_df['frag_end_idx']
    
    df['PrecursorCharge'] = speclib._precursor_df['charge']
    # retention time: first available of irt_pred, rt_pred, rt_norm
    if 'irt_pred' in speclib._precursor_df.columns:
        df['iRT'] = speclib._precursor_df['irt_pred']
    elif 'rt_pred' in speclib._precursor_df.columns:
        df['iRT'] = speclib._precursor_df['rt_pred']
    elif 'rt_norm' in speclib._precursor_df.columns:
        df['iRT'] = speclib._precursor_df['rt_norm']
    else:
        raise ValueError('precursor_df must contain the "irt_pred", "rt_pred" or "rt_norm" column')

    # ion mobility is optional
    if 'mobility_pred' in speclib._precursor_df.columns:
        df['IonMobility'] = speclib._precursor_df.mobility_pred
    elif 'mobility' in speclib._precursor_df.columns:
        df['IonMobility'] = speclib._precursor_df.mobility
    
    df['LabelModifiedSequence'] = df['ModifiedPeptide']
    df['StrippedPeptide'] = speclib._precursor_df['sequence']

    if 'precursor_mz' not in speclib._precursor_df.columns:
        speclib.calc_precursor_mz()
    df['PrecursorMz'] = speclib._precursor_df['precursor_mz']

    # protein annotation: `proteins` fills all three columns first,
    # more specific columns below override them when present
    if 'proteins' in speclib._precursor_df.columns:
        df['ProteinName'] = speclib._precursor_df['proteins']
        df['UniprotID'] = df['ProteinName']
        df['ProteinGroups'] = df['ProteinName']

    if 'uniprot_ids' in speclib._precursor_df.columns:
        df['UniprotID'] = speclib._precursor_df['uniprot_ids']
        if 'ProteinName' not in df.columns:
            df['ProteinName'] = df['UniprotID']
            df['ProteinGroups'] = df['UniprotID']

    if 'genes' in speclib._precursor_df.columns:
        df['Genes'] = speclib._precursor_df['genes']

    if 'protein_group' in speclib._precursor_df.columns:
        df['ProteinGroups'] = speclib._precursor_df['protein_group']

    mask_fragment_intensity_by_mz_(
        speclib._fragment_mz_df,
        speclib._fragment_intensity_df,
        min_frag_mz, max_frag_mz
    )

    df = merge_precursor_fragment_df(
        df,
        speclib._fragment_mz_df,
        speclib._fragment_intensity_df,
        top_n_inten=keep_k_highest_intensity,
        frag_type_head=frag_type_head,
        frag_mass_head=frag_mass_head,
        frag_inten_head=frag_inten_head,
        frag_charge_head=frag_charge_head,
        frag_loss_head=frag_loss_head,
        frag_num_head=frag_num_head,
    )
    # Rename the loss before filtering so the assignment targets the merged
    # frame itself and not a filtered slice of it.
    df.loc[df[frag_loss_head]=='modloss',frag_loss_head] = modloss
    # Filter on the caller-chosen intensity column, not a hard-coded name
    df = df[df[frag_inten_head]>min_frag_intensity]

    return df.drop(['frag_start_idx','frag_end_idx'], axis=1)

def speclib_to_swath_df(
    speclib:SpecLibBase,
    *,
    keep_k_highest_intensity:int=12,
    min_frag_mz = 200,
    max_frag_mz = 2000,
    min_frag_intensity = 0.02,
):
    '''
    Call `speclib_to_single_df` with `mod_to_other_mod_dict` as the
    modification translation table.
    Args:
        speclib (SpecLibBase): library to convert
        keep_k_highest_intensity (int): forwarded to `speclib_to_single_df`. Default: 12
        min_frag_mz: forwarded to `speclib_to_single_df`. Default: 200
        max_frag_mz: forwarded to `speclib_to_single_df`. Default: 2000
        min_frag_intensity: forwarded to `speclib_to_single_df`. Default: 0.02
    Returns:
        pd.DataFrame: the dataframe built by `speclib_to_single_df`
    Raises:
        KeyError: if a modification in the library is missing from `mod_to_other_mod_dict`
    '''
    # the result must be returned; it was previously computed and discarded
    return speclib_to_single_df(
        speclib, 
        translate_mod_dict=mod_to_other_mod_dict,
        keep_k_highest_intensity=keep_k_highest_intensity,
        min_frag_mz = min_frag_mz,
        max_frag_mz = max_frag_mz,
        min_frag_intensity = min_frag_intensity,
    )

In [None]:
from alphabase.peptide.fragment import create_fragment_mz_dataframe
repeat = 10
charged_frag_types = ['b_z1','y_z1','y_modloss_z1']
# Row 0 is a phospho-peptide; the remaining rows share one acetyl/carbamidomethyl/oxidation pattern
precursor_df = pd.DataFrame({
    'sequence': ['ASGHCEWMKYR']*repeat,
    'mods': ['Phospho@S'] + ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*(repeat-1),
    'mod_sites': ['2'] + ['0;4;8']*(repeat-1),
    'nAA': 11,
    'NCE': 20,
    'instrument': 'QE',
    'rt_pred': 10,
    'charge': 2,
    'protein_name': 'unknown',
    'mobility_pred': 1,
})
frag_mass_df = create_fragment_mz_dataframe(precursor_df, charged_frag_types)
frag_mass_df

Unnamed: 0,b_z1,y_z1,y_modloss_z1
0,72.044390,1376.527555,1278.550659
1,239.042750,1209.529195,0.000000
2,296.064213,1152.507732,0.000000
3,433.123125,1015.448820,0.000000
4,536.132310,912.439635,0.000000
...,...,...,...
95,684.240601,799.391956,735.393670
96,870.319914,613.312643,549.314358
97,1017.355313,466.277244,0.000000
98,1145.450276,338.182281,0.000000


In [None]:
# Display the precursor table. It now shows frag_start_idx / frag_end_idx, which the
# cell building precursor_df did not define — presumably added by create_fragment_mz_dataframe.
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,NCE,instrument,rt_pred,charge,protein_name,mobility_pred,frag_start_idx,frag_end_idx
0,ASGHCEWMKYR,Phospho@S,2,11,20,QE,10,2,unknown,1,0,10
1,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,10,20
2,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,20,30
3,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,30,40
4,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,40,50
5,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,50,60
6,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,60,70
7,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,70,80
8,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,80,90
9,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,90,100


In [None]:
spec_lib = SpecLibBase(charged_frag_types)
spec_lib._precursor_df = precursor_df
# The fragment m/z values double as dummy intensities. Use a copy so that the
# in-place intensity masking done by speclib_to_single_df cannot touch the m/z table.
spec_lib._fragment_intensity_df = frag_mass_df.copy()
spec_lib._fragment_mz_df = frag_mass_df
speclib_to_single_df(spec_lib)

100%|██████████| 10/10 [00:00<00:00, 1605.60it/s]


Unnamed: 0,ModifiedPeptide,PrecursorCharge,iRT,IonMobility,LabelModifiedSequence,StrippedPeptide,PrecursorMz,FragmentType,FragmentMz,RelativeIntensity,FragmentCharge,FragmentLossType,FragmentNumber
0,_AS[Phospho@S]GHCEWMKYR_,2,10,1,_AS[Phospho@S]GHCEWMKYR_,ASGHCEWMKYR,724.285972,y,1376.527555,1376.527555,1,noloss,10
0,_AS[Phospho@S]GHCEWMKYR_,2,10,1,_AS[Phospho@S]GHCEWMKYR_,ASGHCEWMKYR,724.285972,y,1278.550659,1278.550659,1,H3PO4,10
0,_AS[Phospho@S]GHCEWMKYR_,2,10,1,_AS[Phospho@S]GHCEWMKYR_,ASGHCEWMKYR,724.285972,b,1273.452993,1273.452993,1,noloss,10
0,_AS[Phospho@S]GHCEWMKYR_,2,10,1,_AS[Phospho@S]GHCEWMKYR_,ASGHCEWMKYR,724.285972,y,1209.529195,1209.529195,1,noloss,9
0,_AS[Phospho@S]GHCEWMKYR_,2,10,1,_AS[Phospho@S]GHCEWMKYR_,ASGHCEWMKYR,724.285972,y,1152.507732,1152.507732,1,noloss,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9,_[Acetyl@Protein N-term]ASGH[Carbamidomethyl@C...,2,10,1,_[Acetyl@Protein N-term]ASGH[Carbamidomethyl@C...,ASGHCEWMKYR,741.816279,b,1145.450276,1145.450276,1,noloss,9
9,_[Acetyl@Protein N-term]ASGH[Carbamidomethyl@C...,2,10,1,_[Acetyl@Protein N-term]ASGH[Carbamidomethyl@C...,ASGHCEWMKYR,741.816279,y,1031.443734,1031.443734,1,noloss,7
9,_[Acetyl@Protein N-term]ASGH[Carbamidomethyl@C...,2,10,1,_[Acetyl@Protein N-term]ASGH[Carbamidomethyl@C...,ASGHCEWMKYR,741.816279,b,1017.355313,1017.355313,1,noloss,8
9,_[Acetyl@Protein N-term]ASGH[Carbamidomethyl@C...,2,10,1,_[Acetyl@Protein N-term]ASGH[Carbamidomethyl@C...,ASGHCEWMKYR,741.816279,y,967.445449,967.445449,1,H3PO4,7
