In [None]:
#default_exp library.library_base

In [None]:
#export
import pandas as pd
import numpy as np
import numba
import typing
import tqdm
import itertools

import alphabase.peptide.fragment as fragment

@numba.njit
def str_to_int(s):
    '''
    Parse a base-10 integer from the string `s` inside numba-compiled code.

    A single leading '-' marks a negative number. Every remaining character
    is converted with `ord(ch) - 48`; characters are not validated.
    '''
    negative = s.startswith('-')
    if negative:
        s = s[1:]
    # Horner accumulation: equals the sum of digit * 10**position
    value = 0
    for ch in s:
        value = value * 10 + (ord(ch) - 48)
    if negative:
        return -value
    return value

#@numba.njit #(cannot use numba for pd.Series)
def generate_modified_sequence(
    df_items:typing.Tuple, # must be ('sequence','mods','mod_sites')
    translate_mod_dict:dict=None,
    mod_sep='()'
):
    '''
    Translate `(sequence, mods, mod_sites)` into a modified sequence. Used by `df.apply()`.
    For example, `('ABCDEFG','Mod1@A;Mod2@E','1;5')`->`_A(Mod1@A)BCDE(Mod2@E)FG_`.

    Site `0` is written after the leading `_` (N-term), site `-1` after the
    trailing `_` (C-term), a positive site `n` right after the n-th residue,
    and a site below `-1` is counted from the end of the sequence like a
    Python slice index. The sites may be given in any order.

    Args:
        df_items (Tuple): must be `(sequence, mods, mod_sites)`; a tuple, a list
            or a `pd.Series` row. `mods` and `mod_sites` are ';'-separated strings.
            An empty or non-string `mods` (e.g. None/NaN) means "no modification".
        translate_mod_dict (dict): A dict to map alpha modification names to other software.
            Every modification must be a key of this dict, otherwise `KeyError` is raised.
        mod_sep (str): two characters enclosing each modification, '[]' or '()', default '()'
    Returns:
        str: the modified sequence, wrapped in `_` at both ends.
    '''
    # Iterate instead of using `df_items[i]`: integer `[]` access on a
    # string-labelled Series is deprecated positional indexing in pandas.
    items = tuple(df_items)
    sequence, mods_str = items[0], items[1]
    nterm = '_'
    cterm = '_'
    if not mods_str or not isinstance(mods_str, str):
        return nterm + sequence + cterm

    mods = mods_str.split(';')[::-1]
    sites = [int(site) for site in items[2].split(';')][::-1]
    if translate_mod_dict:
        mods = [translate_mod_dict[mod] for mod in mods]

    n_res = len(sequence)
    insertions = []
    for site, mod in zip(sites, mods):
        tag = mod_sep[0]+mod+mod_sep[1]
        if site == 0:
            nterm += tag
        elif site == -1:
            cterm += tag
        elif site > 0:
            insertions.append((site, tag))
        else:
            # negative site: same position a slice `sequence[:site]` would cut at
            insertions.append((max(n_res + site, 0), tag))

    # Insert from the right-most position to the left so that earlier
    # insertions never shift the positions still to be processed. The sort is
    # stable, so several modifications on one site keep their relative order.
    insertions.sort(key=lambda item: item[0], reverse=True)
    mod_seq = sequence
    for pos, tag in insertions:
        mod_seq = mod_seq[:pos] + tag + mod_seq[pos:]
    return nterm + mod_seq + cterm

In [None]:
# Site 0 is the N-terminus, site -1 the C-terminus; a positive site n places the modification after the n-th residue.
generate_modified_sequence(('ACDEFGHIK','ModNterm;ModCterm;ModA@C;ModB@G','0;-1;2;6'))

'_(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)'

In [None]:
# Same peptide, but every modification name is translated through a dict
# and wrapped in square brackets instead of parentheses.
generate_modified_sequence(
    ('ACDEFGHIK','ModNterm;ModCterm;ModA@C;ModB@G','0;-1;2;6'),
    {'ModNterm':'Mod(Nterm)', 'ModCterm':'Mod(Cterm)', 'ModA@C':'ModA(C)', 'ModB@G':'ModB(G)'},
    mod_sep='[]'
)

'_[Mod(Nterm)]AC[ModA(C)]DEFG[ModB(G)]HIK_[Mod(Cterm)]'

In [None]:
# Row-wise usage: ten identical peptides pushed through `DataFrame.apply`.
df = pd.DataFrame({
    'sequence': ['ACDEFGHIK']*10,
    'mods': ['ModNterm;ModCterm;ModA@C;ModB@G']*10,
    'mod_sites': ['0;-1;2;6']*10,
})
df[['sequence','mods','mod_sites']].apply(generate_modified_sequence, axis=1)

0    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
1    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
2    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
3    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
4    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
5    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
6    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
7    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
8    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
9    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
dtype: object

In [None]:
#export
@numba.njit
def _get_frag_info_from_column_name(column:str):
    '''
    Split a fragment column name such as 'b_1+' or 'y-modloss_1+' into
    `(frag_type, loss_type, charge)`.

    The text before the first '_' is the ion name, the text after it is the
    charge: digits followed by one sign character, where a trailing '-'
    negates the charge. A one-character ion name has loss type 'noloss';
    otherwise the first character is the fragment type and the text after
    the first '-' is the loss type.
    '''
    sep = column.find('_')
    ion_name = column[:sep]
    charge_token = column[sep+1:]
    # the last character is the sign, everything before it the magnitude
    charge = str_to_int(charge_token[:-1])
    if charge_token[-1] == '-':
        charge = -charge
    if len(ion_name) == 1:
        frag_type = ion_name
        loss_type = 'noloss'
    else:
        dash = ion_name.find('-')
        frag_type = ion_name[0]
        loss_type = ion_name[dash+1:]
    return frag_type, loss_type, charge

def _get_frag_info_column_names(columns:typing.List[str]):
    '''
    Parse every fragment column name in `columns` and return three parallel
    lists: fragment types, loss types and charges.
    '''
    parsed = [_get_frag_info_from_column_name(name) for name in columns]
    frag_types = [info[0] for info in parsed]
    loss_types = [info[1] for info in parsed]
    charges = [info[2] for info in parsed]
    return frag_types, loss_types, charges

def _get_frag_num(columns, rows, frag_len):
    frag_nums = []
    for r,c in zip(rows, columns):
        if c[0] in 'xyz':
            frag_nums.append(frag_len-r)
        else:
            frag_nums.append(r+1)
    return frag_nums

def _flatten(list_of_lists):
    '''
    Flatten a list of lists
    '''
    return list(
        itertools.chain.from_iterable(list_of_lists)
    )

def merge_precursor_fragment_df(
    precursor_df:pd.DataFrame,
    fragment_mass_df:pd.DataFrame,
    fragment_inten_df:pd.DataFrame,
    top_n_inten:int,
    frag_type_head:str='FragmentType',
    frag_mass_head:str='FragmentMz',
    frag_inten_head:str='RelativeIntensity',
    frag_charge_head:str='FragmentCharge',
    frag_loss_head:str='FragmentLossType',
    frag_num_head:str='FragmentNumber'
):
    '''
    Convert alphabase library into a single dataframe (library for other software).

    For every precursor the rows `frag_start_idx:frag_end_idx` of the two
    fragment dataframes are read, the `top_n_inten` most intense fragments are
    selected (most intense first), and the precursor row is repeated once per
    selected fragment.

    Args:
        precursor_df (pd.DataFrame): must contain the columns `frag_start_idx`
            and `frag_end_idx`. It is copied, not modified.
        fragment_mass_df (pd.DataFrame): fragment masses; its column names are
            parsed as fragment descriptions such as 'b_1+' or 'y-modloss_1+'.
        fragment_inten_df (pd.DataFrame): fragment intensities, indexed
            positionally like `fragment_mass_df`.
        top_n_inten (int): number of fragments kept per precursor. Must be
            positive: with 0 the slice `[-0:]` would keep every fragment.
        frag_type_head, frag_mass_head, frag_inten_head, frag_charge_head,
        frag_loss_head, frag_num_head (str): names of the output columns.
    Returns:
        pd.DataFrame: one row per (precursor, selected fragment); the index
        values of the precursor rows are repeated.
    '''
    df = precursor_df.copy()
    frag_columns = fragment_mass_df.columns.values.astype('U')
    frag_type_list = []
    frag_loss_list = []
    frag_charge_list = []
    frag_mass_list = []
    frag_inten_list = []
    frag_num_list = []
    for start, end in tqdm.tqdm(df[['frag_start_idx','frag_end_idx']].values):
        intens = fragment_inten_df.iloc[start:end,:].values # is loc[start:end-1,:] faster?
        masses = fragment_mass_df.iloc[start:end,:].values
        # positions in the flattened (row, column) matrix, highest intensity first
        sorted_idx = np.argsort(intens.reshape(-1))[-top_n_inten:][::-1]
        idx_in_df = np.unravel_index(sorted_idx, masses.shape)

        frag_len = end-start
        rows = np.arange(frag_len, dtype=np.int32)[idx_in_df[0]]
        columns = frag_columns[idx_in_df[1]]

        frag_types, loss_types, charges = _get_frag_info_column_names(columns)

        frag_nums = _get_frag_num(columns, rows, frag_len)

        frag_type_list.append(frag_types)
        frag_loss_list.append(loss_types)
        frag_charge_list.append(charges)
        frag_mass_list.append(masses[idx_in_df])
        frag_inten_list.append(intens[idx_in_df])
        frag_num_list.append(frag_nums)

    # One list-valued cell per precursor; exploding turns them into rows.
    df[frag_type_head] = frag_type_list
    df[frag_mass_head] = frag_mass_list
    df[frag_inten_head] = frag_inten_list
    df[frag_charge_head] = frag_charge_list
    df[frag_loss_head] = frag_loss_list
    df[frag_num_head] = frag_num_list
    try:
        return df.explode([
            frag_type_head,
            frag_mass_head,
            frag_inten_head,
            frag_charge_head,
            frag_loss_head,
            frag_num_head
        ])
    except ValueError:
        # Older pandas versions only accept a single column in df.explode:
        # explode one column, then overwrite the others with flattened values.
        # NOTE(review): this relies on every precursor having at least one
        # selected fragment, otherwise the lengths would not match.
        df = df.explode(frag_type_head)

        df[frag_mass_head] = _flatten(frag_mass_list)
        df[frag_inten_head] = _flatten(frag_inten_list)
        df[frag_charge_head] = _flatten(frag_charge_list)
        df[frag_loss_head] = _flatten(frag_loss_list)
        df[frag_num_head] = _flatten(frag_num_list)
        return df



In [None]:
#export
# Maps alphabase modification names ("Name@site") to the "Name (site)" style
# names used by `SpecLibBase.to_single_df` by default.
alpha_to_other_mod_dict = {
    "Carbamidomethyl@C": "Carbamidomethyl (C)",
    "Oxidation@M": "Oxidation (M)",
    "Phospho@S": "Phospho (STY)",
    "Phospho@T": "Phospho (STY)",
    "Phospho@Y": "Phospho (STY)",
    "GlyGly@K": "GlyGly (K)",
    "Acetyl@Protein N-term": "Acetyl (Protein N-term)",
}

class SpecLibBase(object):
    '''
    Base class of a spectral library: a precursor dataframe plus fragment
    mass and fragment intensity dataframes.

    Sub-classes must implement `_load_precursor_df()` and
    `load_fragment_inten_df()`.
    '''
    def __init__(self,
        charged_ion_types:typing.List[str], # e.g. ['b_1+','b_2+','y_1+','y_2+', ...]
        min_frag_mz = 200, max_frag_mz = 2000,
        min_precursor_mz = 500, max_precursor_mz = 2000,
    ):
        '''
        Args:
            charged_ion_types (List[str]): fragment column names, e.g. ['b_1+','y_1+', ...]
            min_frag_mz, max_frag_mz: fragments outside this range get zero intensity
                when the intensities are clipped.
            min_precursor_mz, max_precursor_mz: precursors outside this range are
                removed when the precursors are clipped.
        '''
        self.charged_ion_types = charged_ion_types
        self._precursor_df:pd.DataFrame = None
        self._fragment_inten_df:pd.DataFrame = None
        self._fragment_mass_df:pd.DataFrame = None
        self.min_frag_mz = min_frag_mz
        self.max_frag_mz = max_frag_mz
        self.min_precursor_mz = min_precursor_mz
        self.max_precursor_mz = max_precursor_mz

    @property
    def precursor_df(self):
        return self._precursor_df

    @precursor_df.setter
    def precursor_df(self, df):
        # Store a re-indexed copy and clip it right away
        # (clipping is a no-op while 'precursor_mz' is not available yet).
        self._precursor_df = df.reset_index(drop=True)
        self.clip_precursor_()

    @property
    def fragment_mass_df(self):
        return self._fragment_mass_df

    @property
    def fragment_inten_df(self):
        return self._fragment_inten_df

    def clip_precursor_(self):
        ''' 
        Clip self._precursor_df inplace: keep only precursors with
        `min_precursor_mz <= precursor_mz <= max_precursor_mz` and reset the index.
        Nothing happens if the 'precursor_mz' column does not exist yet.
        '''
        if 'precursor_mz' not in self._precursor_df.columns:
            return
        precursor_mz = self._precursor_df['precursor_mz']
        # chained reset_index avoids an inplace operation on a filtered slice
        self._precursor_df = self._precursor_df[
            (precursor_mz>=self.min_precursor_mz)&
            (precursor_mz<=self.max_precursor_mz)
        ].reset_index(drop=True)

    def clip_inten_by_fragment_mass_(self):
        ''' 
        Clip self._fragment_inten_df inplace. All clipped masses are set as zeros.
        '''
        self._fragment_inten_df[
            (self._fragment_mass_df<self.min_frag_mz)|
            (self._fragment_mass_df>self.max_frag_mz)
        ] = 0

    def clip_inten_by_fragment_mass(self)->pd.DataFrame:
        '''
        Same as `clip_inten_by_fragment_mass_()` but works on a copy:
        return the clipped intensities and leave self._fragment_inten_df untouched.
        '''
        df = self._fragment_inten_df.copy()
        df[
            (self._fragment_mass_df<self.min_frag_mz)|
            (self._fragment_mass_df>self.max_frag_mz)
        ] = 0
        return df
    
    def load_precursor_df(self, 
        precursor_files, **kargs
    ):
        '''
        Load precursors with the sub-class specific `_load_precursor_df()`,
        then clip them by precursor m/z (if 'precursor_mz' is already present).
        '''
        self._load_precursor_df(precursor_files, **kargs)
        self.clip_precursor_()

    def _load_precursor_df(self, precursor_files, **kargs):
        '''
        All sub-class must reimplement this method
        '''
        raise NotImplementedError(
            f'Sub-class of "{self.__class__}" must re-implement "_load_precursor_df()"'
        )

    def load_fragment_df(self, **kargs):
        '''
        Load fragment masses, then fragment intensities.
        The same keyword arguments are forwarded to both loaders.
        '''
        self.load_fragment_mass_df(**kargs)
        self.load_fragment_inten_df(**kargs)

    def load_fragment_inten_df(self, **kargs):
        '''
        All sub-class must reimplement this method. 
        Fragment intensities can be predicted or from AlphaPept, or ...
        '''
        raise NotImplementedError(
            f'Sub-class of "{self.__class__}" must re-implement "load_fragment_inten_df()"'
        )

    def load_fragment_mass_df(self, **kargs):
        '''
        Calculate the fragment masses for self._precursor_df and
        self.charged_ion_types, then clip the precursors.

        `**kargs` is accepted (and ignored) so that `load_fragment_df(**kargs)`
        can forward its keyword arguments without raising a TypeError.
        '''
        self._precursor_df, self._fragment_mass_df = fragment.get_fragment_mass_dataframe(
            self._precursor_df, self.charged_ion_types
        )
        # clip precursor after mass calculation
        self.clip_precursor_()

    def save_hdf(self, hdf_file):
        raise NotImplementedError('') # we need alphabase.HDFFile for HDF files

    def to_single_df(self,
        translate_mod_dict:dict = alpha_to_other_mod_dict,
        keep_k_highest_inten:int=12
    )->pd.DataFrame:
        '''
        Convert alphabase library to diann (or Spectronaut) library dataframe
        Args:
            translate_mod_dict (dict): a dict map modifications from alphabase to other software. Default: build-in `alpha_to_other_mod_dict`
            keep_k_highest_inten (int): only keep highest fragment intensities for each precursor. Default: 12
        Return:
            pd.DataFrame: a single-file dataframe which contains precursors and fragments
        '''
        df = pd.DataFrame()
        df['ModifiedPeptide'] = self._precursor_df[
            ['sequence','mods','mod_sites']
        ].apply(
            generate_modified_sequence, 
            axis=1,
            translate_mod_dict=translate_mod_dict,
            mod_sep='[]'
        )

        # needed by merge_precursor_fragment_df, dropped again before returning
        df['frag_start_idx'] = self._precursor_df['frag_start_idx']
        df['frag_end_idx'] = self._precursor_df['frag_end_idx']
        
        df['PrecursorCharge'] = self._precursor_df['charge']
        # predicted values take precedence over the plain RT/CCS columns
        if 'predict_RT' in self._precursor_df.columns:
            df['iRT'] = self._precursor_df['predict_RT']
        else:
            df['iRT'] = self._precursor_df['RT']

        if 'predict_CCS' in self._precursor_df.columns:
            df['CCS'] = self._precursor_df['predict_CCS']
        elif 'CCS' in self._precursor_df.columns:
            df['CCS'] = self._precursor_df['CCS']

        df['LabelModifiedSequence'] = df['ModifiedPeptide']
        df['StrippedPeptide'] = self._precursor_df['sequence']

        df['PrecursorMz'] = self._precursor_df['precursor_mz']

        # protein columns fall back on each other when only some are available
        if 'protein_name' in self._precursor_df.columns:
            df['ProteinName'] = self._precursor_df['protein_name']
            df['UniprotID'] = df['ProteinName']
            df['ProteinGroups'] = df['ProteinName']

        if 'uniprot_id' in self._precursor_df.columns:
            df['UniprotID'] = self._precursor_df['uniprot_id']
            if 'ProteinName' not in df.columns:
                df['ProteinName'] = df['UniprotID']
                df['ProteinGroups'] = df['UniprotID']

        if 'genes' in self._precursor_df.columns:
            df['Genes'] = self._precursor_df['genes']

        if 'protein_group' in self._precursor_df.columns:
            df['ProteinGroups'] = self._precursor_df['protein_group']

        # fragments outside [min_frag_mz, max_frag_mz] get zero intensity ...
        frag_inten = self.clip_inten_by_fragment_mass()

        df = merge_precursor_fragment_df(
            df,
            self._fragment_mass_df,
            frag_inten,
            top_n_inten=keep_k_highest_inten,
            frag_type_head='FragmentType',
            frag_mass_head='FragmentMz',
            frag_inten_head='RelativeIntensity',
            frag_charge_head='FragmentCharge',
            frag_loss_head='FragmentLossType',
            frag_num_head='FragmentNumber'
        )
        # ... and are removed here
        df = df[df['RelativeIntensity']>0]

        return df.drop(['frag_start_idx','frag_end_idx'], axis=1)


### Test `to_single_df` method

In [None]:
# Build a small precursor table (the same peptide repeated `repeat` times)
# and let alphabase calculate the fragment masses for it.
from alphabase.peptide.fragment import get_fragment_mass_dataframe
repeat = 10
charged_frag_types = ['b_1+','y_1+','y-modloss_1+']
precursor_df = pd.DataFrame({
    'sequence': ['ASGHCEWMKYR']*repeat,
    'mods': ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat,
    'mod_sites': ['0;4;8']*repeat,
    'nAA': 11,
    'NCE': 20,
    'instrument': 'QE',
    'RT': 10,
    'charge': 2,
    'protein_name': 'unknown'
})
# The first precursor gets a single Phospho@S instead
# (presumably to get non-zero values in the 'y-modloss_1+' column).
precursor_df.loc[0,['mods','mod_sites']] = ['Phospho@S','2']
# The returned precursor_df is re-assigned; frag_mass_df is displayed below.
precursor_df,frag_mass_df = get_fragment_mass_dataframe(precursor_df, charged_frag_types)
frag_mass_df

Unnamed: 0,b_1+,y_1+,y-modloss_1+
0,72.044386,1376.527493,0.000000
1,239.042740,1209.529140,1111.552248
2,296.064201,1152.507679,1054.530787
3,433.123107,1015.448773,917.471881
4,536.132287,912.439593,814.462701
...,...,...,...
95,684.240570,799.391919,0.000000
96,870.319877,613.312613,0.000000
97,1017.355269,466.277220,0.000000
98,1145.450225,338.182264,0.000000


In [None]:
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,NCE,instrument,RT,charge,protein_name,precursor_mz,frag_start_idx,frag_end_idx
0,ASGHCEWMKYR,Phospho@S,2,11,20,QE,10,2,unknown,724.28594,0,10
1,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,741.816245,10,20
2,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,741.816245,20,30
3,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,741.816245,30,40
4,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,741.816245,40,50
5,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,741.816245,50,60
6,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,741.816245,60,70
7,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,741.816245,70,80
8,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,741.816245,80,90
9,ASGHCEWMKYR,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,741.816245,90,100


In [None]:
# The private attributes are assigned directly, so the `precursor_df` setter is bypassed.
spec_lib = SpecLibBase(charged_frag_types)
spec_lib._precursor_df = precursor_df
# The mass dataframe doubles as intensity dataframe: no real intensities are needed for this test.
spec_lib._fragment_inten_df = frag_mass_df
spec_lib._fragment_mass_df = frag_mass_df
spec_lib.to_single_df()

100%|██████████| 10/10 [00:00<00:00, 2720.22it/s]


Unnamed: 0,ModifiedPeptide,PrecursorCharge,iRT,LabelModifiedSequence,StrippedPeptide,ProteinName,UniprotID,ProteinGroups,FragmentType,FragmengMz,RelativeIntensity,FragmentCharge,FragmentLossType,FragmentNumber
0,_AS[Phospho (STY)]GHCEWMKYR_,2,10,_AS[Phospho (STY)]GHCEWMKYR_,ASGHCEWMKYR,unknown,unknown,unknown,y,1376.527493,1376.527493,1,noloss,10
0,_AS[Phospho (STY)]GHCEWMKYR_,2,10,_AS[Phospho (STY)]GHCEWMKYR_,ASGHCEWMKYR,unknown,unknown,unknown,b,1273.452938,1273.452938,1,noloss,10
0,_AS[Phospho (STY)]GHCEWMKYR_,2,10,_AS[Phospho (STY)]GHCEWMKYR_,ASGHCEWMKYR,unknown,unknown,unknown,y,1209.52914,1209.52914,1,noloss,9
0,_AS[Phospho (STY)]GHCEWMKYR_,2,10,_AS[Phospho (STY)]GHCEWMKYR_,ASGHCEWMKYR,unknown,unknown,unknown,y,1152.507679,1152.507679,1,noloss,8
0,_AS[Phospho (STY)]GHCEWMKYR_,2,10,_AS[Phospho (STY)]GHCEWMKYR_,ASGHCEWMKYR,unknown,unknown,unknown,y,1111.552248,1111.552248,1,modloss,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9,_[Acetyl (Protein N-term)]ASGH[Carbamidomethyl...,2,10,_[Acetyl (Protein N-term)]ASGH[Carbamidomethyl...,ASGHCEWMKYR,unknown,unknown,unknown,y,928.434507,928.434507,1,noloss,6
9,_[Acetyl (Protein N-term)]ASGH[Carbamidomethyl...,2,10,_[Acetyl (Protein N-term)]ASGH[Carbamidomethyl...,ASGHCEWMKYR,unknown,unknown,unknown,b,870.319877,870.319877,1,noloss,7
9,_[Acetyl (Protein N-term)]ASGH[Carbamidomethyl...,2,10,_[Acetyl (Protein N-term)]ASGH[Carbamidomethyl...,ASGHCEWMKYR,unknown,unknown,unknown,y,799.391919,799.391919,1,noloss,5
9,_[Acetyl (Protein N-term)]ASGH[Carbamidomethyl...,2,10,_[Acetyl (Protein N-term)]ASGH[Carbamidomethyl...,ASGHCEWMKYR,unknown,unknown,unknown,b,684.24057,684.24057,1,noloss,6
