In [1]:
#default_exp speclib.lib_base

In [2]:
#export
import pandas as pd
import numpy as np
import numba
import typing
import tqdm

import alphabase.peptide.fragment as fragment
from alphadeep._utils import str_to_int


# Deliberately plain Python: numba cannot compile a function that receives a pd.Series row.
def generate_modified_sequence(
    df_items, #['sequence','mods','mod_sites']
    translate_mod_dict:dict=None,
    mod_sep='()'
):
    """Write a peptide sequence with its modifications inlined.

    Parameters
    ----------
    df_items : tuple, list or pd.Series
        Positional items ``(sequence, mods, mod_sites)``. ``mods`` and
        ``mod_sites`` are ';'-separated strings that are paired item by item.
        A DataFrame row (``df.apply(..., axis=1)``) is accepted as well.
    translate_mod_dict : dict, optional
        If given (and non-empty), every modification name is replaced by
        ``translate_mod_dict[name]``; an unknown name raises ``KeyError``.
    mod_sep : str
        Two characters: the opening and closing bracket put around each
        modification name.

    Returns
    -------
    str
        ``'_' + N-term tags + sequence with inlined tags + '_' + C-term tags``.

    Notes
    -----
    Site ``0`` is written after the leading ``'_'``, site ``-1`` after the
    trailing ``'_'``, a positive site ``i`` directly after the i-th residue.
    Other negative sites are placed like a Python negative slice index on the
    unmodified sequence (behaviour kept from the previous implementation).
    """
    # Unpack by position so that tuples and label-indexed Series rows behave the
    # same (integer keys on a labelled Series are a deprecated positional fallback).
    items = tuple(df_items)
    mod_seq = items[0]
    nterm = '_'
    cterm = '_'
    if items[1]:
        mods = items[1].split(';')
        mod_sites = items[2].split(';')
        if translate_mod_dict:
            mods = [translate_mod_dict[mod] for mod in mods]
        seq_len = len(mod_seq)
        insertions = []
        # Walk from the last listed modification to the first (as before), so
        # several tags on one terminus or one site keep their previous order.
        for site_str, mod in zip(reversed(mod_sites), reversed(mods)):
            site = int(site_str)
            tag = mod_sep[0] + mod + mod_sep[1]
            if site == 0:
                nterm += tag
            elif site == -1:
                cterm += tag
            elif site > 0:
                insertions.append((min(site, seq_len), tag))
            else:
                insertions.append((max(seq_len + site, 0), tag))
        # Positions refer to the unmodified sequence. Inserting from right to
        # left keeps the remaining positions valid even when `mod_sites` is not
        # sorted; the sort is stable, so ties keep their iteration order.
        insertions.sort(key=lambda item: item[0], reverse=True)
        for pos, tag in insertions:
            mod_seq = mod_seq[:pos] + tag + mod_seq[pos:]
    return nterm + mod_seq + cterm

In [3]:
# Example: one N-terminal (site 0), one C-terminal (site -1) and two residue-level modifications.
generate_modified_sequence(('ACDEFGHIK','ModNterm;ModCterm;ModA@C;ModB@G','0;-1;2;6'))

'_(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)'

In [4]:
# Same peptide, but with modification names translated through a dict and
# wrapped in square brackets instead of the default parentheses.
generate_modified_sequence(
    ('ACDEFGHIK','ModNterm;ModCterm;ModA@C;ModB@G','0;-1;2;6'),
    {'ModNterm':'Mod(Nterm)', 'ModCterm':'Mod(Cterm)', 'ModA@C':'ModA(C)', 'ModB@G':'ModB(G)'},
    mod_sep='[]'
)

'_[Mod(Nterm)]AC[ModA(C)]DEFG[ModB(G)]HIK_[Mod(Cterm)]'

In [5]:
# Row-wise use: build a small precursor-like frame and apply the function to
# the three columns it expects, in the order (sequence, mods, mod_sites).
df = pd.DataFrame()
df['sequence'] = ['ACDEFGHIK']*10
df['mods'] = ['ModNterm;ModCterm;ModA@C;ModB@G']*10
df['mod_sites'] = ['0;-1;2;6']*10
df[['sequence','mods','mod_sites']].apply(generate_modified_sequence, axis=1)

0    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
1    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
2    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
3    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
4    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
5    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
6    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
7    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
8    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
9    _(ModNterm)AC(ModA@C)DEFG(ModB@G)HIK_(ModCterm)
dtype: object

In [6]:
#export
@numba.njit
def _get_frag_info_from_column_name(column:str):
    """Split a fragment column name into (fragment type, loss type, charge).

    The name is cut at its first '_': the text before it describes the ion,
    the text after it describes the charge.
    Returns a tuple ``(frag_type, loss_type, charge)``.
    NOTE(review): the expected naming scheme looks like 'b_1+' / 'y-H2O_2+';
    confirm against the code that creates these columns.
    """
    idx = column.find('_')
    frag_type = column[:idx]
    ch_str = column[idx+1:]
    # Everything but the last character is the charge magnitude; the last
    # character is the sign, and only '-' flips it.
    charge = str_to_int(ch_str[:-1])
    if ch_str[-1] == '-':
        charge = -charge
    if len(frag_type)==1:
        # A single-character ion description carries no loss.
        loss_type = 'noloss'
    else:
        # Otherwise the loss is whatever follows the first '-', and the ion
        # type is reduced to its first character.
        idx = frag_type.find('-')
        loss_type = frag_type[idx+1:]
        frag_type = frag_type[0]
    return frag_type, loss_type, charge

def _get_frag_info_column_names(columns:typing.List[str]):
    frag_types = []
    loss_types = []
    charges = []
    for column in columns:
        frag, loss, charge = _get_frag_info_from_column_name(column)
        frag_types.apped(frag)
        loss_types.append(loss)
        charges.append(charge)
    return frag_types, loss_types, charges

def _get_frag_num(columns, rows, frag_len):
    frag_nums = []
    for r,c in zip(rows, columns):
        if c[0] in 'xyz':
            frag_nums.append(frag_len-r)
        else:
            frag_nums.append(r+1)
    return frag_nums

def merge_precursor_fragment_df(
    precursor_df:pd.DataFrame,
    fragment_mass_df,
    fragment_inten_df,
    top_n_inten,
    frag_type_head='FragmentType',
    frag_mass_head='FragmentMz',
    frag_inten_head='RelativeIntensity',
    frag_charge_head='FragmentCharge',
    frag_loss_head='FragmentLossType',
    frag_num_head='FragmentNumber'
):
    """Flatten precursors and their most intense fragments into one long table.

    For every row of ``precursor_df`` the fragment rows
    ``frag_start_idx`` .. ``frag_end_idx - 1`` are taken from both fragment
    tables, the ``top_n_inten`` highest intensities are selected, and one output
    row is produced per selected fragment.

    Parameters
    ----------
    precursor_df : pd.DataFrame
        Needs the columns 'frag_start_idx' and 'frag_end_idx'. It is copied,
        not modified.
    fragment_mass_df, fragment_inten_df : pd.DataFrame
        Fragment m/z and intensity tables with identical layout; the column
        names are parsed by ``_get_frag_info_column_names``.
        NOTE(review): rows are selected by label (``.loc``), which presumes a
        default 0..n-1 index - confirm against the callers.
    top_n_inten : int
        Number of fragments kept per precursor.
    frag_*_head : str
        Names of the fragment columns in the returned frame.

    Returns
    -------
    pd.DataFrame
        The precursor columns repeated once per kept fragment, plus the six
        fragment columns. The index repeats the precursor index.
    """
    df = precursor_df.copy()
    frag_columns = fragment_mass_df.columns.values
    frag_type_list = []
    frag_loss_list = []
    frag_charge_list = []
    frag_mass_list = []
    frag_inten_list = []
    frag_num_list = []
    # `tqdm` is imported as a module in this file, so the progress bar class
    # is `tqdm.tqdm`; calling the module itself raises TypeError.
    for start, end in tqdm.tqdm(df[['frag_start_idx','frag_end_idx']].values):
        intens = fragment_inten_df.loc[start:end-1,:].values
        masses = fragment_mass_df.loc[start:end-1,:].values
        # flat positions of the top-n intensities, ordered by ascending intensity
        sorted_idx = np.argsort(intens.reshape(-1))[-top_n_inten:]
        idx_in_df = np.unravel_index(sorted_idx, masses.shape)

        frag_len = end-start
        rows = idx_in_df[0]
        columns = frag_columns[idx_in_df[1]]

        frag_types, loss_types, charges = _get_frag_info_column_names(columns)

        frag_nums = _get_frag_num(columns, rows, frag_len)

        frag_type_list.append(frag_types)
        frag_loss_list.append(loss_types)
        frag_charge_list.append(charges)
        # Keep only the selected fragments. Storing the whole 2-D window (as
        # before) gave these columns a different element count than the
        # others, which makes the multi-column explode below fail.
        frag_mass_list.append(masses[idx_in_df].tolist())
        frag_inten_list.append(intens[idx_in_df].tolist())
        frag_num_list.append(frag_nums)

    df[frag_type_head] = frag_type_list
    df[frag_mass_head] = frag_mass_list
    df[frag_inten_head] = frag_inten_list
    df[frag_charge_head] = frag_charge_list
    df[frag_loss_head] = frag_loss_list
    df[frag_num_head] = frag_num_list
    return df.explode([
        frag_type_head,
        frag_mass_head,
        frag_inten_head,
        frag_charge_head,
        frag_loss_head,
        frag_num_head
    ])



In [7]:
# np.unravel_index converts flat positions into a (row indices, column indices)
# pair for the given shape; the pair can be used directly to index the array.
x = np.array((1,2,3,4)).reshape((2,2))
idx = np.unravel_index([3,0,2],x.shape)
x[idx], idx

(array([4, 1, 3]), (array([1, 0, 1]), array([1, 0, 0])))

In [8]:
import pandas as pd
import numpy as np
# Toy frame whose 'A' and 'B' cells hold list-likes of matching length per row;
# it is used in the cells below to try column indexing and DataFrame.explode.
# Note: this rebinds the name `df` used by an earlier cell.
df = pd.DataFrame({
    'id' : ['a', 'b', 'c', 'd'],
    'A' : [np.array([2]*2),[1],[1],[3]*3],
    'B' : [np.array([20]*2),[10],[10],[30]*3]
})
df

Unnamed: 0,id,A,B
0,a,"[2, 2]","[20, 20]"
1,b,[1],[10]
2,c,[1],[10]
3,d,"[3, 3, 3]","[30, 30, 30]"


In [15]:
# Pick column labels by integer position (repeats allowed) and cast the result to a numpy unicode array.
df.columns.values[[1,1,2,0,1]].astype('U')

array(['A', 'A', 'B', 'id', 'A'], dtype='<U2')

In [10]:
# Explode 'A' and 'B' together: one output row per list element, with 'id' and the index repeated.
df.explode(list('AB'))

Unnamed: 0,id,A,B
0,a,2,20
0,a,2,20
1,b,1,10
2,c,1,10
3,d,3,30
3,d,3,30
3,d,3,30


In [None]:
#export
class SpecLibBase(object):
    """Base class of a spectral library: one precursor table plus fragment
    m/z and fragment intensity tables.

    Sub-classes must provide ``_load_precursor_df`` and
    ``load_fragment_inten_df``; fragment masses are computed here from the
    precursor table.
    """
    def __init__(self,
        charged_ion_types, #['b_1+','b_2+','y_1+','y_2+', ...]
        min_frag_mz = 200, max_frag_mz = 2000,
        min_precursor_mz = 500, max_precursor_mz = 2000,
    ):
        """Store the fragment column names and the m/z windows used for clipping."""
        self.charged_ion_types = charged_ion_types
        self._precursor_df:pd.DataFrame = None
        self._fragment_inten_df:pd.DataFrame = None
        self._fragment_mass_df:pd.DataFrame = None
        self.min_frag_mz = min_frag_mz
        self.max_frag_mz = max_frag_mz
        self.min_precursor_mz = min_precursor_mz
        self.max_precursor_mz = max_precursor_mz

    @property
    def precursor_df(self):
        """The precursor table (``None`` until loaded)."""
        return self._precursor_df
    @property
    def fragment_mass_df(self):
        """The fragment m/z table (``None`` until loaded)."""
        return self._fragment_mass_df
    @property
    def fragment_inten_df(self):
        """The fragment intensity table (``None`` until loaded)."""
        return self._fragment_inten_df

    def clip_precursor_(self):
        """Keep only precursors whose 'precursor_mz' lies within
        [min_precursor_mz, max_precursor_mz] and renumber the index from 0.

        Replaces ``self._precursor_df`` (trailing underscore = modifies state).
        """
        in_range = (
            (self._precursor_df['precursor_mz']>=self.min_precursor_mz)&
            (self._precursor_df['precursor_mz']<=self.max_precursor_mz)
        )
        # Build the renumbered frame in one expression instead of resetting the
        # index in place on a filtered slice.
        self._precursor_df = self._precursor_df[in_range].reset_index(drop=True)

    def _fragment_mass_out_of_range(self):
        """Boolean frame: True where a fragment m/z is outside [min_frag_mz, max_frag_mz]."""
        return (
            (self._fragment_mass_df<self.min_frag_mz)|
            (self._fragment_mass_df>self.max_frag_mz)
        )

    def clip_by_fragment_mass_(self):
        """Set, in place, the intensity of every out-of-range fragment to 0."""
        self._fragment_inten_df[self._fragment_mass_out_of_range()] = 0

    def clip_by_fragment_mass(self):
        """Return a copy of the intensity table with out-of-range fragments set to 0.

        ``self._fragment_inten_df`` itself is left untouched.
        """
        df = self._fragment_inten_df.copy()
        df[self._fragment_mass_out_of_range()] = 0
        return df

    def load_precursor_df(self, precursor_files, **kargs):
        """Load the precursor table via ``_load_precursor_df`` and, when it has
        a 'precursor_mz' column, clip it to the precursor m/z window."""
        self._load_precursor_df(precursor_files, **kargs)
        if 'precursor_mz' in self._precursor_df.columns:
            # was `self._clip_precursor()`, a method that does not exist
            self.clip_precursor_()

    def _load_precursor_df(self, precursor_files, **kargs):
        """Hook for sub-classes: fill ``self._precursor_df`` from ``precursor_files``."""
        raise NotImplementedError(
            f'Sub-class of "{self.__class__}" must re-implement "_load_precursor_df()"'
        )

    def load_fragment_df(self, **kargs):
        """Load fragment masses, then fragment intensities.

        Keyword arguments are forwarded to ``load_fragment_inten_df`` (they
        were silently dropped before).
        """
        self.load_fragment_mass_df()
        self.load_fragment_inten_df(**kargs)

    def load_fragment_inten_df(self, **kargs):
        """Hook for sub-classes: fill ``self._fragment_inten_df``."""
        raise NotImplementedError(
            f'Sub-class of "{self.__class__}" must re-implement "load_fragment_inten_df()"'
        )

    def load_fragment_mass_df(self):
        """Compute ``self._fragment_mass_df`` for the current precursors and
        the configured ``charged_ion_types``."""
        self._fragment_mass_df = fragment.get_fragment_mass_dataframe(
            self._precursor_df, self.charged_ion_types
        )
        # NOTE(review): precursors are clipped *after* the fragment masses were
        # generated, so fragment rows of removed precursors stay in the table.
        # Kept as in the original - confirm this is intended.
        self.clip_precursor_()

    def to_diann(self, top_k_frag=12):
        """Build a DIA-NN style library table from the loaded data.

        Parameters
        ----------
        top_k_frag : int
            Number of most intense fragments kept per precursor.

        Returns
        -------
        pd.DataFrame
            One row per kept fragment, restricted to the header columns that
            could be filled from the precursor table.

        NOTE(review): the original method stopped after the precursor columns
        and appended to an undefined ``self.head``. The fragment merge below
        and the use of ``clip_by_fragment_mass()`` for the intensities are the
        reviewer's reading of the intent - confirm before relying on it.
        """
        head = [
            "PrecursorMz",
            "iRT",
            "PrecursorCharge",
            "StrippedPeptide",
            "ModifiedPeptide",
            'LabelModifiedSequence',
            "FragmentMz",
            "RelativeIntensity",
            "FragmentType",
            "FragmentCharge",
            "FragmentNumber",
            "FragmentLossType",
            "ProteinName",
            "ProteinGroups",
            "UniprotID",
            "Genes",
        ]
        # Assigned (not appended) so repeated calls do not grow the list.
        self.head = head

        translate_mods = {
            "Carbamidomethyl@C": "Carbamidomethyl (C)",
            "Oxidation@M": "Oxidation (M)",
            "Phospho@S": "Phospho (STY)",
            "Phospho@T": "Phospho (STY)",
            "Phospho@Y": "Phospho (STY)",
            "GlyGly@K": "GlyGly (K)",
            "Acetyl@Protein N-term": "Acetyl (Protein N-term)",
        }

        precursor_df = self._precursor_df
        df = pd.DataFrame()
        # A modification name missing from `translate_mods` raises KeyError here.
        df['ModifiedPeptide'] = precursor_df[
            ['sequence','mods','mod_sites']
        ].apply(
            generate_modified_sequence,
            axis=1,
            translate_mod_dict=translate_mods,
            mod_sep='[]'
        )

        # needed to locate each precursor's fragment rows during the merge
        df['frag_start_idx'] = precursor_df['frag_start_idx']
        df['frag_end_idx'] = precursor_df['frag_end_idx']

        if 'precursor_mz' in precursor_df.columns:
            df['PrecursorMz'] = precursor_df['precursor_mz']
        df['PrecursorCharge'] = precursor_df['charge']
        if 'predict_RT' in precursor_df.columns:
            df['iRT'] = precursor_df['predict_RT']
        else:
            df['iRT'] = precursor_df['RT']
        df['LabelModifiedSequence'] = df['ModifiedPeptide']
        df['StrippedPeptide'] = precursor_df['sequence']
        if 'uniprot_id' in precursor_df.columns:
            df['UniprotID'] = precursor_df['uniprot_id']
        if 'genes' in precursor_df.columns:
            df['Genes'] = precursor_df['genes']

        # Protein name / group fall back to the UniProt id when not available.
        if 'protein_name' in precursor_df.columns:
            df['ProteinName'] = precursor_df['protein_name']
        elif 'UniprotID' in df.columns:
            df['ProteinName'] = df['UniprotID']

        if 'protein_group' in precursor_df.columns:
            df['ProteinGroups'] = precursor_df['protein_group']
        elif 'UniprotID' in df.columns:
            df['ProteinGroups'] = df['UniprotID']

        # Header names are passed explicitly so the output does not depend on
        # the defaults of merge_precursor_fragment_df.
        df = merge_precursor_fragment_df(
            df,
            self._fragment_mass_df,
            self.clip_by_fragment_mass(),
            top_k_frag,
            frag_type_head='FragmentType',
            frag_mass_head='FragmentMz',
            frag_inten_head='RelativeIntensity',
            frag_charge_head='FragmentCharge',
            frag_loss_head='FragmentLossType',
            frag_num_head='FragmentNumber',
        )
        return df[[column for column in head if column in df.columns]]

        


SyntaxError: invalid syntax (<ipython-input-42-c73d0c584551>, line 85)