In [None]:
#---#| default_exp spectral_library.translate

# Translate Spectral Libraries

Translate peptdeep spectral libraries into other formats (e.g. TSV)

In [None]:
import numpy as np
import pandas as pd

from alphabase.spectral_library.base import SpecLibBase
from alphabase.spectral_library.translate import create_modified_sequence, speclib_to_single_df, translate_to_tsv

In [None]:
# Ten identical rows. The modifications are listed out of site order
# (sites 0;6;-1;2), so the output shows where each one lands in the sequence.
df = pd.DataFrame({
    'sequence': ['ACDEFGHIK'] * 10,
    'mods': ['ModNterm@Nterm;ModB@G;ModCterm@Cterm;ModA@C'] * 10,
    'mod_sites': ['0;6;-1;2'] * 10,
})
df[['sequence', 'mods', 'mod_sites']].apply(create_modified_sequence, axis=1)

0    _(ModNterm)AC(ModA)DEFG(ModB)HIK_(ModCterm)
1    _(ModNterm)AC(ModA)DEFG(ModB)HIK_(ModCterm)
2    _(ModNterm)AC(ModA)DEFG(ModB)HIK_(ModCterm)
3    _(ModNterm)AC(ModA)DEFG(ModB)HIK_(ModCterm)
4    _(ModNterm)AC(ModA)DEFG(ModB)HIK_(ModCterm)
5    _(ModNterm)AC(ModA)DEFG(ModB)HIK_(ModCterm)
6    _(ModNterm)AC(ModA)DEFG(ModB)HIK_(ModCterm)
7    _(ModNterm)AC(ModA)DEFG(ModB)HIK_(ModCterm)
8    _(ModNterm)AC(ModA)DEFG(ModB)HIK_(ModCterm)
9    _(ModNterm)AC(ModA)DEFG(ModB)HIK_(ModCterm)
dtype: object

In [None]:
# With mod_sep='[]' the modification names are wrapped in square brackets.
bracketed_seq = create_modified_sequence(
    ('ACDEFGHIK', 'ModNterm@Nterm;ModB@G;ModCterm@Cterm;ModA@C', '0;6;-1;2'),
    mod_sep='[]',
)
assert bracketed_seq == '_[ModNterm]AC[ModA]DEFG[ModB]HIK_[ModCterm]'

In [None]:
# The dict passed as second positional argument maps each modification name
# to the text that must appear between the `mod_sep` characters.
peptide_row = ('ACDEFGHIK', 'ModNterm;ModB@G;ModCterm;ModA@C', '0;6;-1;2')
mod_translation = {
    'ModNterm': 'Mod(Nterm)',
    'ModCterm': 'Mod(Cterm)',
    'ModA@C': 'ModA(C)',
    'ModB@G': 'ModB(G)',
}
translated_seq = create_modified_sequence(peptide_row, mod_translation, mod_sep='()')
assert translated_seq == '_(Mod(Nterm))AC(ModA(C))DEFG(ModB(G))HIK_(Mod(Cterm))'

In [None]:
# Same inputs as the translated case, but with empty `nterm`/`cterm` strings:
# the expected sequence has no leading or trailing underscore.
peptide_row = ('ACDEFGHIK', 'ModNterm;ModB@G;ModCterm;ModA@C', '0;6;-1;2')
mod_translation = {
    'ModNterm': 'Mod(Nterm)',
    'ModCterm': 'Mod(Cterm)',
    'ModA@C': 'ModA(C)',
    'ModB@G': 'ModB(G)',
}
bare_termini_seq = create_modified_sequence(
    peptide_row,
    mod_translation,
    mod_sep='()',
    nterm='',
    cterm='',
)
assert bare_termini_seq == '(Mod(Nterm))AC(ModA(C))DEFG(ModB(G))HIK(Mod(Cterm))'

In [None]:
from alphabase.peptide.fragment import create_fragment_mz_dataframe

In [None]:
repeat = 10
charged_frag_types = ['b_z1', 'y_z1', 'y_modloss_z1']

# Per-row columns: `repeat` copies of a modified peptide followed by one
# peptide with empty mods/mod_sites.
precursor_df = pd.DataFrame({
    'sequence': ['ASGHCEWMKYR'] * repeat + ['ASGHCEWMAAR'],
    'mods': ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M'] * repeat + [''],
    'mod_sites': ['0;4;8'] * repeat + [''],
}).assign(
    # Scalar values, broadcast to every row.
    nAA=11,
    NCE=20,
    instrument='QE',
    rt_pred=10,
    charge=2,
    protein_name='unknown',
    mobility_pred=1,
)

# Give the first precursor a different modification pattern.
precursor_df.loc[0, 'mods'] = 'Phospho@S'
precursor_df.loc[0, 'mod_sites'] = '2'

frag_mass_df = create_fragment_mz_dataframe(precursor_df, charged_frag_types)
frag_mass_df

Unnamed: 0,b_z1,y_z1,y_modloss_z1
0,72.044390,1376.527555,1278.550659
1,239.042750,1209.529195,0.000000
2,296.064213,1152.507732,0.000000
3,433.123125,1015.448820,0.000000
4,536.132310,912.439635,0.000000
...,...,...,...
105,585.208572,634.312978,0.000000
106,771.287885,448.233665,0.000000
107,902.328370,317.193180,0.000000
108,973.365484,246.156066,0.000000


In [None]:
# Display the precursor table. Per the rendered output it now also carries
# frag_start_idx / frag_stop_idx columns that were not in the constructor
# (presumably added in place by create_fragment_mz_dataframe — TODO confirm).
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,NCE,instrument,rt_pred,charge,protein_name,mobility_pred,frag_start_idx,frag_stop_idx
0,ASGHCEWMKYR,Phospho@S,2,11,20,QE,10,2,unknown,1,0,10
1,ASGHCEWMKYR,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,10,20
2,ASGHCEWMKYR,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,20,30
3,ASGHCEWMKYR,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,30,40
4,ASGHCEWMKYR,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,40,50
5,ASGHCEWMKYR,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,50,60
6,ASGHCEWMKYR,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,60,70
7,ASGHCEWMKYR,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,70,80
8,ASGHCEWMKYR,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,80,90
9,ASGHCEWMKYR,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,11,20,QE,10,2,unknown,1,90,100


In [None]:
# Assemble a library by hand; copies of the m/z table are used for both the
# intensity and the m/z frames.
spec_lib = SpecLibBase(charged_frag_types)
spec_lib._precursor_df = precursor_df
spec_lib._fragment_intensity_df = frag_mass_df.copy()
spec_lib._fragment_mz_df = frag_mass_df.copy()

# Every exported fragment must lie within the requested m/z window.
df = speclib_to_single_df(spec_lib, min_frag_mz=300, max_frag_mz=1800)
assert df['FragmentMz'].ge(300).all()
assert df['FragmentMz'].le(1800).all()

# With min_frag_nAA=3 no exported row may have a FragmentNumber below 3.
df = speclib_to_single_df(spec_lib, min_frag_mz=200, min_frag_nAA=3)
assert df['FragmentNumber'].ge(3).all()

11it [00:01,  6.07it/s]
11it [00:00, 2904.83it/s]


In [None]:
#| hide

import tempfile
from alphabase.peptide.fragment import create_fragment_mz_dataframe

In [None]:
#| hide
# Same demo library as built earlier in the notebook, rebuilt here from scratch.
repeat = 10
charged_frag_types = ['b_z1', 'y_z1', 'y_modloss_z1']
shared_columns = {
    'nAA': 11,
    'NCE': 20,
    'instrument': 'QE',
    'rt_pred': 10,
    'charge': 2,
    'protein_name': 'unknown',
    'mobility_pred': 1,
}
precursor_df = pd.DataFrame({
    'sequence': ['ASGHCEWMKYR'] * repeat + ['ASGHCEWMAAR'],
    'mods': ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M'] * repeat + [''],
    'mod_sites': ['0;4;8'] * repeat + [''],
    **shared_columns,
})
precursor_df.loc[0, ['mods', 'mod_sites']] = ['Phospho@S', '2']
frag_mass_df = create_fragment_mz_dataframe(precursor_df, charged_frag_types)

spec_lib = SpecLibBase(charged_frag_types)
spec_lib._precursor_df = precursor_df
spec_lib._fragment_intensity_df = frag_mass_df.copy()
spec_lib._fragment_mz_df = frag_mass_df.copy()

# Reference table: the library flattened in memory with default arguments.
speclib_sdf = speclib_to_single_df(spec_lib)

# Round trip: write the TSV (batch_size=2, no multiprocessing) to a temporary
# file, rewind it, and read it back with pandas.
with tempfile.TemporaryFile('w+') as tsv_file:
    translate_to_tsv(spec_lib, tsv_file, batch_size=2, multiprocessing=False)
    tsv_file.seek(0)
    ddf = pd.read_csv(tsv_file, sep="\t")

# The file must have as many rows as the reference, agree with it on the first
# and last stripped peptide, and come back with an integer charge column.
assert len(ddf) == len(speclib_sdf)
for row in (0, -1):
    assert ddf.StrippedPeptide.values[row] == speclib_sdf.StrippedPeptide.values[row]
assert ddf.PrecursorCharge.dtype in (np.int8, np.int16, np.int32, np.int64)
ddf

11it [00:00, 3438.72it/s]
100%|██████████| 6/6 [00:00<00:00, 76.62it/s]


Unnamed: 0,ModifiedPeptide,PrecursorCharge,Tr_recalibrated,IonMobility,StrippedPeptide,PrecursorMz,FragmentType,FragmentMz,RelativeIntensity,FragmentCharge,FragmentLossType,FragmentNumber
0,_AS(Phospho)GHCEWMKYR_,2,10,1,ASGHCEWMKYR,724.285972,y,1376.527555,1.000000,1,noloss,10
1,_AS(Phospho)GHCEWMKYR_,2,10,1,ASGHCEWMKYR,724.285972,y,1278.550659,0.928823,1,H3PO4,10
2,_AS(Phospho)GHCEWMKYR_,2,10,1,ASGHCEWMKYR,724.285972,b,1273.452993,0.925120,1,noloss,10
3,_AS(Phospho)GHCEWMKYR_,2,10,1,ASGHCEWMKYR,724.285972,y,1209.529195,0.878681,1,noloss,9
4,_AS(Phospho)GHCEWMKYR_,2,10,1,ASGHCEWMKYR,724.285972,y,1152.507732,0.837257,1,noloss,8
...,...,...,...,...,...,...,...,...,...,...,...,...
127,_ASGHCEWMAAR_,2,10,1,ASGHCEWMAAR,609.760775,b,771.287885,0.672160,1,noloss,7
128,_ASGHCEWMAAR_,2,10,1,ASGHCEWMAAR,609.760775,y,763.355571,0.665247,1,noloss,6
129,_ASGHCEWMAAR_,2,10,1,ASGHCEWMAAR,609.760775,y,634.312978,0.552789,1,noloss,5
130,_ASGHCEWMAAR_,2,10,1,ASGHCEWMAAR,609.760775,b,585.208572,0.509996,1,noloss,6
