In [1]:
#---#| default_exp protein.fasta

# Spectral Library from FASTA

In [2]:
import torch # noqa: 401, to prevent crash in Mac Arm

In [3]:
from peptdeep.protein.fasta import *

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [4]:
# Two toy proteins: prot2 is an internal stretch of prot1, so some digested
# peptides are shared between both entries.
prot1 = 'MABCDEKFGHIJKLMNOPQRST'
prot2 = 'FGHIJKLMNOPQR'

# Same layout as before: protein id -> {'protein_id', 'gene_name', 'sequence'},
# with 'xx' inserted before 'yy'.
protein_dict = {
    prot_id: {
        'protein_id': prot_id,
        'gene_name': gene,
        'sequence': prot_seq,
    }
    for prot_id, gene, prot_seq in (
        ('xx', '', prot1),
        ('yy', 'gene', prot2),
    )
}

# No model manager is passed (None): this part of the notebook only digests
# proteins and never predicts.
_lib = PredictSpecLibFasta(None, I_to_L=False, decoy='pseudo_reverse')
_lib.get_peptides_from_protein_dict(protein_dict)
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA
0,MABCDEK,0,0,True,False,,,7
1,LMNOPQR,0;1,0,False,True,,,7
2,LMNOPQRST,0,1,False,True,,,9
3,ABCDEKFGHIJK,0,1,True,False,,,12
4,MABCDEKFGHIJK,0,1,True,False,,,13
5,FGHIJKLMNOPQR,0;1,1,True,True,,,13
6,FGHIJKLMNOPQRST,0,2,False,True,,,15
7,ABCDEKFGHIJKLMNOPQR,0,2,True,False,,,19
8,MABCDEKFGHIJKLMNOPQR,0,2,True,False,,,20


In [5]:
# Display the protein table built from `protein_dict`: one row per protein,
# whose row number is what `protein_idxes` in `precursor_df` refers to.
_lib.protein_df

Unnamed: 0,protein_id,gene_name,sequence
0,xx,,MABCDEKFGHIJKLMNOPQRST
1,yy,gene,FGHIJKLMNOPQR


In [6]:
# Resolve `protein_idxes` into readable names: the displayed frame gains the
# `proteins` and `genes` columns after this call.
_lib.append_protein_name()
# Guard against the column silently not being added.
assert 'proteins' in _lib.precursor_df.columns
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes
0,MABCDEK,0,0,True,False,,,7,xx,
1,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene
2,LMNOPQRST,0,1,False,True,,,9,xx,
3,ABCDEKFGHIJK,0,1,True,False,,,12,xx,
4,MABCDEKFGHIJK,0,1,True,False,,,13,xx,
5,FGHIJKLMNOPQR,0;1,1,True,True,,,13,xx;yy,gene
6,FGHIJKLMNOPQRST,0,2,False,True,,,15,xx,
7,ABCDEKFGHIJKLMNOPQR,0,2,True,False,,,19,xx,
8,MABCDEKFGHIJKLMNOPQR,0,2,True,False,,,20,xx,


In [7]:
#| hide
# Validate the per-peptide flags of the digest against the two input proteins.
precursors = _lib.precursor_df
for row in range(len(precursors)):
    peptide = precursors.sequence[row]

    # is_prot_nterm: the peptide starts a protein, or starts directly after
    # the leading Met of prot1 (M.xxxxx).
    expect_nterm = (
        prot1.startswith(peptide)
        or prot2.startswith(peptide)
        or prot1[1:].startswith(peptide)
    )
    assert bool(precursors.is_prot_nterm[row]) is expect_nterm, peptide

    # is_prot_cterm: the peptide ends one of the proteins.
    expect_cterm = prot1.endswith(peptide) or prot2.endswith(peptide)
    assert bool(precursors.is_prot_cterm[row]) is expect_cterm, peptide

    # protein_idxes / proteins / genes: peptides unique to one protein carry
    # a single index and an empty gene string ...
    if not (peptide in prot1 and peptide in prot2):
        assert ';' not in precursors.protein_idxes[row]
        assert ';' not in precursors.proteins[row]
        assert precursors.genes[row] == ''
        continue
    # ... while shared peptides list both proteins and the only non-empty gene.
    assert precursors.protein_idxes[row] == '0;1'
    assert precursors.proteins[row] == 'xx;yy'
    assert precursors.genes[row] == 'gene'

In [8]:
# Expand every peptide into its modified forms using the library's configured
# modifications. In the displayed result each peptide appears once per
# combination of Oxidation@M / Acetyl@Protein_N-term, and every Cys-containing
# peptide carries Carbamidomethyl@C.
_lib.add_modifications()
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes
0,MABCDEK,0,0,True,False,Oxidation@M;Carbamidomethyl@C,1;4,7,xx,
1,MABCDEK,0,0,True,False,Carbamidomethyl@C,4,7,xx,
2,MABCDEK,0,0,True,False,Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...,0;1;4,7,xx,
3,MABCDEK,0,0,True,False,Acetyl@Protein_N-term;Carbamidomethyl@C,0;4,7,xx,
4,LMNOPQR,0;1,0,False,True,Oxidation@M,2,7,xx;yy,gene
5,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene
6,LMNOPQRST,0,1,False,True,Oxidation@M,2,9,xx,
7,LMNOPQRST,0,1,False,True,,,9,xx,
8,ABCDEKFGHIJK,0,1,True,False,Carbamidomethyl@C,3,12,xx,
9,ABCDEKFGHIJK,0,1,True,False,Acetyl@Protein_N-term;Carbamidomethyl@C,0;3,12,xx,


In [9]:
#| hide
# Validate the modified-peptide table produced by `add_modifications()`.
#
# `mods` and `mod_sites` are ';'-joined strings. They are split into tokens
# before comparing, because substring tests are ambiguous for sites
# ('0' is a substring of '10', '1' of '15').
df = _lib.precursor_df
for i in range(len(df)):
    seq = df.sequence[i]
    mods = df.mods[i]
    sites = df.mod_sites[i]
    mod_list = mods.split(';') if mods else []
    site_list = sites.split(';') if sites else []

    # test fix mods: Carbamidomethyl@C is present iff the peptide has a Cys
    if 'C' in seq:
        assert str(seq.find('C')+1) in site_list, (seq, sites)
        assert 'Carbamidomethyl@C' in mod_list, (seq, mods)
    else:
        assert 'Carbamidomethyl@C' not in mod_list, (seq, mods)

    # test Acetyl@Protein N-term: the modification and site 0 imply each
    # other, and both require a protein N-terminal peptide
    if 'Acetyl@Protein_N-term' in mod_list:
        assert df.is_prot_nterm[i], seq
        assert '0' in site_list, (seq, sites)
    # The original tested `'0' in mods`, which never matched any modification
    # name, so this direction of the check was never executed.
    if '0' in site_list:
        assert df.is_prot_nterm[i], seq
        assert 'Acetyl@Protein_N-term' in mod_list, (seq, mods)
    if not df.is_prot_nterm[i]:
        assert 'Acetyl@Protein_N-term' not in mod_list, (seq, mods)

    # test Oxidation@M: as a variable modification it may sit on any Met of
    # the peptide, not necessarily the first one, so accept any Met position
    if 'Oxidation@M' in mod_list:
        assert 'M' in seq, seq
        met_sites = {str(pos+1) for pos, aa in enumerate(seq) if aa == 'M'}
        assert met_sites & set(site_list), (seq, sites)

    # test unmodified: empty mods and empty sites always come together
    assert (mods == '') == (sites == ''), (seq, mods, sites)

# Exact-token match for site 0 (non-capturing groups avoid pandas'
# "pattern has match groups" warning).
nterm_has_site0 = df[df.is_prot_nterm].mod_sites.str.contains(r'(?:^|;)0(?:;|$)')
# at least one nterm peptide does not contain Acetyl@Protein N-term
assert not nterm_has_site0.all()
# at least one nterm peptide contains Acetyl@Protein N-term
assert nterm_has_site0.any()
# test var mod Oxidation@M: Met peptides exist both with and without it
met_is_oxidized = df[df.sequence.str.contains('M')].mods.str.contains('Oxidation@M')
assert not met_is_oxidized.all()
assert met_is_oxidized.any()
# at least one completely unmodified peptide remains
assert '' in df.mods.values

In [10]:
# Configure the special modifications (phosphorylation on Ser and Thr) and
# apply them on top of the modifications already present in `precursor_df`.
_lib.special_mods = ['Phospho@S','Phospho@T']
_lib.add_special_modifications()
# At least one peptide must have picked up a Phospho modification.
assert _lib.precursor_df.mods.str.contains('Phospho').any()
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes
0,MABCDEK,0,0,True,False,Oxidation@M;Carbamidomethyl@C,1;4,7,xx,
1,MABCDEK,0,0,True,False,Carbamidomethyl@C,4,7,xx,
2,MABCDEK,0,0,True,False,Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...,0;1;4,7,xx,
3,MABCDEK,0,0,True,False,Acetyl@Protein_N-term;Carbamidomethyl@C,0;4,7,xx,
4,LMNOPQR,0;1,0,False,True,Oxidation@M,2,7,xx;yy,gene
5,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene
6,LMNOPQRST,0,1,False,True,Oxidation@M;Phospho@S,2;8,9,xx,
7,LMNOPQRST,0,1,False,True,Oxidation@M;Phospho@T,2;9,9,xx,
8,LMNOPQRST,0,1,False,True,Oxidation@M,2,9,xx,
9,LMNOPQRST,0,1,False,True,Phospho@S,8,9,xx,


In [11]:
#| hide
# Add labeling channels: keys are channel names (they appear in the
# `labeling_channel` column of the result), values are the label
# modifications applied in that channel.
_lib.add_peptide_labeling({
    'none': [], # unlabeled channel, kept as a reference
    'light': ['Dimethyl@Any_N-term','Dimethyl@K'],
    'heavy': ['Dimethyl:2H(6)13C(2)@Any_N-term','Dimethyl:2H(6)13C(2)@K'],
})
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes,labeling_channel
0,MABCDEK,0,0,True,False,Oxidation@M;Carbamidomethyl@C,1;4,7,xx,,none
1,MABCDEK,0,0,True,False,Carbamidomethyl@C,4,7,xx,,none
2,MABCDEK,0,0,True,False,Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...,0;1;4,7,xx,,none
3,MABCDEK,0,0,True,False,Acetyl@Protein_N-term;Carbamidomethyl@C,0;4,7,xx,,none
4,LMNOPQR,0;1,0,False,True,Oxidation@M,2,7,xx;yy,gene,none
...,...,...,...,...,...,...,...,...,...,...,...
115,MABCDEKFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any_N-t...,4;0;7;13,20,xx,,heavy
116,MABCDEKFGHIJKLMNOPQR,0,2,True,False,Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...,0;1;4;7;13,20,xx,,heavy
117,MABCDEKFGHIJKLMNOPQR,0,2,True,False,Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...,0;15;4;7;13,20,xx,,heavy
118,MABCDEKFGHIJKLMNOPQR,0,2,True,False,Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;...,0;1;15;4;7;13,20,xx,,heavy


In [12]:
from peptdeep.pretrained_models import ModelManager

In [13]:
# Build a model manager on CPU and silence its progress output.
model_mgr = ModelManager(device='cpu')
model_mgr.load_installed_models()
model_mgr.verbose = False

# This time the library gets a real model manager so that `predict_all()` can run.
_lib = PredictSpecLibFasta(
    model_mgr, I_to_L=False,
    decoy='pseudo_reverse'
)
# prot1 contains a 'B', which must not survive into the peptide list
# (asserted below); prot2 is an internal stretch of prot1.
prot1 = 'MACDESTYKBKFGHIKLMNPQRST'
prot2 = 'FGHIKLMNPQR'
protein_dict = {
    'xx': {
        'protein_id': 'xx',
        'sequence': prot1
    },
    'yy': {
        'protein_id': 'yy',
        'sequence': prot2
    }
}
_lib.import_and_process_protein_dict(protein_dict)
# Request isotope columns (i_0, i_1, ...) from the prediction step.
_lib.generate_precursor_isotope = True
_lib.verbose = False
_lib.predict_all()

# Decoys were generated ...
assert (_lib.precursor_df.decoy==1).any()
# ... by pseudo-reversal: everything but the last residue of MACDESTYK is reversed.
assert ('MACDESTY'[::-1]+'K') in _lib.precursor_df.sequence.values
# Isotope columns are present.
assert 'i_0' in _lib.precursor_df.columns
assert 'i_1' in _lib.precursor_df.columns
# No peptide contains 'B'. Use `not` instead of `~`: on a plain Python bool
# `~` yields -1/-2 (always truthy), which would make this assertion vacuous.
assert not _lib.precursor_df.sequence.str.contains('B').any()
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,decoy,charge,...,i_5,mono_isotope_idx,rt_pred,rt_norm_pred,ccs_pred,mobility_pred,nce,instrument,frag_start_idx,frag_stop_idx
0,LMNPQRST,0,1,False,True,Oxidation@M,2,8,0,2,...,0.001232,0,0.021263,0.021263,318.941895,0.785035,30.0,Lumos,0,7
1,LMNPQRST,0,1,False,True,,,8,0,2,...,0.001173,0,0.092409,0.092409,317.660034,0.781693,30.0,Lumos,7,14
2,ACDESTYK,0,0,True,False,Carbamidomethyl@C,2,8,0,2,...,0.001409,0,0.032797,0.032797,329.177002,0.810355,30.0,Lumos,14,21
3,ACDESTYK,0,0,True,False,Acetyl@Protein_N-term;Carbamidomethyl@C,0;2,8,0,2,...,0.001604,0,0.109105,0.109105,342.048706,0.842529,30.0,Lumos,21,28
4,SRQPNMLT,0,1,False,True,Oxidation@M,6,8,1,2,...,0.001232,0,0.044289,0.044289,321.865723,0.792231,30.0,Lumos,28,35
5,SRQPNMLT,0,1,False,True,,,8,1,2,...,0.001173,0,0.15833,0.15833,323.465607,0.795979,30.0,Lumos,35,42
6,YTSEDCAK,0,0,True,False,Carbamidomethyl@C,6,8,1,2,...,0.001409,0,0.016274,0.016274,328.83197,0.809506,30.0,Lumos,42,49
7,YTSEDCAK,0,0,True,False,Acetyl@Protein_N-term;Carbamidomethyl@C,0;6,8,1,2,...,0.001604,0,0.119288,0.119288,339.180847,0.835465,30.0,Lumos,49,56
8,MACDESTYK,0,0,True,False,Oxidation@M;Carbamidomethyl@C,1;3,9,0,2,...,0.00349,0,0.048364,0.048364,351.815063,0.867675,30.0,Lumos,56,64
9,MACDESTYK,0,0,True,False,Carbamidomethyl@C,3,9,0,2,...,0.003395,0,0.081848,0.081848,353.857971,0.87256,30.0,Lumos,64,72


In [14]:
# Re-import the same proteins, then add light/heavy dimethyl labeling
# channels before predicting again.
# NOTE(review): the isotope assertions below presumably rely on
# `_lib.generate_precursor_isotope = True` set in the previous cell - confirm.
_lib.import_and_process_protein_dict(protein_dict)
_lib.add_peptide_labeling({
    'light': ['Dimethyl@Any_N-term','Dimethyl@K'],
    'heavy': ['Dimethyl:2H(6)13C(2)@Any_N-term','Dimethyl:2H(6)13C(2)@K'],
})
_lib.predict_all()

# Decoys were generated ...
assert (_lib.precursor_df.decoy==1).any()
# ... by pseudo-reversal: everything but the last residue of MACDESTYK is reversed.
assert ('MACDESTY'[::-1]+'K') in _lib.precursor_df.sequence.values
# Isotope columns are present.
assert 'i_0' in _lib.precursor_df.columns
assert 'i_1' in _lib.precursor_df.columns
# No peptide contains 'B'. Use `not` instead of `~`: on a plain Python bool
# `~` yields -1/-2 (always truthy), which would make this assertion vacuous.
assert not _lib.precursor_df.sequence.str.contains('B').any()
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,decoy,charge,...,i_5,mono_isotope_idx,rt_pred,rt_norm_pred,ccs_pred,mobility_pred,nce,instrument,frag_start_idx,frag_stop_idx
0,LMNPQRST,0,1,False,True,Oxidation@M;Dimethyl@Any_N-term,2;0,8,0,2,...,0.001352,0,0.242660,0.242660,345.390869,0.850135,30.0,Lumos,0,7
1,LMNPQRST,0,1,False,True,Dimethyl:2H(6)13C(2)@Any_N-term,0,8,0,2,...,0.027430,2,0.063860,0.063860,313.133270,0.770554,30.0,Lumos,7,14
2,LMNPQRST,0,1,False,True,Oxidation@M;Dimethyl:2H(6)13C(2)@Any_N-term,2;0,8,0,2,...,0.027954,2,0.017637,0.017637,314.302277,0.773615,30.0,Lumos,14,21
3,SRQPNMLT,0,1,False,True,Oxidation@M;Dimethyl:2H(6)13C(2)@Any_N-term,6;0,8,1,2,...,0.027954,2,0.040846,0.040846,319.400391,0.786163,30.0,Lumos,21,28
4,SRQPNMLT,0,1,False,True,Dimethyl:2H(6)13C(2)@Any_N-term,0,8,1,2,...,0.027430,2,0.152593,0.152593,320.333069,0.788271,30.0,Lumos,28,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,SRQPNMLKIHGFT,0,2,False,True,Dimethyl@Any_N-term;Dimethyl@K,0;8,13,1,2,...,0.005469,0,0.620949,0.620949,430.461243,1.065107,30.0,Lumos,692,704
76,SRQPNMLKIHGFT,0,2,False,True,Oxidation@M;Dimethyl@Any_N-term;Dimethyl@K,6;0;8,13,1,3,...,0.005604,0,0.468698,0.468698,482.796661,0.796481,30.0,Lumos,704,716
77,SRQPNMLKIHGFT,0,2,False,True,Oxidation@M;Dimethyl@Any_N-term;Dimethyl@K,6;0;8,13,1,2,...,0.005604,0,0.468698,0.468698,428.150787,1.059489,30.0,Lumos,716,728
78,FGHIKLMNPQRST,0,2,False,True,Dimethyl:2H(6)13C(2)@Any_N-term;Dimethyl:2H(6)...,0;5,13,0,2,...,0.058123,2,0.206957,0.206957,412.858307,1.021552,30.0,Lumos,728,740
