In [None]:
#| default_exp protein.fasta

# Spectral Library from fasta

In [None]:
#| export
from alphabase.protein.fasta import FastaLib
from peptdeep.spec_lib.predict_lib import PredictSpecLib
from peptdeep.pretrained_models import ModelManager


In [None]:
#| export

class PredictFastaSpecLib(FastaLib, PredictSpecLib):
    """Spectral library that combines `FastaLib` and `PredictSpecLib`.

    The digestion/modification settings are handed to `FastaLib.__init__`,
    the prediction settings to `PredictSpecLib.__init__`. After peptide
    sequences have been loaded and processed, `predict_all()` is triggered
    automatically (see `_process_after_load_pep_seqs`).
    """
    def __init__(self,
        model_manager:ModelManager = None,
        charged_frag_types:list = ['b_z1','b_z2','y_z1','y_z2'],
        protease:str = 'trypsin/P',
        max_missed_cleavages:int = 2,
        peptide_length_min:int = 7,
        peptide_length_max:int = 35,
        precursor_charge_min:int = 2,
        precursor_charge_max:int = 4,
        precursor_mz_min:float = 200.0,
        precursor_mz_max:float = 2000.0,
        var_mods:list = ['Acetyl@Protein N-term','Oxidation@M'],
        max_var_mod_num:int = 2,
        fix_mods:list = ['Carbamidomethyl@C'],
        decoy: str = None, # or pseudo_reverse or diann
        I_to_L:bool = False,
    ):
        """
        Parameters
        ----------
        model_manager : ModelManager, optional
            Forwarded to `PredictSpecLib.__init__`. Defaults to None.

        charged_frag_types : list, optional
            Forwarded to `FastaLib.__init__`.
            Defaults to ['b_z1','b_z2','y_z1','y_z2'].

        protease : str, optional
            Forwarded to `FastaLib.__init__`. Defaults to 'trypsin/P'.

        max_missed_cleavages : int, optional
            Forwarded to `FastaLib.__init__`. Defaults to 2.

        peptide_length_min : int, optional
            Forwarded to `FastaLib.__init__`. Defaults to 7.

        peptide_length_max : int, optional
            Forwarded to `FastaLib.__init__`. Defaults to 35.

        precursor_charge_min : int, optional
            Forwarded to `FastaLib.__init__`. Defaults to 2.

        precursor_charge_max : int, optional
            Forwarded to `FastaLib.__init__`. Defaults to 4.

        precursor_mz_min : float, optional
            Forwarded to `FastaLib.__init__`. Defaults to 200.0.

        precursor_mz_max : float, optional
            Forwarded to `FastaLib.__init__`. Defaults to 2000.0.

        var_mods : list, optional
            Forwarded to `FastaLib.__init__`.
            Defaults to ['Acetyl@Protein N-term','Oxidation@M'].

        max_var_mod_num : int, optional
            Forwarded to `FastaLib.__init__`. Defaults to 2.

        fix_mods : list, optional
            Forwarded to `FastaLib.__init__`.
            Defaults to ['Carbamidomethyl@C'].

        decoy : str, optional
            Forwarded to `FastaLib.__init__`; e.g. 'pseudo_reverse' or
            'diann'. Defaults to None.

        I_to_L : bool, optional
            Forwarded to `FastaLib.__init__`. Defaults to False.
        """
        def _own(value):
            # The list defaults in the signature are created once and shared
            # by every call. Hand the base class a private copy so that a
            # library mutating its list cannot leak into other instances
            # (or into the caller's list). Non-list values pass through.
            return list(value) if isinstance(value, list) else value

        FastaLib.__init__(self,
            charged_frag_types=_own(charged_frag_types),
            protease=protease,
            max_missed_cleavages=max_missed_cleavages,
            peptide_length_min=peptide_length_min,
            peptide_length_max=peptide_length_max,
            precursor_charge_min=precursor_charge_min,
            precursor_charge_max=precursor_charge_max,
            precursor_mz_min=precursor_mz_min,
            precursor_mz_max=precursor_mz_max,
            var_mods=_own(var_mods),
            max_var_mod_num=max_var_mod_num,
            fix_mods=_own(fix_mods),
            decoy=decoy,
            I_to_L=I_to_L,
        )

        # The second initialiser re-uses the values that FastaLib.__init__
        # stored on the instance, so both parents agree on them.
        PredictSpecLib.__init__(self,
            model_manager=model_manager,
            charged_frag_types=self.charged_frag_types,
            precursor_mz_min=self.min_precursor_mz,
            precursor_mz_max=self.max_precursor_mz,
            decoy=self.decoy,
        )

    def _process_after_load_pep_seqs(self):
        """Run `FastaLib`'s post-load processing, then call `predict_all()`."""
        FastaLib._process_after_load_pep_seqs(self)
        self.predict_all()

In [None]:
# Two toy proteins: prot2 is a sub-sequence of prot1, so some peptides
# belong to both entries.
prot1 = 'MABCDEKFGHIJKLMNOPQRST'
prot2 = 'FGHIJKLMNOPQR'
protein_dict = {
    'xx': dict(protein_id='xx', gene_name='', sequence=prot1),
    'yy': dict(protein_id='yy', gene_name='gene', sequence=prot2),
}
# No model manager is passed here: this cell only digests the proteins.
_lib = PredictFastaSpecLib(
    model_manager=None,
    decoy='pseudo_reverse',
    I_to_L=False,
)
_lib.get_peptides_from_protein_dict(protein_dict)
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA
0,MABCDEK,0,0,True,False,,,7
1,LMNOPQR,0;1,0,False,True,,,7
2,LMNOPQRST,0,1,False,True,,,9
3,ABCDEKFGHIJK,0,1,True,False,,,12
4,MABCDEKFGHIJK,0,1,True,False,,,13
5,FGHIJKLMNOPQR,0;1,1,True,True,,,13
6,FGHIJKLMNOPQRST,0,2,False,True,,,15
7,ABCDEKFGHIJKLMNOPQR,0,2,True,False,,,19
8,MABCDEKFGHIJKLMNOPQR,0,2,True,False,,,20


In [None]:
# Display the protein table the library holds after reading `protein_dict`.
_lib.protein_df

Unnamed: 0,protein_id,gene_name,sequence
0,xx,,MABCDEKFGHIJKLMNOPQRST
1,yy,gene,FGHIJKLMNOPQR


In [None]:
# Annotate the peptides with protein information; the assert checks that
# a 'proteins' column is present in `precursor_df` afterwards.
_lib.append_protein_name()
assert 'proteins' in _lib.precursor_df.columns
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes
0,MABCDEK,0,0,True,False,,,7,xx,
1,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene
2,LMNOPQRST,0,1,False,True,,,9,xx,
3,ABCDEKFGHIJK,0,1,True,False,,,12,xx,
4,MABCDEKFGHIJK,0,1,True,False,,,13,xx,
5,FGHIJKLMNOPQR,0;1,1,True,True,,,13,xx;yy,gene
6,FGHIJKLMNOPQRST,0,2,False,True,,,15,xx,
7,ABCDEKFGHIJKLMNOPQR,0,2,True,False,,,19,xx,
8,MABCDEKFGHIJKLMNOPQR,0,2,True,False,,,20,xx,


In [None]:
#| hide
# Check the terminal flags and the protein/gene mapping of every peptide
# against the two toy protein sequences.
pep_df = _lib.precursor_df
for i in range(len(pep_df)):
    seq = pep_df.sequence[i]

    # Protein N-term: the peptide opens prot1 or prot2, or opens prot1
    # once its first residue is dropped (M.xxxxx).
    at_nterm = (
        prot1.startswith(seq)
        or prot2.startswith(seq)
        or prot1[1:].startswith(seq)
    )
    assert bool(pep_df.is_prot_nterm[i]) == at_nterm, seq

    # Protein C-term: the peptide closes prot1 or prot2.
    at_cterm = prot1.endswith(seq) or prot2.endswith(seq)
    assert bool(pep_df.is_prot_cterm[i]) == at_cterm, seq

    # Peptides found in both proteins must list both; all others exactly one.
    in_both = (seq in prot1) and (seq in prot2)
    if not in_both:
        assert ';' not in pep_df.protein_idxes[i]
        assert ';' not in pep_df.proteins[i]
        assert pep_df.genes[i] == ''
    else:
        assert pep_df.protein_idxes[i] == '0;1'
        assert pep_df.proteins[i] == 'xx;yy'
        assert pep_df.genes[i] == 'gene'

In [None]:
# Add modifications to the digested peptides and show the resulting
# `mods` / `mod_sites` columns.
_lib.add_modifications()
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes
0,MABCDEK,0,0,True,False,Carbamidomethyl@C;Oxidation@M,4;1,7,xx,
1,MABCDEK,0,0,True,False,Carbamidomethyl@C,4,7,xx,
2,MABCDEK,0,0,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1,7,xx,
3,MABCDEK,0,0,True,False,Carbamidomethyl@C;Acetyl@Protein N-term,4;0,7,xx,
4,LMNOPQR,0;1,0,False,True,Oxidation@M,2,7,xx;yy,gene
5,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene
6,LMNOPQRST,0,1,False,True,Oxidation@M,2,9,xx,
7,LMNOPQRST,0,1,False,True,,,9,xx,
8,ABCDEKFGHIJK,0,1,True,False,Carbamidomethyl@C,3,12,xx,
9,ABCDEKFGHIJK,0,1,True,False,Carbamidomethyl@C;Acetyl@Protein N-term,3;0,12,xx,


In [None]:
#| hide
# Validate the `mods` / `mod_sites` columns written by add_modifications().
for i in range(len(_lib.precursor_df)):
    seq = _lib.precursor_df.sequence[i]
    mods = _lib.precursor_df.mods[i]
    sites = _lib.precursor_df.mod_sites[i]
    # `mod_sites` is a ';'-joined string. Compare whole tokens, otherwise
    # a substring test lets e.g. '1' match '15' or '0' match '10'.
    site_list = sites.split(';') if sites else []
    # test fix mods
    if 'C' in seq:
        assert str(seq.find('C')+1) in site_list
        assert 'Carbamidomethyl@C' in mods
    else:
        assert 'Carbamidomethyl@C' not in mods
    # test Acetyl@Protein N-term
    if 'Acetyl@Protein N-term' in mods:
        assert _lib.precursor_df.is_prot_nterm[i]
        assert '0' in site_list
    # Converse check: site 0 implies a protein N-term acetylation.
    # (This must look at the sites; the mod names never contain a bare '0'.)
    if '0' in site_list:
        assert _lib.precursor_df.is_prot_nterm[i]
        assert 'Acetyl@Protein N-term' in mods
    if not _lib.precursor_df.is_prot_nterm[i]:
        assert 'Acetyl@Protein N-term' not in mods
    # test Oxidation@M
    if 'Oxidation@M' in mods:
        assert 'M' in seq
        # The variable mod may sit on any M of the peptide, not only the first.
        m_sites = [str(pos+1) for pos, aa in enumerate(seq) if aa == 'M']
        assert any(m_site in site_list for m_site in m_sites)
    # test unmodified
    if mods == '':
        assert sites == ''
    if sites == '':
        assert mods == ''
df = _lib.precursor_df
# Matches site 0 as a whole token of the ';'-joined site string.
nterm_site_pattern = r'(?:^|;)0(?:;|$)'
# at least one nterm peptide does not contain Acetyl@Protein N-term
assert not df[df.is_prot_nterm].mod_sites.str.contains(nterm_site_pattern).all()
# at least one nterm peptide contains Acetyl@Protein N-term
assert df[df.is_prot_nterm].mod_sites.str.contains(nterm_site_pattern).any()
# test var mod Oxidation@M
assert not df[df.sequence.str.contains('M')].mods.str.contains('Oxidation@M').all()
assert df[df.sequence.str.contains('M')].mods.str.contains('Oxidation@M').any()
assert '' in df.mods.values

In [None]:
# Add phosphorylation on S and T on top of the existing modifications;
# the assert checks that at least one peptide now carries a Phospho mod.
_lib.add_additional_modifications(['Phospho@S','Phospho@T'])
assert _lib.precursor_df.mods.str.contains('Phospho').any()
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes
0,MABCDEK,0,0,True,False,Carbamidomethyl@C;Oxidation@M,4;1,7,xx,
1,MABCDEK,0,0,True,False,Carbamidomethyl@C,4,7,xx,
2,MABCDEK,0,0,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1,7,xx,
3,MABCDEK,0,0,True,False,Carbamidomethyl@C;Acetyl@Protein N-term,4;0,7,xx,
4,LMNOPQR,0;1,0,False,True,Oxidation@M,2,7,xx;yy,gene
5,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene
6,LMNOPQRST,0,1,False,True,Oxidation@M;Phospho@S,2;8,9,xx,
7,LMNOPQRST,0,1,False,True,Oxidation@M;Phospho@T,2;9,9,xx,
8,LMNOPQRST,0,1,False,True,Oxidation@M,2,9,xx,
9,LMNOPQRST,0,1,False,True,Phospho@S,8,9,xx,


In [None]:
#| hide
# Add labelling channels: each dict key is a channel name, each value the
# list of label modifications for that channel. The displayed frame gains
# a `label_channel` column.
_lib.add_peptide_labeling({
    'none': [], # not labelled for reference
    'light': ['Dimethyl@Any N-term','Dimethyl@K'],
    'heavy': ['Dimethyl:2H(6)13C(2)@Any N-term','Dimethyl:2H(6)13C(2)@K'],
})
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes,label_channel
0,MABCDEK,0,0,True,False,Carbamidomethyl@C;Oxidation@M,4;1,7,xx,,none
1,MABCDEK,0,0,True,False,Carbamidomethyl@C,4,7,xx,,none
2,MABCDEK,0,0,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1,7,xx,,none
3,MABCDEK,0,0,True,False,Carbamidomethyl@C;Acetyl@Protein N-term,4;0,7,xx,,none
4,LMNOPQR,0;1,0,False,True,Oxidation@M,2,7,xx;yy,gene,none
...,...,...,...,...,...,...,...,...,...,...,...
115,MABCDEKFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any N-t...,4;0;7;13,20,xx,,heavy
116,MABCDEKFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1;7;13,20,xx,,heavy
117,MABCDEKFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;15;7;13,20,xx,,heavy
118,MABCDEKFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1;15;7;13,20,xx,,heavy


In [None]:
from peptdeep.pretrained_models import ModelManager

In [None]:
# Build a library backed by the installed pretrained models, running on CPU.
model_mgr = ModelManager(device='cpu')
model_mgr.load_installed_models()
model_mgr.verbose = False
_lib = PredictFastaSpecLib(
    model_mgr, I_to_L=False,
    decoy='pseudo_reverse'
)
prot1 = 'MACDESTYKBKFGHIKLMNPQRST'
prot2 = 'FGHIKLMNPQR'
protein_dict = {
    'xx': {
        'protein_id': 'xx',
        'sequence': prot1
    },
    'yy': {
        'protein_id': 'yy',
        'sequence': prot2
    }
}
_lib.import_and_process_protein_dict(protein_dict)
# NOTE(review): the recorded log of this cell shows two prediction rounds,
# which suggests import_and_process_protein_dict() already predicts via
# _process_after_load_pep_seqs(); this explicit call may be redundant - confirm.
_lib.predict_all()
assert (_lib.precursor_df.decoy==1).any()
# Decoy of the first peptide: reversed sequence with the last residue kept.
assert ('MACDESTY'[::-1]+'K') in _lib.precursor_df.sequence.values
assert 'isotope_apex_index' in _lib.precursor_df.columns
assert 'isotope_apex_intensity' in _lib.precursor_df.columns
# Use `not` rather than `~`: bitwise inversion of a plain Python bool is a
# non-zero int (always truthy), so the assert would pass unconditionally.
assert not _lib.precursor_df.sequence.str.contains('B').any()
_lib.precursor_df

2022-09-07 10:22:59> Calculating precursor isotope distributions ...
2022-09-07 10:23:00> Predicting RT/IM/MS2 ...
2022-09-07 10:23:00> End Predicting RT/IM/MS2
2022-09-07 10:23:00> Calculating precursor isotope distributions ...
2022-09-07 10:23:00> Predicting RT/IM/MS2 ...
2022-09-07 10:23:00> End Predicting RT/IM/MS2


Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,decoy,charge,...,isotope_apex_mz,isotope_right_most_mz,rt_pred,rt_norm_pred,ccs_pred,mobility_pred,nce,instrument,frag_end_idx,frag_start_idx
0,LMNPQRST,0,1,False,True,Oxidation@M,2,8,0,2,...,481.739834,482.241484,0.021263,0.021263,318.941895,0.785035,30.0,Lumos,7,0
1,LMNPQRST,0,1,False,True,Oxidation@M,2,8,0,3,...,321.495648,321.830081,0.021263,0.021263,374.614014,0.614719,30.0,Lumos,14,7
2,LMNPQRST,0,1,False,True,Oxidation@M,2,8,0,4,...,241.373555,241.624380,0.021263,0.021263,430.954041,0.530385,30.0,Lumos,21,14
3,LMNPQRST,0,1,False,True,,,8,0,2,...,473.742377,474.244027,0.092409,0.092409,317.660065,0.781693,30.0,Lumos,28,21
4,LMNPQRST,0,1,False,True,,,8,0,3,...,316.164010,316.498443,0.092409,0.092409,374.240265,0.613959,30.0,Lumos,35,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,FGHIKLMNPQRST,0,2,False,True,Oxidation@M,7,13,0,3,...,515.604920,516.273787,0.218114,0.218114,473.192261,0.780636,30.0,Lumos,696,684
80,FGHIKLMNPQRST,0,2,False,True,Oxidation@M,7,13,0,4,...,386.955509,387.457159,0.218114,0.218114,605.540649,0.749235,30.0,Lumos,708,696
81,FGHIKLMNPQRST,0,2,False,True,,,13,0,2,...,764.906285,765.909585,0.252718,0.252718,416.934204,1.031637,30.0,Lumos,720,708
82,FGHIKLMNPQRST,0,2,False,True,,,13,0,3,...,510.273282,510.942149,0.252718,0.252718,477.759918,0.788098,30.0,Lumos,732,720


In [None]:
# Reload the proteins, add the light/heavy dimethyl channels and predict
# again so that the labelled precursors get their own predictions.
_lib.import_and_process_protein_dict(protein_dict)
_lib.add_peptide_labeling({
    'light': ['Dimethyl@Any N-term','Dimethyl@K'],
    'heavy': ['Dimethyl:2H(6)13C(2)@Any N-term','Dimethyl:2H(6)13C(2)@K'],
})
_lib.predict_all()
assert (_lib.precursor_df.decoy==1).any()
# Decoy of the first peptide: reversed sequence with the last residue kept.
assert ('MACDESTY'[::-1]+'K') in _lib.precursor_df.sequence.values
assert 'isotope_apex_index' in _lib.precursor_df.columns
assert 'isotope_apex_intensity' in _lib.precursor_df.columns
# Use `not` rather than `~`: bitwise inversion of a plain Python bool is a
# non-zero int (always truthy), so the assert would pass unconditionally.
assert not _lib.precursor_df.sequence.str.contains('B').any()
_lib.precursor_df

2022-09-07 10:23:00> Calculating precursor isotope distributions ...
2022-09-07 10:23:00> Predicting RT/IM/MS2 ...
2022-09-07 10:23:01> End Predicting RT/IM/MS2
2022-09-07 10:23:01> Calculating precursor isotope distributions ...
2022-09-07 10:23:02> Predicting RT/IM/MS2 ...
2022-09-07 10:23:02> End Predicting RT/IM/MS2


Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,decoy,charge,...,isotope_right_most_mz,rt_pred,rt_norm_pred,ccs_pred,mobility_pred,nce,instrument,label_channel,frag_end_idx,frag_start_idx
0,LMNPQRST,0,1,False,True,Oxidation@M;Dimethyl@Any N-term,2;0,8,0,2,...,496.257134,0.242660,0.242660,345.390900,0.850475,30.0,Lumos,light,7,0
1,YTSEDCAK,0,0,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Dimeth...,6;0;8,8,1,2,...,526.744975,0.106988,0.106988,347.019012,0.855165,30.0,Lumos,heavy,14,7
2,YTSEDCAK,0,0,True,False,Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any N-t...,6;0;8,8,1,4,...,262.392402,0.009153,0.009153,459.639069,0.566321,30.0,Lumos,heavy,21,14
3,YTSEDCAK,0,0,True,False,Carbamidomethyl@C;Dimethyl:2H(6)13C(2)@Any N-t...,6;0;8,8,1,2,...,523.777528,0.009153,0.009153,331.465332,0.816775,30.0,Lumos,heavy,28,21
4,SRQPNMLT,0,1,False,True,Dimethyl:2H(6)13C(2)@Any N-term,0,8,1,4,...,246.644569,0.152593,0.152593,425.942719,0.524368,30.0,Lumos,heavy,35,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,SRQPNMLKIHGFT,0,2,False,True,Oxidation@M;Dimethyl@Any N-term;Dimethyl@K,6;0;8,13,1,4,...,401.472809,0.468698,0.468698,574.154480,0.710622,30.0,Lumos,light,1440,1428
164,SRQPNMLKIHGFT,0,2,False,True,Oxidation@M;Dimethyl@Any N-term;Dimethyl@K,6;0;8,13,1,3,...,534.961320,0.468698,0.468698,482.796692,0.796729,30.0,Lumos,light,1452,1440
165,SRQPNMLKIHGFT,0,2,False,True,Oxidation@M;Dimethyl@Any N-term;Dimethyl@K,6;0;8,13,1,2,...,801.938342,0.468698,0.468698,428.150726,1.059819,30.0,Lumos,light,1464,1452
166,FGHIKLMNPQRST,0,2,False,True,Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)...,0;5,13,0,3,...,534.992596,0.206957,0.206957,478.660187,0.789903,30.0,Lumos,heavy,1476,1464
