In [None]:
#---#| default_exp protein.fasta

# Protein and Peptide Processing

In [None]:
import numpy as np
import pandas as pd

from alphabase.protein.fasta import get_uniprot_gene_name, protease_dict, Digest, get_fix_mods, get_candidate_sites, \
    get_var_mod_sites, get_var_mods_per_sites_multi_mods_on_aa, get_var_mods, get_var_mods_per_sites_single_mod_on_aa, \
    parse_term_mod, parse_labels, add_single_peptide_labeling, create_labeling_peptide_df, protein_idxes_to_names, \
    append_special_modifications, SpecLibFasta

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [None]:
#| hide
# A UniProt FASTA header carrying `GN=ERVMER34-1`; that value is the expected gene name.
_uniprot_header = 'sp|Q9H9K5|MER34_HUMAN Endogenous retroviral envelope protein HEMO OS=Homo sapiens OX=9606 GN=ERVMER34-1 PE=1 SV=1'
assert get_uniprot_gene_name(_uniprot_header) == 'ERVMER34-1'

In [None]:
# The protease rule used by the digestion tests below must be registered.
_required_protease = 'trypsin_not_p'
assert _required_protease in protease_dict, 'trypsin_not_p not in protease_dict, why?'

In [None]:
#| hide
# Cut positions reported by the "trypsin_not_p" rule for a toy sequence.
digest = Digest(protease="trypsin_not_p")

# `idx` is a visual ruler (last digit of each 0-based position) aligned with `seq`.
idx = '0123456789012345678901234567890123456789012345'
seq = 'ABDNGKENGLANGIXHGRKTNGLANGKVHNAKHNARKANGKPFAAT'
cut_pos = digest.get_cut_positions(seq)
# Expected: both sequence ends plus the position after each K/R, except the
# K at index 40, which is followed by P and must not produce a cut.
expected_cut_pos = [0, 6, 18, 19, 27, 32, 36, 37, len(seq)]
# np.array_equal also compares shapes, so a result of the wrong length fails
# the assertion cleanly instead of broadcasting or raising inside `==`.
assert np.array_equal(cut_pos, expected_cut_pos)

In [None]:
#| hide
# Digest a toy protein with a default-configured Digest and check the
# consistency of the four parallel lists it returns.
seq = 'MABCDEKHIJKLNOPQRST'
digest = Digest()
seq_list, miss_list, nterm_list, cterm_list = digest.cleave_sequence(seq)

# one entry per peptide in every returned list
assert len({len(seq_list), len(miss_list), len(nterm_list), len(cterm_list)}) == 1

# Every peptide starting with 'M' is matched by a second N-terminal peptide
# (the same peptide without the leading M, see the displayed list), hence the factor 2.
M_start_seqs = [pep for pep in seq_list if pep.startswith('M')]
n_nterm_flags = sum(1 for flag in nterm_list if flag)
assert 2 * len(M_start_seqs) == n_nterm_flags
# the trailing entries are all flagged as protein N-terminal
assert np.all(nterm_list[-len(M_start_seqs):])

# peptides ending with the last residue 'T' are exactly the C-terminal ones
T_end_seqs = [pep for pep in seq_list if pep.endswith('T')]
n_cterm_flags = sum(1 for flag in cterm_list if flag)
assert len(T_end_seqs) == n_cterm_flags
seq_list

['MABCDEK',
 'MABCDEKHIJK',
 'MABCDEKHIJKLNOPQR',
 'HIJKLNOPQR',
 'HIJKLNOPQRST',
 'LNOPQR',
 'LNOPQRST',
 'ABCDEK',
 'ABCDEKHIJK',
 'ABCDEKHIJKLNOPQR']

In [None]:
# A fixed modification on 'C' must be reported for every C in the sequence,
# with 1-based sites joined by ';'.
seq = 'ACBCDCK'
_fix_mod_dict = {'C': 'mod@C'}
mods, mod_sites = get_fix_mods(seq, 'C', _fix_mod_dict)
assert mods == 'mod@C;mod@C;mod@C'
assert mod_sites == ';'.join(str(pos) for pos in (2, 4, 6))
# repeat the call as the last expression so the cell displays the result
get_fix_mods(seq, 'C', _fix_mod_dict)

('mod@C;mod@C;mod@C', '2;4;6')

In [None]:
# Candidate sites are the 1-based positions of M/S/T/Y in the sequence.
seq = 'AMCMSTYK'
candidate_sites = get_candidate_sites(seq, 'MSTY')
# np.array_equal checks shape and values, so a result of the wrong length
# fails the assertion instead of broadcasting or raising inside `==`.
assert np.array_equal(candidate_sites, [2, 4, 5, 6, 7])

# Site combinations requested with arguments (0, 3, 20); judging by the
# expected values these are min/max mods per peptide and a cap on the number
# of combinations — NOTE(review): confirm against the function signature.
sites = get_var_mod_sites(seq, 'MSTY', 0, 3, 20)
ground_truth = [
    # single sites
    (2,), (4,), (5,), (6,), (7,),
    # pairs
    (2, 4), (2, 5), (2, 6), (2, 7),
    (4, 5), (4, 6), (4, 7),
    (5, 6), (5, 7),
    (6, 7),
    # first triples
    (2, 4, 5), (2, 4, 6), (2, 4, 7), (2, 5, 6), (2, 5, 7),
]

for s in ground_truth:
    # include the missing combination in the failure message
    assert s in sites, s

In [None]:
#| hide
# NOTE(review): this binds a notebook-level name only; the imported
# `get_var_mods` resolves its helpers in its own module, so this assignment
# presumably has no effect on the calls below — confirm whether it is still needed.
get_var_mods_per_sites = get_var_mods_per_sites_multi_mods_on_aa
seq = 'AMCMSTYK'
candidate_sites = get_candidate_sites(seq, 'MSTY')
mod_sites_list = get_var_mod_sites(seq, 'MSTY', 0, 3, 20)
# 'S' carries two alternative modifications, every other residue a single one
_mod_dict = dict(
    M=['mod@M'],
    S=['mod@S', 'modX@S'],
    T=['mod@T'],
    Y=['mod@Y'],
)
_mods, _sites = get_var_mods(seq, 'MSTY', _mod_dict, 0, 3, 16)

# both S variants must appear in the triple combination on sites 2;4;5
for _expected_mods in ('mod@M;mod@M;mod@S', 'mod@M;mod@M;modX@S'):
    assert _expected_mods in _mods
    assert _sites[_mods.index(_expected_mods)] == '2;4;5'
# ... and in the pair combinations
for _expected_mods in ('mod@M;mod@S', 'mod@M;modX@S'):
    assert _expected_mods in _mods
# repeat the call as the last expression so the cell displays the result
get_var_mods(seq, 'MSTY', _mod_dict, 0, 3, 16)

(['mod@M',
  'mod@M',
  'mod@S',
  'modX@S',
  'mod@T',
  'mod@Y',
  'mod@M;mod@M',
  'mod@M;mod@S',
  'mod@M;modX@S',
  'mod@M;mod@T',
  'mod@M;mod@Y',
  'mod@M;mod@S',
  'mod@M;modX@S',
  'mod@M;mod@T',
  'mod@M;mod@Y',
  'mod@S;mod@T',
  'modX@S;mod@T',
  'mod@S;mod@Y',
  'modX@S;mod@Y',
  'mod@T;mod@Y',
  'mod@M;mod@M;mod@S',
  'mod@M;mod@M;modX@S',
  ''],
 ['2',
  '4',
  '5',
  '5',
  '6',
  '7',
  '2;4',
  '2;5',
  '2;5',
  '2;6',
  '2;7',
  '4;5',
  '4;5',
  '4;6',
  '4;7',
  '5;6',
  '5;6',
  '5;7',
  '5;7',
  '6;7',
  '2;4;5',
  '2;4;5',
  ''])

In [None]:
#| hide
# NOTE(review): this binds a notebook-level name only; the imported
# `get_var_mods` resolves its helpers in its own module, so this assignment
# presumably has no effect on the calls below — confirm whether it is still needed.
get_var_mods_per_sites = get_var_mods_per_sites_single_mod_on_aa
seq = 'AMCMSTYK'
candidate_sites = get_candidate_sites(seq, 'MSTY')
mod_sites_list = get_var_mod_sites(seq, 'MSTY', 1, 3, 20)
# exactly one modification per residue type
_mod_dict = dict(M='mod@M', S='mod@S', T='mod@T', Y='mod@Y')
# with at least one modification requested, exactly 16 combinations come back
_mods, _sites = get_var_mods(seq, 'MSTY', _mod_dict, 1, 3, 16)
assert (len(_mods), len(_sites)) == (16, 16)
_triple_idx = _mods.index('mod@M;mod@M;mod@S')
assert _sites[_triple_idx] == '2;4;5'
# displayed with a minimum of 0 modifications, so the output also lists the
# unmodified form ('')
get_var_mods(seq, 'MSTY', _mod_dict, 0, 3, 16)

(['mod@M',
  'mod@M',
  'mod@S',
  'mod@T',
  'mod@Y',
  'mod@M;mod@M',
  'mod@M;mod@S',
  'mod@M;mod@T',
  'mod@M;mod@Y',
  'mod@M;mod@S',
  'mod@M;mod@T',
  'mod@M;mod@Y',
  'mod@S;mod@T',
  'mod@S;mod@Y',
  'mod@T;mod@Y',
  'mod@M;mod@M;mod@S',
  ''],
 ['2',
  '4',
  '5',
  '6',
  '7',
  '2;4',
  '2;5',
  '2;6',
  '2;7',
  '4;5',
  '4;6',
  '4;7',
  '5;6',
  '5;7',
  '6;7',
  '2;4;5',
  ''])

In [None]:
#| hide
# Terminal modification name -> expected (amino acid, terminus) pair; the
# amino acid part is empty when the name has no `AA^` prefix before the terminus.
_term_mod_cases = (
    ('Acetyl@Protein_N-term', ('', 'Protein_N-term')),
    ('Gln->pyro-Glu@Q^Any_N-term', ('Q', 'Any_N-term')),
)
for _term_mod, _expected_parts in _term_mod_cases:
    assert parse_term_mod(_term_mod) == _expected_parts

In [None]:
#| hide
# Labels: one on every peptide N-terminus and one on each K.
labels = ['label@Any_N-term','label@K']
label_aas, label_mod_dict, nterm_label_mod, cterm_label_mod = parse_labels(labels)

# (sequence, existing mods, existing mod sites) -> expected (mods, mod_sites).
# An N-terminal modification that is already present is kept in place of the label.
_single_peptide_cases = [
    (('ABCK', '', ''), ('label@Any_N-term;label@K', '0;4')),
    (('ABCK', 'Mod@Any_N-term', '0'), ('Mod@Any_N-term;label@K', '0;4')),
    (('KBCK', '', ''), ('label@Any_N-term;label@K;label@K', '0;1;4')),
    (('KBCK', 'Mod@Any_N-term', '0'), ('Mod@Any_N-term;label@K;label@K', '0;1;4')),
]
for _peptide_args, _expected_labeling in _single_peptide_cases:
    assert add_single_peptide_labeling(
        *_peptide_args, label_aas, label_mod_dict,
        nterm_label_mod, cterm_label_mod
    ) == _expected_labeling

# Same checks at DataFrame level.
pep_df = pd.DataFrame(dict(
    sequence=['ABCD','ABCK','KABK','EFGK'],
    mods=['', '', '', 'Mod@Any_N-term'],
    mod_sites=['', '', '', '0'],
))
df = create_labeling_peptide_df(pep_df, labels)
# every peptide's mods string changes once labels are applied
assert np.all(df.mods.values!=pep_df.mods.values)
_expected_by_sequence = {
    'ABCD': ('label@Any_N-term', '0'),
    'ABCK': ('label@Any_N-term;label@K', '0;4'),
    'KABK': ('label@Any_N-term;label@K;label@K', '0;1;4'),
    'EFGK': ('Mod@Any_N-term;label@K', '0;4'),
}
for _pep_seq, (_exp_mods, _exp_sites) in _expected_by_sequence.items():
    _rows = df[df.sequence==_pep_seq]
    assert _rows.mods.values[0] == _exp_mods
    assert _rows.mod_sites.values[0] == _exp_sites

# with no labels the modifications are left as they were
df = create_labeling_peptide_df(pep_df, [])
assert np.all(df.mods.values==pep_df.mods.values)

In [None]:
#| hide
# ';'-joined protein indexes are mapped to ';'-joined names; an empty name
# (index 1 in the first case) is left out of the result.
_idx_to_name_cases = (
    (['A','','B'], 'A'),
    (['A','C','B'], 'A;C'),
)
for _protein_names, _expected_names in _idx_to_name_cases:
    assert protein_idxes_to_names('0;1', _protein_names)==_expected_names

### Testing

In [None]:
# Three peptides; the second already carries an N-terminal acetylation.
df = pd.DataFrame(dict(
    sequence=['ABSTY','ACXSX','ACDEFG'],
    mods=['', 'Acetyl@Protein_N-term', ''],
    mod_sites=['', '0', ''],
))
# min_mod_num=0 keeps the form without a special modification next to the
# singly modified ones (the expected values show Phospho on S/T/Y).
df = append_special_modifications(df, min_mod_num=0)
for _pep_seq, _n_rows in (('ABSTY', 4), ('ACXSX', 2), ('ACDEFG', 1)):
    assert (df.sequence==_pep_seq).sum()==_n_rows
_absty = df[df.sequence=='ABSTY']
assert _absty.mods.tolist() == ['Phospho@S','Phospho@T','Phospho@Y','']
assert _absty.mod_sites.tolist() == ['3','4','5','']
# the existing N-terminal modification is kept in front of the special one
_acxsx = df[df.sequence=='ACXSX']
assert _acxsx.mods.tolist() == ['Acetyl@Protein_N-term;Phospho@S','Acetyl@Protein_N-term']
assert _acxsx.mod_sites.tolist() == ['0;4','0']
df

Unnamed: 0,sequence,mods,mod_sites
0,ABSTY,Phospho@S,3
1,ABSTY,Phospho@T,4
2,ABSTY,Phospho@Y,5
3,ABSTY,,
4,ACXSX,Acetyl@Protein_N-term;Phospho@S,0;4
5,ACXSX,Acetyl@Protein_N-term,0
6,ACDEFG,,


In [None]:
# Same three peptides as in the min_mod_num=0 case.
df = pd.DataFrame(dict(
    sequence=['ABSTY','ACXSX','ACDEFG'],
    mods=['', 'Acetyl@Protein_N-term', ''],
    mod_sites=['', '0', ''],
))
# min_mod_num=1: forms without a special modification are dropped, so the
# peptide without any modifiable residue disappears entirely.
df = append_special_modifications(df, min_mod_num=1)
for _pep_seq, _n_rows in (('ABSTY', 3), ('ACXSX', 1), ('ACDEFG', 0)):
    assert (df.sequence==_pep_seq).sum()==_n_rows
df

Unnamed: 0,sequence,mods,mod_sites
0,ABSTY,Phospho@S,3
1,ABSTY,Phospho@T,4
2,ABSTY,Phospho@Y,5
3,ACXSX,Acetyl@Protein_N-term;Phospho@S,0;4


In [None]:
# Same three peptides as in the min_mod_num=0 case.
df = pd.DataFrame(dict(
    sequence=['ABSTY','ACXSX','ACDEFG'],
    mods=['', 'Acetyl@Protein_N-term', ''],
    mod_sites=['', '0', ''],
))
# exactly two special modifications per peptide: only ABSTY has enough
# modifiable residues (three pairs out of S, T, Y).
df = append_special_modifications(df, min_mod_num=2, max_mod_num=2)
for _pep_seq, _n_rows in (('ABSTY', 3), ('ACXSX', 0), ('ACDEFG', 0)):
    assert (df.sequence==_pep_seq).sum()==_n_rows
df

Unnamed: 0,sequence,mods,mod_sites
0,ABSTY,Phospho@S;Phospho@T,3;4
1,ABSTY,Phospho@S;Phospho@Y,3;5
2,ABSTY,Phospho@T;Phospho@Y,4;5


In [None]:
def get_protein_dict():
    """Return a fresh two-protein dictionary used as test input.

    Keys are the protein ids 'xx' and 'yy'; each value is a dict with the
    keys 'protein_id', 'gene_name' and 'sequence'. Only 'yy' has a non-empty
    gene name, and its sequence is a substring of the 'xx' sequence.
    """
    entries = (
        # (protein_id, gene_name, sequence)
        ('xx', '', 'MABCDESTKAFGHIJKLMNOPQRAFGHIJK'),
        ('yy', 'gene', 'AFGHIJKLMNOPQR'),
    )
    return {
        protein_id: {
            'protein_id': protein_id,
            'gene_name': gene_name,
            'sequence': sequence,
        }
        for protein_id, gene_name, sequence in entries
    }

In [None]:
# Build a library (first positional argument left as None, no I->L
# conversion) and digest the two toy proteins into peptides.
# NOTE(review): `decoy='pseudo_reverse'` is configured here, but the table
# displayed for this cell has no `decoy` column — presumably decoys are only
# appended by the full import pipeline; confirm.
_lib = SpecLibFasta(None, I_to_L=False, decoy='pseudo_reverse')
protein_dict = get_protein_dict()
_lib.get_peptides_from_protein_dict(protein_dict)
# last expression: display the resulting peptide table
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA
0,AFGHIJK,0;1,0,True,True,,,7
1,LMNOPQR,0;1,0,False,True,,,7
2,ABCDESTK,0,0,True,False,,,8
3,MABCDESTK,0,0,True,False,,,9
4,AFGHIJKLMNOPQR,0;1,1,True,True,,,14
5,LMNOPQRAFGHIJK,0,1,False,True,,,14
6,ABCDESTKAFGHIJK,0,1,True,False,,,15
7,MABCDESTKAFGHIJK,0,1,True,False,,,16
8,AFGHIJKLMNOPQRAFGHIJK,0,2,False,True,,,21
9,ABCDESTKAFGHIJKLMNOPQR,0,2,True,False,,,22


In [None]:
# Display the protein table of the library (one row per entry of protein_dict).
_lib.protein_df

Unnamed: 0,protein_id,gene_name,sequence
0,xx,,MABCDESTKAFGHIJKLMNOPQRAFGHIJK
1,yy,gene,AFGHIJKLMNOPQR


In [None]:
# Annotate the precursor table in place with protein names; the displayed
# table also shows a `genes` column after this call.
_lib.append_protein_name()
assert 'proteins' in _lib.precursor_df.columns
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes
0,AFGHIJK,0;1,0,True,True,,,7,xx;yy,gene
1,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene
2,ABCDESTK,0,0,True,False,,,8,xx,
3,MABCDESTK,0,0,True,False,,,9,xx,
4,AFGHIJKLMNOPQR,0;1,1,True,True,,,14,xx;yy,gene
5,LMNOPQRAFGHIJK,0,1,False,True,,,14,xx,
6,ABCDESTKAFGHIJK,0,1,True,False,,,15,xx,
7,MABCDESTKAFGHIJK,0,1,True,False,,,16,xx,
8,AFGHIJKLMNOPQRAFGHIJK,0,2,False,True,,,21,xx,
9,ABCDESTKAFGHIJKLMNOPQR,0,2,True,False,,,22,xx,


In [None]:
#| hide
# Re-derive the terminal flags and protein assignments of every peptide from
# the two protein sequences and compare them with the precursor table.
prot1, prot2 = (protein_dict[_pid]['sequence'] for _pid in ('xx', 'yy'))
_precursors = _lib.precursor_df
for i in range(len(_precursors)):
    seq = _precursors.sequence[i]

    # is_prot_nterm: the peptide starts a protein, or starts prot1 once its
    # first residue is removed (M.xxxxx)
    _starts_protein = prot1.startswith(seq) or prot2.startswith(seq)
    _starts_after_first_residue = prot1[1:].startswith(seq)
    if _starts_protein or _starts_after_first_residue:
        assert _precursors.is_prot_nterm[i], seq
    else:
        assert not _precursors.is_prot_nterm[i], seq

    # is_prot_cterm: the peptide ends one of the proteins
    _ends_protein = prot1.endswith(seq) or prot2.endswith(seq)
    assert bool(_precursors.is_prot_cterm[i]) == _ends_protein, seq

    # protein_idxes / proteins / genes: shared peptides list both proteins,
    # and only 'yy' contributes a gene name
    _shared = all(seq in _prot for _prot in (prot1, prot2))
    if not _shared:
        assert ';' not in _precursors.protein_idxes[i]
        assert ';' not in _precursors.proteins[i]
        assert _precursors.genes[i] == ''
        continue
    assert _precursors.protein_idxes[i] == '0;1'
    assert _precursors.proteins[i] == 'xx;yy'
    assert _precursors.genes[i] == 'gene'

In [None]:
# Expand the precursor table with modified peptide forms. No modifications
# were passed to the constructor; the displayed table shows
# Carbamidomethyl@C, Oxidation@M and Acetyl@Protein_N-term —
# presumably the library defaults (TODO confirm).
_lib.add_modifications()
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes
0,AFGHIJK,0;1,0,True,True,,,7,xx;yy,gene
1,AFGHIJK,0;1,0,True,True,Acetyl@Protein_N-term,0,7,xx;yy,gene
2,LMNOPQR,0;1,0,False,True,Oxidation@M,2,7,xx;yy,gene
3,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene
4,ABCDESTK,0,0,True,False,Carbamidomethyl@C,3,8,xx,
5,ABCDESTK,0,0,True,False,Acetyl@Protein_N-term;Carbamidomethyl@C,0;3,8,xx,
6,MABCDESTK,0,0,True,False,Oxidation@M;Carbamidomethyl@C,1;4,9,xx,
7,MABCDESTK,0,0,True,False,Carbamidomethyl@C,4,9,xx,
8,MABCDESTK,0,0,True,False,Acetyl@Protein_N-term;Oxidation@M;Carbamidomet...,0;1;4,9,xx,
9,MABCDESTK,0,0,True,False,Acetyl@Protein_N-term;Carbamidomethyl@C,0;4,9,xx,


In [None]:
#| hide
# Check every modified precursor against the rules implied by its sequence.
# `mods` and `mod_sites` are parallel ';'-joined strings; they are split so
# that sites are compared as whole tokens (a substring test would let '1'
# match '18', or '0' match '10').
for i in range(len(_lib.precursor_df)):
    seq = _lib.precursor_df.sequence[i]
    mods = _lib.precursor_df.mods[i]
    sites = _lib.precursor_df.mod_sites[i]
    mod_list = mods.split(';')
    site_list = sites.split(';')
    assert len(mod_list) == len(site_list), seq
    # test fix mods: the first C (like every C) must be carbamidomethylated
    if 'C' in seq:
        assert str(seq.find('C')+1) in site_list
        assert 'Carbamidomethyl@C' in mods
    else:
        assert 'Carbamidomethyl@C' not in mods
    # test Acetyl@Protein_N-term: present <=> site 0 is used, and only on
    # protein N-terminal peptides
    if 'Acetyl@Protein_N-term' in mods:
        assert _lib.precursor_df.is_prot_nterm[i]
        assert '0' in site_list
    # Site 0 is looked up in the sites (the original tested `mods`, whose
    # names contain no '0', so these assertions never ran).
    if '0' in site_list:
        assert _lib.precursor_df.is_prot_nterm[i]
        assert 'Acetyl@Protein_N-term' in mods
    if not _lib.precursor_df.is_prot_nterm[i]:
        assert 'Acetyl@Protein_N-term' not in mods
    # test Oxidation@M: every oxidation site must point at an M (1-based)
    if 'Oxidation@M' in mods:
        assert 'M' in seq
        for mod, site in zip(mod_list, site_list):
            if mod == 'Oxidation@M':
                assert seq[int(site)-1] == 'M', (seq, mods, sites)
    # test unmodified: mods and sites are empty together
    if mods == '':
        assert sites == ''
    if sites == '':
        assert mods == ''
df = _lib.precursor_df
# matches a '0' only when it is a complete ';'-separated token
_site0_pattern = r'(?:^|;)0(?:;|$)'
# at least one nterm peptide does not contain Acetyl@Protein_N-term
assert not df[df.is_prot_nterm].mod_sites.str.contains(_site0_pattern).all()
# at least one nterm peptide contains Acetyl@Protein_N-term
assert df[df.is_prot_nterm].mod_sites.str.contains(_site0_pattern).any()
# test var mod Oxidation@M: some, but not all, M-containing peptides are oxidized
assert not df[df.sequence.str.contains('M')].mods.str.contains('Oxidation@M').all()
assert df[df.sequence.str.contains('M')].mods.str.contains('Oxidation@M').any()
# the unmodified form is kept as well
assert '' in df.mods.values

In [None]:
#| hide
# Call add_peptide_labeling with three named channels, each mapped to the
# list of label modifications for that channel.
# NOTE(review): the result is not inspected in this cell — presumably a
# smoke test that the call runs; confirm.
_lib.add_peptide_labeling({
    'none': [], # no label modifications: unlabeled reference channel
    'light': ['Dimethyl@Any_N-term','Dimethyl@K'],
    'heavy': ['Dimethyl:2H(6)13C(2)@Any_N-term','Dimethyl:2H(6)13C(2)@K'],
})

In [None]:
# Full pipeline on two toy proteins: digestion, decoys, charge states and
# precursor isotope patterns.
_lib = SpecLibFasta(
    ['b_z1','y_z1'], I_to_L=False,
    decoy='pseudo_reverse',
    precursor_mz_min=200,
)
prot1 = 'MACDESTYKBKFGHIKLMNPQRST'
prot2 = 'FGHIKLMNPQR'
protein_dict = {
    'xx': {
        'protein_id': 'xx',
        'sequence': prot1
    },
    'yy': {
        'protein_id': 'yy',
        'sequence': prot2
    }
}
_lib.import_and_process_protein_dict(protein_dict)
_lib.calc_precursor_isotope()
# both ends of the configured charge range are populated
assert (_lib.precursor_df.charge == _lib.min_precursor_charge).any()
assert (_lib.precursor_df.charge == _lib.max_precursor_charge).any()
# decoys exist; a pseudo-reversed peptide keeps its last residue in place
assert (_lib.precursor_df.decoy==1).any()
assert ('MACDESTY'[::-1]+'K') in _lib.precursor_df.sequence.values
# isotope intensity columns i_0 .. i_5 were added
assert 'i_0' in _lib.precursor_df.columns
assert 'i_5' in _lib.precursor_df.columns
# No remaining peptide may contain 'B'. `not` is used instead of `~`: on a
# plain Python bool `~True` is -2 (truthy), which would make the assertion
# pass unconditionally; `not` is correct for both bool and numpy.bool_.
assert not _lib.precursor_df.sequence.str.contains('B').any()

In [None]:
# Display the processed precursor table (per-charge m/z plus isotope columns).
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,decoy,charge,precursor_mz,i_0,i_1,i_2,i_3,i_4,i_5,mono_isotope_idx
0,LMNPQRST,0,1,False,True,Oxidation@M,2,8,0,2,481.739834,0.578990,0.277229,0.106387,0.029553,0.006609,0.001232,0
1,LMNPQRST,0,1,False,True,Oxidation@M,2,8,0,3,321.495648,0.578990,0.277229,0.106387,0.029553,0.006609,0.001232,0
2,LMNPQRST,0,1,False,True,Oxidation@M,2,8,0,4,241.373555,0.578990,0.277229,0.106387,0.029553,0.006609,0.001232,0
3,LMNPQRST,0,1,False,True,,,8,0,2,473.742377,0.580391,0.277678,0.105346,0.029013,0.006397,0.001173,0
4,LMNPQRST,0,1,False,True,,,8,0,3,316.164010,0.580391,0.277678,0.105346,0.029013,0.006397,0.001173,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,FGHIKLMNPQRST,0,2,False,True,Oxidation@M,7,13,0,3,515.604920,0.405148,0.335637,0.170482,0.064313,0.019475,0.004945,0
80,FGHIKLMNPQRST,0,2,False,True,Oxidation@M,7,13,0,4,386.955509,0.405148,0.335637,0.170482,0.064313,0.019475,0.004945,0
81,FGHIKLMNPQRST,0,2,False,True,,,13,0,2,764.906285,0.406114,0.336283,0.169926,0.063710,0.019148,0.004819,0
82,FGHIKLMNPQRST,0,2,False,True,,,13,0,3,510.273282,0.406114,0.336283,0.169926,0.063710,0.019148,0.004819,0


In [None]:
# Re-run the pipeline on the same proteins, then add light/heavy dimethyl
# labeling before computing isotope patterns.
_lib.import_and_process_protein_dict(protein_dict)
_lib.add_peptide_labeling({
    'light': ['Dimethyl@Any_N-term','Dimethyl@K'],
    'heavy': ['Dimethyl:2H(6)13C(2)@Any_N-term','Dimethyl:2H(6)13C(2)@K'],
})
_lib.calc_precursor_isotope()
# decoys and the pseudo-reversed peptide are still present after labeling
assert (_lib.precursor_df.decoy==1).any()
assert ('MACDESTY'[::-1]+'K') in _lib.precursor_df.sequence.values
assert 'i_0' in _lib.precursor_df.columns
assert 'i_5' in _lib.precursor_df.columns
# `not` instead of `~`: `~True` on a plain Python bool is -2 (truthy) and
# would make this assertion pass unconditionally.
assert not _lib.precursor_df.sequence.str.contains('B').any()

Test fix mods

In [None]:
# Fixed Dimethyl on K and on every peptide N-terminus, variable Oxidation@M,
# no special modifications.
fasta_lib = SpecLibFasta(
    var_mods=["Oxidation@M"],
    fix_mods=["Dimethyl@K", "Dimethyl@Any_N-term"],
    special_mods=[],
)
protein_dict = get_protein_dict()
fasta_lib.import_and_process_protein_dict(protein_dict)
_mods_col = fasta_lib.precursor_df.mods
# the fixed N-terminal modification is on every precursor ...
assert _mods_col.str.contains('Dimethyl@Any_N-term').all()
# ... while the variable one appears on at least one
assert _mods_col.str.contains('Oxidation@M').any()

Test min_var_mod_num

In [None]:
# Require one or two variable modifications per peptide.
fasta_lib = SpecLibFasta(
    var_mods=["Oxidation@M"],
    fix_mods=["Carbamidomethyl@C"],
    special_mods=[],
    min_var_mod_num=1,
    max_var_mod_num=2,
)
protein_dict = get_protein_dict()
fasta_lib.import_and_process_protein_dict(protein_dict)
fasta_lib.calc_precursor_mz()
# with a minimum of one variable modification, every remaining precursor
# must carry Oxidation@M
_mods_col = fasta_lib.precursor_df.mods
assert _mods_col.str.contains("Oxidation@M").all()

Test special mods

In [None]:
# Only a special modification, with no variable or fixed ones; the upper
# precursor m/z limit is set to 1e100.
fasta_lib = SpecLibFasta(
    var_mods=[],
    fix_mods=[],
    special_mods=['Phospho@S'],
    precursor_mz_max=1e100,
)
protein_dict = get_protein_dict()
fasta_lib.import_and_process_protein_dict(protein_dict)
_mods_col = fasta_lib.precursor_df.mods
assert _mods_col.str.contains('Phospho@S').any()

In [None]:
# Special modification combined with fixed and variable modifications.
fasta_lib = SpecLibFasta(
    var_mods=["Oxidation@M"],
    fix_mods=["Dimethyl@K", "Dimethyl@Any_N-term"],
    special_mods=["Phospho@S"],
    precursor_mz_max=1e100,
)
protein_dict = get_protein_dict()
fasta_lib.import_and_process_protein_dict(protein_dict)
_mods_col = fasta_lib.precursor_df.mods
# fixed modification on every precursor
assert _mods_col.str.contains('Dimethyl@Any_N-term').all()
# variable and special modifications on at least one precursor each
for _optional_mod in ('Oxidation@M', 'Phospho@S'):
    assert _mods_col.str.contains(_optional_mod).any()

In [None]:
# Two special modifications targeting the same residue (S).
fasta_lib = SpecLibFasta(
    var_mods=["Oxidation@M"],
    fix_mods=["Dimethyl@K", "Dimethyl@Any_N-term"],
    special_mods=["Phospho@S", "HexNAc@S"],
    precursor_mz_max=1e100,
)
protein_dict = get_protein_dict()
fasta_lib.import_and_process_protein_dict(protein_dict)
_mods_col = fasta_lib.precursor_df.mods
# fixed modification on every precursor
assert _mods_col.str.contains('Dimethyl@Any_N-term').all()
# the variable modification and both special ones each occur at least once
for _optional_mod in ('Oxidation@M', 'Phospho@S', 'HexNAc@S'):
    assert _mods_col.str.contains(_optional_mod).any()

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,charge,precursor_mz
0,AFGHIJK,0;1,0,True,True,Dimethyl@Any_N-term;Dimethyl@K,0;7,7,2,4.212684e+02
1,LMNOPQR,0;1,0,False,True,Oxidation@M;Dimethyl@Any_N-term,2;0,7,2,5.202895e+02
2,LMNOPQR,0;1,0,False,True,Dimethyl@Any_N-term,0,7,2,5.122920e+02
3,ABCDESTK,0,0,True,False,Dimethyl@Any_N-term;Dimethyl@K;Phospho@S,0;8;6,8,2,6.000445e+06
4,ABCDESTK,0,0,True,False,Dimethyl@Any_N-term;Dimethyl@K;Phospho@S,0;8;6,8,3,4.000297e+06
...,...,...,...,...,...,...,...,...,...,...
124,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Dimethyl@Any_N-term;Dimethyl@K;Dimethyl@K;HexN...,0;9;16;7,23,3,4.000972e+06
125,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Dimethyl@Any_N-term;Dimethyl@K;Dimethyl@K;HexN...,0;9;16;7,23,4,3.000729e+06
126,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Dimethyl@Any_N-term;Dimethyl@K;Dimethyl@K,0;9;16,23,2,6.001356e+06
127,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Dimethyl@Any_N-term;Dimethyl@K;Dimethyl@K,0;9;16,23,3,4.000904e+06


In [None]:
# Cleave after K or L (regex protease) and require exactly the GlyGly@K
# special modification, which must not sit on the peptide's last residue.
fasta_lib = SpecLibFasta(
    var_mods=[],
    fix_mods=[],
    protease='([KL])',
    special_mods=['GlyGly@K'],
    special_mods_cannot_modify_pep_c_term=True,
    min_special_mod_num=1,
    precursor_mz_max=1e100,
)
protein_dict = get_protein_dict()
fasta_lib.import_and_process_protein_dict(protein_dict)
_precursors = fasta_lib.precursor_df
# at least one special modification is required, so every precursor has it
assert _precursors.mods.str.contains('GlyGly@K').all()
# the modified site never equals the peptide length (i.e. the last residue);
# an empty site string counts as 0
_site_numbers = _precursors.mod_sites.apply(lambda site: 0 if not site else int(site))
assert (_precursors.nAA != _site_numbers).all()
# every peptide either ends with L or has a K before its last residue
_ends_with_L = _precursors.sequence.str[-1] == 'L'
_has_inner_K = _precursors.sequence.str[:-1].str.contains('K')
assert (_ends_with_L | _has_inner_K).all()

Test labeling

In [None]:
# Two labeling channels (0: light dimethyl, 4: heavy dimethyl) and no other
# modifications.
fasta_lib = SpecLibFasta(
    var_mods=[],
    fix_mods=[],
    labeling_channels={
        0: ['Dimethyl@Any_N-term', 'Dimethyl@K'],
        4: ['Dimethyl:2H(6)13C(2)@Any_N-term', 'Dimethyl:2H(6)13C(2)@K'],
    },
)
protein_dict = get_protein_dict()
fasta_lib.import_and_process_protein_dict(protein_dict)
_precursors = fasta_lib.precursor_df
_mods_col = _precursors.mods
# literal (non-regex) matches, because the heavy label name contains parentheses
_is_light = _mods_col.str.contains('Dimethyl@', regex=False)
_is_heavy = _mods_col.str.contains('Dimethyl:2H(6)13C(2)', regex=False)
# every precursor is labeled, and the heavy label occurs
assert _mods_col.str.contains('Dimethyl').all()
assert _is_heavy.any()
# light and heavy forms come in equal numbers ...
assert _is_light.sum()==_is_heavy.sum()
# ... and the channel column agrees with the label found in the mods
assert (_precursors.labeling_channel==0).sum() == _is_light.sum()
assert (_precursors.labeling_channel==4).sum() == _is_heavy.sum()