In [None]:
#---#| default_exp protein.fasta

# Protein and Peptide Processing

In [None]:
from alphabase.protein.fasta import *
import alphabase.protein.fasta as fasta

In [None]:
#| hide
# The expected gene name is the value that follows `GN=` in this UniProt-style header.
assert 'ERVMER34-1' == get_uniprot_gene_name(
    'sp|Q9H9K5|MER34_HUMAN Endogenous retroviral envelope protein HEMO OS=Homo sapiens OX=9606 GN=ERVMER34-1 PE=1 SV=1'
)

In [None]:
# The 'trypsin_not_p' key is read from `protease_dict` by a later cell, so it must be present.
assert (
    'trypsin_not_p' in protease_dict
), 'trypsin_not_p not in protease_dict, why?'

In [None]:
#| hide
# Compile the 'trypsin_not_p' pattern and check where it matches in a toy sequence.
p = re.compile(protease_dict['trypsin_not_p'])

# `idx` is a ones-digit ruler aligned with `seq`, kept for reading positions by eye.
idx = '0123456789012345678901234567890123456789012345'
seq = 'ABDNGKENGLANGIXHGRKTNGLANGKVHNAKHNARKANGKPFAAT'
# A cut position is one past the start of each regex match.
cut_pos = np.array([hit.start() for hit in p.finditer(seq)]) + 1
assert np.all(np.array([6, 18, 19, 27, 32, 36, 37]) == cut_pos)

In [None]:
#| hide
seq = 'MABCDEKHIJKLNOPQRST'
digest = Digest()
seq_list, miss_list, nterm_list, cterm_list = digest.cleave_sequence(seq)
# The four returned lists must all have the same length.
assert len({len(seq_list), len(miss_list), len(nterm_list), len(cterm_list)}) == 1

# Peptides that start with 'M': the number of truthy N-term flags is expected to be
# twice their count (presumably each also occurs with the leading 'M' removed -- TODO confirm).
M_start_seqs = [pep for pep in seq_list if pep.startswith('M')]
assert sum(1 for flag in nterm_list if flag) == 2*len(M_start_seqs)
# The trailing len(M_start_seqs) entries of `nterm_list` must all be truthy.
assert np.all(nterm_list[-len(M_start_seqs):])

# Peptides that end with 'T' (the last residue of `seq`) must match the truthy C-term flags in number.
T_end_seqs = [pep for pep in seq_list if pep.endswith('T')]
assert sum(1 for flag in cterm_list if flag) == len(T_end_seqs)

In [None]:
# Fixed modification on every 'C' of the sequence: three cysteines at 1-based sites 2, 4 and 6.
seq = 'ACBCDCK'
_fix_mod_dict = {'C': 'mod@C'}
mods, mod_sites = get_fix_mods(seq, 'C', _fix_mod_dict)
assert 'mod@C;mod@C;mod@C' == mods
assert '2;4;6' == mod_sites
# Last expression: display the returned (mods, mod_sites) pair.
get_fix_mods(seq, 'C', _fix_mod_dict)

('mod@C;mod@C;mod@C', '2;4;6')

In [None]:
seq = 'AMCMSTYK'
candidate_sites = get_candidate_sites(seq, 'MSTY')
# 1-based positions of the M, M, S, T and Y residues in `seq`.
assert np.all(
    np.array([2, 4, 5, 6, 7]) == np.array(candidate_sites)
)
# Last expression: display the site combinations returned for the arguments (3, 20).
get_var_mod_sites(seq, 'MSTY', 3, 20)

[(2,),
 (4,),
 (5,),
 (6,),
 (7,),
 (2, 4),
 (2, 5),
 (2, 6),
 (2, 7),
 (4, 5),
 (4, 6),
 (4, 7),
 (5, 6),
 (5, 7),
 (6, 7),
 (2, 4, 5),
 (2, 4, 6),
 (2, 4, 7),
 (2, 5, 6),
 (2, 5, 7)]

In [None]:
#| hide
# Point `fasta.get_var_mods_per_sites` at the implementation that accepts
# a list of candidate modifications per amino acid.
fasta.get_var_mods_per_sites = get_var_mods_per_sites_multi_mods_on_aa
seq = 'AMCMSTYK'
candidate_sites = get_candidate_sites(seq, 'MSTY')
mod_sites_list = get_var_mod_sites(seq, 'MSTY', 3, 20)
# Serine is the only residue given two alternative modifications.
_mod_dict = dict(
    M=['mod@M'],
    S=['mod@S', 'modX@S'],
    T=['mod@T'],
    Y=['mod@Y'],
)
_mods, _sites = get_var_mods(seq, 'MSTY', _mod_dict, 3, 16)

# Both serine alternatives must show up, alone with one M and together with two M.
assert {
    'mod@M;mod@M;mod@S',
    'mod@M;mod@M;modX@S',
    'mod@M;mod@S',
    'mod@M;modX@S',
}.issubset(_mods)
# The two triple-modified forms sit on the same sites.
assert '2;4;5' == _sites[_mods.index('mod@M;mod@M;mod@S')]
assert '2;4;5' == _sites[_mods.index('mod@M;mod@M;modX@S')]
get_var_mods(seq, 'MSTY', _mod_dict, 3, 16)

(['mod@M',
  'mod@M',
  'mod@S',
  'modX@S',
  'mod@T',
  'mod@Y',
  'mod@M;mod@M',
  'mod@M;mod@S',
  'mod@M;modX@S',
  'mod@M;mod@T',
  'mod@M;mod@Y',
  'mod@M;mod@S',
  'mod@M;modX@S',
  'mod@M;mod@T',
  'mod@M;mod@Y',
  'mod@S;mod@T',
  'modX@S;mod@T',
  'mod@S;mod@Y',
  'modX@S;mod@Y',
  'mod@T;mod@Y',
  'mod@M;mod@M;mod@S',
  'mod@M;mod@M;modX@S'],
 ['2',
  '4',
  '5',
  '5',
  '6',
  '7',
  '2;4',
  '2;5',
  '2;5',
  '2;6',
  '2;7',
  '4;5',
  '4;5',
  '4;6',
  '4;7',
  '5;6',
  '5;6',
  '5;7',
  '5;7',
  '6;7',
  '2;4;5',
  '2;4;5'])

In [None]:
#| hide
# Point `fasta.get_var_mods_per_sites` at the implementation that takes
# exactly one modification name per amino acid.
fasta.get_var_mods_per_sites = get_var_mods_per_sites_single_mod_on_aa
seq = 'AMCMSTYK'
candidate_sites = get_candidate_sites(seq, 'MSTY')
mod_sites_list = get_var_mod_sites(seq, 'MSTY', 3, 20)
# One 'mod@<aa>' string for each of M, S, T and Y.
_mod_dict = {aa: f'mod@{aa}' for aa in 'MSTY'}
_mods, _sites = get_var_mods(seq, 'MSTY', _mod_dict, 3, 16)
# Exactly 16 combinations are expected, with mods and sites in parallel.
assert (len(_mods), len(_sites)) == (16, 16)
assert '2;4;5' == _sites[_mods.index('mod@M;mod@M;mod@S')]
get_var_mods(seq, 'MSTY', _mod_dict, 3, 16)

(['mod@M',
  'mod@M',
  'mod@S',
  'mod@T',
  'mod@Y',
  'mod@M;mod@M',
  'mod@M;mod@S',
  'mod@M;mod@T',
  'mod@M;mod@Y',
  'mod@M;mod@S',
  'mod@M;mod@T',
  'mod@M;mod@Y',
  'mod@S;mod@T',
  'mod@S;mod@Y',
  'mod@T;mod@Y',
  'mod@M;mod@M;mod@S'],
 ['2',
  '4',
  '5',
  '6',
  '7',
  '2;4',
  '2;5',
  '2;6',
  '2;7',
  '4;5',
  '4;6',
  '4;7',
  '5;6',
  '5;7',
  '6;7',
  '2;4;5'])

In [None]:
#| hide
# No amino acid is given before the terminal name: the first element is empty.
assert ('', 'Protein N-term') == parse_term_mod('Acetyl@Protein N-term')
# 'Q^Any N-term' is split into the amino acid and the terminal name.
assert ('Q', 'Any N-term') == parse_term_mod('Gln->pyro-Glu@Q^Any N-term')

In [None]:
#| hide
labels = ['label@Any N-term','label@K']
label_aas, label_mod_dict, nterm_label_mod, cterm_label_mod = parse_labels(labels)

# --- single-peptide labeling ---
# Unmodified peptide: the N-term label (site 0) and the K label (site 4) are both added.
assert ('label@Any N-term;label@K', '0;4') == add_single_peptide_labeling(
    'ABCK', '', '',
    label_aas, label_mod_dict, nterm_label_mod, cterm_label_mod,
)
# An existing N-term modification is kept; only the K label is added.
assert ('Mod@Any N-term;label@K', '0;4') == add_single_peptide_labeling(
    'ABCK', 'Mod@Any N-term', '0',
    label_aas, label_mod_dict, nterm_label_mod, cterm_label_mod,
)
# Two lysines: both receive the K label in addition to the N-term label.
assert ('label@Any N-term;label@K;label@K', '0;1;4') == add_single_peptide_labeling(
    'KBCK', '', '',
    label_aas, label_mod_dict, nterm_label_mod, cterm_label_mod,
)
assert ('Mod@Any N-term;label@K;label@K', '0;1;4') == add_single_peptide_labeling(
    'KBCK', 'Mod@Any N-term', '0',
    label_aas, label_mod_dict, nterm_label_mod, cterm_label_mod,
)

# --- data-frame labeling ---
pep_df = pd.DataFrame(dict(
    sequence=['ABCD', 'ABCK', 'KABK', 'EFGK'],
    mods=['', '', '', 'Mod@Any N-term'],
    mod_sites=['', '', '', '0'],
))
df = create_labeling_peptide_df(pep_df, labels)
# Every row must have changed its `mods` value.
assert np.all(df.mods.values!=pep_df.mods.values)
assert df.loc[df.sequence=='ABCD', 'mods'].values[0] == 'label@Any N-term'
assert df.loc[df.sequence=='ABCD', 'mod_sites'].values[0] == '0'
assert df.loc[df.sequence=='ABCK', 'mods'].values[0] == 'label@Any N-term;label@K'
assert df.loc[df.sequence=='ABCK', 'mod_sites'].values[0] == '0;4'
assert df.loc[df.sequence=='KABK', 'mods'].values[0] == 'label@Any N-term;label@K;label@K'
assert df.loc[df.sequence=='KABK', 'mod_sites'].values[0] == '0;1;4'
assert df.loc[df.sequence=='EFGK', 'mods'].values[0] == 'Mod@Any N-term;label@K'
assert df.loc[df.sequence=='EFGK', 'mod_sites'].values[0] == '0;4'
# With an empty label list the modifications are left as they were.
df = create_labeling_peptide_df(pep_df, [])
assert np.all(df.mods.values==pep_df.mods.values)

In [None]:
#| hide
# Index 1 maps to an empty name here, and only 'A' is expected in the result.
assert 'A' == protein_idxes_to_names('0;1', ['A','','B'])
# Both indexed names are non-empty and are joined with ';'.
assert 'A;C' == protein_idxes_to_names('0;1', ['A','C','B'])

### Testing

In [None]:
# Three peptides; the second already carries a protein N-term acetylation.
df = pd.DataFrame(dict(
    sequence=['ABSTY', 'ACXSX', 'ACDEFG'],
    mods=['', 'Acetyl@Protein N-term', ''],
    mod_sites=['', '0', ''],
))
df = append_regular_modifications(df, keep_unmodified=True)

# Row counts per peptide: each modified form plus the original row.
assert (df.sequence=='ABSTY').sum() == 4
assert (df.sequence=='ACXSX').sum() == 2
assert (df.sequence=='ACDEFG').sum() == 1

# Modified forms come first and the original row last, with matching sites.
assert list(df.loc[df.sequence=='ABSTY', 'mods']) == ['Phospho@S', 'Phospho@T', 'Phospho@Y', '']
assert list(df.loc[df.sequence=='ABSTY', 'mod_sites']) == ['3', '4', '5', '']
assert list(df.loc[df.sequence=='ACXSX', 'mods']) == ['Acetyl@Protein N-term;Phospho@S', 'Acetyl@Protein N-term']
assert list(df.loc[df.sequence=='ACXSX', 'mod_sites']) == ['0;4', '0']
df

Unnamed: 0,sequence,mods,mod_sites
0,ABSTY,Phospho@S,3
1,ABSTY,Phospho@T,4
2,ABSTY,Phospho@Y,5
3,ABSTY,,
4,ACXSX,Acetyl@Protein N-term;Phospho@S,0;4
5,ACXSX,Acetyl@Protein N-term,0
6,ACDEFG,,


In [None]:
# Same input as the keep_unmodified=True case, but the original rows are not kept.
df = pd.DataFrame(dict(
    sequence=['ABSTY', 'ACXSX', 'ACDEFG'],
    mods=['', 'Acetyl@Protein N-term', ''],
    mod_sites=['', '0', ''],
))
df = append_regular_modifications(df, keep_unmodified=False)
# Only the newly modified forms remain; 'ACDEFG' gains no modification and disappears.
assert (df.sequence=='ABSTY').sum() == 3
assert (df.sequence=='ACXSX').sum() == 1
assert (df.sequence=='ACDEFG').sum() == 0
df

Unnamed: 0,sequence,mods,mod_sites
0,ABSTY,Phospho@S,3
1,ABSTY,Phospho@T,4
2,ABSTY,Phospho@Y,5
3,ACXSX,Acetyl@Protein N-term;Phospho@S,0;4


In [None]:
# Two toy proteins; `prot2` is a substring of `prot1`, so some peptides are shared.
_lib = FastaLib(None, I_to_L=False, decoy='pseudo_reverse')
prot1 = 'MABCDESTKAFGHIJKLMNOPQRAFGHIJK'
prot2 = 'AFGHIJKLMNOPQR'
protein_dict = dict(
    xx=dict(protein_id='xx', gene_name='', sequence=prot1),
    yy=dict(protein_id='yy', gene_name='gene', sequence=prot2),
)
# Digest the proteins; the resulting peptide table is shown as the cell output.
_lib.get_peptides_from_protein_dict(protein_dict)
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA
0,AFGHIJK,0;1,0,True,True,,,7
1,LMNOPQR,0;1,0,False,True,,,7
2,ABCDESTK,0,0,True,False,,,8
3,MABCDESTK,0,0,True,False,,,9
4,AFGHIJKLMNOPQR,0;1,1,True,True,,,14
5,LMNOPQRAFGHIJK,0,1,False,True,,,14
6,ABCDESTKAFGHIJK,0,1,True,False,,,15
7,MABCDESTKAFGHIJK,0,1,True,False,,,16
8,AFGHIJKLMNOPQRAFGHIJK,0,2,False,True,,,21
9,ABCDESTKAFGHIJKLMNOPQR,0,2,True,False,,,22


In [None]:
# Display the library's protein table (one row per entry of `protein_dict`, see the output below).
_lib.protein_df

Unnamed: 0,protein_id,gene_name,sequence
0,xx,,MABCDESTKAFGHIJKLMNOPQRAFGHIJK
1,yy,gene,AFGHIJKLMNOPQR


In [None]:
# Call append_protein_name() and check that the precursor table now has a 'proteins' column.
_lib.append_protein_name()
assert 'proteins' in _lib.precursor_df.columns
# Last expression: display the updated precursor table.
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes
0,AFGHIJK,0;1,0,True,True,,,7,xx;yy,gene
1,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene
2,ABCDESTK,0,0,True,False,,,8,xx,
3,MABCDESTK,0,0,True,False,,,9,xx,
4,AFGHIJKLMNOPQR,0;1,1,True,True,,,14,xx;yy,gene
5,LMNOPQRAFGHIJK,0,1,False,True,,,14,xx,
6,ABCDESTKAFGHIJK,0,1,True,False,,,15,xx,
7,MABCDESTKAFGHIJK,0,1,True,False,,,16,xx,
8,AFGHIJKLMNOPQRAFGHIJK,0,2,False,True,,,21,xx,
9,ABCDESTKAFGHIJKLMNOPQR,0,2,True,False,,,22,xx,


In [None]:
#| hide
# Row-by-row consistency check of the digested peptides against the two source proteins.
for i in range(len(_lib.precursor_df)):
    seq = _lib.precursor_df.sequence[i]
    # is_prot_nterm must be set exactly when the peptide starts a protein,
    # either directly or after the leading 'M' of prot1 (M.xxxxx).
    assert bool(_lib.precursor_df.is_prot_nterm[i]) == (
        prot1.startswith(seq)
        or prot2.startswith(seq)
        or prot1[1:].startswith(seq)
    ), seq
    # is_prot_cterm must be set exactly when the peptide ends a protein.
    assert bool(_lib.precursor_df.is_prot_cterm[i]) == (
        prot1.endswith(seq) or prot2.endswith(seq)
    ), seq
    # protein_idxes / proteins / genes
    if not (seq in prot1 and seq in prot2):
        # found in one protein only: single index, single name, no gene
        assert ';' not in _lib.precursor_df.protein_idxes[i]
        assert ';' not in _lib.precursor_df.proteins[i]
        assert _lib.precursor_df.genes[i] == ''
    else:
        # shared peptide: both proteins are listed, only 'yy' contributes a gene name
        assert _lib.precursor_df.protein_idxes[i] == '0;1'
        assert _lib.precursor_df.proteins[i] == 'xx;yy'
        assert _lib.precursor_df.genes[i] == 'gene'

In [None]:
# Call add_modifications() with no arguments; per the output below, rows now carry
# Carbamidomethyl@C, Oxidation@M and Acetyl@Protein N-term entries in `mods`/`mod_sites`.
_lib.add_modifications()
# Last expression: display the precursor table after modification.
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes
0,AFGHIJK,0;1,0,True,True,,,7,xx;yy,gene
1,AFGHIJK,0;1,0,True,True,Acetyl@Protein N-term,0,7,xx;yy,gene
2,LMNOPQR,0;1,0,False,True,Oxidation@M,2,7,xx;yy,gene
3,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene
4,ABCDESTK,0,0,True,False,Carbamidomethyl@C,3,8,xx,
5,ABCDESTK,0,0,True,False,Carbamidomethyl@C;Acetyl@Protein N-term,3;0,8,xx,
6,MABCDESTK,0,0,True,False,Carbamidomethyl@C;Oxidation@M,4;1,9,xx,
7,MABCDESTK,0,0,True,False,Carbamidomethyl@C,4,9,xx,
8,MABCDESTK,0,0,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1,9,xx,
9,MABCDESTK,0,0,True,False,Carbamidomethyl@C;Acetyl@Protein N-term,4;0,9,xx,


In [None]:
#| hide
# Row-by-row checks of `mods` / `mod_sites` after add_modifications().
# Sites are compared as ';'-separated tokens rather than as substrings, so a
# site such as '18' can no longer be mistaken for site '1' (or '10' for '0').
for i in range(len(_lib.precursor_df)):
    seq = _lib.precursor_df.sequence[i]
    mods = _lib.precursor_df.mods[i]
    sites = _lib.precursor_df.mod_sites[i]
    # test fix mods
    if 'C' in seq:
        assert str(seq.find('C')+1) in sites.split(';')
        assert 'Carbamidomethyl@C' in mods
    else:
        assert 'Carbamidomethyl@C' not in mods
    # test Acetyl@Protein N-term
    if 'Acetyl@Protein N-term' in mods:
        assert _lib.precursor_df.is_prot_nterm[i]
        assert '0' in sites.split(';')
    # Reverse direction: site 0 implies the N-term acetylation. The original
    # looked for '0' in `mods`, which never matched, so this branch never ran.
    if '0' in sites.split(';'):
        assert _lib.precursor_df.is_prot_nterm[i]
        assert 'Acetyl@Protein N-term' in mods
    if not _lib.precursor_df.is_prot_nterm[i]:
        assert 'Acetyl@Protein N-term' not in mods
    # test Oxidation@M: at least one 'M' position must be listed among the sites
    # (the oxidised residue is not necessarily the first 'M' of the peptide).
    if 'Oxidation@M' in mods:
        assert 'M' in seq
        assert any(
            str(pos+1) in sites.split(';')
            for pos, aa in enumerate(seq) if aa == 'M'
        )
    # test unmodified
    if mods == '':
        assert sites == ''
    if sites == '':
        assert mods == ''
df = _lib.precursor_df
# Match site '0' only as a whole ';'-separated token.
# at least one nterm peptide does not contain Acetyl@Protein N-term
assert not df[df.is_prot_nterm].mod_sites.str.contains(r'(?:^|;)0(?:;|$)').all()
# at least one nterm peptide contains Acetyl@Protein N-term
assert df[df.is_prot_nterm].mod_sites.str.contains(r'(?:^|;)0(?:;|$)').any()
# test var mod Oxidation@M: present on some, but not all, M-containing peptides
assert not df[df.sequence.str.contains('M')].mods.str.contains('Oxidation@M').all()
assert df[df.sequence.str.contains('M')].mods.str.contains('Oxidation@M').any()
# at least one precursor stays completely unmodified
assert '' in df.mods.values

In [None]:
# Add Phospho@S and Phospho@T through add_additional_modifications().
_lib.add_additional_modifications(['Phospho@S','Phospho@T'])
# At least one precursor must now carry a Phospho modification.
assert _lib.precursor_df.mods.str.contains('Phospho').any()
# Last expression: display the expanded precursor table.
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes
0,AFGHIJK,0;1,0,True,True,,,7,xx;yy,gene
1,AFGHIJK,0;1,0,True,True,Acetyl@Protein N-term,0,7,xx;yy,gene
2,LMNOPQR,0;1,0,False,True,Oxidation@M,2,7,xx;yy,gene
3,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene
4,ABCDESTK,0,0,True,False,Carbamidomethyl@C;Phospho@S,3;6,8,xx,
...,...,...,...,...,...,...,...,...,...,...
79,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1;18;8,23,xx,
80,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1;18,23,xx,
81,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Phospho@S,4;0;7,23,xx,
82,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Phospho@T,4;0;8,23,xx,


In [None]:
#| hide
# Label the precursors in three channels; each dict key is a channel name mapped to
# its list of label modifications (the output below shows it in `label_channel`).
_lib.add_peptide_labeling({
    'none': [], # empty label list: unlabelled channel kept for reference
    'light': ['Dimethyl@Any N-term','Dimethyl@K'],
    'heavy': ['Dimethyl:2H(6)13C(2)@Any N-term','Dimethyl:2H(6)13C(2)@K'],
})
# Last expression: display the labelled precursor table.
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes,label_channel
0,AFGHIJK,0;1,0,True,True,,,7,xx;yy,gene,none
1,AFGHIJK,0;1,0,True,True,Acetyl@Protein N-term,0,7,xx;yy,gene,none
2,LMNOPQR,0;1,0,False,True,Oxidation@M,2,7,xx;yy,gene,none
3,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene,none
4,ABCDESTK,0,0,True,False,Carbamidomethyl@C;Phospho@S,3;6,8,xx,,none
...,...,...,...,...,...,...,...,...,...,...,...
247,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1;18;8;9;16,23,xx,,heavy
248,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1;18;9;16,23,xx,,heavy
249,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Phosph...,4;0;7;9;16,23,xx,,heavy
250,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Phosph...,4;0;8;9;16,23,xx,,heavy


In [None]:
# Full pipeline on two toy proteins: digestion, modifications, decoys, charges and isotopes.
_lib = FastaLib(
    ['b_z1','y_z1'], I_to_L=False, 
    decoy='pseudo_reverse',
    precursor_mz_min=200,
)
prot1 = 'MACDESTYKBKFGHIKLMNPQRST'
prot2 = 'FGHIKLMNPQR'
protein_dict = {
    'xx': {
        'protein_id': 'xx',
        'sequence': prot1
    },
    'yy': {
        'protein_id': 'yy',
        'sequence': prot2
    }
}
_lib.import_and_process_protein_dict(protein_dict)
_lib.calc_precursor_isotope()
# Both ends of the configured precursor charge range must occur.
assert (_lib.precursor_df.charge == _lib.min_precursor_charge).any()
assert (_lib.precursor_df.charge == _lib.max_precursor_charge).any()
# Decoys are present; 'MACDESTYK' appears with all but its last residue reversed.
assert (_lib.precursor_df.decoy==1).any()
assert ('MACDESTY'[::-1]+'K') in _lib.precursor_df.sequence.values
assert 'isotope_apex_offset' in _lib.precursor_df.columns
assert 'isotope_apex_intensity' in _lib.precursor_df.columns
# No precursor sequence may contain 'B'. Use `not` rather than `~`: on a plain
# Python bool `~True` is -2 (truthy), which would make this assertion vacuous.
assert not _lib.precursor_df.sequence.str.contains('B').any()
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,decoy,charge,precursor_mz,isotope_m1_intensity,isotope_apex_intensity,isotope_apex_offset,isotope_right_most_intensity,isotope_right_most_offset,isotope_m1_mz,isotope_apex_mz,isotope_right_most_mz
0,LMNPQRST,0,1,False,True,Oxidation@M,2,8,0,2,481.739834,0.478814,1.0,0,0.478814,1,482.241484,481.739834,482.241484
1,LMNPQRST,0,1,False,True,Oxidation@M,2,8,0,3,321.495648,0.478814,1.0,0,0.478814,1,321.830081,321.495648,321.830081
2,LMNPQRST,0,1,False,True,Oxidation@M,2,8,0,4,241.373555,0.478814,1.0,0,0.478814,1,241.624380,241.373555,241.624380
3,LMNPQRST,0,1,False,True,,,8,0,2,473.742377,0.478433,1.0,0,0.478433,1,474.244027,473.742377,474.244027
4,LMNPQRST,0,1,False,True,,,8,0,3,316.164010,0.478433,1.0,0,0.478433,1,316.498443,316.164010,316.498443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,FGHIKLMNPQRST,0,2,False,True,Oxidation@M,7,13,0,3,515.604920,0.828432,1.0,0,0.420789,2,515.939354,515.604920,516.273787
80,FGHIKLMNPQRST,0,2,False,True,Oxidation@M,7,13,0,4,386.955509,0.828432,1.0,0,0.420789,2,387.206334,386.955509,387.457159
81,FGHIKLMNPQRST,0,2,False,True,,,13,0,2,764.906285,0.828051,1.0,0,0.418418,2,765.407935,764.906285,765.909585
82,FGHIKLMNPQRST,0,2,False,True,,,13,0,3,510.273282,0.828051,1.0,0,0.418418,2,510.607715,510.273282,510.942149


In [None]:
# Re-run the pipeline on the same proteins, then add light/heavy dimethyl labeling
# before recomputing the precursor isotope information.
_lib.import_and_process_protein_dict(protein_dict)
_lib.add_peptide_labeling({
    'light': ['Dimethyl@Any N-term','Dimethyl@K'],
    'heavy': ['Dimethyl:2H(6)13C(2)@Any N-term','Dimethyl:2H(6)13C(2)@K'],
})
_lib.calc_precursor_isotope()
# Decoys are present; 'MACDESTYK' appears with all but its last residue reversed.
assert (_lib.precursor_df.decoy==1).any()
assert ('MACDESTY'[::-1]+'K') in _lib.precursor_df.sequence.values
assert 'isotope_apex_offset' in _lib.precursor_df.columns
assert 'isotope_apex_intensity' in _lib.precursor_df.columns
# No precursor sequence may contain 'B'. Use `not` rather than `~`: on a plain
# Python bool `~True` is -2 (truthy), which would make this assertion vacuous.
assert not _lib.precursor_df.sequence.str.contains('B').any()
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,decoy,charge,label_channel,precursor_mz,isotope_m1_intensity,isotope_apex_intensity,isotope_apex_offset,isotope_right_most_intensity,isotope_right_most_offset,isotope_m1_mz,isotope_apex_mz,isotope_right_most_mz
0,LMNPQRST,0,1,False,True,Oxidation@M;Dimethyl@Any N-term,2;0,8,0,2,light,495.755484,0.500906,1.0,0,0.500906,1,496.257134,495.755484,496.257134
1,LMNPQRST,0,1,False,True,Oxidation@M;Dimethyl@Any N-term,2;0,8,0,3,light,330.839415,0.500906,1.0,0,0.500906,1,331.173848,330.839415,331.173848
2,LMNPQRST,0,1,False,True,Oxidation@M;Dimethyl@Any N-term,2;0,8,0,4,light,248.381380,0.500906,1.0,0,0.500906,1,248.632205,248.381380,248.632205
3,LMNPQRST,0,1,False,True,Dimethyl@Any N-term,0,8,0,2,light,487.758027,0.500525,1.0,0,0.500525,1,488.259677,487.758027,488.259677
4,LMNPQRST,0,1,False,True,Dimethyl@Any N-term,0,8,0,3,light,325.507777,0.500525,1.0,0,0.500525,1,325.842210,325.507777,325.842210
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,FGHIKLMNPQRST,0,2,False,True,Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term;Di...,7;0;5,13,0,3,heavy,539.655367,0.788273,1.0,0,0.392103,2,539.989801,539.655367,540.324234
164,FGHIKLMNPQRST,0,2,False,True,Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term;Di...,7;0;5,13,0,4,heavy,404.993344,0.788273,1.0,0,0.392103,2,405.244169,404.993344,405.494994
165,FGHIKLMNPQRST,0,2,False,True,Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)...,0;5,13,0,2,heavy,800.981955,0.787646,1.0,0,0.389779,2,801.483605,800.981955,801.985255
166,FGHIKLMNPQRST,0,2,False,True,Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)...,0;5,13,0,3,heavy,534.323729,0.787646,1.0,0,0.389779,2,534.658162,534.323729,534.992596
