In [1]:
#---#| default_exp spectral_library.base

# Base Class for Spectral Libraries

In [2]:
from alphabase.spectral_library.base import SpecLibBase, annotate_fragments_from_speclib

### Testing

In [3]:
import pandas as pd
import numpy as np
import os

In [4]:
#| hide
# clip_by_precursor_mz_() is expected to drop the 100 m/z precursor and keep the other three.
lib = SpecLibBase([])
lib._precursor_df = pd.DataFrame(
    {'precursor_mz': [100, 1000, 1500, 2000]}
).assign(charge=2)
lib.clip_by_precursor_mz_()

expected_mz = [1000, 1500, 2000]
assert np.allclose(lib.precursor_df['precursor_mz'].values, expected_mz)

Test `save_hdf(..., save_mod_seq_in_other_df=True)`

In [5]:

# Six precursors: three modified copies of a 9-residue peptide followed by
# three unmodified copies of a 14-residue peptide.
repeat = 3
peptides = ['AGHCEWQMK']*repeat + ['AGHCEWQMKAADER']*repeat
mods = ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat + ['']*repeat
sites = ['0;4;8']*repeat + ['']*repeat

precursor_df = pd.DataFrame({
    'sequence': peptides,
    'mods': mods,
    'mod_sites': sites,
}).assign(
    nAA=lambda frame: frame['sequence'].str.len(),
    charge=2,
)

target_lib = SpecLibBase(
    ['b_z1', 'b_z2', 'y_z1', 'y_z2'],
    decoy='pseudo_reverse',
)
target_lib._precursor_df = precursor_df
target_lib.calc_precursor_mz()
# The fragment frames are deliberately empty; only the precursor and
# protein tables are round-tripped in this cell.
target_lib._fragment_mz_df = pd.DataFrame()
target_lib._fragment_intensity_df = pd.DataFrame()

# Write the library, then add an extra 'protein_df' table to the same file.
hdf_path = 'sandbox/test_lib.hdf'
os.makedirs('sandbox', exist_ok=True)
target_lib.save_hdf(hdf_path, save_mod_seq_in_other_df=True)
protein_df = pd.DataFrame({
    'id': [1, 2],
    'full_name': [1, 2],
    'description': [1, 2],
    'sequence': [1, 2],
})
target_lib.save_df_to_hdf(hdf_path, 'protein_df', protein_df)

# Reload into a fresh library and check what came back.
new_lib = SpecLibBase([])
new_lib.load_hdf(hdf_path, load_mod_seq=True)
assert len(new_lib.precursor_df) > 0
assert len(new_lib.fragment_mz_df) == 0
assert len(new_lib.fragment_intensity_df) == 0

for column in ('sequence', 'mod_seq_hash'):
    assert column in new_lib.precursor_df.columns

# Individual tables can also be read back by name.
assert len(target_lib.load_df_from_hdf(hdf_path, 'precursor_df')) == len(precursor_df)
df = target_lib.load_df_from_hdf(hdf_path, 'protein_df')
assert len(df) == 2
# Cleanup is currently disabled, so the HDF file stays in 'sandbox':
#os.remove('sandbox/test_lib.hdf')
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz,mod_seq_hash,mod_seq_charge_hash
0,AGHCEWQMK,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,10801817329622117986,10801817329622117988
1,AGHCEWQMK,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,10801817329622117986,10801817329622117988
2,AGHCEWQMK,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,10801817329622117986,10801817329622117988
3,AGHCEWQMKAADER,,,14,2,816.356299,6831658824673244135,6831658824673244137
4,AGHCEWQMKAADER,,,14,2,816.356299,6831658824673244135,6831658824673244137
5,AGHCEWQMKAADER,,,14,2,816.356299,6831658824673244135,6831658824673244137


Test `save_hdf(..., save_mod_seq_in_other_df=False)`, the default setting

In [6]:

# Six precursors: three modified copies of a 9-residue peptide followed by
# three unmodified copies of a 14-residue peptide.
repeat = 3
peptides = ['AGHCEWQMK']*repeat + ['AGHCEWQMKAADER']*repeat
mods = ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat + ['']*repeat
sites = ['0;4;8']*repeat + ['']*repeat

precursor_df = pd.DataFrame({
    'sequence': peptides,
    'mods': mods,
    'mod_sites': sites,
}).assign(
    nAA=lambda frame: frame['sequence'].str.len(),
    charge=2,
)

target_lib = SpecLibBase(
    ['b_z1', 'b_z2', 'y_z1', 'y_z2'],
    decoy='pseudo_reverse',
)
target_lib._precursor_df = precursor_df
target_lib.calc_precursor_mz()
# The fragment frames are deliberately empty; only the precursor and
# protein tables are round-tripped in this cell.
target_lib._fragment_mz_df = pd.DataFrame()
target_lib._fragment_intensity_df = pd.DataFrame()

# Same round trip as with save_mod_seq_in_other_df=True, but with the flag off.
hdf_path = 'sandbox/test_lib.hdf'
os.makedirs('sandbox', exist_ok=True)
target_lib.save_hdf(hdf_path, save_mod_seq_in_other_df=False)
protein_df = pd.DataFrame({
    'id': [1, 2],
    'full_name': [1, 2],
    'description': [1, 2],
    'sequence': [1, 2],
})
target_lib.save_df_to_hdf(hdf_path, 'protein_df', protein_df)

# Reload into a fresh library and check what came back.
new_lib = SpecLibBase([])
new_lib.load_hdf(hdf_path, load_mod_seq=True)
assert len(new_lib.precursor_df) > 0
assert len(new_lib.fragment_mz_df) == 0
assert len(new_lib.fragment_intensity_df) == 0

for column in ('sequence', 'mod_seq_hash'):
    assert column in new_lib.precursor_df.columns

# Individual tables can also be read back by name.
assert len(target_lib.load_df_from_hdf(hdf_path, 'precursor_df')) == len(precursor_df)
df = target_lib.load_df_from_hdf(hdf_path, 'protein_df')
assert len(df) == 2
# Cleanup is currently disabled, so the HDF file stays in 'sandbox':
#os.remove('sandbox/test_lib.hdf')
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz,mod_seq_hash,mod_seq_charge_hash
0,AGHCEWQMK,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,10801817329622117986,10801817329622117988
1,AGHCEWQMK,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,10801817329622117986,10801817329622117988
2,AGHCEWQMK,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,10801817329622117986,10801817329622117988
3,AGHCEWQMKAADER,,,14,2,816.356299,6831658824673244135,6831658824673244137
4,AGHCEWQMKAADER,,,14,2,816.356299,6831658824673244135,6831658824673244137
5,AGHCEWQMKAADER,,,14,2,816.356299,6831658824673244135,6831658824673244137


In [7]:
# After appending decoys the library must hold twice as many precursors
# as the input frame.
target_lib.append_decoy_sequence()
n_with_decoys = len(target_lib.precursor_df)
assert n_with_decoys == 2 * len(precursor_df)
target_lib.precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz,mod_seq_hash,mod_seq_charge_hash,decoy
0,AGHCEWQMK,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,10801817329622117986,10801817329622117988,0
1,AGHCEWQMK,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,10801817329622117986,10801817329622117988,0
2,AGHCEWQMK,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,10801817329622117986,10801817329622117988,0
3,MQWECHGAK,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,10801817329622117986,10801817329622117988,1
4,MQWECHGAK,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,10801817329622117986,10801817329622117988,1
5,MQWECHGAK,Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,10801817329622117986,10801817329622117988,1
6,AGHCEWQMKAADER,,,14,2,816.356299,6831658824673244135,6831658824673244137,0
7,AGHCEWQMKAADER,,,14,2,816.356299,6831658824673244135,6831658824673244137,0
8,AGHCEWQMKAADER,,,14,2,816.356299,6831658824673244135,6831658824673244137,0
9,EDAAKMQWECHGAR,,,14,2,816.356299,6831658824673244135,6831658824673244137,1


In [8]:
# The fragment m/z frame must contain the requested ion columns and
# nAA-1 rows per precursor.
target_lib.calc_fragment_mz_df()
assert 'b_z1' in target_lib.fragment_mz_df
expected_rows = np.sum(target_lib.precursor_df['nAA'] - 1)
assert len(target_lib.fragment_mz_df) == expected_rows
target_lib.fragment_mz_df

Unnamed: 0,b_z1,b_z2,y_z1,y_z2
0,114.054955,57.531116,1091.439697,546.223511
1,171.076416,86.041847,1034.418213,517.712769
2,308.135345,154.571304,897.359314,449.183319
3,468.165985,234.586624,737.328674,369.167969
4,597.208557,299.107910,608.286072,304.646698
...,...,...,...,...
121,1089.466919,545.237122,543.245605,272.126465
122,1192.476196,596.741699,440.236450,220.621857
123,1329.535034,665.271179,303.177521,152.092407
124,1386.556519,693.781921,246.156067,123.581673


In [9]:

# Test the annotate_fragments_from_speclib() method followed by
# remove_unused_fragments(), first with fragment m/z only and then with
# fragment m/z plus intensities.
repeat = 3
peptides = ['AGHCEWQMK']*repeat
mods = ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat
sites = ['0;4;8']*repeat
peptides += ['AGHCEWQMKAADER']*repeat
mods += ['']*repeat
sites += ['']*repeat

precursor_df = pd.DataFrame({
    'sequence': peptides,
    'mods': mods,
    'mod_sites': sites
})
precursor_df['nAA'] = precursor_df['sequence'].str.len()
precursor_df['charge'] = 2
empty_lib = SpecLibBase(
    ['b_z1','b_z2','y_z1','y_z2'],
    decoy='pseudo_reverse'
)
empty_lib._precursor_df = precursor_df
empty_lib.calc_precursor_mz()
empty_lib.append_decoy_sequence()


# annotate only fragment mz
fragment_lib = SpecLibBase()
fragment_lib._precursor_df = empty_lib.precursor_df.copy()
fragment_lib.calc_fragment_mz_df()

# Keep only 4 precursors so that most annotated fragments become unused.
# The seed is fixed so the same subset is exercised on every run.
empty_lib._precursor_df = empty_lib._precursor_df.sample(4, random_state=42)
empty_lib.annotate_fragments_from_speclib(fragment_lib, verbose=True)

size_before = len(empty_lib.fragment_mz_df)
empty_lib.remove_unused_fragments()
assert size_before > len(empty_lib.fragment_mz_df)

# annotate both fragment mz and fragment intensity
# (all-zero intensity frame with the same shape and columns as the m/z frame)
fragment_lib._fragment_intensity_df = pd.DataFrame(0, index=np.arange(len(fragment_lib.fragment_mz_df)), columns=fragment_lib.fragment_mz_df.columns)

empty_lib.annotate_fragments_from_speclib(fragment_lib, verbose=True)
size_before_mz = len(empty_lib.fragment_mz_df)
size_before_intensity = len(empty_lib.fragment_intensity_df)

empty_lib.remove_unused_fragments()
assert size_before_mz > len(empty_lib.fragment_mz_df)
assert size_before_intensity > len(empty_lib.fragment_intensity_df)

In [10]:

# Smoke test for the module-level annotate_fragments_from_speclib() function:
# it only needs to run without raising on a library and its own fragments.
repeat = 3
peptides = ['AGHCEWQMK']*repeat + ['AGHCEWQMKAADER']*repeat
mods = ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat + ['']*repeat
sites = ['0;4;8']*repeat + ['']*repeat

precursor_df = pd.DataFrame({
    'sequence': peptides,
    'mods': mods,
    'mod_sites': sites,
}).assign(
    nAA=lambda frame: frame['sequence'].str.len(),
    charge=2,
)

empty_lib = SpecLibBase(
    ['b_z1', 'b_z2', 'y_z1', 'y_z2'],
    decoy='pseudo_reverse',
)
empty_lib._precursor_df = precursor_df
empty_lib.calc_precursor_mz()
empty_lib.append_decoy_sequence()

# The reference library receives the same precursor frame object (no copy).
fragment_lib = SpecLibBase()
fragment_lib._precursor_df = empty_lib.precursor_df
fragment_lib.calc_fragment_mz_df()

empty_lib = annotate_fragments_from_speclib(empty_lib, fragment_lib)

# test SpecLibBase.copy()

In [11]:
repeat = 3
peptides = ['AGHCEWQMK']*repeat + ['AGHCEWQMKAADER']*repeat
mods = ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat + ['']*repeat
sites = ['0;4;8']*repeat + ['']*repeat

precursor_df = pd.DataFrame({
    'sequence': peptides,
    'mods': mods,
    'mod_sites': sites,
}).assign(
    nAA=lambda frame: frame['sequence'].str.len(),
    charge=2,
)

spec_lib = SpecLibBase(
    ['b_z1', 'b_z2', 'y_z1', 'y_z2'],
    decoy='pseudo_reverse',
)
spec_lib._precursor_df = precursor_df
spec_lib.calc_precursor_mz()
spec_lib.append_decoy_sequence()
spec_lib.calc_fragment_mz_df()

spec_lib_copy = spec_lib.copy()

# The copy must have the same number of rows in every dataframe.
for df_name in ('precursor_df', 'fragment_mz_df', 'fragment_intensity_df'):
    assert len(getattr(spec_lib_copy, df_name)) == len(getattr(spec_lib, df_name))

# Overwriting the original must not leak into the copy.
spec_lib._precursor_df['precursor_mz'] = 0
assert all(spec_lib_copy._precursor_df['precursor_mz'] != 0)

In [12]:
# Appending one copy of the library to another must double every dataframe.
lib1 = spec_lib.copy()
lib2 = spec_lib.copy()
lib1.append(lib2)
for df_name in ('precursor_df', 'fragment_mz_df', 'fragment_intensity_df'):
    assert len(getattr(lib1, df_name)) == 2 * len(getattr(spec_lib, df_name))

# Every precursor's [frag_start_idx, frag_stop_idx) slice must be non-empty
# and contain nAA-1 fragment rows.
frag_bounds = zip(
    lib1._precursor_df['frag_start_idx'].values,
    lib1._precursor_df['frag_stop_idx'].values,
)
for i, (frag_start, frag_stop) in enumerate(frag_bounds):
    assert frag_start < frag_stop
    fragments = lib1.fragment_mz_df.iloc[frag_start:frag_stop]
    expected_rows = lib1._precursor_df.iloc[i]['nAA'] - 1
    assert len(fragments) == expected_rows

In [13]:
# Appending a library whose fragment m/z frame has no columns must raise
# a ValueError; the message is printed for inspection.
lib1 = spec_lib.copy()
lib2 = spec_lib.copy()
lib2._fragment_mz_df = pd.DataFrame()

try:
    lib1.append(lib2)
except ValueError as exc:
    print(exc)
    error = True
else:
    error = False
assert error

The columns are not compatible. {'y_z2', 'b_z1', 'b_z2', 'y_z1'} are missing in the dataframe which should be appended.


In [14]:
# Appending a library that lacks the 'sequence' precursor column must raise
# a ValueError; the message is printed for inspection.
lib1 = spec_lib.copy()
lib2 = spec_lib.copy()
lib2._precursor_df.drop(columns='sequence', inplace=True)

try:
    lib1.append(lib2)
except ValueError as exc:
    print(exc)
    error = True
else:
    error = False
assert error

The columns are not compatible. {'sequence'} are missing in the dataframe which should be appended.


In [15]:
# Reverse case: the receiving library lacks 'sequence' while the appended
# one still has it. The cell only checks that append() does not raise.
lib1, lib2 = spec_lib.copy(), spec_lib.copy()
lib1._precursor_df.drop(columns='sequence', inplace=True)
lib1.append(lib2)




In [16]:
# Test filter_fragment_number(): after filtering, no precursor may keep more
# fragments than its 'n_fragments_allowed' value.
repeat = 3
peptides = ['AGHCEWQMK']*repeat
mods = ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat
sites = ['0;4;8']*repeat
peptides += ['AGHCEWQMKAADER']*repeat
mods += ['']*repeat
sites += ['']*repeat

spec_lib = SpecLibBase()
spec_lib._precursor_df = pd.DataFrame({
    'sequence': peptides,
    'mods': mods,
    'mod_sites': sites
})
spec_lib._precursor_df['charge'] = 2
spec_lib.calc_precursor_mz()
spec_lib.calc_fragment_mz_df()
spec_lib._fragment_intensity_df = spec_lib._fragment_mz_df.copy()

# Seeded local generator: the test data is reproducible and the global
# numpy random state is left untouched.
rng = np.random.default_rng(42)
n_fragment_rows = len(spec_lib._fragment_intensity_df)

# add random intensity; the 0/1 mask zeroes out roughly half of the values
for col in spec_lib._fragment_intensity_df.columns:
    spec_lib._fragment_intensity_df[col] = rng.random(n_fragment_rows) * rng.integers(0, 2, n_fragment_rows)

# calculate fragment number
spec_lib.calc_fragment_count()

# set maximum number of fragments to random number in [0, 10)
spec_lib._precursor_df['n_fragments_allowed'] = rng.integers(0, 10, len(spec_lib._precursor_df))
spec_lib.filter_fragment_number()
spec_lib.calc_fragment_count()

assert np.all(spec_lib._precursor_df['n_fragments_allowed'].values >= spec_lib._precursor_df['n_fragments'].values)

In [30]:
from alphabase.peptide import fragment

# Build a library with a, b, x and y fragment types instead of the default set.
repeat = 3
peptides = ['AGHCEWQMK']*repeat + ['AGHCEWQMKAADER']*repeat
mods = ['Acetyl@Protein_N-term;Carbamidomethyl@C;Oxidation@M']*repeat + ['']*repeat
sites = ['0;4;8']*repeat + ['']*repeat

# NOTE(review): the second argument (2) is presumably the maximum fragment charge - confirm.
charged_frag_types = fragment.get_charged_frag_types(['a','b','x','y'], 2)
spec_lib = SpecLibBase(charged_frag_types=charged_frag_types)
spec_lib._precursor_df = pd.DataFrame({
    'sequence': peptides,
    'mods': mods,
    'mod_sites': sites,
}).assign(charge=2)
spec_lib.calc_precursor_mz()
spec_lib.calc_fragment_mz_df()

In [39]:
import tempfile

# Round-trip the a/b/x/y library through HDF and check how load_hdf() handles
# `infer_charged_frag_types`. The temporary directory is removed when the
# `with` block exits, so repeated runs do not leave files behind.
with tempfile.TemporaryDirectory() as tmp_folder:
    tmp_file = os.path.join(tmp_folder, 'test.hdf')
    spec_lib.save_hdf(tmp_file)

    # expects inference by default
    spec_lib_loaded = SpecLibBase()
    spec_lib_loaded.load_hdf(tmp_file)
    assert spec_lib_loaded.charged_frag_types == spec_lib.charged_frag_types
    assert np.all(spec_lib_loaded._fragment_mz_df.columns == spec_lib._fragment_mz_df.columns)

    # set charged_frag_types manually
    spec_lib_loaded = SpecLibBase(charged_frag_types=fragment.get_charged_frag_types(['a','b','x','y'],2))
    spec_lib_loaded.load_hdf(tmp_file, infer_charged_frag_types=False)
    assert spec_lib_loaded.charged_frag_types == spec_lib.charged_frag_types
    assert np.all(spec_lib_loaded._fragment_mz_df.columns == spec_lib._fragment_mz_df.columns)

    # enable inference
    spec_lib_loaded = SpecLibBase()
    spec_lib_loaded.load_hdf(tmp_file, infer_charged_frag_types=True)
    assert spec_lib_loaded.charged_frag_types == spec_lib.charged_frag_types
    assert np.all(spec_lib_loaded._fragment_mz_df.columns == spec_lib._fragment_mz_df.columns)

    # disable inference: the loaded library is expected to expose only the b/y columns
    spec_lib_loaded = SpecLibBase()
    spec_lib_loaded.load_hdf(tmp_file, infer_charged_frag_types=False)
    assert spec_lib_loaded.charged_frag_types == fragment.get_charged_frag_types(['b','y'],2)
    assert np.all(spec_lib_loaded._fragment_mz_df.columns == fragment.get_charged_frag_types(['b','y'],2))