In [None]:
#---#| default_exp spectral_library.base

# Base Class for Spectral Libraries

In [None]:
from alphabase.spectral_library.base import *

### Testing

In [None]:
import os

import numpy as np
import pandas as pd

In [None]:
#| hide
# Clip a four-precursor library by calling `clip_by_precursor_mz_`
# without arguments; the check below expects only the 100 m/z entry
# to be dropped.
mz_values = [100, 1000, 1500, 2000]
lib = SpecLibBase([])
lib._precursor_df = pd.DataFrame({'precursor_mz': mz_values, 'charge': 2})
lib.clip_by_precursor_mz_()
kept_mz = lib.precursor_df.precursor_mz.values
assert np.allclose(kept_mz, mz_values[1:])

In [None]:

# Build a small precursor table: one modified and one unmodified peptide,
# each repeated `repeat` times.
repeat = 3
peptides = ['AGHCEWQMK']*repeat
mods = ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat
sites = ['0;4;8']*repeat
peptides += ['AGHCEWQMKAADER']*repeat
mods += ['']*repeat
sites += ['']*repeat

precursor_df = pd.DataFrame({
    'sequence': peptides,
    'mods': mods,
    'mod_sites': sites
})
precursor_df['nAA'] = precursor_df['sequence'].str.len()
precursor_df['charge'] = 2

# Target library with empty fragment tables, so that only the precursor
# table carries data through the save/load round trip.
target_lib = SpecLibBase(
    ['b_z1','b_z2','y_z1','y_z2'],
    decoy='pseudo_reverse'
)
target_lib._precursor_df = precursor_df
target_lib.calc_precursor_mz()
target_lib._fragment_mz_df = pd.DataFrame()
target_lib._fragment_intensity_df = pd.DataFrame()

# Write the library, plus an extra `protein_df` table, to the sandbox.
sandbox_dir = 'sandbox'
hdf_path = 'sandbox/test_lib.hdf'
# `exist_ok=True` replaces the separate isdir() check.
os.makedirs(sandbox_dir, exist_ok=True)
target_lib.save_hdf(hdf_path)
target_lib.save_df_to_hdf(hdf_path, 'protein_df', pd.DataFrame(
    {
        'id': [1,2],
        'full_name': [1,2],
        'description': [1,2],
        'sequence': [1,2]
    })
)

# Default load: precursors come back, fragment tables stay empty, and the
# 'sequence' column is absent while 'mod_seq_hash' is present.
new_lib = SpecLibBase([])
new_lib.load_hdf(hdf_path)

assert len(new_lib.precursor_df) > 0
assert len(new_lib.fragment_mz_df) == 0
assert len(new_lib.fragment_intensity_df) == 0

assert 'sequence' not in new_lib.precursor_df.columns
assert 'mod_seq_hash' in new_lib.precursor_df.columns

# Loading with `load_mod_seq=True` additionally brings back 'sequence'.
new_lib = SpecLibBase([])
new_lib.load_hdf(hdf_path, load_mod_seq=True)
assert 'sequence' in new_lib.precursor_df.columns
assert 'mod_seq_hash' in new_lib.precursor_df.columns

# Individual tables can also be read back by key.
df = target_lib.load_df_from_hdf(hdf_path, 'precursor_df')
assert len(precursor_df)==len(df)
df = target_lib.load_df_from_hdf(hdf_path, 'protein_df')
assert len(df)==2

# Cleanup is currently disabled, so the HDF file stays in sandbox/
# after this cell has run.
#os.remove('sandbox/test_lib.hdf')
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz,mod_seq_hash,mod_seq_charge_hash
0,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-5783464648586361190,-5783464648586361188
1,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-5783464648586361190,-5783464648586361188
2,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-5783464648586361190,-5783464648586361188
3,AGHCEWQMKAADER,,,14,2,816.356299,-1606275412423975023,-1606275412423975021
4,AGHCEWQMKAADER,,,14,2,816.356299,-1606275412423975023,-1606275412423975021
5,AGHCEWQMKAADER,,,14,2,816.356299,-1606275412423975023,-1606275412423975021


In [None]:
# Add decoy precursors to the target library; the check expects the
# library's precursor table to end up exactly twice as long as the
# original `precursor_df`.
target_lib.append_decoy_sequence()
expected_rows = 2 * len(precursor_df)
assert len(target_lib.precursor_df) == expected_rows
target_lib.precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz,mod_seq_hash,mod_seq_charge_hash,decoy
0,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-5783464648586361190,-5783464648586361188,0
1,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-5783464648586361190,-5783464648586361188,0
2,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-5783464648586361190,-5783464648586361188,0
3,MQWECHGAK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-5783464648586361190,-5783464648586361188,1
4,MQWECHGAK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-5783464648586361190,-5783464648586361188,1
5,MQWECHGAK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-5783464648586361190,-5783464648586361188,1
6,AGHCEWQMKAADER,,,14,2,816.356299,-1606275412423975023,-1606275412423975021,0
7,AGHCEWQMKAADER,,,14,2,816.356299,-1606275412423975023,-1606275412423975021,0
8,AGHCEWQMKAADER,,,14,2,816.356299,-1606275412423975023,-1606275412423975021,0
9,EDAAKMQWECHGAR,,,14,2,816.356299,-1606275412423975023,-1606275412423975021,1


In [None]:
# Compute the fragment m/z table, then check that it has a 'b_z1' column
# and that its row count equals the sum of (nAA - 1) over all precursors.
target_lib.calc_fragment_mz_df()
frag_mz = target_lib.fragment_mz_df
rows_per_precursor = target_lib.precursor_df.nAA - 1
assert 'b_z1' in frag_mz
assert len(frag_mz) == np.sum(rows_per_precursor)
target_lib.fragment_mz_df

Unnamed: 0,b_z1,b_z2,y_z1,y_z2
0,114.054955,57.531116,1091.439712,546.223494
1,171.076419,86.041848,1034.418248,517.712762
2,308.135331,154.571303,897.359336,449.183306
3,468.165979,234.586628,737.328687,369.167982
4,597.208572,299.107924,608.286094,304.646685
...,...,...,...,...
121,1089.466972,545.237124,543.245626,272.126451
122,1192.476157,596.741717,440.236442,220.621859
123,1329.535069,665.271173,303.177530,152.092403
124,1386.556532,693.781904,246.156066,123.581671


In [None]:

# Build a small precursor table: one modified and one unmodified peptide,
# each repeated `repeat` times.
repeat = 3
peptides = ['AGHCEWQMK']*repeat
mods = ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat
sites = ['0;4;8']*repeat
peptides += ['AGHCEWQMKAADER']*repeat
mods += ['']*repeat
sites += ['']*repeat

precursor_df = pd.DataFrame({
    'sequence': peptides,
    'mods': mods,
    'mod_sites': sites
})
precursor_df['nAA'] = precursor_df['sequence'].str.len()
precursor_df['charge'] = 2

# Library that starts without fragments and gets annotated below.
empty_lib = SpecLibBase(
    ['b_z1','b_z2','y_z1','y_z2'],
    decoy='pseudo_reverse'
)
empty_lib._precursor_df = precursor_df
empty_lib.calc_precursor_mz()
empty_lib.append_decoy_sequence()


# annotate only fragment mz
fragment_lib = SpecLibBase()
fragment_lib._precursor_df = empty_lib.precursor_df.copy()
fragment_lib.calc_fragment_mz_df()

# Keep only a subset of the precursors. The seed is fixed so that every
# run of the notebook exercises the same four precursors.
sample_seed = 0
empty_lib._precursor_df = empty_lib._precursor_df.sample(
    4, random_state=sample_seed
)
empty_lib.annotate_fragments_from_speclib(fragment_lib, verbose=True)

# The check expects `remove_unused_fragments` to shrink the fragment table
# after annotating the 4-precursor subset.
size_before = len(empty_lib.fragment_mz_df)
empty_lib.remove_unused_fragments()
assert size_before > len(empty_lib.fragment_mz_df), \
    'remove_unused_fragments did not shrink fragment_mz_df'

# annotate both fragment mz and fragment intensity
# (all-zero intensity table with the same columns as the fragment m/z table)
fragment_lib._fragment_intensity_df = pd.DataFrame(
    0,
    index=np.arange(len(fragment_lib.fragment_mz_df)),
    columns=fragment_lib.fragment_mz_df.columns
)

empty_lib.annotate_fragments_from_speclib(fragment_lib, verbose=True)
size_before_mz = len(empty_lib.fragment_mz_df)
size_before_intensity = len(empty_lib.fragment_intensity_df)

empty_lib.remove_unused_fragments()
assert size_before_mz > len(empty_lib.fragment_mz_df), \
    'remove_unused_fragments did not shrink fragment_mz_df'
assert size_before_intensity > len(empty_lib.fragment_intensity_df), \
    'remove_unused_fragments did not shrink fragment_intensity_df'

2022-12-21 13:45:10> Speclib with 4 precursors will be reannotated with speclib with 12 precursors and 504 fragments
2022-12-21 13:45:11> A total of 4 precursors were succesfully annotated, 0 precursors were not matched
2022-12-21 13:45:11> Speclib with 4 precursors will be reannotated with speclib with 12 precursors and 504 fragments
2022-12-21 13:45:11> A total of 4 precursors were succesfully annotated, 0 precursors were not matched


In [None]:

# Build a small precursor table: one modified and one unmodified peptide,
# each repeated `repeat` times.
repeat = 3
peptides = ['AGHCEWQMK']*repeat
mods = ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat
sites = ['0;4;8']*repeat
peptides += ['AGHCEWQMKAADER']*repeat
mods += ['']*repeat
sites += ['']*repeat

precursor_df = pd.DataFrame({
    'sequence': peptides,
    'mods': mods,
    'mod_sites': sites
})
precursor_df['nAA'] = precursor_df['sequence'].str.len()
precursor_df['charge'] = 2

# Library that starts without fragments and gets annotated below.
empty_lib = SpecLibBase(
    ['b_z1','b_z2','y_z1','y_z2'],
    decoy='pseudo_reverse'
)
empty_lib._precursor_df = precursor_df
empty_lib.calc_precursor_mz()
empty_lib.append_decoy_sequence()

# Give the fragment library its own copy of the precursor table so that
# whatever `calc_fragment_mz_df` does to that table is not shared with
# `empty_lib` before the annotation step runs.
fragment_lib = SpecLibBase()
fragment_lib._precursor_df = empty_lib.precursor_df.copy()
fragment_lib.calc_fragment_mz_df()

# Module-level function form of the annotation; its return value replaces
# `empty_lib`.
empty_lib = annotate_fragments_from_speclib(empty_lib, fragment_lib)

2022-12-21 13:45:11> Speclib with 12 precursors will be reannotated with speclib with 12 precursors and 504 fragments
2022-12-21 13:45:11> A total of 12 precursors were succesfully annotated, 0 precursors were not matched
