In [1]:
import pandas as pd
import numpy as np
import os

import logging
logging.getLogger().setLevel(logging.INFO)

from alphabase.spectral_library.flat import SpecLibFlat
from alphabase.spectral_library.base import SpecLibBase
from alphabase.spectral_library.reader import SWATHLibraryReader
from alphadia.extraction.utils import reannotate_fragments
from alphabase.psm_reader import psm_reader_provider

INFO:root:
INFO:root:Cupy is not available


In [5]:
reader = psm_reader_provider.get_reader('maxquant', rt_unit='minute', keep_decoy=True, fdr=1)
reader.import_file('/Users/georgwallmann/Documents/data/raw_data/synchro_PASEF_4prot/dda_psms_maxquant/evidence.txt')

from peptdeep.pretrained_models import ModelManager
from alphabase.peptide.fragment import get_charged_frag_types

frag_types = get_charged_frag_types(
    ['b','y'], 
    2
)

model_mgr = ModelManager()

model_mgr.nce = 30
model_mgr.instrument = 'timsTOF'

res = model_mgr.predict_all(
    reader.psm_df,
    predict_items=['ms2'],
    frag_types = frag_types,
)

2023-02-01 11:14:28> Predicting MS2 ...


100%|██████████| 19/19 [00:00<00:00, 63.62it/s]


In [12]:
target_lib = SpecLibBase()
target_lib._precursor_df = res['precursor_df']
target_lib._fragment_intensity_df = res['fragment_intensity_df']
target_lib._fragment_mz_df = res['fragment_mz_df']

In [14]:
# create dense library from diann psm file
target_lib._precursor_df['elution_group_idx'] = np.arange(len(target_lib._precursor_df))

In [15]:
decoy_lib = SpecLibBase()
decoy_lib._precursor_df = target_lib.precursor_df.copy()
decoy_lib._fragment_mz_df = target_lib._fragment_mz_df.copy()
decoy_lib._fragment_intensity_df = target_lib._fragment_intensity_df.copy()

#decoy_lib.import_file(brunner_lib)
decoy_lib.decoy = 'diann'
decoy_lib.append_decoy_sequence()
decoy_lib.calc_precursor_mz()
decoy_lib.precursor_df = decoy_lib.precursor_df[decoy_lib.precursor_df['decoy'] == 1]

In [16]:
from peptdeep.pretrained_models import ModelManager
from alphabase.peptide.fragment import get_charged_frag_types

frag_types = get_charged_frag_types(
    ['b','y'], 
    2
)

model_mgr = ModelManager()

model_mgr.nce = 30
model_mgr.instrument = 'timsTOF'

In [17]:
res = model_mgr.predict_all(
    decoy_lib.precursor_df,
    predict_items=['ms2'],
    frag_types = frag_types,
)

2023-02-01 11:16:48> Predicting MS2 ...


100%|██████████| 19/19 [00:00<00:00, 69.72it/s]


In [18]:
decoy_lib._precursor_df = res['precursor_df']
decoy_lib._fragment_mz_df = res['fragment_mz_df']
decoy_lib._fragment_intensity_df = res['fragment_intensity_df']

In [19]:
decoy_lib._precursor_df[['frag_start_idx']] += len(target_lib._fragment_mz_df)
decoy_lib._precursor_df[['frag_stop_idx']] += len(target_lib._fragment_mz_df)

In [20]:
output_lib = SpecLibBase()

output_lib._precursor_df = pd.concat([target_lib._precursor_df, decoy_lib._precursor_df], join='inner')
output_lib._fragment_mz_df = pd.concat([target_lib._fragment_mz_df, decoy_lib._fragment_mz_df], join='inner')
output_lib._fragment_intensity_df = pd.concat([target_lib._fragment_intensity_df, decoy_lib._fragment_intensity_df], join='inner')

In [21]:
output_lib._precursor_df = output_lib._precursor_df.sort_values(by=['elution_group_idx']).reset_index(drop=True)
output_lib.precursor_df['precursor_idx'] = np.arange(len(output_lib.precursor_df))

In [22]:
output_lib.precursor_df

from alphabase.constants.isotope import IsotopeDistribution
isotope_dist = IsotopeDistribution()

output_lib.precursor_df[['sequence','mods']].iloc[0]

sequence    DYSQYYR
mods               
Name: 0, dtype: object

In [23]:
output_lib.precursor_df.sort_values(by=['nAA'], ascending=False).iloc[0]

sequence                                  LVSHSLLVTLASHLPSDFTPAVHASLEK
charge                                                               4
rt                                                              18.173
ccs                                                         807.139316
mobility                                                      1.002955
scan_num                                                         26754
raw_name             20220923_TIMS03_PaSk_SA_4prot_HeLa_Evo05_21min...
precursor_mz                                                743.157828
score                                                           31.674
proteins                                                   CON__P01966
decoy                                                                1
intensity                                                      80869.0
spec_idx                                                         26753
mods                                                                  
mod_si

In [42]:
max_isotope = 8



In [24]:
from tqdm import tqdm
from alphabase.peptide.precursor import get_mod_seq_formula

def calc_isotope_dist(precursor_df, max_isotope = 6, min_intensity = 0.001):

    col_names = ['i_{}'.format(i) for i in range(max_isotope)]
    output_lib.precursor_df[col_names] = 0.

    for i in tqdm(range(len(output_lib.precursor_df))):

        row = output_lib.precursor_df.iloc[i]
        dist, mono = isotope_dist.calc_formula_distribution(
            get_mod_seq_formula(row['sequence'], row['mods'])
        )
        dist[dist < 0.001] = 0.
        dist = dist / dist.sum()
        output_lib.precursor_df.loc[i, col_names] = dist[:max_isotope]

In [25]:
calc_isotope_dist(output_lib.precursor_df)

100%|██████████| 527/527 [00:00<00:00, 2469.02it/s]


NameError: name 'col_names' is not defined

In [49]:
output_lib.calc_precursor_isotope()
output_lib._precursor_df.drop(columns=[
    'isotope_m1_intensity', 
    'isotope_apex_intensity', 
    'isotope_right_most_intensity',
    'isotope_right_most_offset',
    'isotope_m1_mz',
    'isotope_apex_mz',
    'isotope_right_most_mz'
    ], inplace=True)

In [29]:
output_lib.fragment_intensity_df

Unnamed: 0,b_z1,b_z2,y_z1,y_z2
0,0.000000,0.0,0.000000,0.0
1,0.071987,0.0,1.000000,0.0
2,0.026730,0.0,0.356210,0.0
3,0.000000,0.0,0.554461,0.0
4,0.000000,0.0,0.190057,0.0
...,...,...,...,...
3022,0.000000,0.0,0.452760,0.0
3023,0.000000,0.0,0.256635,0.0
3024,0.000000,0.0,0.063062,0.0
3025,0.000000,0.0,0.056902,0.0


In [31]:
output_location = '/Users/georgwallmann/Documents/data/raw_data/synchro_PASEF_4prot/dda_psms_maxquant/alpha_lib.hdf'

output_lib.save_hdf(output_location)

In [52]:
def check_fragment_coverage(speclib):
    nAA = speclib.precursor_df['nAA'].values
    frag_end = speclib.precursor_df['frag_end_idx'].values
    frag_start = speclib.precursor_df['frag_start_idx'].values
    frag_num = frag_end - frag_start
    aa_count = np.sum(nAA - 1)
    frag_count = np.sum(frag_num)
    print(aa_count, frag_count)