In [1]:
import pandas as pd
import numpy as np
import os

import logging
logging.getLogger().setLevel(logging.INFO)

from alphabase.spectral_library.flat import SpecLibFlat
from alphabase.spectral_library.base import SpecLibBase
from alphabase.spectral_library.reader import SWATHLibraryReader
from alphadia.extraction.utils import reannotate_fragments
from alphabase.psm_reader import psm_reader_provider

INFO:root:
INFO:root:Cupy is not available


In [2]:
brunner_lib = '/Users/georgwallmann/Documents/data/brunner2022/brunner_1ng_lib.tsv'
insilico_lib_location = '/Users/georgwallmann/Documents/data/libraries/2022_111_brunner_2022_1ng_all_29_human_trypsin_diann_decoy.hdf'
output_location = '/Users/georgwallmann/Library/CloudStorage/OneDrive-Personal/Studium/MPI/AG Mann/alphaDIA/2022_12_18_performance_testing/out_lib_eg.hdf'

In [3]:
# create dense library from diann psm file
target_lib = SWATHLibraryReader()
target_lib.import_file(brunner_lib)
target_lib.precursor_df['decoy'] = 0
target_lib._fragment_mz_df = target_lib._fragment_mz_df[['b_z1', 'b_z2', 'y_z1', 'y_z2']]
target_lib._fragment_intensity_df = target_lib._fragment_intensity_df[['b_z1', 'b_z2', 'y_z1', 'y_z2']]
target_lib._precursor_df['elution_group_idx'] = np.arange(len(target_lib._precursor_df))

Index(['FileName', 'PrecursorMz', 'ProductMz', 'Tr_recalibrated',
       'IonMobility', 'transition_name', 'LibraryIntensity',
       'transition_group_id', 'decoy', 'PeptideSequence', 'Proteotypic',
       'QValue', 'PGQValue', 'Ms1ProfileCorr', 'ProteinGroup', 'ProteinName',
       'Genes', 'FullUniModPeptideName', 'ModifiedPeptide', 'PrecursorCharge',
       'PeptideGroupLabel', 'UniprotID', 'NTerm', 'CTerm', 'FragmentType',
       'FragmentCharge', 'FragmentSeriesNumber', 'FragmentLossType',
       'ExcludeFromAssay'],
      dtype='object')


In [4]:
print(target_lib.precursor_df)

                        sequence  charge         rt  mobility     proteins  \
0                    AAAAAAALQAK       2 -15.465085  0.856250    RL4_HUMAN   
1                AAAAALSQQQSLQER       2  -3.104374  1.030682  CATIN_HUMAN   
2              AAAAASAAGPGGLVAGK       2   2.835199  0.943333  U119B_HUMAN   
3              AAAAAWEEPSSGNGTAR       2   2.707207  1.039318   RCC2_HUMAN   
4      AAAASAAEAGIATTGTEDSDDALLK       3  38.684788  1.000227  PSMD4_HUMAN   
...                          ...     ...        ...       ...          ...   
27878                YYTVFDRDNNR       3  13.692140  0.745104   CATD_HUMAN   
27879                   YYVLNALK       2  46.346928  0.849271   PRP8_HUMAN   
27880               YYVTIIDAPGHR       2  30.804520  1.032708  EF1A1_HUMAN   
27881               YYVTIIDAPGHR       3  30.787134  0.792500  EF1A1_HUMAN   
27882                   YYYIPQYK       2  35.303143  0.908542  NDUF2_HUMAN   

      mods mod_sites  nAA  frag_start_idx  frag_stop_idx   rt_n

In [5]:
decoy_lib = SpecLibBase()
decoy_lib._precursor_df = target_lib.precursor_df.copy()
decoy_lib._fragment_mz_df = target_lib._fragment_mz_df.copy()
decoy_lib._fragment_intensity_df = target_lib._fragment_intensity_df.copy()

#decoy_lib.import_file(brunner_lib)
decoy_lib.decoy = 'diann'
decoy_lib.append_decoy_sequence()
decoy_lib.calc_precursor_mz()
decoy_lib.precursor_df = decoy_lib.precursor_df[decoy_lib.precursor_df['decoy'] == 1]

In [6]:
from peptdeep.pretrained_models import ModelManager
from alphabase.peptide.fragment import get_charged_frag_types

frag_types = get_charged_frag_types(
    ['b','y'], 
    2
)

model_mgr = ModelManager()

model_mgr.nce = 30
model_mgr.instrument = 'timsTOF'

In [7]:
res = model_mgr.predict_all(
    decoy_lib.precursor_df,
    predict_items=['ms2'],
    frag_types = frag_types,
)

2023-01-25 14:51:46> Using multiprocessing with 8 processes ...
2023-01-25 14:51:46> Predicting ms2 ...


100%|██████████| 24/24 [00:08<00:00,  2.96it/s]


In [8]:
decoy_lib._precursor_df = res['precursor_df']
decoy_lib._fragment_mz_df = res['fragment_mz_df']
decoy_lib._fragment_intensity_df = res['fragment_intensity_df']

In [9]:
decoy_lib._precursor_df[['frag_start_idx']] += len(target_lib._fragment_mz_df)
decoy_lib._precursor_df[['frag_stop_idx']] += len(target_lib._fragment_mz_df)

In [10]:
output_lib = SpecLibBase()

output_lib._precursor_df = pd.concat([target_lib._precursor_df, decoy_lib._precursor_df], join='inner')
output_lib._fragment_mz_df = pd.concat([target_lib._fragment_mz_df, decoy_lib._fragment_mz_df], join='inner')
output_lib._fragment_intensity_df = pd.concat([target_lib._fragment_intensity_df, decoy_lib._fragment_intensity_df], join='inner')

In [11]:
output_lib._precursor_df = output_lib._precursor_df.sort_values(by=['elution_group_idx']).reset_index(drop=True)
output_lib.precursor_df['precursor_idx'] = np.arange(len(output_lib.precursor_df))

In [21]:
output_lib.precursor_df

from alphabase.constants.isotope import IsotopeDistribution
isotope_dist = IsotopeDistribution()

output_lib.precursor_df[['sequence','mods']].iloc[0]

sequence    AAAAAAALQAK
mods                   
Name: 0, dtype: object

In [29]:
output_lib.precursor_df.sort_values(by=['nAA'], ascending=False).iloc[0]

sequence             DMAIATGGAVFGEEGLTLNLEDVQPHDLGK
charge                                            4
rt                                        92.312187
mobility                                    1.01125
proteins                                 CH60_HUMAN
mods                                               
mod_sites                                          
nAA                                              30
frag_start_idx                                41205
frag_stop_idx                                 41234
rt_norm                                    0.684538
precursor_mz                             775.134122
ccs                                      813.659782
decoy                                             0
elution_group_idx                              3553
precursor_idx                                  7095
Name: 7095, dtype: object

In [42]:
max_isotope = 8



In [47]:
from tqdm import tqdm
from alphabase.peptide.precursor import get_mod_seq_formula

def calc_isotope_dist(precursor_df, max_isotope = 6, min_intensity = 0.001):

    col_names = ['i_{}'.format(i) for i in range(max_isotope)]
    output_lib.precursor_df[col_names] = 0.

    for i in tqdm(range(len(output_lib.precursor_df))):

        row = output_lib.precursor_df.iloc[i]
        dist, mono = isotope_dist.calc_formula_distribution(
            get_mod_seq_formula(row['sequence'], row['mods'])
        )
        dist[dist < 0.001] = 0.
        dist = dist / dist.sum()
        output_lib.precursor_df.loc[i, col_names] = dist[:max_isotope]

In [48]:
calc_isotope_dist(output_lib.precursor_df)
print(output_lib.precursor_df[['sequence','mods']+col_names])

100%|██████████| 55681/55681 [00:20<00:00, 2738.90it/s]

                sequence mods       i_0       i_1       i_2       i_3  \
0            AAAAAAALQAK       0.591407  0.297877  0.088130  0.019204   
1            ALAAAAALQLK       0.553814  0.315646  0.102037  0.023983   
2        AAAAALSQQQSLQER       0.433274  0.343916  0.155286  0.051035   
3        ALAAALSQQQSLQDR       0.423885  0.345828  0.159408  0.053328   
4      AAAAASAAGPGGLVAGK       0.480321  0.336298  0.133837  0.038759   
...                  ...  ...       ...       ...       ...       ...   
55676       YYVTIIDAPGHR       0.443512  0.347062  0.149550  0.046192   
55677       YYVTIIDAPGHR       0.443512  0.347062  0.149550  0.046192   
55678       YSVTIIDAPGSR       0.491233  0.332846  0.129275  0.036661   
55679           YSYIPQSK       0.565667  0.309193  0.098054  0.022809   
55680           YYYIPQYK       0.496425  0.336233  0.125411  0.033534   

            i_4       i_5  i_6  i_7  
0      0.003382  0.000000  0.0  0.0  
1      0.004519  0.000000  0.0  0.0  
2      0.




In [49]:
output_lib.calc_precursor_isotope()
output_lib._precursor_df.drop(columns=[
    'isotope_m1_intensity', 
    'isotope_apex_intensity', 
    'isotope_right_most_intensity',
    'isotope_right_most_offset',
    'isotope_m1_mz',
    'isotope_apex_mz',
    'isotope_right_most_mz'
    ], inplace=True)

In [50]:
output_lib._precursor_df['decoy'].value_counts()

0    27883
1    27798
Name: decoy, dtype: int64

In [51]:
output_lib.save_hdf(output_location)

In [52]:
def check_fragment_coverage(speclib):
    nAA = speclib.precursor_df['nAA'].values
    frag_end = speclib.precursor_df['frag_end_idx'].values
    frag_start = speclib.precursor_df['frag_start_idx'].values
    frag_num = frag_end - frag_start
    aa_count = np.sum(nAA - 1)
    frag_count = np.sum(frag_num)
    print(aa_count, frag_count)