In [1]:
# Development convenience: (re)load IPython's autoreload extension and switch
# it to mode 2 so that edits to imported modules are picked up without
# restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Building a spectral library from search-engine results instead of predictions

AlphaPept is used as the example search engine:

### 1. Load PSMs from AlphaPept

In [2]:
import os

# AlphaPept result file (`*.ms_data.hdf`). The leading '~' is expanded to the
# current user's home directory. The same path is later passed both to the PSM
# reader and to the fragment matcher, and the tsv output path is derived from it.
ap_ms_data_hdf = os.path.expanduser(
    '~/Workspace/Data/HeLa_500ng/HeLa_DDA_tims.ms_data.hdf'
)

In [3]:
from alphabase.io.psm_reader.alphapept_reader import AlphaPeptReader

# Modification name pairs handed to the reader. The left-hand names
# (e.g. 'Carbamidomethyl@C') are the ones that show up in the `mods` column of
# the resulting PSM table; the right-hand strings are presumably the notation
# used inside the AlphaPept result file -- confirm against the AlphaPept docs.
modification_mapping = dict([
    ('Carbamidomethyl@C', 'cC'),
    ('Oxidation@M', 'oxM'),
    ('Phospho@S', 'pS'),
    ('Phospho@T', 'pT'),
    ('Phospho@Y', 'pY'),
    ('Acetyl@Protein N-term', 'a'),
])

# Read the PSMs from the AlphaPept HDF file using the mapping above.
reader = AlphaPeptReader(
    modification_mapping=modification_mapping,
)
reader.import_file(ap_ms_data_hdf)

# Last expression of the cell: display the imported PSM table.
reader.psm_df

Unnamed: 0,rt,spec_idx,query_id,mobility,score,precursor_mz,charge,raw_name,fdr,decoy,sequence,mods,mod_sites,nAA,rt_norm,ccs
0,12.402511,27910,5573,1.087069,0.960469,716.432733,1,HeLa_DDA_tims,0.002481,0,NLGTIAK,,,7,0.606578,221.898442
1,7.155962,10550,13546,0.819828,0.922066,474.731523,2,HeLa_DDA_tims,0.006084,0,FYEQFSK,,,7,0.349982,333.146918
2,5.523916,6537,10518,0.806897,0.959866,447.213808,2,HeLa_DDA_tims,0.002481,0,LCDFNPK,Carbamidomethyl@C,2,7,0.270162,328.181079
3,5.407323,6140,10555,0.803664,0.915675,447.254210,2,HeLa_DDA_tims,0.006558,0,NIHPWVK,,,7,0.264460,326.865803
4,5.485055,6376,9621,0.754095,0.950975,437.711773,2,HeLa_DDA_tims,0.003260,0,LDNNWGR,,,7,0.268261,306.806590
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18901,19.125378,50817,130546,0.989009,0.976114,943.805461,3,HeLa_DDA_tims,0.001371,0,EGTDSSQGIPQLVSNISACQVIAEAVR,Carbamidomethyl@C,19,27,0.935378,597.076939
18902,13.218696,30646,130497,1.037500,0.994264,940.768646,3,HeLa_DDA_tims,0.000217,0,LESPSFTGTGDTEIAHATEDLENNGSK,,,27,0.646496,626.361693
18903,15.317208,38291,130486,0.998707,0.996083,939.802123,3,HeLa_DDA_tims,0.000076,0,HVGPGVLSMANAGPNTNGSQFFICTIK,Carbamidomethyl@C,24,27,0.749130,602.944485
18904,17.881807,46954,131310,1.136638,0.984757,984.830810,3,HeLa_DDA_tims,0.000905,0,LDLWNLNNDTEVPTASISVEGNPALNR,,,27,0.874558,686.062638


### 2. Extract fragment intensities from the spectrum file

In [8]:
from peptdeep.rescore.feature_extractor import match_one_raw

# Matching options, collected in one place so they are easy to tune:
# the spectra come from the same AlphaPept HDF file as the PSMs, singly and
# doubly charged b/y ions are matched, and the tolerance is 50.0 with
# `ms2_ppm=True` (presumably 50 ppm -- confirm in the peptdeep docs).
match_options = dict(
    ms2_file=ap_ms_data_hdf,
    ms2_file_type='alphapept',
    frag_types_to_match=['b_z1', 'b_z2', 'y_z1', 'y_z2'],
    ms2_ppm=True,
    ms2_tol=50.0,
    calibrate_frag_mass_error=False,
)

# The call returns four data frames: the PSM table plus fragment m/z,
# fragment intensity and fragment mass-error tables.
(
    psm_df,
    frag_mz_df,
    frag_intensity_df,
    frag_merr_df,
) = match_one_raw(reader.psm_df, **match_options)

# Display the PSM table; compared with `reader.psm_df` it now shows the
# `frag_start_idx` / `frag_end_idx` columns.
psm_df

Unnamed: 0,rt,spec_idx,query_id,mobility,score,precursor_mz,charge,raw_name,fdr,decoy,sequence,mods,mod_sites,nAA,rt_norm,ccs,frag_start_idx,frag_end_idx
0,12.402511,27910,5573,1.087069,0.960469,716.432733,1,HeLa_DDA_tims,0.002481,0,NLGTIAK,,,7,0.606578,221.898442,0,6
1,7.155962,10550,13546,0.819828,0.922066,474.731523,2,HeLa_DDA_tims,0.006084,0,FYEQFSK,,,7,0.349982,333.146918,6,12
2,5.523916,6537,10518,0.806897,0.959866,447.213808,2,HeLa_DDA_tims,0.002481,0,LCDFNPK,Carbamidomethyl@C,2,7,0.270162,328.181079,12,18
3,5.407323,6140,10555,0.803664,0.915675,447.254210,2,HeLa_DDA_tims,0.006558,0,NIHPWVK,,,7,0.264460,326.865803,18,24
4,5.485055,6376,9621,0.754095,0.950975,437.711773,2,HeLa_DDA_tims,0.003260,0,LDNNWGR,,,7,0.268261,306.806590,24,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18901,19.125378,50817,130546,0.989009,0.976114,943.805461,3,HeLa_DDA_tims,0.001371,0,EGTDSSQGIPQLVSNISACQVIAEAVR,Carbamidomethyl@C,19,27,0.935378,597.076939,261320,261346
18902,13.218696,30646,130497,1.037500,0.994264,940.768646,3,HeLa_DDA_tims,0.000217,0,LESPSFTGTGDTEIAHATEDLENNGSK,,,27,0.646496,626.361693,261346,261372
18903,15.317208,38291,130486,0.998707,0.996083,939.802123,3,HeLa_DDA_tims,0.000076,0,HVGPGVLSMANAGPNTNGSQFFICTIK,Carbamidomethyl@C,24,27,0.749130,602.944485,261372,261398
18904,17.881807,46954,131310,1.136638,0.984757,984.830810,3,HeLa_DDA_tims,0.000905,0,LDLWNLNNDTEVPTASISVEGNPALNR,,,27,0.874558,686.062638,261398,261424


#### Normalize the fragment intensities?

In [13]:
# Optional step, currently disabled: the lines below would sort the PSMs by
# `frag_start_idx` and pass `psm_df` / `frag_intensity_df` through
# `normalize_training_intensities`.
# NOTE(review): confirm that this helper still exists with this signature in
# the installed peptdeep version before uncommenting.

# from peptdeep.model.ms2 import normalize_training_intensities

# psm_df.sort_values('frag_start_idx', inplace=True)
# psm_df, frag_intensity_df = normalize_training_intensities(
#     psm_df, frag_intensity_df
# )
# frag_intensity_df

### 3. Generate the library for DIA-NN

In [14]:
from peptdeep.protein.fasta import FastaLib

# Instantiate a FastaLib with default arguments. In this notebook it only
# serves as the library object that is filled with the matched data frames
# and then handed to `translate_to_tsv`.
fasta_lib = FastaLib()

In [16]:
# Inject the search-engine PSMs and the matched fragment tables into the
# library object.
# NOTE(review): these are underscore-prefixed (private) attributes; presumably
# they are set directly because no public setter covers all three -- confirm
# against the installed peptdeep/alphabase version.
fasta_lib._precursor_df = psm_df
fasta_lib._fragment_mz_df = frag_mz_df
fasta_lib._fragment_intensity_df = frag_intensity_df

In [18]:
from peptdeep.spec_lib.translate import translate_to_tsv

# Derive the output path from the input path by swapping the trailing
# 'ms_data.hdf' for 'tsv' (e.g. '...HeLa_DDA_tims.ms_data.hdf' ->
# '...HeLa_DDA_tims.tsv').
tsv_path = ap_ms_data_hdf[:-len('ms_data.hdf')] + 'tsv'

# Export the library as a tsv file. `keep_k_highest_fragments=1000` is meant
# to keep all matched fragments of each precursor.
translate_to_tsv(
    fasta_lib,
    tsv=tsv_path,
    min_frag_intensity=0.001,
    keep_k_highest_fragments=1000,
)

100%|██████████| 1/1 [00:20<00:00, 20.76s/it]


Translation finished, it will take several minutes to export the rest precursors to the tsv file...
