In [1]:
#---#| default_exp psm_reader.sage_reader

In [2]:
#| hide
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np

In [3]:
from alphabase.psm_reader import psm_reader_provider
from alphabase.psm_reader.sage_reader import (
    SageModificationTranslator,
    _sage_spec_idx_from_scan_nr,
    _match_modified_sequence,
    _get_annotated_mod_df,
    _lookup_modification,
    register_readers,
)
# Called once up front; presumably this is what makes the 'sage_tsv' reader
# available through psm_reader_provider.get_reader later in the notebook — TODO confirm.
register_readers()

In [4]:
#| hide
# The scan number embedded in the native ID string (7846) is expected to come
# back as 7845, i.e. one less than the value after "scan=".
native_scan_id = 'controllerType=0 controllerNumber=1 scan=7846'
assert _sage_spec_idx_from_scan_nr(native_scan_id) == 7845

In [5]:
# Each case pairs a Sage-style modified sequence with the matches expected from
# _match_modified_sequence. Judging by the cases below, a match tuple holds the
# bracketed tag, the residue preceding it ('' for the "-" terminal notation),
# an N-term flag, a C-term flag and the numeric mass shift.
signature_cases = [
    ('[-100.0]-PEPTIDE', [('[-100.0]', '', True, False, -100)]),
    ('PEPTIDE-[-100.0]', [('[-100.0]', '', False, True, -100)]),
    ('PEPTIDE[-100.0]', [('[-100.0]', 'E', False, False, -100)]),
    ('P[-100.0]EPTIDE', [('[-100.0]', 'P', False, False, -100)]),
    ('PEPT[-100.0]IDE', [('[-100.0]', 'T', False, False, -100)]),
    (
        'PE[-100.0]PTIDE[-100.0]P',
        [('[-100.0]', 'E', False, False, -100), ('[-100.0]', 'E', False, False, -100)],
    ),
]
case_sequences, case_signatures = zip(*signature_cases)

test_df = pd.DataFrame({
    'modified_sequence': list(case_sequences),
    'expected_signature': list(case_signatures),
})

# Run the matcher on every sequence and keep the result next to the expectation.
test_df['observed_signature'] = test_df['modified_sequence'].apply(_match_modified_sequence)

assert test_df['observed_signature'].equals(test_df['expected_signature'])

In [6]:
# Annotated modification table; reused by the lookup check in the next cell.
mod_annotated_df = _get_annotated_mod_df()

# Compare plain lists rather than `all(columns == [...])`: the element-wise
# comparison raises ValueError ("Lengths must match to compare") when the number
# of columns differs, which hides the actual mismatch behind an unrelated error.
expected_columns = ['mass','previous_aa','is_nterm','is_cterm','unimod_id','localizer_rank']
assert mod_annotated_df.columns.tolist() == expected_columns

In [7]:
# A mass shift of 15.99490 following residue 'M' should resolve to Oxidation@M.
looked_up_name = _lookup_modification(15.99490, 'M', mod_annotated_df)
assert looked_up_name == 'Oxidation@M'

In [8]:
# Four Sage-style sequences. Only three rows are expected back from the
# translator (see the expectations below), so the '[+1337.0]' row is expected
# to be absent from the result.
df = pd.DataFrame({
    'modified_sequence': [
        '[+114.04293]-MAGTK[+114.04293]',
        '[+114.04293]-MAGTK[+114.04293]',
        '[+114.04293]-M[+15.9949]K[+42.010567]LLAR',
        '[+1337.0]-PEPTIDEK',
    ],
})

# User-supplied mapping from a Sage tag to a modification name of our choosing.
custom_translation_df = pd.DataFrame({
    'modification': ['[+42.010567]'],
    'matched_mod_name': ['ThisModDoesNotExist@K'],
})

sage_translator = SageModificationTranslator(custom_translation_df=custom_translation_df)
result_df = sage_translator.translate(df)

# Expected content per output column; `equals` also compares the index, so the
# result must consist of exactly these three rows labelled 0..2.
expected_by_column = {
    'mod_sites': [
        '0;5',
        '0;5',
        '0;1;2',
    ],
    'mods': [
        'GG@Any_N-term;GG@K',
        'GG@Any_N-term;GG@K',
        'GG@Any_N-term;Oxidation@M;ThisModDoesNotExist@K',
    ],
}
for column_name, expected_values in expected_by_column.items():
    assert result_df[column_name].equals(pd.Series(expected_values))

















  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:02<00:00,  2.56s/it]

100%|██████████| 1/1 [00:02<00:00,  2.56s/it]




In [9]:
#| hide
from io import StringIO

In [10]:
#| hide
# In-memory, tab-separated Sage result fixture with ten PSM rows. Two rows carry
# modified peptides (C[+57.021465] and M[+15.9949]); the last two rows have
# peptide_q = 0.18581642 while all other rows have 0.00018581642.
txt = StringIO("""filename	scannr	peptide	stripped_peptide	proteins	is_decoy	charge	rt	ion_mobility	spectrum_q	peptide_q	protein_q	sage_discriminant_score
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_10.mzML	controllerType=0 controllerNumber=1 scan=7846	VDNDENEHQLSLR	VDNDENEHQLSLR	sp|P06748|NPM_HUMAN	False	3	9.537714	0.0	0.00010579771	0.00018581642	0.00033346	1.0614725
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_13.mzML	controllerType=0 controllerNumber=1 scan=10841	VDDYSQEWAAQTEK	VDDYSQEWAAQTEK	sp|O95602|RPA1_HUMAN	False	2	12.398749	0.0	0.00010579771	0.00018581642	0.00033346	1.0588802
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_11.mzML	controllerType=0 controllerNumber=1 scan=1864	ITTGSSSAGTQSSTSNR	ITTGSSSAGTQSSTSNR	sp|O14974|MYPT1_HUMAN	False	2	3.5604227	0.0	0.00010579771	0.00018581642	0.00033346	1.0558788
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_13.mzML	controllerType=0 controllerNumber=1 scan=7932	DC[+57.021465]EDPEYKPLQGPPK	DCEDPEYKPLQGPPK	sp|Q9HCK8|CHD8_HUMAN	False	3	9.552011	0.0	0.00010579771	0.00018581642	0.00033346	1.0542139
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_10.mzML	controllerType=0 controllerNumber=1 scan=14771	ELGPLPDDDDMASPK	ELGPLPDDDDMASPK	sp|Q86U86|PB1_HUMAN	False	2	16.987766	0.0	0.00010579771	0.00018581642	0.00033346	1.0516068
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_12.mzML	controllerType=0 controllerNumber=1 scan=4250	YSGSEGSTQTLTK	YSGSEGSTQTLTK	sp|P25815|S100P_HUMAN	False	2	5.6583586	0.0	0.00010579771	0.00018581642	0.00033346	1.05135
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_10.mzML	controllerType=0 controllerNumber=1 scan=9584	VDNDENEHQLSLR	VDNDENEHQLSLR	sp|P06748|NPM_HUMAN	False	3	11.282358	0.0	0.00010579771	0.00018581642	0.00033346	1.0509663
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_13.mzML	controllerType=0 controllerNumber=1 scan=8375	VM[+15.9949]QENSSSFSDLSER	VMQENSSSFSDLSER	sp|Q86TC9|MYPN_HUMAN	False	2	9.9729395	0.0	0.00010579771	0.00018581642	0.00033346	1.0443583
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_13.mzML	controllerType=0 controllerNumber=1 scan=14001	EELDVVEESHYIQQR	EELDVVEESHYIQQR	sp|Q2NKX8|ERC6L_HUMAN	False	3	15.6105	0.0	0.00010579771	0.18581642	0.00033346	1.0401766
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_10.mzML	controllerType=0 controllerNumber=1 scan=7964	DGSASEVPSELSERPK	DGSASEVPSELSERPK	sp|A0A096LP01|SIM26_HUMAN	False	3	9.63896	0.0	0.00010579771	0.18581642	0.00033346	1.0398533""")

# Fetch the reader registered under 'sage_tsv' and parse the in-memory fixture.
sage_tsv_reader = psm_reader_provider.get_reader('sage_tsv')
psm_df = sage_tsv_reader.import_file(txt)
psm_df

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:02<00:00,  2.57s/it]

100%|██████████| 1/1 [00:02<00:00,  2.57s/it]




Unnamed: 0,sequence,charge,rt,raw_name,score,proteins,fdr,decoy,mod_sites,mods,spec_idx,nAA,rt_norm,precursor_mz
0,VDNDENEHQLSLR,3,0.158962,20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_10.mzML,1.061473,sp|P06748|NPM_HUMAN,0.000106,False,,,7845,13,0.561446,523.581497
1,YSGSEGSTQTLTK,2,0.094306,20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_12.mzML,1.05135,sp|P25815|S100P_HUMAN,0.000106,False,,,4249,13,1.0,679.825346
2,VDNDENEHQLSLR,3,0.188039,20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_10.mzML,1.050966,sp|P06748|NPM_HUMAN,0.000106,False,,,9583,13,0.664146,523.581497
3,VDDYSQEWAAQTEK,2,0.206646,20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_13.mzML,1.05888,sp|O95602|RPA1_HUMAN,0.000106,False,,,10840,14,1.0,835.370649
4,DCEDPEYKPLQGPPK,3,0.1592,20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_13.mzML,1.054214,sp|Q9HCK8|CHD8_HUMAN,0.000106,False,2.0,Carbamidomethyl@C,7931,15,0.770401,591.610177
5,ELGPLPDDDDMASPK,2,0.283129,20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_10.mzML,1.051607,sp|Q86U86|PB1_HUMAN,0.000106,False,,,14770,15,1.0,800.363978
6,VMQENSSSFSDLSER,2,0.166216,20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_13.mzML,1.044358,sp|Q86TC9|MYPN_HUMAN,0.000106,False,2.0,Oxidation@M,8374,15,0.80435,866.378148
7,ITTGSSSAGTQSSTSNR,2,0.05934,20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_11.mzML,1.055879,sp|O14974|MYPT1_HUMAN,0.000106,False,,,1863,17,1.0,821.387362


In [11]:
#| hide
# Every imported PSM must lie within the 1% FDR bound.
assert np.all(psm_df['fdr'] <= 0.01)

# Exactly two imported rows carry a modification, and each of them must have
# both a name and a site.
assert (psm_df['mods'] != "").sum() == 2
assert (psm_df['mod_sites'] != "").sum() == 2

for seq, mods, mod_sites in psm_df[["sequence","mods","mod_sites"]].values:
    if mods == "":
        # Unmodified peptides must not report any site either.
        assert mod_sites == ""
        continue

    mod_names = mods.split(";")
    site_labels = mod_sites.split(";")
    # zip() would silently drop unmatched trailing entries, so check the pairing first.
    assert len(mod_names) == len(site_labels)

    for mod, site in zip(mod_names, site_labels):
        # Terminal sites have no residue to compare against. "0" is the N-term;
        # "-1" is the C-term in alphabase's site convention (not exercised by
        # this fixture) and would otherwise index seq[-2].
        if site in ("0", "-1"):
            continue
        # Sites are 1-based; the mod name ends with the residue it targets.
        assert seq[int(site)-1] == mod[-1]