In [None]:
#---#| default_exp psm_reader.sage_reader

In [None]:
#| hide
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np

In [None]:
from alphabase.psm_reader import psm_reader_provider
from alphabase.psm_reader.sage_reader import (
    SageModificationTranslation,
    _sage_spec_idx_from_scan_nr,
    _match_modified_sequence,
    _get_annotated_mod_df,
    _lookup_modification,
    register_readers,
)
# NOTE(review): presumably registers the Sage reader types with `psm_reader_provider`
# so that `get_reader('sage_tsv')` further down resolves — confirm in sage_reader.
register_readers()

In [None]:
#| hide
# A Sage scan identifier ending in `scan=7846` is expected to yield spectrum index 7845,
# i.e. one less than the scan number in this example.
sage_scan_id = 'controllerType=0 controllerNumber=1 scan=7846'
assert _sage_spec_idx_from_scan_nr(sage_scan_id) == 7845

In [None]:
# Each case pairs a modified sequence with the list of signature tuples that
# `_match_modified_sequence` is expected to return for it.
# NOTE(review): the tuple layout appears to be
# (modification tag, preceding residue, is_nterm, is_cterm, mass shift) — confirm
# against `_match_modified_sequence`.
signature_cases = [
    ('[-100.0]-PEPTIDE', [('[-100.0]', '', True, False, -100)]),
    ('PEPTIDE-[-100.0]', [('[-100.0]', '', False, True, -100)]),
    ('PEPTIDE[-100.0]', [('[-100.0]', 'E', False, False, -100)]),
    ('P[-100.0]EPTIDE', [('[-100.0]', 'P', False, False, -100)]),
    ('PEPT[-100.0]IDE', [('[-100.0]', 'T', False, False, -100)]),
    (
        'PE[-100.0]PTIDE[-100.0]P',
        [('[-100.0]', 'E', False, False, -100), ('[-100.0]', 'E', False, False, -100)],
    ),
]
sequences, signatures = zip(*signature_cases)

test_df = pd.DataFrame({
    'modified_sequence': list(sequences),
    'expected_signature': list(signatures),
})

# Run the parser on every sequence and keep the result next to the expectation.
test_df['observed_signature'] = test_df['modified_sequence'].apply(_match_modified_sequence)

assert test_df['observed_signature'].equals(test_df['expected_signature'])

In [None]:
# Expected column names of the annotated modification table, in order.
expected_mod_columns = ['mass','previous_aa','is_nterm','is_cterm','unimod_id','localizer_rank']

mod_annotated_df = _get_annotated_mod_df()

# Compare as plain lists: an element-wise `Index == list` comparison raises a
# ValueError when the number of columns differs, which hides the actual mismatch
# behind an unrelated error instead of a readable assertion failure.
assert list(mod_annotated_df.columns) == expected_mod_columns, (
    f"unexpected columns: {list(mod_annotated_df.columns)}"
)

In [None]:
# A mass shift of 15.99490 following residue 'M' is expected to resolve to 'Oxidation@M'.
looked_up_mod = _lookup_modification(15.99490, 'M', mod_annotated_df)
assert looked_up_mod == 'Oxidation@M'

In [None]:
# User-supplied override: map the '[+42.010567]' tag to a modification name of our choosing.
custom_translation_df = pd.DataFrame({
    'modification': ['[+42.010567]'],
    'matched_mod_name': ['ThisModDoesNotExist@K']
})

# Input sequences. The fourth entry ('[+1337.0]-PEPTIDEK') has no counterpart in the
# expected series below, so the assertions only pass if the translation returns three rows.
df = pd.DataFrame({
    'modified_sequence': [
        '[+114.04293]-MAGTK[+114.04293]',
        '[+114.04293]-MAGTK[+114.04293]',
        '[+114.04293]-M[+15.9949]K[+42.010567]LLAR',
        '[+1337.0]-PEPTIDEK'
    ]
})

sage_translation = SageModificationTranslation(custom_translation_df=custom_translation_df)
result_df = sage_translation(df)

# Expected outputs, one entry per translated row.
expected_mod_sites = pd.Series(['0;5', '0;5', '0;1;2'])
expected_mods = pd.Series([
    'GG@Protein_N-term;GG@K',
    'GG@Protein_N-term;GG@K',
    'GG@Protein_N-term;Oxidation@M;ThisModDoesNotExist@K'
])

assert result_df['mod_sites'].equals(expected_mod_sites)

assert result_df['mods'].equals(expected_mods)

In [None]:
#| hide
from io import StringIO

In [None]:
#| hide
# In-memory, tab-separated fixture whose header row names the columns
# (filename, scannr, peptide, ..., sage_discriminant_score), followed by ten PSM rows.
# Two rows carry a bracketed mass shift in the `peptide` column (C[+57.021465] and
# M[+15.9949]); the last two rows have peptide_q = 0.18581642, the others 0.00018581642.
# NOTE(review): the assertions in the next cell expect every returned `fdr` to be <= 0.01 —
# presumably the reader filters on one of the q-value columns; confirm in sage_reader.
txt = StringIO("""filename	scannr	peptide	stripped_peptide	proteins	is_decoy	charge	rt	ion_mobility	spectrum_q	peptide_q	protein_q	sage_discriminant_score
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_10.mzML	controllerType=0 controllerNumber=1 scan=7846	VDNDENEHQLSLR	VDNDENEHQLSLR	sp|P06748|NPM_HUMAN	False	3	9.537714	0.0	0.00010579771	0.00018581642	0.00033346	1.0614725
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_13.mzML	controllerType=0 controllerNumber=1 scan=10841	VDDYSQEWAAQTEK	VDDYSQEWAAQTEK	sp|O95602|RPA1_HUMAN	False	2	12.398749	0.0	0.00010579771	0.00018581642	0.00033346	1.0588802
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_11.mzML	controllerType=0 controllerNumber=1 scan=1864	ITTGSSSAGTQSSTSNR	ITTGSSSAGTQSSTSNR	sp|O14974|MYPT1_HUMAN	False	2	3.5604227	0.0	0.00010579771	0.00018581642	0.00033346	1.0558788
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_13.mzML	controllerType=0 controllerNumber=1 scan=7932	DC[+57.021465]EDPEYKPLQGPPK	DCEDPEYKPLQGPPK	sp|Q9HCK8|CHD8_HUMAN	False	3	9.552011	0.0	0.00010579771	0.00018581642	0.00033346	1.0542139
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_10.mzML	controllerType=0 controllerNumber=1 scan=14771	ELGPLPDDDDMASPK	ELGPLPDDDDMASPK	sp|Q86U86|PB1_HUMAN	False	2	16.987766	0.0	0.00010579771	0.00018581642	0.00033346	1.0516068
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_12.mzML	controllerType=0 controllerNumber=1 scan=4250	YSGSEGSTQTLTK	YSGSEGSTQTLTK	sp|P25815|S100P_HUMAN	False	2	5.6583586	0.0	0.00010579771	0.00018581642	0.00033346	1.05135
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_10.mzML	controllerType=0 controllerNumber=1 scan=9584	VDNDENEHQLSLR	VDNDENEHQLSLR	sp|P06748|NPM_HUMAN	False	3	11.282358	0.0	0.00010579771	0.00018581642	0.00033346	1.0509663
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_13.mzML	controllerType=0 controllerNumber=1 scan=8375	VM[+15.9949]QENSSSFSDLSER	VMQENSSSFSDLSER	sp|Q86TC9|MYPN_HUMAN	False	2	9.9729395	0.0	0.00010579771	0.00018581642	0.00033346	1.0443583
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_13.mzML	controllerType=0 controllerNumber=1 scan=14001	EELDVVEESHYIQQR	EELDVVEESHYIQQR	sp|Q2NKX8|ERC6L_HUMAN	False	3	15.6105	0.0	0.00010579771	0.18581642	0.00033346	1.0401766
20160107_QE5_UPLC1_AKP_Hep2_R1_Pro46F_10.mzML	controllerType=0 controllerNumber=1 scan=7964	DGSASEVPSELSERPK	DGSASEVPSELSERPK	sp|A0A096LP01|SIM26_HUMAN	False	3	9.63896	0.0	0.00010579771	0.18581642	0.00033346	1.0398533""")

# Parse the fixture with the reader registered under the 'sage_tsv' key.
psm_df = psm_reader_provider.get_reader('sage_tsv').import_file(txt)
# Bare trailing expression: shows the imported frame as the cell output.
psm_df

In [None]:
#| hide
# Every PSM returned by the reader must have an FDR of at most 1%.
assert np.all(psm_df['fdr'] <= 0.01)
# Exactly two of the returned PSMs are expected to carry modifications.
assert (psm_df['mods'] != "").sum() == 2
assert (psm_df['mod_sites'] != "").sum() == 2

# Terminal modifications use pseudo-sites instead of a residue position
# ("0" = N-term, "-1" = C-term in alphabase), so they have no residue to compare against.
# Without the "-1" entry a C-terminal site would be checked against seq[-2].
TERMINAL_SITES = {"0", "-1"}

for seq, mods, mod_sites in psm_df[["sequence", "mods", "mod_sites"]].values:
    if mods == "":
        # An unmodified peptide must not report any modification site.
        assert mod_sites == "", f"{seq}: sites {mod_sites!r} reported without mods"
        continue

    # Use fresh names rather than rebinding the loop variables from str to list.
    mod_names = mods.split(";")
    site_tokens = mod_sites.split(";")
    # zip() would silently drop the surplus entries, so check the pairing explicitly.
    assert len(mod_names) == len(site_tokens), (
        f"{seq}: {len(mod_names)} mods but {len(site_tokens)} sites"
    )

    for mod_name, site in zip(mod_names, site_tokens):
        if site in TERMINAL_SITES:
            continue
        # Sites are 1-based; the last character of 'Name@X' is the target residue X.
        assert seq[int(site) - 1] == mod_name[-1], (
            f"{seq}: {mod_name} placed on residue {seq[int(site) - 1]!r} (site {site})"
        )