In [1]:
#default_exp reader.spectronaut_reader

In [2]:
#export
import pandas as pd
import numpy as np

from alphadeep.reader.psm_reader import \
    psm_reader_provider

from alphadeep.reader.maxquant_reader import parse_mq, \
    MaxQuantReader

class SpectronautReader(MaxQuantReader):
    """Reader for Spectronaut tab-separated library exports.

    Builds a precursor-level PSM dataframe from the export and stores it
    in `self._psm_df`. Modified sequences are parsed with `parse_mq`,
    which receives `self.mod_sep` ('[]') as its `mod_sep` argument.
    """
    def __init__(self, fragment_reader=None):
        """
        Parameters
        ----------
        fragment_reader : optional
            Accepted for signature compatibility but not used here;
            the parent class is always initialised with `None`.
        """
        super().__init__(None)
        self.mod_sep = '[]'

    def _load_file(self, filename):
        """Parse a Spectronaut TSV and store the result in `self._psm_df`.

        Parameters
        ----------
        filename : str or file-like
            Anything `pandas.read_csv` accepts; read with tab separators.

        Notes
        -----
        Required columns: 'ReferenceRun', 'ModifiedPeptide',
        'PrecursorCharge', 'StrippedPeptide', 'iRT' and 'Protein Name'.
        Optional columns: 'K0' / 'IonMobility', 'CCS', 'UniProtIds', 'Genes'.

        'RT' is the iRT column min-max scaled to [0, 1]. If all iRT values
        are identical the range is zero; in that case RT is set to 0.0
        instead of the NaN that a 0/0 division would produce.
        """
        df = pd.read_csv(filename, sep='\t')
        # Keep a single row per (run, modified peptide, charge); the export
        # repeats precursor information on several rows.
        df = df.drop_duplicates(
            subset=['ReferenceRun', 'ModifiedPeptide', 'PrecursorCharge']
        ).reset_index(drop=True)

        psm_df = pd.DataFrame()
        psm_df['sequence'] = df['StrippedPeptide']
        psm_df['nAA'] = df['StrippedPeptide'].str.len()
        # parse_mq yields a (mods, mod_sites) pair per modified sequence
        psm_df['mods'], psm_df['mod_sites'] = zip(
            *df['ModifiedPeptide'].apply(
                parse_mq, mod_sep=self.mod_sep
            )
        )
        psm_df['charge'] = df['PrecursorCharge']

        irt = df['iRT']
        min_rt = irt.min()
        rt_range = irt.max() - min_rt
        if rt_range > 0:
            psm_df['RT'] = (irt - min_rt) / rt_range
        else:
            # Zero range (all iRT equal) would divide 0 by 0 and turn every
            # RT into NaN. The shifted values are already 0.0 for valid iRT
            # and stay NaN where iRT itself is missing.
            psm_df['RT'] = (irt - min_rt).astype(float)

        if 'K0' in df.columns:
            psm_df['mobility'] = 1/df['K0']
        elif 'IonMobility' in df.columns:
            psm_df['mobility'] = df['IonMobility']
        else:
            psm_df['mobility'] = pd.NA

        if 'CCS' in df.columns:
            psm_df['CCS'] = df['CCS']
        else:
            psm_df['CCS'] = pd.NA

        psm_df['proteins'] = df['Protein Name']
        # 'uniprot_id' is only present in the output when the export has it,
        # whereas 'genes' always exists (empty string when missing).
        if 'UniProtIds' in df.columns:
            psm_df['uniprot_id'] = df['UniProtIds']
        if 'Genes' in df.columns:
            psm_df['genes'] = df['Genes']
        else:
            psm_df['genes'] = ''
        self._psm_df = psm_df

# Expose the reader through the provider under the 'spectronaut' key.
psm_reader_provider.register_reader('spectronaut', SpectronautReader)

In [3]:
from io import StringIO
tsv = StringIO('''ReferenceRun	PrecursorCharge	Workflow	IntModifiedPeptide	CV	AllowForNormalization	ModifiedPeptide	StrippedPeptide	iRT	IonMobility	iRTSourceSpecific	BGSInferenceId	IsProteotypic	IntLabeledPeptide	LabeledPeptide	PrecursorMz	ReferenceRunQvalue	ReferenceRunMS1Response	FragmentLossType	FragmentNumber	FragmentType	FragmentCharge	FragmentMz	RelativeIntensity	ExcludeFromAssay	Database	ProteinGroups	UniProtIds	Protein Name	ProteinDescription	Organisms	OrganismId	Genes	Protein Existence	Sequence Version	FASTAName
202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_100ug_test_S4-A1_1_25843	2		_ALVAT[+80]PGK_		True	_ALVAT[Phospho (STY)]PGK_	ALVATPGK	-5.032703	0.758	-5.032703	P19338	False	_ALVAT[+80]PGK_	_ALVAT[Phospho (STY)]PGK_	418.717511324722	0	10352	noloss	3	y	1	301.187031733932	53.1991	False	sp	P19338	P19338	NUCL_HUMAN	Nucleolin	Homo sapiens		NCL	1	3	MCT_human_UP000005640_9606
202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_100ug_test_S4-A1_1_25843	2		_ALVAT[+80]PGK_		True	_ALVAT[Phospho (STY)]PGK_	ALVATPGK	-5.032703	0.758	-5.032703	P19338	False	_ALVAT[+80]PGK_	_ALVAT[Phospho (STY)]PGK_	418.717511324722	0	10352	H3PO4	4	y	1	384.224142529733	26.31595	False	sp	P19338	P19338	NUCL_HUMAN	Nucleolin	Homo sapiens		NCL	1	3	MCT_human_UP000005640_9606
202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_100ug_test_S4-A1_1_25843	2		_TLT[+80]PPLR_		True	_TLT[Phospho (STY)]PPLR_	TLTPPLR	27.71659	0.818	27.71659	Q5T200	False	_TLT[+80]PPLR_	_TLT[Phospho (STY)]PPLR_	439.230785875227	0.000138389150379226	23117	noloss	3	b	1	396.153027901512	6.3264	False	sp	Q5T200	Q5T200	ZC3HD_HUMAN	Zinc finger CCCH domain-containing protein 13	Homo sapiens		ZC3H13	1	1	MCT_human_UP000005640_9606
202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_100ug_test_S4-A1_1_25843	2		_TLT[+80]PPLR_		True	_TLT[Phospho (STY)]PPLR_	TLTPPLR	27.71659	0.818	27.71659	Q5T200	False	_TLT[+80]PPLR_	_TLT[Phospho (STY)]PPLR_	439.230785875227	0.000138389150379226	23117	noloss	3	y	1	385.255780000092	29.70625	False	sp	Q5T200	Q5T200	ZC3HD_HUMAN	Zinc finger CCCH domain-containing protein 13	Homo sapiens		ZC3H13	1	1	MCT_human_UP000005640_9606
202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_library25_S4-C1_1_25867	2		_LFVT[+80]PPEGSSR_		True	_[Acetyl (Protein N-term)]LFVS[Phospho (STY)]PPEGSSR_	LFVSPPEGSSR	38.05031	0.917	38.05031	Q14244;Q14244-6;Q14244-7	False	_LFVT[+80]PPEGSSR_	_LFVT[Phospho (STY)]PPEGSSR_	635.297385373987	0	14164	H3PO4	4	b	1	443.265279065723	12.24525	False	sp	Q14244;Q14244-6;Q14244-7	Q14244;Q14244-6;Q14244-7	MAP7_HUMAN	Ensconsin;Isoform of Q14244, Isoform 6 of Ensconsin;Isoform of Q14244, Isoform 7 of Ensconsin	Homo sapiens		MAP7	1;;	1;;	MCT_human_UP000005640_9606;MCT_human2_UP000005640_9606_additional;MCT_human2_UP000005640_9606_additional
202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_library25_S4-C1_1_25867	2		_LFVT[+80]PPEGSSR_		True	_[Acetyl (Protein N-term)]LFVS[Phospho (STY)]PPEGSSR_	LFVSPPEGSSR	38.05031	0.917	38.05031	Q14244;Q14244-6;Q14244-7	False	_LFVT[+80]PPEGSSR_	_LFVT[Phospho (STY)]PPEGSSR_	635.297385373987	0	14164	noloss	6	y	1	632.299829640042	46.07855	False	sp	Q14244;Q14244-6;Q14244-7	Q14244;Q14244-6;Q14244-7	MAP7_HUMAN	Ensconsin;Isoform of Q14244, Isoform 6 of Ensconsin;Isoform of Q14244, Isoform 7 of Ensconsin	Homo sapiens		MAP7	1;;	1;;	MCT_human_UP000005640_9606;MCT_human2_UP000005640_9606_additional;MCT_human2_UP000005640_9606_additional
202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_library25_S4-C1_1_25867	2		_LFVT[+80]PPEGSSR_		True	_[Acetyl (Protein N-term)]LFVS[Phospho (STY)]PPEGSSR_	LFVSPPEGSSR	38.05031	0.917	38.05031	Q14244;Q14244-6;Q14244-7	False	_LFVT[+80]PPEGSSR_	_LFVT[Phospho (STY)]PPEGSSR_	635.297385373987	0	14164	noloss	7	y	1	729.352593488892	100	False	sp	Q14244;Q14244-6;Q14244-7	Q14244;Q14244-6;Q14244-7	MAP7_HUMAN	Ensconsin;Isoform of Q14244, Isoform 6 of Ensconsin;Isoform of Q14244, Isoform 7 of Ensconsin	Homo sapiens		MAP7	1;;	1;;	MCT_human_UP000005640_9606;MCT_human2_UP000005640_9606_additional;MCT_human2_UP000005640_9606_additional
''')


# Look up the reader registered under 'spectronaut' and parse the in-memory
# TSV fixture; `reader` is reused by the test cell further down.
reader = psm_reader_provider.get_reader('spectronaut')
reader.load(tsv)
# Last expression of the cell: display the parsed PSM table.
reader.psm_df

Unnamed: 0,sequence,nAA,mods,mod_sites,charge,RT,mobility,CCS,proteins,uniprot_id,genes
0,ALVATPGK,8,Phospho@T,5,2,0.0,0.758,,NUCL_HUMAN,P19338,NCL
1,TLTPPLR,7,Phospho@T,3,2,0.760144,0.818,,ZC3HD_HUMAN,Q5T200,ZC3H13
2,LFVSPPEGSSR,11,Acetyl@Protein N-term;Phospho@S,0;4,2,1.0,0.917,,MAP7_HUMAN,Q14244;Q14244-6;Q14244-7,MAP7


In [4]:
#hide
def test_reader():
    """Check the parsed fixture: three precursors with the expected modifications."""
    psm_df = reader.psm_df
    assert len(psm_df) == 3
    # (mods, mod_sites) expected for each row, in row order
    expected = [
        ('Phospho@T', '5'),
        ('Phospho@T', '3'),
        ('Acetyl@Protein N-term;Phospho@S', '0;4'),
    ]
    for row, (mods, mod_sites) in enumerate(expected):
        assert psm_df.mods.values[row] == mods
        assert psm_df.mod_sites.values[row] == mod_sites
test_reader()