In [8]:
#default_exp reader.spectronaut_reader

In [9]:
#export
import pandas as pd

from alphadeep.reader.psm_reader import \
    psm_reader_provider

from alphadeep.reader.maxquant_reader import \
    MaxQuantReader

class SpectronautReader(MaxQuantReader):
    def __init__(self):
        super().__init__()
        self.mod_sep = '[]'
        self.tsv_sep = '\t'

        self.column_mapping = {
            'sequence': 'StrippedPeptide',
            'charge': 'PrecursorCharge',
            'RT': ['RT','iRT','Tr_recalibrated','RetentionTime'],
            'CCS': 'CCS',
            'mobility': ['Mobility','IonMobility'],
            'proteins': 'Protein Name',
            'uniprot_ids': 'UniProtIds',
            'genes': 'Genes',
        } 
        self.modseq_col = 'ModifiedPeptide'
    
    def _load_file(self, filename):
        df = pd.read_csv(filename, sep=self.tsv_sep)
        df.drop_duplicates([
            'ReferenceRun',self.modseq_col, 'PrecursorCharge'
        ], inplace=True)
        df.reset_index(drop=True, inplace=True)

        for rt_col in self.column_mapping['RT']:
            if rt_col not in df.columns: continue
            min_rt = df[rt_col].min()
            df[rt_col] = (
                df[rt_col] - min_rt
            )/(df[rt_col].max() - min_rt)
            break
        return df

class OpenSwathReader(SpectronautReader):
    def __init__(self):
        super().__init__()
        self.mod_sep = '()'
        self.underscore_for_ncterm = False
        self.fixed_C=False

        self.column_mapping = {
            'sequence': 'PeptideSequence',
            'charge': 'PrecursorCharge',
            'RT': ['RT','iRT','Tr_recalibrated','RetentionTime'],
            'CCS': 'CCS',
            'mobility': ['Mobility','IonMobility'],
            'proteins': 'Protein Name',
            'uniprot_ids': 'UniProtID',
            'genes': 'Genes',
        }
        self._modseq_columns = [
            'ModifiedPeptide',
            'ModifiedSequence',
            'FullUniModPeptideName',
        ]
        self.modseq_col = 'FullUniModPeptideName'

    def _find_modseq_column(self, df):
        for modseq_col in self._modseq_columns:
            if modseq_col in df.columns:
                self.modseq_col = modseq_col
                break
    
    def _load_file(self, filename):
        df = pd.read_csv(filename, sep=self.tsv_sep)
        self._find_modseq_column(df)
        df.drop_duplicates([
            self.modseq_col, 'PrecursorCharge'
        ], inplace=True)
        df.reset_index(drop=True, inplace=True)

        for rt_col in self.column_mapping['RT']:
            if rt_col not in df.columns: continue
            min_rt = df[rt_col].min()
            df[rt_col] = (
                df[rt_col] - min_rt
            )/(df[rt_col].max() - min_rt)
            break
        return df

psm_reader_provider.register_reader(
    'spectronaut', SpectronautReader
)
psm_reader_provider.register_reader(
    'openswath', OpenSwathReader
)

In [10]:
from io import StringIO
tsv = StringIO('''ReferenceRun	PrecursorCharge	Workflow	IntModifiedPeptide	CV	AllowForNormalization	ModifiedPeptide	StrippedPeptide	iRT	IonMobility	iRTSourceSpecific	BGSInferenceId	IsProteotypic	IntLabeledPeptide	LabeledPeptide	PrecursorMz	ReferenceRunQvalue	ReferenceRunMS1Response	FragmentLossType	FragmentNumber	FragmentType	FragmentCharge	FragmentMz	RelativeIntensity	ExcludeFromAssay	Database	ProteinGroups	UniProtIds	Protein Name	ProteinDescription	Organisms	OrganismId	Genes	Protein Existence	Sequence Version	FASTAName
202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_100ug_test_S4-A1_1_25843	2		_ALVAT[+80]PGK_		True	_ALVAT[Phospho (STY)]PGK_	ALVATPGK	-5.032703	0.758	-5.032703	P19338	False	_ALVAT[+80]PGK_	_ALVAT[Phospho (STY)]PGK_	418.717511324722	0	10352	noloss	3	y	1	301.187031733932	53.1991	False	sp	P19338	P19338	NUCL_HUMAN	Nucleolin	Homo sapiens		NCL	1	3	MCT_human_UP000005640_9606
202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_100ug_test_S4-A1_1_25843	2		_ALVAT[+80]PGK_		True	_ALVAT[Phospho (STY)]PGK_	ALVATPGK	-5.032703	0.758	-5.032703	P19338	False	_ALVAT[+80]PGK_	_ALVAT[Phospho (STY)]PGK_	418.717511324722	0	10352	H3PO4	4	y	1	384.224142529733	26.31595	False	sp	P19338	P19338	NUCL_HUMAN	Nucleolin	Homo sapiens		NCL	1	3	MCT_human_UP000005640_9606
202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_100ug_test_S4-A1_1_25843	2		_TLT[+80]PPLR_		True	_TLT[Phospho (STY)]PPLR_	TLTPPLR	27.71659	0.818	27.71659	Q5T200	False	_TLT[+80]PPLR_	_TLT[Phospho (STY)]PPLR_	439.230785875227	0.000138389150379226	23117	noloss	3	b	1	396.153027901512	6.3264	False	sp	Q5T200	Q5T200	ZC3HD_HUMAN	Zinc finger CCCH domain-containing protein 13	Homo sapiens		ZC3H13	1	1	MCT_human_UP000005640_9606
202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_100ug_test_S4-A1_1_25843	2		_TLT[+80]PPLR_		True	_TLT[Phospho (STY)]PPLR_	TLTPPLR	27.71659	0.818	27.71659	Q5T200	False	_TLT[+80]PPLR_	_TLT[Phospho (STY)]PPLR_	439.230785875227	0.000138389150379226	23117	noloss	3	y	1	385.255780000092	29.70625	False	sp	Q5T200	Q5T200	ZC3HD_HUMAN	Zinc finger CCCH domain-containing protein 13	Homo sapiens		ZC3H13	1	1	MCT_human_UP000005640_9606
202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_library25_S4-C1_1_25867	2		_LFVT[+80]PPEGSSR_		True	_[Acetyl (Protein N-term)]LFVS[Phospho (STY)]PPEGSSR_	LFVSPPEGSSR	38.05031	0.917	38.05031	Q14244;Q14244-6;Q14244-7	False	_LFVT[+80]PPEGSSR_	_LFVT[Phospho (STY)]PPEGSSR_	635.297385373987	0	14164	H3PO4	4	b	1	443.265279065723	12.24525	False	sp	Q14244;Q14244-6;Q14244-7	Q14244;Q14244-6;Q14244-7	MAP7_HUMAN	Ensconsin;Isoform of Q14244, Isoform 6 of Ensconsin;Isoform of Q14244, Isoform 7 of Ensconsin	Homo sapiens		MAP7	1;;	1;;	MCT_human_UP000005640_9606;MCT_human2_UP000005640_9606_additional;MCT_human2_UP000005640_9606_additional
202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_library25_S4-C1_1_25867	2		_LFVT[+80]PPEGSSR_		True	_[Acetyl (Protein N-term)]LFVS[Phospho (STY)]PPEGSSR_	LFVSPPEGSSR	38.05031	0.917	38.05031	Q14244;Q14244-6;Q14244-7	False	_LFVT[+80]PPEGSSR_	_LFVT[Phospho (STY)]PPEGSSR_	635.297385373987	0	14164	noloss	6	y	1	632.299829640042	46.07855	False	sp	Q14244;Q14244-6;Q14244-7	Q14244;Q14244-6;Q14244-7	MAP7_HUMAN	Ensconsin;Isoform of Q14244, Isoform 6 of Ensconsin;Isoform of Q14244, Isoform 7 of Ensconsin	Homo sapiens		MAP7	1;;	1;;	MCT_human_UP000005640_9606;MCT_human2_UP000005640_9606_additional;MCT_human2_UP000005640_9606_additional
202106018_TIMS03_EVO03_PaSk_SA_HeLa_EGF_Phospho_library25_S4-C1_1_25867	2		_LFVT[+80]PPEGSSR_		True	_[Acetyl (Protein N-term)]LFVS[Phospho (STY)]PPEGSSR_	LFVSPPEGSSR	38.05031	0.917	38.05031	Q14244;Q14244-6;Q14244-7	False	_LFVT[+80]PPEGSSR_	_LFVT[Phospho (STY)]PPEGSSR_	635.297385373987	0	14164	noloss	7	y	1	729.352593488892	100	False	sp	Q14244;Q14244-6;Q14244-7	Q14244;Q14244-6;Q14244-7	MAP7_HUMAN	Ensconsin;Isoform of Q14244, Isoform 6 of Ensconsin;Isoform of Q14244, Isoform 7 of Ensconsin	Homo sapiens		MAP7	1;;	1;;	MCT_human_UP000005640_9606;MCT_human2_UP000005640_9606_additional;MCT_human2_UP000005640_9606_additional
''')


reader = psm_reader_provider.get_reader('spectronaut')
reader.load(tsv)
reader.psm_df

Unnamed: 0,sequence,charge,RT,CCS,mobility,proteins,uniprot_ids,genes,nAA,mods,mod_sites
0,ALVATPGK,2,0.0,,0.758,NUCL_HUMAN,P19338,NCL,8,Phospho@T,5
1,TLTPPLR,2,0.760144,,0.818,ZC3HD_HUMAN,Q5T200,ZC3H13,7,Phospho@T,3
2,LFVSPPEGSSR,2,1.0,,0.917,MAP7_HUMAN,Q14244;Q14244-6;Q14244-7,MAP7,11,Acetyl@Protein N-term;Phospho@S,0;4


In [11]:
#hide
assert len(reader.psm_df) == 3
assert reader.psm_df.mods.values[0] == 'Phospho@T'
assert reader.psm_df.mod_sites.values[0] == '5'
assert reader.psm_df.mods.values[1] == 'Phospho@T'
assert reader.psm_df.mod_sites.values[1] == '3'
assert reader.psm_df.mods.values[2] == 'Acetyl@Protein N-term;Phospho@S'
assert reader.psm_df.mod_sites.values[2] == '0;4'

In [12]:
from io import StringIO
tsv = StringIO('''PrecursorMz	ProductMz	Tr_recalibrated	transition_name	CE	LibraryIntensity	transition_group_id	decoy	PeptideSequence	ProteinName	Annotation	FullUniModPeptideName	PrecursorCharge	GroupLabel	UniprotID	FragmentType	FragmentCharge	FragmentSeriesNumber
685.732240417	886.020494795	59.0	255_AAAAAAAAAASGAAIPPLIPPRR_3	-1	5257.9	13_AAAAAAAAAASGAAIPPLIPPRR_3	0	AAAAAAAAAASGAAIPPLIPPRR	1/O14654	y19^2/0.002	AAAAAAAAAASGAAIPPLIPPRR	3	light	1/O14654	y	2	19
514.550999438	473.303261576	59.2	268_AAAAAAAAAASGAAIPPLIPPRR_4	-1	10000.0	14_AAAAAAAAAASGAAIPPLIPPRR_4	0	AAAAAAAAAASGAAIPPLIPPRR	1/O14654	y8^2/0.002	AAAAAAAAAASGAAIPPLIPPRR	4	light	1/O14654	y	2	8
514.550999438	629.39313922	59.2	276_AAAAAAAAAASGAAIPPLIPPRR_4	-1	5923.1	14_AAAAAAAAAASGAAIPPLIPPRR_4	0	AAAAAAAAAASGAAIPPLIPPRR	1/O14654	y12^2/0.001	AAAAAAAAAASGAAIPPLIPPRR	4	light	1/O14654	y	2	12
514.550999438	672.909153425	59.2	279_AAAAAAAAAASGAAIPPLIPPRR_4	-1	5249.8	14_AAAAAAAAAASGAAIPPLIPPRR_4	0	AAAAAAAAAASGAAIPPLIPPRR	1/O14654	y13^2/0.001	AAAAAAAAAASGAAIPPLIPPRR	4	light	1/O14654	y	2	13
514.550999438	356.19284545	59.2	262_AAAAAAAAAASGAAIPPLIPPRR_4	-1	5233.6	14_AAAAAAAAAASGAAIPPLIPPRR_4	0	AAAAAAAAAASGAAIPPLIPPRR	1/O14654	b5/0.001,b10^2/0.001,m6:10/0.001	AAAAAAAAAASGAAIPPLIPPRR	4	light	1/O14654	b	1	5
514.550999438	498.26707303	59.2	269_AAAAAAAAAASGAAIPPLIPPRR_4	-1	4976.0	14_AAAAAAAAAASGAAIPPLIPPRR_4	0	AAAAAAAAAASGAAIPPLIPPRR	1/O14654	b7/0.001,m4:10/0.001	AAAAAAAAAASGAAIPPLIPPRR	4	light	1/O14654	b	1	7
514.550999438	427.22995924	59.2	265_AAAAAAAAAASGAAIPPLIPPRR_4	-1	4859.4	14_AAAAAAAAAASGAAIPPLIPPRR_4	0	AAAAAAAAAASGAAIPPLIPPRR	1/O14654	b6/0.002,m5:10/0.002	AAAAAAAAAASGAAIPPLIPPRR	4	light	1/O14654	b	1	6
728.201724416	356.19284545	101.8	292_AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR_5	-1	10000.0	15_AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR_5	0	AAAAAAAAAASGAAIPPLIPPRRVITLYQCFSVSQR	1/O14654	b5/0.003,b10^2/0.003,m6:10/0.003	AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR	5	light	1/O14654	b	1	5
728.201724416	576.310000482	101.8	297_AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR_5	-1	7611.0	15_AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR_5	0	AAAAAAAAAASGAAIPPLIPPRRVITLYQCFSVSQR	1/O14654	y5/0.002	AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR	5	light	1/O14654	y	1	5
728.201724416	427.22995924	101.8	293_AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR_5	-1	6805.1	15_AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR_5	0	AAAAAAAAAASGAAIPPLIPPRRVITLYQCFSVSQR	1/O14654	b6/-0.002,m5:10/-0.002	AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR	5	light	1/O14654	b	1	6
728.201724416	569.30418682	101.8	296_AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR_5	-1	6312.7	15_AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR_5	0	AAAAAAAAAASGAAIPPLIPPRRVITLYQCFSVSQR	1/O14654	b8/0.009,m3:10/0.009	AAAAAAAAAASGAAIPPLIPPRRVITLYQC(UniMod:4)FSVSQR	5	light	1/O14654	b	1	8
''')


reader = psm_reader_provider.get_reader('openswath')
reader.load(tsv)
reader.psm_df

Unnamed: 0,sequence,charge,RT,CCS,mobility,proteins,uniprot_ids,genes,nAA,mods,mod_sites
0,AAAAAAAAAASGAAIPPLIPPRR,3,0.0,,,,,,23,,
1,AAAAAAAAAASGAAIPPLIPPRR,4,0.004673,,,,,,23,,
2,AAAAAAAAAASGAAIPPLIPPRRVITLYQCFSVSQR,5,1.0,,,,,,36,Carbamidomethyl@C,30.0
