In [None]:
#default_exp psm_frag_reader.library_frag_reader

In [None]:
#export
import pandas as pd
import numpy as np

from alphabase.peptide.fragment import (
    init_fragment_by_precursor_dataframe
)
from alphabase.io.psm_reader.dia_search_reader import (
    SpectronautReader
)

from peptdeep.psm_frag_reader.psm_frag_reader import (
    PSMReader_w_FragBase,
    psm_w_frag_reader_provider
)

class SpectronautMSMSReader(SpectronautReader, PSMReader_w_FragBase):
    """Spectronaut library reader that also extracts fragment intensities.

    The long-format library table (one row per fragment) is collapsed into
    one row per precursor, and the fragment intensities of all retained
    precursors are stacked into a single flat table exposed through
    `fragment_intensity_df`. Each precursor row points into that table via
    `frag_start_idx` / `frag_end_idx`.

    Attributes set by this class:
        seq_col: name of the stripped-sequence column found in the library
            table, or None if no mapped column is present.
        rt_col: name of the retention-time column found in the library
            table, or None if no mapped column is present.
        _fragment_intensity_df: flat intensity table, one column per entry
            of `self.charged_frag_types`.

    NOTE(review): `charged_frag_types`, `column_mapping`, `csv_sep`,
    `mod_seq_column`, `_psm_df`, `_find_mod_seq_column` and
    `normalize_rt_by_raw_name` are not defined here; they are presumably
    provided by the base classes -- confirm there.
    """
    def __init__(self,
        frag_types=['b','y','b_modloss','y_modloss'], 
        max_frag_charge=2,
        **kwargs
    ):
        """
        Parameters
        ----------
        frag_types : list of str
            Fragment type names forwarded to `PSMReader_w_FragBase.__init__`.
        max_frag_charge : int
            Maximum fragment charge forwarded to `PSMReader_w_FragBase.__init__`.
        **kwargs
            Forwarded to `PSMReader_w_FragBase.__init__` only.
        """
        PSMReader_w_FragBase.__init__(self,
            # Pass a copy so the shared default list (or the caller's list)
            # can never be mutated through this instance.
            frag_types = list(frag_types),
            max_frag_charge = max_frag_charge,
            **kwargs
        )

        # Initialised with its own defaults; **kwargs are not passed on here.
        SpectronautReader.__init__(self)
    
    @property
    def fragment_intensity_df(self):
        """Flat fragment intensity table built by `_get_fragment_intensity`."""
        return self._fragment_intensity_df

    def _get_fragment_intensity(self, lib_df):
        """Collapse a per-fragment library table into per-precursor rows.

        Rows are grouped by (modified sequence, stripped sequence,
        'PrecursorCharge'). Groups with fewer than 5 fragment rows, or with
        no positive intensity in any known fragment column, are dropped.
        For every kept group an (nAA-1) x len(charged_frag_types) float32
        matrix is filled and scaled so its maximum is 1.

        Parameters
        ----------
        lib_df : pd.DataFrame
            Library table with the columns 'PrecursorCharge', 'FragmentType',
            'FragmentNumber', 'FragmentLossType', 'FragmentCharge' and
            'RelativeIntensity', plus the mapped sequence / RT columns and
            `self.mod_seq_column`.

        Returns
        -------
        pd.DataFrame
            One row per kept precursor with the modified sequence, sequence,
            'PrecursorCharge', RT, 'frag_start_idx' and 'frag_end_idx'.
            As a side effect `self._fragment_intensity_df` is (re)built.

        Raises
        ------
        KeyError
            If no mapped sequence or RT column exists in `lib_df`.
        """
        frag_col_dict = {
            frag_name: col_idx
            for col_idx, frag_name in enumerate(self.charged_frag_types)
        }
        n_frag_cols = len(self.charged_frag_types)

        self._find_mapped_columns(lib_df)
        # Fail early with a readable message instead of a KeyError(None)
        # raised from inside groupby / column access further down.
        if self.seq_col is None or self.rt_col is None:
            raise KeyError(
                'Cannot find a mapped sequence and/or rt column '
                f'in the library columns: {list(lib_df.columns)}'
            )

        mod_seq_list = []
        seq_list = []
        charge_list = []
        rt_list = []
        frag_intens_list = []
        nAA_list = []

        for (mod_seq, seq, charge), df_group in lib_df.groupby(
            [self.mod_seq_column, self.seq_col, 'PrecursorCharge']
        ):
            # Precursors with fewer than 5 fragment rows are ignored.
            if len(df_group) < 5: continue
            nAA = len(seq)
            intens = np.zeros(
                (nAA-1, n_frag_cols), dtype=np.float32
            )
            for frag_type, frag_num, loss_type, frag_charge, inten in df_group[
                [
                    'FragmentType','FragmentNumber','FragmentLossType',
                    'FragmentCharge','RelativeIntensity'
                ]
            ].values:
                # Tuple membership (not substring search in 'abc'): a missing
                # (NaN) or multi-character fragment type is skipped cleanly.
                if frag_type in ('a','b','c'):
                    # N-terminal ion number k is stored in row k-1.
                    frag_pos = frag_num - 1
                elif frag_type in ('x','y','z'):
                    # C-terminal ion number k is stored in row nAA-k-1.
                    frag_pos = nAA - frag_num - 1
                else:
                    continue

                # Without this check a position of -1 would silently wrap to
                # the last row and larger values would raise IndexError.
                # NaN fragment numbers also fail this comparison.
                if not (0 <= frag_pos < nAA-1):
                    continue

                if loss_type == 'noloss':
                    frag_type = f'{frag_type}_z{frag_charge}'
                elif loss_type == 'H3PO4':
                    frag_type = f'{frag_type}_modloss_z{frag_charge}'
                else:
                    # Any other loss type is not represented in the matrix.
                    continue

                if frag_type not in frag_col_dict:
                    continue
                frag_col_idx = frag_col_dict[frag_type]
                # int(): the fragment number may arrive as a float.
                intens[int(frag_pos), frag_col_idx] = inten

            # An empty matrix has no maximum; treat it like "no signal".
            if intens.size == 0: continue
            max_inten = np.max(intens)
            if max_inten <= 0: continue
            intens /= max_inten

            mod_seq_list.append(mod_seq)
            seq_list.append(seq)
            charge_list.append(charge)
            # RT is taken from the first row of the group.
            rt_list.append(df_group[self.rt_col].values[0])
            frag_intens_list.append(intens)
            nAA_list.append(nAA)
        
        df = pd.DataFrame({
            self.mod_seq_column: mod_seq_list,
            self.seq_col: seq_list,
            'PrecursorCharge': charge_list,
            self.rt_col: rt_list,
        })

        # np.concatenate raises on an empty list, so build an explicit
        # zero-row table when no precursor was kept.
        if frag_intens_list:
            frag_matrix = np.concatenate(frag_intens_list)
        else:
            frag_matrix = np.zeros((0, n_frag_cols), dtype=np.float32)

        self._fragment_intensity_df = pd.DataFrame(
            frag_matrix,
            columns = self.charged_frag_types
        )

        # Each precursor owns nAA-1 consecutive rows; the cumulative sum of
        # the row counts gives the [start, end) boundaries.
        indices = np.zeros(len(nAA_list)+1, dtype=np.int64)
        indices[1:] = np.array(nAA_list, dtype=np.int64)-1
        indices = np.cumsum(indices)

        df['frag_start_idx'] = indices[:-1]
        df['frag_end_idx'] = indices[1:]

        return df

    def _find_mapped_columns(self, lib_df):
        """Set `seq_col` and `rt_col` to the first mapped column in `lib_df`.

        Candidates come from `self.column_mapping['sequence']` and
        `self.column_mapping['rt']`. Each attribute is None when none of its
        candidates is a column of `lib_df`.
        """
        def first_present(mapping_key):
            candidates = self.column_mapping[mapping_key]
            # A bare string would otherwise be iterated character by character.
            if isinstance(candidates, str):
                candidates = [candidates]
            for col in candidates:
                if col in lib_df.columns:
                    return col
            return None

        self.seq_col = first_present('sequence')
        self.rt_col = first_present('rt')

    def _load_file(self, filename):
        """Read the library table and return one row per kept precursor.

        `filename` is passed to `pd.read_csv` with `sep=self.csv_sep`. The RT
        column of the result is min-max scaled over the kept precursors; when
        all RTs are equal the scaled value is 0 instead of NaN.
        """
        df = pd.read_csv(filename, sep=self.csv_sep)
        self._find_mod_seq_column(df)

        df = self._get_fragment_intensity(df)
        
        min_rt = df[self.rt_col].min()
        rt_range = df[self.rt_col].max() - min_rt
        scaled_rt = df[self.rt_col] - min_rt
        # Only divide by a strictly positive range: a zero range (single RT
        # value) would give 0/0 = NaN, and an empty table gives a NaN range.
        if rt_range > 0:
            scaled_rt = scaled_rt / rt_range
        df[self.rt_col] = scaled_rt

        return df

    def _post_process(self, 
        lib_df
    ):  
        """Add `nAA` and the fragment index columns to `self._psm_df`.

        `lib_df` must carry 'frag_start_idx' and 'frag_end_idx'; the values
        are assigned by index alignment with `self._psm_df`. Finally
        `normalize_rt_by_raw_name()` is called.
        NOTE(review): presumably `lib_df` is the frame returned by
        `_load_file` -- confirm against the base-class import pipeline.
        """
        self._psm_df['nAA'] = self._psm_df.sequence.str.len()
        self._psm_df[
            ['frag_start_idx','frag_end_idx']
        ] = lib_df[['frag_start_idx','frag_end_idx']]

        self.normalize_rt_by_raw_name()


# Register the reader class under the 'spectronaut' key of the shared provider,
# so it can be requested by that name.
psm_w_frag_reader_provider.register_reader('spectronaut', SpectronautMSMSReader)

In [None]:
#hide
tsv_str = """PrecursorCharge	ModifiedPeptide	StrippedPeptide	iRT	LabeledPeptide	PrecursorMz	FragmentLossType	FragmentNumber	FragmentType	FragmentCharge	FragmentMz	RelativeIntensity
2	_DPLAVDK_	DPLAVDK	-15.0871	_DPLAVDK_	379.2081611	noloss	3	b	1	326.1710473	14.37029
2	_DPLAVDK_	DPLAVDK	-15.0871	_DPLAVDK_	379.2081611	noloss	3	y	1	361.2081611	37.7585
2	_DPLAVDK_	DPLAVDK	-15.0871	_DPLAVDK_	379.2081611	noloss	4	b	1	397.2081611	9.488808
2	_DPLAVDK_	DPLAVDK	-15.0871	_DPLAVDK_	379.2081611	noloss	4	y	1	432.2452749	100
2	_DPLAVDK_	DPLAVDK	-15.0871	_DPLAVDK_	379.2081611	noloss	5	b	1	496.276575	5.498003
2	_DPLAVDK_	DPLAVDK	-15.0871	_DPLAVDK_	379.2081611	noloss	5	y	1	545.3293389	74.56643
2	_DPLAVDK_	DPLAVDK	-15.0871	_DPLAVDK_	379.2081611	noloss	6	y	2	321.6946896	51.50719
2	_AVVVS[Phospho (STY)]PK_	AVVVSPK	-22.84974	_AVVVS[Phospho (STY)]PK_	390.2067795	noloss	3	y	1	411.1639269	6.911595
2	_AVVVS[Phospho (STY)]PK_	AVVVSPK	-22.84974	_AVVVS[Phospho (STY)]PK_	390.2067795	H3PO4	3	y	1	313.1870287	17.38582
2	_AVVVS[Phospho (STY)]PK_	AVVVSPK	-22.84974	_AVVVS[Phospho (STY)]PK_	390.2067795	noloss	4	y	1	510.2323409	10.65426
2	_AVVVS[Phospho (STY)]PK_	AVVVSPK	-22.84974	_AVVVS[Phospho (STY)]PK_	390.2067795	H3PO4	4	y	1	412.2554427	37.41231
2	_AVVVS[Phospho (STY)]PK_	AVVVSPK	-22.84974	_AVVVS[Phospho (STY)]PK_	390.2067795	noloss	5	y	1	609.3007548	45.03617
2	_AVVVS[Phospho (STY)]PK_	AVVVSPK	-22.84974	_AVVVS[Phospho (STY)]PK_	390.2067795	H3PO4	5	y	1	511.3238566	100
2	_MGS[Phospho (STY)]LDSK_	MGSLDSK	-27.5635	_MGS[Phospho (STY)]LDSK_	409.1617118	noloss	3	y	1	349.1717756	9.20575
2	_MGS[Phospho (STY)]LDSK_	MGSLDSK	-27.5635	_MGS[Phospho (STY)]LDSK_	409.1617118	noloss	6	y	1	686.2756622	10.37339
2	_MGS[Phospho (STY)]LDSK_	MGSLDSK	-27.5635	_MGS[Phospho (STY)]LDSK_	409.1617118	H3PO4	6	y	1	588.298764	100
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	noloss	3	y	1	347.2288965	88.27327
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	H3PO4	3	b	1	256.1291795	64.97146
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	noloss	4	y	1	494.2973105	100
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	H3PO4	4	b	1	403.1975934	35.17805
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	noloss	5	y	1	661.2956694	19.89741
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	H3PO4	5	b	1	490.2296218	40.04738
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	H3PO4	5	y	1	563.3187712	77.43164
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	noloss	6	b	1	701.290584	24.43497
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	H3PO4	6	b	1	603.3136858	63.09999
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	1(+H2+O)1(+H3+O4+P)	3	b	1	238.1186147	62.60851
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	1(+H2+O)1(+H3+O4+P)	5	b	1	472.219057	22.99903
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	1(+H2+O)1(+H3+O4+P)	6	b	1	585.303121	66.30389
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	noloss	3	y	1	329.1931797	100
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	H3PO4	3	b	1	268.165565	5.755442
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	noloss	4	b	2	267.0740493	8.743931
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	noloss	4	y	1	496.1915387	27.69686
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	H3PO4	4	b	1	435.1639239	6.162673
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	2(+H3+O4+P)	4	b	1	337.1870258	10.84257
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	H3PO4	4	y	1	398.2146405	26.28527
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	H3PO4	5	y	1	497.2830544	28.41294
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	noloss	6	y	1	762.2583115	8.490795
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	H3PO4	6	y	1	664.2814133	32.87384
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	2(+H3+O4+P)	6	y	1	566.3045151	35.87218
2	_YSLS[Phospho (STY)]PSK_	YSLSPSK	-6.428198	_YSLS[Phospho (STY)]PSK_	431.1913264	noloss	3	y	1	331.1975964	49.20179
2	_YSLS[Phospho (STY)]PSK_	YSLSPSK	-6.428198	_YSLS[Phospho (STY)]PSK_	431.1913264	noloss	4	y	1	498.1959553	10.89141
2	_YSLS[Phospho (STY)]PSK_	YSLSPSK	-6.428198	_YSLS[Phospho (STY)]PSK_	431.1913264	H3PO4	4	y	1	400.2190571	27.99594
2	_YSLS[Phospho (STY)]PSK_	YSLSPSK	-6.428198	_YSLS[Phospho (STY)]PSK_	431.1913264	noloss	5	y	1	611.2800193	14.11057
2	_YSLS[Phospho (STY)]PSK_	YSLSPSK	-6.428198	_YSLS[Phospho (STY)]PSK_	431.1913264	H3PO4	5	y	1	513.3031211	70.5295
2	_YSLS[Phospho (STY)]PSK_	YSLSPSK	-6.428198	_YSLS[Phospho (STY)]PSK_	431.1913264	noloss	6	y	1	698.3120477	60.23455
2	_YSLS[Phospho (STY)]PSK_	YSLSPSK	-6.428198	_YSLS[Phospho (STY)]PSK_	431.1913264	H3PO4	6	y	1	600.3351495	100
2	_YSLS[Phospho (STY)]PSK_	YSLSPSK	-6.428198	_YSLS[Phospho (STY)]PSK_	431.1913264	1(+H2+O)1(+H3+O4+P)	6	y	1	582.3245847	5.233977
"""

from io import StringIO

# Smoke test: import the in-memory library through the registered reader and
# make sure every column the downstream code relies on is present.
library_buffer = StringIO(tsv_str)
reader = psm_w_frag_reader_provider.get_reader('spectronaut')
psm_df = reader.import_file(library_buffer)

expected_columns = [
    'sequence', 'charge', 'rt', 'rt_norm', 'mods',
    'mod_sites', 'nAA', 'frag_start_idx', 'frag_end_idx',
]
for col in expected_columns:
    assert col in psm_df.columns
psm_df

Unnamed: 0,sequence,charge,rt,mods,mod_sites,nAA,frag_start_idx,frag_end_idx,rt_norm
0,AVVVSPK,2,0.018341,Phospho@S,5,7,0,6,0.018341
1,DPLAVDK,2,0.150034,,,7,6,12,0.150034
2,SVSFSLK,1,1.0,Phospho@S,3,7,12,18,1.0
3,VSVSPGR,2,0.0,Phospho@S;Phospho@S,2;4,7,18,24,0.0
4,YSLSPSK,2,0.296932,Phospho@S,4,7,24,30,0.296932


In [None]:
#hide
# Show the flat fragment intensity table built during the import above; it has
# one column per charged fragment type, and the precursor rows of `psm_df`
# reference it through `frag_start_idx` / `frag_end_idx`.
reader.fragment_intensity_df

Unnamed: 0,b_z1,b_z2,y_z1,y_z2,b_modloss_z1,b_modloss_z2,y_modloss_z1,y_modloss_z2
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.450362,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.106543,0.0,0.0,0.0,0.374123,0.0
3,0.0,0.0,0.069116,0.0,0.0,0.0,0.173858,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.515072,0.0,0.0,0.0,0.0
7,0.0,0.0,0.745664,0.0,0.0,0.0,0.0,0.0
8,0.143703,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,0.094888,0.0,0.377585,0.0,0.0,0.0,0.0,0.0


In [None]:
#hide
# Cross-check the reader's intensities for one precursor against values
# recomputed directly from the raw library table.
df = pd.read_csv(StringIO(tsv_str), sep='\t')
seq = 'YSLSPSK'
seq,start,end = psm_df.loc[psm_df.sequence==seq,['sequence','frag_start_idx','frag_end_idx']].values[0]
start, end = int(start), int(end)

def _expected_y_intensities(lib_df, sequence, loss_type):
    """Recompute the singly charged y-ion intensities of `sequence`.

    Returns an array of length len(sequence)-1 in which y-ion number k sits
    at position len(sequence)-k-1, the same layout the reader uses.
    """
    y_df = lib_df[
        (lib_df['StrippedPeptide']==sequence)
        &(lib_df['FragmentLossType']==loss_type)
        &(lib_df['FragmentType']=='y')
        &(lib_df['FragmentCharge']==1)
    ]
    y_ions = np.zeros(len(sequence)-1)
    # Dividing by 100 matches the reader's max-normalisation only because the
    # most intense fragment of this precursor is exactly 100 in `tsv_str`.
    y_ions[(len(sequence)-y_df.FragmentNumber-1).values] = y_df.RelativeIntensity.values / 100
    return y_ions

for loss_type, frag_col in [('noloss', 'y_z1'), ('H3PO4', 'y_modloss_z1')]:
    # frag_start_idx / frag_end_idx are half-open positional bounds, so slice
    # positionally. `.loc[start:end+1]` is label-based and inclusive on both
    # ends; it only worked because this precursor is the last one in the table.
    assert np.allclose(
        reader.fragment_intensity_df[frag_col].values[start:end],
        _expected_y_intensities(df, seq, loss_type)
    )