In [1]:
#default_exp reader.maxquant_reader

In [2]:
#hide
import pandas as pd
df = pd.DataFrame({'a':['1,2;3,4;5,6','3,4;5,6']})
def parse_str(s):
    return ';'.join([item.split(',')[0] for item in s.split(';')]), ';'.join([item.split(',')[1] for item in s.split(';')])
df['b'], df['c'] = zip(*df.a.apply(parse_str))
df

Unnamed: 0,a,b,c
0,"1,2;3,4;5,6",1;3;5,2;4;6
1,"3,4;5,6",3;5,4;6


In [3]:
#export
import pandas as pd
import numba
import numpy as np

from alphadeep.reader.psm_reader import PSMReaderBase

from alphabase.peptide.fragment import \
    init_fragment_by_precursor_dataframe

@numba.njit
def parse_mq(
    modseq, 
    fixed_C=True
):
    PeptideModSeq = modseq.strip('_')
    mod_list = []
    site_list = []
    if PeptideModSeq.startswith('('):
        site_list.append('0')
        site_end = PeptideModSeq.find(')')+1
        mod_list.append(PeptideModSeq[:site_end])
        PeptideModSeq = PeptideModSeq[site_end:]
    site = PeptideModSeq.find('(')
    while site != -1:
        site_end = PeptideModSeq.find(')',site+1)+1
        if site_end < len(PeptideModSeq) and PeptideModSeq[site_end] == ')': 
            site_end += 1
        site_list.append(str(site+1)) 
        mod_list.append(PeptideModSeq[site-1:site_end])
        PeptideModSeq = PeptideModSeq[:site] + PeptideModSeq[site_end:]
        site = PeptideModSeq.find('(', site)
    if fixed_C:
        site = PeptideModSeq.find('C')
        while site != -1:
            site_list.append(str(site+1))
            mod_list.append('C(Carbamidomethyl (C))')
            site = PeptideModSeq.find('C',site+1)
    return ';'.join(mod_list), ';'.join(site_list)

class MaxQuantReader(PSMReaderBase):
    def __init__(self, 
        frag_types=['b','y','b-modloss','y-modloss'], 
        max_frag_charge=2,
        load_frag_inten=False,
    ):
        super().__init__(frag_types, max_frag_charge)

        self.if_load_frag_inten = load_frag_inten
        
        self.modification_convert_dict = {}
        self.modification_convert_dict['(Acetyl (Protein N-term))'] = 'Acetyl@Protein N-term'
        self.modification_convert_dict['C(Carbamidomethyl (C))'] = 'Carbamidomethyl@C'
        self.modification_convert_dict['M(Oxidation (M))'] = 'Oxidation@M'
        self.modification_convert_dict['S(Phospho (S))'] = 'Phospho@S'
        self.modification_convert_dict['T(Phospho (T))'] = 'Phospho@T'
        self.modification_convert_dict['Y(Phospho (Y))'] = 'Phospho@Y'
        self.modification_convert_dict['S(Phospho (ST))'] = 'Phospho@S'
        self.modification_convert_dict['T(Phospho (ST))'] = 'Phospho@T'
        self.modification_convert_dict['S(Phospho (STY))'] = 'Phospho@S'
        self.modification_convert_dict['T(Phospho (STY))'] = 'Phospho@T'
        self.modification_convert_dict['Y(Phospho (STY))'] = 'Phospho@Y'
        self.modification_convert_dict['N(Deamidation (NQ))'] = 'Deamidated@N'
        self.modification_convert_dict['Q(Deamidation (NQ))'] = 'Deamidated@Q'
        self.modification_convert_dict['K(GlyGly (K))'] = 'GlyGly@K'
        self.modification_convert_dict['(ac)'] = 'Acetyl@Protein N-term'
        self.modification_convert_dict['M(ox)'] = 'Oxidation@M'
        self.modification_convert_dict['S(ph)'] = 'Phospho@S'
        self.modification_convert_dict['T(ph)'] = 'Phospho@T'
        self.modification_convert_dict['Y(ph)'] = 'Phospho@Y'
        self.modification_convert_dict['K(gl)'] = 'GlyGly@K'

    def _load_file(self, filename):
        df = pd.read_csv(filename, sep='\t')
        df = df[(df['Reverse']!='+')&(~pd.isna(df['Retention time']))]
        df = df.reset_index(drop=True)
        psm_df = pd.DataFrame()
        psm_df['sequence'] = df['Sequence']
        psm_df['nAA'] = psm_df.sequence.str.len()
        psm_df['mods'], psm_df['mod_sites'] = zip(*df['Modified sequence'].apply(parse_mq))
        psm_df['charge'] = df['Charge']
        psm_df['RT'] = df['Retention time']*60
        if 'Scan number' in df.columns:
            # msms.txt
            psm_df['scan'] = df['Scan number']
        else:
            # evidence.txt
            psm_df['scan'] = df['MS/MS scan number']
        if 'K0' in df.columns:
            psm_df['mobility'] = 1/df['K0']
        else:
            psm_df['mobility'] = pd.NA
        if 'CCS' in df.columns:
            psm_df['CCS'] = df['CCS']
        else:
            psm_df['CCS'] = pd.NA
        psm_df['raw_name'] = df['Raw file']
        psm_df['score'] = df['Score']
        psm_df['proteins'] = df['Proteins']
        if 'Gene Names' in df.columns:
            psm_df['genes'] = df['Gene Names']
        elif 'Gene names' in df.columns:
            psm_df['genes'] = df['Gene names']
        else:
            psm_df['genes'] = ''
        self._psm_df = psm_df

        if self.if_load_frag_inten:
            self._load_fragment_inten(df)

    def _load_fragment_inten(self, mq_df):
        self._fragment_inten_df = init_fragment_by_precursor_dataframe(
            self._psm_df, self.charged_ion_types
        )

        frag_col_dict = dict(zip(
            self.charged_ion_types, 
            range(len(self.charged_ion_types))
        ))

        for ith_psm, (nAA, start,end) in enumerate(
            self.psm_df[['nAA','frag_start_idx','frag_end_idx']].values
        ):
            intens = np.zeros((nAA-1, len(self.charged_ion_types)))

            frag_types = mq_df.loc[ith_psm,'Matches']
            frag_intens = mq_df.loc[ith_psm,'Intensities']
            for frag_type, frag_inten in zip(
                frag_types.split(';'), frag_intens.split(';')
            ):
                if '-' in frag_type: continue
                idx = frag_type.find('(')
                charge = '1+'
                if idx > 0:
                    frag_type, charge = frag_type[:idx], frag_type[idx+1:-1]
                frag_type, frag_pos = frag_type[0], int(frag_type[1:])
                if frag_type in 'xyz':
                    frag_pos = nAA - frag_pos -1
                else:
                    frag_pos -= 1 
                frag_type += '_'+charge
                if frag_type not in frag_col_dict: continue
                frag_col = frag_col_dict[frag_type]
                
                intens[frag_pos,frag_col] = float(frag_inten)

            if np.any(intens==0):
                intens /= np.max(intens)
            self._fragment_inten_df.iloc[
                start:end,:
            ] = intens

    def load_fragment_inten_df(self, ms_files=None):
        pass

In [4]:
#hide
import os
import sys
filename = '/Users/zengwenfeng/Workspace/Data/HeLa_500ng/share/txt/msms.txt'
if not os.path.isfile(filename): sys.exit(-1)
mq_reader = MaxQuantReader(load_frag_inten=True)
mq_reader.load(filename)
mq_reader.psm_df[mq_reader.psm_df.mods.str.contains('Oxidation')]

  self._load_file(filename)


Unnamed: 0,sequence,nAA,mods,mod_sites,charge,RT,scan,mobility,CCS,raw_name,score,proteins,genes,frag_start_idx,frag_end_idx
38,AAALEAMK,8,Oxidation@M,8,2,936.48,10442,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,89.550,sp|P31948|STIP1_HUMAN,,990,997
46,AAAPAPEEEMDECEQALAAEPK,22,Oxidation@M;Carbamidomethyl@C,11;13,2,3164.64,58999,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,153.230,sp|P26641|EF1G_HUMAN,,1086,1107
47,AAAPAPEEEMDECEQALAAEPK,22,Oxidation@M;Carbamidomethyl@C,11;13,3,3164.76,59002,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,93.851,sp|P26641|EF1G_HUMAN,,1107,1128
48,AAAPAPEEEMDECEQALAAEPK,22,Oxidation@M;Carbamidomethyl@C,11;13,2,3195.00,59650,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,202.320,sp|P26641|EF1G_HUMAN,,1128,1149
49,AAAPAPEEEMDECEQALAAEPK,22,Oxidation@M;Carbamidomethyl@C,11;13,3,3195.06,59651,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,124.690,sp|P26641|EF1G_HUMAN,,1149,1170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50243,YVMTTTTLER,10,Oxidation@M,4,2,2031.06,34495,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,84.213,sp|P05198|IF2A_HUMAN,,634823,634832
50279,YWDLMNLSEK,10,Oxidation@M,6,2,4328.10,83722,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,84.615,sp|Q9BZE4|NOG1_HUMAN,,635232,635241
50297,YYAVNFPMR,9,Oxidation@M,9,2,3565.50,67576,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,66.087,sp|Q92769|HDAC2_HUMAN,,635443,635451
50344,YYTSASGDEMVSLK,14,Oxidation@M,11,2,2662.74,48198,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,83.652,sp|P07900|HS90A_HUMAN,,635953,635966


In [5]:
mq_reader.psm_df

Unnamed: 0,sequence,nAA,mods,mod_sites,charge,RT,scan,mobility,CCS,raw_name,score,proteins,genes,frag_start_idx,frag_end_idx
0,AAAAAAAAAPAAAATAPTTAATTAATAAQ,29,,,3,4215.66,81358,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,41.423,sp|P37108|SRP14_HUMAN,,0,28
1,AAAAAAAAAPAAAATAPTTAATTAATAAQ,29,,,2,4217.22,81391,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,118.210,sp|P37108|SRP14_HUMAN,,28,56
2,AAAAAAALQAK,11,,,2,1602.48,25126,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,92.439,sp|P36578|RL4_HUMAN,,131,141
3,AAAAADLANR,10,,,2,1302.12,18541,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,97.635,sp|O76031|CLPX_HUMAN,,176,185
4,AAAAAWEEPSSGNGTAR,17,,,2,2233.20,38884,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,165.580,sp|Q9P258|RCC2_HUMAN,,238,254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50355,YYVTIIDAPGHR,12,,,2,3607.44,68466,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,199.460,sp|P68104|EF1A1_HUMAN;sp|Q5VTE0|EF1A3_HUMAN,,636073,636084
50356,YYVTIIDAPGHR,12,,,2,3638.10,69117,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,102.070,sp|P68104|EF1A1_HUMAN;sp|Q5VTE0|EF1A3_HUMAN,,636084,636095
50357,YYYAVVDCDSPETASK,16,Carbamidomethyl@C,8,2,3400.50,64050,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,186.810,sp|Q9H501|ESF1_HUMAN,,636095,636110
50358,YYYAVYDMVVR,11,,,2,5141.70,100746,,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,96.331,sp|P07942|LAMB1_HUMAN,,636110,636120


In [6]:
mq_reader.fragment_inten_df.loc[:27]

Unnamed: 0,b_1+,b_2+,y_1+,y_2+,b-modloss_1+,b-modloss_2+,y-modloss_1+,y-modloss_2+
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0
8,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
