In [1]:
#default_exp reader.pfind_reader

In [2]:
#hide
%reload_ext autoreload
%autoreload 2

In [3]:
#export
import pandas as pd
import numpy as np
import typing
from tqdm import tqdm

import alphabase.constants.modification as ap_mod
from alphabase.peptide.fragment import \
    concat_precursor_fragment_dataframes,\
    init_fragment_by_precursor_dataframe


from alphadeep.reader.psm_reader import \
    PSMReaderBase, psm_reader_provider, \
    FragmentReaderBase, fragment_reader_provider


def convert_one_pFind_mod(mod):
    """Convert one pFind modification string into `name@site` notation.

    The input is expected to look like `Name[Site]`, optionally followed by
    a parenthesised suffix (which is discarded).

    Args:
        mod (str): a single, non-empty pFind modification string.

    Returns:
        str or None: `Name@X` for a single-residue site, `Name@Any N-term`
        style for a bare terminal site, `Name@X^Any_N-term` style for a
        residue restricted to a terminus, or None if the site text is not
        one of the recognised forms.
    """
    trailing_paren = mod[-1] == ')'
    if trailing_paren:
        # Cut one character before the first '(' so the closing ']' goes too.
        mod = mod[:(mod.find('(')-1)]
    bracket_pos = mod.rfind('[')
    mod_name = mod[:bracket_pos]
    if trailing_paren:
        site = mod[(bracket_pos+1):]
    else:
        # No suffix was removed, so the closing ']' is still the last char.
        site = mod[(bracket_pos+1):-1]

    if len(site) == 1:
        return mod_name + '@' + site

    # pFind terminal keyword -> label used after the '@'.
    # The residue-specific form replaces the space with '_' after a '^'.
    terminal_labels = (
        ('AnyN-term', 'Any N-term'),
        ('ProteinN-term', 'Protein N-term'),
        ('AnyC-term', 'Any C-term'),
        ('ProteinC-term', 'Protein C-term'),
    )
    for pfind_term, label in terminal_labels:
        if site == pfind_term:
            return mod_name + '@' + label
        if site.startswith(pfind_term):
            # The residue is the last character, e.g. 'AnyN-termQ' -> 'Q'.
            return mod_name + '@' + site[-1] + '^' + label.replace(' ', '_')
    return None

def translate_pFind_mod(mod_str):
    """Translate a ';'-separated pFind modification string.

    Args:
        mod_str (str): pFind modification names joined by ';'. A falsy
            value (e.g. an empty string) means "no modification".

    Returns:
        str or pd.NA: the translated names joined by ';', "" for a falsy
        input, or pd.NA as soon as one modification cannot be converted
        or is not a key of `ap_mod.MOD_INFO_DICT`.
    """
    if not mod_str:
        return ""
    translated = []
    for pfind_mod in mod_str.split(';'):
        converted = convert_one_pFind_mod(pfind_mod)
        # One unparsable or unknown modification invalidates the whole string.
        if not converted or converted not in ap_mod.MOD_INFO_DICT:
            return pd.NA
        translated.append(converted)
    return ';'.join(translated)

def get_pFind_mods(pfind_mod_str):
    """Split a pFind `site,name;site,name;` string into names and sites.

    Args:
        pfind_mod_str (str): entries of the form `site,name` separated
            (and possibly terminated) by ';'.

    Returns:
        tuple[str, str]: (names joined by ';', sites joined by ';').
        Both are "" when the input holds no entries.
    """
    trimmed = pfind_mod_str.strip(';')
    if not trimmed:
        return "", ""

    site_list = []
    name_list = []
    for entry in trimmed.split(';'):
        fields = entry.split(',', 3)
        # Field 0 is the site, field 1 the modification name.
        site_list.append(fields[0])
        name_list.append(fields[1])
    return ';'.join(name_list), ';'.join(site_list)

def remove_pFind_decoy_protein(protein):
    """Drop decoy entries from a '/'-separated pFind protein string.

    Args:
        protein (str): protein names separated by '/', normally with a
            trailing '/'.

    Returns:
        str: the names that do not start with 'REV_', joined by ';'.
    """
    # Only strip a real trailing separator; slicing off the last character
    # unconditionally would truncate the last name when no '/' terminates
    # the string.
    if protein.endswith('/'):
        protein = protein[:-1]
    return ';'.join(
        name for name in protein.split('/')
        if not name.startswith('REV_')
    )


In [4]:
get_pFind_mods('1,Oxidation[M];')

('Oxidation[M]', '1')

In [5]:
get_pFind_mods('1,Oxidation[M];5,Phosphos[S];')

('Oxidation[M];Phosphos[S]', '1;5')

In [6]:
convert_one_pFind_mod('Oxidation[M]')

'Oxidation@M'

In [7]:
translate_pFind_mod('Oxidation[M];Phospho[S]')

'Oxidation@M;Phospho@S'

In [8]:
#export
class pFindReader(PSMReaderBase):
    """PSM reader for tab-separated pFind result tables.

    `_load_file` reads the table and `_from_pfind_df` converts it into the
    PSM dataframe stored in `self._psm_df`.
    """
    def __init__(self,
        fragment_reader = None
    ):
        super().__init__(
            fragment_reader
        )

    def translate_modification(self):
        # Nothing to do here: `_from_pfind_df` already translates the
        # modification names while building the PSM dataframe.
        pass

    def _load_file(self, filename):
        """Read a pFind result table and convert it.

        Args:
            filename: path or file-like object accepted by `pd.read_csv`.
        """
        pfind_df = pd.read_csv(filename, index_col=False, sep='\t')
        # Missing cells become '' so the string helpers below never see NaN.
        pfind_df.fillna('', inplace=True)
        self._from_pfind_df(pfind_df)

    def _from_pfind_df(self, pfind_df):
        """Build `self._psm_df` from a pFind dataframe.

        Reads the columns 'Sequence', 'Modification', 'Charge', 'File_Name',
        'Scan_No', 'Final_Score', 'Proteins', 'Q-value' and a target/decoy
        column. Rows whose modifications cannot be translated are dropped.

        Args:
            pfind_df (pd.DataFrame): table as loaded by `_load_file`.
        """
        psm_df = pd.DataFrame()
        psm_df['sequence'] = pfind_df['Sequence']
        psm_df['nAA'] = pfind_df['Sequence'].str.len()
        # Assign from lists instead of unpacking `zip(*...)`, which raises
        # on an empty table.
        mod_pairs = pfind_df['Modification'].apply(get_pFind_mods)
        psm_df['mods'] = [mods for mods, _ in mod_pairs]
        psm_df['mod_sites'] = [sites for _, sites in mod_pairs]
        psm_df['charge'] = pfind_df['Charge']
        psm_df['RT'] = np.nan
        psm_df['mobility'] = np.nan
        # The raw name is everything before the first '.' of File_Name.
        psm_df['raw_name'] = pfind_df['File_Name'].str.split('.').apply(lambda x: x[0])
        psm_df['scan'] = pfind_df['Scan_No']
        psm_df['spec_id'] = pfind_df['File_Name']
        psm_df['score'] = -np.log(pfind_df['Final_Score'].values)
        psm_df['proteins'] = pfind_df['Proteins'].apply(remove_pFind_decoy_protein)
        # No gene information is parsed here; mirror the protein column.
        # (The column is 'proteins' -- 'Proteins' does not exist in psm_df
        # and raised KeyError.)
        psm_df['genes'] = psm_df['proteins']
        psm_df['q_value'] = pfind_df['Q-value']

        if 'Target/Decoy' in pfind_df.columns:
            psm_df['decoy'] = (pfind_df['Target/Decoy']=='decoy').astype(int)
        else:
            # NOTE(review): 'Targe/Decoy' looks like a deliberate fallback
            # for a misspelled header in some pFind outputs -- confirm
            # before "fixing" the spelling.
            psm_df['decoy'] = (pfind_df['Targe/Decoy']=='decoy').astype(int)

        # translate_pFind_mod yields pd.NA for unsupported modifications;
        # those PSMs are removed.
        psm_df['mods'] = psm_df['mods'].apply(translate_pFind_mod)
        psm_df = psm_df[~psm_df['mods'].isna()]

        self._psm_df = psm_df

# Make the reader available under the key 'pfind'.
psm_reader_provider.register_reader('pfind', pFindReader)

In [9]:
#export

class PSMLabelFragmentReader(FragmentReaderBase):
    """Fragment-intensity reader for psmlabel tables.

    Attributes set in `__init__`:
        psmlabel_frag_columns: psmlabel column names (e.g. 'b', 'y-ModLoss')
            for which at least one entry of `self.charged_ion_types` starts
            with the lower-cased column name followed by '_'.
        frag_df_columns: maps each such column name to an integer array of
            the matching indices into `self.charged_ion_types`.
    """
    def __init__(self,
        frag_types=['b','y','b-modloss','y-modloss'],
        max_frag_charge=2,
        frag_tol=20, frag_ppm=True,
    ):
        # NOTE(review): `self.charged_ion_types` is not set in this class;
        # presumably the base constructor derives it from frag_types and
        # max_frag_charge -- confirm.
        super().__init__(
            frag_types, max_frag_charge,
            frag_tol, frag_ppm
        )

        # Fragment columns of a psmlabel table. The names must match the
        # file header exactly ('H2O' with the letter O, 'NH3'), otherwise
        # the column is never found in `load_fragment_inten_df`.
        psmlabel_columns = 'b,b-NH3,b-H2O,b-ModLoss,y,y-NH3,y-H2O,y-ModLoss'.split(',')
        self.psmlabel_frag_columns = []
        self.frag_df_columns = {}
        for _type in psmlabel_columns:
            frag_idxes = [
                i for i,_t in enumerate(
                    self.charged_ion_types
                ) if _t.startswith(_type.lower()+'_')
            ]
            if frag_idxes:
                self.psmlabel_frag_columns.append(_type)
                self.frag_df_columns[_type] = np.array(
                    frag_idxes, dtype=int
                )

    def load_fragment_inten_df(self,
        psmlabel_df, raw_files=None
    ):
        """Fill `self._fragment_inten_df` from the psmlabel fragment columns.

        Each fragment cell holds entries such as 'b7-ModLoss+1,473386.4;'
        (ion letter, position, optional loss suffix, '+', charge digit, ',',
        intensity). Intensities of one PSM are divided by that PSM's maximum.

        Args:
            psmlabel_df (pd.DataFrame): needs an 'nAA' column plus the
                fragment columns. 'frag_start_idx' and 'frag_end_idx' are
                read after `init_fragment_by_precursor_dataframe` ran
                (presumably that call adds them -- confirm).
            raw_files: unused in this method.
        """
        self._fragment_inten_df = init_fragment_by_precursor_dataframe(
            psmlabel_df, self.charged_ion_types
        )

        # Pull the fragment columns once as arrays: access in the loop is
        # then positional, so it does not depend on the dataframe index
        # being 0..n-1.
        frag_strings = {
            ion_type: psmlabel_df[ion_type].values
            for ion_type in self.psmlabel_frag_columns
            if ion_type in psmlabel_df.columns
        }

        for ith_psm, (nAA, start,end) in enumerate(
            psmlabel_df[['nAA','frag_start_idx','frag_end_idx']].values
        ):
            # One row per fragmentation position, one column per charged ion type.
            intens = np.zeros((nAA-1, len(self.charged_ion_types)))
            for ion_type, column_values in frag_strings.items():
                # Negative end offset of the position digits: skips the
                # trailing '+c' (2 chars) and, for loss types, the
                # '-Loss' suffix as well.
                pos_end = ion_type.find('-')-len(ion_type)-2 if '-' in ion_type else -2
                typed_frags = column_values[ith_psm]
                if not typed_frags: continue
                type_columns = self.frag_df_columns[ion_type]

                frag_pos = []
                frag_charge = []
                frag_inten = []
                for frag in typed_frags.strip(';').split(';'):
                    frag, inten = frag.split(',')
                    charge = int(frag[-1])
                    # Skip charges with no configured column; indexing
                    # with them would raise IndexError. Assumes
                    # type_columns is ordered by charge 1..k.
                    if charge > len(type_columns): continue
                    frag_pos.append(int(frag[1:pos_end]))
                    frag_charge.append(charge)
                    frag_inten.append(float(inten))
                if not frag_inten: continue

                frag_pos = np.array(frag_pos, dtype=int)
                frag_col = np.array(frag_charge, dtype=int)-1

                if ion_type[0] in 'xyz':
                    # C-terminal ions are numbered from the other end.
                    frag_pos = nAA - frag_pos -1
                else:
                    frag_pos -= 1
                intens[frag_pos,type_columns[frag_col]] = frag_inten
            if np.any(intens>0):
                intens /= np.max(intens)
            self._fragment_inten_df.iloc[
                start:end,:
            ] = intens

class PSMLabelReader(pFindReader):
    """PSM reader for tab-separated psmlabel tables.

    If a fragment reader was given, it is also asked to load the fragment
    intensities of the retained PSMs.
    """
    def __init__(self,
        fragment_reader = None
    ):
        super().__init__(
            fragment_reader
        )

    def _load_file(self, filename):
        """Read a psmlabel table and convert it.

        Args:
            filename: path or file-like object accepted by `pd.read_csv`.
        """
        psmlabel_df = pd.read_csv(filename, sep="\t")
        # Missing cells become '' so the string helpers below never see NaN.
        psmlabel_df.fillna('', inplace=True)
        self._from_psmlabel_df(psmlabel_df)

    def _from_psmlabel_df(self, psmlabel_df):
        """Build `self._psm_df` (and fragment intensities) from a psmlabel table.

        Reads the columns 'peptide', 'modinfo', 'spec' and, when present,
        'charge' and 'RT'. Rows whose modifications cannot be translated
        are dropped. The dataframe passed in is not modified.

        Args:
            psmlabel_df (pd.DataFrame): table as loaded by `_load_file`.
        """
        psm_df = pd.DataFrame()
        psm_df['sequence'] = psmlabel_df['peptide']
        psm_df['nAA'] = psmlabel_df['peptide'].str.len()
        # Assign from lists instead of unpacking `zip(*...)`, which raises
        # on an empty table.
        mod_pairs = psmlabel_df['modinfo'].apply(get_pFind_mods)
        psm_df['mods'] = [mods for mods, _ in mod_pairs]
        psm_df['mod_sites'] = [sites for _, sites in mod_pairs]
        if 'charge' in psmlabel_df.columns:
            psm_df['charge'] = psmlabel_df['charge']
        else:
            # The charge is the third '.'-separated field from the end of 'spec'.
            psm_df['charge'] = psmlabel_df['spec'].str.split('.').apply(lambda x: x[-3]).astype(int)

        if 'RT' in psmlabel_df.columns:
            psm_df['RT'] = psmlabel_df['RT']
        else:
            psm_df['RT'] = pd.NA
        psm_df['mobility'] = pd.NA
        # The raw name is everything before the first '.' of 'spec'.
        psm_df['raw_name'] = psmlabel_df['spec'].str.split('.').apply(lambda x: x[0])

        # translate_pFind_mod yields pd.NA for unsupported modifications.
        psm_df['mods'] = psm_df['mods'].apply(translate_pFind_mod)

        # Filter both frames with the same mask so their rows stay aligned.
        # Boolean indexing + reset_index produces new frames, so everything
        # written below goes to copies, not to the caller's dataframe.
        keep = ~psm_df['mods'].isna()
        psmlabel_df = psmlabel_df[keep].reset_index(drop=True)
        psm_df = psm_df[keep].reset_index(drop=True)
        # The fragment reader reads 'nAA' from the psmlabel frame.
        psmlabel_df['nAA'] = psm_df['nAA']

        self._psm_df = psm_df

        if self.fragment_reader:
            self.fragment_reader.load_fragment_inten_df(psmlabel_df)
            self._psm_df[
                ['frag_start_idx','frag_end_idx']
            ] = psmlabel_df[['frag_start_idx','frag_end_idx']]

# Make both readers available under the key 'psmlabel'.
psm_reader_provider.register_reader('psmlabel', PSMLabelReader)
fragment_reader_provider.register_reader(
    'psmlabel', PSMLabelFragmentReader
)

In [10]:
import io
psmlabel_str = '''spec	peptide	modinfo	b	b-NH3	b-H2O	b-ModLoss	y	y-NH3	y-H2O	y-ModLoss
01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1.31809.31809.2.0.dta	PSTDLLMLK	2,Phospho[S];7,Oxidation[M];	b2+1,11394796;b3+1,1242152.8;b4+1,3736963.3;b4+2,169730.9;b5+1,1963146.4;b6+1,1264694.9;b6+2,265013.9;b7+1,1253226.5;b7+2,909294.6;b8+1,720161.7;		b2-H2O+1,1392711.1;b3-H2O+1,2807275.5;b4-H2O+1,656366;b5-H2O+1,341585;b6-H2O+1,209442.1;	b7-ModLoss+1,473386.4;b8-ModLoss+1,208994.1;	y8+1,22006548;y8+2,256042.3;y7+1,19231634;y7+2,213004.9;y6+1,6696723;y5+1,5890172;y4+1,4885660.5;y3+1,3570823.5;y2+1,1857323.8;y1+1,1636183.8;	y8-NH3+1,567207.4;y1-NH3+1,531551.1;	y8-H2O+1,1416820.1;y8-H2O+2,256081;y7-H2O+1,900931.1;y7-H2O+2,2961118.5;y3-H2O+1,184890.4;y2-H2O+1,306988.6;y1-H2O+1,1126237.5;	y8-ModLoss+1,4600049;y7-ModLoss+1,3840026.3;y6-ModLoss+1,1045096.9;y5-ModLoss+1,868705.3;y4-ModLoss+1,573257.7;y3-ModLoss+1,518627;
01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1.23862.23862.2.0.dta	HTAYSDFLSDK		b1+1,299364.8;b2+1,3488062;b3+1,308160.7;b4+1,233294.5;b5+1,55810.8;b6+1,650653.9;b7+1,485245;b8+1,328604.8;b9+1,160565.1;b10+1,376348.6;	b7-NH3+1,63030.5;b10-NH3+1,129601.2;	b2-H2O+1,176123.1;b3-H2O+1,114956.5;b4-H2O+1,59385.5;b5-H2O+1,41324.8;b6-H2O+1,527812.9;b7-H2O+1,275831.8;b8-H2O+1,365457.2;b9-H2O+1,227540.1;b9-H2O+2,59055.5;b10-H2O+1,265041.1;b10-H2O+2,55810.8;		y10+1,2513661;y9+1,3651241.3;y8+1,989975.4;y7+1,594356.4;y6+1,155207.8;y5+1,1266161.9;y4+1,321580;y3+1,1227822.8;y2+1,636557.6;y1+1,697604.3;	y10-NH3+1,75562.7;y7-NH3+1,102006.4;y1-NH3+1,185766.1;	y10-H2O+1,189888.1;y9-H2O+1,73236.7;y4-H2O+1,56329.2;y3-H2O+1,91522.7;y2-H2O+1,98231.2;y1-H2O+1,375849.7;	
01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1.23431.23431.2.0.dta	HTAYSDFLSDK		b1+1,45976.2;b2+1,568759.5;b3+1,49093.1;b4+1,49601;b5+1,23729.4;b6+1,141218;b7+1,104082.9;b8+1,115693.4;b9+1,60744.1;b10+1,98634.1;	b5-NH3+1,12496.8;b8-NH3+1,33514.1;b9-NH3+1,34818.7;	b2-H2O+1,13616.9;b3-H2O+1,9902.4;b4-H2O+1,29442.6;b5-H2O+1,13391.7;b6-H2O+1,54826.9;b7-H2O+1,62953.9;b8-H2O+1,69100.3;b9-H2O+1,60146.4;b10-H2O+1,50907.2;b10-H2O+2,23729.4;		y10+1,361255.9;y9+1,552602.6;y8+1,160028.2;y7+1,102606.7;y6+1,22479.1;y5+1,167033.7;y4+1,76430.6;y3+1,273281.6;y2+1,165234.1;y1+1,142589;	y7-NH3+1,22439.1;y1-NH3+1,37364.8;	y10-H2O+1,29709;y9-H2O+1,16514.8;y3-H2O+1,36499.1;y2-H2O+1,17987.4;y1-H2O+1,96955.6;	
01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1.32733.32733.2.0.dta	HFALFSTDVTK		b1+1,27135.7;b2+1,361137.4;b3+1,68835.3;b4+1,70138.3;b5+1,45754.8;b7+1,11576.6;b8+1,91503.8;b9+1,64331.7;b10+1,27626.7;b10+2,25667;		b3-H2O+1,48033;b9-H2O+1,14316.2;b10-H2O+1,11975.8;		y10+1,219460.2;y10+2,13433.4;y9+1,442455.6;y8+1,97392.2;y7+1,108960.5;y6+1,60849.7;y5+1,26771.3;y4+1,17036.4;y3+1,45523.9;y2+1,103608.1;y1+1,62643;	y6-NH3+2,11445.5;y1-NH3+1,18111.4;	y2-H2O+1,15362.3;y1-H2O+1,34004.8;	
01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1.23669.23669.2.0.dta	HTAYSDFLSDK		b1+1,262855;b2+1,3235572.3;b3+1,268667.7;b4+1,237506.8;b5+1,80077.3;b6+1,557696.8;b7+1,336325.9;b7+2,31299.9;b8+1,247175;b8+2,28601.6;b9+1,116897.4;b9+2,18714.8;b10+1,275498.9;	b2-NH3+1,19037.2;	b2-H2O+1,141344.2;b3-H2O+1,92893.6;b4-H2O+1,56392;b5-H2O+1,46386.1;b6-H2O+1,404526;b7-H2O+1,203047.2;b7-H2O+2,13485.6;b8-H2O+1,231333.9;b8-H2O+2,30468.7;b9-H2O+1,151952.4;b9-H2O+2,53914;b10-H2O+1,172398.7;b10-H2O+2,80077.3;		y10+1,1652851.5;y10+2,31706.2;y9+1,2379192.5;y8+1,664060.9;y8+2,26944.2;y7+1,418105.1;y6+1,118890.7;y5+1,1026599.5;y4+1,309265.2;y3+1,1084321;y2+1,608127.8;y1+1,617369.5;	y10-NH3+1,41452.9;y7-NH3+1,61761.1;y2-NH3+1,32386.8;y1-NH3+1,199112.3;	y10-H2O+1,127643.4;y9-H2O+1,49576.6;y8-H2O+1,26233.2;y6-H2O+1,13648.5;y5-H2O+1,34467.8;y4-H2O+1,28410.1;y3-H2O+1,75421.2;y2-H2O+1,106013.4;y1-H2O+1,351150.3;	
01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1.32408.32408.2.0.dta	HFALFSTDVTK		b1+1,39174;b2+1,547471.8;b3+1,97899.3;b4+1,102380.5;b5+1,64629.1;b6+1,18020.3;b6+2,11095.4;b8+1,102782.6;b9+1,70052.5;b10+1,32341.3;b10+2,19485.5;	b2-NH3+1,17124.9;	b3-H2O+1,81865.9;b8-H2O+1,16527.8;b10-H2O+1,22699.3;		y10+1,251072.3;y9+1,579795.4;y8+1,126733.6;y7+1,154207.1;y6+1,73626;y5+1,19896.1;y4+1,20851.9;y3+1,70220.2;y2+1,127975.4;y1+1,81733.5;	y1-NH3+1,23140.9;	y2-H2O+1,18157.2;y1-H2O+1,43263.3;	
01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1.31708.31708.2.0.dta	PSTDLLMLK	7,Oxidation[M];	b2+1,15111598;b3+1,1437139.1;b4+1,4843799.5;b4+2,308051.3;b5+1,2508311;b6+1,1812935.6;b6+2,332253.3;b7+1,1607183.6;b7+2,1064015.8;b8+1,1010607.1;		b2-H2O+1,1739067.1;b3-H2O+1,3721898;b4-H2O+1,832363.4;b5-H2O+1,598314.4;b6-H2O+1,370911.8;b7-H2O+1,342069.2;b8-H2O+1,267002.9;	b7-ModLoss+1,667185.4;b8-ModLoss+1,233204;	y8+1,28281878;y8+2,367999.1;y7+1,22845608;y7+2,591054.7;y6+1,8346688;y5+1,6896273;y5+2,145534.7;y4+1,5982969;y3+1,4114733.5;y2+1,2586525.3;y1+1,2430101.8;	y7-NH3+1,376892.8;y6-NH3+1,217853;y1-NH3+1,732089.9;	y8-H2O+1,1979115.4;y8-H2O+2,324478.8;y7-H2O+1,1182049.3;y7-H2O+2,4076522;y6-H2O+1,180659.8;y5-H2O+1,166679.8;y3-H2O+1,163515.6;y2-H2O+1,357436.9;y1-H2O+1,1743094.4;	y8-ModLoss+1,6069443.5;y7-ModLoss+1,4348249;y6-ModLoss+1,1384640.9;y5-ModLoss+1,1112314.8;y4-ModLoss+1,563618.3;y3-ModLoss+1,625598.4;
01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1.21837.21837.2.0.dta	HDLDYGIDSYK		b1+1,41983.3;b2+1,379322.4;b3+1,56831.5;b4+1,115561.6;b5+1,42836.4;b6+1,50857.8;b7+1,51406.2;b7+2,18378.2;b8+1,113548.1;b8+2,9950.8;b9+1,59166.4;b9+2,15431;b10+1,48784.2;	b10-NH3+1,13714.9;	b2-H2O+1,13046.2;b3-H2O+1,64792.4;b4-H2O+1,11627.2;b9-H2O+1,41938.1;b10-H2O+1,26039.3;		y10+1,305846.1;y9+1,262227.6;y8+1,113308;y7+1,140019.9;y6+1,74818.5;y4+1,50029.2;y3+1,135117.1;y2+1,55729.5;y1+1,93491.5;	y10-NH3+1,15723.9;y9-NH3+1,10144.8;y1-NH3+1,28375.2;	y7-H2O+1,11441.7;y1-H2O+1,42537.6;	
'''
psmlabel_df = pd.read_csv(io.StringIO(psmlabel_str), sep='\t')
psmlabel_df.fillna('', inplace=True)
psmlabel_df

Unnamed: 0,spec,peptide,modinfo,b,b-NH3,b-H2O,b-ModLoss,y,y-NH3,y-H2O,y-ModLoss
0,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1...,PSTDLLMLK,"2,Phospho[S];7,Oxidation[M];","b2+1,11394796;b3+1,1242152.8;b4+1,3736963.3;b4...",,"b2-H2O+1,1392711.1;b3-H2O+1,2807275.5;b4-H2O+1...","b7-ModLoss+1,473386.4;b8-ModLoss+1,208994.1;","y8+1,22006548;y8+2,256042.3;y7+1,19231634;y7+2...","y8-NH3+1,567207.4;y1-NH3+1,531551.1;","y8-H2O+1,1416820.1;y8-H2O+2,256081;y7-H2O+1,90...","y8-ModLoss+1,4600049;y7-ModLoss+1,3840026.3;y6..."
1,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1...,HTAYSDFLSDK,,"b1+1,299364.8;b2+1,3488062;b3+1,308160.7;b4+1,...","b7-NH3+1,63030.5;b10-NH3+1,129601.2;","b2-H2O+1,176123.1;b3-H2O+1,114956.5;b4-H2O+1,5...",,"y10+1,2513661;y9+1,3651241.3;y8+1,989975.4;y7+...","y10-NH3+1,75562.7;y7-NH3+1,102006.4;y1-NH3+1,1...","y10-H2O+1,189888.1;y9-H2O+1,73236.7;y4-H2O+1,5...",
2,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1...,HTAYSDFLSDK,,"b1+1,45976.2;b2+1,568759.5;b3+1,49093.1;b4+1,4...","b5-NH3+1,12496.8;b8-NH3+1,33514.1;b9-NH3+1,348...","b2-H2O+1,13616.9;b3-H2O+1,9902.4;b4-H2O+1,2944...",,"y10+1,361255.9;y9+1,552602.6;y8+1,160028.2;y7+...","y7-NH3+1,22439.1;y1-NH3+1,37364.8;","y10-H2O+1,29709;y9-H2O+1,16514.8;y3-H2O+1,3649...",
3,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1...,HFALFSTDVTK,,"b1+1,27135.7;b2+1,361137.4;b3+1,68835.3;b4+1,7...",,"b3-H2O+1,48033;b9-H2O+1,14316.2;b10-H2O+1,1197...",,"y10+1,219460.2;y10+2,13433.4;y9+1,442455.6;y8+...","y6-NH3+2,11445.5;y1-NH3+1,18111.4;","y2-H2O+1,15362.3;y1-H2O+1,34004.8;",
4,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1...,HTAYSDFLSDK,,"b1+1,262855;b2+1,3235572.3;b3+1,268667.7;b4+1,...","b2-NH3+1,19037.2;","b2-H2O+1,141344.2;b3-H2O+1,92893.6;b4-H2O+1,56...",,"y10+1,1652851.5;y10+2,31706.2;y9+1,2379192.5;y...","y10-NH3+1,41452.9;y7-NH3+1,61761.1;y2-NH3+1,32...","y10-H2O+1,127643.4;y9-H2O+1,49576.6;y8-H2O+1,2...",
5,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1...,HFALFSTDVTK,,"b1+1,39174;b2+1,547471.8;b3+1,97899.3;b4+1,102...","b2-NH3+1,17124.9;","b3-H2O+1,81865.9;b8-H2O+1,16527.8;b10-H2O+1,22...",,"y10+1,251072.3;y9+1,579795.4;y8+1,126733.6;y7+...","y1-NH3+1,23140.9;","y2-H2O+1,18157.2;y1-H2O+1,43263.3;",
6,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1...,PSTDLLMLK,"7,Oxidation[M];","b2+1,15111598;b3+1,1437139.1;b4+1,4843799.5;b4...",,"b2-H2O+1,1739067.1;b3-H2O+1,3721898;b4-H2O+1,8...","b7-ModLoss+1,667185.4;b8-ModLoss+1,233204;","y8+1,28281878;y8+2,367999.1;y7+1,22845608;y7+2...","y7-NH3+1,376892.8;y6-NH3+1,217853;y1-NH3+1,732...","y8-H2O+1,1979115.4;y8-H2O+2,324478.8;y7-H2O+1,...","y8-ModLoss+1,6069443.5;y7-ModLoss+1,4348249;y6..."
7,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1...,HDLDYGIDSYK,,"b1+1,41983.3;b2+1,379322.4;b3+1,56831.5;b4+1,1...","b10-NH3+1,13714.9;","b2-H2O+1,13046.2;b3-H2O+1,64792.4;b4-H2O+1,116...",,"y10+1,305846.1;y9+1,262227.6;y8+1,113308;y7+1,...","y10-NH3+1,15723.9;y9-NH3+1,10144.8;y1-NH3+1,28...","y7-H2O+1,11441.7;y1-H2O+1,42537.6;",


In [11]:
# Fetch the 'psmlabel' fragment reader and hand it to the 'psmlabel' PSM reader.
frag_reader = fragment_reader_provider.get_reader('psmlabel')
psm_reader = psm_reader_provider.get_reader('psmlabel', frag_reader)
# Convert the in-memory demo table directly instead of loading it from a file.
psm_reader._from_psmlabel_df(psmlabel_df)
psm_reader.fragment_inten_df

Unnamed: 0,b_1+,b_2+,y_1+,y_2+,b-modloss_1+,b-modloss_2+,y-modloss_1+,y-modloss_2+
0,0.000000,0.000000,1.000000,0.011635,0.0,0.0,0.209031,0.0
1,0.517791,0.000000,0.873905,0.009679,0.0,0.0,0.174495,0.0
2,0.056445,0.000000,0.304306,0.000000,0.0,0.0,0.047490,0.0
3,0.169811,0.007713,0.267655,0.000000,0.0,0.0,0.039475,0.0
4,0.089207,0.000000,0.222009,0.000000,0.0,0.0,0.026049,0.0
...,...,...,...,...,...,...,...,...
71,0.134075,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0
72,0.135521,0.048450,0.131891,0.000000,0.0,0.0,0.000000,0.0
73,0.299345,0.026233,0.356206,0.000000,0.0,0.0,0.000000,0.0
74,0.155979,0.040680,0.146919,0.000000,0.0,0.0,0.000000,0.0


In [12]:
psm_reader.psm_df

Unnamed: 0,sequence,nAA,mods,mod_sites,charge,RT,mobility,raw_name,frag_start_idx,frag_end_idx
0,PSTDLLMLK,9,Phospho@S;Oxidation@M,2;7,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,0,8
1,HTAYSDFLSDK,11,,,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,8,18
2,HTAYSDFLSDK,11,,,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,18,28
3,HFALFSTDVTK,11,,,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,28,38
4,HTAYSDFLSDK,11,,,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,38,48
5,HFALFSTDVTK,11,,,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,48,58
6,PSTDLLMLK,9,Oxidation@M,7,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,58,66
7,HDLDYGIDSYK,11,,,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,66,76


In [13]:
#export
def load_psmlabel_list(
    psmlabel_list,
    nce_list,
    instrument_list,
    frag_types=['b','y','b-modloss','y-modloss'],
    frag_charge=2,
    include_mod_list=[
        'Oxidation@M','Phospho@S','Phospho@T','Phospho@Y','Acetyl@Protein N-term'
    ]
):
    """Load several psmlabel files and concatenate their PSMs and fragments.

    Args:
        psmlabel_list: psmlabel inputs, each passed to `PSMLabelReader.load`.
        nce_list: value for the 'NCE' column, one entry per psmlabel input.
        instrument_list: value for the 'instrument' column, one entry per
            psmlabel input.
        frag_types (list): fragment types given to `PSMLabelFragmentReader`.
        frag_charge (int): passed as `max_frag_charge` to the fragment reader.
        include_mod_list (list): passed to `filter_psm_by_modifications`.

    Returns:
        The result of `concat_precursor_fragment_dataframes` on the per-file
        PSM and fragment-intensity dataframes.

    Raises:
        IndexError: if `nce_list` or `instrument_list` has fewer entries
            than `psmlabel_list`.
    """
    psm_df_list = []
    fragment_inten_df_list = []
    # Wrap the list itself rather than the enumerate object: enumerate has
    # no len(), so tqdm could not show a total or a percentage.
    for i,psmlabel in enumerate(tqdm(psmlabel_list)):
        frag_reader = PSMLabelFragmentReader(
            frag_types=frag_types, max_frag_charge=frag_charge
        )
        psm_reader = PSMLabelReader(
            frag_reader
        )
        psm_reader.load(psmlabel)
        psm_reader.filter_psm_by_modifications(include_mod_list)
        # Tag every PSM of this file with its acquisition settings.
        psm_reader.psm_df['NCE'] = nce_list[i]
        psm_reader.psm_df['instrument'] = instrument_list[i]
        psm_df_list.append(psm_reader.psm_df)
        fragment_inten_df_list.append(psm_reader.fragment_inten_df)
    return concat_precursor_fragment_dataframes(psm_df_list, fragment_inten_df_list)


In [14]:
# Load the same demo table twice, tagging each copy with its own NCE and instrument.
# NOTE(review): with include_mod_list=[] the psm_df shown below contains only
# unmodified PSMs -- presumably filter_psm_by_modifications drops every
# modified one; confirm.
psm_df, fragment_inten_df = load_psmlabel_list(
    [io.StringIO(psmlabel_str), io.StringIO(psmlabel_str)],
    [30,30],['Lumos','QE'],
    include_mod_list=[]
)

2it [00:00, 50.50it/s]


In [15]:
psm_df

Unnamed: 0,sequence,nAA,mods,mod_sites,charge,RT,mobility,raw_name,frag_start_idx,frag_end_idx,NCE,instrument
0,HTAYSDFLSDK,11,,,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,8,18,30,Lumos
1,HTAYSDFLSDK,11,,,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,18,28,30,Lumos
2,HFALFSTDVTK,11,,,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,28,38,30,Lumos
3,HTAYSDFLSDK,11,,,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,38,48,30,Lumos
4,HFALFSTDVTK,11,,,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,48,58,30,Lumos
5,HDLDYGIDSYK,11,,,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,66,76,30,Lumos
6,HTAYSDFLSDK,11,,,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,84,94,30,QE
7,HTAYSDFLSDK,11,,,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,94,104,30,QE
8,HFALFSTDVTK,11,,,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,104,114,30,QE
9,HTAYSDFLSDK,11,,,2,,,01625b_GA3-TUM_first_pool_17_01_01-3xHCD-1h-R1,114,124,30,QE


In [16]:
fragment_inten_df

Unnamed: 0,b_1+,b_2+,y_1+,y_2+,b-modloss_1+,b-modloss_2+,y-modloss_1+,y-modloss_2+
0,0.000000,0.000000,1.000000,0.011635,0.0,0.0,0.209031,0.0
1,0.517791,0.000000,0.873905,0.009679,0.0,0.0,0.174495,0.0
2,0.056445,0.000000,0.304306,0.000000,0.0,0.0,0.047490,0.0
3,0.169811,0.007713,0.267655,0.000000,0.0,0.0,0.039475,0.0
4,0.089207,0.000000,0.222009,0.000000,0.0,0.0,0.026049,0.0
...,...,...,...,...,...,...,...,...
147,0.134075,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0
148,0.135521,0.048450,0.131891,0.000000,0.0,0.0,0.000000,0.0
149,0.299345,0.026233,0.356206,0.000000,0.0,0.0,0.000000,0.0
150,0.155979,0.040680,0.146919,0.000000,0.0,0.0,0.000000,0.0
