In [1]:
#| default_exp psm_reader.pfind_reader

# pFind PSM reader

In [2]:
#| hide
%reload_ext autoreload
%autoreload 2

In [3]:
#| export

import pandas as pd
import numpy as np

import alphabase.constants.modification as ap_mod

from alphabase.psm_reader.psm_reader import (
    PSMReaderBase, psm_reader_provider,
    psm_reader_yaml
)

In [4]:
#| export

def convert_one_pFind_mod(mod):
    """Convert one pFind modification string into ``name@site`` notation.

    The input is expected to look like ``Name[Site]``. If the string ends
    with ``')'``, everything from the character preceding the first ``'('``
    onwards is discarded before the site is extracted.

    Parameters
    ----------
    mod : str
        A single, non-empty pFind modification, e.g. ``'Oxidation[M]'`` or
        ``'Gln->pyro-Glu[AnyN-termQ]'``.

    Returns
    -------
    str or None
        ``'Name@X'`` for a one-character site, ``'Name@Any N-term'`` (or the
        Protein / C-term equivalents) for a bare terminal site,
        ``'Name@X^Any N-term'`` (etc.) when the terminal site carries extra
        characters (the last character is used as ``X``), and ``None`` when
        the site matches none of these forms.
    """
    has_paren_suffix = mod[-1] == ')'
    if has_paren_suffix:
        # Cut at the character before the first '(' (this also removes the
        # closing ']' of the site when the suffix directly follows it).
        mod = mod[:mod.find('(') - 1]

    bracket = mod.rfind('[')
    name = mod[:bracket]
    # Without a parenthesised suffix the closing ']' is still present.
    site = mod[bracket + 1:] if has_paren_suffix else mod[bracket + 1:-1]

    if len(site) == 1:
        return name + '@' + site

    # pFind terminal label -> label used in the converted name.
    terminal_labels = (
        ('AnyN-term', 'Any N-term'),
        ('ProteinN-term', 'Protein N-term'),
        ('AnyC-term', 'Any C-term'),
        ('ProteinC-term', 'Protein C-term'),
    )
    for pfind_label, converted_label in terminal_labels:
        if site == pfind_label:
            return name + '@' + converted_label
        if site.startswith(pfind_label):
            # Terminal site with extra characters: keep the last one.
            return name + '@' + site[-1] + '^' + converted_label
    return None

def translate_pFind_mod(mod_str):
    """Translate a ';'-joined string of pFind modifications.

    Each piece is converted with ``convert_one_pFind_mod`` and must be a key
    of ``ap_mod.MOD_INFO_DICT``.

    Parameters
    ----------
    mod_str : str
        pFind modification names joined by ``';'``; may be empty.

    Returns
    -------
    str or pandas.NA
        ``""`` for a falsy input, the ';'-joined converted names when every
        piece is converted and found in ``ap_mod.MOD_INFO_DICT``, otherwise
        ``pd.NA`` (returned at the first piece that fails).
    """
    if not mod_str:
        return ""

    translated = []
    for pfind_mod in mod_str.split(';'):
        converted = convert_one_pFind_mod(pfind_mod)
        # Stop at the first piece that cannot be converted or is unknown.
        if not converted or converted not in ap_mod.MOD_INFO_DICT:
            return pd.NA
        translated.append(converted)
    return ';'.join(translated)

def get_pFind_mods(pfind_mod_str):
    """Split a pFind modification string into modification names and sites.

    The input is a ';'-separated list of ``site,Name[Site]`` items, e.g.
    ``'6,Carbamidomethyl[C];17,Oxidation[M];'``. Leading and trailing
    ``';'`` are ignored.

    Parameters
    ----------
    pfind_mod_str : str
        Modification string of one PSM; may be empty.

    Returns
    -------
    tuple of (str, str)
        ``(mods, sites)``: the modification names and their sites, each
        joined by ``';'`` in input order. C-terminal modifications (name
        ending in ``'C-term]'``, or in ``'C-term'`` followed by exactly two
        more characters such as ``'K]'``) get the site ``'-1'`` instead of
        the reported position. ``("", "")`` is returned for an empty input.

    Raises
    ------
    ValueError
        If an item contains no ``','`` separating site and name.
    """
    pfind_mod_str = pfind_mod_str.strip(';')
    if not pfind_mod_str:
        return "", ""

    mods, sites = [], []
    for item in pfind_mod_str.split(';'):
        # Split on the first comma only: the site never contains a comma,
        # so anything after it belongs to the modification name. (A larger
        # maxsplit could produce more than two fields and break unpacking.)
        site, mod = item.split(',', 1)
        if mod.endswith('C-term]') or mod[:-2].endswith('C-term'):
            site = '-1'
        mods.append(mod)
        sites.append(site)
    return ';'.join(mods), ';'.join(sites)

def parse_pfind_protein(protein, keep_reverse=True):
    """Convert a '/'-separated pFind protein string into a ';'-separated one.

    Parameters
    ----------
    protein : str
        Protein entries separated by ``'/'``; leading and trailing ``'/'``
        are ignored.
    keep_reverse : bool, optional
        When falsy, entries starting with ``'REV_'`` are removed
        (presumably reversed/decoy entries -- confirm against pFind output).
        Defaults to True.

    Returns
    -------
    str
        The retained entries joined by ``';'``.
    """
    entries = protein.strip('/').split('/')
    if keep_reverse:
        # Nothing to filter: every entry is kept.
        return ';'.join(entries)

    retained = []
    for entry in entries:
        if entry.startswith('REV_'):
            continue
        retained.append(entry)
    return ';'.join(retained)


In [5]:
#| hide
# parse_pfind_protein: '/'-separated entries become ';'-separated, and
# 'REV_' entries are dropped only when keep_reverse=False.
assert parse_pfind_protein('A/REV_B/C/', keep_reverse=True) == 'A;REV_B;C'
assert parse_pfind_protein('A/REV_B/C/', keep_reverse=False) == 'A;C'
# get_pFind_mods returns (mod names, sites); C-terminal modifications get
# the site '-1' regardless of the position reported in the input.
assert get_pFind_mods('1,Oxidation[M]') == ('Oxidation[M]','1')
assert get_pFind_mods('1,Oxidation[ProteinC-term]') == ('Oxidation[ProteinC-term]','-1')
assert get_pFind_mods('1,A[N];9,B[ProteinC-term]') == ('A[N];B[ProteinC-term]','1;-1')
assert get_pFind_mods('1,A[N];9,B[AnyC-termB]') == ('A[N];B[AnyC-termB]','1;-1')

In [6]:
#| export
class pFindReader(PSMReaderBase):
    """PSM reader for tab-separated pFind result tables.

    Column names come from ``psm_reader_yaml['pfind']['column_mapping']``.
    No modification mapping is used: modifications are parsed from the
    ``Modification`` column with ``get_pFind_mods`` and translated with
    ``translate_pFind_mod``.
    """
    def __init__(self,
        *,
        column_mapping:dict = None,
        modification_mapping:dict = None,
        fdr = 0.01,
        keep_decoy = False,
    ):
        """Forward all (keyword-only) arguments unchanged to ``PSMReaderBase``."""
        base_kwargs = dict(
            column_mapping=column_mapping,
            modification_mapping=modification_mapping,
            fdr=fdr,
            keep_decoy=keep_decoy,
        )
        super().__init__(**base_kwargs)

    def _init_column_mapping(self):
        """Use the column mapping stored under the 'pfind' key of ``psm_reader_yaml``."""
        pfind_settings = psm_reader_yaml['pfind']
        self.column_mapping = pfind_settings['column_mapping']

    def _init_modification_mapping(self):
        """Set an empty modification mapping."""
        self.modification_mapping = dict()

    def _translate_modifications(self):
        """Do nothing; modifications are translated in ``_load_modifications``."""
        return None

    def _load_file(self, filename):
        """Read a pFind table and derive the ``raw_name`` and ``Proteins`` columns.

        Parameters
        ----------
        filename : str or file-like
            Passed to ``pd.read_csv`` with ``sep='\\t'`` and ``index_col=False``.

        Returns
        -------
        pandas.DataFrame
            Rows with a non-empty ``Sequence``; missing values are replaced
            by ``''``, ``raw_name`` is the part of ``File_Name`` before the
            first ``'.'``, and ``Proteins`` is rewritten by
            ``parse_pfind_protein``.
        """
        pfind_df = pd.read_csv(filename, sep='\t', index_col=False)
        pfind_df.fillna('', inplace=True)
        # The filtered frame is bound to the same name on purpose.
        pfind_df = pfind_df[pfind_df.Sequence != '']

        file_name_parts = pfind_df['File_Name'].str.split('.')
        pfind_df['raw_name'] = file_name_parts.apply(lambda parts: parts[0])

        proteins = pfind_df['Proteins']
        pfind_df['Proteins'] = proteins.apply(parse_pfind_protein)
        return pfind_df

    def _translate_decoy(self, origin_df=None):
        """Recode ``decoy`` as int8: 1 where the value equals 'decoy', else 0."""
        is_decoy = self._psm_df.decoy == 'decoy'
        self._psm_df.decoy = is_decoy.astype(np.int8)

    def _translate_score(self, origin_df=None):
        """Replace ``score`` by ``-ln(score + 1e-100)``."""
        raw_score = self._psm_df.score.astype(float)
        # The tiny offset keeps the logarithm finite for a score of 0.
        self._psm_df.score = -np.log(raw_score + 1e-100)

    def _load_modifications(self, pfind_df):
        """Fill the ``mods`` and ``mod_sites`` columns of ``self._psm_df``.

        Parameters
        ----------
        pfind_df : pandas.DataFrame
            Loaded pFind table; its ``Modification`` column is parsed row by
            row with ``get_pFind_mods``.
        """
        parsed = pfind_df['Modification'].apply(get_pFind_mods)
        # Tuples (not Series) are assigned, so values are placed by position.
        mod_names, mod_sites = zip(*parsed)
        self._psm_df['mods'] = mod_names
        self._psm_df['mod_sites'] = mod_sites

        pfind_mods = self._psm_df['mods']
        self._psm_df['mods'] = pfind_mods.apply(translate_pFind_mod)
        
# Register pFindReader with the reader provider under the key 'pfind'.
psm_reader_provider.register_reader('pfind', pFindReader)

### Column and modification mapping from alphabase to pFind

In [7]:
# Display the column mapping stored for pFind in psm_reader_yaml.
psm_reader_yaml['pfind']['column_mapping']

{'sequence': 'Sequence',
 'charge': 'Charge',
 'rt': 'RT',
 'raw_name': 'raw_name',
 'query_id': 'File_Name',
 'scan_num': 'Scan_No',
 'score': 'Final_Score',
 'proteins': 'Proteins',
 'uniprot_ids': 'Proteins',
 'fdr': 'Q-value',
 'decoy': ['Target/Decoy', 'Targe/Decoy']}

There is no modification mapping because pFind also uses Unimod names for all modifications; we only need to convert the sites (AAs) accordingly.

### Testing

In [8]:
#| hide
from io import StringIO

In [9]:
#| hide
txt = StringIO("""File_Name	Scan_No	Exp.MH+	Charge	Q-value	Sequence	Calc.MH+	Mass_Shift(Exp.-Calc.)	Raw_Score	Final_Score	Modification	Specificity	Proteins	Positions	Label	Target/Decoy	Miss.Clv.Sites	Avg.Frag.Mass.Shift	Others
Ecoli-1to1to1-un-C13-N15-10mM-20150823.30507.30507.2.0.dta	30507	2074.030369	2	0	AMIEAGAAAVHFEDQLASVK	2074.027271	0.003098	35.299588	5.15726e-013	2,Oxidation[M];	3	gi|16131841|ref|NP_418439.1|/	173,K,K/	1|0|	target	0	0.948977	131070	0	0	0	262143	0	0	0	32
Ecoli-1to1to1-un-C13-N15-10mM-20150823.21592.21592.2.0.dta	21592	1901.970273	2	0	VVIVGCGAQGLNQGLNMR	1901.968323	0.001950	32.774898	3.37982e-012	6,Carbamidomethyl[C];17,Oxidation[M];	3	gi|16131632|ref|NP_418222.1|/	39,K,D/	1|0|0|	target	0	-0.762750	2046	0	0	0	131070	0	0	0	32
Ecoli-1to1to1-un-C13-N15-1000mM-20150823.48516.48516.2.0.dta	48516	2478.354259	2	0	TEIIAETGAGQHGVASALASALLGLK	2478.356108	-0.001849	35.699390	3.98494e-012		3	gi|16129222|ref|NP_415777.1|/	103,K,C/	1|	target	0	1.615154	7340030	0	0	0	16777214	0	0	0	4
Ecoli-1to1to1-un-C13-N15-1000mM-20150823.41206.41206.2.0.dta	41206	2863.379214	2	0	VHVSISNEGADTYLFGPGIDDSVDLSR	2863.374327	0.004887	33.596154	4.45008e-012		3	gi|16129141|ref|NP_415696.1|/	61,K,Y/	1|	target	0	0.476076	1896446	0	0	0	33554431	0	0	0	0
Ecoli-1to1to1-un-C13-N15-10mM-20150823.40178.40178.2.0.dta	40178	3005.455608	2	0	GMNTAVGDEGGYAPNLGSNAEALAVIAEAVK	3005.451916	0.003692	27.724815	9.86133e-012	2,Oxidation[M];	3	gi|16130686|ref|NP_417259.1|/	200,K,A/	1|0|	target	0	1.555942	40956	0	0	0	536870896	0	0	0	36
Ecoli-1to1to1-un-C13-N15-60mM-20150823.14730.14730.2.0.dta	14730	2262.060813	2	0	APGSSSSGANGDGSLAQSQTGAVVR	2262.059164	0.001649	29.205909	6.42718e-011	10,Deamidated[N];	3	CON_LYSC_LYSEN/	81,R,A/	1|0|	target	0	0.369568	327678	0	0	0	16777215	0	0	0	32
Ecoli-1to1to1-un-C13-N15-10mM-20150823.37020.37020.2.0.dta	37020	1825.014946	2	0	QVTIAQLEDVKPLLMK	1825.013859	0.001087	26.354755	9.42957e-010	0,Gln->pyro-Glu[AnyN-termQ];15,Oxidation[M];	3	gi|90111346|ref|NP_416371.4|/	100,K,S/	1|0|0|	target	1	-1.823228	16894	0	0	0	32766	0	0	0	32
Ecoli-1to1to1-un-C13-N15-150mM-20150823.41501.41501.3.0.dta	41501	2712.197421	3	0	EGDNYVVLSDILGDEDHLGDMDFK	2712.198013	-0.000592	27.073978	9.82619e-010	21,Unknown[M];	3	gi|145698316|ref|NP_417633.4|/	470,K,V/	1|0|	target	0	0.814438	65596	0	0	0	4194288	0	0	0	36
XXX.25802.25802.4.0.dta	25802	2388.339186	4	0.0032066	SVFLIKGDKVWVYPPEKKEK	2388.332468	0.006718	17.822784	0.100787	21,Didehydro[AnyC-termK];	0	sp|P02790|HEMO_HUMAN/	106,N,G/	1|0|	target	0	0.704714	36
""")
# Import the in-memory pFind table through the reader registered as 'pfind'
# and display the resulting PSM dataframe.
psm_df = psm_reader_provider.get_reader('pfind').import_file(txt)
psm_df

  return func(*args, **kwargs)


Unnamed: 0,sequence,charge,raw_name,query_id,scan_num,score,proteins,uniprot_ids,fdr,decoy,spec_idx,mods,mod_sites,nAA,precursor_mz
0,QVTIAQLEDVKPLLMK,2,Ecoli-1to1to1-un-C13-N15-10mM-20150823,Ecoli-1to1to1-un-C13-N15-10mM-20150823.37020.3...,37020,20.782,gi|90111346|ref|NP_416371.4|,gi|90111346|ref|NP_416371.4|,0.0,0,37019,Gln->pyro-Glu@Q^Any N-term;Oxidation@M,0;15,16,913.010613
1,VVIVGCGAQGLNQGLNMR,2,Ecoli-1to1to1-un-C13-N15-10mM-20150823,Ecoli-1to1to1-un-C13-N15-10mM-20150823.21592.2...,21592,26.413199,gi|16131632|ref|NP_418222.1|,gi|16131632|ref|NP_418222.1|,0.0,0,21591,Carbamidomethyl@C;Oxidation@M,6;17,18,951.487845
2,AMIEAGAAAVHFEDQLASVK,2,Ecoli-1to1to1-un-C13-N15-10mM-20150823,Ecoli-1to1to1-un-C13-N15-10mM-20150823.30507.3...,30507,28.293201,gi|16131841|ref|NP_418439.1|,gi|16131841|ref|NP_418439.1|,0.0,0,30506,Oxidation@M,2,20,1037.517322
3,SVFLIKGDKVWVYPPEKKEK,4,XXX,XXX.25802.25802.4.0.dta,25802,2.294746,sp|P02790|HEMO_HUMAN,sp|P02790|HEMO_HUMAN,0.003207,0,25801,Didehydro@K^Any C-term,-1,20,597.838602
4,APGSSSSGANGDGSLAQSQTGAVVR,2,Ecoli-1to1to1-un-C13-N15-60mM-20150823,Ecoli-1to1to1-un-C13-N15-60mM-20150823.14730.1...,14730,23.4679,CON_LYSC_LYSEN,CON_LYSC_LYSEN,0.0,0,14729,Deamidated@N,10,25,1131.533274
5,TEIIAETGAGQHGVASALASALLGLK,2,Ecoli-1to1to1-un-C13-N15-1000mM-20150823,Ecoli-1to1to1-un-C13-N15-1000mM-20150823.48516...,48516,26.248499,gi|16129222|ref|NP_415777.1|,gi|16129222|ref|NP_415777.1|,0.0,0,48515,,,26,1239.681753
6,VHVSISNEGADTYLFGPGIDDSVDLSR,2,Ecoli-1to1to1-un-C13-N15-1000mM-20150823,Ecoli-1to1to1-un-C13-N15-1000mM-20150823.41206...,41206,26.138099,gi|16129141|ref|NP_415696.1|,gi|16129141|ref|NP_415696.1|,0.0,0,41205,,,27,1432.190867
7,GMNTAVGDEGGYAPNLGSNAEALAVIAEAVK,2,Ecoli-1to1to1-un-C13-N15-10mM-20150823,Ecoli-1to1to1-un-C13-N15-10mM-20150823.40178.4...,40178,25.3424,gi|16130686|ref|NP_417259.1|,gi|16130686|ref|NP_417259.1|,0.0,0,40177,Oxidation@M,2,31,1503.229666


In [10]:
#| hide
# Row positions refer to the imported psm_df shown above.
# Row 3 carries an AnyC-term modification, so its site must be '-1'.
assert psm_df.mod_sites.values[3] == '-1'
assert psm_df.mods.values[4] == 'Deamidated@N'
# Row 0 combines an N-terminal modification (site 0) with Oxidation@M.
assert psm_df.mods.values[0] == 'Gln->pyro-Glu@Q^Any N-term;Oxidation@M'
assert psm_df.mod_sites.values[4] == '10'
assert psm_df.mod_sites.values[0] == '0;15'