In [None]:
#default_exp io.psm_reader.alphapept_reader

In [None]:
#export
import numba
import os
import pandas as pd
import h5py

from alphabase.io.psm_reader.psm_reader import (
    PSMReaderBase, psm_reader_provider
)

@numba.njit
def parse_ap(precursor):
    """
    Parse an AlphaPept precursor string into sequence, modifications and charge.

    The precursor is split on '_': the first item is the modified sequence
    and the last item is the charge. A precursor with exactly three
    '_'-separated items is flagged as a decoy.

    Inside the modified sequence, uppercase characters are residues and any
    run of non-uppercase characters directly preceding a residue forms,
    together with that residue, one modification token (e.g. ``oxM``).
    A leading ``a``, or a leading run starting with ``tmt`` (up to the first
    uppercase character), is reported as a modification at site ``0``.

    Parameters
    ----------
    precursor : str
        Precursor string, e.g. ``'SEEoxMQTVQ_3'``.

    Returns
    -------
    tuple
        ``(sequence, mods, mod_sites, charge, decoy)``:
        ``sequence`` is the residues only; ``mods`` and ``mod_sites`` are
        ';'-joined strings of equal item count (sites are 1-based residue
        positions, ``0`` for the leading prefix modification); ``charge`` is
        the last '_' item, returned unchanged as a string; ``decoy`` is 1 or 0.
    """
    fields = precursor.split('_')
    decoy = 1 if len(fields) == 3 else 0
    modseq = fields[0]
    charge = fields[-1]

    residues = []
    mods = []
    sites = []

    # `startswith` instead of `modseq[0]` so an empty sequence does not raise.
    if modseq.startswith('a'):
        sites.append('0')
        mods.append('a')
        modseq = modseq[1:]
    elif modseq.startswith('tmt'):
        # Find the end of the prefix token: the first uppercase character, or
        # the end of the string when no residue follows. The previous
        # for/break scan left the index unbound for a bare 'tmt' and stopped
        # one character short when there was no uppercase character at all.
        end = 3
        while end < len(modseq) and not modseq[end].isupper():
            end += 1
        sites.append('0')
        mods.append(modseq[:end])
        modseq = modseq[end:]

    token = ""
    for char in modseq:
        token += char
        if char.isupper():
            residues.append(char)
            # More than the residue itself was accumulated -> modified residue.
            if len(token) > 1:
                sites.append(str(len(residues)))
                mods.append(token)
            token = ""

    return ''.join(residues), ';'.join(mods), ';'.join(sites), charge, decoy

class AlphaPeptReader(PSMReaderBase):
    """
    PSM reader for AlphaPept HDF result files.

    Reads the ``peptide_fdr`` group (see ``self.hdf_dataset``) of an HDF file
    into a DataFrame and derives sequence, modifications, modification sites
    and decoy flag from the ``precursor`` column via ``parse_ap``.
    """
    def __init__(self, modification_mapping:dict=None):
        """
        Parameters
        ----------
        modification_mapping : dict, optional
            Forwarded to ``PSMReaderBase``. If ``self.modification_mapping``
            is still ``None`` after the base initialisation, a default
            mapping from AlphaPept tokens (e.g. ``'oxM'``) to names such as
            ``'Oxidation@M'`` is installed.
        """
        super().__init__(modification_mapping)

        if self.modification_mapping is None:
            self.modification_mapping = {
                'cC': 'Carbamidomethyl@C',
                'oxM': 'Oxidation@M',
                'pS': 'Phospho@S',
                'pT': 'Phospho@T',
                'pY': 'Phospho@Y',
                'a': 'Acetyl@Protein N-term',
            }

        # Target column name -> column name(s) in the frame built by
        # `_load_file`. NOTE(review): the list for 'spec_idx' is presumably a
        # priority list resolved by the base class -- confirm there.
        self.column_mapping = {
            'sequence': 'naked_sequence',
            'rt':'rt',
            'rt_norm': 'rt_norm',
            'spec_idx': ['scan_no','raw_idx'],
            'mobility': 'mobility',
            'score': 'score',
            'charge': 'charge',
            'raw_name': 'raw_name',
            'fdr': 'q_value',
        }

        # Name of the HDF group that holds the PSM table.
        self.hdf_dataset = 'peptide_fdr'

    def _load_file(self, filename):
        """
        Load the ``self.hdf_dataset`` group of `filename` into a DataFrame.

        Adds ``raw_name`` (file basename without the ``.ms_data.hdf``
        suffix) and ``rt_norm`` (``rt`` divided by the maximum ``rt``),
        decodes ``precursor`` from UTF-8 bytes and casts ``charge`` (and
        ``scan_no`` when present) to integers.

        Returns
        -------
        pd.DataFrame
        """
        hdf_suffix = '.ms_data.hdf'
        raw_name = os.path.basename(filename)
        if raw_name.endswith(hdf_suffix):
            raw_name = raw_name[:-len(hdf_suffix)]
        else:
            # Previously the last len(suffix) characters were removed
            # unconditionally, which mangled (or emptied) other file names.
            raw_name = os.path.splitext(raw_name)[0]

        with h5py.File(filename, 'r') as _hdf:
            dataset = _hdf[self.hdf_dataset]
            df = pd.DataFrame({col:dataset[col] for col in dataset.keys()})
            df['raw_name'] = raw_name
            df['precursor'] = df['precursor'].str.decode('utf-8')
            if 'scan_no' in df.columns:
                df['scan_no'] = df['scan_no'].astype('int')
            df['charge'] = df['charge'].astype(int)
            # Scaled by the maximum only; rt is not shifted by its minimum.
            df['rt_norm'] = df.rt/df.rt.max()

        return df

    def _load_modifications(self, df: pd.DataFrame):
        """
        Fill ``sequence``, ``mods``, ``mod_sites`` and ``decoy`` of
        ``self._psm_df`` by parsing ``df['precursor']`` with ``parse_ap``.

        The charge parsed from the precursor is a string; it is only used
        (converted to int) when ``self._psm_df`` has no ``charge`` column
        yet, so an existing integer charge column is not replaced by strings.
        """
        if len(df) == 0:
            # zip(*<empty>) yields nothing, so the tuple unpacking below
            # would raise ValueError; create the (empty) columns directly.
            # NOTE(review): assumes self._psm_df is empty as well here.
            self._psm_df['sequence'] = ''
            self._psm_df['mods'] = ''
            self._psm_df['mod_sites'] = ''
            self._psm_df['decoy'] = 0
            return

        (
            self._psm_df['sequence'], self._psm_df['mods'],
            self._psm_df['mod_sites'], parsed_charges,
            self._psm_df['decoy']
        ) = zip(*df['precursor'].apply(parse_ap))

        if 'charge' not in self._psm_df.columns:
            self._psm_df['charge'] = [int(c) for c in parsed_charges]
    
# Register AlphaPeptReader with the shared provider under the key 'alphapept'.
psm_reader_provider.register_reader('alphapept', AlphaPeptReader)

In [None]:
#hide
# Manual smoke test: load one AlphaPept result file through the registered
# reader and display the resulting PSM table.
import os
# NOTE(review): hard-coded, user-specific path -- this cell presumably fails on
# machines where the file does not exist; consider a configurable data dir.
filename = os.path.expanduser('~/Workspace/Data/PXD006109/20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgun_170519010518.ms_data.hdf')
ap_reader = psm_reader_provider.get_reader('alphapept')
ap_reader.load(filename)
# Last expression of the cell: rendered as the cell output.
ap_reader.psm_df

Unnamed: 0,sequence,rt,rt_norm,spec_idx,mobility,score,charge,raw_name,fdr,nAA,mods,mod_sites,decoy
0,AVVPSK,12.584717,0.125861,7698,,0.696928,2,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,0.005366,6,,,0
1,ALGALR,22.885657,0.228881,17422,,0.706138,2,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,0.005031,6,,,0
2,LLAAGR,12.857236,0.128586,7600,,0.724405,2,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,0.004557,6,,,0
3,VALVAK,22.165872,0.221682,16148,,0.573965,2,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,0.009438,6,,,0
4,VAIIGK,27.361039,0.273640,20821,,0.632588,2,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,0.007301,6,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
38069,ENENGEEEEEEAEFGEEDLFHQQGDPR,61.068598,0.610751,54356,,0.980562,3,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,0.000425,27,,,0
38070,NMITQYWPDRETAPGDISPYTIPEEDR,81.592126,0.816009,73901,,0.947413,3,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,0.001142,27,,,0
38071,SEEMQTVQQEQLLQETQALQQSFLSEK,93.411330,0.934213,85491,,0.991908,3,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,0.000209,27,Oxidation@M,4,0
38072,VSILDENIAHDDKPGLYFHEEYVDMCR,72.860629,0.728684,65854,,0.954337,5,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,0.000959,27,Carbamidomethyl@C,26,0
