In [1]:
#default_exp reader.alphapept_reader

In [2]:
#export
import numba
import os
import pandas as pd
import h5py

from alphadeep.reader.psm_reader import PSMReader_w_FragBase, psm_reader_provider

@numba.njit
def parse_ap(precursor):
    """
    Parse an AlphaPept-style precursor string into its components.

    The precursor is split on ``'_'``: the first field is the modified
    sequence, the last field is the charge, and a precursor with exactly
    three fields is flagged as a decoy. Inside the modified sequence,
    uppercase letters are residues and any lowercase/digit run directly
    in front of a residue is a modification attached to that residue.
    A leading ``'a'`` or a leading ``'tmt...'`` label is reported as a
    modification at site ``'0'``.

    This function is compiled with ``numba.njit`` in this file, so the body
    only uses plain string/list operations.

    Parameters
    ----------
    precursor : str
        Precursor string, e.g. ``'SEEoxMQTVK_3'``.

    Returns
    -------
    tuple
        ``(sequence, mods, mod_sites, charge, decoy)`` where ``sequence`` is
        the residue-only sequence, ``mods`` and ``mod_sites`` are
        ``';'``-joined strings (sites are 1-based residue positions, ``'0'``
        for a leading label), ``charge`` is the last field as a *string*, and
        ``decoy`` is ``1`` or ``0``.
    """
    items = precursor.split('_')
    decoy = 1 if len(items) == 3 else 0
    modseq = items[0]
    charge = items[-1]

    parsed = []
    mods = []
    sites = []

    # startswith() instead of modseq[0] so an empty sequence field does not
    # raise an index error.
    if modseq.startswith('a'):
        sites.append('0')
        mods.append('a')
        modseq = modseq[1:]
    elif modseq.startswith('tmt'):
        # The label runs up to (not including) the first uppercase residue.
        # Default to the whole string so the index is always defined, even
        # when no residue follows the label.
        label_end = len(modseq)
        for pos in range(3, len(modseq)):
            if modseq[pos].isupper():
                label_end = pos
                break
        sites.append('0')
        mods.append(modseq[:label_end])
        modseq = modseq[label_end:]

    # Accumulate characters until a residue (uppercase) closes the token;
    # a token longer than one character carries a modification.
    pending = ""
    for char in modseq:
        pending += char
        if char.isupper():
            parsed.append(char)
            if len(pending) > 1:
                sites.append(str(len(parsed)))
                mods.append(pending)
            pending = ""

    return ''.join(parsed), ';'.join(mods), ';'.join(sites), charge, decoy

class AlphaPeptReader(PSMReader_w_FragBase):
    """
    PSM reader for AlphaPept HDF result files.

    Reads the group named by ``self.hdf_dataset`` (``'peptide_fdr'`` by
    default) from the HDF file and derives sequence, modifications,
    modification sites, charge and decoy flag from the ``precursor`` column
    via ``parse_ap``.
    """
    def __init__(self):
        super().__init__()

        # Modification tokens as they appear in the precursor string,
        # mapped to 'Name@site' identifiers.
        # NOTE(review): `modification_convert_dict` is presumably created by
        # the base class constructor -- confirm.
        self.modification_convert_dict['cC'] = 'Carbamidomethyl@C'
        self.modification_convert_dict['oxM'] = 'Oxidation@M'
        self.modification_convert_dict['pS'] = 'Phospho@S'
        self.modification_convert_dict['pT'] = 'Phospho@T'
        self.modification_convert_dict['pY'] = 'Phospho@Y'
        self.modification_convert_dict['a'] = 'Acetyl@Protein N-term'

        # NOTE(review): keys look like the output column names and values the
        # column names in the loaded table -- confirm against the base class.
        self.column_mapping = {
            'sequence': 'naked_sequence',
            'RT':'rt',
            'scan_no': 'scan_no',
            'scan_idx': 'raw_idx', #idx in ms2 list
            'mobility': 'mobility',
            'score': 'score',
            'charge': 'charge',
            'raw_name': 'raw_name',
        }

        # Name of the HDF group that is read by `_load_file`.
        self.hdf_dataset = 'peptide_fdr'

    def _load_file(self, filename):
        """
        Load the ``self.hdf_dataset`` group of `filename` into a DataFrame.

        Every key of the group becomes a column. ``raw_name`` is set to the
        file's base name with a trailing ``'.ms_data.hdf'`` removed,
        ``precursor`` is decoded from bytes to ``str``, and ``scan_no`` (if
        present) and ``charge`` are cast to integers.

        Parameters
        ----------
        filename : str
            Path to the HDF file.

        Returns
        -------
        pd.DataFrame
            The loaded table.
        """
        with h5py.File(filename, 'r') as _hdf:
            dataset = _hdf[self.hdf_dataset]
            df = pd.DataFrame({col:dataset[col] for col in dataset.keys()})

            # Only strip the suffix when it is really there; blindly cutting
            # len(suffix) characters would mangle differently named files.
            raw_name = os.path.basename(filename)
            suffix = '.ms_data.hdf'
            if raw_name.endswith(suffix):
                raw_name = raw_name[:-len(suffix)]
            df['raw_name'] = raw_name

            df['precursor'] = df['precursor'].str.decode('utf-8')
            if 'scan_no' in df.columns:
                df['scan_no'] = df['scan_no'].astype('int')
            df['charge'] = df['charge'].astype(int)
        return df
    
    def _translate_columns(self, df: pd.DataFrame):
        """
        Translate columns via the base class, then fill ``sequence``,
        ``mods``, ``mod_sites``, ``charge`` and ``decoy`` of ``self._psm_df``
        from the parsed ``precursor`` column of `df`.

        Parameters
        ----------
        df : pd.DataFrame
            Table as returned by `_load_file`; must contain ``precursor``.
        """
        super()._translate_columns(df)

        parsed = df['precursor'].apply(parse_ap)
        if len(parsed) > 0:
            sequences, mods, mod_sites, charges, decoys = zip(*parsed)
        else:
            # zip(*[]) yields nothing, so unpacking would fail on an empty table.
            sequences, mods, mod_sites, charges, decoys = (), (), (), (), ()

        self._psm_df['sequence'] = sequences
        self._psm_df['mods'] = mods
        self._psm_df['mod_sites'] = mod_sites
        # parse_ap returns the charge as text; store it as an integer so the
        # column is numeric rather than an object column of strings.
        self._psm_df['charge'] = [int(charge) for charge in charges]
        self._psm_df['decoy'] = decoys
    
# Register the reader class with the provider under the name 'alphapept'
# (the same name is passed to `psm_reader_provider.get_reader` to obtain it).
psm_reader_provider.register_reader('alphapept', AlphaPeptReader)

In [3]:
#hide
# Manual smoke test: obtain the registered 'alphapept' reader, load a local
# AlphaPept result file and display the resulting PSM table.
# NOTE(review): the path below is machine-specific; this cell presumably
# fails wherever that file is absent -- confirm before running in CI.
import os
filename = os.path.expanduser('~/Workspace/Data/PXD006109/20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgun_170519010518.ms_data.hdf')
ap_reader = psm_reader_provider.get_reader('alphapept')
ap_reader.load(filename)
# Last expression of the cell: rendered as the cell output.
ap_reader.psm_df

Unnamed: 0,sequence,RT,scan_no,scan_idx,mobility,score,charge,raw_name,nAA,mods,mod_sites,decoy
0,AVVPSK,12.584717,7698,5345,,0.696928,2,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,6,,,0
1,ALGALR,22.885657,17422,13616,,0.706138,2,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,6,,,0
2,LLAAGR,12.857236,7600,5268,,0.724405,2,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,6,,,0
3,VALVAK,22.165872,16148,12539,,0.573965,2,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,6,,,0
4,VAIIGK,27.361039,20821,16590,,0.632588,2,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,6,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
38069,ENENGEEEEEEAEFGEEDLFHQQGDPR,61.068598,54356,47248,,0.980562,3,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,27,,,0
38070,NMITQYWPDRETAPGDISPYTIPEEDR,81.592126,73901,65364,,0.947413,3,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,27,,,0
38071,SEEMQTVQQEQLLQETQALQQSFLSEK,93.411330,85491,76083,,0.991908,3,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,27,Oxidation@M,4,0
38072,VSILDENIAHDDKPGLYFHEEYVDMCR,72.860629,65854,57894,,0.954337,5,20170518_QEp1_FlMe_SA_BOX0_HeLa2_Ecoli1_Shotgu...,27,Carbamidomethyl@C,26,0
