In [None]:
#| default_exp psm_reader.msfragger_reader

# MSFragger Reader

In [None]:
#| export

import numpy as np
import pandas as pd

from alphabase.psm_reader.psm_reader import (
    PSMReaderBase, psm_reader_yaml,
    psm_reader_provider
)
from alphabase.psm_reader.maxquant_reader import MaxQuantReader
from alphabase.constants.aa import AA_ASCII_MASS
from alphabase.constants.modification import MOD_INFO_DICT as mod_info

#| export
try:
    # pyteomics is optional: only the pepXML reader defined below uses it.
    import pyteomics.pepxml as pepxml
except ImportError:
    # Sentinel checked further down to decide which MSFraggerPepXML variant
    # to define. Only import failures are tolerated here; anything else
    # (including KeyboardInterrupt/SystemExit) propagates.
    pepxml = None


In [None]:
#| export

def _is_fragger_decoy(proteins):
    """Tell whether every entry of `proteins` starts with the ``rev_`` prefix.

    Returns True only if no entry lacks the prefix; an empty iterable
    therefore yields True as well.
    """
    return all(name.startswith('rev_') for name in proteins)

# Modification names that `_get_msf_mods` tries to match by mass; each name
# is looked up in `mod_info`. Read from the 'msfragger_pepxml' reader settings.
mass_mapped_mods = psm_reader_yaml['msfragger_pepxml']['mass_mapped_mods']
# Absolute tolerance `_get_msf_mods` applies when comparing an observed mass
# shift with the mass of a candidate modification.
mod_mass_tol = psm_reader_yaml['msfragger_pepxml']['mod_mass_tol']

# Mass of the unmodified terminal group that pepXML folds into a terminal
# modification mass (e.g. MSFragger writes mass="43.0184" for a 42.0106
# N-terminal shift, i.e. shift + hydrogen).
_MSF_NTERM_GROUP_MASS = 1.00782503
# NOTE(review): hydroxyl, following the usual pepXML convention -- confirm
# against real MSFragger output that contains C-terminal modifications.
_MSF_CTERM_GROUP_MASS = 17.00273963


def _match_msf_terminal_mod(mod_mass, terminus):
    """Map a terminal mass shift to a modification name.

    Parameters
    ----------
    mod_mass : float
        Mass shift with the terminal group already removed.
    terminus : str
        Either ``'N-term'`` or ``'C-term'``.

    Returns
    -------
    str or None
        The matching name, or None when no entry of `mass_mapped_mods`
        fits within `mod_mass_tol`.
    """
    for mod_name in mass_mapped_mods:
        if abs(mod_mass - mod_info[mod_name]['mass']) >= mod_mass_tol:
            continue
        # A terminal entry that is not bound to a residue can be used as is;
        # residue-bound ones (e.g. '...@E^Any N-term') describe a residue
        # modification and are not valid for a bare terminus.
        if mod_name.endswith(terminus) and '^' not in mod_name:
            return mod_name
        # presumably residue-free terminal names look like 'X@Any N-term';
        # the lookup below simply fails if that key does not exist.
        candidate = mod_name.split('@')[0] + '@Any ' + terminus
        if candidate in mod_info:
            return candidate
    return None


def _get_msf_mods(sequence, msf_aa_mods):
    """Translate pepXML modification masses of one peptide into named mods.

    Parameters
    ----------
    sequence : str
        Peptide sequence.
    msf_aa_mods : iterable of str
        Entries of the form ``'<mass>@<position>'``. Positions ``1..len``
        address residues (the mass then includes the residue mass).
        Position ``0`` is treated as the peptide N-terminus and ``len+1``
        as the C-terminus -- presumably how pyteomics reports
        ``mod_nterm_mass``/``mod_cterm_mass``; verify against its docs.

    Returns
    -------
    tuple of str
        ``(mods, mod_sites, mod_deltas, mod_delta_sites)``, each a
        ';'-joined string. Shifts that match an entry of
        `mass_mapped_mods` go to the first pair, everything else is
        reported as a raw mass delta in the second pair.

    Raises
    ------
    ValueError
        If a position lies outside ``0..len(sequence)+1``.
    """
    mods = []
    mod_sites = []
    mod_deltas = []
    mod_delta_sites = []
    n_residues = len(sequence)
    for mod in msf_aa_mods:
        mass_str, site_str = mod.split('@')
        mass = float(mass_str)
        position = int(site_str)

        if position == 0 or position == n_residues + 1:
            # Terminal entry: there is no residue to subtract. Previously
            # position 0 wrapped around to the last residue and len+1
            # raised IndexError.
            if position == 0:
                terminus, out_site = 'N-term', '0'
                mod_mass = mass - _MSF_NTERM_GROUP_MASS
            else:
                # NOTE(review): '-1' is assumed to be the C-terminal site
                # label expected downstream -- confirm.
                terminus, out_site = 'C-term', '-1'
                mod_mass = mass - _MSF_CTERM_GROUP_MASS
            matched = _match_msf_terminal_mod(mod_mass, terminus)
            if matched is None:
                mod_deltas.append(str(mod_mass))
                mod_delta_sites.append(out_site)
            else:
                mods.append(matched)
                mod_sites.append(out_site)
            continue

        if position < 0 or position > n_residues + 1:
            raise ValueError(
                "modification position {} is outside peptide '{}'".format(
                    position, sequence
                )
            )

        # Residue entry: the reported mass is residue + shift.
        site = position - 1
        mod_mass = mass - AA_ASCII_MASS[ord(sequence[site])]

        mod_considered = False
        for mod_name in mass_mapped_mods:
            if abs(mod_mass - mod_info[mod_name]['mass']) < mod_mass_tol:
                # A shift on the first residue that matches an N-terminal
                # entry is reported as that N-terminal mod at site '0'.
                if site == 0 and mod_name.endswith('N-term'):
                    mods.append(mod_name)
                    mod_sites.append('0')
                    mod_considered = True
                    break
                # Otherwise re-target the mod name to the actual residue
                # and accept it only if that combination is known.
                _mod = mod_name.split('@')[0] + '@' + sequence[site]
                if _mod in mod_info:
                    mods.append(_mod)
                    mod_sites.append(site_str)
                    mod_considered = True
                    break
        if not mod_considered:
            mod_deltas.append(str(mod_mass))
            mod_delta_sites.append(site_str)
    return ';'.join(mods), ';'.join(mod_sites), ';'.join(mod_deltas), ';'.join(mod_delta_sites)


In [None]:
#| export

class MSFragger_PSM_TSV_Reader(PSMReaderBase):
    """Placeholder reader for MSFragger ``psm.tsv`` tables.

    Instantiation always raises ``NotImplementedError``; the base-class
    initialiser is never reached.
    """

    def __init__(
        self,
        *,
        column_mapping: dict = None,
        modification_mapping: dict = None,
        fdr=0.01,
        keep_decoy=False,
        **kwargs
    ):
        # All arguments are accepted and ignored.
        raise NotImplementedError("MSFragger_PSM_TSV_Reader for psm.tsv")

# Both the explicit 'msfragger_psm_tsv' key and the generic 'msfragger' key
# resolve to the psm.tsv reader, whose constructor currently raises
# NotImplementedError.
psm_reader_provider.register_reader('msfragger_psm_tsv', MSFragger_PSM_TSV_Reader)
psm_reader_provider.register_reader('msfragger', MSFragger_PSM_TSV_Reader)

In [None]:
#| export
if pepxml is None:
    class MSFraggerPepXML:
        """Stand-in defined when `pyteomics.pepxml` could not be imported.

        Keeps the name importable. Constructing it raises
        ``NotImplementedError`` whatever arguments are passed, and it is
        not registered with `psm_reader_provider`.
        """
        def __init__(self, *args, **kwargs):
            # Accept the real reader's arguments so callers get this error
            # instead of a TypeError about unexpected keywords.
            raise NotImplementedError(
                "MSFraggerPepXML needs `pyteomics.pepxml`, "
                "which could not be imported"
            )
else:
    class MSFraggerPepXML(PSMReaderBase):
        """PSM reader for MSFragger pepXML files, parsed through pyteomics."""

        def __init__(self, *,
            column_mapping: dict = None,
            modification_mapping: dict = None,
            keep_decoy=True,
            **kwargs
        ):
            """MSFragger is not fully supported as only the pepXML file is read.

            All arguments are forwarded unchanged to `PSMReaderBase`;
            `keep_decoy` defaults to True here.
            """
            super().__init__(
                column_mapping=column_mapping,
                modification_mapping=modification_mapping,
                keep_decoy=keep_decoy,
                **kwargs
            )

        def _init_column_mapping(self):
            """Take the column mapping from the 'msfragger_pepxml' settings."""
            self.column_mapping = psm_reader_yaml[
                'msfragger_pepxml'
            ]['column_mapping']

        def _init_modification_mapping(self):
            """No name-based mapping: modifications are resolved by mass."""
            self.modification_mapping = {}

        def _translate_modifications(self):
            """Nothing to translate; `_load_modifications` already emits final names."""
            pass

        def _load_file(self, filename):
            """Parse `filename` with pyteomics and return the raw table.

            Adds a ``raw_name`` column and converts ``retention_time_sec``
            from seconds to minutes in place.
            """
            msf_df = pepxml.DataFrame(filename)
            msf_df.fillna('', inplace=True)
            msf_df.retention_time_sec /= 60
            # `spectrum` is '<raw name>.<scan>.<scan>.<charge>'; split from
            # the right so raw names that contain dots are kept intact.
            msf_df['raw_name'] = msf_df[
                'spectrum'
            ].str.rsplit('.', n=3).str[0]
            return msf_df

        def _translate_decoy(self, origin_df=None):
            """Flag decoy PSMs, then flatten the protein lists to ';'-joined strings."""
            # Must run before the join below: the decoy test iterates the list.
            self._psm_df['decoy'] = self._psm_df.proteins.apply(
                _is_fragger_decoy
            ).astype(np.int8)

            self._psm_df.proteins = self._psm_df.proteins.apply(
                lambda x: ';'.join(x)
            )

        def _translate_score(self, origin_df=None):
            """Turn an e-value score into -log(e-value) so that larger is better."""
            if self.column_mapping['score'] == 'expect':
                # evalue score; the tiny offset avoids log(0)
                self._psm_df['score'] = -np.log(
                    self._psm_df['score']+1e-100
                )

        def _load_modifications(self, msf_df):
            """Fill the four modification columns of `self._psm_df`.

            Values are assigned positionally, row by row, from `msf_df`.
            Works for an empty table too, where unpacking ``zip(*...)``
            would fail.
            """
            parsed = [
                _get_msf_mods(sequence, aa_mods)
                for sequence, aa_mods in zip(
                    msf_df['peptide'], msf_df['modifications']
                )
            ]
            target_columns = (
                'mods', 'mod_sites', 'mod_deltas', 'mod_delta_sites'
            )
            for idx, column in enumerate(target_columns):
                self._psm_df[column] = [row[idx] for row in parsed]

    psm_reader_provider.register_reader('msfragger_pepxml', MSFraggerPepXML)


In [None]:
#| hide
import os

In [None]:
#| hide
pepxml_str = """<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="pepXML_std.xsl"?>
<msms_pipeline_analysis date="2021-03-10T13:40:59" xmlns="http://regis-web.systemsbiology.net/pepXML" summary_xml="D:\Peptidome_Tests\raw\20190627_QX0_AnBr_SA_BPP_DDA_M01_02.pepXML" xsi:schemaLocation="http://sashimi.sourceforge.net/schema_revision/pepXML/pepXML_v118.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<msms_run_summary base_name="D:\Peptidome_Tests\raw\20190627_QX0_AnBr_SA_BPP_DDA_M01_02" raw_data_type="raw" comment="This pepXML was from calibrated spectra." raw_data="raw">
<sample_enzyme name="nonspecific">
<specificity cut="-" no_cut="" sense="C"/>
</sample_enzyme>
<search_summary base_name="D:\Peptidome_Tests\raw\20190627_QX0_AnBr_SA_BPP_DDA_M01_02" precursor_mass_type="monoisotopic" search_engine="X! Tandem" search_engine_version="MSFragger-3.1.1" fragment_mass_type="monoisotopic" search_id="1">
<search_database local_path="D:\fasta\2021-03-09-decoys-contam-uniprot_human_reviewed_20210309.fasta.fas" type="AA"/>
<enzymatic_search_constraint enzyme="default" min_number_termini="0" max_num_internal_cleavages="2"/>
<aminoacid_modification aminoacid="M" massdiff="15.9949" mass="147.0354" variable="Y"/>
<terminal_modification massdiff="42.0106" protein_terminus="Y" mass="43.0184" terminus="N" variable="Y"/>
<aminoacid_modification aminoacid="Q" massdiff="-17.0265" mass="111.0321" variable="Y" peptide_terminus="n"/>
<aminoacid_modification aminoacid="E" massdiff="-18.0106" mass="111.0320" variable="Y" peptide_terminus="n"/>
<aminoacid_modification aminoacid="C" massdiff="119.0041" mass="222.0133" variable="Y"/>
<parameter name="# MSFragger.build" value="MSFragger-3.1.1"/>
<parameter name="database_name" value="D:\fasta\2021-03-09-decoys-contam-uniprot_human_reviewed_20210309.fasta.fas"/>
<parameter name="decoy_prefix" value="rev_"/>
<parameter name="num_threads" value="31"/>
<parameter name="precursor_mass_lower" value="-20.0"/>
<parameter name="precursor_mass_upper" value="20.0"/>
<parameter name="precursor_mass_units" value="1"/>
<parameter name="precursor_true_tolerance" value="15.0"/>
<parameter name="precursor_true_units" value="1"/>
<parameter name="fragment_mass_tolerance" value="7.0"/>
<parameter name="fragment_mass_units" value="1"/>
<parameter name="calibrate_mass" value="2"/>
<parameter name="write_calibrated_mgf" value="0"/>
<parameter name="isotope_error" value="0/1"/>
<parameter name="mass_offsets" value="0"/>
<parameter name="labile_search_mode" value="OFF"/>
<parameter name="restrict_deltamass_to" value="all"/>
<parameter name="precursor_mass_mode" value="SELECTED"/>
<parameter name="intensity_transform" value="1"/>
<parameter name="remove_precursor_peak" value="0"/>
<parameter name="remove_precursor_range" value="-1.500000,1.500000"/>
<parameter name="localize_delta_mass" value="0"/>
<parameter name="delta_mass_exclude_ranges" value="(-1.5,3.5)"/>
<parameter name="fragment_ion_series" value="b,y"/>
<parameter name="ion_series_definitions" value=""/>
<parameter name="search_enzyme_name" value="nonspecific"/>
<parameter name="search_enzyme_cutafter" value="-"/>
<parameter name="search_enzyme_butnotafter" value=""/>
<parameter name="num_enzyme_termini" value="0"/>
<parameter name="allowed_missed_cleavage" value="2"/>
<parameter name="clip_nTerm_M" value="1"/>
<parameter name="allow_multiple_variable_mods_on_residue" value="0"/>
<parameter name="max_variable_mods_per_peptide" value="3"/>
<parameter name="max_variable_mods_combinations" value="5000"/>
<parameter name="mass_diff_to_variable_mod" value="0"/>
<parameter name="output_file_extension" value="pepXML"/>
<parameter name="output_format" value="pepXML"/>
<parameter name="output_report_topN" value="1"/>
<parameter name="output_max_expect" value="50.0"/>
<parameter name="report_alternative_proteins" value="0"/>
<parameter name="override_charge" value="0"/>
<parameter name="precursor_charge" value="1 4"/>
<parameter name="digest_min_length" value="7"/>
<parameter name="digest_max_length" value="40"/>
<parameter name="digest_mass_range" value="600.0 5000.0"/>
<parameter name="max_fragment_charge" value="2"/>
<parameter name="deisotope" value="1"/>
<parameter name="track_zero_topN" value="0"/>
<parameter name="zero_bin_accept_expect" value="0.0"/>
<parameter name="zero_bin_mult_expect" value="1.0"/>
<parameter name="add_topN_complementary" value="0"/>
<parameter name="minimum_peaks" value="15"/>
<parameter name="use_topN_peaks" value="300"/>
<parameter name="min_fragments_modelling" value="2"/>
<parameter name="min_matched_fragments" value="5"/>
<parameter name="minimum_ratio" value="0.0"/>
<parameter name="clear_mz_range" value="0.0 0.0"/>
<parameter name="excluded_scan_list_file" value=""/>
<parameter name="variable_mod_01" value="15.994900 M 2"/>
<parameter name="variable_mod_02" value="42.010600 [^ 1"/>
<parameter name="variable_mod_04" value="-17.026500 nQ 1"/>
<parameter name="variable_mod_05" value="-18.010600 nE 1"/>
<parameter name="variable_mod_06" value="119.004100 C 1"/>
<parameter name="add_A_alanine" value="0.000000"/>
<parameter name="add_B_user_amino_acid" value="0.000000"/>
<parameter name="add_Cterm_peptide" value="0.0"/>
<parameter name="add_Cterm_protein" value="0.0"/>
<parameter name="add_D_aspartic_acid" value="0.000000"/>
<parameter name="add_E_glutamic_acid" value="0.000000"/>
<parameter name="add_F_phenylalanine" value="0.000000"/>
<parameter name="add_G_glycine" value="0.000000"/>
<parameter name="add_H_histidine" value="0.000000"/>
<parameter name="add_I_isoleucine" value="0.000000"/>
<parameter name="add_J_user_amino_acid" value="0.000000"/>
<parameter name="add_K_lysine" value="0.000000"/>
<parameter name="add_L_leucine" value="0.000000"/>
<parameter name="add_M_methionine" value="0.000000"/>
<parameter name="add_N_asparagine" value="0.000000"/>
<parameter name="add_Nterm_peptide" value="0.0"/>
<parameter name="add_Nterm_protein" value="0.0"/>
<parameter name="add_O_user_amino_acid" value="0.000000"/>
<parameter name="add_P_proline" value="0.000000"/>
<parameter name="add_Q_glutamine" value="0.000000"/>
<parameter name="add_R_arginine" value="0.000000"/>
<parameter name="add_S_serine" value="0.000000"/>
<parameter name="add_T_threonine" value="0.000000"/>
<parameter name="add_U_user_amino_acid" value="0.000000"/>
<parameter name="add_V_valine" value="0.000000"/>
<parameter name="add_W_tryptophan" value="0.000000"/>
<parameter name="add_X_user_amino_acid" value="0.000000"/>
<parameter name="add_Y_tyrosine" value="0.000000"/>
<parameter name="add_Z_user_amino_acid" value="0.000000"/>
</search_summary>
<spectrum_query start_scan="426" uncalibrated_precursor_neutral_mass="995.496" assumed_charge="3" spectrum="20190627_QX0_AnBr_SA_BPP_DDA_M01_02.426.426.3" end_scan="426" index="128" precursor_neutral_mass="995.4981" retention_time_sec="83.65495204925537">
<search_result>
<search_hit peptide="EPDSPLDKL" massdiff="1.0010986328125" calc_neutral_pep_mass="994.497" peptide_next_aa="S" num_missed_cleavages="0" num_tol_term="0" num_tot_proteins="1" tot_num_ions="32" hit_rank="1" num_matched_ions="5" protein="rev_sp|O60566|BUB1B_HUMAN Mitotic checkpoint serine/threonine-protein kinase BUB1 beta OS=Homo sapiens OX=9606 GN=BUB1B PE=1 SV=3" peptide_prev_aa="R" is_rejected="0">
<modification_info>
<mod_aminoacid_mass mass="111.0320" position="1"/>
</modification_info>
<search_score name="hyperscore" value="10.838"/>
<search_score name="nextscore" value="0.0"/>
<search_score name="expect" value="1.406036e+00"/>
</search_hit>
</search_result>
</spectrum_query>
<spectrum_query start_scan="1717" uncalibrated_precursor_neutral_mass="997.51196" assumed_charge="3" spectrum="20190627_QX0_AnBr_SA_BPP_DDA_M01_02.1717.1717.3" end_scan="1717" index="519" precursor_neutral_mass="997.51294" retention_time_sec="334.25488471984863">
<search_result>
<search_hit peptide="ALSSQHQAR" massdiff="1.00286865234375" calc_neutral_pep_mass="996.5101" peptide_next_aa="I" num_missed_cleavages="0" num_tol_term="0" num_tot_proteins="1" tot_num_ions="32" hit_rank="1" num_matched_ions="6" protein="sp|P11021|BIP_HUMAN Endoplasmic reticulum chaperone BiP OS=Homo sapiens OX=9606 GN=HSPA5 PE=1 SV=2" peptide_prev_aa="R" is_rejected="0">
<search_score name="hyperscore" value="11.083"/>
<search_score name="nextscore" value="0.0"/>
<search_score name="expect" value="1.960153e+00"/>
</search_hit>
</search_result>
</spectrum_query>
<spectrum_query start_scan="2673" uncalibrated_precursor_neutral_mass="827.417" assumed_charge="2" spectrum="20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2673.2673.2" end_scan="2673" index="832" precursor_neutral_mass="827.41724" retention_time_sec="516.3666343688965">
<search_result>
<search_hit peptide="IGEAGWVP" massdiff="-4.8828125E-4" calc_neutral_pep_mass="827.4177" peptide_next_aa="S" num_missed_cleavages="0" num_tol_term="0" num_tot_proteins="1" tot_num_ions="14" hit_rank="1" num_matched_ions="6" protein="rev_sp|Q76N32|CEP68_HUMAN Centrosomal protein of 68 kDa OS=Homo sapiens OX=9606 GN=CEP68 PE=1 SV=2" peptide_prev_aa="P" is_rejected="0">
<search_score name="hyperscore" value="11.815"/>
<search_score name="nextscore" value="10.422"/>
<search_score name="expect" value="3.031184e+00"/>
</search_hit>
</search_result>
</spectrum_query>
<spectrum_query start_scan="2674" uncalibrated_precursor_neutral_mass="831.41174" assumed_charge="2" spectrum="20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2674.2674.2" end_scan="2674" index="833" precursor_neutral_mass="831.4118" retention_time_sec="516.4018821716309">
<search_result>
<search_hit peptide="HDYKPAT" massdiff="1.01953125" calc_neutral_pep_mass="830.3923" peptide_next_aa="G" num_missed_cleavages="0" num_tol_term="0" num_tot_proteins="1" tot_num_ions="12" hit_rank="1" num_matched_ions="7" protein="rev_sp|Q71H61|ILDR2_HUMAN Immunoglobulin-like domain-containing receptor 2 OS=Homo sapiens OX=9606 GN=ILDR2 PE=2 SV=1" peptide_prev_aa="S" is_rejected="0">
<search_score name="hyperscore" value="14.072"/>
<search_score name="nextscore" value="13.346"/>
<search_score name="expect" value="6.774665e+00"/>
</search_hit>
</search_result>
</spectrum_query>
<spectrum_query start_scan="2675" uncalibrated_precursor_neutral_mass="1472.6216" assumed_charge="3" spectrum="20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2675.2675.3" end_scan="2675" index="834" precursor_neutral_mass="1472.6204" retention_time_sec="516.4582443237305">
<search_result>
<search_hit peptide="TAMEIIMCGLAW" massdiff="0.003173828125" calc_neutral_pep_mass="1472.6172" peptide_next_aa="I" num_missed_cleavages="0" num_tol_term="0" num_tot_proteins="1" tot_num_ions="44" hit_rank="1" num_matched_ions="5" protein="rev_sp|Q8IVW4|CDKL3_HUMAN Cyclin-dependent kinase-like 3 OS=Homo sapiens OX=9606 GN=CDKL3 PE=1 SV=1" peptide_prev_aa="G" is_rejected="0">
<modification_info>
<mod_aminoacid_mass mass="147.0354" position="3"/>
<mod_aminoacid_mass mass="222.0133" position="8"/>
</modification_info>
<search_score name="hyperscore" value="9.786"/>
<search_score name="nextscore" value="9.566"/>
<search_score name="expect" value="2.532610e+00"/>
</search_hit>
</search_result>
</spectrum_query>
<spectrum_query start_scan="2676" uncalibrated_precursor_neutral_mass="1588.6807" assumed_charge="4" spectrum="20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2676.2676.4" end_scan="2676" index="835" precursor_neutral_mass="1588.6788" retention_time_sec="516.5613555908203">
<search_result>
<search_hit peptide="YTETDLEESMDKI" massdiff="-0.00244140625" calc_neutral_pep_mass="1588.6813" peptide_next_aa="E" num_missed_cleavages="0" num_tol_term="0" num_tot_proteins="1" tot_num_ions="48" hit_rank="1" num_matched_ions="6" protein="sp|Q9UKF6|CPSF3_HUMAN Cleavage and polyadenylation specificity factor subunit 3 OS=Homo sapiens OX=9606 GN=CPSF3 PE=1 SV=1" peptide_prev_aa="L" is_rejected="0">
<modification_info>
<mod_aminoacid_mass mass="147.0354" position="10"/>
</modification_info>
<search_score name="hyperscore" value="11.537"/>
<search_score name="nextscore" value="11.256"/>
<search_score name="expect" value="4.523504e+00"/>
</search_hit>
</search_result>
</spectrum_query>
<spectrum_query start_scan="2677" uncalibrated_precursor_neutral_mass="1428.6484" assumed_charge="4" spectrum="20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2677.2677.4" end_scan="2677" index="836" precursor_neutral_mass="1428.6484" retention_time_sec="516.6494750976562">
<search_result>
<search_hit peptide="QEGDMDRSLHKP" massdiff="1.002197265625" calc_neutral_pep_mass="1427.6462" peptide_next_aa="G" num_missed_cleavages="0" num_tol_term="0" num_tot_proteins="1" tot_num_ions="44" hit_rank="1" num_matched_ions="6" protein="rev_sp|A8K0R7|ZN839_HUMAN Zinc finger protein 839 OS=Homo sapiens OX=9606 GN=ZNF839 PE=2 SV=1" peptide_prev_aa="L" is_rejected="0">
<modification_info>
<mod_aminoacid_mass mass="147.0354" position="5"/>
</modification_info>
<search_score name="hyperscore" value="12.122"/>
<search_score name="nextscore" value="10.786"/>
<search_score name="expect" value="1.721461e+00"/>
</search_hit>
</search_result>
</spectrum_query>
<spectrum_query start_scan="2678" uncalibrated_precursor_neutral_mass="1687.7485" assumed_charge="4" spectrum="20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2678.2678.4" end_scan="2678" index="837" precursor_neutral_mass="1687.7466" retention_time_sec="516.7586517333984">
<search_result>
<search_hit peptide="TVCHQLFFSGFVSP" massdiff="0.9931640625" calc_neutral_pep_mass="1686.7534" peptide_next_aa="G" num_missed_cleavages="0" num_tol_term="0" num_tot_proteins="1" tot_num_ions="52" hit_rank="1" num_matched_ions="6" protein="rev_sp|P42704|LPPRC_HUMAN Leucine-rich PPR motif-containing protein, mitochondrial OS=Homo sapiens OX=9606 GN=LRPPRC PE=1 SV=3" peptide_prev_aa="M" is_rejected="0">
<modification_info>
<mod_aminoacid_mass mass="222.0133" position="3"/>
</modification_info>
<search_score name="hyperscore" value="11.213"/>
<search_score name="nextscore" value="10.51"/>
<search_score name="expect" value="7.501750e+00"/>
</search_hit>
</search_result>
</spectrum_query>
<spectrum_query start_scan="2679" uncalibrated_precursor_neutral_mass="991.48737" assumed_charge="3" spectrum="20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2679.2679.3" end_scan="2679" index="838" precursor_neutral_mass="991.4877" retention_time_sec="516.7940139770508">
<search_result>
<search_hit peptide="EFGVSPDKI" massdiff="0.98553466796875" calc_neutral_pep_mass="990.50214" peptide_next_aa="S" num_missed_cleavages="0" num_tol_term="0" num_tot_proteins="1" tot_num_ions="32" hit_rank="1" num_matched_ions="5" protein="sp|P26640|SYVC_HUMAN Valine--tRNA ligase OS=Homo sapiens OX=9606 GN=VARS1 PE=1 SV=4" peptide_prev_aa="K" is_rejected="0">
<search_score name="hyperscore" value="10.22"/>
<search_score name="nextscore" value="9.036"/>
<search_score name="expect" value="7.164075e+00"/>
</search_hit>
</search_result>
</spectrum_query>
<spectrum_query start_scan="2680" uncalibrated_precursor_neutral_mass="1803.81" assumed_charge="3" spectrum="20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2680.2680.3" end_scan="2680" index="839" precursor_neutral_mass="1803.8068" retention_time_sec="516.9300270080566">
<search_result>
<search_hit peptide="EGATIEMSAPNKSDEPK" massdiff="0.97119140625" calc_neutral_pep_mass="1802.8356" peptide_next_aa="S" num_missed_cleavages="0" num_tol_term="0" num_tot_proteins="1" tot_num_ions="64" hit_rank="1" num_matched_ions="6" protein="rev_sp|O14513|NCKP5_HUMAN Nck-associated protein 5 OS=Homo sapiens OX=9606 GN=NCKAP5 PE=1 SV=2" peptide_prev_aa="R" is_rejected="0">
<search_score name="hyperscore" value="11.921"/>
<search_score name="nextscore" value="11.115"/>
<search_score name="expect" value="5.792839e-01"/>
</search_hit>
</search_result>
</spectrum_query>
<spectrum_query start_scan="2682" uncalibrated_precursor_neutral_mass="854.4283" assumed_charge="2" spectrum="20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2682.2682.2" end_scan="2682" index="841" precursor_neutral_mass="854.42957" retention_time_sec="517.0397758483887">
<search_result>
<search_hit peptide="VAAMVIDH" massdiff="-0.00244140625" calc_neutral_pep_mass="854.432" peptide_next_aa="F" num_missed_cleavages="0" num_tol_term="0" num_tot_proteins="1" tot_num_ions="14" hit_rank="1" num_matched_ions="5" protein="sp|Q9UN73|PCDA6_HUMAN Protocadherin alpha-6 OS=Homo sapiens OX=9606 GN=PCDHA6 PE=2 SV=1" peptide_prev_aa="L" is_rejected="0">
<search_score name="hyperscore" value="9.522"/>
<search_score name="nextscore" value="9.318"/>
<search_score name="expect" value="4.393155e+00"/>
</search_hit>
</search_result>
</spectrum_query>
<spectrum_query start_scan="2685" uncalibrated_precursor_neutral_mass="697.3534" assumed_charge="2" spectrum="20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2685.2685.2" end_scan="2685" index="843" precursor_neutral_mass="697.3537" retention_time_sec="517.2853660583496">
<search_result>
<search_hit peptide="FPSPGPP" massdiff="0.01019287109375" calc_neutral_pep_mass="697.3435" peptide_next_aa="Q" num_missed_cleavages="0" num_tol_term="0" num_tot_proteins="1" tot_num_ions="12" hit_rank="1" num_matched_ions="5" protein="sp|Q9H3S7|PTN23_HUMAN Tyrosine-protein phosphatase non-receptor type 23 OS=Homo sapiens OX=9606 GN=PTPN23 PE=1 SV=1" peptide_prev_aa="P" is_rejected="0">
<search_score name="hyperscore" value="9.194"/>
<search_score name="nextscore" value="9.047"/>
<search_score name="expect" value="8.040078e+00"/>
</search_hit>
</search_result>
</spectrum_query>
<spectrum_query start_scan="57903" uncalibrated_precursor_neutral_mass="2579.8958" assumed_charge="2" spectrum="20190627_QX0_AnBr_SA_BPP_DDA_M01_02.57903.57903.2" end_scan="57903" index="47442" precursor_neutral_mass="2579.8904" retention_time_sec="7121.1767578125">
<search_result>
<search_hit peptide="AGPSCGTYDMCEDTEADMLGPPGQ" massdiff="0.9638671875" calc_neutral_pep_mass="2578.9265" peptide_next_aa="S" num_missed_cleavages="0" num_tol_term="0" num_tot_proteins="1" tot_num_ions="46" hit_rank="1" num_matched_ions="5" protein="rev_sp|Q86V15|CASZ1_HUMAN Zinc finger protein castor homolog 1 OS=Homo sapiens OX=9606 GN=CASZ1 PE=1 SV=4" peptide_prev_aa="M" is_rejected="0">
<modification_info>
<mod_aminoacid_mass mass="222.0133" position="5"/>
<mod_aminoacid_mass mass="147.0354" position="10"/>
</modification_info>
<search_score name="hyperscore" value="10.735"/>
<search_score name="nextscore" value="10.735"/>
<search_score name="expect" value="1.680861e+00"/>
</search_hit>
</search_result>
</spectrum_query>
</msms_run_summary>
</msms_pipeline_analysis>
"""
def read_msf():
    """Load the pepXML fixture through `MSFraggerPepXML` and return its PSM table.

    The fixture is written to a scratch file in the working directory,
    which is removed again even if reading it fails.
    """
    filename = 'x.pepxml'
    # The XML declaration states UTF-8, so write it as such regardless of
    # the platform default encoding.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(pepxml_str)
    try:
        reader = MSFraggerPepXML(keep_decoy=True)
        reader.import_file(filename)
    finally:
        os.remove(filename)
    return reader.psm_df
df = read_msf()
# The fixture holds 13 spectrum queries with one search hit each.
assert len(df)==13
# 8 of those hits map to a `rev_`-prefixed (decoy) protein.
assert np.sum(df.decoy) == 8
# The pyro-Glu on the first residue of EPDSPLDKL must come back as an
# N-terminal modification ...
assert df.mods.str.contains('N-term').any()
# ... and N-terminal modifications are reported at site '0'.
assert (df.mod_sites[df.mods.str.contains('N-term')] == '0').all()
# The ~119 shift on cysteine is not resolved to a name, so it has to
# surface as a raw mass delta.
assert df.mod_deltas.str.startswith('119').any()
df

Unnamed: 0,sequence,charge,rt,query_id,spec_idx,score,proteins,raw_name,decoy,mods,mod_sites,mod_deltas,mod_delta_sites,nAA,rt_norm,precursor_mz
0,HDYKPAT,2,8.606698,20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2674.2674.2,2674,-1.91319,rev_sp|Q71H61|ILDR2_HUMAN Immunoglobulin-like ...,20190627_QX0_AnBr_SA_BPP_DDA_M01_02,1,,,,,7,0.072516,416.20341
1,FPSPGPP,2,8.621423,20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2685.2685.2,2685,-2.084439,sp|Q9H3S7|PTN23_HUMAN Tyrosine-protein phospha...,20190627_QX0_AnBr_SA_BPP_DDA_M01_02,0,,,,,7,0.07264,349.67904
2,IGEAGWVP,2,8.606111,20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2673.2673.2,2673,-1.108953,rev_sp|Q76N32|CEP68_HUMAN Centrosomal protein ...,20190627_QX0_AnBr_SA_BPP_DDA_M01_02,1,,,,,8,0.072511,414.716153
3,VAAMVIDH,2,8.61733,20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2682.2682.2,2682,-1.480048,sp|Q9UN73|PCDA6_HUMAN Protocadherin alpha-6 OS...,20190627_QX0_AnBr_SA_BPP_DDA_M01_02,0,,,,,8,0.072606,428.223288
4,EPDSPLDKL,3,1.394249,20190627_QX0_AnBr_SA_BPP_DDA_M01_02.426.426.3,426,-0.340774,rev_sp|O60566|BUB1B_HUMAN Mitotic checkpoint s...,20190627_QX0_AnBr_SA_BPP_DDA_M01_02,1,Glu->pyro-Glu@E^Any N-term,0.0,,,9,0.011747,332.506319
5,ALSSQHQAR,3,5.570915,20190627_QX0_AnBr_SA_BPP_DDA_M01_02.1717.1717.3,1717,-0.673023,sp|P11021|BIP_HUMAN Endoplasmic reticulum chap...,20190627_QX0_AnBr_SA_BPP_DDA_M01_02,0,,,,,9,0.046938,333.177307
6,EFGVSPDKI,3,8.613234,20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2679.2679.3,2679,-1.969079,sp|P26640|SYVC_HUMAN Valine--tRNA ligase OS=Ho...,20190627_QX0_AnBr_SA_BPP_DDA_M01_02,0,,,,,9,0.072571,331.17468
7,TAMEIIMCGLAW,3,8.607637,20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2675.2675.3,2675,-0.92925,rev_sp|Q8IVW4|CDKL3_HUMAN Cyclin-dependent kin...,20190627_QX0_AnBr_SA_BPP_DDA_M01_02,1,Oxidation@M,3.0,119.00381504045,8.0,12,0.072524,491.879582
8,QEGDMDRSLHKP,4,8.610825,20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2677.2677.4,2677,-0.543173,rev_sp|A8K0R7|ZN839_HUMAN Zinc finger protein ...,20190627_QX0_AnBr_SA_BPP_DDA_M01_02,1,Oxidation@M,5.0,,,12,0.072551,357.918858
9,YTETDLEESMDKI,4,8.609356,20190627_QX0_AnBr_SA_BPP_DDA_M01_02.2676.2676.4,2676,-1.509287,sp|Q9UKF6|CPSF3_HUMAN Cleavage and polyadenyla...,20190627_QX0_AnBr_SA_BPP_DDA_M01_02,0,Oxidation@M,10.0,,,13,0.072539,398.177635
