### Read in the user-provided file
Supported formats: ionbot (`ionbot.modifications.csv`), MSFragger (`psm.tsv`), Open-pFind (`.proteins`)

In [1]:
import re
import pandas as pd
from Bio import SeqIO
from unimod_mapper import UnimodMapper

In [2]:
# Input format the user selected from the dropdown; has to be one of
# "ionbot", "msfragger" or "openpfind".
user_flag = "msfragger"
# Path to the user's file.
user_file = "example_input/msfragger_psm.tsv"
# FASTA with the protein sequences. If the user does not provide one, the
# protein IDs are assumed to be in Swiss-Prot and uniprot_sprot.fasta is used.
fasta = "fastas/uniprot_sprot.fasta"

In [3]:
# Shared UnimodMapper instance; map_mass_to_unimod queries it to turn mass
# shifts into modification names.
mapper = UnimodMapper()

In [4]:
# Build the accession -> protein sequence lookup from the FASTA file.
# Record IDs containing "|" are split on it and the second field is used as
# the key (the original behaviour). IDs without "|" are used as-is instead of
# raising IndexError, so FASTA files with plain identifiers still load.
uniprot_to_sequence = {}
for record in SeqIO.parse(fasta, "fasta"):
    id_fields = record.id.split("|")
    uniprot_id = id_fields[1] if len(id_fields) > 1 else record.id
    uniprot_to_sequence[uniprot_id] = str(record.seq)

In [5]:
def get_start_position_pept_in_prot(pept, prot):
    """Return the 0-based index of the first occurrence of ``pept`` in ``prot``.

    Parameters
    ----------
    pept : str
        Peptide sequence to look for.
    prot : str
        Protein sequence to search in.

    Returns
    -------
    int
        Index of the first match, or -1 when the peptide does not occur.
    """
    # Search from index 0: starting at 1 would miss peptides located at the
    # very start of the protein and report them as "not found" (-1).
    return prot.find(pept)

In [6]:
def read_file(file, flag):
    """Parse ``file`` with the reader that matches ``flag``.

    Parameters
    ----------
    file : str or file-like
        Input handed through unchanged to the selected reader.
    flag : str
        One of ``"ionbot"``, ``"msfragger"`` or ``"openpfind"``.

    Returns
    -------
    Whatever the selected reader returns.

    Raises
    ------
    ValueError
        If ``flag`` is not one of the supported formats (previously this
        surfaced as an UnboundLocalError on ``df``).
    """
    if flag == "ionbot":
        return read_ionbot(file)
    if flag == "msfragger":
        return read_msfragger(file)
    if flag == "openpfind":
        return read_openpfind(file)
    raise ValueError(
        f"Unknown input format {flag!r}; expected one of 'ionbot', 'msfragger', 'openpfind'."
    )

In [7]:
def read_ionbot(file):
    """Load an ionbot CSV file and return it as a DataFrame, unmodified."""
    return pd.read_csv(file)

In [8]:
def map_mass_to_unimod(mods):
    """Rewrite mass-shift annotations as name annotations.

    ``mods`` is a comma-separated string whose entries look like
    ``<site>(<mass>)``. For every entry the text before the first "(" is kept
    as the site, the mass inside the parentheses is looked up through
    ``mapper.mass_to_names`` (4 decimals), and ``<site>[<name> or <name>...]``
    is emitted. Entries whose mass maps to no name are dropped. The pieces are
    concatenated without a separator.
    """
    drop_parentheses = str.maketrans("", "", "()")
    annotated = []
    for entry in mods.split(","):
        site = entry.split("(")[0]

        bracketed_mass = re.search(r'\(.*?\)', entry)[0]
        shift = float(bracketed_mass.translate(drop_parentheses))

        names = list(mapper.mass_to_names(shift, decimals=4))
        if names:
            annotated.append(site + "[" + " or ".join(names) + "]")

    return "".join(annotated)



In [9]:
def from_psm_to_protein(df):
    """Convert PSM-level modification annotations to protein-level sites.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"Protein ID"``, ``"Peptide"`` and
        ``"Assigned Modifications parsed"``. The last one holds strings made
        of ``<position><residue>[<modification>]`` entries, as produced by
        ``map_mass_to_unimod``.

    Returns
    -------
    pandas.DataFrame
        One row per mapped modification with the columns ``uniprot_id``,
        ``modification`` and ``position``. ``position`` is the peptide start
        index in the protein plus the position of the site in the peptide.

    Notes
    -----
    Rows are skipped (not raised on) when
    * the protein ID is not in ``uniprot_to_sequence``,
    * the peptide is not found in the protein sequence (a position computed
      from a "not found" result would be meaningless), or
    * an entry has no leading numeric position (e.g. ``N-term[...]``), which
      previously aborted the whole conversion with a ValueError.
    """
    records = []
    psm_fields = zip(df["Protein ID"], df["Peptide"], df["Assigned Modifications parsed"])
    for uniprot_id, peptide, parsed_mods in psm_fields:
        protein_sequence = uniprot_to_sequence.get(uniprot_id)
        if protein_sequence is None:
            continue

        # The peptide location does not depend on the modification, so look
        # it up once per PSM rather than once per modification.
        peptide_start = get_start_position_pept_in_prot(peptide, protein_sequence)
        if peptide_start < 0:
            continue

        for mod in parsed_mods.split("]"):
            # Splitting on "]" leaves a trailing empty piece; ignore anything
            # that does not carry a "[<modification>" part.
            if "[" not in mod:
                continue
            site, modification = mod.split("[")[:2]

            # Leading digits (optionally preceded by whitespace left over from
            # the comma-separated input) give the position within the peptide.
            position_match = re.match(r"\s*(\d+)", site)
            if position_match is None:
                continue
            position_in_peptide = int(position_match.group(1))

            records.append((uniprot_id, modification, peptide_start + position_in_peptide))

    return pd.DataFrame(records, columns=["uniprot_id", "modification", "position"])

In [10]:
def read_msfragger(file):
    """Read an MSFragger PSM table and return protein-level modifications.

    Parameters
    ----------
    file : str or file-like
        Tab-separated file that contains at least the columns
        ``"Modified Peptide"``, ``"Peptide"``, ``"Protein"``, ``"Protein ID"``
        and ``"Assigned Modifications"``.

    Returns
    -------
    The result of ``from_psm_to_protein`` for the de-duplicated PSMs.

    Notes
    -----
    Rows lacking either ``"Modified Peptide"`` or ``"Assigned Modifications"``
    are dropped; a missing ``"Assigned Modifications"`` value cannot be parsed
    by ``map_mass_to_unimod`` and used to abort the whole import.
    """
    print("Reading MSFragger file...")
    psm_columns = ["Modified Peptide", "Peptide", "Protein", "Protein ID", "Assigned Modifications"]
    psms = (
        pd.read_csv(file, sep="\t")[psm_columns]
        .dropna(subset=["Modified Peptide", "Assigned Modifications"])
        .drop_duplicates()
    )

    # try to assign modifications to mass shifts
    print("Assigning modifications to mass shifts...")
    # .assign builds a new frame, so no value is written into a filtered
    # slice (which is what triggers pandas' SettingWithCopyWarning).
    psms = psms.assign(
        **{"Assigned Modifications parsed": psms["Assigned Modifications"].apply(map_mass_to_unimod)}
    )

    # rewrite to protein level modifications
    print("Mapping modification sites onto proteins...")
    return from_psm_to_protein(psms)

In [11]:
def read_openpfind(file):
    """Placeholder reader for Open-pFind ``.proteins`` files.

    No parser exists yet. Raising makes that explicit instead of silently
    returning ``None``, which callers would mistake for a successful parse.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("Reading Open-pFind (.proteins) files is not implemented yet.")

In [12]:
# Try to parse the user file.
# `df` is reset first so that a failed parse cannot leave a stale result from
# an earlier run of this cell in the kernel.
df = None
try:
    df = read_file(user_file, user_flag)
except Exception as error:  # not a bare `except:`, so kernel interrupts still propagate
    print("Your file could not be parsed. Have you selected the right format?")
    # Surface the underlying cause; otherwise every failure looks identical.
    print(f"Details: {type(error).__name__}: {error}")

2023-04-05 19:04:22.468 | INFO     | unimod_mapper.unimod_mapper:_parse_in_more_detail_XML:408 - Parsing mod xml file (/opt/anaconda3/envs/ptmvis/lib/python3.10/site-packages/unimod_mapper/unimod.xml)


Reading MSFragger file...
Assigning modifications to mass shifts...
Mapping modification sites onto proteins...


In [13]:
# Display the result of parsing the user file.
df

Unnamed: 0,uniprot_id,modification,position
0,P02545,Phospho,404
1,P57081,Phospho,391
2,Q8TAD8,Phospho,152
3,P02545,Phospho,403
4,Q9BZZ5,Phospho,462
...,...,...,...
4351,P04406,Phospho,176
4352,Q13501,Phospho,249
4353,Q13501,Phospho,243
4354,Q8TD16,Phospho,329
