### Read in the user-provided file
Supported formats: ionbot (ionbot.modifications.csv), MSFragger (psm.tsv), open pFind (.proteins)

In [75]:
import re
import pandas as pd
from Bio import SeqIO
from unimod_mapper import UnimodMapper

In [76]:
# Search engine the user selected from the dropdown; has to be one of
# "ionbot", "msfragger" or "openpfind".
user_flag = "ionbot"
# Path to the user's result file.
user_file = "example_input/ionbot.modifications.csv"
# FASTA with the protein sequences. If the user doesn't provide one, we assume the
# protein IDs are in Swiss-Prot and use uniprot_sprot.fasta. This must be defined
# because the sequence-loading cell below reads `fasta`.
fasta = "fastas/uniprot_sprot.fasta"

In [77]:
# Shared UnimodMapper instance; map_mass_to_unimod queries it (mass_to_names) to
# translate mass shifts into modification names.
mapper = UnimodMapper()

In [78]:
# Lookup table from UniProt ID to protein sequence (as a plain string).
# The ID is taken as the second "|"-separated field of each FASTA record id;
# if an ID occurs more than once, the last record wins.
uniprot_to_sequence = {
    entry.id.split("|")[1]: str(entry.seq)
    for entry in SeqIO.parse(fasta, "fasta")
}

In [79]:
def get_ptm_position_in_protein(mod, protein_start):
    """Translate a modification's position in the peptide to a position in the protein.

    Args:
        mod: Modification token that begins with the position inside the
            peptide (e.g. "5M[Oxidation") or with "N-term". Leading
            whitespace is ignored.
        protein_start: Position in the protein of the peptide's first residue.

    Returns:
        ``protein_start + position_in_peptide - 1``, i.e. peptide position 1
        maps onto ``protein_start``.

    Raises:
        ValueError: If no peptide position can be derived from ``mod``. This
            includes "C-term" tokens, because the peptide length is not
            available here.
    """
    token = mod.strip()

    # An N-terminal modification sits on the first residue of the peptide,
    # which is exactly where the peptide starts in the protein.
    if token.lower().startswith("n-term"):
        return protein_start

    leading_digits = re.match(r"\d+", token)
    if leading_digits is None:
        raise ValueError(
            "Cannot determine the peptide position of modification {!r}".format(mod)
        )

    position_ptm_in_peptide = int(leading_digits.group())
    return protein_start + position_ptm_in_peptide - 1

In [80]:
def read_file(file, flag):
    """Parse the user's result file with the reader matching the selected tool.

    Args:
        file: Path to the user's file; passed through to the reader.
        flag: Which tool produced the file: "ionbot", "msfragger" or
            "openpfind".

    Returns:
        Whatever the selected reader returns.

    Raises:
        ValueError: If ``flag`` is not one of the supported values.
    """
    if flag == "ionbot":
        return read_ionbot(file)
    if flag == "msfragger":
        return read_msfragger(file)
    if flag == "openpfind":
        return read_openpfind(file)

    # Fail with a clear message instead of an UnboundLocalError on `df`.
    raise ValueError(
        "Unsupported file format {!r}; expected one of: "
        "ionbot, msfragger, openpfind".format(flag)
    )

In [81]:
def read_ionbot(file):
    """Load an ionbot modifications CSV into a DataFrame.

    The file is read with pandas' default CSV settings and returned as-is,
    without any filtering or renaming.
    """
    return pd.read_csv(file)

In [82]:
def map_mass_to_unimod(mods):
    """Replace the mass shifts in a modification string with modification names.

    Args:
        mods: Comma-separated modification entries, each consisting of a
            prefix followed by a parenthesised mass shift, e.g.
            "5M(15.9949)". Missing values (anything that is not a string,
            such as NaN or None) and blank entries are tolerated.

    Returns:
        The concatenation of "<prefix>[<name> or <name> ...]" for every
        entry whose mass could be mapped by ``mapper``; entries without a
        mapping are left out. Returns "" when there is nothing to map.
    """
    # Empty cells arrive from pandas as NaN (a float), not as a string.
    if not isinstance(mods, str):
        return ""

    parts = []
    for mod in mods.split(","):
        # "" or " , " style input yields blank entries; nothing to map there.
        if not mod.strip():
            continue

        pos_aa = mod.split("(")[0]
        mass = float(re.search(r"\((.*?)\)", mod)[1])

        # for unimod ID: mapper.mass_to_ids(mass, decimals = 4)
        # Materialise once so the result can be both tested and joined.
        mapped = list(mapper.mass_to_names(mass, decimals=4))
        if mapped:
            parts.append(pos_aa + "[" + " or ".join(mapped) + "]")

    return "".join(parts)



In [83]:
def from_psm_to_protein(df):
    """Expand PSM rows into one row per modification site on the protein.

    For every row of ``df`` the "Assigned Modifications parsed" string is
    split on "]"; each non-empty piece contributes one output row holding the
    row's "Protein ID", the text after "[" (the modification) and the
    protein-level position computed by ``get_ptm_position_in_protein`` from
    the piece and the row's "Protein Start".

    Returns:
        DataFrame with the columns "uniprot_id", "modification", "position".
    """
    site_records = []

    for _, row in df.iterrows():
        start_in_protein = row["Protein Start"]
        protein_id = row["Protein ID"]

        for piece in row["Assigned Modifications parsed"].split("]"):
            # Splitting on "]" leaves an empty piece after the last modification.
            if piece != "":
                site_records.append(
                    (
                        protein_id,
                        piece.split("[")[1],
                        get_ptm_position_in_protein(piece, start_in_protein),
                    )
                )

    return pd.DataFrame(
        site_records, columns=["uniprot_id", "modification", "position"]
    )

In [84]:
def read_msfragger(file):
    """Read an MSFragger PSM table and return protein-level modification sites.

    Steps:
      1. Read the tab-separated file, keep only the columns needed here,
         drop rows without a "Modified Peptide" value and drop duplicate rows.
      2. Add an "Assigned Modifications parsed" column by running
         ``map_mass_to_unimod`` over "Assigned Modifications".
      3. Hand the table to ``from_psm_to_protein`` and return its result.

    Progress messages are printed before each step.
    """
    needed_columns = [
        "Modified Peptide",
        "Peptide",
        "Protein",
        "Protein ID",
        "Assigned Modifications",
        "Protein Start",
    ]

    print("Reading MSFragger file...")
    psm_table = (
        pd.read_csv(file, sep="\t")[needed_columns]
        .dropna(subset=["Modified Peptide"])
        .drop_duplicates()
    )

    print("Assigning modifications to mass shifts...")
    psm_table["Assigned Modifications parsed"] = psm_table[
        "Assigned Modifications"
    ].apply(map_mass_to_unimod)

    print("Mapping modification sites onto proteins...")
    return from_psm_to_protein(psm_table)

In [85]:
def read_openpfind(file):
    """Read a tab-separated open-pFind file into a DataFrame.

    The first line supplies the column names and every following non-blank
    line becomes one row. Cells are kept as strings; no type conversion is
    attempted.

    Args:
        file: Path to the file to read. It is opened read-only.

    Returns:
        DataFrame with one row per data line. Rows shorter than the header
        get missing values for the absent columns. Cells beyond the header
        width are kept under generated "Unnamed: <index>" column names so no
        data is dropped. An empty file yields an empty DataFrame.
    """
    # Read mode: opening with "w" would truncate the user's file.
    with open(file, "r") as openfile:
        lines = openfile.read().splitlines()

    if not lines:
        return pd.DataFrame()

    columns = lines[0].split("\t")
    records = []

    for line in lines[1:]:
        if not line.strip():
            continue

        cells = line.split("\t")

        # Rows may be wider than the header; name the overflow cells
        # instead of failing with an IndexError.
        for j in range(len(columns), len(cells)):
            columns.append("Unnamed: {}".format(j))

        records.append(dict(zip(columns, cells)))

    return pd.DataFrame(records, columns=columns)

In [86]:
# Try to parse the user's file. Selecting the wrong format typically surfaces as a
# pandas parsing error (a ValueError subclass) or as a KeyError for columns the
# reader expects but cannot find, so those are the cases reported to the user.
# Other errors (e.g. a missing file) are left to propagate unchanged.
try:
    df = read_file(user_file, user_flag)
except (ValueError, KeyError):
    # Keep `df` defined so later cells do not fail with a NameError.
    df = None
    print("Your file could not be parsed. Have you selected the right format?")

In [87]:
# Show the parsed table as the cell output.
df

Unnamed: 0,protein,uniprot_id,unexpected_modification,position,ionbot_match_id,#PSMs
0,3PASE_ECOLI,P30871,[7]Deamidated[N],61,0_13437_1,1
1,3PASE_ECOLI,P30871,[10]Met->Hse[M],266,0_53627_2,1
2,5DNU_ECOLI,P76491,[35]Oxidation[E],168,0_14780_2,1
3,6PGD_ECOLI,P00350,[35]oxidation[M],11,0_35178_2,1
4,6PGD_ECOLI,P00350,[9999575]SulfanilicAcid[E],167,0_44305_1,1
...,...,...,...,...,...,...
10060,sp|TRYP_PIG|,sp|TRYP_PIG|,[34]Methyl[N-TERM],108,0_23420_2,1
10061,sp|TRYP_PIG|,sp|TRYP_PIG|,[255]Delta:H(4)C(2)[N-TERM],108,0_24698_2,1
10062,sp|TRYP_PIG|,sp|TRYP_PIG|,[254]Delta:H(2)C(2)[N-TERM],108,0_33029_1,1
10063,sp|TRYP_PIG|,sp|TRYP_PIG|,[385]Ammonia-loss[T],110,0_22152_1,1
