In [None]:
# default_exp protein_ptm_normalization

In [None]:
#export

import pandas as pd
import glob
import re
import numpy as np
import alphaquant.ptmsite_mapping as aqptm



def get_phosphofile2protfile(phospho_result_files, proteom_result_files):
    """Pair every phospho result file with the proteome result file of the same name.

    Both file lists are indexed by name via ``get_name2file``; files are then
    matched on that name.

    Args:
        phospho_result_files: paths of the phospho result tables.
        proteom_result_files: paths of the proteome result tables.

    Returns:
        A tuple ``(phosfile2protfile, phosfile2name)``. The first dict maps a
        phospho file path to the proteome file path with the same name (``None``
        if there is no such proteome file); the second maps a phospho file path
        back to its name.
    """
    phos_by_name = get_name2file(phospho_result_files)
    prot_by_name = get_name2file(proteom_result_files)

    phosfile2protfile = {}
    phosfile2name = {}
    for name, phosfile in phos_by_name.items():
        phosfile2protfile[phosfile] = prot_by_name.get(name)
        phosfile2name[phosfile] = name
    return phosfile2protfile, phosfile2name
        
def get_name2file(filenames):
    """Map the name embedded in each result-file path to the path itself.

    A path is expected to look like ``<anything>/results/<name>.results.tsv``;
    ``<name>`` becomes the key. Paths that do not follow this layout are skipped.
    If several paths yield the same name, the last one wins.

    Args:
        filenames: iterable of file paths (strings).

    Returns:
        dict mapping the extracted name to the file path.
    """
    # Raw string avoids the invalid "\/" escape; the dots of the suffix are
    # escaped so that they only match literal dots.
    pattern = re.compile(r"(.*/results/)(.*)(\.results\.tsv)")
    name2file = {}
    for file in filenames:
        matched = pattern.search(file)
        if matched is None:
            continue
        name2file[matched.group(2)] = file
    return name2file


def normalize_with_proteome(df_phospho, df_prot, swissprot_referenceprots):
    """Subtract the proteome log2 fold change from each phospho log2 fold change.

    The protein part of every phospho identifier (second "_"-separated field of
    the "protein" column) is mapped via ``aqptm.get_idmap_column`` and looked up
    in the "protein" column of the proteome table.

    Args:
        df_phospho: phospho result table with "protein" and "log2fc" columns.
        df_prot: proteome result table with "protein" and "log2fc" columns.
        swissprot_referenceprots: reference passed on to ``aqptm.get_idmap_column``.

    Returns:
        A new DataFrame (the input is not modified) containing only the rows
        whose identifier could be parsed and whose normalized "log2fc" is not
        NaN. It carries an additional "swissprot" column and the normalized
        "log2fc".
    """
    phospho_prots = []
    for name in df_phospho["protein"]:
        # Missing values arrive from pandas as float NaN, not None, so anything
        # that is not a string is treated as "no protein identified".
        parts = name.split("_") if isinstance(name, str) else []
        phospho_prots.append(parts[1] if len(parts) >= 2 else None)

    # Explicit boolean array + .loc: always selects rows (also for an empty
    # table) and .copy() makes the column assignments below write to our own
    # frame instead of a slice of the caller's frame.
    is_identified = np.array([prot is not None for prot in phospho_prots], dtype=bool)
    df_phospho = df_phospho.loc[is_identified].copy()
    identified_prots = [prot for prot in phospho_prots if prot is not None]
    df_phospho["swissprot"] = aqptm.get_idmap_column(identified_prots, swissprot_referenceprots)

    proteome_protname2fc = dict(zip(df_prot["protein"], df_prot["log2fc"]))
    # Proteins absent from the proteome table map to NaN and are dropped below.
    fc_proteome = df_phospho["swissprot"].map(proteome_protname2fc)
    df_phospho["log2fc"] = df_phospho["log2fc"] - fc_proteome
    return df_phospho[df_phospho["log2fc"].notna()]

# Script: normalize every phospho result table that has a proteome counterpart
# with the same name and write it to ./results_protnormed/.
import os

phospho_result_files = glob.glob('./results/*.results.tsv')
proteom_result_files = glob.glob('./proteome/results/*.results.tsv')
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
reference_swissprot = "/Users/constantin/workspace/Maria_Phospho/swissprot_mapping.tsv"

phophofile2protfile, file2name = get_phosphofile2protfile(phospho_result_files, proteom_result_files)

count = 0  # not used in this cell
for phosphofile in phospho_result_files:
    protfile = phophofile2protfile.get(phosphofile)
    if protfile is None:
        # no proteome table with the same name -> nothing to normalize against
        continue
    df_phospho = pd.read_csv(phosphofile, sep="\t")
    df_prot = pd.read_csv(protfile, sep="\t")
    df_phospho = normalize_with_proteome(df_phospho, df_prot, reference_swissprot)
    df_phospho = df_phospho.drop(columns=["swissprot"])
    filename = f"./results_protnormed/{file2name.get(phosphofile)}.results.tsv"
    # to_csv does not create missing directories
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    df_phospho.to_csv(filename, sep="\t", index=False)



In [None]:


class PTMtableLocalizer():
    """Locate PTM and proteome result tables and match them by name.

    The files are collected through ``PTMFiles``; the name of a file is the
    ``<name>`` part of a path shaped like ``<anything>/results/<name>.results.tsv``.
    """
    def __init__(self, results_dir_ptm, results_dir_proteome):
        self._files = PTMFiles(results_dir_ptm=results_dir_ptm, results_dir_proteome=results_dir_proteome)
        self._name2ptmfile = self.__get_name2ptmfile__()
        self._name2protfile = self.__get_name2protfile__()

    def get_ptmfile2protfile(self):
        """Return a dict mapping each PTM file to the proteome file of the same name.

        The value is ``None`` when no proteome file with that name exists.
        """
        return {ptmfile: self._name2protfile.get(name) for name, ptmfile in self._name2ptmfile.items()}

    def get_ptmfile2name(self):
        """Return a dict mapping each PTM file path to its name."""
        return {v: k for k, v in self._name2ptmfile.items()}

    def get_swissprot_reference(self):
        """Return the swissprot reference held by the underlying ``PTMFiles``."""
        return self._files.swissprot_reference

    def __get_name2ptmfile__(self):
        return self.__get_name2file__(self._files.ptm_result_files)

    def __get_name2protfile__(self):
        return self.__get_name2file__(self._files.proteome_result_files)

    @staticmethod
    def __get_name2file__(filenames):
        """Map the name embedded in each path to the path; non-matching paths are skipped."""
        # Raw string avoids the invalid "\/" escape; the dots of the suffix are
        # escaped so that they only match literal dots.
        pattern = re.compile(r"(.*/results/)(.*)(\.results\.tsv)")
        name2file = {}
        for file in filenames:
            matched = pattern.search(file)
            if matched is None:
                continue
            name2file[matched.group(2)] = file
        return name2file
    
import alphaquant.ptmsite_mapping as aqptm
class PTMFiles():
    """Collects the input files of a PTM normalization run.

    Attributes set on construction:
        ptm_result_files: paths matching ``<results_dir_ptm>/*.results.tsv``.
        proteome_result_files: paths matching ``<results_dir_proteome>/*.results.tsv``.
        swissprot_reference: the value returned by ``aqptm.get_swissprot_path()``.
    """
    def __init__(self, results_dir_ptm, results_dir_proteome):
        # the directories have to be stored first, the lookups below read them
        self._results_dir_ptm = results_dir_ptm
        self._results_dir_proteome = results_dir_proteome
        self.ptm_result_files = self.__get_ptm_result_files__()
        self.proteome_result_files = self.__get_proteome_result_files__()
        self.swissprot_reference = self.__get_swissprot_reference__()

    def __get_ptm_result_files__(self):
        """List the result tables found in the PTM results directory."""
        search_pattern = f'{self._results_dir_ptm}/*.results.tsv'
        return glob.glob(search_pattern)

    def __get_proteome_result_files__(self):
        """List the result tables found in the proteome results directory."""
        search_pattern = f'{self._results_dir_proteome}/*.results.tsv'
        return glob.glob(search_pattern)

    def __get_swissprot_reference__(self):
        """Fetch the swissprot reference from the ptmsite_mapping module."""
        reference = aqptm.get_swissprot_path()
        return reference

class PTMtableNormalizer():
    """Normalize the fold changes and FDRs of a PTM table with the matching proteome table.

    A copy of the prepared PTM table serves as the output table; its "fdr" and
    "log2fc" entries are overwritten row by row.
    """
    def __init__(self,  ptm_file, proteome_file):
        self._prepared_tables = PTMtablePreparer(ptm_file, proteome_file)
        self._output_table_template = self._prepared_tables.ptm_df.copy() #use ptm table as template for the output table and update with normalized fcs and fdrs

    def normalize_with_proteome(self):
        """Update every PTM row of the output table and return the output table."""
        # iterate over a snapshot of the index, the table is written to in the loop
        for ptm in list(self._output_table_template.index):
            self.__update_ptm_infos__(ptm)
        return self._output_table_template

    def __update_ptm_infos__(self, ptm):
        """Compute the protein-normalized fold change and adjusted FDR of one PTM and store them."""
        regulation_infos = self._prepared_tables.get_protein_regulation_infos(ptm)
        fdr_damper = FDRDamper(regulation_infos)
        self.__update_values_for_output_table__(ptm, fdr_damper.get_fdr(), regulation_infos.diff_fc)

    def __update_values_for_output_table__(self, ptm, fdr, log2fc):
        """Write ``fdr`` and ``log2fc`` into the output-table row labelled ``ptm``."""
        # single .loc[row, column] access: chained indexing (.loc[row][col] = ...)
        # may write to a temporary copy, and [ptm][col] would look up a column
        self._output_table_template.loc[ptm, "fdr"] = fdr
        self._output_table_template.loc[ptm, "log2fc"] = log2fc

    def __get_ptm_list__(self):
        """Return the PTM identifiers (index of the prepared PTM table) as a list."""
        return list(self._prepared_tables.ptm_df.index)


In [None]:

class PTMtablePreparer():
    """Read a PTM result table and a proteome result table and make them joinable.

    Attributes:
        ptm_df: PTM table indexed by its "protein" column (the PTM site
            identifier), with an added "swissprot" column. Rows whose identifier
            has no second "_"-separated field are dropped.
        proteome_df: proteome table indexed by an added "swissprot" column.
        output_df: copy of ``ptm_df`` taken at construction time.
    """
    def __init__(self, ptm_file, proteome_file):
        self.ptm_df = self.__read_and_annotate_ptm_df__(ptm_file)
        self.proteome_df = self.__read_and_annotate_proteome_df__(proteome_file)
        self.output_df = self.ptm_df.copy()
        self._ptmsite2swissprot  = self.__get_ptmsite2swissprot__()

    def get_protein_regulation_infos(self, ptmsite):
        """Return a ``RegulationInfos`` with the fold changes and FDRs of a PTM site and its protein.

        Raises:
            KeyError: if the PTM site or its protein is not present in the tables.
        """
        swissprot = self._ptmsite2swissprot.get(ptmsite)
        ptm_row = self.ptm_df.loc[ptmsite]
        protein_row = self.proteome_df.loc[swissprot]
        return RegulationInfos(
            log2fc_ptm=self.__get_fc_from_table_row__(ptm_row),
            fdr_ptm=self.__get_fdr_from_table_row__(ptm_row),
            log2fc_protein=self.__get_fc_from_table_row__(protein_row),
            fdr_protein=self.__get_fdr_from_table_row__(protein_row),
        )

    def __read_and_annotate_ptm_df__(self, ptm_file):
        ptm_df = self.__read_dataframe__(ptm_file)
        ptm_df = self.__add_swissprot_name_column__(ptm_df)
        ptm_df = ptm_df.set_index("protein")
        return ptm_df

    def __read_and_annotate_proteome_df__(self, proteome_file):
        proteome_df = self.__read_dataframe__(proteome_file)
        # NOTE(review): the reference path is passed explicitly to agree with the
        # other get_idmap_column call sites in this file - confirm against the
        # signature in alphaquant.ptmsite_mapping.
        proteome_df["swissprot"] = aqptm.get_idmap_column(list(proteome_df["protein"]), aqptm.get_swissprot_path())
        proteome_df = proteome_df.set_index("swissprot")
        return proteome_df

    def __add_swissprot_name_column__(self, ptm_df):
        """Drop rows without a parsable protein and add the mapped "swissprot" column."""
        swissprot_referenceprots = aqptm.get_swissprot_path()
        ptm_prots = self.__get_ptm_proteins__(ptm_df)
        # .copy() so the column assignment below does not write to a slice of the raw table
        ptm_df = self.__match_ptm_df_to_ptm_prots__(ptm_df, ptm_prots).copy()
        # the None entries were removed from the table above, so they have to be
        # removed from the protein list as well to keep both the same length
        identified_prots = self.__filter_nonidentified_proteins__(ptm_prots)
        ptm_df["swissprot"] = aqptm.get_idmap_column(identified_prots, swissprot_referenceprots)
        return ptm_df

    def __get_ptm_proteins__(self, ptm_df):
        """Return one protein name (or None) per row of the "protein" column."""
        return [self.__extract_protein_from_ptmprot_name__(name) for name in ptm_df["protein"]]

    @staticmethod
    def __extract_protein_from_ptmprot_name__(ptmprot_name):
        """Return the second "_"-separated field of the identifier, or None if there is none."""
        # missing values are read by pandas as float NaN, which has no .split
        if not isinstance(ptmprot_name, str):
            return None
        fields = ptmprot_name.split("_")
        if len(fields) < 2:
            return None
        return fields[1]

    @staticmethod
    def __match_ptm_df_to_ptm_prots__(ptm_df,ptm_prots):
        """Keep only the rows of ``ptm_df`` whose entry in ``ptm_prots`` is not None."""
        # explicit boolean array + .loc always selects rows, also for an empty table
        keep = np.array([prot is not None for prot in ptm_prots], dtype=bool)
        return ptm_df.loc[keep]

    @staticmethod
    def __filter_nonidentified_proteins__(ptm_prots):
        """Return ``ptm_prots`` without its None entries."""
        return [x for x in ptm_prots if x is not None]

    @staticmethod
    def __read_dataframe__(file):
        return pd.read_csv(file, sep = "\t")

    def __get_ptmsite2swissprot__(self):
        # the "protein" column always refers to the identifier, which in this case
        # is the ptmsite; it has been turned into the index of ptm_df
        return dict(zip(self.ptm_df.index, self.ptm_df["swissprot"]))

    @staticmethod
    def __get_fdr_from_table_row__(row):
        return float(row['fdr'])

    @staticmethod
    def __get_fc_from_table_row__(row):
        return float(row['log2fc'])
        

        

In [None]:
import math
import numpy as np

class RegulationInfos():
    """Regulation values of one PTM site together with those of its protein.

    Besides the four input values the object exposes:
        diff_fc: PTM log2 fold change minus protein log2 fold change.
        switched_regulation_direction: True when the sign of ``diff_fc``
            differs from the sign of the PTM log2 fold change.
    """
    def __init__(self, log2fc_ptm, fdr_ptm,log2fc_protein, fdr_protein):
        # fold changes
        self.log2fc_ptm = log2fc_ptm
        self.log2fc_protein = log2fc_protein
        # false discovery rates
        self.fdr_ptm = fdr_ptm
        self.fdr_protein = fdr_protein
        # derived values, computed from the attributes set above
        self.diff_fc = self.__get_protnormed_fc__()
        regulation_unchanged = self.__check_if_regulation_stayed_the_same__()
        self.switched_regulation_direction = not regulation_unchanged

    def __get_protnormed_fc__(self):
        """Protein-normalized fold change of the PTM site."""
        protnormed_fc = self.log2fc_ptm - self.log2fc_protein
        return protnormed_fc

    def __check_if_regulation_stayed_the_same__(self):
        """Whether the fold change has the same sign before and after normalization."""
        direction_before = np.sign(self.log2fc_ptm)
        direction_after = np.sign(self.diff_fc)
        return direction_before == direction_after



import math
class FDRDamper():
    """The fdr is taken from the regulation of the phosphopeptides. If the protein is regulated 
    similar to the phosphopeptide, we for the moment use a very simple heuristic to correct the fdr down:
    
    1) We only consider phosphopeptides where the fold change has become less strong, i.e. 'dampened' and where the "damping" protein was regulated significantly
    2) We correct the logged(!) fdr up with an exponnential function and then transform it back to a new fdr. This means a "double exponential" decrease in the significance
    """
    lowfc_threshold = 0.5
    def __init__(self, regulation_infos):
        self._regulation_infos = regulation_infos
    
    def get_fdr(self):
        return self.__dampen_fdr_if_needed__(self)

    def __dampen_fdr_if_needed__(self):
        if self.__check_if_needs_damping__():
            return self.__get_adjusted_fdr__()
        else:
            return self._regulation_infos.fdr_protein

    def __check_if_needs_damping__(self):
        if self._fdr_protein<0.05:
            if np.sign(self._regulation_infos.fdr_protein.log2fc_phopho) == np.sign(self._regulation_infos.fdr_protein.log2fc_protein):
                return True
        return False

    def __get_adjusted_fdr__(self):
        if self._regulation_infos.switched_regulation_direction:
            return 1.0
        else:
            return self.__calculate_damping_factor__()


    def __calculate_damping_factor__(self):
        factor = self.__calculate_order_of_magnitude_damping_factor__()
        fdr_new = 10**(math.log10(self._fdr_ptmsite)*factor)
        return math.min(fdr_new, 1)


    def __calculate_order_of_magnitude_damping_factor__(self):
        inverse_factor = 2**(abs(self._regulation_infos.diff_fc/self._regulation_infos.log2fc_ptm))-1 #the inverse factor gives 1 when the fc stays the same and 0 if the fc is at 0
        return 1/inverse_factor
