In [None]:
# default_exp ptmsite_mapping

In [None]:
from ms_empire.background_distributions import *
from ms_empire.normalization import *
from ms_empire.diff_analysis import *
from ms_empire.visualizations import *
from ms_empire.benchmarking import *
from ms_empire.diffquant_utils import *

# Helper Classes

In [None]:
#export
import numpy as np
class ModifiedPeptide:
    """
    Convenience record holding the variables of one modified peptide (one input table row).

    Attributes set in the constructor:
        id, ionname, sample, seq, prot: copied from the "IonID", "FG.Id", "R.Label",
            "PEP.GroupingKey" and "PG.UniProtIds" fields of the row.
        start_idx: result of ``protein_sequence.find(seq)`` (-1 if the peptide is not found).
        positions: parsed PTM positions shifted by the peptide start index in the protein.
        probabilities: parsed per-position localization probabilities.
        num_sites: rounded sum of the localization probabilities.
        encoded_probs: left as None here; it is assigned elsewhere after merging.
    """
    def __init__(self, df_line, protein_sequence, id_thresh, excl_thresh, modification_type = "[Phospho (STY)]"):
        # id_thresh and excl_thresh are accepted for interface compatibility but not used here
        position_field = df_line[f"EG.PTMPositions {modification_type}"]
        probability_field = df_line[f"EG.PTMProbabilities {modification_type}"]

        self.id = df_line["IonID"]
        self.ionname = df_line["FG.Id"]
        self.sample = df_line["R.Label"]
        self.seq = df_line["PEP.GroupingKey"]
        self.prot = df_line["PG.UniProtIds"]
        self.start_idx = protein_sequence.find(self.seq)

        # both fields are ';'-separated lists with one entry per candidate site
        peptide_positions = np.array(position_field.split(";")).astype("int")
        site_probabilities = np.array(probability_field.split(";")).astype("float")

        self.positions = scale_site_idxs_to_protein(protein_sequence, self.seq, peptide_positions)
        self.num_sites = get_num_sites(site_probabilities)
        self.probabilities = site_probabilities
        self.encoded_probs = None


# Helper functions

## Group ions and reduce redundancies

In [None]:
#export

import copy
def merge_samecond_modpeps(ions, sample2cond, id_thresh, excl_thresh):
    """
    Merge identical ions that belong to the same condition and average their
    site localization probabilities.

    Ions are keyed by the concatenation of their condition (looked up from the
    sample via sample2cond) and their ion name. For every key a deep copy of the
    first ion is returned with its id replaced by the key, its probabilities
    replaced by the element-wise mean over the group, and its encoded
    probabilities computed from that mean.

    Returns:
        (merged, condid2ionids): the list of merged ions and a dict mapping each
        condition-level key to the list of original ion ids it was built from.
    """
    condid2members = {}
    condid2ionids = {}
    for modpep in ions:
        key = f"{sample2cond.get(modpep.sample)}{modpep.ionname}"
        condid2members.setdefault(key, []).append(modpep)
        condid2ionids.setdefault(key, []).append(modpep.id)

    merged = []
    for key, members in condid2members.items():
        # deep copy so that the per-sample object is not altered by the merge
        representative = copy.deepcopy(members[0])
        averaged = np.mean([member.probabilities for member in members], axis = 0)
        representative.id = key
        representative.probabilities = averaged
        representative.encoded_probs = encode_probabilities(averaged, id_thresh, excl_thresh)
        merged.append(representative)
    return merged, condid2ionids

In [None]:
#export
def scale_site_idxs_to_protein(protseq, pepseq, localization_array):
    """
    Shift peptide-level site indices to the protein level.

    The peptide is located in the protein with ``str.find`` and the resulting
    start index is added to every entry of localization_array. If the peptide
    does not occur in the protein, ``find`` yields -1 and the indices are
    shifted by -1. A new array is returned; the input is not modified.
    """
    peptide_offset = protseq.find(pepseq)
    return localization_array + peptide_offset

In [None]:
#export
def get_num_sites(probabilities_parsed):
    """Return the number of modification sites, taken as the rounded sum of the localization probabilities."""
    total_probability = sum(probabilities_parsed)
    return round(total_probability)


In [None]:
#export
def group_by_nummods_posv(ions):
    """
    Group ions that share both the number of modifications and the position vector.

    The grouping key is the string formed from ``num_sites`` and the printed form
    of ``positions``. Groups are returned as a list of lists, in order of first
    appearance, with ions kept in input order inside each group.
    """
    groups = {}
    for ion in ions:
        group_key = f"{ion.num_sites}_{ion.positions}"
        groups.setdefault(group_key, []).append(ion)
    return [members for members in groups.values()]

In [None]:
#export
def condense_ions(ions):
    """
    Collapse ions with identical sequence and identical encoded probabilities.

    Only one representative per equivalence group (the first ion encountered)
    then has to enter the pairwise distance matrix.

    Returns:
        (representative_ions, ion2equiv_ions): the list of representatives and a
        dict mapping each representative to the full list of its equivalent ions
        (the representative itself is the first entry of that list).
    """
    equivalence_groups = {}
    for ion in ions:
        equivalence_groups.setdefault(f"{ion.seq}_{ion.encoded_probs}", []).append(ion)

    ion2equiv_ions = {}
    for members in equivalence_groups.values():
        ion2equiv_ions[members[0]] = members
    return list(ion2equiv_ions), ion2equiv_ions

In [None]:
#export
def encode_probabilities(probabilties_parsed, id_thresh, excl_thresh):
    """
    Encode localization probabilities as integer occupancy codes.

    Working on a copy of the input: entries above id_thresh become 5, entries
    below excl_thresh become 3, and every entry that is then neither 3 nor 5
    becomes 0. The result is cast to integers; the input is left untouched.
    """
    encoded = probabilties_parsed.copy()

    above_id_thresh = encoded > id_thresh
    encoded[above_id_thresh] = 5

    below_excl_thresh = encoded < excl_thresh
    encoded[below_excl_thresh] = 3

    # everything not flagged with one of the two codes is treated as undecided
    undecided = ~((encoded == 3) | (encoded == 5))
    encoded[undecided] = 0

    return encoded.astype('int')


## Compare and cluster ions

In [None]:
def compare_ion_similarities(ions):
    """
    Return the condensed distance matrix for the given ions.

    Sequences and encoded probabilities are collected from the ions and handed
    to get_condensed_matrix. Note: a function with the same name is defined
    again further down in this file; when the cells run in order, that later
    definition replaces this one.
    """
    sequence_list = []
    encoding_list = []
    for ion in ions:
        sequence_list.append(ion.seq)
        encoding_list.append(ion.encoded_probs)
    return get_condensed_matrix(np.array(sequence_list), np.array(encoding_list))

In [None]:
#export

def cluster_ions(ions):
    """
    Cluster ions into groups that are assumed to describe the same PTM site(s).

    Ions are first grouped by number of modifications and position vector.
    Groups with a single ion, or whose ions all condense to one representative,
    are taken as-is. Otherwise the condensed representatives are clustered
    pairwise and every cluster is expanded back to all equivalent ions.

    Returns:
        A list of clusters, each cluster being a list of ions. Every input ion
        appears exactly once in the result.
    """
    res = []
    for candidates in group_by_nummods_posv(ions):
        if len(candidates) == 1:  # only one ion, no pairwise comparison needed
            res.append(candidates)
            continue

        representative_ions, ion2equiv_ions = condense_ions(candidates)
        if len(representative_ions) == 1:  # only one condensed ion, also no pairwise comparison needed
            res.append(ion2equiv_ions[representative_ions[0]])
            continue

        # multiple distinct ions: compare pairwise, then map representatives back to their equivalents
        for cluster in cluster_ions_pairwise(representative_ions):
            # a representative is itself the first entry of its equivalence list,
            # so only the remaining entries are added; otherwise it would appear
            # twice in the cluster and be double-counted downstream
            additional_ions = [ion for representative in cluster for ion in ion2equiv_ions[representative][1:]]
            res.append(list(cluster) + additional_ions)

    return res

In [None]:
#export

import scipy.cluster.hierarchy as hierarchy

def cluster_ions_pairwise(ions):
    """
    Form complete-linkage clusters for a set of ions (every ion in a cluster is a
    neighbor of every other ion in that cluster).

    The distance matrix is defined in compare_ion_similarities. The ions list is
    sorted in place by decreasing sequence length before distances are computed.

    Args:
        ions: list of ions with a ``seq`` attribute (and whatever
            compare_ion_similarities reads).

    Returns:
        A list of clusters (lists of ions). An empty input gives an empty list
        and a single ion gives one cluster containing that ion.
    """
    ions.sort(key = lambda x : len(x.seq), reverse = True)

    # with fewer than two ions there are no pairwise distances to link,
    # and the hierarchical clustering cannot run on an empty distance matrix
    if len(ions) < 2:
        return [list(ions)] if ions else []

    condensed_distance_matrix = compare_ion_similarities(ions)
    linkage = hierarchy.complete(condensed_distance_matrix)
    # distances are 0 or 1, so cutting at 0.1 keeps only zero-distance merges together
    cluster_labels = hierarchy.fcluster(linkage, 0.1, criterion='distance')

    clust2ions = {}
    for label, ion in zip(cluster_labels, ions):
        clust2ions.setdefault(label, []).append(ion)

    return list(clust2ions.values())

In [None]:
#export

def compare_ion_similarities(ions):
    """
    Build the condensed distance matrix for a set of ions.

    The distances are derived from the ions' sequences and encoded site
    localization probabilities by get_condensed_matrix.
    """
    ion_sequences = np.array([ion.seq for ion in ions])
    ion_encodings = np.array([ion.encoded_probs for ion in ions])
    return get_condensed_matrix(ion_sequences, ion_encodings)


In [None]:
#export
def get_condensed_matrix(seqs, encoded):
    """
    Compute the condensed (upper-triangle, row-major) 0/1 distance vector for all ion pairs.

    Occupancy encoding: 3 == clearly not occupied, 5 == clearly occupied. For a
    pair (i, j) with i < j, the distance is 1 if the sequence of j is contained
    in the sequence of i and the element-wise sum of both encodings contains
    3 + 5 = 8 (one ion says occupied where the other says not occupied). In all
    other cases the distance is 0.
    """
    n_ions = len(seqs)
    distances = np.zeros(n_ions * (n_ions - 1) // 2)

    # index pairs of the strict upper triangle, in the same order as a nested i < j loop
    first_idxs, second_idxs = np.triu_indices(n_ions, k=1)
    for slot, (i, j) in enumerate(zip(first_idxs, second_idxs)):
        if seqs[j] not in seqs[i]:
            continue
        if np.any((encoded[i] + encoded[j]) == 8):
            distances[slot] = 1
    return distances

## Read and reformat input files

In [None]:
#export
def get_idmap_column(protgroups, swissprots):
    """
    Map every protein group to a single reference protein ID.

    Each protein group is a ';'-separated string of IDs. The first ID that is
    contained in swissprots is chosen; if none is, the first ID of the group is
    used. Returns one ID per protein group, in input order.
    """
    reference_ids = []
    for protgroup in protgroups:
        members = protgroup.split(";")
        chosen = next((member for member in members if member in swissprots), members[0])
        reference_ids.append(chosen)
    return reference_ids

In [None]:
#export
import pandas as pd
def get_site_prob_overview(modpeps, refprot, refgene):
    """
    Reformat the modified peptide objects of one protein into per-site series.

    For every site (position) seen in modpeps, the localization probabilities
    are collected per experimental sample and averaged.

    Args:
        modpeps: objects with ``positions``, ``probabilities`` (parallel
            sequences) and ``sample`` attributes.
        refprot: reference protein ID stored under the "REFPROT" label.
        refgene: gene name stored under the "gene" label.

    Returns:
        A list with one pandas Series per site. Each series holds the mean
        probability per sample (indexed by sample name) followed by the entries
        "site", "REFPROT" and "gene".
    """
    site2sample2probs = {}
    for modpep in modpeps:
        for idx, site in enumerate(modpep.positions):
            prob = modpep.probabilities[idx]
            site2sample2probs.setdefault(site, {}).setdefault(modpep.sample, []).append(prob)

    series_collected = []
    for site, sample2probs in site2sample2probs.items():
        samples = list(sample2probs)
        mean_probs = [np.mean(sample2probs[sample]) for sample in samples]
        # the series is built in one go because Series.append is no longer available in pandas >= 2.0
        site_series = pd.Series(
            mean_probs + [int(site), refprot, refgene],
            index=samples + ["site", "REFPROT", "gene"],
            dtype=object,
        )
        series_collected.append(site_series)

    return series_collected

# Workflow

## Assign all ions for a given protein

In [None]:
#export
import pandas as pd
import numpy as np

def assign_protein(modpeps,condid2ionids, refprot):
    """
    Map every ion of one protein to a PTM site identifier.

    The (condition-level) ions are clustered unless there is only one. For each
    cluster the probabilities of its members are summed and the ``num_sites``
    positions with the highest summed probability are selected, where
    ``num_sites`` and the position vector are taken from the first member.

    Returns:
        (id2groupid, id2normedid): both dicts are keyed by the original ion ids
        looked up through condid2ionids. id2groupid maps to the sorted array of
        selected protein positions; id2normedid maps to a string made of the
        reference protein and the positions minus the first member's start index.
    """
    grouped_ions = [modpeps] if len(modpeps) == 1 else cluster_ions(modpeps)

    id2groupid = {}
    id2normedid = {}
    for group in grouped_ions:
        reference = group[0]
        n_sites = reference.num_sites
        summed_probs = sum(member.probabilities for member in group)
        # NOTE(review): for n_sites == 0 the slice [-0:] keeps every index, i.e. all positions are selected
        top_idxs = np.argpartition(summed_probs, -n_sites)[-n_sites:]
        positions = np.sort(reference.positions[top_idxs])
        normed_label = f"{refprot}_{positions-reference.start_idx}"

        # the condition-level merged ions are mapped back to the existing ion-level ids
        for member in group:
            for ion_id in condid2ionids.get(member.id):
                id2groupid[ion_id] = positions
                id2normedid[ion_id] = normed_label

    return id2groupid, id2normedid

## Iterate through dataset

In [None]:
#export

def assign_dataset(ptmprob_file, id_thresh = 0.75, excl_thresh =0.15, samplemap = 'samples.map',swissprot_file = 'swissprot_mapping.tsv', sequence_file='uniprot_mapping.tsv', modification_type = "[Phospho (STY)]",sep = "\t"):
    """
    Wrapper that reformats the input tables and iterates through the whole dataset.

    Args:
        ptmprob_file: path of the PTM probability table; it has to contain the
            columns "R.Label", "FG.Id", "PEP.GroupingKey", "PG.UniProtIds" and the
            "EG.PTMPositions ..." / "EG.PTMProbabilities ..." columns of modification_type.
        id_thresh, excl_thresh: thresholds passed on to the probability encoding.
        samplemap: sample map passed to initialize_sample2cond.
        swissprot_file: tab-separated table with an "Entry" column.
        sequence_file: tab-separated table with "Entry", "Sequence" and "Gene names" columns.
        modification_type: modification label used in the PTM column names.
        sep: column separator of ptmprob_file.

    Side effects:
        Writes "<ptmprob_file>.ptm_ids" (one row per input ion with its assigned
        site) and "<ptmprob_file>.siteprobs" (mean site probabilities per sample),
        both tab-separated, and prints progress messages. Returns None.
        Proteins without a usable sequence in sequence_file are skipped.
    """
    positions_col = f"EG.PTMPositions {modification_type}"
    probabilities_col = f"EG.PTMProbabilities {modification_type}"

    input_df = pd.read_csv(ptmprob_file, sep = sep).drop_duplicates()
    _,sample2cond = initialize_sample2cond(samplemap)
    len_before = len(input_df.index)
    # copy, so that the columns added below are set on an independent frame and not on a filtered slice
    input_df = input_df[~input_df[probabilities_col].isna()].copy()
    print(f"filtered PTM peptides from {len_before} to {len(input_df.index)}")
    swissprot_ids = set(pd.read_csv(swissprot_file, sep = "\t", usecols = ["Entry"])["Entry"])
    sequence_df = pd.read_csv(sequence_file, sep = "\t", usecols = ["Entry", "Sequence", "Gene names"])
    # the sequence map is built before dropna, so entries without gene names keep their sequence
    sequence_map = dict(zip(sequence_df["Entry"], sequence_df["Sequence"]))
    sequence_df = sequence_df.dropna()

    refgene_map = dict(zip(sequence_df["Entry"], [x.split(" ")[0] for x in sequence_df["Gene names"]]))

    input_df["REFPROT"] = get_idmap_column(input_df["PG.UniProtIds"],swissprot_ids)
    input_df["IonID"] = input_df["R.Label"] + input_df['FG.Id']
    input_df = input_df.set_index("REFPROT").sort_index()

    site_ids = []
    fg_ids = []
    run_ids = []
    prot_ids = []
    gene_ids = []
    ptmlocs = []
    locprobs = []
    siteprobs = []

    num_rows = len(input_df.index)
    count_peps = 0
    # at least 1, otherwise the modulo below divides by zero for tables with fewer than 100 rows
    one_fraction = max(1, num_rows // 100)
    for prot in input_df.index.unique():

        if count_peps%one_fraction==0:
            print(f"assigned {count_peps} of {num_rows} {count_peps/num_rows}")

        protein_df = input_df.loc[[prot]].copy()
        protein_df = protein_df.reset_index()

        sequence = sequence_map.get(prot)
        # skip proteins that are missing from the sequence table or have an empty (NaN) sequence entry
        if not isinstance(sequence, str):
            continue
        gene = refgene_map.get(prot)
        count_peps+= len(protein_df)

        # modification_type is forwarded so that non-default modifications are parsed from the right columns
        modpeps_per_sample = [
            ModifiedPeptide(protein_df.loc[x], sequence, id_thresh, excl_thresh, modification_type = modification_type)
            for x in protein_df.index
        ]
        merged_siteprobs = get_site_prob_overview(modpeps_per_sample, prot, gene)
        siteprobs.extend(merged_siteprobs)
        # all ions coming from the same condition are merged
        modpeps, condid2ionids = merge_samecond_modpeps(modpeps_per_sample, sample2cond, id_thresh, excl_thresh)
        # after clustering, conditions are mapped back to the original run
        ionid2ptmid,_ = assign_protein(modpeps, condid2ionids, prot)

        ptm_ids_prot = [ionid2ptmid.get(x) for x in protein_df["IonID"]]

        ptmlocs.extend(protein_df[positions_col].tolist())
        locprobs.extend(protein_df[probabilities_col].tolist())
        site_ids.extend(ptm_ids_prot)
        fg_ids.extend(protein_df["FG.Id"].tolist())
        run_ids.extend(protein_df["R.Label"].tolist())
        prot_ids.extend([prot] * len(ptm_ids_prot))
        gene_ids.extend([gene] * len(ptm_ids_prot))

    conditions = [sample2cond.get(x) for x in run_ids]
    mapped_df = pd.DataFrame({"R.Label" : run_ids, "conditions" : conditions, "FG.Id" : fg_ids, "REFPROT" : prot_ids, "gene" : gene_ids,"site" : site_ids, "ptmlocs":ptmlocs ,"locprob" : locprobs})
    mapped_df.to_csv(f"{ptmprob_file}.ptm_ids", sep = "\t", index = None)

    if siteprobs:
        siteprob_df = pd.DataFrame(siteprobs)
    else:
        # nothing could be assigned: still write a well-formed (empty) table
        siteprob_df = pd.DataFrame(columns=["REFPROT", "site", "gene"])
    siteprob_df = siteprob_df.astype({"site" : "int"})
    siteprob_df = siteprob_df.set_index(["REFPROT", "site"]).sort_index().reset_index()
    siteprob_df.to_csv(f"{ptmprob_file}.siteprobs", sep = "\t", index = None)


In [None]:
#export
import pandas as pd
import numpy as np

def detect_site_occupancy_change(cond1, cond2, samplemap_file, ptmsite_map, minrep = 2, threshold_prob = 0.05):
    """
    Detect PTM sites that appear or disappear between two conditions.

    Reads a tab-separated PTM site table with the headers "REFPROT", "gene",
    "site" and one column per sample. For every site the non-missing
    probabilities of each condition are averaged; a site is reported when its
    mean is below threshold_prob in one condition and above 1 - threshold_prob
    in the other. Sites with fewer than minrep values in either condition are
    skipped.

    Returns:
        A DataFrame with the columns "REFPROT", "gene", "site", "direction",
        "c1_meanprob", "c2_meanprob", "c1_nrep", "c2_nrep". direction is 1 when
        the site is likely in cond1 and unlikely in cond2, and -1 for the
        opposite case.
    """
    samplemap_df, _ = initialize_sample2cond(samplemap_file)
    ptmsite_df = pd.read_csv(ptmsite_map, sep = "\t")
    ptmsite_df["site_id"] = ptmsite_df["REFPROT"] + ptmsite_df["site"].astype("str")
    ptmsite_df = ptmsite_df.set_index("site_id").sort_index()

    def samples_in_table(condition):
        # samples of the condition that are present as columns of the site table
        condition_samples = samplemap_df[(samplemap_df["condition"]==condition)]["sample"]
        return list(set(condition_samples).intersection(set(ptmsite_df.columns)))

    def observed_values(frame, samples):
        # all non-missing probabilities of the given sample columns
        values = frame[samples].to_numpy()
        return values[~np.isnan(values)]

    cond1_samples = samples_in_table(cond1)
    cond2_samples = samples_in_table(cond2)

    regulated_sites = []
    for ptmsite in ptmsite_df.index.unique():
        site_df = ptmsite_df.loc[[ptmsite]]

        cond1_vals = observed_values(site_df, cond1_samples)
        cond2_vals = observed_values(site_df, cond2_samples)
        numrep_c1 = len(cond1_vals)
        numrep_c2 = len(cond2_vals)
        if numrep_c1 < minrep or numrep_c2 < minrep:
            continue

        cond1_prob = np.mean(cond1_vals)
        cond2_prob = np.mean(cond2_vals)

        likely_c1_only = (cond2_prob<threshold_prob) and (cond1_prob>1-threshold_prob)
        likely_c2_only = (cond1_prob<threshold_prob) and (cond2_prob>1-threshold_prob)
        if likely_c1_only:
            direction = 1
        elif likely_c2_only:
            direction = -1
        else:
            continue

        regulated_sites.append([
            site_df["REFPROT"].values[0],
            site_df["gene"].values[0],
            site_df["site"].values[0],
            direction,
            cond1_prob,
            cond2_prob,
            numrep_c1,
            numrep_c2,
        ])

    result_columns = ["REFPROT", "gene", "site", "direction", "c1_meanprob", "c2_meanprob", "c1_nrep", "c2_nrep"]
    return pd.DataFrame(regulated_sites, columns=result_columns)



