In [None]:
# default_exp ptmsite_mapping

In [None]:
import sys
# NOTE(review): hard-coded absolute path to one developer's local checkout; on any
# other machine this entry does not exist and the ms_empire imports below only work
# if the package is importable by other means (e.g. installed) — TODO confirm/remove.
sys.path.append('/Users/constantin/workspace/EmpiRe/MS-EmpiRe_Python/')
# Wildcard imports of the ms_empire project modules. Names used later in this file
# without a local definition (e.g. initialize_sample2cond) presumably come from one
# of these modules; which one is not visible from here.
from ms_empire.background_distributions import *
from ms_empire.normalization import *
from ms_empire.diff_analysis import *
from ms_empire.visualizations import *
from ms_empire.benchmarking import *
from ms_empire.diffquant_utils import *

In [None]:
import pandas as pd
import numpy as np

def detect_site_occupancy_change(cond1, cond2, samplemap_file, ptmsite_map, minrep = 2, threshold_prob = 0.05):
    """
    Detect PTM sites that appear or disappear between two conditions.

    Reads a tab-separated PTM site table with the columns "REFPROT", "gene" and "site"
    plus one column per sample, and compares the mean per-sample value of every site
    between the two conditions. The sample values are presumably probabilities in
    [0, 1] (the thresholds below only make sense for that range) -- verify against the
    code that produces the table.

    Args:
        cond1: name of the first condition, as found in the "condition" column of the sample map.
        cond2: name of the second condition.
        samplemap_file: passed to ``initialize_sample2cond``; the first element of its
            return value must be a DataFrame with the columns "sample" and "condition".
        ptmsite_map: path (or file-like object) of the tab-separated PTM site table.
        minrep: minimum number of non-NaN values a site needs in *each* condition.
            Values are pooled over all rows that share the same (REFPROT, site) pair.
        threshold_prob: a condition counts as "unlikely" if its mean is below this
            value and as "likely" if its mean is above ``1 - threshold_prob``.

    Returns:
        A DataFrame with one row per regulated site and the columns "REFPROT", "gene",
        "site", "direction", "c1_meanprob", "c2_meanprob", "c1_nrep", "c2_nrep".
        ``direction`` is 1 if the site is likely in cond1 and unlikely in cond2, and
        -1 if it is unlikely in cond1 and likely in cond2. Sites that are neither, or
        that have too few values, are omitted.
    """
    samplemap_df, _ = initialize_sample2cond(samplemap_file)
    ptmsite_df = pd.read_csv(ptmsite_map, sep = "\t")

    # Join protein and site with a separator. Plain concatenation lets distinct sites
    # collide (e.g. "P1" + "23" and "P12" + "3" both give "P123"), which would merge
    # their rows. A tab is used because fields of a tab-separated file normally cannot
    # contain one.
    ptmsite_df["site_id"] = ptmsite_df["REFPROT"] + "\t" + ptmsite_df["site"].astype("str")
    ptmsite_df = ptmsite_df.set_index("site_id").sort_index()

    # Select the sample columns in table order rather than in set-iteration order, so
    # that the floating-point summation order (and thus the means) is reproducible
    # from run to run.
    cond1_names = set(samplemap_df[(samplemap_df["condition"]==cond1)]["sample"])
    cond2_names = set(samplemap_df[(samplemap_df["condition"]==cond2)]["sample"])
    cond1_samples = [col for col in ptmsite_df.columns if col in cond1_names]
    cond2_samples = [col for col in ptmsite_df.columns if col in cond2_names]

    # Never average an empty set of values, even if the caller passes minrep <= 0.
    required_reps = max(minrep, 1)

    regulated_sites = []
    for ptmsite in ptmsite_df.index.unique():
        # List indexer: always get a DataFrame back, also for a single matching row.
        site_df = ptmsite_df.loc[[ptmsite]]

        cond1_vals = site_df[cond1_samples].to_numpy()
        cond2_vals = site_df[cond2_samples].to_numpy()

        # Boolean masking flattens the values and drops missing measurements.
        cond1_vals = cond1_vals[~np.isnan(cond1_vals)]
        cond2_vals = cond2_vals[~np.isnan(cond2_vals)]

        numrep_c1 = len(cond1_vals)
        numrep_c2 = len(cond2_vals)

        if numrep_c1 < required_reps or numrep_c2 < required_reps:
            continue

        cond1_prob = np.mean(cond1_vals)
        cond2_prob = np.mean(cond2_vals)

        unlikely_c1 = cond1_prob < threshold_prob
        unlikely_c2 = cond2_prob < threshold_prob
        likely_c1 = cond1_prob > 1 - threshold_prob
        likely_c2 = cond2_prob > 1 - threshold_prob

        # The "likely in cond1" case is checked first so that it takes precedence if
        # both cases hold (only possible for threshold_prob > 0.5).
        if unlikely_c2 and likely_c1:
            direction = 1
        elif unlikely_c1 and likely_c2:
            direction = -1
        else:
            continue

        refprot = site_df["REFPROT"].values[0]
        gene = site_df["gene"].values[0]
        site = site_df["site"].values[0]
        regulated_sites.append([refprot, gene, site, direction, cond1_prob, cond2_prob, numrep_c1, numrep_c2])

    df_occupancy_change = pd.DataFrame(regulated_sites, columns=["REFPROT", "gene", "site", "direction", "c1_meanprob", "c2_meanprob", "c1_nrep", "c2_nrep"])
    return df_occupancy_change




