In [None]:
# default_exp diff_analysis_manager

In [None]:
#export

class RunConfigOfRunPipeline:
    """Attribute-style container for the parameters given to the "run_pipeline" function.

    Every key of the supplied mapping becomes an instance attribute of the same
    name, so a parameter can be read as ``runconfig.<parameter_name>``.
    """
    def __init__(self, locals):
        # The parameter name shadows the builtin ``locals``; it is kept because it
        # is part of the public signature.
        for attribute_name, attribute_value in locals.items():
            setattr(self, attribute_name, attribute_value)


In [None]:
#export
import pandas as pd
from itertools import combinations
import numpy as np
import statsmodels.stats.multitest as mt
from time import time
import alphaquant.diffquant_utils as aqutils
import alphaquant.visualizations as aqviz

import alphaquant.ptmsite_mapping as aqptm
import multiprocess


def run_pipeline(*,input_file = None, samplemap_file=None, samplemap_df = None, modification_type = None, input_type_to_use = None,results_dir = "./results", condpair_combinations = None, minrep = 2, 
min_num_ions = 1, minpep = 1, cluster_threshold_pval = 0.05, cluster_threshold_fcfc = 0, use_ml = True, take_median_ion = True,outlier_correction = True, normalize = True,
use_iontree_if_possible = None, write_out_results_tree = True, get_ion2clust = False, median_offset = False, pre_normed_intensity_file = None, dia_fragment_selection = False, use_multiprocessing = False,runtime_plots = False, volcano_fdr =0.05, volcano_fcthresh = 0.5, 
annotation_file = None, protein_subset_for_normalization_file = None):

    """Run the differential analyses for every requested pair of conditions.

    All arguments are keyword-only. The ones interpreted directly in this function are:

    input_file: quantification table to analyse (required).
    samplemap_file / samplemap_df: sample map, given either as a file or as a dataframe
        (at least one is required). The dataframe takes precedence when both are given.
    modification_type: if set, the input is first mapped to PTM sites and the mapped
        table replaces ``input_file`` for the remainder of the run.
    results_dir: directory handed to the parameter store and to the per-pair analysis.
    condpair_combinations: iterable of condition pairs to compare. When None, every
        2-combination of the unique values in the sample map's "condition" column is used.
    use_multiprocessing: when True, the condition pairs are distributed over
        ``multiprocess.cpu_count()`` worker processes.

    The remaining arguments are stored on the run configuration object and consumed
    by "analyze_condpair".

    Raises:
        Exception: if the inputs are inconsistent (see "check_input_consistency").
    """
    
    check_input_consistency(input_file, samplemap_file, samplemap_df)

    if samplemap_df is None:
        samplemap_df = aqutils.load_samplemap(samplemap_file)
    
    if (input_file is not None) and (modification_type is not None):
        input_file = write_ptm_mapped_input(input_file=input_file, results_dir=results_dir, samplemap_df=samplemap_df, modification_type=modification_type)

    # Use the runconfig object to store the parameters. locals() is a snapshot of the
    # names bound so far, so no helper variables may be introduced above this line.
    runconfig = RunConfigOfRunPipeline(locals()) #all the parameters given into the function are transfered to the runconfig object!
    runconfig.use_iontree_if_possible = determine_if_ion_tree_is_used(runconfig)

    #store method parameters for reproducibility
    aqutils.store_method_parameters(locals(), results_dir)

    # Identity check instead of "== None": an array-like or dataframe argument would
    # be compared elementwise and fail when used as a condition.
    if condpair_combinations is None:
        conds = samplemap_df["condition"].unique()
        condpair_combinations = combinations(conds, 2)
    
    num_cores = multiprocess.cpu_count() if use_multiprocessing else 1

    if num_cores == 1:
        run_analysis_singleprocess(condpair_combinations=condpair_combinations, runconfig=runconfig)
    else:
        run_analysis_multiprocess(condpair_combinations=condpair_combinations, runconfig=runconfig, num_cores=num_cores)
    

def run_analysis_singleprocess(condpair_combinations, runconfig):
    """Analyse the given condition pairs one after the other in the current process.

    The return values of the individual analyses are discarded.
    """
    for current_condpair in condpair_combinations:
        analyze_condpair(condpair=current_condpair, runconfig=runconfig)

def run_analysis_multiprocess(condpair_combinations, runconfig, num_cores):
    """Analyse the given condition pairs in a pool of ``num_cores`` worker processes.

    The return values of the individual analyses are discarded.
    """
    def analyze_single_condpair(condpair):
        # binds the shared run configuration so that the pool only has to supply the pair
        return analyze_condpair(runconfig=runconfig, condpair=condpair)

    with multiprocess.Pool(num_cores) as pool:
        pool.map(analyze_single_condpair, condpair_combinations)




In [None]:
#export
import alphaquant.ptmsite_mapping as aqptm

def write_ptm_mapped_input(input_file, results_dir, samplemap_df, modification_type):
    """Map the input table to PTM sites and return the file produced by the merge step.

    The site assignment is first attempted in memory. If that attempt raises an
    exception, the chunkwise assignment is run instead. Afterwards the mapping
    table "<results_dir>/ptm_ids.tsv" is read and merged with the input via
    ``aqptm.merge_ptmsite_mappings_write_table``, whose return value is returned.
    """
    ptm_arguments = dict(input_file = input_file, results_dir=results_dir, samplemap_df=samplemap_df, modification_type=modification_type)
    try:
        aqptm.assign_dataset_inmemory(**ptm_arguments)
    except Exception:
        # Only ordinary errors (including MemoryError) trigger the fallback.
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        aqptm.assign_dataset_chunkwise(**ptm_arguments)
    mapped_df = pd.read_csv(f"{results_dir}/ptm_ids.tsv", sep = "\t")
    ptm_mapped_file = aqptm.merge_ptmsite_mappings_write_table(input_file, mapped_df, modification_type)
    return ptm_mapped_file


In [None]:
#export

def check_input_consistency(input_file, samplemap_file, samplemap_df):
    """Validate the minimal set of inputs needed to run the pipeline.

    Returns True when an input file is given together with a sample map
    (either as file or as dataframe).

    Raises:
        Exception: if the input file is missing, or if neither form of the
            sample map is given.
    """
    if input_file is None:
        raise Exception("no input file!")

    samplemap_available = (samplemap_file is not None) or (samplemap_df is not None)
    if not samplemap_available:
        raise Exception("inputs inconsistent! Either file or dataframe needs to be specified!")

    return True

In [None]:
#export
import alphaquant.diffquant_utils as aqutils
def get_unnormed_df_condpair(input_file:str, samplemap_df:pd.DataFrame, condpair:str) -> pd.DataFrame:
    """Load the intensity table restricted to the samples of one condition pair.

    The samples belonging to ``condpair[0]`` and ``condpair[1]`` are looked up in
    the sample map, only those samples are imported from ``input_file``, and the
    loaded table is passed through ``aqutils.prepare_loaded_tables``.

    NOTE(review): ``condpair`` is annotated as ``str`` but is indexed like a pair
    of condition names - confirm the intended type.
    """
    first_condition, second_condition = condpair[0], condpair[1]
    samples_per_condition = aqutils.get_samples_used_from_samplemap_df(samplemap_df=samplemap_df, cond1 = first_condition, cond2 = second_condition)
    samples_c1, samples_c2 = samples_per_condition

    loaded_df = aqutils.import_data(input_file, samples_subset=samples_c1 + samples_c2)
    prepared_df, _ = aqutils.prepare_loaded_tables(loaded_df, samplemap_df)
    return prepared_df

In [None]:
#export
def get_per_condition_dataframes(samples_c1, samples_c2, unnormed_df, minrep):
    """Split the intensity table into one row-filtered dataframe per condition.

    Args:
        samples_c1, samples_c2: column names of the samples of condition 1 and 2.
        unnormed_df: table containing (at least) the sample columns.
        minrep: minimum number of non-NaN values a row needs within a condition.
            The value is capped at the number of samples of the condition.
            None means that no NaN is allowed at all.

    Returns:
        Tuple ``(df_c1, df_c2)`` with the rows that pass the filter in each condition.

    Raises:
        Exception: if one condition has fewer than 2 samples, or if fewer than
            5 rows remain in one of the conditions.
    """
    min_samples = min(len(samples_c1), len(samples_c2))

    if min_samples<2:
        raise Exception(f"condpair has not enough samples: c1:{len(samples_c1)} c2: {len(samples_c2)}, skipping")

    df_c1 = _drop_rows_below_minrep(unnormed_df.loc[:, samples_c1], get_minrep_for_cond(samples_c1, minrep))
    df_c2 = _drop_rows_below_minrep(unnormed_df.loc[:, samples_c2], get_minrep_for_cond(samples_c2, minrep))
    if (len(df_c1.index)<5) or (len(df_c2.index)<5):
        raise Exception(f"condpair has not enough data for processing c1: {len(df_c1.index)} c2: {len(df_c2.index)}, skipping")
        
    return df_c1, df_c2

def _drop_rows_below_minrep(cond_df, minrep_cond):
    """Drop the rows of cond_df with fewer than minrep_cond non-NaN values (None: drop every row containing a NaN)."""
    if minrep_cond is None:
        # Recent pandas versions no longer accept thresh=None, so the
        # "no NaN allowed" case is expressed through the default how="any".
        return cond_df.dropna(axis=0)
    return cond_df.dropna(thresh=minrep_cond, axis=0)

def get_minrep_for_cond(c_samples, minrep):
    """Return the minrep to use for one condition: minrep capped at the number of samples, or None if minrep is None."""
    if minrep is None: #in the case of None, no nans will be allowed
        return None
    return min(minrep, len(c_samples))



In [None]:
#export
import alphaquant.diffquant_utils as aqutils
def determine_if_ion_tree_is_used(runconfig):
    """Decide whether the ion tree is used for this run.

    An explicit (non-None) ``runconfig.use_iontree_if_possible`` always wins.
    Otherwise the "use_iontree" entry of the configuration dictionary belonging
    to ``runconfig.input_file`` is returned (None if that entry is missing).
    """
    if runconfig.use_iontree_if_possible is None:
        _, config_dict, _ =  aqutils.get_input_type_and_config_dict(runconfig.input_file)
        return config_dict.get("use_iontree")

    return runconfig.use_iontree_if_possible




In [None]:
#export
import alphaquant.background_distributions as aqbg
import alphaquant.diff_analysis as aqdiff
import alphaquant.normalization as aqnorm
import alphaquant.visualizations as aqviz
import alphaquant.diffquant_utils as aqutils
import alphaquant.cluster_ions as aqclust
import alphaquant.classify_ions as aqclass
import anytree

def analyze_condpair(*,runconfig, condpair):
    """Run the differential analysis for one pair of conditions.

    Steps, in order: load the samples of the pair, filter rows per condition,
    normalize, build the per-condition backgrounds, test every ion present in both
    conditions, aggregate ions to proteins (optionally via the ion tree and the ML
    score), add FDR values and, if ``runconfig.results_dir`` is set, write the
    result tables.

    Args:
        runconfig: object carrying the "run_pipeline" parameters as attributes.
        condpair: pair of condition names, accessed as ``condpair[0]`` and ``condpair[1]``.

    Returns:
        ``(res_df, pep_df)``: protein-level and ion-level result dataframes, or None
        when the pair is skipped (not enough samples/data, or no ion present in
        both conditions).
    """
    print(f"start processeing condpair {condpair}")
    prot2diffions = {}
    p2z = {}
    ion2clust = {}
    protnodes = []
    quantified_peptides = []
    quantified_proteins = []

    input_df_local = get_unnormed_df_condpair(input_file=runconfig.input_file, samplemap_df=runconfig.samplemap_df, condpair=condpair)
    pep2prot = dict(zip(input_df_local.index, input_df_local['protein']))
    c1_samples, c2_samples = aqutils.get_samples_used_from_samplemap_df(runconfig.samplemap_df, condpair[0], condpair[1])

    try:
        df_c1, df_c2 = get_per_condition_dataframes(c1_samples, c2_samples, input_df_local, runconfig.minrep)
    except Exception as error:
        # The pair is skipped as before, but the reason is reported instead of being
        # silently dropped, and KeyboardInterrupt/SystemExit are no longer caught.
        print(error)
        return

    df_c1_normed, df_c2_normed = aqnorm.normalize_if_specified(
        df_c1 = df_c1,
        df_c2 = df_c2,
        c1_samples = c1_samples,
        c2_samples = c2_samples,
        minrep = runconfig.minrep,
        normalize_within_conds = runconfig.normalize,
        normalize_between_conds = runconfig.normalize,
        runtime_plots = runconfig.runtime_plots,
        protein_subset_for_normalization_file=runconfig.protein_subset_for_normalization_file,
        pep2prot = pep2prot,
        prenormed_file = runconfig.pre_normed_intensity_file,
    )

    if runconfig.results_dir is not None:
        write_out_normed_df(df_c1_normed,df_c2_normed, pep2prot, runconfig.results_dir, condpair)

    normed_c1 = aqbg.ConditionBackgrounds(df_c1_normed, p2z)
    normed_c2 = aqbg.ConditionBackgrounds(df_c2_normed, p2z)

    ions_to_check = normed_c1.ion2nonNanvals.keys() & normed_c2.ion2nonNanvals.keys()
    if len(ions_to_check) == 0:
        # previously this case failed with an IndexError when inspecting the first ion
        print(f"condpair {condpair} has no ions quantified in both conditions, skipping")
        return

    # The ion tree is only used when the ion names carry the "SEQ_" prefix and the
    # run configuration asks for it. bool() keeps an unresolved (None) setting from
    # raising a TypeError.
    first_ion = next(iter(ions_to_check))
    use_ion_tree = first_ion.startswith("SEQ_") and bool(runconfig.use_iontree_if_possible)

    bgpair2diffDist = {}
    deedpair2doublediffdist = {}
    condpairname = aqutils.get_condpairname(condpair)

    for count_ions, ion in enumerate(ions_to_check):
        vals1 = normed_c1.ion2nonNanvals.get(ion)
        vals2 = normed_c2.ion2nonNanvals.get(ion)
        diffDist = aqbg.get_subtracted_bg(bgpair2diffDist,normed_c1, normed_c2,ion, p2z)
        diffIon = aqdiff.DifferentialIon(vals1, vals2, diffDist, ion, runconfig.outlier_correction)
        protein = pep2prot.get(ion)
        prot2diffions.setdefault(protein, []).append(diffIon)
        quantified_peptide = QuantifiedResult(kwargs = {'ion' : ion, 'pval' : diffIon.p_val, 'log2fc' : diffIon.fc, 'protein' : protein, 'condpair' : condpairname})
        quantified_peptides.append(quantified_peptide)

        if count_ions%2000==0:
            print(f"checked {count_ions} of {len(ions_to_check)} ions")

    count_prots = 0
    for prot, ions in prot2diffions.items():
        if len(ions)<runconfig.min_num_ions:
            continue
        diffprot = aqdiff.DifferentialProtein(prot, ions, runconfig.median_offset, runconfig.dia_fragment_selection)
        if use_ion_tree:
            clustered_root_node = aqclust.get_scored_clusterselected_ions(prot, ions, normed_c1, normed_c2, bgpair2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis = runconfig.cluster_threshold_pval, fcfc_threshold = runconfig.cluster_threshold_fcfc, take_median_ion=runconfig.take_median_ion)
            # the node is collected before the minpep filter, so protnodes may contain
            # proteins that do not end up in quantified_proteins
            protnodes.append(clustered_root_node)
            pval, fc, consistency_score, ions_included = aqclust.get_diffresults_from_clust_root_node(clustered_root_node)
            num_peptides = len(anytree.findall(clustered_root_node, filter_ = lambda x : x.type == 'seq'))
            if num_peptides < runconfig.minpep:
                continue

        else:
            pval, fc, consistency_score, ions_included = diffprot.pval, diffprot.fc, np.nan,diffprot.ions

        if runconfig.get_ion2clust:
            ionclust_protein = aqclust.find_fold_change_clusters_base_ions([[x] for x in ions],normed_c1,normed_c2, bgpair2diffDist,p2z, deedpair2doublediffdist, fc_threshold=runconfig.cluster_threshold_fcfc, pval_threshold_basis=runconfig.cluster_threshold_pval)
            ion2clust.update({x.name:y for x,y in ionclust_protein.items()})
        
        # the counter only advances for proteins that passed the filters above
        if count_prots%100==0:
            print(f"checked {count_prots} of {len(prot2diffions.keys())} prots")
        count_prots+=1

        pseudoint1_cond, pseudoint2_cond = aqdiff.calc_pseudo_intensities(ions, normed_c2, diffprot.fc)
        quantified_protein = QuantifiedResult(kwargs={'condpair': condpairname, 'protein' : prot, 'fdr' : 1.0, 'pval':pval,'log2fc': fc, 'consistency_score' : consistency_score, 'num_ions' : len(ions_included), 'pseudoint1' : pseudoint1_cond, 'pseudoint2' : pseudoint2_cond})
        quantified_proteins.append(quantified_protein)

    if use_ion_tree:
        if runconfig.use_ml:
            ml_performance_dict = {}
            aqclass.assign_predictability_scores(protnodes, runconfig.results_dir, name = condpairname, samples_used = c1_samples+ c2_samples,precursor_cutoff=3, 
            fc_cutoff=0.5, number_splits=5, plot_predictor_performance=runconfig.runtime_plots, replace_nans=True, performance_metrics=ml_performance_dict)
            if ml_performance_dict["r2_score"] >0.05: #only use the ml score, if it is meaningful
                aqclust.update_nodes_w_ml_score(protnodes)
                update_quantified_proteins_w_tree_results(quantified_proteins, protnodes)
            # NOTE(review): the tree export is nested under use_ml, so no tree is written
            # when use_ml is False - confirm this is intended.
            if runconfig.write_out_results_tree:
                aqclust.export_roots_to_json(protnodes,condpair,runconfig.results_dir)
    
    add_fdr(quantified_proteins)
    add_fdr(quantified_peptides)
    
    res_df = get_results_df(quantified_proteins)
    pep_df = get_results_df(quantified_peptides)

    if runconfig.runtime_plots:
        aqviz.volcano_plot(res_df, significance_cutoff = runconfig.volcano_fdr, log2fc_cutoff = runconfig.volcano_fcthresh)
        aqviz.volcano_plot(pep_df,significance_cutoff = runconfig.volcano_fdr, log2fc_cutoff = runconfig.volcano_fcthresh)

    if runconfig.results_dir is not None:
        if runconfig.annotation_file is not None: #additional annotations can be added before saving
            annot_df = pd.read_csv(runconfig.annotation_file, sep = "\t")
            # NOTE(review): the merge keys are derived from the ion-level table only and
            # are applied to the protein-level table as well - confirm that annotation
            # files never share ion-only columns.
            intersect_columns = annot_df.columns.intersection(pep_df.columns)
            if(len(intersect_columns)>0):
                print(list(intersect_columns))
                res_df = res_df.merge(annot_df, on=list(intersect_columns), how= 'left')
                pep_df = pep_df.merge(annot_df, on= list(intersect_columns), how = 'left')

        if runconfig.get_ion2clust:
            ion2clust_df = pd.DataFrame(ion2clust.items(), columns=['ion', 'cluster'])
            ion2clust_df.to_csv(f"{runconfig.results_dir}/{condpairname}.ion2clust.tsv", sep = "\t", index=None)

        res_df.to_csv(f"{runconfig.results_dir}/{condpairname}.results.tsv", sep = "\t", index=None)
        pep_df.to_csv(f"{runconfig.results_dir}/{condpairname}.results.ions.tsv", sep = "\t", index=None)

    return res_df, pep_df

In [None]:
#export
#helper class to store diffresults
class QuantifiedResult:
    """Helper class that stores the properties of one differential result in ``propdict``.

    Construction:
        QuantifiedResult(kwargs={...})  -> ``propdict`` is the given dictionary itself (not a copy).
        QuantifiedResult(a=1, b=2)      -> ``propdict`` is ``{'a': 1, 'b': 2}`` (no 'kwargs' entry given).
        QuantifiedResult()              -> ``propdict`` is None until a property is added.
    """
    def __init__(self, **kwargs):
        self.propdict = None
        if kwargs:
            # An explicit 'kwargs' entry is the established calling convention. Plain
            # keyword arguments are accepted as properties instead of raising a KeyError.
            self.propdict = kwargs['kwargs'] if 'kwargs' in kwargs else kwargs
    def add_property(self, key, value):
        """Set a single property, creating the property dictionary if none exists yet."""
        if self.propdict is None:
            self.propdict = {}
        self.propdict[key]  = value
    def add_properties(self, dict):
        """Merge all entries of ``dict`` into the properties, creating the property dictionary if none exists yet."""
        if self.propdict is None:
            self.propdict = {}
        self.propdict.update(dict)


def update_quantified_proteins_w_tree_results(quantified_proteins, protnodes):
    """Overwrite fold change and p-value of each quantified protein with the values of its tree node.

    Nodes are matched to proteins by ``node.name == propdict['protein']``. The
    entries 'log2fc', 'pval' and 'predscore' are set to the node's ``fc``,
    ``p_val`` and ``predscore``. A protein without a matching node gets None
    for all three.
    """
    node_values_by_name = {node.name : (node.fc, node.p_val, node.predscore) for node in protnodes}
    no_match = (None, None, None)

    for result in quantified_proteins:
        properties = result.propdict
        log2fc, pval, predscore = node_values_by_name.get(properties['protein'], no_match)
        properties['log2fc'] = log2fc
        properties['pval'] = pval
        properties['predscore'] = predscore


def add_fdr(quantified_results):
    """Store the Benjamini-Hochberg corrected p-value of every result under the key 'fdr'.

    The correction is computed over the 'pval' entries of all given results and
    written back into each result's ``propdict`` in place.
    """
    raw_pvals = [result.propdict["pval"] for result in quantified_results]
    corrected_pvals = mt.multipletests(raw_pvals, method='fdr_bh', is_sorted=False, returnsorted=False)[1]
    for position, result in enumerate(quantified_results):
        result.propdict['fdr'] = corrected_pvals[position]

def get_results_df(quantfied_results):
    """Build a dataframe with one row per result, using each result's ``propdict`` as the row."""
    rows = []
    for result in quantfied_results:
        rows.append(result.propdict)
    return pd.DataFrame(rows)


{'ion': 'ion', 'pval': 23, 'log2fc': 123.2, 'protein': 'protein', 'condpair': 'aqutils.get_condpairname(condpair)'}


In [None]:
#export
import numpy as np
import os
def write_out_normed_df(normed_df_1, normed_df_2, pep2prot, results_dir, condpair):
    """Write the normalized intensities of both conditions to "<results_dir>/<condpairname>.normed.tsv".

    The two tables are joined on their index (rows present in both), transformed
    with ``2**x``, NaN is replaced by 0, and a "protein" column is added by
    looking up every index entry in ``pep2prot``.
    """
    merged_df = normed_df_1.merge(normed_df_2, left_index = True, right_index = True)
    merged_df = 2**merged_df
    merged_df = merged_df.replace(np.nan, 0)
    merged_df["protein"] = [pep2prot.get(ion) for ion in merged_df.index]
    # exist_ok avoids the check-then-create race when several condition pairs are
    # written by parallel worker processes
    os.makedirs(f"{results_dir}/", exist_ok=True)
    merged_df.to_csv(f"{results_dir}/{aqutils.get_condpairname(condpair)}.normed.tsv", sep = "\t")

In [None]:
def merge_normed_dfs(normed_df_1, normed_df_2, condpair):
    """Join two normalized tables on their index and label the result with the condition pair.

    The joined values are transformed with ``2**x``, NaN is replaced by 0, and a
    "condpair" column holding ``condpair`` is appended.
    """
    joined_log_df = normed_df_1.merge(normed_df_2, left_index = True, right_index = True)
    linear_df = (2**joined_log_df).replace(np.nan, 0)
    linear_df["condpair"] = condpair
    return linear_df

In [None]:
#export
import pandas as pd
import numpy as np

#read in proteomics datafiles, log the intensities
def read_tables(peptides_tsv, samplemap_tsv, pepheader = None, protheader = None):
    """Read a tab-separated peptide table and sample map, and log2-transform the intensities.

    Args:
        peptides_tsv: path or file-like object of the peptide table.
        samplemap_tsv: path or file-like object of the sample map. It must contain a
            "sample" column naming the intensity columns of the peptide table.
        pepheader: if given, this column is renamed to "ion".
        protheader: if given, this column is renamed to "protein".

    Returns:
        ``(peps, samplemap)``: ``peps`` is indexed by "ion" and holds the "protein"
        column followed by the sample columns. Intensities of 0 are treated as
        missing (NaN) before taking log2.
    """
    samplemap = pd.read_csv(samplemap_tsv, sep="\t")
    peps = pd.read_csv(peptides_tsv,sep="\t")

    if pepheader is not None:
        peps = peps.rename(columns = {pepheader : "ion"})
    if protheader is not None:
        peps = peps.rename(columns = {protheader: "protein"})
    peps = peps.set_index("ion")
    headers = ['protein'] + samplemap["sample"].to_list()

    for sample in samplemap["sample"]:
        # log2(0) is undefined, so zero intensities are converted to NaN first
        peps[sample] = np.log2(peps[sample].replace(0, np.nan))
    return peps[headers], samplemap

In [None]:
#hide
import pandas as pd

def write_out_ion2nonan_ion2idx(cb, outfolder, condname):
    """Dump the ion lookup tables of a condition background to two TSV files.

    Writes "<outfolder>/ion2nonans_<condname>.tsv" (ion and its list of values from
    ``cb.ion2nonNanvals``) and "<outfolder>/idx2ion_<condname>.tsv" (the items of
    ``cb.idx2ion``). The first table is also displayed in the notebook.
    """
    ion2nan_conv = [[a, x.tolist()] for a, x in cb.ion2nonNanvals.items()]
    
    ion2nan_df = pd.DataFrame(ion2nan_conv)
    display(ion2nan_df)
    # Debugging aid kept from the original. The hard-coded row is only printed when
    # the table is large enough, instead of raising an IndexError on smaller datasets.
    debug_row = 41775
    if len(ion2nan_df.index) > debug_row:
        print(ion2nan_df.iloc[debug_row, 1])
    idx2ion_df = pd.DataFrame(list(cb.idx2ion.items()))

    ion2nan_df.to_csv(f"{outfolder}/ion2nonans_{condname}.tsv", sep = "\t", index = False)
    idx2ion_df.to_csv(f"{outfolder}/idx2ion_{condname}.tsv", sep = "\t", index = False)


    

In [None]:
#hide
import pandas as pd
import numpy as np
def compare_context_boundaries_against_ref(ref_file, cond_bg):
    """Display reference context boundaries next to those of a condition background.

    The headerless, tab-separated reference file provides the columns "l_ref" and
    "u_ref". The columns "l" and "u" are taken from the first and second row of
    the transposed ``cond_bg.context_ranges``. "l-ref" and "u-ref" hold the
    absolute ratio reference / own value.
    """
    comparison_df = pd.read_csv(ref_file, sep = "\t", names = ["l_ref", "u_ref"])
    transposed_ranges = np.array(cond_bg.context_ranges).T

    comparison_df["l"] = pd.Series(transposed_ranges[0])
    comparison_df["u"] = pd.Series(transposed_ranges[1])

    lower_ratio = comparison_df["l_ref"] / comparison_df["l"]
    comparison_df["l-ref"] = lower_ratio.abs()
    upper_ratio = comparison_df["u_ref"] / comparison_df["u"]
    comparison_df["u-ref"] = upper_ratio.abs()

    display(comparison_df)

In [None]:
#hide

#protein_df, peptide_df = benchmark_proteomics("./test_data/peptides.txt", "./test_data/samples.map", "./test_data/prot2organism.tsv")
#protein_df.to_csv("./test_data/AP_protein_out.tsv", sep = "\t", index= False)
#peptide_df.to_csv("./test_data/AP_peptide_out.tsv", sep = "\t", index= False)


In [None]:
# import sys
# sys.path.append('/Users/constantin/workspace/EmpiRe/nbdev/MS-EmpiRe_Python')
# from alphaquant.background_distributions import *
# from alphaquant.normalization import *
# from alphaquant.diff_analysis import *
# from alphaquant.visualizations import *
# from alphaquant.benchmarking import *
# from alphaquant.diffquant_utils import *