In [None]:
# default_exp cluster_ions

# Helper Functions

In [None]:
#export
import scipy.spatial.distance as distance
import scipy.cluster.hierarchy as hierarchy
import collections
import itertools

def find_fold_change_clusters(diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, fc_threshold = 0.3, pval_threshold_basis = 0.05):
    """Cluster groups of ions by how consistent their fold changes are.

    Every element of ``diffions`` is one group of ions. All pairs of groups are compared with
    ``evaluate_distance`` (0 = compatible, 1 = significantly different). The resulting distances are
    clustered with complete linkage and cut at distance 0.1, so two groups share a cluster only if
    no pair inside that cluster was rated as different. Cluster indices are then renumbered with
    ``exchange_cluster_idxs`` so that cluster 0 is the largest one.

    Args:
        diffions (list[list[ion]]): the groups of ions to be tested, for example
            [[fragion1_precursor1, fragion2_precursor1, fragion3_precursor1],[fragion1_precursor2],[fragion1_precursor3, fragion2_precursor3]].
            The ions within one group are assumed to be similar. Each ion must provide ``.fc`` and ``.name``.
        normed_c1: background object of condition 1, handed on unchanged to ``evaluate_distance``.
        normed_c2: background object of condition 2, handed on unchanged to ``evaluate_distance``.
        ion2diffDist (dict): handed on unchanged to ``evaluate_distance`` (presumably a cache ion -> subtracted background; verify).
        p2z (dict): handed on unchanged to ``evaluate_distance``.
        deedpair2doublediffdist (dict): handed on unchanged to ``evaluate_distance``.
        fc_threshold (float, optional): groups whose mean fold changes differ by less than this are always compatible. Defaults to 0.3.
        pval_threshold_basis (float, optional): significance level before correction; it is divided by the number of groups. Defaults to 0.05.

    Returns:
        tuple: ``(ions2clust, num_clusters, num_mainclust_elems, frac_mainclust)`` where

        * ``ions2clust`` maps ``tuple(group)`` to its cluster index (0 = largest cluster),
        * ``num_clusters`` is the number of distinct clusters,
        * ``num_mainclust_elems`` is the number of groups in cluster 0,
        * ``frac_mainclust`` is the fraction of groups in cluster 0.

        An empty ``diffions`` yields ``({}, 0, 0, 0)``.
    """
    num_groups = len(diffions)

    # nothing to cluster; also avoids dividing the p-value threshold by zero below
    if num_groups == 0:
        return {}, 0, 0, 0

    if num_groups == 1:
        # NOTE(review): in this shortcut num_mainclust_elems counts the ions of the single group,
        # whereas in the general case below it counts groups - kept as is, confirm that this is intended.
        return {tuple(diffions[0]) : 0}, 1, len(diffions[0]), 1

    # pdist works on "observations"; each observation is just the index of a group
    group_idxs = [[idx] for idx in range(num_groups)]
    group_fcs = get_fcs_ions(diffions)
    mt_corrected_pval_thresh = pval_threshold_basis/num_groups

    def _group_distance(idx1, idx2):
        # pdist hands over numpy rows; cast to plain ints before using them as list indices
        return evaluate_distance(int(idx1[0]), int(idx2[0]), diffions, group_fcs, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, fc_threshold, mt_corrected_pval_thresh)

    condensed_distance_matrix = distance.pdist(group_idxs, _group_distance)
    linkage = hierarchy.complete(condensed_distance_matrix)
    # distances are 0 or 1, so any cut strictly between them separates incompatible groups
    clustered = hierarchy.fcluster(linkage, 0.1, criterion='distance')
    clustered = exchange_cluster_idxs(clustered)

    num_clusters = len(set(clustered))
    num_mainclust_elems = sum(clust_idx == 0 for clust_idx in clustered)
    frac_mainclust = num_mainclust_elems/len(clustered)
    ions2clust = {tuple(group): clust_idx for group, clust_idx in zip(diffions, clustered)}

    return ions2clust, num_clusters, num_mainclust_elems, frac_mainclust



In [None]:
#export
def exchange_cluster_idxs(fclust_output_array):
    """Renumber cluster labels so that they are ranked by cluster size.

    The fcluster output assigns arbitrary cluster numbers to the clustered elements,
    e.g. [1,2,1,2,2,2]. The labels returned here are 0 for the largest cluster, 1 for the
    second largest and so on, e.g. [1,0,1,0,0,0]. Clusters of equal size keep the order in
    which they first appear in the input.
    """
    sizes = {}
    for label in fclust_output_array:
        if label in sizes:
            sizes[label] += 1
        else:
            sizes[label] = 1

    # sorting is stable, so ties stay in first-appearance order
    ranked_labels = sorted(sizes, key = sizes.get, reverse = True)
    relabel = {old_label: new_label for new_label, old_label in enumerate(ranked_labels)}
    return [relabel[label] for label in fclust_output_array]


In [None]:
#export
import numpy as np
def get_fcs_ions(diffions):
    """Return the mean fold change of every ion group.

    Args:
        diffions (list[list[ion]]): groups of ions; every ion must provide a numeric ``.fc``.
            Every group must contain at least one ion, otherwise ``statistics.mean`` raises.

    Returns:
        numpy.ndarray: float array with one mean fold change per group, in input order.
    """
    # imported locally: the file-level "import statistics" sits in a later notebook cell,
    # so this function would otherwise fail when called before that cell has been run
    import statistics

    return np.array([statistics.mean([ion.fc for ion in ion_group]) for ion_group in diffions], dtype = float)


In [None]:
#export
import statistics
import alphaquant.doublediff_analysis as aqdd
def evaluate_distance(idx1, idx2, diffions, fcs, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, fc_threshold, pval_threshold_basis):
    """Binary distance between two ion groups: 0 if compatible, 1 if different.

    Two groups are compatible when their mean fold changes (``fcs[idx1]``, ``fcs[idx2]``)
    differ by less than ``fc_threshold``. Otherwise ``aqdd.calc_doublediff_score`` is asked
    for a p-value and the groups count as different only if it is below ``pval_threshold_basis``.

    Args:
        idx1, idx2 (int): positions of the two groups in ``diffions`` and ``fcs``.
        diffions (list[list[ion]]): ion groups; every ion must provide ``.name``.
        fcs: mean fold change per group, indexable by ``idx1``/``idx2``.
        normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist: handed on unchanged
            to ``aqdd.calc_doublediff_score``.
        fc_threshold (float): fold change difference below which no test is performed.
        pval_threshold_basis (float): p-value cutoff that is applied as given (no further
            correction happens in here).

    Returns:
        int: 0 or 1.
    """
    names_first = [ion.name for ion in diffions[idx1]]
    names_second = [ion.name for ion in diffions[idx2]]
    fc_gap = abs(fcs[idx1] - fcs[idx2])

    # close enough fold changes never need the statistical test
    if fc_gap < fc_threshold:
        return 0

    _, pval = aqdd.calc_doublediff_score(names_first, names_second, normed_c1, normed_c2,ion2diffDist,p2z, deedpair2doublediffdist)
    return 1 if pval<pval_threshold_basis else 0


# Group and cluster ions


In [None]:
#export
import anytree
import re
def create_hierarchical_ion_grouping(regex_patterns, gene_name, diffions):
    """Arrange the ions of one gene in an anytree hierarchy and return its root.

    The tree is built bottom-up. Every ion becomes a leaf node of type "base". For each
    level, the names of the current nodes are matched against the patterns of that level;
    group 1 of the match is the name of the parent node, and all nodes with the same
    parent name share one parent. The parents then become the current nodes for the next level.
    Finally the nodes of the topmost level are attached to a root node of type "gene".

    All created nodes carry the attributes ``type``, ``cluster = -1`` and ``is_included = True``.

    Args:
        regex_patterns (list[list[tuple[str, str]]]): patterns sorted from bottom to top.
            The outer list holds the levels of the tree, the inner list the node types that
            are available on one level (for example FRgIon and MS1 are on the same level).
            Each tuple is ``(pattern, type_name)``; the pattern needs at least one group.
        gene_name (str): name of the root node.
        diffions (iterable): ions; every ion must provide ``.name``.

    Returns:
        anytree.Node: the root node of the tree.
    """
    nodes = [anytree.Node(x.name, type = "base", cluster = -1, is_included = True) for x in diffions]

    for level in regex_patterns:
        name2node = {}
        for pattern2name in level:
            pattern, type_name = pattern2name[0], pattern2name[1]
            for node in nodes:
                m = re.match(pattern, node.name)
                if m is None:
                    continue
                matching_name = m.group(1)
                # create the parent only once per name; a name first seen with an earlier
                # pattern of this level keeps the type of that pattern
                if matching_name not in name2node:
                    name2node[matching_name] = anytree.Node(matching_name, type = type_name, cluster = -1, is_included = True)
                node.parent = name2node[matching_name]

        # NOTE(review): once a level created parents, nodes that matched none of its patterns
        # are no longer tracked and never get attached to the tree - confirm that this is intended.
        if name2node:
            nodes = list(name2node.values())

    root_node = anytree.Node(gene_name, type = "gene", cluster = -1, is_included = True)

    for node in nodes:
        node.parent = root_node

    return root_node


In [None]:
#export
def cluster_along_specified_levels(typefilter, root_node, ionname2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, fc_threshold = 0.3, pval_threshold_basis = 0.05):
    """Cluster the tree level by level and mark inconsistent nodes as excluded.

    The node types listed in ``typefilter.type`` are processed in order. For every type, all
    nodes of that type are collected, their still included leaf ions are clustered with
    ``find_fold_change_clusters`` and the nodes are updated via ``update_nodes``. Types without
    nodes, or without any included leaf, are skipped.

    Args:
        typefilter: object that specifies filtering and clustering of the nodes (see ``update_nodes``).
        root_node: root of the tree; it is modified in place.
        ionname2diffion (dict): maps leaf node names to the ion objects.
        normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, fc_threshold,
        pval_threshold_basis: handed on unchanged to ``find_fold_change_clusters``.

    Returns:
        The same ``root_node`` object that was passed in.
    """
    for level_idx, level_type in enumerate(typefilter.type):
        level_nodes = anytree.search.findall(root_node, filter_=lambda node: node.type == level_type)
        if not level_nodes:
            continue

        leaflist, node2leafs = get_ionlist(level_nodes, ionname2diffion)
        if not leaflist:
            continue

        cluster_result = find_fold_change_clusters(leaflist, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, fc_threshold, pval_threshold_basis)
        leafs2clust, num_clusters, num_mainclust_elems, frac_mainclust = cluster_result
        update_nodes(level_nodes, typefilter, level_idx, node2leafs, leafs2clust, num_clusters, num_mainclust_elems, frac_mainclust)

    return root_node


def get_ionlist(type_nodes, ionname2diffion):
    """Collect the still included leaf ions below every node.

    Nodes without any included leaf are marked as excluded (``node.is_included = False``)
    and do not show up in the result.

    Args:
        type_nodes (iterable): tree nodes; each needs ``descendants`` whose elements provide
            ``name``, ``is_leaf`` and ``is_included``.
        ionname2diffion (dict): maps leaf names to ion objects (a missing name yields ``None``).

    Returns:
        tuple: ``(ionlist, node2leafs)`` - the list of ion groups (one per remaining node, in
        input order) and a dict mapping each remaining node to its ion group.
    """
    ionlist = []
    node2leafs = {}
    for node in type_nodes:
        # logical "and" instead of bitwise "&": the flags are meant as truth values
        leafs_included = [ionname2diffion.get(leaf.name) for leaf in node.descendants if leaf.is_leaf and leaf.is_included]
        if not leafs_included:
            node.is_included = False
            continue
        ionlist.append(leafs_included)
        node2leafs[node] = leafs_included
    return ionlist, node2leafs

def update_nodes(type_nodes, typefilter, type_idx, node2leafs, leafs2clust,  num_clusters, num_mainclust_elems, frac_mainclust):
    """Write the clustering result of one tree level back onto its nodes.

    Every still included node gets its cluster index stored in ``node.cluster``. If that index
    is not the cluster selected for this level (``typefilter.select_cluster[type_idx]``), the
    node and all of its descendants are marked as excluded.

    Afterwards the level as a whole is checked against the ``typefilter`` limits for this level
    (too many clusters, too small fraction of the main cluster, too few elements in the main
    cluster). If any limit is violated, the descendants of all nodes of the level are marked
    as excluded; the nodes of the level themselves are left untouched by this second step.
    """
    for node in type_nodes:
        if node.is_included:
            assigned_cluster = leafs2clust.get(tuple(node2leafs.get(node)))
            node.cluster = assigned_cluster
            if assigned_cluster != typefilter.select_cluster[type_idx]:
                for excluded in (node, *node.descendants):
                    excluded.is_included = False

    # all three limits are looked up before they are combined
    too_many_clusters = num_clusters > typefilter.exclude_if_more_clusters_than[type_idx]
    mainclust_fraction_too_small = frac_mainclust < typefilter.exclude_if_fraction_of_mainclust_smaller_than[type_idx]
    mainclust_too_few_elems = num_mainclust_elems < typefilter.exclude_if_elements_in_cluster_less_than[type_idx]

    if too_many_clusters or mainclust_fraction_too_small or mainclust_too_few_elems:
        for node in type_nodes:
            for descendant in node.descendants:
                descendant.is_included = False


In [None]:
#export
import numpy as np
class TypeFilter():
    """Per-level settings for clustering and filtering the ion tree.

    All lists are index-aligned with ``type``: entry ``i`` of every list belongs to the node
    type ``type[i]``. The levels are listed from the bottom of the tree to the top.
    """
    def __init__(self):
        # node types that are clustered, in processing order
        self.type = ['frgion', 'ms1_isotopes', 'mod_seq_charge', 'mod_seq', 'seq', 'gene']
        # index of the cluster that is kept on each level (0 is the largest cluster)
        self.select_cluster = [0, 0, 0, 0, 0, 0]
        # a level is excluded if it splits into more clusters than this
        self.exclude_if_more_clusters_than = [ np.inf, np.inf, 1, np.inf, np.inf, np.inf]
        # a level is excluded if the main cluster holds a smaller fraction of the elements than this
        self.exclude_if_fraction_of_mainclust_smaller_than = [0.3, 0.7, 0, 0.5, 0, 0]
        # a level is excluded if the main cluster holds fewer elements than this
        self.exclude_if_elements_in_cluster_less_than = [1, 1, 1, 1, 1, 1]



# Test Clustering

In [None]:

import uuid
import alphaquant.background_distributions as aqbg
import alphaquant.benchmarking as aqbm
import alphaquant.diff_analysis as aqdiff
import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


def simulate_normed_input():
    """Create simulated background objects for two conditions.

    Condition "A" has 3 samples, condition "B" has 12. The random input comes from
    ``aqbm.generate_random_input`` (called with 10000, the sample table and ``simulate_nas=True``)
    and is split per condition with ``get_c1_c2_dfs``.

    Returns:
        tuple: ``(normed_c1, normed_c2)``, the ``aqbg.ConditionBackgrounds`` of condition A and B.
    """
    sample_names = ['A1', 'A2', 'A3', 'B1', 'B2', 'B3','B4', 'B5', 'B6', 'B7', 'B8', 'B9','B10', 'B11', 'B12']
    conditions = ['A', 'A', 'A', 'B', 'B', 'B','B', 'B', 'B','B', 'B', 'B','B', 'B', 'B']
    sample2cond_df = pd.DataFrame({'sample' : sample_names, 'condition' : conditions})

    unnormed_df = aqbm.generate_random_input(10000, sample2cond_df,simulate_nas=True)
    df_c1, df_c2 = get_c1_c2_dfs(unnormed_df, sample2cond_df, ["A", "B"])

    # one dict is shared by both background objects
    p2z = {}
    backgrounds = [aqbg.ConditionBackgrounds(cond_df, p2z) for cond_df in (df_c1, df_c2)]
    return backgrounds[0], backgrounds[1]

def get_c1_c2_dfs(unnormed_df, labelmap_df, condpair, minrep = 2):
    """Split the intensity table into one table per condition.

    Args:
        unnormed_df (pandas.DataFrame): one column per sample.
        labelmap_df (pandas.DataFrame): needs the columns "sample" and "condition".
        condpair: the two condition names; ``condpair[0]`` gives the first table, ``condpair[1]`` the second.
        minrep (int, optional): rows with fewer than this many non-missing values within a
            condition are dropped from the table of that condition. Defaults to 2.

    Returns:
        tuple: ``(df_c1, df_c2)``.
    """
    def _condition_table(condition):
        # columns of the samples that belong to this condition, sparse rows removed
        samples = labelmap_df.loc[labelmap_df["condition"] == condition, "sample"]
        return unnormed_df.loc[:, samples].dropna(axis=0, thresh=minrep)

    return _condition_table(condpair[0]), _condition_table(condpair[1])

def generate_diffions():
    """Create differential ions from simulated input for testing.

    Only ions that are present in both conditions and whose name contains "pep12_" or
    "pep23_" are used.

    Returns:
        tuple: ``(diffions, normed_c1, normed_c2)`` - the list of ``aqdiff.DifferentialIon``
        objects and the two simulated background objects.
    """
    normed_c1, normed_c2 = simulate_normed_input()
    ion2diffDist, p2z = {}, {}
    diffions = []

    shared_ions = normed_c1.ion2nonNanvals.keys() & normed_c2.ion2nonNanvals.keys()
    for ion in shared_ions:
        # restrict the test to the ions of two peptides
        if "pep12_" not in ion and "pep23_" not in ion:
            continue

        vals_c1 = normed_c1.ion2nonNanvals.get(ion)
        vals_c2 = normed_c2.ion2nonNanvals.get(ion)
        diff_dist = aqbg.get_subtracted_bg(ion2diffDist,normed_c1, normed_c2,ion, p2z)
        diffions.append(aqdiff.DifferentialIon(vals_c1, vals_c2, diff_dist, ion, outlier_correction = False))

    return diffions, normed_c1, normed_c2

from anytree.exporter import dotexporter
def test_tree_construction():
    """Smoke test: build the ion tree for simulated ions and cluster it.

    Prints the tree before and after clustering and writes a picture of the unclustered
    tree to "tmp.png".
    """
    diffions, normed_c1, normed_c2 = generate_diffions()

    # tree levels from bottom to top; group 1 of each pattern is the name of the parent node
    lowest_level = [("(.*_LVL0.*_LVL1.*_LVL2.*_LVL3)(_mod[0-1])","frgion"), ("(.*_LVL0.*_LVL1.*_LVL2.*_LVL3)(_mod[2-3])", "ms1_isotopes")]
    charge_level = [("(.*_LVL0.*_LVL1.*_LVL2)(.*_LVL3)", "mod_seq_charge")]
    modseq_level = [("(.*_LVL0.*_LVL1)(.*_LVL2)", "mod_seq")]
    seq_level = [("(.*)(.*_LVL0.*_LVL1.*)", "seq")]
    regex_patterns = [lowest_level, charge_level, modseq_level, seq_level]

    gene_tree = create_hierarchical_ion_grouping(regex_patterns,"testgene",diffions)
    print(anytree.RenderTree(gene_tree))
    dotexporter.UniqueDotExporter(gene_tree).to_picture("tmp.png")
    print(f'diffions {[x.name for x in diffions]}')

    typefilter = TypeFilter()
    ionname2diffion = {}
    for diffion in diffions:
        ionname2diffion[diffion.name] = diffion

    # fresh, empty dicts for the three dict arguments
    dpair2diffdist = {}
    p2z = {}
    deedpair2doublediffdist = {}
    clustered_tree = cluster_along_specified_levels(typefilter,gene_tree,ionname2diffion, normed_c1, normed_c2,dpair2diffdist,p2z, deedpair2doublediffdist, pval_threshold_basis=0.99)
    print(anytree.RenderTree(clustered_tree))

test_tree_construction()

