In [None]:
# default_exp cluster_ions

In [None]:
import scipy.spatial.distance as distance
import scipy.cluster.hierarchy as hierarchy
import collections

def find_fold_change_clusters(diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, fc_threshold = 0.3, pval_threshold_basis = 0.05):
    """Cluster groups of ions by the consistency of their fold changes.

    Every pair of ion groups gets a binary distance from ``evaluate_distance``
    (0 = consistent, 1 = different). The groups are then clustered with
    complete linkage and the tree is cut at distance 0.1, so two groups can
    only share a cluster if all pairwise distances inside the cluster are 0.

    Args:
        diffions (list[list[ion]]): the groups of ions to be compared, for example
            [[fragion1_precursor1, fragion2_precursor1], [fragion1_precursor2]].
            The ions inside one group are assumed to be similar. Each group is
            used as a dictionary key (as a tuple), so the ions must be hashable.
        normed_c1 (ConditionBackground): passed through unchanged to ``evaluate_distance``.
        normed_c2 (ConditionBackground): passed through unchanged to ``evaluate_distance``.
        ion2diffDist (dict(ion : SubtractedBackground)): passed through unchanged to ``evaluate_distance``.
        p2z: passed through unchanged to ``evaluate_distance``.
        deedpair2doublediffdist: passed through unchanged to ``evaluate_distance``.
        fc_threshold (float, optional): fold change threshold handed to ``evaluate_distance``. Defaults to 0.3.
        pval_threshold_basis (float, optional): p-value threshold; it is divided by the
            number of groups before being handed to ``evaluate_distance``. Defaults to 0.05.

    Returns:
        dict(tuple(ions) : int): maps each group (as a tuple) to its cluster index.
        Cluster indices are re-numbered by decreasing cluster size, so index 0 is
        the largest cluster (ties keep the order of first appearance). Groups with
        identical content share one key. An empty input gives an empty dict, a
        single group is assigned to cluster 0.
    """
    num_groups = len(diffions)
    if num_groups == 0:
        return {}
    if num_groups == 1:
        # nothing to compare against: pdist/linkage need at least two observations
        return {tuple(diffions[0]): 0}

    diffions_fcs = get_fcs_ions(diffions)
    mt_corrected_pval_thresh = pval_threshold_basis/num_groups

    def _index_distance(row1, row2):
        # pdist hands over the observation rows as 1-d float arrays, convert back to list indices
        idx1 = int(row1[0])
        idx2 = int(row2[0])
        return evaluate_distance(idx1, idx2, diffions, diffions_fcs, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, fc_threshold, mt_corrected_pval_thresh)

    # pdist requires a 2-dimensional input: one row per group, holding only the group index
    index_observations = [[idx] for idx in range(num_groups)]
    condensed_distance_matrix = distance.pdist(index_observations, _index_distance)
    after_clust = hierarchy.complete(condensed_distance_matrix)
    clustered = hierarchy.fcluster(after_clust, 0.1, criterion='distance')

    # fcluster labels start at 1 in arbitrary order; re-number so that 0 is the largest cluster
    cluster_sizes = collections.Counter(int(label) for label in clustered)
    old2new = {old_label: new_label for new_label, (old_label, _) in enumerate(cluster_sizes.most_common())}
    ions2clust = {tuple(diffions[ion_idx]): old2new[int(label)] for ion_idx, label in enumerate(clustered)}

    return ions2clust



import numpy as np
def get_fcs_ions(diffions):
    """Return the mean fold change of every ion group.

    Args:
        diffions (list[list[ion]]): groups of ions, every ion exposes a ``fc`` attribute.

    Returns:
        numpy.ndarray: float array with one entry per group, holding the
        arithmetic mean (``statistics.mean``) of the ``fc`` values of that group.
    """
    group_means = [statistics.mean([ion.fc for ion in ion_group]) for ion_group in diffions]
    return np.array(group_means, dtype=float)

import statistics
import alphaquant.doublediff_analysis as aqdd
def evaluate_distance(idx1, idx2, diffions, fcs, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, fc_threshold, pval_threshold_basis):
    """Binary distance between two ion groups, addressed by their index.

    Returns 0 when the absolute difference of the two group fold changes is
    below ``fc_threshold``. Otherwise the value returned by
    ``aqdd.calc_doublediff_score`` is compared with ``pval_threshold_basis``:
    1 if it is smaller than the threshold, 0 if not.

    Args:
        idx1, idx2 (int): positions of the two groups in ``diffions`` and ``fcs``.
        diffions (list[list[ion]]): the ion groups.
        fcs (sequence of float): fold change per group, same order as ``diffions``.
        normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist:
            forwarded unchanged to ``aqdd.calc_doublediff_score``.
        fc_threshold (float): fold change difference below which groups count as identical.
        pval_threshold_basis (float): threshold for the double differential score.

    Returns:
        int: 0 (groups are consistent) or 1 (groups differ).
    """
    group_a = diffions[idx1]
    group_b = diffions[idx2]
    fc_difference = abs(fcs[idx1] - fcs[idx2])

    # fold changes are close enough, the double differential test is skipped
    if fc_difference < fc_threshold:
        return 0

    score = aqdd.calc_doublediff_score(group_a, group_b, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist)
    return 1 if score < pval_threshold_basis else 0

    

In [None]:
import anytree
import re
def create_hierarchical_ion_grouping(regex_patterns, gene_name, diffions):
    """Build an anytree hierarchy that groups the ions level by level below one gene node.

    Every ion becomes a leaf node of type "base". For each level, the names of
    the current nodes are matched against the regex patterns of that level;
    nodes whose first regex group is identical are attached to one shared
    parent node, and these parent nodes are the input for the next level.
    The nodes of the topmost level are attached to the root node.

    Args:
        regex_patterns (list[list[tuple]]): sorted from the bottom to the top of
            the tree. The outer list holds the levels, the inner list holds the
            different node kinds available on one level (for example fragment
            ions and MS1 are on the same level). Each tuple is read as
            (type name, regex): index 0 is stored as the node ``type`` and
            index 1 is the pattern whose group 1 names the parent node.
            NOTE(review): the original comment described the tuple as
            (pattern, name), the code reads it the other way around - confirm
            against the callers.
        gene_name (str): name of the root node.
        diffions (iterable): ion objects exposing a ``name`` attribute.

    Returns:
        anytree.Node: the root node. All nodes carry the attributes ``type``,
        ``cluster`` (initialised with -1) and ``is_included`` (initialised with True).

    Note:
        If no node matches on a level, that level is skipped. If only some
        nodes match, the nodes without a match are not carried to the next level.
    """
    nodes = [anytree.Node(x.name, type = "base", cluster = -1, is_included = True) for x in diffions]

    for level in regex_patterns:
        name2node = {}
        for name2pattern in level:
            level_type = name2pattern[0]
            pattern = name2pattern[1]
            for node in nodes:
                m = re.match(pattern, node.name)
                if m is None:
                    continue
                matching_name = m.group(1)
                parent_node = name2node.get(matching_name)
                if parent_node is None:
                    # first node of this group: create the shared parent and remember it
                    parent_node = anytree.Node(matching_name, type = level_type, cluster = -1, is_included = True)
                    name2node[matching_name] = parent_node
                node.parent = parent_node
        if name2node:
            # only move one level up if this level actually produced parent nodes
            nodes = list(name2node.values())

    # the root carries the same attributes as all other nodes, so that attribute based tree searches work on it
    root_node = anytree.Node(gene_name, type = "gene", cluster = -1, is_included = True)

    for node in nodes:
        node.parent = root_node

    return root_node

def cluster_along_specified_levels(typefilter, root_node, ionname2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, fc_threshold = 0.3, pval_threshold_basis = 0.05):
    """Cluster the tree nodes level by level and mark inconsistent nodes as excluded.

    The levels are processed in the order given by ``typefilter.type``. For
    every level, all nodes of that type are collected, each node is represented
    by the ions of its included leafs, and the nodes are clustered with
    ``find_fold_change_clusters``. The cluster index is written to
    ``node.cluster``. Cluster index 0 is treated as the main cluster: nodes
    outside of it get ``is_included = False``. If the level as a whole violates
    one of the ``typefilter`` limits, all nodes of the level are excluded.
    The tree is modified in place, nothing is returned.

    Args:
        typefilter: object with the per-level lists ``type``,
            ``exclude_if_more_clusters_than``,
            ``exclude_if_fraction_of_mainclust_smaller_than`` and
            ``exclude_if_elements_in_cluster_less_than`` (see ``TypeFilter``).
        root_node (anytree.Node): root of the ion tree.
        ionname2diffion (dict): maps the leaf node names to the ion objects.
        normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist:
            forwarded unchanged to ``find_fold_change_clusters``.
        fc_threshold (float, optional): forwarded to ``find_fold_change_clusters``. Defaults to 0.3.
        pval_threshold_basis (float, optional): forwarded to ``find_fold_change_clusters``. Defaults to 0.05.
    """
    #typefilter object specifies filtering and clustering of the nodes

    for idx, level_type in enumerate(typefilter.type):
        # getattr: nodes without a type attribute never match instead of raising
        type_nodes = anytree.search.findall(root_node, filter_=lambda node: getattr(node, "type", None) == level_type)
        if len(type_nodes) == 0:
            # nothing to cluster on this level
            continue

        # collect the leafs once per node, so clustering input and lookup key are guaranteed to agree
        # NOTE(review): only the is_included flag of the leaf itself is checked here, the flags set on
        # intermediate nodes of lower levels do not affect this selection - confirm this is intended
        ionlist = []
        for node in type_nodes:
            leafs = [ionname2diffion.get(x.name) for x in node.descendants if x.is_leaf and x.is_included]
            ionlist.append(leafs)
        leafs2clust = find_fold_change_clusters(ionlist, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, fc_threshold = fc_threshold, pval_threshold_basis = pval_threshold_basis)

        num_clusters = len(set(leafs2clust.values()))
        num_mainclust_elems = 0

        for node, leafs in zip(type_nodes, ionlist):
            clustid = leafs2clust.get(tuple(leafs))
            node.cluster = clustid
            if clustid == 0:
                num_mainclust_elems += 1
            else:
                node.is_included = False

        # fraction of the nodes on this level that belong to the main cluster
        frac_mainclust = num_mainclust_elems/len(type_nodes)

        too_many_clusters = num_clusters > typefilter.exclude_if_more_clusters_than[idx]
        mainclust_fraction_too_small = frac_mainclust < typefilter.exclude_if_fraction_of_mainclust_smaller_than[idx]
        mainclust_too_few_elems = num_mainclust_elems < typefilter.exclude_if_elements_in_cluster_less_than[idx]
        if too_many_clusters or mainclust_fraction_too_small or mainclust_too_few_elems:
            for node in type_nodes:
                node.is_included = False




import numpy as np
class TypeFilter():
    """Per-level settings for the clustering along the ion tree.

    All attributes are lists of equal length; position ``i`` of every list
    belongs to the node type ``self.type[i]``.
    """
    def __init__(self):
        # one row per tree level:
        # (type, select_cluster, exclude_if_more_clusters_than,
        #  exclude_if_fraction_of_mainclust_smaller_than, exclude_if_elements_in_cluster_less_than)
        level_settings = [
            ('frgion',         0, np.inf, 0.3, 1),
            ('ms1_isotopes',   0, np.inf, 0.7, 1),
            ('mod_seq_charge', 0, 1,      0,   1),
            ('mod_seq',        0, np.inf, 0.5, 1),
            ('seq',            0, np.inf, 0,   1),
            ('gene',           0, np.inf, 0,   1),
        ]
        # transpose the rows into one list per setting
        per_setting = [list(column) for column in zip(*level_settings)]
        self.type = per_setting[0]
        self.select_cluster = per_setting[1]  # NOTE(review): presumably the index of the cluster to keep - confirm
        self.exclude_if_more_clusters_than = per_setting[2]
        self.exclude_if_fraction_of_mainclust_smaller_than = per_setting[3]
        self.exclude_if_elements_in_cluster_less_than = per_setting[4]




In [None]:
def select_representative_DIA_fragions(diffions):
    """Pick one representative fragment ion per precursor.

    The ions are grouped with ``group_ions_by_precursor``; within each group
    the ion with the median fold change (``fc``) is selected. For an even
    number of ions the upper of the two middle ions is taken.

    Args:
        diffions (list): ion objects exposing ``name`` and ``fc``.

    Returns:
        list: one ion per precursor, in the order in which the precursors
        are returned by ``group_ions_by_precursor``. Empty for empty input.
    """
    if len(diffions) == 0:
        return []

    filtered_ions = []
    precursor2ions = group_ions_by_precursor(diffions)
    for ions in precursor2ions.values():
        ions_by_fc = sorted(ions, key = lambda x : x.fc)
        # integer division gives the middle position; rounding len/2 would select
        # the last of 3 ions, because 1.5 is rounded to 2
        representative_ion = ions_by_fc[len(ions_by_fc)//2]
        filtered_ions.append(representative_ion)
    return filtered_ions


def group_ions_by_precursor(diffions):
    """Group fragment ions by the precursor part of their name.

    The naming scheme is detected from the name of the first ion: two patterns
    are tried, and if both match, the second one (``pattern_diann``) is used.
    The first regex group of the selected pattern is the precursor key.

    Args:
        diffions (list): ion objects exposing a ``name`` attribute.

    Returns:
        dict(str : list): maps the precursor key to the ions of this precursor,
        in input order. Empty for empty input.

    Raises:
        Exception: if the first ion name matches neither pattern, or if a later
        ion name does not match the pattern selected from the first ion.
    """
    if len(diffions) == 0:
        return {}

    # raw strings: the backslash sequences are regex escapes, not string escapes
    pattern_specnaut = r"(.*\.\d{0,1}_)(.*)"
    pattern_diann = r"(.*_)(fion.*)"

    pattern = None
    first_name = diffions[0].name
    for candidate in (pattern_specnaut, pattern_diann):
        # no break: a later matching pattern overrides an earlier one
        if re.match(candidate, first_name):
            pattern = candidate
    if pattern is None:
        raise Exception("fragment ion not recognized!")

    precursor2ions = {}
    for ion in diffions:
        m = re.match(pattern, ion.name)
        if m is None:
            raise Exception("fragment ion not recognized!")
        precursor2ions.setdefault(m.group(1), []).append(ion)
    return precursor2ions