In [1]:
# default_exp outlier_scoring

In [2]:
#hide
#links and constants
# Results folder handed to aqutils.read_condpair_tree by the tree-loading tests further down
QUICKTEST_RESULTS_DIR = "test_data/system_tests/quicktests/results_outlierchecks"
# The two condition names handed to aqutils.read_condpair_tree together with the folder above
COND1 = "Y10"
COND2 = "Y1"



# Outlier scoring

The default approach of AlphaQuant is to collect the most consistent set of peptides for protein quantification. Potentially, however, there can be valuable information in non-consistent, i.e. outlier peptides, which can be indicative of proteoforms. In this module, we perform a systematic evaluation of outlying groups of peptides in order to determine promising candidates for further investigation.

In particular, we:

1) Load the clustered tree data and retrieve all proteins with their clustered child peptides
2) Compare the fold changes of the main cluster with the fold changes of the outlier-clusters
3) Filter by both strength of fc diff and the quality of the score
4) Return ranked list


In [3]:
#export
import alphaquant.diffquant_utils as aqutils
import numpy as np
import copy
import anytree

class OutlierHandler():
    """Collects the cluster differences of all proteins of one condition pair.

    The protein nodes are the direct children of the tree returned by
    ``aqutils.read_condpair_tree`` for the given results directory and conditions.
    """

    def __init__(self, results_dir, cond1, cond2):
        """Load the protein nodes of the ``cond1``/``cond2`` tree stored in ``results_dir``."""
        self._protnodes = self.__load_protein_nodes__(results_dir, cond1, cond2)

    def __load_protein_nodes__(self, results_dir, cond1, cond2):
        """Read the condition-pair tree and return its direct children."""
        return aqutils.read_condpair_tree(results_folder=results_dir, cond1=cond1, cond2 = cond2).children

    def get_diffclust_overview_list(self):
        """Gather the cluster differences of every loaded protein node.

        Returns:
            list: flat list of everything ``ProtnodeClusterChecker.get_diffclusts``
            returns for the individual protein nodes (ClusterDiffInfo objects),
            in the order of the protein nodes. Proteins with a single cluster
            contribute nothing.
        """
        diffclusts = []
        for protnode in self._protnodes:
            diffclusts.extend(ProtnodeClusterChecker(protnode).get_diffclusts())
        return diffclusts


class ProtnodeClusterChecker():
    """Compares the main peptide cluster of a protein node with its other clusters.

    Cluster ``0`` is used as the main cluster; every cluster number in
    ``1 .. num_clusters - 1`` is compared against it as an outlier cluster.
    """

    def __init__(self, protnode):
        """Store the protein node and its ``num_clusters`` attribute."""
        self._protnode = protnode
        self._num_clusters = protnode.num_clusters

    def get_diffclusts(self):
        """Return one ClusterDiffInfo per outlier cluster, or ``[]`` for a single cluster."""
        if not self.__check_if_multiple_clusters__():
            return []
        return self.__get_clusterdiff_info_for_each_cluster()

    def __get_clusterdiff_info_for_each_cluster(self):
        """Pair the main cluster (number 0) with each higher-numbered cluster."""
        mainclust_info = self.__get_cluster_info(clustnum = 0)
        clusterdiff_infos = []
        for clustnum in range(1, self._num_clusters):
            outlier_info = self.__get_cluster_info(clustnum)
            clusterdiff_infos.append(self.__get_clusterdiff_info__(outlier_info, mainclust_info))
        return clusterdiff_infos

    def __get_cluster_info(self, clustnum):
        """Summarise the peptides of cluster ``clustnum`` in a ClusterInfo."""
        cluster_peptides = self.__get_peptides_of_cluster__(clustnum)
        return ClusterInfo(protein_name=self._protnode.name,peptide_nodes = cluster_peptides)

    def __get_clusterdiff_info__(self, outlier_info, mainclust_info):
        """Build the ClusterDiffInfo; note that it takes the main cluster first."""
        return ClusterDiffInfo(mainclust_info, outlier_info)

    def __get_peptides_of_cluster__(self, clustnum):
        """Return the child nodes of the protein whose ``cluster`` equals ``clustnum``."""
        return [x for x in self._protnode.children if x.cluster == clustnum]

    def __check_if_multiple_clusters__(self):
        """Return True if the protein node has more than one cluster."""
        return self._num_clusters >1


class ClusterInfo():
    """Summary of one peptide cluster of a protein.

    Stores the protein name, the cluster number, the peptide names, the median
    of the peptides' ``fc`` values and a quality score for the cluster.
    """

    def __init__(self, protein_name,peptide_nodes):
        cluster_ids = {node.cluster for node in peptide_nodes}
        fold_changes = np.array([node.fc for node in peptide_nodes])

        self.protein_name = protein_name
        # the first element of the set is taken as the cluster number
        self.cluster_number = list(cluster_ids)[0]
        self.peptide_names = [node.name for node in peptide_nodes]
        self.median_fc = np.median(fold_changes)
        self.quality_score = self._get_quality_score(peptide_nodes)

    @staticmethod
    def _get_quality_score(peptide_nodes):
        """Return the smallest per-peptide score of the cluster.

        If the first node carries a ``predscore`` the score of a peptide is
        ``abs(predscore)``, otherwise it is ``1/fraction_consistent``.
        """
        if not hasattr(peptide_nodes[0], 'predscore'):
            return min(1/node.fraction_consistent for node in peptide_nodes)
        return min([abs(node.predscore) for node in peptide_nodes])

class ClusterDiffInfo():
    """Describes the difference between a main cluster and one outlier cluster.

    Built from two ClusterInfo-like objects. Can also reduce a protein node to
    the peptides of these two clusters.
    """

    def __init__(self, mainclust_info, outlier_info):
        main_names = mainclust_info.peptide_names
        outlier_names = outlier_info.peptide_names
        pair_id = f"{mainclust_info.cluster_number}_{outlier_info.cluster_number}"

        self.protein_name = mainclust_info.protein_name
        self.clusterpair_id = pair_id
        self.name = f"{self.protein_name}_{pair_id}"
        # absolute distance between the median fold changes of both clusters
        self.fcdiff = abs(mainclust_info.median_fc - outlier_info.median_fc)
        # the larger of the two cluster scores is kept
        self.quality_score = max(mainclust_info.quality_score, outlier_info.quality_score)
        self.outlier_peptide_names = outlier_names
        self.mainclust_peptide_names = main_names
        self.peptide_names = main_names + outlier_names

    def get_clusterdiff_protnode(self, protnode):
        """Return a deep copy of ``protnode`` that only keeps the peptides of both clusters.

        The copy additionally gets the attributes ``fcdiff``, ``quality_score``
        and ``peptide_names``. The node passed in is left untouched.
        """
        reduced_protnode = copy.deepcopy(protnode)
        self.__remove_peptides_not_in_cluster__(reduced_protnode)
        self.__add_diffinfos__(reduced_protnode)
        return reduced_protnode

    def get_num_mainclust_peptides(self):
        """Number of peptides in the main cluster."""
        return len(self.mainclust_peptide_names)

    def get_num_outlierclust_peptides(self):
        """Number of peptides in the outlier cluster."""
        return len(self.outlier_peptide_names)

    def __remove_peptides_not_in_cluster__(self, protnode_clusterdiff):
        """Detach every child that belongs to neither of the two clusters."""
        for child in protnode_clusterdiff.children:
            self.__remove_peptide_if_necessary__(child)

    def __add_diffinfos__(self, protnode):
        """Copy the difference attributes onto ``protnode``."""
        for attribute in ("fcdiff", "quality_score", "peptide_names"):
            setattr(protnode, attribute, getattr(self, attribute))

    def __remove_peptide_if_necessary__(self, peptide_node):
        """Detach ``peptide_node`` from its parent unless its name is in ``peptide_names``."""
        if peptide_node.name in self.peptide_names:
            return
        peptide_node.parent = None



class ProtNodeRetriever():
    """Looks up the protein nodes that belong to a list of cluster differences."""

    @staticmethod
    def get_protnodes_from_diffclust_list(condpair_tree, diffclust_list):
        """Return the children of ``condpair_tree`` whose name is a ``protein_name`` in ``diffclust_list``."""
        wanted_names = set()
        for diffclust in diffclust_list:
            wanted_names.add(diffclust.protein_name)
        return [protnode for protnode in condpair_tree.children if protnode.name in wanted_names]

In [4]:
#hide
import anytree

def test_that_clusterdiff_infos_have_expected_peptides_and_fold_changes():
    """Check fold change differences, quality scores and peptide names on the simulated protein."""
    simulated_protein = ProtNodeCreator().get_simulated_protein_node()
    diffinfos = ProtnodeClusterChecker(simulated_protein).get_diffclusts()

    # hand-calculated reference values for the simulated protein
    main_peptides = [f"pep{x}" for x in range(5)]
    expected_fcdiffs = [5, 11.5]
    expected_quality_scores = [0.3, 0.4]
    expected_peptides = [
        main_peptides + [f"pep{x}" for x in range(5, 7)],
        main_peptides + [f"pep{x}" for x in range(7, 10)],
    ]

    assert sorted(info.fcdiff for info in diffinfos) == sorted(expected_fcdiffs)
    assert sorted(info.quality_score for info in diffinfos) == sorted(expected_quality_scores)
    assert sorted(info.peptide_names for info in diffinfos) == sorted(expected_peptides)
    print("performed test")
    

class ProtNodeCreator():
    """Test helper that builds a simulated protein node with ten peptide children.

    The peptides form three clusters: five peptides in cluster 0, two in
    cluster 1 and three in cluster 2.
    """

    def __init__(self):
        self.names = ["pep" + str(idx) for idx in range(10)]
        self.fold_changes = [-1.5] * 5 + [3.5] * 2 + [10] * 3
        self.clusters = [0] * 5 + [1] * 2 + [2] * 3
        self.predscores = [0.1] * 5 + [0.3] * 2 + [0.4] * 3

    def get_simulated_protein_node(self):
        """Return the "gene" node ``prot1`` with all simulated peptides attached."""
        protein_node = anytree.Node(name="prot1", type = "gene")
        protein_node.fc = 1
        protein_node.num_clusters = 3
        self._add_peptide_nodes(protein_node)
        return protein_node

    def _add_peptide_nodes(self, protein_node):
        """Create the ten "seq" nodes and attach them to ``protein_node``."""
        for idx in range(10):
            pepnode = anytree.Node(self.names[idx])
            attributes = {
                "fc": self.fold_changes[idx],
                "cluster": self.clusters[idx],
                "predscore": self.predscores[idx],
                "type": "seq",
            }
            for attribute, value in attributes.items():
                setattr(pepnode, attribute, value)
            pepnode.parent = protein_node

# Run the check on the simulated protein node when this cell is executed.
test_that_clusterdiff_infos_have_expected_peptides_and_fold_changes()




performed test


## PeptideInfos
We extend the `ProtnodeClusterChecker` for the case that we are interested in single 'outlier' peptides

In [5]:
#export

class OutlierPeptideLoader():
    """Collects the outlier peptide infos of all proteins of a condition-pair tree.

    The result is available as ``outlier_peptides`` right after construction.
    """

    def __init__(self, condpair_tree):
        self._condpair_tree = condpair_tree
        self.outlier_peptides = []
        self._add_outlier_peptides()

    def _add_outlier_peptides(self):
        """Append the outlier peptide infos of every child of the tree."""
        for protnode in self._condpair_tree.children:
            peptide_infos = ProtnodeClusterCheckerPeptideInfos(protnode).get_outlier_peptide_infos()
            self.outlier_peptides.extend(peptide_infos)


class ProtnodeClusterCheckerPeptideInfos(ProtnodeClusterChecker):
    """Extends ProtnodeClusterChecker to report the single peptides of the outlier clusters."""

    def __init__(self, protnode):
        super().__init__(protnode)
        self._outlier_peptide_infos = []

    def get_outlier_peptide_infos(self):
        """Return one OutlierPeptideInfo per peptide of every outlier cluster.

        The list is rebuilt on every call, so calling the method repeatedly
        does not accumulate duplicate entries.
        """
        self._outlier_peptide_infos = []
        for clusterdiffinfo in self.get_diffclusts():
            self._update_outlier_peptide_infos(clusterdiffinfo)
        return self._outlier_peptide_infos

    def _update_outlier_peptide_infos(self, clusterdiffinfo):
        """Append an OutlierPeptideInfo for each outlier peptide of ``clusterdiffinfo``."""
        for peptide_node in self._get_outlier_peptide_nodes(clusterdiffinfo):
            self._outlier_peptide_infos.append(OutlierPeptideInfo(peptide_node))

    def _get_outlier_peptide_nodes(self, clusterdiffinfo):
        """Find the nodes named in ``outlier_peptide_names`` within the first two tree levels."""
        peptide_names = set(clusterdiffinfo.outlier_peptide_names)
        return anytree.findall(self._protnode, filter_= lambda x : x.name in peptide_names, maxlevel=2)


class ProteinInfo():
    """Holds the fold change of the "gene" level node above a peptide node."""

    def __init__(self, peptide_node):
        self.protein_fc = self._get_protein_fc(peptide_node)

    def _get_protein_fc(self, peptide_node):
        """Return the ``fc`` of the node found by ``aqutils.find_node_parent_at_level(..., "gene")``."""
        return aqutils.find_node_parent_at_level(peptide_node, "gene").fc


class OutlierPeptideInfo(ProteinInfo):
    """Summary of a single outlier peptide.

    Attributes:
        peptide_sequence: name of the peptide node.
        fc: fold change of the peptide node.
        protein_fc: fold change of the parent "gene" node.
        protnormed_fc: ``fc - protein_fc``.
        quality_score: score of the peptide, lower values are the ones kept by
            the quantile filters of this module.
        num_mainclust_peptides: number of sibling nodes (including the peptide
            itself) with ``cluster == 0``.
    """

    def __init__(self, peptide_node):
        super().__init__(peptide_node)
        self._peptide_node = peptide_node
        self.peptide_sequence = peptide_node.name
        self.fc = peptide_node.fc
        self.quality_score = self._get_quality_score(peptide_node)
        self.protnormed_fc = None
        self.num_mainclust_peptides = self._get_number_mainclust_peptides()
        self._calc_protnormed_fc()

    def _get_quality_score(self, peptide_node):
        """Return ``abs(predscore)`` if available, otherwise ``1/fraction_consistent``.

        The fallback is inverted so that it points in the same direction as
        ``abs(predscore)`` and as ``ClusterInfo._get_quality_score``: the
        filter configs of this module retain the LOWEST quality scores, so a
        highly consistent peptide has to yield a small value. A
        ``fraction_consistent`` of zero maps to ``inf`` instead of raising.
        """
        if hasattr(peptide_node, 'predscore'):
            return abs(peptide_node.predscore)
        fraction_consistent = peptide_node.fraction_consistent
        if fraction_consistent == 0:
            return float("inf")
        return 1/fraction_consistent

    def _calc_protnormed_fc(self):
        """Set ``protnormed_fc`` to the peptide fold change minus the protein fold change."""
        self.protnormed_fc = self.fc - self.protein_fc

    def _get_number_mainclust_peptides(self):
        """Count the children of the peptide's parent that are in cluster 0."""
        samelevel_nodes = self._peptide_node.parent.children
        return sum(1 for node in samelevel_nodes if node.cluster == 0)


In [6]:
#hide
def test_that_clusterdiff_peptides_have_expected_fold_changes():
    """The outlier peptide infos must match the simulated peptides outside of cluster 0."""
    protnode = ProtNodeCreator().get_simulated_protein_node()
    outlier_infos = ProtnodeClusterCheckerPeptideInfos(protnode).get_outlier_peptide_infos()
    expected_nodes = [node for node in protnode.children if node.cluster !=0]

    assert sorted(info.fc for info in outlier_infos) == sorted(node.fc for node in expected_nodes)
    assert sorted(info.peptide_sequence for info in outlier_infos) == sorted(node.name for node in expected_nodes)
    print("performed test")

# Run the outlier-peptide check on the simulated protein node when this cell is executed.
test_that_clusterdiff_peptides_have_expected_fold_changes()

performed test


## Modified peptide loader

Modified peptides are interesting because they can alter fold changes of unmodified peptides. The below classes extract relevant information about modified peptides to allow easier comparison with unmodified peptides. These modified peptides come from different experiments and hence need to be handled differently.

In [7]:
#export
import anytree


class ModifiedPeptideLoader():
    """Indexes the modified peptides of a tree that carry a specific modification.

    All nodes of type ``'mod_seq'`` within the first four tree levels are
    inspected. Those whose name contains ``specific_modification`` are stored
    under their peptide sequence; a later node with the same peptide sequence
    replaces an earlier one.
    """

    def __init__(self, condpair_tree, specific_modification = "[Phospho (STY)]"):
        self.specific_modification = specific_modification
        self.condpair_tree = condpair_tree
        self._pepname2modpep = {}
        self._load_modified_peptides_from_tree()

    def get_modpep_from_sequence(self, peptide_sequence):
        """Return the stored modified peptide for ``peptide_sequence`` or None."""
        return self._pepname2modpep.get(peptide_sequence)

    def _load_modified_peptides_from_tree(self):
        """Register every modified peptide node of the tree."""
        for mod_pep_node in self._get_modified_peptide_nodes():
            self._update_pepname2modpep(mod_pep_node)

    def _get_modified_peptide_nodes(self):
        """Return all nodes whose ``type`` attribute equals ``'mod_seq'``."""
        def is_modified_sequence_node(node):
            return getattr(node, 'type', "") == 'mod_seq'

        return anytree.search.findall(self.condpair_tree, is_modified_sequence_node, maxlevel=4)

    def _update_pepname2modpep(self, mod_pep_node):
        """Store the node's peptide if it carries the specific modification."""
        candidate = PeptideWithSpecificModification(mod_pep_node, self.specific_modification)
        if not candidate.specific_modification_found:
            return
        self._pepname2modpep[candidate.peptide_sequence] = candidate


class PeptideWithSpecificModification(OutlierPeptideInfo):
    """Modified peptide node that is checked for one specific modification.

    ``protein_name``, ``modified_sequence`` and ``specific_modification_found``
    are always set. ``peptide_sequence``, ``fc`` and ``quality_score`` are only
    set when the modification string occurs in the node name. The parent
    initialisers are not called, so ``protein_fc`` and ``protnormed_fc`` do
    not exist until they are assigned from outside.
    """

    def __init__(self, node_modpeptide, specific_modification= "[Phospho (STY)]"):
        self.protein_name = self._get_protein_name(node_modpeptide)
        self.modified_sequence = node_modpeptide.name
        self.specific_modification_found = self._check_for_specific_modification(specific_modification)
        if self.specific_modification_found:
            self.peptide_sequence = self._get_peptide_sequence(node_modpeptide)
            self.fc = node_modpeptide.fc
            self.quality_score = self._get_quality_score(node_modpeptide)

    def _check_for_specific_modification(self, specific_modification):
        """Return True if ``specific_modification`` is a substring of the modified sequence."""
        return specific_modification in self.modified_sequence

    def _get_peptide_sequence(self, node_modpeptide):
        """Return the name of the 'seq' level node above ``node_modpeptide``."""
        return aqutils.find_node_parent_at_level(node_modpeptide, level='seq').name

    def _get_protein_name(self, node_modpeptide):
        """Return the name of the 'gene' level node above ``node_modpeptide``."""
        return aqutils.find_node_parent_at_level(node_modpeptide, level='gene').name

In [8]:
#hide
class ModPepCreator():
    """Test helper that attaches a simulated phospho-modified node to a simulated peptide."""

    def __init__(self):
        # name of the peptide the modified node hangs on; filled by get_modpep_node
        self.peptide_sequence = None

    def get_modpep_node(self):
        """Create a 'mod_seq' node below the first simulated peptide and return it.

        The name of that peptide is stored in ``self.peptide_sequence``.
        """
        protnode = ProtNodeCreator().get_simulated_protein_node()
        pepnode = protnode.children[0] #just select first peptide
        self.peptide_sequence = pepnode.name
        modpep = anytree.Node(f"{pepnode.name}_{pepnode.name}[Phospho (STY)]",parent=pepnode)
        modpep.fc = 42
        modpep.predscore = -42
        modpep.type = 'mod_seq'
        return modpep


def test_that_modpep_is_handled_correctly():
    """The parsed modified peptide must report the simulated peptide and node names."""
    creator = ModPepCreator()
    simulated_node = creator.get_modpep_node()
    parsed_modpep = PeptideWithSpecificModification(simulated_node)

    assert parsed_modpep.peptide_sequence == creator.peptide_sequence
    assert parsed_modpep.modified_sequence == simulated_node.name
    print("performed test")


# Run the modified-peptide parsing check when this cell is executed.
test_that_modpep_is_handled_correctly()

performed test


In [9]:
#hide

def test_modfied_peptide_loading_from_tree():
    """Load oxidised peptides from the quicktest tree and look one of them up by sequence."""
    import alphaquant.diffquant_utils as aqutils

    modification = "[Oxidation (M)]"
    example_sequence = 'SEQ_SVEMHHEQLEQGVPGDNVGFNVK_'

    tree = aqutils.read_condpair_tree(COND1, COND2, QUICKTEST_RESULTS_DIR)
    loader = ModifiedPeptideLoader(tree, specific_modification=modification)
    assert len(loader._pepname2modpep) >0

    example_modified_sequence = loader.get_modpep_from_sequence(example_sequence).modified_sequence
    assert example_sequence in example_modified_sequence
    assert modification in example_modified_sequence

# Run the tree-based loading check; it reads the tree from QUICKTEST_RESULTS_DIR.
test_modfied_peptide_loading_from_tree()



## Complemented Clusters
Given a suitable dataset, we can compare outlier peptides with modified peptides. One question of interest is whether we can find matching pairs of unmodified and modified peptides. If we find such pairs, we can compare their direction of regulation and, for example, how it depends on the quality score.

In [10]:
#export
import numpy as np
class ComplementedClusterLoader():
    """Pairs outlier peptides with modified peptides of the same peptide sequence.

    The pairs are available as ``complemented_clusters`` right after construction.
    """

    def __init__(self, outlier_peptide_loader, modified_peptide_loader):
        self._outlier_peptides = outlier_peptide_loader.outlier_peptides
        self._modified_peptide_loader = modified_peptide_loader
        self.complemented_clusters = []
        self._find_complemented_clusters()

    def _find_complemented_clusters(self):
        """Create a ComplementedCluster for each outlier peptide that has a modified counterpart."""
        for outlier_peptide in self._outlier_peptides:
            modified_peptide = self._get_modified_peptide(outlier_peptide)
            if modified_peptide is None:
                continue
            self.complemented_clusters.append(ComplementedCluster(outlier_peptide, modified_peptide))

    def _get_modified_peptide(self, outlier_peptide):
        """Look up the modified peptide by the outlier's peptide sequence (None if absent)."""
        sequence = outlier_peptide.peptide_sequence
        return self._modified_peptide_loader.get_modpep_from_sequence(sequence)


class ComplementedCluster():
    """Pair of an outlier peptide and a modified peptide.

    On construction the modified peptide receives the ``protein_fc`` of the
    outlier peptide and its ``protnormed_fc`` is recalculated from it.
    """

    def __init__(self, outlier_peptide, modified_peptide):
        self.outlier_peptide = outlier_peptide
        self.modified_peptide = modified_peptide
        self._add_normfc_to_modpep()

    def has_opposite_regulation(self):
        """Return True if the two protein-normalised fold changes have opposite signs.

        A fold change of exactly zero has no direction, so a pair containing a
        zero is never reported as oppositely regulated.
        """
        outlier_sign = np.sign(self.outlier_peptide.protnormed_fc)
        modpep_sign = np.sign(self.modified_peptide.protnormed_fc)
        # the product is negative only for one positive and one negative sign
        return outlier_sign * modpep_sign < 0

    def get_quality_score(self):
        """Larger of the two peptides' quality scores."""
        return max(self.outlier_peptide.quality_score, self.modified_peptide.quality_score)

    def get_outlier_quality_score(self):
        """Quality score of the outlier peptide."""
        return self.outlier_peptide.quality_score

    def get_modpep_quality_score(self):
        """Quality score of the modified peptide."""
        return self.modified_peptide.quality_score

    def get_min_abs_normfc(self):
        """Smaller of the two absolute protein-normalised fold changes."""
        return min(abs(self.outlier_peptide.protnormed_fc), abs(self.modified_peptide.protnormed_fc))

    def get_max_abs_normfc(self):
        """Larger of the two absolute protein-normalised fold changes."""
        return max(abs(self.outlier_peptide.protnormed_fc), abs(self.modified_peptide.protnormed_fc))

    def get_outlier_abs_normfc(self):
        """Absolute protein-normalised fold change of the outlier peptide."""
        return abs(self.outlier_peptide.protnormed_fc)

    def get_ptm_abs_normfc(self):
        """Absolute protein-normalised fold change of the modified peptide."""
        return abs(self.modified_peptide.protnormed_fc)

    def get_ptm_abs_fc(self):
        """Absolute (not normalised) fold change of the modified peptide."""
        return abs(self.modified_peptide.fc)

    def get_number_mainclust_peptides(self):
        """Number of main-cluster peptides stored on the outlier peptide."""
        return self.outlier_peptide.num_mainclust_peptides

    def _add_normfc_to_modpep(self):
        """Normalise the modified peptide with the protein fold change of the outlier peptide."""
        self.modified_peptide.protein_fc = self.outlier_peptide.protein_fc
        self.modified_peptide._calc_protnormed_fc()



In [11]:
#hide

def test_complemented_cluster_retains_properties():
    """A ComplementedCluster must expose the two peptides it was built from."""
    outlier_peptide = load_outlier_peptide()
    modpepcreator = ModPepCreator()
    modpep_node = modpepcreator.get_modpep_node()
    complemented_cluster = ComplementedCluster(outlier_peptide, PeptideWithSpecificModification(modpep_node))

    stored_modpep = complemented_cluster.modified_peptide
    stored_outlier = complemented_cluster.outlier_peptide

    assert stored_modpep.peptide_sequence == modpepcreator.peptide_sequence
    assert stored_modpep.modified_sequence == modpep_node.name
    assert stored_outlier.peptide_sequence == outlier_peptide.peptide_sequence

    print("performed test")


def load_outlier_peptide():
    """Return the first outlier peptide info of the simulated protein node."""
    simulated_protein = ProtNodeCreator().get_simulated_protein_node()
    checker = ProtnodeClusterCheckerPeptideInfos(simulated_protein)
    return checker.get_outlier_peptide_infos()[0]


# Run the simulated complemented-cluster check when this cell is executed.
test_complemented_cluster_retains_properties()

performed test


In [14]:
#hide

def test_complemented_cluster_loading_from_tree():
    """Build complemented clusters from the quicktest tree and run all consistency checks."""
    import alphaquant.diffquant_utils as aqutils

    tree = aqutils.read_condpair_tree(COND1, COND2, QUICKTEST_RESULTS_DIR)
    modified_peptides = ModifiedPeptideLoader(tree, specific_modification="[Oxidation (M)]")
    outlier_peptides = OutlierPeptideLoader(tree)
    complemented_clusters = ComplementedClusterLoader(outlier_peptides, modified_peptides).complemented_clusters

    assert len(complemented_clusters)>0
    checks = (
        compare_against_hand_checked_cclust,
        check_that_there_are_no_pepname_duplicates,
        check_ccluster_list,
    )
    for run_check in checks:
        run_check(complemented_clusters)

def check_ccluster_list(complemented_clusters, print_details = False):
    """Assert that every outlier/modified-peptide pair and every outlier fold change is unique.

    With ``print_details`` the sequences and normalised fold changes of each
    cluster are printed.
    """
    distinct_outlier_fcs = {cclust.outlier_peptide.protnormed_fc for cclust in complemented_clusters}
    pair_labels = []
    for cclust in complemented_clusters:
        outlier = cclust.outlier_peptide
        modpep = cclust.modified_peptide
        if print_details:
            print(outlier.peptide_sequence)
            print(modpep.modified_sequence)
            print(f"{outlier.protnormed_fc} vs. {modpep.protnormed_fc}")

        pair_labels.append(f"{outlier.peptide_sequence}_{modpep.modified_sequence}")
    assert len(pair_labels)==len(set(pair_labels))
    assert len(pair_labels) == len(distinct_outlier_fcs)


def compare_against_hand_checked_cclust(complemented_clusters):
    """Compare one complemented cluster against values that were checked by hand.

    The cluster is selected by the modified sequence of its modified peptide;
    fold changes are compared for exact equality.
    """
    # reference values of the hand-checked example (gene and unmodified sequence are informational)
    gene = 'P32324'
    modseq_outlier = 'SEQ_EGPIFGEEMR_MOD__EGPIFGEEMR__'
    pepseq = 'SEQ_EGPIFGEEMR_'
    modseq = 'SEQ_EGPIFGEEMR_MOD__EGPIFGEEM[Oxidation (M)]R__'

    fc_modpep = -0.6681234989903941
    fc_outlier = 0.9317352548622448
    fc_protein = 0.09399380538814694

    matching_clusters = (c for c in complemented_clusters if c.modified_peptide.modified_sequence == modseq)
    cclust = next(matching_clusters)
    modpep = cclust.modified_peptide
    outlier = cclust.outlier_peptide

    assert modpep.peptide_sequence == pepseq
    assert modpep.fc == fc_modpep
    assert modpep.protnormed_fc ==  fc_modpep-fc_protein

    assert outlier.peptide_sequence == pepseq
    assert outlier.fc == fc_outlier
    assert outlier.protnormed_fc == fc_outlier - fc_protein
    print("performed tests")

def check_that_there_are_no_pepname_duplicates(complemented_clusters):
    """Assert that no outlier-sequence/modified-sequence combination occurs twice."""
    peptide_names = []
    for cclust in complemented_clusters:
        peptide_names.append(f"{cclust.outlier_peptide.peptide_sequence}_{cclust.modified_peptide.modified_sequence}")
    assert len(peptide_names) == len(set(peptide_names))





# Run the tree-based complemented-cluster checks; they read the tree from QUICKTEST_RESULTS_DIR.
test_complemented_cluster_loading_from_tree()

performed tests


## Complemented Cluster Evaluator

In [13]:
#export
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats


class ComplementedClusterEvaluator():
    """Plots and correlates the fold changes of filtered complemented clusters.

    The clusters of the loader are filtered with ``cluster_filterconfigs``
    through a ComplementedClusterFilterer. The protein-normalised fold changes
    of the outlier and modified peptides are then cached as two parallel lists.
    """

    def __init__(self, complemented_clusterloader, cluster_filterconfigs):
        self._complemented_clusters = self._load_complemented_clusters(complemented_clusterloader, cluster_filterconfigs)
        self._fcs_outliers = None
        self._fcs_modpeps = None
        self._assign_fold_change_lists()

    def compare_regulation_directions(self, ax):
        """Draw a histogram of 0/1 flags (1 = opposite regulation) on ``ax``."""
        opposite_flags = []
        for cclust in self._complemented_clusters:
            opposite_flags.append(int(cclust.has_opposite_regulation()))
        self._plot_regulation_direction_histogram(ax, opposite_flags)

    def scatter_fold_changes(self,ax):
        """Scatter outlier against modified-peptide fold changes and print the sign statistics."""
        fc_pairs = list(zip(self._fcs_outliers, self._fcs_modpeps))
        num_opposite = sum([np.sign(outlier_fc)==-np.sign(modpep_fc) for outlier_fc, modpep_fc in fc_pairs])
        num_same = sum([np.sign(outlier_fc)==np.sign(modpep_fc) for outlier_fc, modpep_fc in fc_pairs])
        print(f"{num_same} same, {num_opposite} opposite")

        sns.scatterplot(x =self._fcs_outliers, y=self._fcs_modpeps,ax=ax)
        ax.set_xlabel("outliers")
        ax.set_ylabel("modified_peptides")
        self._set_axis_limits(ax)
        self._draw_horizontal_vertical_line(ax)

    def calculate_correlation(self):
        """Return the Pearson correlation of both fold change lists; the p-value is printed."""
        correlation, pvalue = scipy.stats.pearsonr(self._fcs_outliers, self._fcs_modpeps)
        print(f"pval is {pvalue}")
        return correlation

    def _assign_fold_change_lists(self):
        """Cache the protein-normalised fold changes of all clusters."""
        outlier_fcs = []
        modpep_fcs = []
        for cclust in self._complemented_clusters:
            outlier_fcs.append(cclust.outlier_peptide.protnormed_fc)
        for cclust in self._complemented_clusters:
            modpep_fcs.append(cclust.modified_peptide.protnormed_fc)
        self._fcs_outliers = outlier_fcs
        self._fcs_modpeps = modpep_fcs

    def _set_axis_limits(self,ax):
        """Make both axes symmetric around zero using the most extreme current limit."""
        current_limits = ax.get_xlim() + ax.get_ylim() #returns tuples with the lims
        extreme = max(map(abs, current_limits))
        ax.set_xlim(-extreme, extreme)
        ax.set_ylim(-extreme, extreme)

    def _draw_horizontal_vertical_line(self, ax):
        """Draw black lines through y=0 and x=0 that span the current axis limits."""
        x_low = ax.get_xlim()[0]
        x_high = ax.get_xlim()[1]
        ax.hlines(y=0, xmin=x_low, xmax=x_high, colors='black')
        y_low = ax.get_ylim()[0]
        y_high = ax.get_ylim()[1]
        ax.vlines(x=0, ymin = y_low, ymax = y_high, colors='black')

    @staticmethod
    def _load_complemented_clusters(complemented_clusterloader, cluster_filterconfigs):
        """Return the loader's clusters after filtering with ``cluster_filterconfigs``."""
        return ComplementedClusterFilterer(complemented_clusterloader, cluster_filterconfigs).get_filtered_complemented_clusters()

    @staticmethod
    def _plot_regulation_direction_histogram(ax, opposite_regulation_overview):
        """Plot the 0/1 flags as a histogram on ``ax``."""
        ax.hist(opposite_regulation_overview)




## Ranking and Filtering Clusters and Outliers

In [8]:
#export
class QuantileFilterer():
    def __init__(self, objects_to_filter, filterconfigs):
        """Filters objects (e.g. ClusterDiffInfos) by quantiles of their properties.

        Every filter config names a property of the objects, either an
        instance attribute or a method without arguments, which yields a
        sortable scalar. The quantiles are defined with respect to this scalar.

        Args:
            objects_to_filter (list): hashable objects that expose the
                properties named in the filter configs.
            filterconfigs: ``None`` (no filtering), an iterable of objects with
                the attributes ``property_name``, ``quantile`` and
                ``quantile_starts_at_lowest``, or a container object that
                exposes such an iterable as its ``filterconfigs`` attribute.
        """

        self._objects_to_filter = objects_to_filter
        # container objects carry the actual list of configs in `.filterconfigs`
        self._filterconfigs = getattr(filterconfigs, "filterconfigs", filterconfigs)

    def get_filtered_list_of_objects(self):
        """Return the objects that pass all filter configs.

        Without filter configs (``None``) the original list is returned
        unchanged. Otherwise a new list is returned that contains every passing
        object once, in the order of ``objects_to_filter``.
        """
        if self._filterconfigs is None:
            return self._objects_to_filter
        return self._filter_objects()

    def _filter_objects(self):
        """Intersect the quantiles that are taken for each config on the COMPLETE object list.

        Alternatively one could filter the quantiles successively; here every
        config is evaluated independently and only objects that are retained
        by all of them survive. An empty config list retains every object.
        """
        retained_per_config = [
            set(self._filter_to_property_quantile(filterconf)) for filterconf in self._filterconfigs
        ]
        survivors = []
        already_added = set()
        # walk the input list so that the result order is deterministic
        for obj in self._objects_to_filter:
            if obj in already_added:
                continue
            if all(obj in retained for retained in retained_per_config):
                survivors.append(obj)
                already_added.add(obj)
        return survivors

    def _filter_to_property_quantile(self, filterconf):#get the quantiles with the best property scores i.e. quality score
        """Return the requested quantile of the objects, sorted ascending by the property."""
        property_sorted = self._sort_objects_to_filter_by_score(filterconf.property_name)

        if filterconf.quantile_starts_at_lowest:
            number_to_retain = int(filterconf.quantile * len(property_sorted))
            return property_sorted[:number_to_retain]
        else:
            number_to_discard = int((1-filterconf.quantile) * len(property_sorted))
            return property_sorted[number_to_discard:]

    def _sort_objects_to_filter_by_score(self, property_name):
        """Sort ascending by the property, which may be an attribute or a zero-argument method."""
        return sorted(self._objects_to_filter, key = lambda x : self._get_property_value(x, property_name))

    @staticmethod
    def _get_property_value(obj, property_name):
        """Read ``property_name`` from ``obj`` and call it if it is callable."""
        value = getattr(obj, property_name)
        return value() if callable(value) else value

    def _property_encodes_instance_variable(self, property_name):
        """Sort by the raw attribute value (kept for backward compatibility)."""
        return sorted(self._objects_to_filter, key = lambda x : getattr(x, property_name))

    def _property_encodes_function_name(self, property_name):
        """Sort by the return value of the named method (kept for backward compatibility)."""
        return sorted(self._objects_to_filter, key = lambda x : getattr(x, property_name)())



class FilterConfig():
    """One quantile criterion for a QuantileFilterer.

    Attributes are read by ``QuantileFilterer._filter_to_property_quantile``.
    """
    def __init__(self, property_name, quantile, quantile_starts_at_lowest):
        # name of the attribute or zero-argument method the objects are sorted by
        self.property_name = property_name
        # fraction of the objects to retain
        self.quantile = quantile
        # True: retain the objects with the lowest values, False: those with the highest values
        self.quantile_starts_at_lowest = quantile_starts_at_lowest



### Test Filtering

In [37]:
#hide


def test_object_filtering():
    """Run the quantile filter checks on two sets of simulated objects."""
    obj_creator = FilterObjectCreator()
    # all three properties rise together
    allsame_objects = obj_creator.get_all_same_objects()
    # the second property runs opposite to the first and third
    second_inverted_objects = obj_creator.get_second_object_inverted()
    check_allsame(allsame_objects)
    check_inverted(second_inverted_objects)



class FilterObjectCreator():
    """Test helper that creates five FilterObjects from three value vectors."""

    def get_all_same_objects(self):
        """Objects whose three values are identical (0 to 4)."""
        ascending = [0, 1, 2, 3, 4]
        return self._make_filterobjects_from_vectors(ascending, list(ascending), list(ascending))

    def get_second_object_inverted(self):
        """Objects whose second value runs from 4 down to 0 while the others run from 0 to 4."""
        ascending = [0, 1, 2, 3, 4]
        descending = [4, 3, 2, 1, 0]
        return self._make_filterobjects_from_vectors(ascending, descending, list(ascending))

    def _make_filterobjects_from_vectors(self, first_values, second_values, third_values):
        """Create one FilterObject per index of ``first_values``."""
        return [
            FilterObject(first_values[idx], second_values[idx], third_values[idx])
            for idx in range(len(first_values))
        ]


class FilterObject():
    """Test object with three values that are available as attributes and via getter methods."""
    def __init__(self, first_value, second_value, third_value):
        self.first_value = first_value
        self.second_value = second_value
        self.third_value = third_value

    def get_first_value(self):
        """Return ``first_value``."""
        return self.first_value

    def get_second_value(self):
        """Return ``second_value``."""
        return self.second_value

    def get_third_value(self):
        """Return ``third_value``."""
        return self.third_value

def get_filterconfig_list(quantile):
    """Return three FilterConfigs (one per getter of FilterObject) that keep the highest ``quantile``."""
    getter_names = ("get_first_value", "get_second_value", "get_third_value")
    return [FilterConfig(getter_name, quantile, False) for getter_name in getter_names]


def check_allsame(allsame_objects):
    """With identical properties, the top quantile of one property is the top quantile of all."""
    expected_by_quantile = (
        (0.6, [4, 3, 2]),
        (1, [4, 3, 2, 1, 0]),
        (0.1, [4]),
    )
    for quantile, expected_first_values in expected_by_quantile:
        filterer = QuantileFilterer(allsame_objects, filterconfigs= get_filterconfig_list(quantile))
        filtlist = filterer.get_filtered_list_of_objects()
        assert sorted([x.first_value for x in filtlist]) == sorted(expected_first_values)


def check_inverted(inverted_objects):
    """With one inverted property, only objects in the overlap of the quantiles are retained."""
    expected_by_quantile = (
        (0.79, [3, 2, 1]),
        (1, [4, 3, 2, 1, 0]),
        (0.39, []),
    )
    for quantile, expected_first_values in expected_by_quantile:
        filterer = QuantileFilterer(inverted_objects, filterconfigs= get_filterconfig_list(quantile))
        filtlist = filterer.get_filtered_list_of_objects()
        assert sorted([x.first_value for x in filtlist]) == sorted(expected_first_values)



# Run the quantile filter checks when this cell is executed.
test_object_filtering()

In [None]:
#export
class ComplementedClusterFilterer(QuantileFilterer):
    """QuantileFilterer over the ``complemented_clusters`` of a cluster loader."""
    def __init__(self, complemented_clusterloader, clusterfilterconfigs):
        # the filter configs are handed to QuantileFilterer unchanged
        super().__init__(complemented_clusterloader.complemented_clusters, clusterfilterconfigs)

    def get_filtered_complemented_clusters(self):
        """Return the result of ``get_filtered_list_of_objects``."""
        return self.get_filtered_list_of_objects()
    


class ComplementedClusterFilterConfigs():
    """Builds the list of FilterConfigs for filtering ComplementedCluster objects.

    Each constructor argument is the quantile for one ComplementedCluster
    getter. The resulting configs are stored in ``filterconfigs``. The two
    quality score configs retain the lowest values, all others the highest.
    """

    def __init__(self, min_abs_normfc_quantile = 1, ptm_abs_normfc_quantile = 1, outlier_abs_normfc_quantile = 1, ptm_absfc_quantile = 1,
    modpep_quality_quantile = 1, outlier_quality_quantile = 1, number_mainclustpeps_quantile = 1):
        self.filterconfigs = []
        self._min_abs_normfc_quantile = min_abs_normfc_quantile
        self._ptm_abs_normfc_quantile = ptm_abs_normfc_quantile
        self._outlier_abs_normfc_quantile = outlier_abs_normfc_quantile
        self._ptm_absfc_quantile = ptm_absfc_quantile
        self._outlier_quality_quantile = outlier_quality_quantile
        self._modpep_quality_quantile = modpep_quality_quantile
        self._number_mainclustpeps_quantile = number_mainclustpeps_quantile
        self._initialize_filter_configs()

    def _initialize_filter_configs(self):
        """Append one FilterConfig per criterion: (getter name, quantile, retain lowest values)."""
        criteria = (
            ("get_min_abs_normfc", self._min_abs_normfc_quantile, False),
            ("get_ptm_abs_normfc", self._ptm_abs_normfc_quantile, False),
            ("get_ptm_abs_fc", self._ptm_absfc_quantile, False),
            ("get_outlier_abs_normfc", self._outlier_abs_normfc_quantile, False),
            ("get_outlier_quality_score", self._outlier_quality_quantile, True),
            ("get_modpep_quality_score", self._modpep_quality_quantile, True),
            ("get_number_mainclust_peptides", self._number_mainclustpeps_quantile, False),
        )
        for property_name, quantile, starts_at_lowest in criteria:
            self.filterconfigs.append(FilterConfig(property_name, quantile, starts_at_lowest))




In [None]:
#export

class DiffClusterFilterer(QuantileFilterer):
    """QuantileFilterer over a list of cluster differences."""
    def __init__(self, diffclust_list, diffclustfilterconfigs):
        # both arguments are handed to QuantileFilterer unchanged
        super().__init__(diffclust_list, diffclustfilterconfigs)

    def get_filtered_diffclust_list(self):
        """Return the result of ``get_filtered_list_of_objects``."""
        return self.get_filtered_list_of_objects()
    

class DiffClusterFilterConfig(FilterConfig):
    """Builds the list of FilterConfigs for filtering cluster differences.

    The configs are stored in ``filterconfigs``. The initialiser of
    FilterConfig is not called, so instances have none of its attributes.
    The quality score config retains the lowest values, all others the highest.
    """

    def __init__(self, fcdiff_quantile = 1, quality_score_quantile = 1, num_mainclust_peptides_quantile = 1, num_outlierclust_peptides_quantile = 1):
        self.filterconfigs = []
        self._fcdiff_quantile = fcdiff_quantile
        self._quality_score_quantile = quality_score_quantile
        self._num_mainclust_peptides_quantile = num_mainclust_peptides_quantile
        self._num_outlierclust_peptides_quantile = num_outlierclust_peptides_quantile
        self._initialize_filter_configs()

    def _initialize_filter_configs(self):
        """Append one FilterConfig per criterion: (property name, quantile, retain lowest values)."""
        criteria = (
            ("fcdiff", self._fcdiff_quantile, False),
            ("quality_score", self._quality_score_quantile, True),
            ("get_num_mainclust_peptides", self._num_mainclust_peptides_quantile, False),
            ("get_num_outlierclust_peptides", self._num_outlierclust_peptides_quantile, False),
        )
        for property_name, quantile, starts_at_lowest in criteria:
            self.filterconfigs.append(FilterConfig(property_name, quantile, starts_at_lowest))




In [None]:
#export

class OutlierPeptideFilterer(QuantileFilterer):
    """QuantileFilterer over a list of outlier peptides."""
    def __init__(self, outlier_peptide_list, outlierpeptide_filterconfigs):
        # both arguments are handed to QuantileFilterer unchanged
        super().__init__(outlier_peptide_list, outlierpeptide_filterconfigs)

    def get_filtered_outlier_peptide_list(self):
        """Return the result of ``get_filtered_list_of_objects``."""
        return self.get_filtered_list_of_objects()
    

class OutlierPeptideFilterConfigs(FilterConfig):
    """Builds the list of FilterConfigs for filtering outlier peptides.

    The configs are stored in ``filterconfigs``. The initialiser of
    FilterConfig is not called, so instances have none of its attributes.
    The quality score config retains the lowest values, the others the highest.
    """

    def __init__(self, quality_score_quantile = 1, num_mainclust_peptides_quantile = 1, protnormed_fc_quantile = 1):
        self.filterconfigs = []
        self._quality_score_quantile = quality_score_quantile
        self._num_mainclust_peptides_quantile = num_mainclust_peptides_quantile
        self._protnormed_fc_quantile = protnormed_fc_quantile
        self._initialize_filter_configs()

    def _initialize_filter_configs(self):
        """Append one FilterConfig per criterion: (attribute name, quantile, retain lowest values)."""
        criteria = (
            ("quality_score", self._quality_score_quantile, True),
            ("num_mainclust_peptides", self._num_mainclust_peptides_quantile, False),
            ("protnormed_fc", self._protnormed_fc_quantile, False),
        )
        for property_name, quantile, starts_at_lowest in criteria:
            self.filterconfigs.append(FilterConfig(property_name, quantile, starts_at_lowest))