In [None]:
# default_exp outlier_scoring

In [None]:
#hide
#links and constants
# Results directory handed to aqclustutils.read_condpair_tree by the tree-loading tests in this notebook.
QUICKTEST_RESULTS_DIR = "../test_data/unit_tests/cluster_ions/outdir_quicktest"
# The two condition names passed (in this order) to read_condpair_tree together with the directory above.
COND1 = "Y10"
COND2 = "Y1"



# Outlier scoring

The default approach of AlphaQuant is to collect the most consistent set of peptides for protein quantification. Potentially, however, there can be valuable information in non-consistent, i.e. outlier peptides, which can be indicative of proteoforms. In this module, we perform a systematic evaluation of outlying groups of peptides in order to determine promising candidates for further investigation.

In particular, we:

1) Load the clustered tree data and retrieve all proteins with their clustered child peptides
2) Compare the fold changes of the main cluster with the fold changes of the outlier-clusters
3) Filter by both the magnitude of the fold change difference and the quality score
4) Return ranked list


## Module test

In [None]:
#hide
import alphaquant.cluster.outlier_scoring as aqoutlier
import alphaquant.cluster.cluster_ions as aqclust
import anytree


def test_that_clusterdiff_infos_have_expected_peptides_and_fold_changes():
    """Compare the cluster-difference infos of the simulated protein with hand-derived references."""
    simulated_protein = ProtNodeCreator().get_simulated_protein_node()
    diffclusts = aqoutlier.ProtnodeClusterChecker(simulated_protein).get_diffclusts()

    observed_fcdiffs = []
    observed_scores = []
    observed_peptides = []
    for diffclust in diffclusts:
        observed_fcdiffs.append(diffclust.fcdiff)
        observed_scores.append(diffclust.quality_score)
        observed_peptides.append(diffclust.peptide_names)

    # Reference values worked out by hand from the ProtNodeCreator settings:
    # cluster 0 has fc -1.5, clusters 1 and 2 have fc 3.5 and 10.
    expected_fcdiffs = [5, 11.5]
    expected_scores = [0.3, 0.4]
    mainclust_names = [f"pep{x}" for x in range(5)]
    expected_peptides = [
        mainclust_names + [f"pep{x}" for x in range(5, 7)],
        mainclust_names + [f"pep{x}" for x in range(7, 10)],
    ]

    # Order of the returned infos is not part of the contract, so compare sorted.
    assert sorted(observed_fcdiffs) == sorted(expected_fcdiffs)
    assert sorted(observed_scores) == sorted(expected_scores)
    assert sorted(observed_peptides) == sorted(expected_peptides)
    print("performed test")
    

class ProtNodeCreator():
    """Builds a small simulated protein tree for the tests in this notebook.

    The protein gets ten peptide children ("pep0" .. "pep9") split into three
    clusters: five peptides in cluster 0, two in cluster 1 and three in cluster 2.
    Fold change and predscore are constant within each cluster.
    """
    def __init__(self):
        # Parallel lists: entry i of every list describes peptide i.
        self.names = [f"pep{x}" for x in range(10)]
        self.fold_changes = [-1.5] * 5 + [3.5] * 2 + [10] * 3
        self.clusters = [0] * 5 + [1] * 2 + [2] * 3
        self.predscores = [0.1] * 5 + [0.3] * 2 + [0.4] * 3

    def get_simulated_protein_node(self):
        """Return a 'gene'-typed anytree node with the simulated peptides attached as children."""
        prot = anytree.Node(name="prot1", type = "gene")
        prot.fc = 1
        prot.num_clusters = 3
        self._add_peptide_nodes(prot)
        return prot

    def _add_peptide_nodes(self, protein_node):
        """Attach one 'seq'-typed child node per configured peptide to protein_node."""
        # Iterate over the configured lists themselves instead of a hard-coded
        # count, so the number of peptides is defined in exactly one place.
        peptide_specs = zip(self.names, self.fold_changes, self.clusters, self.predscores)
        for name, fold_change, cluster, predscore in peptide_specs:
            pepnode = anytree.Node(name)
            pepnode.fc = fold_change
            pepnode.cluster = cluster
            pepnode.predscore = predscore
            pepnode.type = "seq"
            pepnode.parent = protein_node

test_that_clusterdiff_infos_have_expected_peptides_and_fold_changes()




## PeptideInfos
We extend the `ProtnodeClusterChecker` for the case that we are interested in single 'outlier' peptides

In [None]:
#export

import alphaquant.cluster.cluster_utils as aqclustutils

class OutlierPeptideLoader():
    """Gathers the outlier peptide infos of all children of a condition-pair tree.

    After construction, `outlier_peptides` holds the concatenated results of
    `ProtnodeClusterCheckerPeptideInfos.get_outlier_peptide_infos` for every
    child node of `condpair_tree`.
    """
    def __init__(self, condpair_tree):
        self._condpair_tree = condpair_tree
        self.outlier_peptides = []
        self._add_outlier_peptides()

    def _add_outlier_peptides(self):
        """Run the cluster checker on each child of the tree and collect its outlier infos."""
        for protein_node in self._condpair_tree.children:
            checker = ProtnodeClusterCheckerPeptideInfos(protein_node)
            self.outlier_peptides.extend(checker.get_outlier_peptide_infos())


class ProtnodeClusterCheckerPeptideInfos(aqoutlier.ProtnodeClusterChecker):
    """Cluster checker that reports the individual outlier peptides of a protein node.

    For every cluster-difference info returned by `get_diffclusts`, the peptide
    nodes named in its `outlier_peptide_names` are looked up below the protein
    node and wrapped in `OutlierPeptideInfo` objects.

    NOTE(review): this class sits in an `#export` cell but uses `aqoutlier` and
    `anytree`, which are imported in a `#hide` cell - confirm both names are in
    scope wherever this code ends up running.
    """
    def __init__(self, protnode):
        super().__init__(protnode)
        self._outlier_peptide_infos = []

    def get_outlier_peptide_infos(self):
        """Return a list with one OutlierPeptideInfo per outlier peptide node.

        The list is rebuilt on every call, so calling this method repeatedly
        yields the same content instead of accumulating duplicates.
        """
        # Start from a fresh list: the helper below appends to the instance
        # attribute, and without the reset a second call would return every
        # peptide twice.
        self._outlier_peptide_infos = []
        for clusterdiffinfo in self.get_diffclusts():
            self._update_outlier_peptide_infos(clusterdiffinfo)
        return self._outlier_peptide_infos

    def _update_outlier_peptide_infos(self, clusterdiffinfo):
        """Append an OutlierPeptideInfo for each outlier peptide node of clusterdiffinfo."""
        for peptide_node in self._get_outlier_peptide_nodes(clusterdiffinfo):
            self._outlier_peptide_infos.append(OutlierPeptideInfo(peptide_node))

    def _get_outlier_peptide_nodes(self, clusterdiffinfo):
        """Find the nodes below the protein node whose name is listed as an outlier peptide."""
        # A set gives constant-time membership checks inside the filter.
        peptide_names = set(clusterdiffinfo.outlier_peptide_names)
        # self._protnode is not set in this class; presumably the base class stores it - verify.
        return anytree.findall(self._protnode, filter_= lambda x : x.name in peptide_names, maxlevel=2)


class ProteinInfo():
    """Holds the fold change of the 'gene'-level node that a peptide node belongs to."""
    def __init__(self, peptide_node):
        self.protein_fc = self._get_protein_fc(peptide_node)

    def _get_protein_fc(self, peptide_node):
        """Return the `fc` of the node found by find_node_parent_at_level(peptide_node, "gene")."""
        gene_node = aqclustutils.find_node_parent_at_level(peptide_node, "gene")
        return gene_node.fc


class OutlierPeptideInfo(ProteinInfo):
    """Summary of a single outlier peptide node.

    Attributes set on construction:
        peptide_sequence: name of the peptide node
        fc: fold change of the peptide node
        quality_score: see `_get_quality_score`
        num_mainclust_peptides: number of sibling nodes (including the node itself) with cluster 0
        protnormed_fc: peptide fold change minus protein fold change
    """
    def __init__(self, peptide_node):
        super().__init__(peptide_node)
        self._peptide_node = peptide_node
        self.peptide_sequence = peptide_node.name
        self.fc = peptide_node.fc
        self.quality_score = self._get_quality_score(peptide_node)
        self.protnormed_fc = None
        self.num_mainclust_peptides = self._get_number_mainclust_peptides()
        self._calc_protnormed_fc()

    def _get_quality_score(self, peptide_node):
        """Return |predscore| if the node has a predscore, otherwise 1/fraction_consistent."""
        if not hasattr(peptide_node, 'predscore'):
            return 1/peptide_node.fraction_consistent
        return abs(peptide_node.predscore)

    def _calc_protnormed_fc(self):
        """Store the peptide fold change relative to the protein fold change."""
        self.protnormed_fc = self.fc - self.protein_fc

    def _get_number_mainclust_peptides(self):
        """Count the children of the peptide's parent whose cluster is 0."""
        siblings = self._peptide_node.parent.children
        return sum(1 for node in siblings if node.cluster == 0)


In [None]:
#hide
def test_that_clusterdiff_peptides_have_expected_fold_changes():
    """The outlier infos must cover exactly the simulated peptides outside cluster 0."""
    protnode = ProtNodeCreator().get_simulated_protein_node()
    outlier_infos = ProtnodeClusterCheckerPeptideInfos(protnode).get_outlier_peptide_infos()

    # Every child that is not in the main cluster (cluster 0) is expected to be reported.
    non_main_children = [child for child in protnode.children if child.cluster != 0]

    observed_fcs = sorted(info.fc for info in outlier_infos)
    expected_fcs = sorted(child.fc for child in non_main_children)
    observed_names = sorted(info.peptide_sequence for info in outlier_infos)
    expected_names = sorted(child.name for child in non_main_children)

    assert observed_fcs == expected_fcs
    assert observed_names == expected_names
    print("performed test")

test_that_clusterdiff_peptides_have_expected_fold_changes()

## Modified peptide loader

Modified peptides are interesting because they can alter fold changes of unmodified peptides. The below classes extract relevant information about modified peptides to allow easier comparison with unmodified peptides. These modified peptides come from different experiments and hence need to be handled differently.

In [None]:
#hide
class ModPepCreator():
    """Creates a simulated modified-peptide node for testing.

    `peptide_sequence` is None until `get_modpep_node` has been called; afterwards
    it holds the name of the peptide node the modified peptide was attached to.
    """
    def __init__(self):
        # Must be an instance attribute: a plain local assignment would be
        # discarded and leave the attribute undefined until get_modpep_node runs.
        self.peptide_sequence = None

    def get_modpep_node(self):
        """Return a 'mod_seq' node attached below the first peptide of a simulated protein."""
        protnode = ProtNodeCreator().get_simulated_protein_node()
        pepnode = protnode.children[0] #just select first peptide
        self.peptide_sequence = pepnode.name
        modpep = anytree.Node(f"{pepnode.name}_{pepnode.name}[Phospho (STY)]",parent=pepnode)
        # Fixed dummy values for fold change and score.
        modpep.fc = 42
        modpep.predscore = -42
        modpep.type = 'mod_seq'
        return modpep


def test_that_modpep_is_handled_correctly():
    """PeptideWithSpecificModification must expose the parent peptide name and the node's own name."""
    creator = ModPepCreator()
    node = creator.get_modpep_node()
    wrapped = aqoutlier.PeptideWithSpecificModification(node)

    assert wrapped.peptide_sequence == creator.peptide_sequence
    assert wrapped.modified_sequence == node.name
    print("performed test")


test_that_modpep_is_handled_correctly()

In [None]:
#hide

def test_modfied_peptide_loading_from_tree():
    """Load oxidised peptides from the quicktest tree and look one up by its peptide sequence."""
    modification = "[Oxidation (M)]"
    example_sequence = 'SEQ_SVEMHHEQLEQGVPGDNVGFNVK_'

    tree = aqclustutils.read_condpair_tree(COND1, COND2, QUICKTEST_RESULTS_DIR)
    loader = aqoutlier.ModifiedPeptideLoader(tree, specific_modification=modification)
    assert len(loader._pepname2modpep) >0

    # The retrieved entry must carry both the peptide sequence and the modification tag.
    example = loader.get_modpep_from_sequence(example_sequence)
    assert example_sequence in example.modified_sequence
    assert modification in example.modified_sequence

test_modfied_peptide_loading_from_tree()



## Complemented Clusters
One thing we can do is compare outlier peptides with modified peptides, provided we have a good dataset. A question of interest is, for example, whether we can find matching pairs of unmodified and modified peptides. If we find such pairs, we can compare the direction of regulation and, for example, its dependence on the quality score.

In [None]:
#hide

def test_complemented_cluster_retains_properties():
    """A ComplementedCluster must hand back the outlier and modified peptide it was built from."""
    source_outlier = load_outlier_peptide()
    creator = ModPepCreator()
    source_node = creator.get_modpep_node()
    source_modpep = aqoutlier.PeptideWithSpecificModification(source_node)

    cluster = aqoutlier.ComplementedCluster(source_outlier, source_modpep)
    stored_modpep = cluster.modified_peptide
    stored_outlier = cluster.outlier_peptide

    assert stored_modpep.peptide_sequence == creator.peptide_sequence
    assert stored_modpep.modified_sequence == source_node.name
    assert stored_outlier.peptide_sequence == source_outlier.peptide_sequence

    print("performed test")


def load_outlier_peptide():
    """Return the first outlier peptide info found for the simulated protein node."""
    simulated_protein = ProtNodeCreator().get_simulated_protein_node()
    checker = ProtnodeClusterCheckerPeptideInfos(simulated_protein)
    return checker.get_outlier_peptide_infos()[0]


test_complemented_cluster_retains_properties()

In [None]:
#hide

def test_complemented_cluster_loading_from_tree():
    """Build complemented clusters from the quicktest tree and run all consistency checks on them."""
    tree = aqclustutils.read_condpair_tree(COND1, COND2, QUICKTEST_RESULTS_DIR)
    modified_peptides = aqoutlier.ModifiedPeptideLoader(tree, specific_modification="[Oxidation (M)]")
    outlier_peptides = aqoutlier.OutlierPeptideLoader(tree)
    clusters = aqoutlier.ComplementedClusterLoader(outlier_peptides, modified_peptides).complemented_clusters

    assert len(clusters)>0
    compare_against_hand_checked_cclust(clusters)
    check_that_there_are_no_pepname_duplicates(clusters)
    check_ccluster_list(clusters)

def check_ccluster_list(complemented_clusters, print_details = False):
    """Assert that outlier/modified pairs are unique and that each has a distinct outlier fold change.

    Args:
        complemented_clusters: objects exposing `outlier_peptide` and `modified_peptide`.
        print_details: if True, print both sequence names and the protein-normalised
            fold changes of every cluster.
    """
    distinct_outlier_fcs = {cclust.outlier_peptide.protnormed_fc for cclust in complemented_clusters}

    pair_labels = []
    for cclust in complemented_clusters:
        outlier = cclust.outlier_peptide
        modified = cclust.modified_peptide
        if print_details:
            print(outlier.peptide_sequence)
            print(modified.modified_sequence)
            print(f"{outlier.protnormed_fc} vs. {modified.protnormed_fc}")
        pair_labels.append(f"{outlier.peptide_sequence}_{modified.modified_sequence}")

    # No pair may occur twice ...
    assert len(pair_labels)==len(set(pair_labels))
    # ... and there must be as many distinct outlier fold changes as pairs.
    assert len(pair_labels) == len(distinct_outlier_fcs)


def compare_against_hand_checked_cclust(complemented_clusters):
    """Check one manually verified complemented cluster against its reference values.

    The cluster is identified by the modified sequence of its modified peptide.
    Peptide sequence, fold change and protein-normalised fold change are compared
    for both the modified and the outlier peptide.

    Raises:
        AssertionError: if the reference cluster is missing or any value deviates.
    """
    # Hand-checked reference case. The original notes additionally record
    # gene 'P32324' and the name 'SEQ_EGPIFGEEMR_MOD__EGPIFGEEMR__' for the
    # outlier; neither is asserted on.
    pepseq = 'SEQ_EGPIFGEEMR_'
    modseq = 'SEQ_EGPIFGEEMR_MOD__EGPIFGEEM[Oxidation (M)]R__'

    fc_modpep = -0.6681234989903941
    fc_outlier = 0.9317352548622448
    fc_protein = 0.09399380538814694

    # Use next() with a default so that a missing reference cluster fails with
    # a descriptive assertion instead of a bare StopIteration.
    cclust = next((x for x in complemented_clusters if x.modified_peptide.modified_sequence == modseq), None)
    assert cclust is not None, f"no complemented cluster found for modified sequence {modseq}"

    assert cclust.modified_peptide.peptide_sequence == pepseq
    assert cclust.modified_peptide.fc == fc_modpep
    assert cclust.modified_peptide.protnormed_fc ==  fc_modpep-fc_protein

    assert cclust.outlier_peptide.peptide_sequence == pepseq
    assert cclust.outlier_peptide.fc == fc_outlier
    assert cclust.outlier_peptide.protnormed_fc == fc_outlier - fc_protein
    print("performed tests")

def check_that_there_are_no_pepname_duplicates(complemented_clusters):
    """Assert that no outlier/modified peptide combination appears more than once."""
    pair_labels = []
    for cclust in complemented_clusters:
        outlier_name = cclust.outlier_peptide.peptide_sequence
        modified_name = cclust.modified_peptide.modified_sequence
        pair_labels.append(f"{outlier_name}_{modified_name}")
    assert len(pair_labels) == len(set(pair_labels))





test_complemented_cluster_loading_from_tree()

## Complemented Cluster Evaluator

## Ranking and Filtering Clusters and Outliers

### Test Filtering

In [None]:
#hide


def test_object_filtering():
    """Run the quantile-filter checks on both simulated object sets."""
    creator = FilterObjectCreator()
    identical_value_objects = creator.get_all_same_objects()
    inverted_value_objects = creator.get_second_object_inverted()

    check_allsame(identical_value_objects)
    check_inverted(inverted_value_objects)



class FilterObjectCreator():
    """Produces lists of FilterObject instances with known value patterns."""

    def get_all_same_objects(self):
        """Five objects whose three values are all identical (0..4)."""
        ascending = [0, 1, 2, 3, 4]
        return self._make_filterobjects_from_vectors(list(ascending), list(ascending), list(ascending))

    def get_second_object_inverted(self):
        """Five objects whose second value runs opposite (4..0) to the first and third (0..4)."""
        ascending = [0, 1, 2, 3, 4]
        descending = [4, 3, 2, 1, 0]
        return self._make_filterobjects_from_vectors(list(ascending), descending, list(ascending))

    def _make_filterobjects_from_vectors(self, first_values, second_values, third_values):
        """Create one FilterObject per index of first_values from the three parallel vectors."""
        return [
            FilterObject(first_values[pos], second_values[pos], third_values[pos])
            for pos in range(len(first_values))
        ]


class FilterObject():
    """Plain container with three values and one getter method per value."""
    def __init__(self, first_value, second_value, third_value):
        self.first_value, self.second_value, self.third_value = first_value, second_value, third_value

    def get_first_value(self):
        """Return the first stored value."""
        return self.first_value

    def get_second_value(self):
        """Return the second stored value."""
        return self.second_value

    def get_third_value(self):
        """Return the third stored value."""
        return self.third_value

def get_filterconfig_list(quantile):
    """Return one FilterConfig per FilterObject getter, all using the given quantile and False as third argument."""
    getter_names = ("get_first_value", "get_second_value", "get_third_value")
    return [aqoutlier.FilterConfig(getter_name, quantile, False) for getter_name in getter_names]


def check_allsame(allsame_objects):
    """Check the surviving first values for quantiles 0.6, 1 and 0.1 on identical-valued objects."""
    # (quantile, first values expected to survive the filter)
    cases = [
        (0.6, [4, 3, 2]),
        (1, [4, 3, 2, 1, 0]),
        (0.1, [4]),
    ]
    for quantile, expected_values in cases:
        quantile_filter = aqoutlier.QuantileFilterer(allsame_objects, filterconfigs= get_filterconfig_list(quantile))
        remaining = quantile_filter.get_filtered_list_of_objects()
        assert sorted([obj.first_value for obj in remaining]) == sorted(expected_values)


def check_inverted(inverted_objects):
    """Check the surviving first values for quantiles 0.79, 1 and 0.39 when the second value is inverted."""
    # (quantile, first values expected to survive the filter)
    cases = [
        (0.79, [3, 2, 1]),
        (1, [4, 3, 2, 1, 0]),
        (0.39, []),
    ]
    for quantile, expected_values in cases:
        quantile_filter = aqoutlier.QuantileFilterer(inverted_objects, filterconfigs= get_filterconfig_list(quantile))
        remaining = quantile_filter.get_filtered_list_of_objects()
        assert sorted([obj.first_value for obj in remaining]) == sorted(expected_values)



test_object_filtering()