In [1]:
# default_exp outlier_scoring


# Outlier scoring

The default approach of AlphaQuant is to collect the most consistent set of peptides for protein quantification. Potentially, however, there can be valuable information in non-consistent, i.e. outlier peptides, which can be indicative of proteoforms. In this module, we perform a systematic evaluation of outlying groups of peptides in order to determine promising candidates for further investigation.

In particular, we:

1) Load the clustered tree data and retrieve all proteins with their clustered child peptides
2) Compare the fold changes of the main cluster with the fold changes of the outlier-clusters
3) Filter by both the size of the fold-change difference between the clusters and the quality score
4) Return a ranked list of candidates


In [2]:
#export
import alphaquant.diffquant_utils as aqutils
import numpy as np
import copy
import anytree

class OutlierHandler():
    """Collects cluster-difference summaries for all proteins of one condition pair.

    The protein nodes are the direct children of the tree returned by
    ``aqutils.read_condpair_tree`` for the given results directory and conditions.
    """

    def __init__(self, results_dir, cond1, cond2):
        """Load the protein nodes of the condition pair ``cond1`` vs. ``cond2`` from ``results_dir``."""
        self._protnodes = self.__load_protein_nodes__(results_dir, cond1, cond2)

    def __load_protein_nodes__(self, results_dir, cond1, cond2):
        """Read the condition-pair tree and return its direct children."""
        return aqutils.read_condpair_tree(results_folder=results_dir, cond1=cond1, cond2 = cond2).children

    def get_diffclust_overview_list(self):
        """Compare the main cluster with every other cluster, protein by protein.

        Returns:
            list: ClusterDiffInfo objects, one per (main cluster, outlier cluster) pair.
            Each object contains the relevant information about two differing clusters
            (fold-change difference, peptides) and can "reduce" protein nodes to the
            cluster-relevant peptides. Proteins with a single cluster contribute nothing.
        """
        diffclusts = []
        for protnode in self._protnodes:
            diffclusts.extend(ProtnodeClusterChecker(protnode).get_diffclusts())
        return diffclusts


class ProtnodeClusterChecker():
    """Pairs the main peptide cluster (cluster 0) of a protein node with each of its other clusters.

    The protein node must expose ``name``, ``num_clusters`` and ``children``; every child
    (peptide node) must expose ``cluster``.
    """

    def __init__(self, protnode):
        self._protnode = protnode
        self._num_clusters = protnode.num_clusters

    def get_diffclusts(self):
        """Return one ClusterDiffInfo per non-main cluster, or an empty list if there is only one cluster."""
        if not self.__check_if_multiple_clusters__():
            return []
        return self.__get_clusterdiff_handler_for_each_cluster__()

    def __get_clusterdiff_handler_for_each_cluster__(self):
        """Build the comparison of cluster 0 against clusters 1 .. num_clusters - 1."""
        # The main cluster summary is shared by all comparisons, so it is computed only once.
        mainclust_info = self.__get_cluster_info__(clustnum=0)
        return [
            self.__get_clusterdiff_handler__(clustnum, mainclust_info)
            for clustnum in range(1, self._num_clusters)
        ]

    def __get_clusterdiff_handler__(self, clustnum, mainclust_info):
        """Compare the cluster with number ``clustnum`` against the main cluster."""
        outlier_info = self.__get_cluster_info__(clustnum)
        return ClusterDiffInfo(mainclust_info, outlier_info)

    def __get_cluster_info__(self, clustnum):
        """Summarize the peptides of the cluster with number ``clustnum``."""
        cluster_peptides = self.__get_peptides_of_cluster__(clustnum)
        return ClusterInfo(protein_name=self._protnode.name, peptide_nodes=cluster_peptides)

    def __get_peptides_of_cluster__(self, clustnum):
        """Return the child nodes of the protein whose ``cluster`` equals ``clustnum``."""
        return [x for x in self._protnode.children if x.cluster == clustnum]

    def __check_if_multiple_clusters__(self):
        """Return True if the protein node reports more than one cluster."""
        return self._num_clusters > 1






class ClusterInfo():
    """Summary of one peptide cluster of a protein.

    Stores the protein name, the cluster number, the peptide names, the median
    fold change and the smallest absolute prediction score of the given peptide nodes.
    """

    def __init__(self, protein_name,peptide_nodes):
        # All nodes are expected to share one cluster number; the first element of the
        # de-duplicated set is taken as the cluster number.
        cluster_ids = {node.cluster for node in peptide_nodes}

        self.protein_name = protein_name
        self.cluster_number = list(cluster_ids)[0]
        self.peptide_names = [node.name for node in peptide_nodes]

        fold_changes = np.array([node.fc for node in peptide_nodes])
        self.median_fc = np.median(fold_changes)

        abs_predscores = [abs(node.predscore) for node in peptide_nodes]
        self.minimum_predscore = min(abs_predscores)

class ClusterDiffInfo():
    """Comparison of the main cluster of a protein with one outlier cluster.

    Holds the absolute difference of the median fold changes, a quality score
    (the larger of the two clusters' minimum prediction scores) and the peptide
    names of both clusters.
    """

    def __init__(self, mainclust_info, outlier_info):
        main_peptides = mainclust_info.peptide_names
        outlier_peptides = outlier_info.peptide_names

        self.protein_name = mainclust_info.protein_name
        self.clusterpair_id = f"{mainclust_info.cluster_number}_{outlier_info.cluster_number}"
        self.name = f"{self.protein_name}_{self.clusterpair_id}"
        self.fcdiff = abs(mainclust_info.median_fc - outlier_info.median_fc)
        self.quality_score = max(mainclust_info.minimum_predscore, outlier_info.minimum_predscore)
        self.outlier_peptide_names = outlier_peptides
        self.peptide_names = main_peptides + outlier_peptides

    def get_clusterdiff_protnode(self, protnode):
        """Return a deep copy of ``protnode`` reduced to the peptides of the two clusters.

        The copy additionally carries ``fcdiff``, ``quality_score`` and ``peptide_names``;
        the node passed in is left untouched.
        """
        reduced_protnode = copy.deepcopy(protnode)
        self.__remove_peptides_not_in_cluster__(reduced_protnode)
        self.__add_diffinfos__(reduced_protnode)
        return reduced_protnode

    def __remove_peptides_not_in_cluster__(self, protnode_clusterdiff):
        """Detach every child of the copied protein node that belongs to neither cluster."""
        for child in protnode_clusterdiff.children:
            self.__remove_peptide_if_necessary__(child)

    def __add_diffinfos__(self, protnode):
        """Copy the comparison results onto the given protein node."""
        for attribute in ("fcdiff", "quality_score", "peptide_names"):
            setattr(protnode, attribute, getattr(self, attribute))

    def __remove_peptide_if_necessary__(self, peptide_node):
        """Detach ``peptide_node`` from its parent unless its name is part of the cluster pair."""
        if peptide_node.name in self.peptide_names:
            return
        peptide_node.parent = None




In [3]:
#hide
import anytree
import unittest

def test_that_clusterdiff_infos_have_expected_peptides_and_fold_changes():
    """Check fold-change differences, quality scores and peptide lists on the simulated protein."""
    simulated_protein = ProtNodeCreator().get_simulated_protein_node()
    cluster_diffs = ProtnodeClusterChecker(simulated_protein).get_diffclusts()

    # Reference values, calculated by hand from the simulated fold changes and scores
    main_cluster_peptides = [f"pep{x}" for x in range(5)]
    expected_fcdiffs = [5, 11.5]
    expected_quality_scores = [0.3, 0.4]
    expected_peptides = [
        main_cluster_peptides + [f"pep{x}" for x in range(5, 7)],
        main_cluster_peptides + [f"pep{x}" for x in range(7, 10)],
    ]

    # The order of the cluster comparisons is irrelevant, hence everything is sorted first
    assert sorted(diff.fcdiff for diff in cluster_diffs) == sorted(expected_fcdiffs)
    assert sorted(diff.quality_score for diff in cluster_diffs) == sorted(expected_quality_scores)
    assert sorted(diff.peptide_names for diff in cluster_diffs) == sorted(expected_peptides)
    print("performed test")
    

class ProtNodeCreator():
    """Builds a simulated protein node with ten peptide children in three clusters.

    Peptides 0-4 form cluster 0, peptides 5-6 cluster 1 and peptides 7-9 cluster 2.
    """

    def __init__(self):
        self.names = [f"pep{x}" for x in range(10)]
        self.fold_changes = [-1.5] * 5 + [3.5] * 2 + [10] * 3
        self.clusters = [0] * 5 + [1] * 2 + [2] * 3
        self.predscores = [0.1] * 5 + [0.3] * 2 + [0.4] * 3

    def get_simulated_protein_node(self):
        """Return the protein node "prot1" (type "gene") with all simulated peptides attached."""
        # anytree stores additional keyword arguments as attributes on the node
        protein = anytree.Node(name="prot1", type="gene", fc=1, num_clusters=3)
        self._add_peptide_nodes(protein)
        return protein

    def _add_peptide_nodes(self, protein_node):
        """Attach one peptide node (type "seq") per simulated peptide to ``protein_node``."""
        for idx in range(10):
            anytree.Node(
                self.names[idx],
                parent=protein_node,
                fc=self.fold_changes[idx],
                cluster=self.clusters[idx],
                predscore=self.predscores[idx],
                type="seq",
            )

# Run the test right away so that executing this cell checks the cluster comparison.
test_that_clusterdiff_infos_have_expected_peptides_and_fold_changes()




performed test


## PeptideInfos
We extend the `ProtnodeClusterChecker` for the case that we are interested in single 'outlier' peptides

In [4]:
#export

class OutlierPeptideLoader():
    """Collects the outlier peptides of all protein nodes below a condition-pair tree."""

    def __init__(self, condpair_tree):
        self._condpair_tree = condpair_tree
        self._outlier_peptides = []

    def get_outlier_peptides(self):
        """Return the outlier peptide infos of every child (protein node) of the tree.

        The result is rebuilt on every call, so calling the method repeatedly does not
        duplicate entries.

        Returns:
            list: the objects returned by
            ``ProtnodeClusterCheckerPeptideInfos.get_outlier_peptide_infos`` for all proteins.
        """
        outlier_peptides = []
        for protnode in self._condpair_tree.children:
            # The checker class lives in this module, so it is referenced directly instead of
            # through the module alias that is only imported further down.
            nodechecker = ProtnodeClusterCheckerPeptideInfos(protnode)
            outlier_peptides.extend(nodechecker.get_outlier_peptide_infos())
        self._outlier_peptides = outlier_peptides
        return self._outlier_peptides


class ProtnodeClusterCheckerPeptideInfos(ProtnodeClusterChecker):
    """Extension of ProtnodeClusterChecker that reports the individual outlier peptides."""

    def __init__(self, protnode):
        super().__init__(protnode)
        self._outlier_peptide_infos = []

    def get_outlier_peptide_infos(self):
        """Return one OutlierPeptideInfo per peptide that belongs to a non-main cluster.

        The list is rebuilt on every call, so repeated calls do not accumulate duplicates.
        """
        self._outlier_peptide_infos = []
        for clusterdiffinfo in self.get_diffclusts():
            self._update_outlier_peptide_infos(clusterdiffinfo)
        return self._outlier_peptide_infos

    def _update_outlier_peptide_infos(self, clusterdiffinfo):
        """Append an OutlierPeptideInfo for every outlier peptide of the given cluster comparison."""
        for peptide_node in self._get_outlier_peptide_nodes(clusterdiffinfo):
            self._outlier_peptide_infos.append(OutlierPeptideInfo(peptide_node))

    def _get_outlier_peptide_nodes(self, clusterdiffinfo):
        """Find the nodes whose name is one of the outlier peptide names of the comparison."""
        peptide_names = set(clusterdiffinfo.outlier_peptide_names)
        # maxlevel=2 restricts the search to the protein node itself and its direct children
        return anytree.findall(self._protnode, filter_= lambda x : x.name in peptide_names, maxlevel=2)


class ProteinInfo():
    """Stores the fold change of the protein that a peptide node belongs to."""

    def __init__(self, peptide_node):
        self.protein_fc = self._get_protein_fc(peptide_node)

    def _get_protein_fc(self, peptide_node):
        """Return the ``fc`` attribute of the "gene"-level parent of ``peptide_node``.

        The lookup helper returns a node, so its fold change has to be read explicitly;
        returning the node itself would store a node in ``protein_fc``.
        """
        protnode = aqutils.find_node_parent_at_level(peptide_node, "gene")
        return protnode.fc


class OutlierPeptideInfo(ProteinInfo):
    """Fold change and quality score of one outlier peptide, plus its protein-normalized fold change."""

    def __init__(self, peptide_node):
        # The parent initializer sets self.protein_fc via _get_protein_fc
        super().__init__(peptide_node)
        self.peptide_sequence = peptide_node.name
        self.fc = peptide_node.fc
        self.quality_score = peptide_node.predscore
        self.protnormed_fc = None
        self._calc_protnormed_fc()

    def _get_protein_fc(self, peptide_node):
        """Return the fold change of the "gene"-level parent of ``peptide_node``."""
        return aqutils.find_node_parent_at_level(peptide_node, "gene").fc

    def _calc_protnormed_fc(self):
        """Set ``protnormed_fc`` to the peptide fold change minus the protein fold change."""
        peptide_fc, protein_fc = self.fc, self.protein_fc
        self.protnormed_fc = peptide_fc - protein_fc



In [5]:
#hide
def test_that_clusterdiff_peptides_have_expected_fold_changes():
    """Outlier peptide infos must match the peptides outside cluster 0 of the simulated protein."""
    simulated_protein = ProtNodeCreator().get_simulated_protein_node()
    outlier_infos = ProtnodeClusterCheckerPeptideInfos(simulated_protein).get_outlier_peptide_infos()
    # Every peptide that is not part of the main cluster (cluster 0) is expected as an outlier
    expected_outliers = [pep for pep in simulated_protein.children if pep.cluster != 0]

    assert sorted(info.fc for info in outlier_infos) == sorted(pep.fc for pep in expected_outliers)
    assert sorted(info.peptide_sequence for info in outlier_infos) == sorted(pep.name for pep in expected_outliers)
    print("performed test")

test_that_clusterdiff_peptides_have_expected_fold_changes()

performed test


## Modified peptide loader

Modified peptides are interesting because they can alter fold changes of unmodified peptides. The below classes extract relevant information about modified peptides to allow easier comparison with unmodified peptides. These modified peptides come from different experiments and hence need to be handled differently.

In [6]:
#export
import alphaquant.outlier_scoring as aqoutlier
import anytree


class ModifiedPeptideLoader():
    """Indexes the modified peptides of a condition-pair tree by their peptide sequence.

    Only peptides for which PeptideWithSpecificModification reports the modification
    as found are stored. If several modified peptides share a sequence, the one
    visited last replaces the earlier ones.
    """

    def __init__(self, condpair_tree):
        self.condpair_tree = condpair_tree
        self._pepname2modpep = {}
        self._load_modified_peptides_from_tree()

    def get_modpep_from_sequence(self, peptide_sequence):
        """Return the stored modified peptide for ``peptide_sequence``, or None if there is none."""
        return self._pepname2modpep.get(peptide_sequence)

    def _load_modified_peptides_from_tree(self):
        """Fill the sequence -> modified peptide mapping from the tree."""
        for mod_pep_node in self._get_modified_peptide_nodes():
            self._update_pepname2modpep(mod_pep_node)

    def _get_modified_peptide_nodes(self):
        """Return all nodes of type 'mod_seq' within the searched tree levels."""
        def is_modified_peptide(node):
            return node.type == 'mod_seq'

        # NOTE(review): maxlevel=3 limits the search to three levels counted from
        # condpair_tree itself - confirm that 'mod_seq' nodes sit within that depth.
        return anytree.search.findall(self.condpair_tree, is_modified_peptide, maxlevel=3)

    def _update_pepname2modpep(self, mod_pep_node):
        """Store the node's modified peptide if it carries the specific modification."""
        modified_peptide = PeptideWithSpecificModification(mod_pep_node)
        if not modified_peptide.specific_modification_found:
            return
        self._pepname2modpep[modified_peptide.peptide_sequence] = modified_peptide


class PeptideWithSpecificModification(OutlierPeptideInfo):
    """Modified peptide that is only populated if its name contains a given modification.

    Args:
        node_modpeptide: tree node of the modified peptide; its ``name`` is used as the
            modified sequence.
        specific_modification: substring that has to occur in the modified sequence.

    If the modification is not found, ``peptide_sequence``, ``fc`` and ``quality_score``
    stay None.
    """

    def __init__(self, node_modpeptide, specific_modification = "[Phospho (STY)]"):
        self.modified_sequence = node_modpeptide.name
        self.specific_modification_found = self._check_for_specific_modification(specific_modification)
        self.peptide_sequence = None
        self.fc = None
        self.quality_score = None
        # The parent initializer is not called here, so the attributes it would create are
        # declared explicitly. ComplementedCluster fills them in later; until then they are
        # None instead of missing.
        self.protein_fc = None
        self.protnormed_fc = None
        if self.specific_modification_found:
            self._load_from_modpeptide_node(node_modpeptide)

    def _check_for_specific_modification(self, specific_modification):
        """Return True if ``specific_modification`` occurs in the modified sequence."""
        return specific_modification in self.modified_sequence

    def _load_from_modpeptide_node(self, node_modpeptide):
        """Copy peptide sequence, fold change and prediction score from the node."""
        self.peptide_sequence = self._get_peptide_sequence(node_modpeptide)
        self.fc = node_modpeptide.fc
        self.quality_score = node_modpeptide.predscore

    def _get_peptide_sequence(self, node_modpeptide):
        """Return the name of the 'seq'-level parent of the modified peptide node."""
        pepnode = aqutils.find_node_parent_at_level(node_modpeptide, level='seq')
        return pepnode.name

In [7]:
#hide
class ModPepCreator():
    """Creates a simulated modified-peptide node for the tests."""

    def __init__(self):
        # Stored on the instance (not as a local), so the attribute already exists
        # before get_modpep_node is called.
        self.peptide_sequence = None

    def get_modpep_node(self):
        """Attach a 'mod_seq' node to the first peptide of a simulated protein and return it.

        Also records the name of that peptide in ``self.peptide_sequence``.
        """
        protnode = ProtNodeCreator().get_simulated_protein_node()
        pepnode = protnode.children[0] #just select first peptide
        self.peptide_sequence = pepnode.name
        modpep = anytree.Node(f"{pepnode.name}_{pepnode.name}[Phospho (STY)]",parent=pepnode)
        modpep.fc = 42
        modpep.predscore = -42
        modpep.type = 'mod_seq'
        return modpep


def test_that_modpep_is_handled_correctly():
    """The parsed modified peptide must report the parent peptide name and its own node name."""
    creator = ModPepCreator()
    node = creator.get_modpep_node()
    parsed = PeptideWithSpecificModification(node)

    expected_attributes = {
        "peptide_sequence": creator.peptide_sequence,
        "modified_sequence": node.name,
    }
    for attribute, expected_value in expected_attributes.items():
        assert getattr(parsed, attribute) == expected_value
    print("performed test")


test_that_modpep_is_handled_correctly()

performed test


## Complemented Clusters
Given a suitable dataset, we can compare outlier peptides with modified peptides. A question of interest is, for example, whether we can find matching pairs of unmodified and modified peptides. If we find such pairs, we can compare the direction of regulation and, for example, how it depends on the quality score.

In [8]:
#export
class ComplementedClusterLoader():
    """Pairs outlier peptides with modified peptides that have the same peptide sequence.

    Args:
        outlier_peptide_loader: object whose ``get_outlier_peptides()`` returns the outlier peptides.
        modified_peptide_loader: object whose ``get_modpep_from_sequence(sequence)`` returns
            the matching modified peptide or None.
    """

    def __init__(self, outlier_peptide_loader, modified_peptide_loader):
        self._outlier_peptides = outlier_peptide_loader.get_outlier_peptides()
        self._modified_peptide_loader = modified_peptide_loader
        self._complemented_clusters = []

    def find_complemented_clusters(self):
        """Build a ComplementedCluster for every outlier peptide that has a modified counterpart.

        The result is rebuilt on every call, so repeated calls do not accumulate duplicates.

        Returns:
            list: the complemented clusters (also kept in ``self._complemented_clusters``).
        """
        complemented_clusters = []
        for outlier_peptide in self._outlier_peptides:
            modified_peptide = self._get_modified_peptide(outlier_peptide)
            if modified_peptide is not None:
                complemented_clusters.append(ComplementedCluster(outlier_peptide, modified_peptide))
        self._complemented_clusters = complemented_clusters
        return self._complemented_clusters

    def _get_modified_peptide(self, outlier_peptide):
        """Look up the modified peptide with the same sequence as ``outlier_peptide`` (None if absent)."""
        # The lookup has to go through the stored modified-peptide loader; this class itself
        # has no get_modpep_from_sequence method.
        return self._modified_peptide_loader.get_modpep_from_sequence(outlier_peptide.peptide_sequence)



class ComplementedCluster():
    """Pair of an outlier peptide and a modified peptide.

    On construction the modified peptide receives the protein fold change of the
    outlier peptide and recomputes its protein-normalized fold change, i.e. the
    modified peptide object is changed in place.
    """

    def __init__(self, outlier_peptide, modified_peptide):
        self.outlier_peptide, self.modified_peptide = outlier_peptide, modified_peptide
        self._add_normfc_to_modpep()

    def _add_normfc_to_modpep(self):
        """Transfer the outlier's protein fold change to the modified peptide and renormalize it."""
        modpep = self.modified_peptide
        modpep.protein_fc = self.outlier_peptide.protein_fc
        modpep._calc_protnormed_fc()


In [12]:
#hide

def test_complemented_cluster_retains_properties():
    """A ComplementedCluster must hand back its two peptides with unchanged sequences."""
    outlier_input = load_outlier_peptide()
    creator = ModPepCreator()
    modpep_node = creator.get_modpep_node()
    modpep_input = PeptideWithSpecificModification(modpep_node)

    cluster = ComplementedCluster(outlier_input, modpep_input)
    stored_modpep = cluster.modified_peptide
    stored_outlier = cluster.outlier_peptide

    assert stored_modpep.peptide_sequence == creator.peptide_sequence
    assert stored_modpep.modified_sequence == modpep_node.name
    assert stored_outlier.peptide_sequence == outlier_input.peptide_sequence

    print("performed test")


def load_outlier_peptide():
    """Return the first outlier peptide info of the simulated protein."""
    simulated_protein = ProtNodeCreator().get_simulated_protein_node()
    checker = ProtnodeClusterCheckerPeptideInfos(simulated_protein)
    return checker.get_outlier_peptide_infos()[0]


test_complemented_cluster_retains_properties()

performed test
