Merge 18d3f18 into 47d53b1

GiulioRossetti · Sep 12, 2019 · 620022a · 620022a
2 parents 47d53b1 + 18d3f18
commit 620022a
Show file tree

Hide file tree

Showing 11 changed files with 181 additions and 16 deletions.
diff --git a/cdlib/__init__.py b/cdlib/__init__.py
@@ -1,3 +1,4 @@
 from cdlib.classes.node_clustering import NodeClustering
 from cdlib.classes.edge_clustering import EdgeClustering
 from cdlib.classes.fuzzy_node_clustering import FuzzyNodeClustering
+from cdlib.classes.attr_node_clustering import AttrNodeClustering
diff --git a/cdlib/algorithms/__init__.py b/cdlib/algorithms/__init__.py
@@ -1,3 +1,4 @@
 from .edge_clustering import *
 from .crisp_partition import *
 from .overlapping_partition import *
+from .attribute_clustering import *
diff --git a/cdlib/algorithms/attribute_clustering.py b/cdlib/algorithms/attribute_clustering.py
@@ -0,0 +1,63 @@
+try:
+    import igraph as ig
+except ModuleNotFoundError:
+        ig = None
+
+import Eva
+
+from collections import defaultdict
+from cdlib import AttrNodeClustering
+
+import networkx as nx
+
+from cdlib.utils import convert_graph_formats
+
+__all__ = ['eva']
+
+def eva(g, labels, weight='weight', resolution=1., randomize=False, alpha=0.5):
+    """
+       The Eva algorithm extends the Louvain approach in order to deal with the attributes of the nodes (aka Louvain Extended to Vertex Attributes).
+       It optimizes - combining them linearly - two quality functions, a structural and a clustering one, namely the modularity and the purity.
+       A parameter alpha tunes the importance of the two functions: a high value of alpha favors the clustering criterion instead of the structural one.
+
+       :param g: a networkx/igraph object
+       :param labels: dictionary mapping each node to a dictionary of its attributes (attribute name -> value); it is set on the graph as node attributes
+       :param weight: str, optional the key in graph to use as weight. Default to 'weight'
+       :param resolution: double, optional  Will change the size of the communities, default to 1.
+       :param randomize:  boolean, optional  Will randomize the node evaluation order and the community evaluation  order to get different partitions at each call, default False
+       :param alpha: a value assumed in [0,1] tuning the importance of modularity and purity criteria
+       :return: AttrNodeClustering object
+
+       :Example:
+
+        >>> from cdlib.algorithms import eva
+        >>> import networkx as nx
+        >>> import random
+        >>> l1 = ['A', 'B', 'C', 'D']
+        >>> l2 = ["E", "F", "G"]
+        >>> g = nx.barabasi_albert_graph(100, 5)
+        >>> labels = dict()
+        >>> for node in g.nodes():
+        ...     labels[node] = {"l1": random.choice(l1), "l2": random.choice(l2)}
+        >>> communities = eva(g, labels, alpha=0.8)
+
+       :References:
+
+      1. #####
+
+       .. note:: Reference implementation: https://github.com/GiulioRossetti/Eva/tree/master/Eva
+       """
+
+    g = convert_graph_formats(g, nx.Graph)
+    # The labels are attached to the graph itself: the Eva call below receives only g.
+    nx.set_node_attributes(g, labels)
+
+    coms, coms_labels = Eva.eva_best_partition(g, weight=weight, resolution=resolution, randomize=randomize, alpha=alpha)
+
+    # Invert the node -> community mapping into one list of nodes per community
+    coms_to_node = defaultdict(list)
+    for n, c in coms.items():
+        coms_to_node[c].append(n)
+
+    return AttrNodeClustering(list(coms_to_node.values()), g, coms_labels, "Eva",
+                              method_parameters={"weight": weight, "resolution": resolution, "randomize": randomize, "alpha": alpha})
diff --git a/cdlib/classes/__init__.py b/cdlib/classes/__init__.py
@@ -1,3 +1,4 @@
 from .node_clustering import NodeClustering
 from .edge_clustering import EdgeClustering
 from .fuzzy_node_clustering import FuzzyNodeClustering
+from .attr_node_clustering import AttrNodeClustering
diff --git a/cdlib/classes/attr_node_clustering.py b/cdlib/classes/attr_node_clustering.py
@@ -0,0 +1,26 @@
+from cdlib.classes.node_clustering import NodeClustering
+from cdlib import evaluation
+
+
+class AttrNodeClustering(NodeClustering):
+
+    """Attribute Node Communities representation.
+
+      :param communities: list of communities
+      :param graph: a networkx/igraph object
+      :param coms_labels: dictionary specifying for each community the frequency of the attribute values
+      :param method_name: community discovery algorithm name
+      :param method_parameters: configuration for the community discovery algorithm used
+      :param overlap: boolean, whether the partition is overlapping or not
+      """
+
+    def __init__(self, communities, graph, coms_labels, method_name, method_parameters=None, overlap=False):
+        super().__init__(communities, graph, method_name, method_parameters, overlap)
+        self.coms_labels = coms_labels
+
+    def purity(self):
+        """Purity is the product of the frequencies of the most frequent labels carried by the nodes within the communities
+        :return: FitnessResult object
+        """
+        # evaluation.purity takes the clustering object and reads its coms_labels attribute itself
+        return evaluation.purity(self)
diff --git a/cdlib/classes/node_clustering.py b/cdlib/classes/node_clustering.py
@@ -297,8 +297,9 @@ def avg_odf(self, **kwargs):
 
         >>> from cdlib.algorithms import louvain
         >>> g = nx.karate_club_graph()
-        >>> communities = louvain(g)
-        >>> mod = communities.avg_odf()
+        >>>
+        >>> communities = louvain(g)
+        >>> mod = communities.avg_odf()
 
         """
         if self.__check_graph():

diff --git a/cdlib/evaluation/fitness.py b/cdlib/evaluation/fitness.py
@@ -5,13 +5,13 @@
 import numpy as np
 import scipy
 from cdlib.evaluation.internal.link_modularity import cal_modularity
+import Eva
 
 __all__ = ["FitnessResult", "link_modularity", "normalized_cut", "internal_edge_density", "average_internal_degree",
            "fraction_over_median_degree", "expansion", "cut_ratio", "edges_inside", "flake_odf", "avg_odf", "max_odf",
            "triangle_participation_ratio", "modularity_density", "z_modularity", "erdos_renyi_modularity",
            "newman_girvan_modularity", "significance", "surprise", "conductance", "size", "avg_embeddedness",
-           "scaled_density", "avg_distance", "hub_dominance", "avg_transitivity"]
-
+           "scaled_density", "avg_distance", "hub_dominance", "avg_transitivity", "purity"]
 
 # FitnessResult = namedtuple('FitnessResult', ['min', 'max', 'mean', 'std'])
 FitnessResult = namedtuple('FitnessResult', 'min max score std')
@@ -84,7 +84,7 @@ def scaled_density(graph, communities, **kwargs):
     """
 
     return __quality_indexes(graph, communities,
-                             lambda graph, coms: nx.density(nx.subgraph(graph, coms))/ nx.density(graph), **kwargs)
+                             lambda graph, coms: nx.density(nx.subgraph(graph, coms)) / nx.density(graph), **kwargs)
 
 
 def avg_distance(graph, communities, **kwargs):
@@ -131,7 +131,7 @@ def hub_dominance(graph, communities, **kwargs):
 
     return __quality_indexes(graph, communities,
                              lambda graph, coms: max([x[1] for x in
-                                                      list(nx.degree(nx.subgraph(graph, coms)))])/(len(coms) - 1),
+                                                      list(nx.degree(nx.subgraph(graph, coms)))]) / (len(coms) - 1),
                              **kwargs)
 
 
@@ -382,7 +382,6 @@ def edges_inside(graph, community, **kwargs):
     1. Radicchi, F., Castellano, C., Cecconi, F., Loreto, V., & Parisi, D. (2004). Defining and identifying communities in networks. Proceedings of the National Academy of Sciences, 101(9), 2658-2663.
     """
 
-
     return __quality_indexes(graph, community, pq.PartitionQuality.edges_inside, **kwargs)
 
 
@@ -622,7 +621,7 @@ def erdos_renyi_modularity(graph, communities, **kwargs):
         c = nx.subgraph(graph, community)
         mc = c.number_of_edges()
         nc = c.number_of_nodes()
-        q += mc - (m*nc*(nc - 1)) / (n*(n-1))
+        q += mc - (m * nc * (nc - 1)) / (n * (n - 1))
 
     return FitnessResult(score=(1 / m) * q)
 
@@ -710,8 +709,8 @@ def z_modularity(graph, communities, **kwargs):
         for node in c:
             dc += c.degree(node)
 
-        mmc += (mc/m)
-        dc2m += (dc/(2*m))**2
+        mmc += (mc / m)
+        dc2m += (dc / (2 * m)) ** 2
 
     res = 0
     try:
@@ -759,10 +758,10 @@ def surprise(graph, communities, **kwargs):
         q += mc
         qa += scipy.special.comb(nc, 2, exact=True)
     try:
-        q = q/m
-        qa = qa/scipy.special.comb(n, 2, exact=True)
+        q = q / m
+        qa = qa / scipy.special.comb(n, 2, exact=True)
 
-        sp = m*(q*np.log(q/qa) + (1-q)*np.log2((1-q)/(1-qa)))
+        sp = m * (q * np.log(q / qa) + (1 - q) * np.log2((1 - q) / (1 - qa)))
     except ZeroDivisionError:
         pass
 
@@ -792,7 +791,7 @@ def significance(graph, communities, **kwargs):
     m = graph.number_of_edges()
 
     binom = scipy.special.comb(m, 2, exact=True)
-    p = m/binom
+    p = m / binom
 
     q = 0
 
@@ -805,7 +804,35 @@ def significance(graph, communities, **kwargs):
             binom_c = scipy.special.comb(nc, 2, exact=True)
             pc = mc / binom_c
 
-            q += binom_c * (pc * np.log(pc/p) + (1-pc)*np.log((1-pc)/(1-p)))
+            q += binom_c * (pc * np.log(pc / p) + (1 - pc) * np.log((1 - pc) / (1 - p)))
         except ZeroDivisionError:
             pass
     return FitnessResult(score=q)
+
+
+def purity(communities):
+    """Purity is the product of the frequencies of the most frequent labels carried by the nodes within the communities
+
+        :param communities: AttrNodeClustering object
+        :return: FitnessResult object
+
+        Example:
+
+        >>> from cdlib.algorithms import eva
+        >>> from cdlib import evaluation
+        >>> import networkx as nx
+        >>> import random
+        >>> l1 = ['A', 'B', 'C', 'D']
+        >>> l2 = ["E", "F", "G"]
+        >>> g = nx.barabasi_albert_graph(100, 5)
+        >>> labels = dict()
+        >>> for node in g.nodes():
+        ...     labels[node] = {"l1": random.choice(l1), "l2": random.choice(l2)}
+        >>> communities = eva(g, labels, alpha=0.5)
+        >>> pur = evaluation.purity(communities)
+
+        :References:
+
+        1. ######
+        """
+    return FitnessResult(score=Eva.purity(communities.coms_labels))
diff --git a/cdlib/test/__init__.py b/cdlib/test/__init__.py
@@ -7,3 +7,4 @@
 from .test_partitions_comparisons import *
 from .test_utils import *
 from .test_viz_network import *
+from .test_attributeclustering import *
diff --git a/cdlib/test/test_attributeclustering.py b/cdlib/test/test_attributeclustering.py
@@ -0,0 +1,23 @@
+import unittest
+from cdlib import algorithms
+import networkx as nx
+import random
+
+class AttrCommunityDiscoveryTests(unittest.TestCase):
+
+    def test_eva(self):
+        # eva must return its communities as a list of lists of (integer) node ids
+        l1 = ['one', 'two', 'three', 'four']
+        l2 = ["A", "B", "C"]
+        g = nx.barabasi_albert_graph(100, 5)
+        labels = dict()
+
+        for node in g.nodes():
+            labels[node] = {"l1": random.choice(l1), "l2": random.choice(l2)}
+
+        coms = algorithms.eva(g, labels, alpha=0.5)
+
+        self.assertIsInstance(coms.communities, list)
+        if coms.communities:
+            self.assertIsInstance(coms.communities[0], list)
+            self.assertIsInstance(coms.communities[0][0], int)
diff --git a/cdlib/test/test_fitness_functions.py b/cdlib/test/test_fitness_functions.py
@@ -1,10 +1,12 @@
 import unittest
-from cdlib.algorithms import louvain
+from cdlib.algorithms import louvain, eva
 import networkx as nx
 import numpy as np
+import random
 from cdlib import evaluation
 
 
+
 class FitnessFunctionsTests(unittest.TestCase):
 
     def test_link_modularity(self):
@@ -66,3 +68,21 @@ def test_pquality_indexes(self):
         for idx in indexes:
             res = idx(g, communities)
             self.assertIsInstance(res, evaluation.FitnessResult)
+
+    def test_purity(self):
+        # The purity of an Eva clustering must be a FitnessResult whose score lies in [0, 1]
+        first_values = ['one', 'two', 'three', 'four']
+        second_values = ["A", "B", "C"]
+        g_attr = nx.barabasi_albert_graph(100, 5)
+
+        labels = {
+            node: {"l1": random.choice(first_values), "l2": random.choice(second_values)}
+            for node in g_attr.nodes()
+        }
+
+        coms = eva(g_attr, labels, alpha=0.8)
+        pur = evaluation.purity(coms)
+
+        self.assertIsInstance(pur, evaluation.FitnessResult)
+        self.assertGreaterEqual(pur.score, 0)
+        self.assertLessEqual(pur.score, 1)
diff --git a/requirements.txt b/requirements.txt
@@ -16,3 +16,4 @@ pulp==1.6.*
 pquality==0.0.7
 seaborn==0.9.*
 pandas==0.25.*
+eva_lcd