diff --git a/cdlib/__init__.py b/cdlib/__init__.py
index bf589885..4f5b8430 100644
--- a/cdlib/__init__.py
+++ b/cdlib/__init__.py
@@ -1,3 +1,4 @@
 from cdlib.classes.node_clustering import NodeClustering
 from cdlib.classes.edge_clustering import EdgeClustering
 from cdlib.classes.fuzzy_node_clustering import FuzzyNodeClustering
+from cdlib.classes.attr_node_clustering import AttrNodeClustering
diff --git a/cdlib/algorithms/__init__.py b/cdlib/algorithms/__init__.py
index 451496a6..e1248fdb 100644
--- a/cdlib/algorithms/__init__.py
+++ b/cdlib/algorithms/__init__.py
@@ -1,3 +1,4 @@
 from .edge_clustering import *
 from .crisp_partition import *
 from .overlapping_partition import *
+from .attribute_clustering import *
diff --git a/cdlib/algorithms/attribute_clustering.py b/cdlib/algorithms/attribute_clustering.py
new file mode 100644
index 00000000..a462295e
--- /dev/null
+++ b/cdlib/algorithms/attribute_clustering.py
@@ -0,0 +1,63 @@
+try:
+    import igraph as ig
+except ModuleNotFoundError:
+    ig = None
+
+import Eva
+
+from collections import defaultdict
+from cdlib import AttrNodeClustering
+
+import networkx as nx
+
+from cdlib.utils import convert_graph_formats
+
+__all__ = ['eva']
+
+
+def eva(g, labels, weight='weight', resolution=1., randomize=False, alpha=0.5):
+    """
+    The Eva algorithm (Louvain Extended to Vertex Attributes) extends the Louvain approach to handle node attributes.
+    It optimizes a linear combination of two quality functions, a structural one and a clustering one: modularity and purity, respectively.
+    The parameter alpha tunes the relative importance of the two criteria: a high value of alpha favors the clustering criterion over the structural one.
+
+    :param g: a networkx/igraph object
+    :param labels: dictionary mapping each node to a dictionary of attribute name/value pairs
+    :param weight: str, optional the key in graph to use as weight. Default to 'weight'
+    :param resolution: double, optional Will change the size of the communities, default to 1.
+    :param randomize: boolean, optional Will randomize the node evaluation order and the community evaluation order to get different partitions at each call, default False
+    :param alpha: a value in [0, 1] tuning the importance of modularity and purity criteria
+    :return: AttrNodeClustering object
+
+    :Example:
+
+    >>> from cdlib.algorithms import eva
+    >>> import networkx as nx
+    >>> import random
+    >>> l1 = ['A', 'B', 'C', 'D']
+    >>> l2 = ["E", "F", "G"]
+    >>> g = nx.barabasi_albert_graph(100, 5)
+    >>> labels = dict()
+    >>> for node in g.nodes():
+    >>>     labels[node] = {"l1": random.choice(l1), "l2": random.choice(l2)}
+    >>> communities = eva(g, labels, alpha=0.8)
+
+    :References:
+
+    1. #####
+
+    .. note:: Reference implementation: https://github.com/GiulioRossetti/Eva/tree/master/Eva
+    """
+
+    g = convert_graph_formats(g, nx.Graph)
+    nx.set_node_attributes(g, labels)
+
+    coms, coms_labels = Eva.eva_best_partition(g, weight=weight, resolution=resolution, randomize=randomize, alpha=alpha)
+
+    # Reshape the result: group node ids by community id
+    coms_to_node = defaultdict(list)
+    for n, c in coms.items():
+        coms_to_node[c].append(n)
+
+    coms_eva = [list(c) for c in coms_to_node.values()]
+    return AttrNodeClustering(coms_eva, g, coms_labels, "Eva",
+                              method_parameters={"weight": weight, "resolution": resolution, "randomize": randomize, "alpha": alpha})
\ No newline at end of file
diff --git a/cdlib/classes/__init__.py b/cdlib/classes/__init__.py
index fba7af98..596dc60a 100644
--- a/cdlib/classes/__init__.py
+++ b/cdlib/classes/__init__.py
@@ -1,3 +1,4 @@
 from .node_clustering import NodeClustering
 from .edge_clustering import EdgeClustering
 from .fuzzy_node_clustering import FuzzyNodeClustering
+from .attr_node_clustering import AttrNodeClustering
diff --git a/cdlib/classes/attr_node_clustering.py b/cdlib/classes/attr_node_clustering.py
new file mode 100644
index 00000000..8f05cb5a
--- /dev/null
+++ b/cdlib/classes/attr_node_clustering.py
@@ -0,0 +1,26 @@
+from cdlib.classes.node_clustering import NodeClustering
+from cdlib import evaluation
+
+
+class AttrNodeClustering(NodeClustering):
+
+    """Attribute Node Communities representation.
+
+    :param communities: list of communities
+    :param graph: a networkx/igraph object
+    :param coms_labels: dictionary specifying, for each community, the frequency of each attribute value
+    :param method_name: community discovery algorithm name
+    :param method_parameters: configuration for the community discovery algorithm used
+    :param overlap: boolean, whether the partition is overlapping or not
+    """
+
+    def __init__(self, communities, graph, coms_labels, method_name, method_parameters=None, overlap=False):
+        super().__init__(communities, graph, method_name, method_parameters, overlap)
+        self.coms_labels = coms_labels
+
+    def purity(self):
+        """Purity is the product of the frequencies of the most frequent labels carried by the nodes within the communities.
+
+        :return: FitnessResult object
+        """
+        return evaluation.purity(self)
\ No newline at end of file
diff --git a/cdlib/classes/node_clustering.py b/cdlib/classes/node_clustering.py
index 6abd0d1a..ad403582 100644
--- a/cdlib/classes/node_clustering.py
+++ b/cdlib/classes/node_clustering.py
@@ -297,8 +297,9 @@ def avg_odf(self, **kwargs):
 
         >>> from cdlib.algorithms import louvain
         >>> g = nx.karate_club_graph()
-        >>> communities = louvain(g)
-        >>> mod = communities.avg_odf()
+        >>> labels = {n: {"club": g.nodes[n]["club"]} for n in g.nodes()}
+        >>> communities = eva(g, labels, alpha=0.8)
+        >>> pur = communities.purity()
         """
 
         if self.__check_graph():
diff --git a/cdlib/evaluation/fitness.py b/cdlib/evaluation/fitness.py
index 176c5178..493c6c04 100644
--- a/cdlib/evaluation/fitness.py
+++ b/cdlib/evaluation/fitness.py
@@ -5,13 +5,13 @@
 import numpy as np
 import scipy
 from cdlib.evaluation.internal.link_modularity import cal_modularity
+import Eva
 
 __all__ = ["FitnessResult", "link_modularity", "normalized_cut", "internal_edge_density", "average_internal_degree",
            "fraction_over_median_degree", "expansion", "cut_ratio", "edges_inside", "flake_odf", "avg_odf", "max_odf",
            "triangle_participation_ratio", "modularity_density", "z_modularity", "erdos_renyi_modularity",
            "newman_girvan_modularity", "significance", "surprise", "conductance", "size", "avg_embeddedness",
-           "scaled_density", "avg_distance", "hub_dominance", "avg_transitivity"]
-
+           "scaled_density", "avg_distance", "hub_dominance", "avg_transitivity", "purity"]
 
 # FitnessResult = namedtuple('FitnessResult', ['min', 'max', 'mean', 'std'])
 FitnessResult = namedtuple('FitnessResult', 'min max score std')
@@ -84,7 +84,7 @@ def scaled_density(graph, communities, **kwargs):
     """
 
     return __quality_indexes(graph, communities,
-                             lambda graph, coms: nx.density(nx.subgraph(graph, coms))/ nx.density(graph), **kwargs)
+                             lambda graph, coms: nx.density(nx.subgraph(graph, coms)) / nx.density(graph), **kwargs)
 
 
 def avg_distance(graph, communities, **kwargs):
@@ -131,7 +131,7 @@ def hub_dominance(graph, communities, **kwargs):
 
     return __quality_indexes(graph, communities,
                              lambda graph, coms: max([x[1] for x in
-                                                      list(nx.degree(nx.subgraph(graph, coms)))])/(len(coms) - 1),
+                                                      list(nx.degree(nx.subgraph(graph, coms)))]) / (len(coms) - 1),
                              **kwargs)
 
 
@@ -382,7 +382,6 @@ def edges_inside(graph, community, **kwargs):
     1. Radicchi, F., Castellano, C., Cecconi, F., Loreto, V., & Parisi, D. (2004). Defining and identifying communities in networks. Proceedings of the National Academy of Sciences, 101(9), 2658-2663.
 
     """
-
     return __quality_indexes(graph, community, pq.PartitionQuality.edges_inside, **kwargs)
 
 
@@ -622,7 +621,7 @@ def erdos_renyi_modularity(graph, communities, **kwargs):
         c = nx.subgraph(graph, community)
         mc = c.number_of_edges()
         nc = c.number_of_nodes()
-        q += mc - (m*nc*(nc - 1)) / (n*(n-1))
+        q += mc - (m * nc * (nc - 1)) / (n * (n - 1))
 
     return FitnessResult(score=(1 / m) * q)
 
@@ -710,8 +709,8 @@ def z_modularity(graph, communities, **kwargs):
         for node in c:
             dc += c.degree(node)
 
-        mmc += (mc/m)
-        dc2m += (dc/(2*m))**2
+        mmc += (mc / m)
+        dc2m += (dc / (2 * m)) ** 2
 
     res = 0
     try:
@@ -759,10 +758,10 @@ def surprise(graph, communities, **kwargs):
         q += mc
         qa += scipy.special.comb(nc, 2, exact=True)
     try:
-        q = q/m
-        qa = qa/scipy.special.comb(n, 2, exact=True)
+        q = q / m
+        qa = qa / scipy.special.comb(n, 2, exact=True)
 
-        sp = m*(q*np.log(q/qa) + (1-q)*np.log2((1-q)/(1-qa)))
+        sp = m * (q * np.log(q / qa) + (1 - q) * np.log2((1 - q) / (1 - qa)))
     except ZeroDivisionError:
         pass
 
@@ -792,7 +791,7 @@ def significance(graph, communities, **kwargs):
     m = graph.number_of_edges()
 
     binom = scipy.special.comb(m, 2, exact=True)
-    p = m/binom
+    p = m / binom
 
     q = 0
 
@@ -805,7 +804,35 @@
             binom_c = scipy.special.comb(nc, 2, exact=True)
             pc = mc / binom_c
-            q += binom_c * (pc * np.log(pc/p) + (1-pc)*np.log((1-pc)/(1-p)))
+            q += binom_c * (pc * np.log(pc / p) + (1 - pc) * np.log((1 - pc) / (1 - p)))
         except ZeroDivisionError:
             pass
 
     return FitnessResult(score=q)
+
+
+def purity(communities):
+    """Purity is the product of the frequencies of the most frequent labels carried by the nodes within the communities.
+
+    :param communities: AttrNodeClustering object
+    :return: FitnessResult object
+
+    :Example:
+
+    >>> from cdlib.algorithms import eva
+    >>> from cdlib import evaluation
+    >>> import networkx as nx
+    >>> import random
+    >>> l1 = ['A', 'B', 'C', 'D']
+    >>> l2 = ["E", "F", "G"]
+    >>> g = nx.barabasi_albert_graph(100, 5)
+    >>> labels = dict()
+    >>> for node in g.nodes():
+    >>>     labels[node] = {"l1": random.choice(l1), "l2": random.choice(l2)}
+    >>> communities = eva(g, labels, alpha=0.5)
+    >>> pur = evaluation.purity(communities)
+
+    :References:
+
+    1. ######
+    """
+    pur = Eva.purity(communities.coms_labels)
+    return FitnessResult(score=pur)
diff --git a/cdlib/test/__init__.py b/cdlib/test/__init__.py
index e35b876a..3dc8122c 100644
--- a/cdlib/test/__init__.py
+++ b/cdlib/test/__init__.py
@@ -7,3 +7,4 @@
 from .test_partitions_comparisons import *
 from .test_utils import *
 from .test_viz_network import *
+from .test_attributeclustering import *
diff --git a/cdlib/test/test_attributeclustering.py b/cdlib/test/test_attributeclustering.py
new file mode 100644
index 00000000..fa247201
--- /dev/null
+++ b/cdlib/test/test_attributeclustering.py
@@ -0,0 +1,23 @@
+import unittest
+from cdlib import algorithms
+import networkx as nx
+import random
+
+class AttrCommunityDiscoveryTests(unittest.TestCase):
+
+    def test_eva(self):
+
+        l1 = ['one', 'two', 'three', 'four']
+        l2 = ["A", "B", "C"]
+        g = nx.barabasi_albert_graph(100, 5)
+        labels = dict()
+
+        for node in g.nodes():
+            labels[node] = {"l1": random.choice(l1), "l2": random.choice(l2)}
+
+        coms = algorithms.eva(g, labels, alpha=0.5)
+
+        self.assertEqual(type(coms.communities), list)
+        if len(coms.communities) > 0:
+            self.assertEqual(type(coms.communities[0]), list)
+            self.assertEqual(type(coms.communities[0][0]), int)
\ No newline at end of file
diff --git a/cdlib/test/test_fitness_functions.py b/cdlib/test/test_fitness_functions.py
index 6eab8490..4af96040 100644
--- a/cdlib/test/test_fitness_functions.py
+++ b/cdlib/test/test_fitness_functions.py
@@ -1,10 +1,12 @@
 import unittest
-from cdlib.algorithms import louvain
+from cdlib.algorithms import louvain, eva
 import networkx as nx
 import numpy as np
+import random
 
 from cdlib import evaluation
 
+
 class FitnessFunctionsTests(unittest.TestCase):
 
     def test_link_modularity(self):
@@ -66,3 +68,21 @@ def test_pquality_indexes(self):
         for idx in indexes:
             res = idx(g, communities)
             self.assertIsInstance(res, evaluation.FitnessResult)
+
+    def test_purity(self):
+
+        l1 = ['one', 'two', 'three', 'four']
+        l2 = ["A", "B", "C"]
+        g_attr = nx.barabasi_albert_graph(100, 5)
+        labels = dict()
+
+        for node in g_attr.nodes():
+            labels[node] = {"l1": random.choice(l1), "l2": random.choice(l2)}
+
+        coms = eva(g_attr, labels, alpha=0.8)
+
+        pur = evaluation.purity(coms)
+
+        self.assertIsInstance(pur, evaluation.FitnessResult)
+        self.assertGreaterEqual(pur.score, 0)
+        self.assertLessEqual(pur.score, 1)
diff --git a/requirements.txt b/requirements.txt
index de84c4db..f5890954 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,3 +16,4 @@ pulp==1.6.*
 pquality==0.0.7
 seaborn==0.9.*
 pandas==0.25.*
+eva_lcd
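
Usage sketch (not part of the patch): a minimal example of how the additions above are meant to fit together, assuming the Eva dependency (eva_lcd) is installed. The toy graph, the attribute names l1/l2, and the alpha value are illustrative only.

    import random

    import networkx as nx

    from cdlib import algorithms, evaluation

    # Build a toy attributed graph: each node carries two categorical labels.
    g = nx.barabasi_albert_graph(100, 5)
    labels = {n: {"l1": random.choice(["A", "B", "C", "D"]),
                  "l2": random.choice(["E", "F", "G"])}
              for n in g.nodes()}

    # Attribute-aware community discovery introduced by this patch;
    # alpha balances modularity (structure) against purity (labels).
    coms = algorithms.eva(g, labels, alpha=0.8)  # returns an AttrNodeClustering

    # Purity can be read either through the evaluation module ...
    print(evaluation.purity(coms).score)
    # ... or directly from the clustering object.
    print(coms.purity().score)

Both calls report the same FitnessResult score, since AttrNodeClustering.purity() delegates to evaluation.purity().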