# cdlib/algorithms/attribute_clustering.py
#
# Recovered from a whitespace-collapsed git patch.  The same span of the patch
# also re-exports the new names:
#   cdlib/__init__.py            adds  from cdlib.classes.attr_node_clustering import AttrNodeClustering
#   cdlib/algorithms/__init__.py adds  from .attribute_clustering import *
"""Community discovery algorithms that use node attributes as well as topology."""
try:
    import igraph as ig
except ModuleNotFoundError:
    ig = None

import Eva

from collections import defaultdict
from cdlib import AttrNodeClustering

import networkx as nx

from cdlib.utils import convert_graph_formats

from cdlib.algorithms.internal.ILouvain import ML2

__all__ = ['eva', 'ilouvain']


def _group_nodes_by_community(node_to_com):
    """Turn a ``{node: community_id}`` mapping into a list of node lists."""
    coms_to_node = defaultdict(list)
    for node, com in node_to_com.items():
        coms_to_node[com].append(node)
    return [list(nodes) for nodes in coms_to_node.values()]


def eva(g, labels, weight='weight', resolution=1., randomize=False, alpha=0.5):
    """
    The Eva algorithm extends the Louvain approach in order to deal with the attributes of the nodes (aka Louvain Extended to Vertex Attributes).
    It optimizes - combining them linearly - two quality functions, a structural and a clustering one, namely the modularity and the purity.
    A parameter alpha tunes the importance of the two functions: a high value of alpha favors the clustering criterion instead of the structural one.

    :param g: a networkx/igraph object
    :param labels: dictionary mapping each node to a dictionary ``{attribute_name: attribute_value}``; it is attached to the graph with ``networkx.set_node_attributes``
    :param weight: str, optional the key in graph to use as weight. Default to 'weight'
    :param resolution: double, optional Will change the size of the communities, default to 1.
    :param randomize: boolean, optional Will randomize the node evaluation order and the community evaluation order to get different partitions at each call, default False
    :param alpha: a value assumed in [0,1] tuning the importance of modularity and purity criteria
    :return: AttrNodeClustering object

    :Example:

    >>> from cdlib.algorithms import eva
    >>> import networkx as nx
    >>> import random
    >>> l1 = ['A', 'B', 'C', 'D']
    >>> l2 = ["E", "F", "G"]
    >>> g = nx.barabasi_albert_graph(100, 5)
    >>> labels = dict()
    >>> for node in g.nodes():
    ...     labels[node] = {"l1": random.choice(l1), "l2": random.choice(l2)}
    >>> communities = eva(g, labels, alpha=0.8)

    .. note:: Reference implementation: https://github.com/GiulioRossetti/Eva/tree/master/Eva
    """

    g = convert_graph_formats(g, nx.Graph)
    nx.set_node_attributes(g, labels)

    coms, coms_labels = Eva.eva_best_partition(g, weight=weight, resolution=resolution,
                                               randomize=randomize, alpha=alpha)

    coms_eva = _group_nodes_by_community(coms)
    return AttrNodeClustering(coms_eva, g, "Eva", coms_labels,
                              method_parameters={"weight": weight, "resolution": resolution,
                                                 "randomize": randomize, "alpha": alpha})


def ilouvain(g, labels, id=None):
    """
    Run the I-Louvain procedure implemented by ``cdlib.algorithms.internal.ILouvain.ML2``.

    ML2 performs a Louvain-style greedy optimisation of the average of two
    modularity scores: one computed on the graph structure and one computed
    on the numeric node attributes.

    :param g: a networkx/igraph object
    :param labels: dictionary mapping each node to a dictionary ``{attribute_name: numeric_value}``; the values are used in arithmetic (squared differences), so they must be numbers
    :param id: optional dictionary mapping each node of ``g`` to the key under which its attributes are stored in ``labels``. When omitted, every node is mapped to itself. The name is kept for backward compatibility even though it shadows the ``id`` builtin.
    :return: AttrNodeClustering object (without community label frequencies, so ``purity()`` returns None)

    :Example:

    >>> from cdlib.algorithms import ilouvain
    >>> import networkx as nx
    >>> import random
    >>> l1 = [0.1, 0.4, 0.5]
    >>> l2 = [34, 3, 112]
    >>> g = nx.barabasi_albert_graph(100, 5)
    >>> labels = dict()
    >>> for node in g.nodes():
    ...     labels[node] = {"l1": random.choice(l1), "l2": random.choice(l2)}
    >>> communities = ilouvain(g, labels)
    """
    g = convert_graph_formats(g, nx.Graph)
    nx.set_node_attributes(g, labels)

    # The original code required ``id`` and then unconditionally replaced it
    # with the identity mapping; identity is now just the default.
    node_index = {n: n for n in g.nodes()} if id is None else id

    algo = ML2(g, labels, node_index)
    coms = algo.findPartition()

    coms_ilouv = _group_nodes_by_community(coms)
    return AttrNodeClustering(coms_ilouv, g, "ILouvain")
# cdlib/algorithms/internal/ILouvain.py -- recovered from a whitespace-collapsed
# git patch.  This span of the patch also drops the trailing blank line of
# cdlib/algorithms/crisp_partition.py and adds
# ``from .attr_node_clustering import AttrNodeClustering`` to
# cdlib/classes/__init__.py.
"""
This module implements community detection.

``ML2`` runs a Louvain-style optimisation over two "statuses" at once: one
built from the graph structure and one built from numeric node attributes.
"""
from __future__ import division

import os.path

import networkx as nx
import numpy as np

# The original ``__all__`` listed functions that are not defined in this module
# (partition_at_level, modularity, best_partition, ...), which makes
# ``from ILouvain import *`` fail; it now lists what the module really defines.
__all__ = ["ML2", "Status", "loadDataset", "readToyGraph"]
__author__ = """Thomas Aynaud (thomas.aynaud@lip6.fr)"""
# Copyright (C) 2009 by
# Thomas Aynaud
# All rights reserved.
# BSD license.


class ML2:
    """Two-criteria Louvain optimiser (graph structure + node attributes).

    ``statusTab[0]`` tracks the structural status and ``statusTab[1]`` the
    attribute status; both are kept aligned on the same node -> community
    assignment.
    """

    __MIN = 0.000001  # minimum gain of the combined criterion to start another level
    __PASS_MAX = -1  # not referenced in this module
    LOGOPERATIONS = False  # not referenced in this module

    nbVertices = 0

    def __init__(self, graph, attributes, authorIndex):
        """
        :param graph: networkx graph to partition
        :param attributes: dict ``{attribute_key: {attr_id: numeric_value}}``
        :param authorIndex: dict mapping each graph node to its key in ``attributes``
        """
        self.graph = graph
        self.graphBase = graph.copy()
        self.attributes = attributes
        self.nbVertices = len(graph)
        self.statusTab = []
        self.authorIndex = authorIndex

        # Build status structures: [0] structure, [1] attributes
        status = Status()
        status.init(graph)
        self.statusTab.append(status)
        statusA = Status()
        statusA.initAttribStatus(graph, authorIndex, attributes)
        self.statusTab.append(statusA)

        self.status_list = list()

    def critereCombinaison(self):
        """Return the mean of the structural and attribute modularities."""
        return (self.__modularity(self.statusTab[0]) + self.__modularity(self.statusTab[1])) / 2.

    def findPartition(self):
        """Run the multi-level optimisation and return ``{node: community_id}``.

        One local-moving pass is always performed; further levels are run on
        the induced graph until the combined criterion improves by less than
        ``__MIN``.
        """
        giniMatrix = self.calculateGiniMatrixInitial()
        # The first pass computes attribute distances on the fly (giniMatrix=None).
        self.__one_level(giniMatrix=None)
        new_mod = self.critereCombinaison()

        partition, bijection = self.__renumber()

        self.status_list.append(partition)
        mod = new_mod
        self.induced_graph(partition)
        self.statusTab[0].init(self.graph)

        self.statusTab[1].inducedAttribStatusTab(partition, bijection)
        giniMatrix = self.firstInducedGiniMatrix(partition, giniMatrix)

        while True:
            self.__one_level(giniMatrix=giniMatrix)
            new_mod = self.critereCombinaison()
            if new_mod - mod < self.__MIN:
                break
            partition, bijection = self.__renumber()

            self.status_list.append(partition)

            mod = new_mod
            self.induced_graph(partition)
            giniMatrix = self.inducedGiniMatrix(partition, giniMatrix)

            self.statusTab[0].init(self.graph)

            if self.statusTab[1] is not None:
                self.statusTab[1].inducedAttribStatusTab(partition, bijection)

        dendogram = self.status_list[:]

        # Compose the per-level partitions: each level maps the previous
        # level's community ids to new ones.
        partition = dendogram[0].copy()
        for index in range(1, len(dendogram)):
            for node, community in partition.items():
                partition[node] = dendogram[index][community]

        return partition

    def dist(self, v1, v2):
        """Squared Euclidean distance between two sparse attribute dicts.

        An attribute missing from one side counts as 0.
        """
        attrV1 = self.attributes[v1]
        attrV2 = self.attributes[v2]
        distance = 0.
        for attr, val1 in attrV1.items():
            val2 = attrV2.get(attr, 0.)
            distance += (val1 - val2) ** 2
        for attr, val2 in attrV2.items():
            if attr not in attrV1:
                distance += val2 * val2
        return distance

    def distArray(self, v1, v2):
        """Squared Euclidean distance for attributes indexable by 0..len-1."""
        attrV1 = self.attributes[v1]
        attrV2 = self.attributes[v2]
        distance = 0.
        for i in range(len(attrV1)):
            distance += (attrV1[i] - attrV2[i]) ** 2
        return distance

    def firstInducedGiniMatrix(self, partition, giniMatrix):
        """Build the community-level matrix after the first pass.

        NOTE(review): entries are assigned, not accumulated, so each cell keeps
        the value of the last node pair visited, whereas ``inducedGiniMatrix``
        sums -- confirm this asymmetry is intended.
        """
        nbCom = len(set(partition.values()))
        out = np.zeros([nbCom, nbCom])
        for i in partition:
            for j in partition:
                out[partition[i]][partition[j]] = giniMatrix[self.authorIndex[i]][self.authorIndex[j]]
        return out

    def inducedGiniMatrix(self, partition, giniMatrix):
        """Aggregate ``giniMatrix`` by summing entries per community pair."""
        nbCom = len(set(partition.values()))
        out = np.zeros([nbCom, nbCom])
        for i in partition:
            for j in partition:
                out[partition[i]][partition[j]] = out[partition[i]][partition[j]] + giniMatrix[i][j]
        return out

    def calculateGiniMatrixInitial(self):
        """Return ``{key1: {key2: -dist(key1, key2) / nbVertices**2}}`` for all node pairs."""
        giniMatrix = {}
        for v1 in self.graph:
            giniMatrix[self.authorIndex[v1]] = {}
        # (A discarded nbVertices x nbVertices ``np.zeros`` allocation was removed here.)
        for v1 in self.graph:
            for v2 in self.graph:
                d = -1 * self.dist(self.authorIndex[v1], self.authorIndex[v2]) / self.nbVertices ** 2
                giniMatrix[self.authorIndex[v1]][self.authorIndex[v2]] = d
                giniMatrix[self.authorIndex[v2]][self.authorIndex[v1]] = d
        return giniMatrix

    def induced_graph(self, partition):
        """Replace ``self.graph`` by the graph whose nodes are the communities of ``partition``.

        Edge weights between (or inside) communities are summed.
        """
        newGraph = nx.Graph()
        newGraph.add_nodes_from(partition.values())

        for node1, node2, datas in list(self.graph.edges(data=True)):
            weight = datas.get("weight", 1)
            com1 = partition[node1]
            com2 = partition[node2]
            w_prec = newGraph.get_edge_data(com1, com2, {"weight": 0}).get("weight", 1)
            newGraph.add_edge(com1, com2, weight=w_prec + weight)
        self.graph = newGraph

    def __renumber(self):
        """Renumber communities of the structural status from 0 to k-1.

        :return: ``(node -> new_id, old_id -> new_id)``
        """
        count = 0
        dictionary = self.statusTab[0].node2com
        ret = dictionary.copy()
        new_values = dict([])
        for key in dictionary.keys():
            value = dictionary[key]
            new_value = new_values.get(value, -1)
            if new_value == -1:
                new_values[value] = count
                new_value = count
                count = count + 1
            ret[key] = new_value
        return ret, new_values

    def __one_level(self, giniMatrix=None):
        """Local-moving phase: move nodes between communities until no move helps."""
        statuses = self.statusTab
        modif = True

        while modif:
            modif = False
            for node in self.graph.nodes():
                com_node = statuses[0].node2com[node]
                best_com = com_node

                best_increase = 0
                neigh_communities = self.__neighcom(node, giniMatrix=giniMatrix)

                degc_totw_tab = []

                # Take the node out of its community in every status.
                for i, status in enumerate(statuses):
                    degc_totw_tab.append(status.gdegrees.get(node, 0.) / (status.total_weight * 2.))
                    theWeight = neigh_communities[com_node][i]

                    # Kept from the original: the community degree is clamped to
                    # |node degree| before removal (purpose not evident from this file).
                    if abs(status.degrees[com_node]) <= abs(status.gdegrees[node]):
                        status.degrees[com_node] = abs(status.gdegrees[node])

                    self.__remove(node, com_node, theWeight, status)
                assert statuses[0].node2com[node] == statuses[1].node2com[node]

                # Find the best community
                for com, dnc in neigh_communities.items():
                    incr = 0.
                    for i, status in enumerate(statuses):
                        totw = abs(status.total_weight)
                        if i == 0:
                            a = (abs(dnc[i]) - abs(status.degrees.get(com, 0.) * degc_totw_tab[i])) / totw
                            incr += a
                        else:
                            # Attribute weights are stored as negative values; the
                            # gain formula is applied with the opposite sign.
                            a = (0.0 - abs(dnc[i]) + abs(
                                status.degrees.get(com, 0.) * degc_totw_tab[i])) / totw
                            incr += a
                    incr /= 2
                    if incr > best_increase:
                        best_increase = incr
                        best_com = com

                # best_com is either com_node (always a key, see __neighcom) or a
                # key taken from neigh_communities, so this should never trigger.
                # The original printed a message and called exit(0) here, which
                # would terminate the host interpreter.
                if best_com not in neigh_communities:
                    raise RuntimeError("best_com not in neigh_communities")

                for i, status in enumerate(statuses):
                    self.__insert(node, best_com, neigh_communities[best_com][i], status)

                if best_com != com_node:
                    modif = True

    def __neighcom(self, node, giniMatrix=None):
        """Return ``{community: array([structural_weight, attribute_weight])}`` for the
        node's own community and the communities of its neighbours."""
        weights = {}
        voisins = self.graph[node].items()
        curCommunity = self.statusTab[0].node2com[node]
        if curCommunity not in weights:
            weights[curCommunity] = np.zeros([len(self.statusTab)])

        for neighbor, datas in voisins:
            if neighbor != node:
                weight = datas.get("weight", 1)
                neighborcom = self.statusTab[0].node2com[neighbor]

                if neighborcom not in weights:
                    weights[neighborcom] = np.zeros([len(self.statusTab)])

                # For the graph
                weights[neighborcom][0] = weights[neighborcom][0] + weight

                # For the attributes
                if giniMatrix is not None:
                    weight = giniMatrix[node][neighbor]
                else:
                    weight = -1 * self.dist(self.authorIndex[node], self.authorIndex[neighbor]) / self.nbVertices ** 2
                weights[neighborcom][1] = weights[neighborcom][1] + weight
        return weights

    def __remove(self, node, com, weight, status):
        """Remove ``node`` from community ``com`` in ``status`` (node2com becomes -1)."""
        status.degrees[com] = (status.degrees.get(com, 0.) - status.gdegrees.get(node, 0.))
        status.internals[com] = float(status.internals.get(com, 0.) - weight - status.loops.get(node, 0.))
        status.node2com[node] = -1

    def __insert(self, node, com, weight, status):
        """Insert ``node`` into community ``com`` in ``status``."""
        status.node2com[node] = com
        status.degrees[com] = (status.degrees.get(com, 0.) + status.gdegrees.get(node, 0.))
        status.internals[com] = float(status.internals.get(com, 0.) + weight + status.loops.get(node, 0.))

    def __modularity(self, status):
        """Modularity of ``status``; the sign is flipped when ``total_weight`` is negative."""
        links = abs(float(status.total_weight))
        result = 0.
        for community in set(status.node2com.values()):
            in_degree = abs(status.internals.get(community, 0.))
            degree = abs(status.degrees.get(community, 0.))
            expected = ((degree / (2. * links)) ** 2)
            found = in_degree / links
            if status.total_weight < 0:
                result += expected - found
            else:
                result += found - expected
        return result


class Status:
    """
    To handle several data in one struct.

    The containers are created per instance.  They used to be class-level
    dicts, and ``initAttribStatus`` (all of them) and ``init`` (``loops``)
    wrote into those shared objects, so the structural and attribute statuses
    -- and successive runs in the same process -- could contaminate each other.
    """

    def __init__(self):
        self.node2com = {}  # node -> community id
        self.total_weight = 0
        self.internals = {}  # community id -> internal weight
        self.degrees = {}  # community id -> summed degree
        self.gdegrees = {}  # node -> degree
        self.loops = {}  # node -> self-loop weight

    def __str__(self):
        return ("------------------------\nnode2com : " + str(self.node2com) + "\n degrees : "
                + str(self.degrees) + "\n gdegrees : "
                + str(self.gdegrees) + "\n internals : " + str(self.internals)
                + "\n total_weight : " + str(self.total_weight) + "\n loops:" + str(
                    self.loops) + "\n-----------------------")

    def initAttribStatus(self, graph, authorIndex, attributes):
        """Initialize the status of an attributes list with every node in one community"""
        N = len(graph)
        count = 0
        self.node2com = {}
        self.degrees = {}
        self.gdegrees = {}
        self.internals = {}
        self.loops = {}

        # Compute the center of gravity using dict
        meanVector = {}
        for attrs in attributes.values():
            for attrId, attrValue in attrs.items():
                meanVector[attrId] = meanVector.get(attrId, 0.) + attrValue
        for attrId in meanVector:
            meanVector[attrId] = meanVector[attrId] / N

        # Per-attribute sum of squared deviations from the center of gravity
        variance = {}
        for node in sorted(graph.nodes()):
            for attrId, attrValue in meanVector.items():
                variance[attrId] = variance.get(attrId, 0.) + (
                    (attrValue - attributes[authorIndex[node]].get(attrId, 0.)) ** 2)
        inertieTot = 0.
        for v in variance.values():
            inertieTot += (v / N)

        # Stored as a negative number; ML2 tests its sign when scoring.
        self.total_weight = (0.0 - inertieTot)

        for node in sorted(graph.nodes()):
            self.node2com[node] = count

            # Compute the distance to the center of gravity
            distanceToCenterOfGravity = 0.
            for attrId, attrValue in meanVector.items():
                distanceToCenterOfGravity += (attrValue - attributes[authorIndex[node]].get(attrId, 0.)) ** 2

            phiHuyghens = -1 * (inertieTot + distanceToCenterOfGravity) / N
            self.degrees[count] = phiHuyghens
            self.gdegrees[node] = phiHuyghens
            self.loops[node] = 0
            self.internals[count] = self.loops[node]
            count = count + 1

    def inducedAttribStatusTab(self, node2com, bijection):
        """Collapse the status so that each former community becomes one node.

        :param node2com: unused, kept for interface compatibility
        :param bijection: ``old_community_id -> new_community_id``
        """
        retrobijection = {}
        for k, v in bijection.items():
            retrobijection[v] = k
        oldDegrees = self.degrees
        oldInternals = self.internals

        self.degrees = dict([])
        self.gdegrees = dict([])
        self.internals = dict([])
        self.node2com = dict([])
        self.loops = dict([])

        for node in retrobijection:
            self.node2com[node] = node
            deg = oldDegrees[retrobijection[node]]
            self.degrees[node] = deg
            self.gdegrees[node] = deg
            self.loops[node] = oldInternals[retrobijection[node]]
            self.internals[node] = self.loops[node]

    def init(self, graph):
        """Initialize the status of a graph with every node in one community"""
        count = 0
        self.node2com = dict([])
        self.degrees = dict([])
        self.gdegrees = dict([])
        self.internals = dict([])
        self.loops = dict([])
        self.total_weight = graph.size(weight='weight')
        for node in sorted(graph.nodes()):
            self.node2com[node] = count
            deg = float(graph.degree(node, weight='weight'))
            self.degrees[count] = deg
            self.gdegrees[node] = deg
            self.loops[node] = float(graph.get_edge_data(node, node, {"weight": 0}).get("weight", 1))
            self.internals[count] = self.loops[node]
            count = count + 1


def loadDataset(path):
    """Load ``<path>.edgeList`` and ``<path>.attributes`` and return
    ``(graph, attributes, authorIndex)``.

    Edge lines are ``"<int> <int>"``; attribute lines are
    ``"<int> <v0>,<v1>,..."``.  Any existing ``<path>.2ModLouvain`` file is
    deleted as a side effect.

    :raises FileNotFoundError: if one of the two input files is missing
        (the original printed the message and called ``exit(-1)``).
    """
    edge_file = path + ".edgeList"
    attr_file = path + ".attributes"
    for required in (edge_file, attr_file):
        if not os.path.isfile(required):
            raise FileNotFoundError("Error: file '" + required + "' not found")

    # Read the graph
    graph = nx.Graph()
    with open(edge_file) as f:
        for line in f:
            fields = line.split(" ")
            # add_edge creates missing endpoints itself
            graph.add_edge(int(fields[0]), int(fields[1]))

    # Read the attributes (nodes without an attribute line keep an empty dict)
    attributes = {n: {} for n in graph}
    with open(attr_file) as f:
        for line in f:
            fields = line.split(" ")
            vertexId = int(fields[0])
            attributes[vertexId] = {i: float(value) for i, value in enumerate(fields[1].split(","))}

    # Build authorIndex
    authorIndex = {n: n for n in graph}

    if os.path.exists(path + ".2ModLouvain"):
        os.remove(path + ".2ModLouvain")

    return graph, attributes, authorIndex


def readToyGraph():
    """Return a 5-node complete toy graph with two numeric attributes per node."""
    graph = nx.Graph()
    graph.add_nodes_from(["a", "b", "c", "d", "e"])
    graph.add_edges_from([
        ("a", "b"), ("b", "c"), ("c", "d"), ("d", "e"), ("a", "e"),
        ("b", "e"), ("c", "e"), ("b", "d"), ("a", "c"), ("a", "d"),
    ])

    authorIndex = {"a": 0, "b": 1, "c": 2, "d": 3, "e": 4}

    attributes = {
        0: {0: 2., 1: 4},
        1: {0: 8., 1: 1},
        2: {0: 7., 1: 5},
        3: {0: 12., 1: 6},
        4: {0: 1., 1: 4}}
    return graph, attributes, authorIndex
# cdlib/classes/attr_node_clustering.py -- recovered from a whitespace-collapsed git patch.
from cdlib.classes.node_clustering import NodeClustering
from cdlib import evaluation


class AttrNodeClustering(NodeClustering):

    """Attribute Node Communities representation.

    :param communities: list of communities
    :param graph: a networkx/igraph object
    :param method_name: community discovery algorithm name
    :param coms_labels: dictionary specifying for each community the frequency of the attribute values
    :param method_parameters: configuration for the community discovery algorithm used
    :param overlap: boolean, whether the partition is overlapping or not
    """

    def __init__(self, communities, graph, method_name, coms_labels=None, method_parameters=None, overlap=False):
        super().__init__(communities, graph, method_name, method_parameters, overlap)
        self.coms_labels = coms_labels

    def purity(self):

        """Purity is the product of the frequencies of the most frequent labels carried by the nodes within the communities

        :return: FitnessResult object, or None when the clustering carries no ``coms_labels``

        :Example:

        >>> from cdlib.algorithms import eva
        >>> import networkx as nx
        >>> import random
        >>> g = nx.barabasi_albert_graph(100, 5)
        >>> labels = {n: {"l1": random.choice(['A', 'B']), "l2": random.choice(['E', 'F'])} for n in g.nodes()}
        >>> communities = eva(g, labels, alpha=0.8)
        >>> pur = communities.purity()
        """
        if self.coms_labels is None:
            return None
        # evaluation.purity() reads ``.coms_labels`` from the object it is
        # given, so it must receive the clustering itself; passing the label
        # dictionary (as the original did) raises AttributeError.
        return evaluation.purity(self)


# NOTE(review): the same span of the patch also rewrites the docstring example
# of NodeClustering.avg_odf (cdlib/classes/node_clustering.py), replacing
#     communities = louvain(g) / mod = communities.avg_odf()
# with
#     communities = eva(g, alpha=alpha) / pur = communities.purity()
# That looks accidental: it documents purity() inside avg_odf and calls eva()
# without the required ``labels`` argument.  Revert that hunk.
# cdlib/evaluation/fitness.py -- recovered from a whitespace-collapsed git patch.
# Besides the new ``purity`` function below, this part of the patch:
#   * adds ``import Eva`` to the module imports,
#   * appends "purity" to ``__all__``,
#   * re-spaces arithmetic operators in scaled_density, hub_dominance,
#     erdos_renyi_modularity, z_modularity, surprise and significance
#     (whitespace only) and drops one blank line in edges_inside.


def purity(communities):
    """Purity is the product of the frequencies of the most frequent labels carried by the nodes within the communities

    :param communities: AttrNodeClustering object (its ``coms_labels`` attribute is what gets scored)
    :return: FitnessResult object

    Example:

    >>> from cdlib.algorithms import eva
    >>> from cdlib import evaluation
    >>> import networkx as nx
    >>> import random
    >>> l1 = ['A', 'B', 'C', 'D']
    >>> l2 = ["E", "F", "G"]
    >>> g = nx.barabasi_albert_graph(100, 5)
    >>> labels = dict()
    >>> for node in g.nodes():
    ...     labels[node] = {"l1": random.choice(l1), "l2": random.choice(l2)}
    >>> communities = eva(g, labels, alpha=0.5)
    >>> pur = evaluation.purity(communities)
    """
    # (A leftover debug ``print(communities.coms_labels)`` was removed here.)
    pur = Eva.purity(communities.coms_labels)
    return FitnessResult(score=pur)


# cdlib/test/test_attributeclustering.py -- new file in the same patch; it is
# registered in cdlib/test/__init__.py with
# ``from .test_attributeclustering import *``.
import unittest
from cdlib import algorithms
import networkx as nx
import random


class AttrCommunityDiscoveryTests(unittest.TestCase):
    """Smoke tests for the attribute-aware community discovery algorithms."""

    def test_eva(self):
        l1 = ['one', 'two', 'three', 'four']
        l2 = ["A", "B", "C"]
        g = nx.barabasi_albert_graph(100, 5)
        labels = {node: {"l1": random.choice(l1), "l2": random.choice(l2)} for node in g.nodes()}

        coms = algorithms.eva(g, labels, alpha=0.5)

        self.assertIsInstance(coms.communities, list)
        if len(coms.communities) > 0:
            self.assertIsInstance(coms.communities[0], list)
            self.assertIsInstance(coms.communities[0][0], int)

    def test_ilouvain(self):
        l1 = [0.1, 0.4, 0.5]
        l2 = [34, 3, 112]
        g = nx.barabasi_albert_graph(100, 5)
        labels = {node: {"l1": random.choice(l1), "l2": random.choice(l2)} for node in g.nodes()}

        # identity node -> attribute-key mapping (local renamed so the ``id`` builtin is not shadowed)
        node_ids = {n: n for n in g.nodes()}

        coms = algorithms.ilouvain(g, labels, node_ids)

        self.assertIsInstance(coms.communities, list)
b/cdlib/test/test_fitness_functions.py @@ -1,10 +1,12 @@ import unittest -from cdlib.algorithms import louvain +from cdlib.algorithms import louvain, eva import networkx as nx import numpy as np +import random from cdlib import evaluation + class FitnessFunctionsTests(unittest.TestCase): def test_link_modularity(self): @@ -66,3 +68,20 @@ def test_pquality_indexes(self): for idx in indexes: res = idx(g, communities) self.assertIsInstance(res, evaluation.FitnessResult) + + def test_purity(self): + + l1 = ['one', 'two', 'three', 'four'] + l2 = ["A", "B", "C"] + g_attr = nx.barabasi_albert_graph(100, 5) + labels = dict() + + for node in g_attr.nodes(): + labels[node] = {"l1": random.choice(l1), "l2": random.choice(l2)} + + coms = eva(g_attr, labels, alpha=0.8) + + pur = evaluation.purity(coms) + + self.assertGreaterEqual(pur.score, 0) + self.assertLessEqual(pur.score, 1) diff --git a/requirements.txt b/requirements.txt index de84c4db..f5890954 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,4 @@ pulp==1.6.* pquality==0.0.7 seaborn==0.9.* pandas==0.25.* +eva_lcd