In [15]:
import networkx as nx
import numpy as np
import igraph as ig
from infomap import Infomap
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics import f1_score, jaccard_score
from networkx.algorithms.community.quality import modularity
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

# Graph Creation

In [2]:
def read_data(data_path="../data/emails.txt", labels_path="../data/emails_labels.txt"):
    with open(labels_path, "r") as f:
        nodes_labels = f.read()

    nodes_labels = nodes_labels.strip("\n").split("\n")
    nodes = [int(x.split()[0]) for x in nodes_labels]
    labels = [int(x.split()[1]) for x in nodes_labels]

    with open(data_path, "r") as f:
        edges_data = f.read()

    edges = {x: dict() for x in nodes}
    edges = [(int(x.split()[0]), int(x.split()[1])) for x in edges_data.strip("\n").split("\n")]

    return np.array(edges), np.array(nodes), np.array(labels)

In [3]:
def create_graph(edges, nodes):
    G = nx.Graph()
    G.add_nodes_from(nodes)
    G.add_edges_from(edges)
    return G

In [5]:
e, n, l = read_data()
e.shape, n.shape, l.shape

((25571, 2), (1005,), (1005,))

In [6]:
G = create_graph(e, n)
G.number_of_edges()

16706

In [10]:
adjlist = nx.generate_adjlist(G)

g = ig.Graph()

g.add_vertices(G.number_of_nodes())

label_to_id = {label: idx for idx, label in enumerate(G.nodes())}

ig_edges = [(label_to_id[edge[0]], label_to_id[edge[1]]) for edge in G.edges()]

g.add_edges(ig_edges)

# Infomap

In [11]:
infomap_wrapper = Infomap("--two-level")
for e in g.es:
    infomap_wrapper.add_link(*e.tuple)

infomap_wrapper.run();

In [13]:
"Found %d modules with codelength: %f" % (infomap_wrapper.num_top_modules, infomap_wrapper.codelength)

'Found 61 modules with codelength: 8.696479'

In [14]:
community_assignments = [None] * len(g.vs)

for node in infomap_wrapper.tree:
    if node.is_leaf:
        community_assignments[node.node_id] = node.module_id

In [16]:
le = LabelEncoder()
numeric_cluster_labels = le.fit_transform(community_assignments)

clusters = [set() for _ in range(max(numeric_cluster_labels)+1)]
for node, cluster in zip(G.nodes(), numeric_cluster_labels):
    clusters[cluster].add(node)
mod = modularity(G, clusters)
print(f'Modularity: {mod}')

ari = adjusted_rand_score(l, numeric_cluster_labels)
print(f'Adjusted Rand Index: {ari}')

nmi = normalized_mutual_info_score(l, numeric_cluster_labels)
print(f'Normalized Mutual Information: {nmi}')


mlb = MultiLabelBinarizer()

binary_cluster_assignments = mlb.fit_transform([[label] for label in numeric_cluster_labels])
binary_true_assignments = mlb.transform([[label] for label in l])

f1 = f1_score(binary_true_assignments, binary_cluster_assignments, average='weighted')
print(f'F1 Score: {f1}')

jaccard = jaccard_score(binary_true_assignments, binary_cluster_assignments, average='micro')
print(f'Jaccard Index: {jaccard}')

Modularity: 0.41857250708512284
Adjusted Rand Index: 0.29431629775857837
Normalized Mutual Information: 0.6291805324992814
F1 Score: 0.07968164781238346
Jaccard Index: 0.03929679420889348
