In [1]:
import networkx as nx
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics import f1_score, jaccard_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder

# Graph Creation

In [2]:
def read_data(data_path="../data/emails.txt", labels_path="../data/emails_labels.txt"):
    with open(labels_path, "r") as f:
        nodes_labels = f.read()

    nodes_labels = nodes_labels.strip("\n").split("\n")
    nodes = [int(x.split()[0]) for x in nodes_labels]
    labels = [int(x.split()[1]) for x in nodes_labels]

    with open(data_path, "r") as f:
        edges_data = f.read()

    edges = {x: dict() for x in nodes}
    edges = [(int(x.split()[0]), int(x.split()[1])) for x in edges_data.strip("\n").split("\n")]

    return np.array(edges), np.array(nodes), np.array(labels)

In [3]:
def create_graph(edges, nodes):
    G = nx.Graph()
    G.add_nodes_from(nodes)
    G.add_edges_from(edges)
    return G

In [4]:
e, n, l = read_data()
e.shape, n.shape, l.shape

((25571, 2), (1005,), (1005,))

In [5]:
G = create_graph(e, n)
G.number_of_edges()

16706

# DBSCAN

In [7]:
adjacency_matrix = nx.to_numpy_array(G)
distance_matrix = np.where(adjacency_matrix == 0, 1, 0)
dbscan = DBSCAN(eps=1, min_samples=2, metric="precomputed")

In [8]:
clusters = dbscan.fit_predict(distance_matrix)

In [10]:
np.where(clusters != 0)

(array([], dtype=int64),)