In [None]:
# This notebook deals with creating an edgelist file containing all the edges in the nth largest
# connected component of the large edgelist (600 million edges). It does not rely on NetworkX
# or any other libraries other than Pandas to do this (since they are too computationally expensive)
import time
import dask.dataframe as dd
from fun.fun import *

In [2]:
class UnionFind:
    def __init__(self, n):
        self.parent = [i for i in range(n)]
        self.rank = [0] * n
        self.size = [1] * n

    def find(self, u):
        if self.parent[u] != u:
            self.parent[u] = self.find(self.parent[u])
        return self.parent[u]

    def union(self, u, v):
        pu, pv = self.find(u), self.find(v)
        if pu == pv:
            return
        if self.rank[pu] < self.rank[pv]:
            self.parent[pu] = pv
            self.size[pv] += self.size[pu]
        elif self.rank[pv] < self.rank[pu]:
            self.parent[pv] = pu
            self.size[pu] += self.size[pv]
        else:
            self.parent[pu] = pv
            self.rank[pv] += 1
            self.size[pv] += self.size[pu]

In [3]:
def initialize_unionfind(edges):
    print("Initializing UnionFind data structure ...")
    nodes = set(edges['source']).union(set(edges['target']))
    node_index = {node: i for i, node in enumerate(nodes)}
    n = len(nodes)
    print("Found {:_} unique nodes".format(n))
    uf = UnionFind(n)
    print("Computing union of all edges ...")
    handled, total = 0, len(edges)
    for _, row in edges.iterrows():
        uf.union(node_index[row['source']], node_index[row['target']])
        handled, perc  = track_progress(total, handled, "edges handled:", inc=100)
    print("\nDone.")
    return uf, node_index

In [4]:
def get_connected_component(uf, node_index, n=0):
    comps = {}
    for node, i in node_index.items():
        parent = uf.find(i)
        comps[parent] = comps.get(parent, []) + [node]
    if n >= len(comps):
        return []
    keys_sorted = sorted( comps.keys(), reverse=True, key=lambda parent: len(comps[parent]) )
    return comps[keys_sorted[n]]

In [5]:
def get_amount_of_components(uf, node_index):
    unique_parents = set()
    for i in range(len(node_index)):
        unique_parents.add(uf.find(i))
    return len(unique_parents)

In [6]:
# START
edges_fn = "../data/edges.parquet"
# edges_total = 684_732_453

In [None]:
# -> IN : Read edge list to dask df
print("reading edges ... ", end='')
start = time.time()
df = dd.read_parquet("../data/edges.parquet").head(100_000)
end = time.time()
print("read {:_} lines (took {:.1f}s)".format(len(df), (end-start)))
df.head()

In [25]:
# Initialize union find
uf, node_index = initialize_unionfind(df)

Initializing UnionFind data structure ...
Found 1_605 unique nodes
Computing union of all edges ...
 edges handled: 100_000/100_000 (100.00000%)
Done.


In [26]:
# Get nth largest connected component
cc_n = get_amount_of_components(uf, node_index)
print("Found {} connected components:".format(cc_n))
for i in range(cc_n):
    component = get_connected_component(uf, node_index, i)
    print("  component {} has {:_} nodes".format(i+1, len(component)))

Found 8 connected components:
  component 1 has 801 nodes
  component 2 has 356 nodes
  component 3 has 221 nodes
  component 4 has 136 nodes
  component 5 has 36 nodes
  component 6 has 32 nodes
  component 7 has 18 nodes
  component 8 has 5 nodes


In [32]:
# DICT generate dict of nodes to groups
d = {}
cc_n = get_amount_of_components(uf, node_index)
for i in range(cc_n):
    group = i+1
    component = get_connected_component(uf, node_index, i)
    for node in component:
        d[node] = group
sorted_keys = sorted(d.keys())

In [39]:
# <- OUT : Save node communities to file
import csv
file = open("../data/communities.csv", 'w', newline='')
writer = csv.writer(file)
writer.writerow(['node', 'community'])
for i, key in enumerate(sorted(d.keys())):
    group = d[key]
    writer.writerow([key, group])
    track_progress(len(d), i, text="rows written:", inc=100)
file.close()

 rows written: 500_000/500_000 (100.00000%)

In [11]:
# Get edgelist of connected component
dfcc = df[ (df['source'].isin(component)) & (df['target'].isin(component)) ]
print("{:_} / {:_}".format(len(dfcc), len(df)))

1 / 1_000


In [12]:
# <- OUT : Save new edges to file
dfcc.to_csv("../data/edges_cc_test.csv", encoding='utf-8', index=False)