In [51]:
import os
import time
from itertools import combinations
from collections import namedtuple

##### Section I: max-spacing kk-clustering.
run the clustering algorithm from lecture on this data set, where the target number kk of clusters is set to 4. What is the maximum spacing of a 4-clustering?

In [72]:
data_folder = os.path.dirname(os.path.dirname(os.getcwd())) +"/data"
fname_clustering1 = data_folder + "/clustering1.txt"
fname_clustering2 = data_folder + "/clustering_big.txt"
# print(fname_clustering1)

In [41]:
Edge = namedtuple('Edge', ('head', 'tail', 'cost'))

def read_edges(fname):
    nodes, edges = set([]), []
    with open(fname, "rb") as f:
        header = next(f)
        for line in f:
            head, tail, cost = [int(x) for x in line.split()]
            edges.append(Edge(head, tail, cost))
            nodes.update((head, tail))
    return list(nodes), edges

In [63]:
"""
K-clustering algorithm using the Union-Find data structure.
Proceeds similar to Kruskal's MST algorithm, differs in stopping when there are k components of a graph, instead of a connected graph."""

class UnionFind:
    
    def __init__(self, nodes):
        self.nodes = nodes
        self.components = {}
        self.leaders = {}
        for node in nodes:
            #first, assign each node to its own cluster and mark itself as the leader
            self.components.setdefault(str(node), []).append(node) #allow for duplicates
            self.leaders[str(node)] = node
    
    def find(self, node): 
        #find the leader of the node
        return self.leaders[str(node)]
        
    def union(self, node1, node2):
        # keys in the same component, union already performed
        node1_leader, node2_leader = self.find(str(node1)), self.find(str(node2))
        if node1_leader == node2_leader:
            return 
        if len(self.components[str(node1_leader)]) >= len(self.components[str(node2_leader)]):
            new_leader = node1_leader
            old_leader = node2_leader
        else:
            new_leader = node2_leader
            old_leader = node1_leader
        
        # Add the old group the new group
        old_group = self.components.pop(str(old_leader))
        self.components[str(new_leader)].extend(old_group)
        # Reassign the leaders in the old group to new leader
        for node in old_group:
            self.leaders[str(node)] = new_leader
        return 


In [64]:
nodes, edges = read_edges(fname_clustering1)
edges.sort(key=lambda x: x[2])

In [65]:
start_time = time.time()
# set the number of clusters 
k = 4

uf = UnionFind(nodes)
while len(uf.components) >= k:
    min_edge = edges.pop(0)
    uf.union(min_edge.head, min_edge.tail)
# max spacing is the smallest unused edge
print("max spacing:", min_edge.cost)
    
end_time = time.time()
print('time: ' + str(end_time - start_time) + 's ')
print('time: ' + str((end_time - start_time) / 60) + 'min')
print('time: ' + str((end_time - start_time) / 3600) + 'h')

max spacing: 106
time: 0.05011916160583496s 
time: 0.0008353193600972494min
time: 1.3921989334954156e-05h


#### section II: hamming dist clustering
"""
For example, the third line of the file "0 1 1 0 0 1 1 0 0 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1" denotes the 24 bits associated with node #2.

The distance between two nodes uu and vv in this problem is defined as the Hamming distance--- the number of differing bits --- between the two nodes' labels. For example, the Hamming distance between the 24-bit label of node #2 above and the label "0 1 0 0 0 1 0 0 0 1 0 1 1 1 1 1 1 0 1 0 0 1 0 1" is 3 (since they differ in the 3rd, 7th, and 21st bits)

what is the largest value of kk such that there is a kk-clustering with spacing at least 3? That is, how many clusters are needed to ensure that no pair of nodes with all but 2 bits in common get split into different clusters?
"""

In [87]:
def read_hamming(fname):
    nodes = []
    with open(fname, "rb") as f:
        header = next(f)
        num_nodes, node_bytes = map(int, header.split())
        print("number of nodes-", num_nodes, " & byte size for node-", node_bytes)
        for line in f:
            nodes.append([int(x) for x in line.split()])
    return nodes

def flip_bit(bit):
    if bit == 1:
        return 0
    if bit == 0:
        return 1

In [88]:
nodes = read_hamming(fname_clustering2)

number of nodes- 200000  & byte size for node- 24


In [107]:
start_time = time.time()
max_spacing = 2
uf = UnionFind(nodes)

for node in uf.nodes:
    for space in range(1, max_spacing + 1):
        idx2replace = combinations(range(len(node)), space)
        for idxs in idx2replace:
            # trick is to build new node (`node_within_space`) with limited space distatnce instead looping of graph nodes
            node_within_space = node.copy()
            for pos in idxs:
                node_within_space[pos] = flip_bit(node_within_space[pos])
            try:
                uf.union(node, node_within_space)
            except:
                pass

print("total number of clusters left:", len(uf.leaders))
end_time = time.time()
print('time: ' + str(end_time - start_time) + 's ')
print('time: ' + str((end_time - start_time) / 60) + 'min')
print('time: ' + str((end_time - start_time) / 3600) + 'h')

total number of clusters left: 198788
time: 345.94753217697144s 
time: 5.765792202949524min
time: 0.0960965367158254h


In [102]:
print(nodes[1])
print([flip_bit(i) for i in nodes[1]])

[0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1]
[1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0]
