In [15]:
import glob
import homer
import dask.dataframe as dd
import pandas as pd
import numpy as np
import pickle
import itertools

In [2]:
collection = homer.Homer(weighted_edge_list_globstring='tests/resources/Making_Connections_Generated_Data.hdf')

In [3]:
collection.compute_clusters('working/MC_gen_clusters_*.hdf', min_threshold=1)

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


In [4]:
collection.compute_relations('working/MC_gen_relations_*.hdf')

In [50]:
class Cluster(object):
    def __init__(self, 
                 contents,
                 k=None,
                 w=None,
                 date=None,
                 is_leaf=False):
        self.contents = contents
        self.k = k  # k-clique clustering parameter
        self.w = w  # threshold
        self.date = date
        self.is_leaf = is_leaf
        
        self.k_children = []
        self.w_children = []
        self.k_parents = []
        self.t_parents = []
        self.tomorrow = []
        self.p_tomorrow = []
        self.yesterday = []
        self.p_yesterday = []
        self.members = []
        
        self.left = None
        self.right = None
        
        
    def flatten(self):
        """ All of the words in the cluster """
        
    def __repr__(self):
        return str(self.contents)
    
    def find(self, keyword):
        """
        query: string to find a particular keyword
        """
        if keyword > self.contents:
            try:
                return self.right.find(keyword)
            except:
                return None
        elif keyword < self.contents:
            try:
                return self.left.find(keyword)
            except:
                return None
        elif keyword == self.contents:
            return self
        
    def insert(self, cluster_obj):
        """ for inserting leaves """
        if cluster_obj.contents > self.contents:
            try:
                self.right.insert(cluster_obj)
            except:  # right is none, so set it to the new object
                self.right = cluster_obj
        elif cluster_obj.contents < self.contents:
            try:
                self.left.insert(cluster_obj)
            except: # left is none, so set it to the new object
                self.left = cluster_obj
        elif cluster_obj.contents == self.contents:
            raise ValueError('Already have this one. Should probably implement merge here.')
            

    def get_k_members(self):
        if not self.members:
            if self.is_leaf:
                self.members = self.contents
            else:
                self.members = itertools.chain([n.get_k_members() for n in self.k_children])
        return self.members
        
        

In [51]:
def apply_to_tree(node, f, how='center'):
    if node is None:
        return
    if how is 'left':
        f(node)
    apply_to_tree(node.left, f)
    if how is 'center':
        f(node)
    apply_to_tree(node.right, f)
    if how is 'right':
        f(node)

In [52]:
def walk_k_ancestry(tree, order='bottom up'):
    if order == 'top down':
        yield tree 

    for child in tree.k_children:
        for elem in walk_k_ancestry(child, order):
            yield elem
    
    if order == 'bottom up':
        yield tree
    
    if order not in ['top down', 'bottom up']:
        raise ValueError('Bad Value for "order"')
        

In [63]:
# put clusters in a tree
root = Cluster('__root__')
for ID, row in collection.clusters.iterrows():
    new = Cluster(contents=str(ID),
                  k=row['k'],
                  w=row['threshold'],
                  date=row['Date'])
    
    root.insert(new)
    if row['k'] == 3:
        root.k_children.append(new)
    

    

In [64]:
# add clusters as children
for (_, _, ID), row in collection.relations.iterrows():
    node = root.find(str(ID))
    for child in row['children']:
        node.k_children.append(root.find(str(child)))

In [65]:
# add leaves (words)
for node in walk_k_ancestry(root, 'bottom up'):
    print(node)
    present_in_children = node.get_k_members()
    words = collection.clusters['Set'].loc[int(node.contents)].compute().values[0].split(' ')
    for leaf_word in list(set(words)-set(present_in_children)):
        leaf = root.find(leaf_word)
        if leaf is None:
            leaf = Cluster(leaf_word, is_leaf=True)
            root.insert(leaf)
        node.k_children.append(leaf)
        

-3496304249982661967
-7583807475848969340
-7372375518307365886
-1580432843336694116
213785065144779913
7179004229618902589
7579613705214725860
5268342693731845156
-9120092168898285478
3128681681080870309
-9037474792455870455
-8955311868503315873
-3669200604818778700
3041231414810347708
5868668942002051641
-708254164392898234
-8725875768127475733
3907545003442243585
1976617768702285945
-8454976086606258137
403193416136662345
-8551037380822914686
-8549184739472257284
-8009600784878711672
3128678774098877359
-7250685934399208066
-6704968416265504836
-6704966962779498661
-6704965509288502186
-8551041741295904111
-6684201670308989435
-6521242543707643408
-6584515745252808277
-6418194257954130416
-6418157920679218541
-6418156467178241466
3123716984781411287
-6181953241311343916
-2839429919156836089
-1369408599423015707
-682809135185681555
2040528023996207878
-5886685429130949988
2062249958341171746
-6839713795105265135
3041227054336110708
5861962599639446544
9044009008906916535
8480911159032

ValueError: invalid literal for int() with base 10: '__root__'

In [66]:
apply_to_tree(root, lambda x: print(x, x.k_children))

-1170252738776442330 []
-1170254192277419405 []
-1170262913220903105 [live, fewer, wear, wish, peace, share]
-1170268727177403555 []
-1170270180670895180 [live, fewer, wear, wish, peace, share]
-1170271634161891655 []
-121101197443335237 [live, wear, wish, peace, fewer, honour, share, upon]
-1328951369400172921 [live, wear, wish, one]
-1359659636287308887 [one, garments, wish, covet, markd, dwell]
-1369408599423015707 [peace, honour, pray, upon]
-1409427029017598059 [-8809262921407238522, live, cost, wear, wish, peace, fewer, honour, share, one]
-1562923068658001662 [doth, cost, honour]
-1580432843336694116 [wish, gods, doth, care, markd, honour, things, live]
-1624831385635998859 [5021525720259675872, wear, wish, gods, pray, soul, fewer, markd, things, honour, upon, one, cost, covet, peace, dwell, care, share, live]
-1635981473442134971 [covet, cost, markd, wish, care]
-1635982926938121746 []
-1857537900741793665 [-8798029411813074863, -2465867530942332432, 5318898181724314082, honour

In [77]:
root.k_children[-1].k_children

[8458272823378304437,
 live,
 cost,
 wear,
 wish,
 peace,
 soul,
 fewer,
 care,
 honour,
 things,
 share,
 one]

[-682455678926282972]

2073065243354279717

In [73]:
collection.clusters.loc[-9120092168898285478].compute()

Unnamed: 0_level_0,Set,k,threshold,Date
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-9120092168898285478,faith country wish jove enow markd things grea...,3,1,1


In [14]:
collection.relations.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,children
Date,threshold,ID,Unnamed: 3_level_1
1,1,785040444659930722,[-5525551585677041704]
1,1,-5525551585677041704,[-3262778911109569049]
1,1,-3262778911109569049,"[-7472826804915823269, -1480890997324872430, 1..."
1,1,-7472826804915823269,[]
1,1,-1480890997324872430,[3882636840346142346]
