In [128]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [129]:
import os
import traceback
import numpy as np

from hyperbolicTSNE import Datasets, load_data
from hyperbolicTSNE import Datasets
from hyperbolicTSNE.visualization import plot_poincare, animate

from data_gen import find_children, find_depth, generate_Tree_D_V

In [138]:
# Main parameters
# All values below are forwarded to generate_Tree_D_V (imported from data_gen);
# their exact meaning is defined by that helper.
# NOTE(review): dist/mu/sigma look like the inter-cluster distance and the
# mean / standard deviation of intra-cluster distances - confirm in data_gen.
dist = 10
depth = 2
n_children = 2
cluster_size = 2
sigma = 2
mu = 5
# 1 + n_children + ... + n_children**(depth - 1): the node count of a complete
# tree with `depth` levels. Note that the helper functions further down in this
# notebook sum over range(depth + 1) instead, i.e. they count one more level
# for the same `depth` value.
n_nodes = sum(np.power(n_children, d) for d in range(depth))

# Initial distance matrix D
D, V = generate_Tree_D_V(cluster_size, n_children, n_nodes, mu, sigma, dist)
print(D)
# Sanity check: D equals its transpose (i.e. the matrix is symmetric)
print(np.allclose(D, D.T))

# Total mass of V
print(V.sum())

[[ 0.          4.28194202 11.94998164 15.1694803  16.25150787 18.0565542 ]
 [ 4.28194202  0.         12.9437631  16.24175564 16.6771032  17.27038277]
 [11.94998164 12.9437631   0.          2.07859554 28.20148951 33.2260345 ]
 [15.1694803  16.24175564  2.07859554  0.         29.6208663  33.51213841]
 [16.25150787 16.6771032  28.20148951 29.6208663   0.          4.10485562]
 [18.0565542  17.27038277 33.2260345  33.51213841  4.10485562  0.        ]]
True
1.0


In [70]:
import numpy as np

# Small (n, n) test array and the number of copies to stack
n, d = 3, 4
arr = np.arange(n ** 2).reshape((n, n))

print(arr.shape)
# Every entry of the repeated sequence is the very same (n, n) array
for arr in (arr,) * d:
    print(arr.shape)


# np.stack inserts the new axis at position 1, so the result has shape (n, d, n):
# stacked_arr[i, k, :] is row i of arr for every copy k
stacked_arr = np.stack((arr,) * d, axis=1)

print(stacked_arr.shape)  # Output: (3, 4, 3)
print(stacked_arr)

(3, 3)
(3, 3)
(3, 3)
(3, 3)
(3, 3)
(3, 4, 3)
[[[0 1 2]
  [0 1 2]
  [0 1 2]
  [0 1 2]]

 [[3 4 5]
  [3 4 5]
  [3 4 5]
  [3 4 5]]

 [[6 7 8]
  [6 7 8]
  [6 7 8]
  [6 7 8]]]


# Tree-data testing

In [None]:
from scipy.stats import multivariate_normal
from __future__ import annotations 

class TreeDataset():
    """
    Container for a synthetic tree-structured dataset.

    Holds the root TreeNode together with zero-initialised distance (D) and
    affinity (V) matrices that have one row/column per data point, i.e.
    n_data * n_nodes rows and columns.
    """
    def __init__(self, n_data, n_children, depth):
        """
        n_data:     nr. of datapoints per node
        n_children: nr. of children per node
        depth:      depth of the tree (depth = 0 means only a root)
        """
        # Basic parameters
        self.n_data = n_data            # nr. of datapoints per node
        self.n_children = n_children    # nr. of children per node 
        self.depth = depth              # depth of the tree
        self.dist = 20

        # Calculate size of D, V: a complete tree has 1 + n + n^2 + ... + n^depth nodes
        self.n_nodes = int(np.sum([np.power(n_children, p) for p in range(depth + 1)]))
        size = n_data * self.n_nodes

        # Create the root node. TreeNode takes (parent, depth, n_children, dist, node_ids)
        # and pops its own id from the END of node_ids, so the list is stored in
        # descending order to hand out 0 first. Children are not expanded here.
        node_ids = list(range(self.n_nodes - 1, -1, -1))
        self.tree = TreeNode(None, depth, n_children, self.dist, node_ids)

        # D[n_data*i : n_data*(i+1)] 
        # Corresponds to the distance rows of node i
        #
        # D[n_data*i : n_data*(i+1), n_data*j : n_data*(j+1)] 
        # Corresponds to distance blocks of node i with node j
        self.D = np.zeros((size, size))
        self.V = np.zeros((size, size))


    def compute_distances(self, tree_node:TreeNode):
        """
        Walk the subtree rooted at `tree_node` and let every node that has
        children update its distances in self.D. Leaves (children is None)
        end the recursion and are not updated themselves.
        """ 
        if tree_node.children is None:
            return 
        
        # Update the distances of the current node (TreeNode.update_distances
        # is the distance hook that TreeNode actually defines)
        tree_node.update_distances(self.D)

        # Perform same computation in children
        for child in tree_node.children:
            self.compute_distances(child)





def generate_hierarchical_D_V(cluster_size, n_children, depth):
    """
    Generates a D (distance) and V (affinity) matrix for a tree-like/hierarchical dataset
    where we have 'n_children' children per node, with a depth of 'depth'

    depth = 0 means only a root, etc..

    cluster_size:   Nr. of data points per node cluster
    n_children:     Nr. of direct children per node
    depth:          Depth of the tree

    Returns (D, V). D has shape (n_nodes, cluster_size, n_nodes, cluster_size);
    V is not computed yet and is returned as None.
    """

    # Complete tree: 1 + n + n^2 + ... + n^depth nodes (plain int so it can drive range())
    n_nodes = int(np.sum([np.power(n_children, d) for d in range(depth + 1)]))
    sigma = 4       # parameters for distance sampling inside of a cluster
    mu = 4          
    dist = 50       # distance to other clusters
    
    # D[i, n, j, m] = src_node i; src_node cluster point n; tgt_node j, tgt_node cluster m
    D = np.zeros((n_nodes, cluster_size, n_nodes, cluster_size))

    # Initialize diagonal entries (node i=a with j=a), where distances between different
    # points in the same cluster are randomly drawn
    for node in range(n_nodes):
        dists = np.abs(sigma * np.random.randn(cluster_size, cluster_size) + mu)
        # Keep the strict upper triangle and mirror it: symmetric block with a zero diagonal
        upper = np.triu(dists, 1)
        D[node, :, node, :] = upper + upper.T

    # Compute distances to all children for every node
    for node in range(n_nodes):
        compute_child_distances(D, node, n_children, dist, n_nodes)

    # TODO: the affinity matrix is not derived from D yet
    V = None

    return D, V





def compute_child_distances(D, root, n, dist, n_nodes):
    """ 
    Fill in the distance blocks between `root` and each of its direct children,
    recursing into the children first (bottom up). D is modified in place.

    D:          Distance matrix, indexed as D[src_node, src_point, tgt_node, tgt_point]
    root:       Index of root node we wish to compute for
    n:          Nr. of direct children per node
    dist:       Distance between each node cluster
    n_nodes:    Total nr. of nodes; node indices >= n_nodes do not exist
    """
    # Base case, no more children, end of tree
    if root >= n_nodes:
        return

    # Assume dist distance between the cluster means
    # so distance between individual points would be: d(p_1, mean_1) + dist + d(p_2, mean_2)
    # Root term: intra-cluster distances minus their mean. The mean is taken over
    # the (cluster_size x cluster_size) block itself. It does not change inside
    # the loop below, so it is computed once.
    root_block = D[root, :, root, :]
    root_mean_dist = root_block - root_block.mean()

    # Children of node `root` are n * root + 1, ..., n * root + n
    for child in range(n * root + 1, n * root + n + 1):
        # Can't go out of bounds; indices ascend, so later children are out of bounds too
        if child >= n_nodes:
            break

        # Compute distances for each children
        compute_child_distances(D, child, n, dist, n_nodes)

        # Child term: intra-cluster distances minus their mean
        child_block = D[child, :, child, :]
        child_mean_dist = child_block - child_block.mean()

        # Distance from root to child (idx), mirrored so that D stays symmetric
        D[root, :, child, :] = root_mean_dist + dist + child_mean_dist
        D[child, :, root, :] = D[root, :, child, :].T

        # TODO: distance from root to child's children is not filled in yet


class TreeNode():
    """ 
    Each tree node represents a cluster of data with a parent and children.
    Every node takes its id by popping the last entry of a shared `node_ids` list.
    When the tree is created breadth-first and ids are handed out as 0, 1, 2, ...
    this means the following:

    children_id's := (n_children * node_id + 1), ... (n_children * node_id + n_children)
    parent_id     := floor((node_id - 1) / n_children)      -- for every node except the root
    """
    def __init__(self, parent:TreeNode, depth:int, n_children:int, dist:float, node_ids: list):
        """
        parent:     Parent node, None for the root
        depth:      Remaining levels below this node (0 means this node gets no children)
        n_children: Nr. of children created by create_children
        dist:       Distance value stored on the node and handed on to its children
        node_ids:   Shared list of free ids; this node removes and keeps the last one
        """
        self.parent: TreeNode   = parent            # Singular parent
        self.depth: int         = depth    
        self.dist: float        = dist
        self.node_ids: list     = node_ids          # kept so create_children can hand it on
        self.id: int            = node_ids.pop()
        self.n_children: int    = n_children
        self.children:list      = None              # stays None until create_children runs

    def update_distances(self, D):
        """Placeholder: not implemented yet, does nothing."""
        pass

    def create_children(self):
        """
        Create the direct children of this node (one level only, not recursive).
        Does nothing when depth == 0, so leaves keep children = None.
        """
        if self.depth <= 0:
            return

        self.children = [
            TreeNode(self, self.depth - 1, self.n_children, self.dist, self.node_ids)
            for _ in range(self.n_children)
        ]


def construct_tree_D_V(n_children, depth, dist):
    """
    Build a complete tree and the node-level distance matrix of its edges.

    n_children: Nr. of children per node
    depth:      Depth of the tree (depth = 0 means only a root)
    dist:       Distance written for every parent-child edge

    Returns (D, V): D is an (n_nodes, n_nodes) array filled by init_D,
    V is not computed and is returned as None.
    """
    n_nodes: int   = int(np.sum([np.power(n_children, d) for d in range(depth + 1)]))  # total nr. of tree nodes
    # TreeNode pops ids from the end of the list, so store them in descending
    # order: exactly n_nodes ids, handed out as 0, 1, 2, ...
    node_ids: list = list(range(n_nodes - 1, -1, -1))
    tree: TreeNode = TreeNode(None, depth, n_children, dist, node_ids)

    # Expand the tree level by level (breadth-first), so that the children of
    # node i receive the ids n_children * i + 1, ..., n_children * i + n_children
    level = [tree]
    while level:
        next_level = []
        for node in level:
            if node.depth > 0:
                node.children = [
                    TreeNode(node, node.depth - 1, n_children, dist, node_ids)
                    for _ in range(n_children)
                ]
                next_level.extend(node.children)
        level = next_level

    # Distance matrices    
    D = np.zeros((n_nodes, n_nodes))
    V = None 

    # A single-node tree has no edges, so there is nothing to fill in
    if tree.children is not None:
        init_D(D, None, tree, dist)

    return D, V


def init_D(D: np.array, parent: TreeNode, current: TreeNode, dist: int):
    """
    Recursively write `dist` into D for every parent-child edge of the subtree
    rooted at `current`. D is modified in place and kept symmetric.

    D:          Square matrix indexed by node id
    parent:     Parent of `current`, or None when `current` is the root
    current:    Node whose subtree is processed
    dist:       Value stored for each edge
    """
    # Edge between this node and its parent; the root (parent is None) has none
    if parent is not None:
        D[parent.id, current.id] = dist
        D[current.id, parent.id] = dist

    # Leaf: nothing below to descend into
    if current.children is None:
        return

    # Each recursive call writes the edge current <-> child before descending
    for ch in current.children:
        init_D(D, current, ch, dist)

# n_children = 2
# depth = 2
# dist = 1

# D, V = construct_tree_D_V(n_children, depth, dist)
# print(D)

Testing numpy functionality

In [2]:
import numpy as np
from numpy import linalg as LA 

# 3x3 integer matrix whose i-th row is filled with the value i + 1
y = np.array([
    [1, 1, 1],
    [2, 2, 2],
    [3, 3, 3],
])
print(y.shape)

# Element-wise squares
yy = np.square(y)
print(yy)

# Row-wise sum of squares, i.e. the squared Euclidean norm of every row
norms = np.sum(yy, axis=1)
print(norms)

(3, 3)
[[1 1 1]
 [4 4 4]
 [9 9 9]]
[ 3 12 27]


Testing global hsne high dim. matrix function

In [3]:
from hyperbolicTSNE.hd_mat_ import globalhsne_D_V

In [2]:
data_home = "datasets"
log_path = "temp/poincare/"  # path for saving embedding snapshots

dataset = Datasets.WORDNET

pca_components = 50 
seed = 42 
perplexity = 50
# Deliberately NOT called `np`: assigning to that name would overwrite the numpy
# alias and break every later `np.` call in the kernel.
# Forwarded as load_data's `sample` argument.
# NOTE(review): -1 presumably means "use all points" - confirm in load_data.
n_samples = -1

# load data
dataX, dataLabels, D, V, *rest = load_data(
        dataset, 
        data_home=data_home, 
        pca_components=pca_components,
        random_state=seed, 
        to_return="X_labels_D_V",
        hd_params={"perplexity": perplexity}, 
        sample=n_samples, 
        knn_method="hnswlib"  # we use an approximation of high-dimensional neighbors to speed up computations
    )

# Regular p_ij found in D, V

# Determine a good value for nr of neighbours
# TODO: A better way to figure this out? Maybe use quadtree to compute D_hat, V_hat
# n_neighbours = 100

# # get global p_ij (p_ij hat in paper)
# D_hat, V_hat = globalhsne_D_V(dataX, n_neighbours=n_neighbours)

# print(D_hat.shape)
# print(V_hat.shape)

  model = torch.load(str(Path.joinpath(full_path, "nouns.bin.best")))


In [3]:
# Print the shapes of dataX and dataLabels as returned by load_data
print(dataX.shape, dataLabels.shape)

(82115, 11) (82115,)
