In [13]:
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
import numpy as np

# Generate toy data: 55 samples with 10 features each.
# A seeded generator is used so that re-running the cell reproduces the same
# data and therefore the same cluster assignment.
rng = np.random.default_rng(123)
data = rng.random((55, 10))

# Condensed vector of pairwise distances between the rows of `data`
# (pdist uses the Euclidean metric unless told otherwise).
Y = pdist(data)

# Perform hierarchical/agglomerative clustering on the condensed distances.
Z = linkage(Y)

# Form flat clusters from the hierarchical clustering defined by the linkage matrix.
# With criterion='maxclust' the dendrogram is cut so that no more than
# N_CLUSTERS flat clusters are formed.
N_CLUSTERS = 5
clusters = fcluster(Z, N_CLUSTERS, criterion='maxclust')

print(clusters)

[1 1 1 1 1 1 1 1 1 1 3 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 5 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [25]:
from enbios2.experiment.bw_vector_db.create_vectors import get_all_vector_docs

# Fetch the documents to cluster. Later cells read `.embedding` and `.content`
# from each item and use len(docs) as the offset for merged-node ids.
docs = get_all_vector_docs()

In [15]:
# One embedding per document, in the same order as `docs`, so that observation
# k in the linkage result refers to docs[k].
embeddings = [entry.embedding for entry in docs]

# Condensed vector of pairwise cosine distances between the embeddings.
Y = pdist(embeddings, metric='cosine')

# Agglomerative clustering on the precomputed distances. 'single' is the
# method linkage uses when none is given; it is only spelled out here.
# NOTE(review): the last rows of Z shown below each attach one original
# observation to a cluster holding almost all of the others, which looks like
# single-linkage chaining - consider whether 'average' would suit better.
Z = linkage(Y, method='single')

In [30]:
# Inspect the linkage matrix. The merge loop further down reads columns 0 and 1
# of each row as the ids of the two nodes being merged and column 2 as their
# distance; per scipy's linkage documentation, column 3 is the number of
# original observations in the newly formed cluster.
Z

array([[6.36000000e+02, 4.62400000e+03, 1.51983761e-03, 2.00000000e+00],
       [2.44100000e+03, 4.93200000e+03, 1.67563821e-03, 2.00000000e+00],
       [4.48000000e+02, 3.95500000e+03, 1.75458163e-03, 2.00000000e+00],
       ...,
       [1.94900000e+03, 1.65510000e+04, 5.51764802e-01, 8.27600000e+03],
       [1.75600000e+03, 1.65520000e+04, 6.10256463e-01, 8.27700000e+03],
       [2.14700000e+03, 1.65530000e+04, 6.29868526e-01, 8.27800000e+03]])

In [22]:
from matplotlib import pyplot as plt

from scipy.cluster.hierarchy import dendrogram
#
# fig = plt.figure(figsize=(10, 10))
# dn = dendrogram(Z)
#
# plt.title('Dendrogram')
# plt.xlabel('Sample index')
# plt.ylabel('Distance')
#
# plt.show()

In [59]:
from dataclasses import dataclass
from enbios2.experiment.bw_vector_db.psql_vectorDB import Document
from typing import Optional
from enbios2.generic.tree.basic_tree import BasicTreeNode


@dataclass
class WordCluster:
    """Payload attached (via ``data=``) to each tree node built from the clustering."""
    # Node id: the document's position in `docs` for leaves,
    # len(docs) + linkage row index for merged nodes.
    id: int
    # Merge distance taken from the linkage row; leaves are created with 0.
    distance: float
    # The source document; only set on leaves, left as None for merged nodes.
    document: Optional[Document] = None


In [60]:
# Lookup from node id to tree node. It starts with one leaf per document,
# keyed by the document's position in `docs`; the leaf is named after the
# document's content and carries the document itself in its payload.
idNodeMap: dict[int, BasicTreeNode] = {}

for i, doc in enumerate(docs):
    leaf_payload = WordCluster(id=i, distance=0, document=doc)
    idNodeMap[i] = BasicTreeNode[WordCluster](doc.content, data=leaf_payload)

In [61]:
from tqdm import tqdm

# Replay the linkage merges in order: row `index` of Z joins two existing nodes
# (leaves or earlier merges) under a new parent whose id is len(docs) + index.
# tqdm wraps Z itself rather than the enumerate object: enumerate has no
# length, so wrapping it leaves tqdm without a total and it can only show a
# running counter instead of a progress bar.
for index, row in enumerate(tqdm(Z)):
    # Z stores the node ids as floats; cast them back to int for the dict lookup.
    id1 = int(row[0])
    id2 = int(row[1])
    distance = row[2]
    node1 = idNodeMap[id1]
    node2 = idNodeMap[id2]
    # The parent is named after the ids of its two children and has no document.
    newNode = BasicTreeNode[WordCluster](f'{node1.data.id} {node2.data.id}',
                                         children=[node1, node2],
                                         data=WordCluster(id=len(docs) + index, distance=distance))
    idNodeMap[newNode.data.id] = newNode

8277it [00:00, 34589.86it/s]


In [62]:
# Node ids run from 0 without gaps (leaves first, then one id per merge), so
# the node created by the final merge - the root - sits at key len(idNodeMap) - 1.
root_id = len(idNodeMap) - 1
root_node = idNodeMap[root_id]
leaves = list(root_node.get_leaves())

In [63]:
# Number of leaves reachable from the root; this should equal len(docs),
# since every document was turned into exactly one leaf node.
len(leaves)

8278

In [65]:
# Spot-check a single leaf by showing the text of the document it carries.
# NOTE(review): index 210 appears to be an arbitrary probe - confirm.
leaves[210].data.document.content

'inductor production, ring core choke type'

94

In [70]:
# Starting from the same probe leaf, move up with level_up(2) (presumably two
# ancestors up - confirm against BasicTreeNode) and list the text of every
# document found under that node.
probe_ancestor = leaves[210].level_up(2)
[leaf.data.document.content for leaf in probe_ancestor.get_leaves()]

['market for inductor, ring core choke type',
 'inductor production, ring core choke type',
 'biogas purification to biomethane by amino washing',
 'biogas purification to biomethane by membrane technique',
 'biogas purification to biomethane by pressure swing adsorption',
 'market for trellis system, wooden poles, soft wood, tar impregnated',
 'trellis system construction, wooden poles, soft wood, tar impregnated',
 'market for television',
 'television production',
 'market for pumice',
 'market for non-ionic surfactant',
 'non-ionic surfactant production, fatty acid derivate',
 'non-ionic surfactant production, ethylene oxide derivate',
 'horn meal production',
 'market for horn meal',
 'market for rice, non-basmati',
 'market for rice, basmati',
 'rice production, non-basmati',
 'rice production, basmati',
 'market for latex',
 'latex production',
 'dodecanol production, ziegler process',
 'market for epichlorohydrin',
 'epichlorohydrin production from allyl chloride',
 'glycerine 