In [None]:
## Hierarchical clustering


# Hierarchical clustering

Clustering on OpenAI embeddings of recipes

how it works



## Explanation

There are two types of hierarchical clustering:

- **Agglomerative** (bottom-up): we start with one cluster per observation and recursively merge the two closest clusters.
- **Divisive** (top-down): we start with a single cluster containing every observation and recursively split it.

Deciding which clusters are closest relies on two components:

1. A metric which calculates a distance between two points.
2. A linkage criterion which computes a distance between two clusters given their pair-wise distances obtained with the metric.

Some simple linkage criteria are:

- Maximum linkage (also called complete linkage): $\max\{d(a, b) : a \in A, b \in B\}$
- Minimum linkage (also called single linkage): $\min\{d(a, b) : a \in A, b \in B\}$
- Average linkage: $\frac{1}{|A| \times |B|} \sum_{a \in A} \sum_{b \in B} d(a, b)$

You can find more linkage criteria [here](https://www.wikiwand.com/en/Hierarchical_clustering#/Linkage_criteria).

In [None]:
from scipy.cluster import hierarchy


# Build the merge tree with SciPy, using single (minimum) linkage
linkage = hierarchy.linkage(df, method='single')

# Draw the tree with its root on the left, labelling each leaf with its index value
fig = plt.figure(figsize=(7, 7))
hierarchy.dendrogram(
    linkage,
    orientation='left',
    labels=df.index,
)
plt.tight_layout()
plt.show()


In [None]:
import itertools


def hac(X, metric=l1_distance):
    """Hierarchical Agglomerative Clustering (HAC) with the minimum linkage criterion.

    Every observation starts in its own cluster. The two closest clusters are
    then merged repeatedly until a single cluster remains. The distance between
    two clusters is the smallest pairwise distance between their members.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        One observation per row. A DataFrame is replaced by its ``.values``.
    metric : callable
        ``metric(u, v)`` returns the distance between two rows of ``X``. It is
        evaluated exactly once per unordered pair of rows, so it is expected
        to be symmetric.

    Returns
    -------
    list of [a, b, dist, size]
        One entry per merge, in merge order: the ids of the two merged
        clusters, the distance between them, and the number of observations
        in the new cluster. Observations have ids ``0 .. n-1``; the cluster
        created by the k-th merge (0-based) has id ``n + k``. This is the same
        four-column layout as the matrix returned by
        ``scipy.cluster.hierarchy.linkage``. The list is empty when ``X`` has
        fewer than two rows.
    """

    # Make sure we're working with a numpy.ndarray and not a pandas.DataFrame
    if isinstance(X, pd.DataFrame):
        X = X.values

    n = len(X)

    # Each cluster id maps to the list of row indices it contains
    clusters = {i: [i] for i in range(n)}

    # Merge history, one row per merge, used for plotting
    linkage = []

    # Evaluate the metric once per pair of observations and cache the result.
    # The diagonal stays at +inf; it is never read because two distinct
    # clusters never share an observation.
    distances = np.full((n, n), np.inf)
    for i in range(n):
        for j in range(i + 1, n):
            distances[i, j] = distances[j, i] = metric(X[i], X[j])

    next_id = n
    while len(clusters) > 1:

        # We're looking for the two closest clusters, denoted a and b
        a, b = None, None
        min_dist = np.inf

        for i, j in itertools.combinations(clusters, 2):

            # Minimum linkage: smallest cached distance between any member of
            # cluster i and any member of cluster j
            dist = distances[np.ix_(clusters[i], clusters[j])].min()

            # Strict comparison keeps the first pair found when there is a tie
            if dist < min_dist:
                a, b = i, j
                min_dist = dist

        # Merge the two closest clusters under a fresh id
        clusters[next_id] = clusters.pop(a) + clusters.pop(b)

        # Record the merge
        linkage.append([a, b, min_dist, len(clusters[next_id])])

        next_id += 1

    return linkage

# Cluster the data with the hand-written implementation
linkage = hac(df)

# Draw the resulting merge history with its root on the left
fig = plt.figure(figsize=(7, 7))
hierarchy.dendrogram(
    linkage,
    orientation='left',
    labels=df.index,
)
plt.tight_layout()
plt.show()
