In [None]:
# Imports
from os import makedirs
from os.path import join
import joblib
import numpy as np
rng_seed = 399
np.random.seed(rng_seed)
from scipy.spatial.distance import pdist, cdist, squareform
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()
from tqdm.auto import tqdm
import pandas as pd

from sklearn.model_selection import ParameterGrid
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import pairwise_distances
from scipy.cluster.hierarchy import dendrogram, fcluster

from umap import UMAP
import plotly.offline as pyo
pyo.init_notebook_mode()

# Project directory layout; every path is relative to the current working directory
analysis_of_embeddings_dir = ".."
root_code_dir = join(analysis_of_embeddings_dir, "..")
output_dir = join(root_code_dir, "output")

# Data folders that live next to the analysis code
analysis_of_embeddings_data_dir = join(analysis_of_embeddings_dir, "data")
analysis_of_embeddings_custom_data_dir = join(analysis_of_embeddings_dir, "custom_data")

# Output folders: word2vec training artefacts and cluster analysis results
word2vec_training_dir = join(output_dir, "word2vec_training")
word2vec_cluster_analysis_dir = join(output_dir, "word2vec_cluster_analysis")

# Make the project's own Python modules importable from this notebook
import sys
sys.path.extend((analysis_of_embeddings_dir, root_code_dir))

from utils import get_model_checkpoint_filepaths, pairwise_cosine_distances
from analysis_utils import (agglomerative_clustering,
                            agglomerative_cluster_hyperparameter_search, plot_cluster_metric_scores,
                            words_in_clusters, plot_cluster_sizes, inspect_word_clusters,
                            load_word_cluster_group_words, visualize_word_cluster_groups)
from word_embeddings.eval_utils import plot_word_vectors
from word_embeddings.word2vec import load_model_training_output

# Prepare data

In [None]:
# Read the word2vec training output and pull out the pieces used below
w2v_training_output = load_model_training_output(
    dataset_name="enwiki",
    model_name="word2vec",
    model_training_output_dir=join(word2vec_training_dir, "word2vec_enwiki_sept_2020_word2phrase"),
)
last_embedding_weights, words, word_to_int = (
    w2v_training_output[key]
    for key in ("last_embedding_weights", "words", "word_to_int")
)

In [None]:
# Only the first `vocab_size` words/embeddings are used in the analysis below
vocab_size = 10_000

In [None]:
# Slice out the first `vocab_size` embeddings and compute their pairwise cosine
# distances once; both are reused by the clustering and scoring cells below
word_embeddings_to_precompute = last_embedding_weights[:vocab_size]
word_embeddings_distances = pairwise_cosine_distances(
    word_embeddings_to_precompute
)

# Clustering

In [None]:
# Run agglomerative clustering on the precomputed distances, once per linkage
agglomerative_clusterings = agglomerative_clustering(
    linkages=["complete", "average", "single"],
    word_embeddings_pairwise_dists=word_embeddings_distances,
)

In [None]:
from cdbw import CDbw
from s_dbw import S_Dbw

In [None]:
def agglomerative_cluster_hyperparameter_search(
    cluster_numbers: list,
    linkages: list,
    agglomerative_clusterings: dict,
    word_embeddings: np.ndarray,
    word_embeddings_pairwise_dists: np.ndarray,
    output_dir: str,
    model_name: str,
    dataset_name: str,
    output_filepath_suffix: str,
) -> dict:
    """
    Searches for the best set of hyperparameters using agglomerative
    clustering and various internal cluster metrics:
    - Silhouette Coefficient (computed on the precomputed distance matrix;
      the highest score is selected)
    - S_Dbw validity index (computed on the embeddings with cosine metric;
      the lowest score is selected)
    - CDbw validity index (computed on the embeddings with cosine metric;
      the highest score is selected)

    The result is also written to
    `<output_dir>/<model_name>-<dataset_name>-<output_filepath_suffix>.joblib`.

    Parameters
    ----------
    cluster_numbers : list
        List of cluster numbers to evaluate.
    linkages : list
        List of linkages to evaluate
    agglomerative_clusterings : dict
        Dictionary containing result from `agglomerative_clustering`
        function. For every linkage, the entry must hold a
        "linkage_matrix" key.
    word_embeddings : np.ndarray
        Word embeddings to perform clustering on
    word_embeddings_pairwise_dists : np.ndarray
        Numpy matrix containing pairwise distances between word embeddings
    output_dir : str
        Output directory
    model_name : str
        Name of the model
    dataset_name : str
        Name of the dataset the model was trained on
    output_filepath_suffix : str
        Output filepath suffix

    Returns
    -------
    result : dict
        Dictionary keyed by linkage. Each value is a dictionary with
        - "cluster_labels": list of label arrays, one per cluster number
        - "cluster_metrics": dictionary keyed by "silhouette_coeff",
          "s_dbw" and "cdbw", each holding "name", "scores" (one per
          cluster number) and "best_score_idx" (index into "scores", or
          -1 if no cluster numbers were evaluated).
    """
    # Ensure output directory exists
    makedirs(output_dir, exist_ok=True)

    # Perform clustering
    clustering_result = {}
    print("-- Fitting and predicting cluster labels for agglomerative clustering --")
    for linkage in linkages:
        print(f"Linkage: {linkage}")

        # The linkage matrix does not depend on k, so look it up once per linkage
        linkage_matrix = agglomerative_clusterings[linkage]["linkage_matrix"]

        cluster_labels = []
        cluster_metrics = {
            "silhouette_coeff": {
                "name": "Silhouette Coefficient",
                "scores": [],
                "best_score_idx": -1,
            },
            "s_dbw": {
                "name": "S_Dbw validity index",
                "scores": [],
                "best_score_idx": -1,
            },
            "cdbw": {
                "name": "CDbw validity index",
                "scores": [],
                "best_score_idx": -1,
            },
        }

        for k in tqdm(cluster_numbers):
            # Cut the hierarchy into at most k flat clusters; fcluster labels
            # start at 1, so shift them to start at 0
            cluster_labels_pred = fcluster(Z=linkage_matrix, criterion="maxclust", t=k) - 1
            cluster_labels.append(cluster_labels_pred)

            # Compute metric scores
            silhouette_coeff_score = silhouette_score(
                X=word_embeddings_pairwise_dists,
                labels=cluster_labels_pred,
                metric="precomputed",
            )
            s_dbw_score = S_Dbw(
                X=word_embeddings, labels=cluster_labels_pred, metric="cosine"
            )
            cdbw_score = CDbw(
                X=word_embeddings, labels=cluster_labels_pred, metric="cosine", s=3
            )

            # Append metric scores
            cluster_metrics["silhouette_coeff"]["scores"].append(silhouette_coeff_score)
            cluster_metrics["s_dbw"]["scores"].append(s_dbw_score)
            cluster_metrics["cdbw"]["scores"].append(cdbw_score)

        # Find best score index for each metric. With no evaluated cluster
        # numbers the -1 sentinel is kept instead of failing on an empty argmax.
        if cluster_labels:
            cluster_metrics["silhouette_coeff"]["best_score_idx"] = int(
                np.argmax(cluster_metrics["silhouette_coeff"]["scores"])
            )
            cluster_metrics["s_dbw"]["best_score_idx"] = int(
                np.argmin(cluster_metrics["s_dbw"]["scores"])
            )
            cluster_metrics["cdbw"]["best_score_idx"] = int(
                np.argmax(cluster_metrics["cdbw"]["scores"])
            )
        clustering_result[linkage] = {
            "cluster_labels": cluster_labels,
            "cluster_metrics": cluster_metrics,
        }

    # Save result to output dir. The filename follows the
    # "<model>-<dataset>-<suffix>.joblib" pattern that the notebook's
    # joblib.load cells read back.
    output_filepath = join(
        output_dir, f"{model_name}-{dataset_name}-{output_filepath_suffix}.joblib"
    )
    joblib.dump(clustering_result, output_filepath)

    return clustering_result

In [None]:
# Coarse search over cluster counts. Set the flag to False to load the result
# saved by an earlier run instead of recomputing it.
should_pred_cluster_labels = True
ks = [
    2, 3, 4, 5, 10, 50, 100, 150, 200, 300, 400, 500, 750,
    1000, 1500, 2000, 3000, 4000, 5000, 6000, 7000, 8000,
]
if not should_pred_cluster_labels:
    pred_cluster_labels = joblib.load(
        join(word2vec_cluster_analysis_dir, "word2vec-enwiki-agglomerative_labels.joblib")
    )
else:
    pred_cluster_labels = agglomerative_cluster_hyperparameter_search(
        cluster_numbers=ks,
        linkages=list(agglomerative_clusterings.keys()),
        agglomerative_clusterings=agglomerative_clusterings,
        word_embeddings=word_embeddings_to_precompute,
        word_embeddings_pairwise_dists=word_embeddings_distances,
        output_dir=word2vec_cluster_analysis_dir,
        model_name="word2vec",
        dataset_name="enwiki",
        output_filepath_suffix="agglomerative_labels",
    )

In [None]:
# Plot the silhouette score against the number of clusters, one figure per linkage
for linkage in agglomerative_clusterings.keys():
    print(f"Linkage: {linkage}")

    linkage_result = pred_cluster_labels[linkage]
    if "cluster_metrics" in linkage_result:
        # Layout returned by the search function defined in this notebook
        silhouette_result = linkage_result["cluster_metrics"]["silhouette_coeff"]
        silhouette_scores = silhouette_result["scores"]
        best_silhouette_idx = silhouette_result["best_score_idx"]
    else:
        # Flat layout this cell originally read (kept so results that use
        # these keys, e.g. ones loaded from disk, still plot)
        silhouette_scores = linkage_result["metric_scores"]
        best_silhouette_idx = linkage_result["best_cluster_labels_idx"]

    fig, ax = plt.subplots(figsize=(8, 6))
    plot_cluster_metric_scores(
        metric_scores=silhouette_scores,
        hyperparameters=[{"n_clusters": k} for k in ks],
        best_score_idx=best_silhouette_idx,
        metric_name="Silhouette",
        ax=ax
    )

**Observation**: Judging by the silhouette scores (many of which are negative), single linkage clustering does not yield good results, so we discard it from further analysis.

In [None]:
# Zoom in at 3000-6000 clusters, discarding single linkage clustering.
# Set the flag to True to recompute instead of loading the saved result.
should_pred_cluster_labels_zoomed = False
start_zoom_cluster_num = 3000
end_zoom_cluster_num = 6000
# Every integer in the zoom range (inclusive). An evenly spaced subset, e.g.
# np.linspace(3000, 6000, num=100, dtype=int), would evaluate fewer cluster numbers.
ks_zoomed = np.arange(start_zoom_cluster_num, end_zoom_cluster_num + 1)
if should_pred_cluster_labels_zoomed:
    pred_cluster_labels_zoomed = agglomerative_cluster_hyperparameter_search(
        cluster_numbers=ks_zoomed,
        linkages=["complete", "average"],
        agglomerative_clusterings=agglomerative_clusterings,
        # Required by the search function; it was missing from this call
        word_embeddings=word_embeddings_to_precompute,
        word_embeddings_pairwise_dists=word_embeddings_distances,
        output_dir=word2vec_cluster_analysis_dir,
        model_name="word2vec",
        dataset_name="enwiki",
        output_filepath_suffix="agglomerative_labels_zoomed",
    )
else:
    pred_cluster_labels_zoomed = joblib.load(
        join(word2vec_cluster_analysis_dir, "word2vec-enwiki-agglomerative_labels_zoomed.joblib")
    )

In [None]:
# For each remaining linkage, pick the cluster labels with the best silhouette
# score in the zoomed range and plot the scores
best_cluster_labels = {}
for linkage in ["complete", "average"]:
    print(f"Linkage: {linkage}")

    linkage_result = pred_cluster_labels_zoomed[linkage]
    if "cluster_metrics" in linkage_result:
        # Layout returned by the search function defined in this notebook
        silhouette_result = linkage_result["cluster_metrics"]["silhouette_coeff"]
        silhouette_scores = silhouette_result["scores"]
        best_cluster_labels_idx = silhouette_result["best_score_idx"]
    else:
        # Flat layout this cell originally read (kept so results that use
        # these keys, e.g. ones loaded from disk, still work)
        silhouette_scores = linkage_result["metric_scores"]
        best_cluster_labels_idx = linkage_result["best_cluster_labels_idx"]

    best_num_clusters = ks_zoomed[best_cluster_labels_idx]
    print(f"Best number of clusters: {best_num_clusters}")

    best_cluster_labels[linkage] = linkage_result["cluster_labels"][best_cluster_labels_idx]

    fig, ax = plt.subplots(figsize=(12, 5))
    plot_cluster_metric_scores(
        hyperparameters=[{"n_clusters": k} for k in ks_zoomed],
        metric_scores=silhouette_scores,
        best_score_idx=best_cluster_labels_idx,
        metric_name="Silhouette",
        scatter=False,
        set_xticks=False,
        ax=ax,
        xlabel="Cluster number",
        xrange=ks_zoomed
    )

Complete linkage clustering yields the highest silhouette score. For this reason, we choose this linkage for the cluster analysis.

In [None]:
# Use the best labels found for complete linkage in the rest of the analysis
chosen_cluster_labels = best_cluster_labels["complete"]

# Cluster analysis

In [None]:
# Plot cluster sizes; the returned value is passed on to `inspect_word_clusters` below
most_common_cluster_sizes = plot_cluster_sizes(chosen_cluster_labels)

In [None]:
# Inspect the words of the chosen clustering, restricted to the analysed vocabulary
inspect_word_clusters(
    words=words[:vocab_size],
    cluster_labels=chosen_cluster_labels,
    most_common_cluster_sizes=most_common_cluster_sizes,
    min_cluster_size=5,
    num_words_in_clusters_print=10,
)