In [None]:
# Imports
from os import makedirs
from os.path import join
import pickle
import numpy as np
rng_seed = 399
np.random.seed(rng_seed)
from scipy.spatial.distance import pdist, cdist, squareform
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()
from tqdm.auto import tqdm
import sys
sys.path.append("..")

from hdbscan import HDBSCAN
from DBCV import DBCV

from sklearn.model_selection import ParameterGrid
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import pairwise_distances
from scipy.cluster.hierarchy import dendrogram, fcluster

from umap import UMAP
import plotly.offline as pyo
pyo.init_notebook_mode()

from importlib import reload
import utils
reload(utils)

from utils import get_model_checkpoint_filepaths, pairwise_cosine_distances, cosine_distance
from analysis_utils import create_linkage_matrix, words_in_clusters, plot_silhouette_scores
from word_embeddings.eval_utils import plot_word_vectors

In [None]:
# Locate the checkpoint files of the trained word2vec model and load the last
# entry of its intermediate embedding weights (presumably the most recent
# weights saved during training).
checkpoint_filepaths_dict = get_model_checkpoint_filepaths(
    output_dir="../output/word2vec_training/word2vec_enwiki_sept_2020_word2phrase",
    model_name="word2vec",
    dataset_name="enwiki",
)
last_embedding_weights_filepath = checkpoint_filepaths_dict[
    "intermediate_embedding_weight_filepaths"
][-1]
# NOTE: the file is opened memory-mapped, but `.astype` then copies the whole
# array into memory as float64.
last_embedding_weights = np.load(
    last_embedding_weights_filepath,
    mmap_mode="r",
).astype(np.float64)

In [None]:
# Read the training vocabulary (newline-separated words) and build a
# word -> integer index lookup table.
with open(checkpoint_filepaths_dict["train_words_filepath"], "r") as file:
    words_text = file.read()
words = np.array(words_text.split("\n"))
word_to_int = dict(zip(words, range(len(words))))

# Number of leading words/embeddings used by the analysis cells below.
vocab_size = 10000

In [None]:
# Precompute the cosine distance matrix once, restricted to the first
# `vocab_size` rows of the embedding matrix.
word_embeddings_to_precompute = last_embedding_weights[:vocab_size]
word_embeddings_distances = pairwise_cosine_distances(
    word_embeddings_to_precompute
)

In [None]:
# Grid search over HDBSCAN hyperparameters on the selected word embeddings.
# TODO: Split into function
# TODO: Add saving result to disk
hdbscan_param_grid = ParameterGrid(
    {
        "min_cluster_size": [2, 4, 8, 16, 32, 64, 128, 256],
        "min_samples": [1, 2, 4, 8, 16, 32, 64, 128, 256],
    }
)

# Parallel result lists: entry i belongs to the i-th parameter combination
# yielded by `hdbscan_param_grid`.
hdbscan_cluster_labels = []
hdbscan_dbcv_scores = []
for param_grid in tqdm(hdbscan_param_grid, desc="Performing HDBSCAN clustering"):
    hdbscan_clustering = HDBSCAN(
        metric="cosine",
        algorithm="generic",
        core_dist_n_jobs=-1,
        # Presumably required so that `relative_validity_` can be read below
        # -- TODO confirm against the hdbscan documentation.
        gen_min_span_tree=True,
        **param_grid,
    )
    cluster_labels_pred = hdbscan_clustering.fit_predict(word_embeddings_to_precompute)
    dbcv_score = hdbscan_clustering.relative_validity_

    hdbscan_cluster_labels.append(cluster_labels_pred)
    hdbscan_dbcv_scores.append(dbcv_score)

In [None]:
# TODO: Create DBCV score plot

In [None]:
# Either recompute the agglomerative cluster labels for every cluster number in
# `ks`, or (default) load labels from a pickle file on disk.
should_pred_cluster_labels = False
ks = [2, 3, 4, 5, 10, 50, 100, 150, 200, 300, 400, 500, 750, 1000, 1500, 2000, 3000, 4000, 5000, 6000, 7000, 8000]
if not should_pred_cluster_labels:
    # Presumably this file was written by an earlier run of the search branch
    # below -- verify. pickle can execute arbitrary code on load, so only load
    # files produced by this project.
    with open("../output/word2vec_cluster_analysis/word2vec-enwiki-agglomerative_labels.pkl", "rb") as file:
        pred_cluster_labels = pickle.load(file)
else:
    # NOTE(review): `agglomerative_cluster_number_search` and
    # `agglomerative_clusterings` are neither imported nor defined in the
    # visible cells; this branch raises NameError on a fresh kernel unless they
    # are provided elsewhere -- confirm.
    pred_cluster_labels = agglomerative_cluster_number_search(
        cluster_numbers=ks,
        clusterings=agglomerative_clusterings,
        linkages=list(agglomerative_clusterings.keys()),
        word_embeddings_distances=word_embeddings_distances,
        output_filepath_suffix="agglomerative_labels",
        output_dir="../output/word2vec_cluster_analysis",
        model_name="word2vec",
        dataset_name="enwiki"
    )

In [None]:
# Compute cluster size ratios (maximum cluster size / minimum cluster size) and,
# per linkage, all cluster sizes 1..max ordered from most to least frequent.
# NOTE(review): `best_cluster_labels` is not defined in the visible cells; it is
# assumed to map linkage name -> array of cluster labels -- confirm where it is set.
most_common_cluster_sizes = {}
for linkage, labels in best_cluster_labels.items():
    print(f"Linkage: {linkage}")
    labels_unique, labels_counts = np.unique(labels, return_counts=True)
    num_clusters = len(labels_unique)
    max_cluster_size = labels_counts.max()
    min_cluster_size = labels_counts.min()
    cluster_size_ratio = max_cluster_size / min_cluster_size
    print(f"{num_clusters} clusters: max={max_cluster_size}, min={min_cluster_size}, ratio={cluster_size_ratio}")

    # Plot distribution of cluster sizes; `discrete=True` draws one bar per
    # integer cluster size, centred on that size.
    ax = sns.histplot(labels_counts, discrete=True)
    ax.set(xlabel="Cluster size", ylabel="Number of clusters")
    plt.show()

    # Rank the sizes by how many clusters have that size. This is computed from
    # the counts directly instead of from the histogram bars: the bars span
    # [min size, max size], so bar i only corresponds to size i + 1 when the
    # smallest cluster has exactly one element.
    size_frequencies = np.bincount(labels_counts)[1:]  # index 0 <-> size 1
    most_common_cluster_sizes[linkage] = np.arange(1, max_cluster_size + 1)[
        np.argsort(size_frequencies, kind="stable")[::-1]
    ]

    print("---")

In [None]:
# Group the first `vocab_size` words by their complete-linkage cluster label so
# that individual clusters (biggest, smallest, etc.) can be inspected below.
complete_linkage_labels = best_cluster_labels["complete"]
cluster_words, cluster_sizes = words_in_clusters(
    cluster_labels=complete_linkage_labels,
    words=words[:vocab_size],
)

In [None]:
# Restrict the inspection to clusters holding at least `min_cluster_size` words
min_cluster_size = 5
filter_min_cluster_size_mask = cluster_sizes >= min_cluster_size
cluster_sizes_filtered, cluster_words_filtered = (
    cluster_sizes[filter_min_cluster_size_mask],
    cluster_words[filter_min_cluster_size_mask],
)

In [None]:
# Print the words of the largest and the smallest clusters that survived the
# size filter. `sorted_cluster_indices` orders the filtered clusters from
# largest to smallest.
sorted_cluster_indices = np.argsort(cluster_sizes_filtered)[::-1]

num_clusters_print = 10
# Slices are used instead of indexing 0..num_clusters_print-1 so that fewer than
# `num_clusters_print` remaining clusters do not raise an IndexError.
print(f"-- {num_clusters_print} largest clusters --")
for cluster_index in sorted_cluster_indices[:num_clusters_print]:
    print(cluster_words_filtered[cluster_index])

# Reversed order: the smallest cluster is printed first.
print(f"-- {num_clusters_print} smallest clusters --")
for cluster_index in sorted_cluster_indices[::-1][:num_clusters_print]:
    print(cluster_words_filtered[cluster_index])

In [None]:
# Inspect the words of (up to 25) clusters whose size equals the most common
# cluster size for complete linkage.
# The loop variable is deliberately not named `cluster_words`: reusing that name
# would overwrite the array of all clusters and break any re-run of this cell or
# of the cells that filter `cluster_words`.
most_common_size_mask = cluster_sizes == most_common_cluster_sizes["complete"][0]
for words_in_cluster in cluster_words[most_common_size_mask][:25]:
    print(words_in_cluster)

In [None]:
# Project the words to 2D with UMAP, using the precomputed cosine distance matrix.
# `random_state` is passed explicitly so the layout is reproducible no matter how
# much of the global NumPy RNG state earlier cells consumed (a fixed seed may
# make UMAP run single-threaded).
word_embeddings_transformed = UMAP(
    n_components=2,
    metric="precomputed",
    random_state=rng_seed,
).fit_transform(word_embeddings_distances)

In [None]:
# Visualizing first 1000 words using UMAP (TODO: remove this?)
# Points are coloured by their complete-linkage cluster label.
num_words_to_plot = 1000
plot_word_vectors(
    words_to_plot=words[:num_words_to_plot],
    transformed_word_embeddings=word_embeddings_transformed[:num_words_to_plot],
    word_to_int=word_to_int,
    word_colors=best_cluster_labels["complete"][:num_words_to_plot],
)