In [None]:
# Imports
from os import makedirs
from os.path import join
import joblib
import numpy as np
# Seed NumPy's global random number generator with a fixed value
rng_seed = 399
np.random.seed(seed=rng_seed)
from scipy.spatial.distance import pdist, cdist, squareform
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()
from tqdm.auto import tqdm

from hdbscan import HDBSCAN

from sklearn.model_selection import ParameterGrid
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import pairwise_distances
from scipy.cluster.hierarchy import dendrogram, fcluster

from umap import UMAP
import plotly.offline as pyo
pyo.init_notebook_mode()

# Directory layout; every path is built relative to the parent directory ("..")
analysis_of_embeddings_dir = ".."
root_code_dir = join(analysis_of_embeddings_dir, "..")
output_dir = join(root_code_dir, "output")
word2vec_training_dir, word2vec_cluster_analysis_dir = (
    join(output_dir, subdir_name)
    for subdir_name in ("word2vec_training", "word2vec_cluster_analysis")
)

# Append the code directories to sys.path so the custom Python files can be imported
import sys
sys.path += [analysis_of_embeddings_dir, root_code_dir]

from utils import pairwise_cosine_distances, cosine_distance
from analysis_utils import hdbscan_cluster_hyperparameter_search, plot_cluster_metric_scores, plot_cluster_sizes, words_in_clusters, inspect_word_clusters
from word_embeddings.eval_utils import plot_word_vectors
from word_embeddings.word2vec import load_model_training_output

# Prepare data

In [None]:
# Read the output of the word2vec training run from disk
w2v_training_output = load_model_training_output(
    model_name="word2vec",
    dataset_name="enwiki",
    model_training_output_dir=join(
        word2vec_training_dir, "word2vec_enwiki_sept_2020_word2phrase"
    ),
)

# Pull out the entries that the rest of the notebook works with
last_embedding_weights, words, word_to_int = (
    w2v_training_output[output_key]
    for output_key in ("last_embedding_weights", "words", "word_to_int")
)

In [None]:
# Number of leading vocabulary entries to include in the analysis
vocab_size = 10_000

In [None]:
# Precompute the cosine distance matrix for the first `vocab_size` word embeddings.
# `word_embeddings_to_precompute` is also the input to the HDBSCAN hyperparameter search.
# NOTE(review): `word_embeddings_distances` is not referenced in the cells visible here —
# presumably it is used further down; confirm before removing.
word_embeddings_to_precompute = last_embedding_weights[:vocab_size]
word_embeddings_distances = pairwise_cosine_distances(word_embeddings_to_precompute)

# Clustering

In [None]:
# Hyperparameter grid for HDBSCAN: powers of two for both parameters
hdbscan_param_grid = ParameterGrid(
    dict(
        min_cluster_size=[2 ** exponent for exponent in range(1, 9)],
        min_samples=[2 ** exponent for exponent in range(9)],
    )
)

# Parameters passed alongside every grid point
hdbscan_default_params = dict(
    metric="cosine",
    algorithm="generic",
    core_dist_n_jobs=-1,
    gen_min_span_tree=True,
)

# Toggle: True runs the hyperparameter search, False loads the result from the joblib file
should_do_hyperparameter_search = False
if not should_do_hyperparameter_search:
    hdbscan_hyperparameter_search_result = joblib.load(
        join(word2vec_cluster_analysis_dir, "word2vec-enwiki-hdbscan_labels.joblib")
    )
else:
    hdbscan_hyperparameter_search_result = hdbscan_cluster_hyperparameter_search(
        param_grid=hdbscan_param_grid,
        default_params=hdbscan_default_params,
        word_embeddings=word_embeddings_to_precompute,
        output_dir=word2vec_cluster_analysis_dir,
        model_name="word2vec",
        dataset_name="enwiki",
        output_filepath_suffix="hdbscan_labels",
    )

In [None]:
# Plot the DBCV score of every hyperparameter combination, passing along the
# index of the best-scoring one
dbcv_plot_kwargs = dict(
    metric_scores=hdbscan_hyperparameter_search_result["dbcv_scores"],
    hyperparameters=hdbscan_param_grid,
    best_score_idx=hdbscan_hyperparameter_search_result["best_labels_idx"],
    metric_name="DBCV",
    scatter=True,
)
fig, ax = plt.subplots(figsize=(12, 10))
# The title is set before plotting, same order as before
ax.set_title("DBCV scores of HDBSCAN clustering with different sets of hyperparameters")
plot_cluster_metric_scores(ax=ax, **dbcv_plot_kwargs)

# Cluster analysis

In [None]:
# Pick the cluster labelling with the highest DBCV score
best_labels_idx = hdbscan_hyperparameter_search_result["best_labels_idx"]
best_cluster_labels = hdbscan_hyperparameter_search_result["cluster_labels"][best_labels_idx]

# Label -1 marks noisy words; build a mask that keeps everything else
best_cluster_labels_no_noise_mask = best_cluster_labels != -1

# Apply the same mask to the analysed words and to their labels so they stay aligned
words_in_vocab = words[:vocab_size]
words_no_noise = words_in_vocab[best_cluster_labels_no_noise_mask]
best_cluster_labels_no_noise = best_cluster_labels[best_cluster_labels_no_noise_mask]

In [None]:
# Plot the cluster sizes of the noise-free labelling.
# The returned value is kept because it is passed to `inspect_word_clusters` in the next cell.
most_common_cluster_sizes = plot_cluster_sizes(best_cluster_labels_no_noise)

In [None]:
# Inspect the words of the noise-free clusters (10 words per cluster, no minimum size filter)
word_cluster_inspection_kwargs = dict(
    words=words_no_noise,
    cluster_labels=best_cluster_labels_no_noise,
    most_common_cluster_sizes=most_common_cluster_sizes,
    min_cluster_size=0,
    num_words_in_clusters_print=10,
)
inspect_word_clusters(**word_cluster_inspection_kwargs)

# Visualizing word cluster groups
TODO