In [None]:
# Imports
from os import makedirs
from os.path import join
import joblib
import numpy as np
# Fixed seed for reproducibility: it seeds NumPy's global random state here and
# is reused as the `random_state` value in the clustering hyperparameter grids.
rng_seed = 399
np.random.seed(rng_seed)
from scipy.spatial.distance import pdist, cdist, squareform
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()
from tqdm.auto import tqdm
import pandas as pd

from sklearn.model_selection import ParameterGrid
from sklearn.cluster import (KMeans, SpectralClustering, AgglomerativeClustering,
                             MiniBatchKMeans)
from sklearn.mixture import GaussianMixture
from sklearn_extra.cluster import KMedoids
from hdbscan import HDBSCAN

# Clustering
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import pairwise_distances
from scipy.cluster.hierarchy import dendrogram, fcluster

from umap import UMAP
import plotly.offline as pyo
pyo.init_notebook_mode()

# Directory constants.
# Every path is relative (rooted at ".."), so it resolves against the current
# working directory of the kernel.
analysis_of_embeddings_dir = ".."
root_code_dir = join(analysis_of_embeddings_dir, "..")

# Sub-directories of the analysis-of-embeddings directory
analysis_of_embeddings_data_dir = join(analysis_of_embeddings_dir, "data")
analysis_of_embeddings_custom_data_dir = join(analysis_of_embeddings_dir, "custom_data")

# Output directories located under the root code directory
output_dir = join(root_code_dir, "output")
word2vec_training_dir = join(output_dir, "word2vec_training")
word2vec_cluster_analysis_dir = join(output_dir, "word2vec_cluster_analysis")

# Put both code directories on the module search path so that the custom
# Python files imported below can be found.
import sys
sys.path.append(analysis_of_embeddings_dir)
sys.path.append(root_code_dir)

from utils import get_model_checkpoint_filepaths, pairwise_cosine_distances
from analysis_utils import (plot_cluster_metric_scores, transform_word_embeddings,
                            words_in_clusters, plot_cluster_sizes, inspect_word_clusters,
                            load_word_cluster_group_words, visualize_word_cluster_groups)
from cluster_analysis_metrics import (silhouette_score_metric, davies_bouldin_score_metric,
                                      s_dbw_score_metric, sd_score_metric, cdbw_score_metric)
from cluster_analysis_utils import (cluster_analysis, visualize_cluster_analysis_result,
                                    plot_word_embeddings_clustered)
from word_embeddings.word2vec import load_model_training_output
from vis_utils import plot_word_vectors
from topological_data_analysis.tda_utils import plot_persistence_diagram

# Prepare data

In [None]:
# Load the saved output of the word2vec training run and pull out the pieces
# used by the rest of the notebook.
w2v_model_training_output_dir = join(
    word2vec_training_dir, "word2vec_enwiki_sept_2020_word2phrase"
)
w2v_training_output = load_model_training_output(
    model_training_output_dir=w2v_model_training_output_dir,
    model_name="word2vec",
    dataset_name="enwiki",
)

# NOTE(review): presumably the embedding matrix, the vocabulary words and the
# word -> integer index lookup, judging by the key names — confirm against
# load_model_training_output.
last_embedding_weights, words, word_to_int = (
    w2v_training_output["last_embedding_weights"],
    w2v_training_output["words"],
    w2v_training_output["word_to_int"],
)

In [None]:
# Restrict the analysis to `vocab_size` entries: `vocabulary` holds the
# integers 0 .. vocab_size - 1 and is later passed to cluster_analysis as
# `words_vocabulary`.
vocab_size = 10_000
vocabulary = [*range(vocab_size)]

# Clustering

In [None]:
# Constants for the clustering experiments

# When True the cluster analysis is run; when False a previously saved result
# is loaded from disk instead.
should_pred_cluster_labels = True

# Numbers of clusters to evaluate. The short list keeps a run quick; the
# longer sweep is kept below (commented out) for reference.
# n_clusters = [2, 3, 4, 5, 10, 50, 100, 150, 200, 300, 400, 500, 750, 1000, 1500, 2000, 3000, 4000, 5000, 6000, 7000, 8000]
n_clusters = list(range(2, 6))

# (name, clusterer class) pairs
clusterers = [
    ("Agglomerative clustering", AgglomerativeClustering),
    ("K-means clustering", KMeans),
    ("Spectral clustering", SpectralClustering),
]

# One hyperparameter grid per entry of `clusterers` (presumably matched by
# position — verify against cluster_analysis).
# NOTE(review): newer scikit-learn releases renamed AgglomerativeClustering's
# `affinity` parameter to `metric` — confirm which name the installed version
# (and cluster_analysis) expects.
hyperparameter_grids = [
    dict(
        n_clusters=n_clusters,
        affinity=["precomputed"],
        linkage=["single", "average", "complete"],
    ),
    dict(n_clusters=n_clusters, random_state=[rng_seed]),
    dict(n_clusters=n_clusters, random_state=[rng_seed]),
]

# (metric name, metric function) pairs; the same list is used for every clusterer.
eval_metrics = [
    ("silhouette_score", silhouette_score_metric),
    ("sd_score", sd_score_metric),
    ("s_dbw_score", s_dbw_score_metric),
]
eval_metrics_grid = [eval_metrics] * len(clusterers)

# Extra keyword parameters per metric name.
eval_metrics_params = dict(
    silhouette_score=dict(metric="precomputed"),
)

In [None]:
# Either load a previously saved cluster analysis result or compute it now,
# depending on `should_pred_cluster_labels`.
if not should_pred_cluster_labels:
    # NOTE(review): presumably the file written by cluster_analysis when
    # save_result_to_disk=True — confirm the naming scheme. joblib.load
    # unpickles the file, so only load files you produced yourself.
    saved_result_filepath = join(
        word2vec_cluster_analysis_dir, "word2vec-enwiki-cluster_labels.joblib"
    )
    cluster_analysis_result, word_vecs, pairwise_word_distances = joblib.load(
        saved_result_filepath
    )
else:
    cluster_analysis_kwargs = dict(
        # What to cluster and how to score it
        clusterers=clusterers,
        hyperparameter_grids=hyperparameter_grids,
        eval_metrics_grid=eval_metrics_grid,
        eval_metrics_params=eval_metrics_params,
        # Word embedding inputs
        word_embeddings=last_embedding_weights,
        words_vocabulary=vocabulary,
        word_to_int=word_to_int,
        compute_pairwise_word_distances=True,
        return_word_vectors=True,
        # Persisting the result
        save_result_to_disk=True,
        output_dir=word2vec_cluster_analysis_dir,
        model_name="word2vec",
        dataset_name="enwiki",
        output_filepath_suffix="cluster_labels",
    )
    cluster_analysis_result, word_vecs, pairwise_word_distances = cluster_analysis(
        **cluster_analysis_kwargs
    )