In [None]:
# Imports
from os import makedirs
from os.path import join
import pickle
import numpy as np
rng_seed = 399
np.random.seed(rng_seed)
from scipy.spatial.distance import pdist, cdist, squareform
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()
from tqdm.auto import tqdm
import sys
sys.path.append("..")

from sklearn.model_selection import ParameterGrid
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import pairwise_distances
from scipy.cluster.hierarchy import dendrogram, fcluster

from umap import UMAP
import plotly.offline as pyo
pyo.init_notebook_mode()

from utils import get_model_checkpoint_filepaths, pairwise_cosine_distances
from analysis_utils import create_linkage_matrix, words_in_clusters, plot_silhouette_scores
from word_embeddings.eval_utils import plot_word_vectors

In [None]:
# Locate the checkpoint files of the word2vec/enwiki training run
checkpoint_filepaths_dict = get_model_checkpoint_filepaths(
    dataset_name="enwiki",
    model_name="word2vec",
    output_dir="../output/word2vec_training/03-Nov-2020_11-01-00",
)

# Take the last entry of the intermediate embedding weight files and load it;
# `astype` turns the memory-mapped file content into a float64 array.
last_embedding_weights_filepath = checkpoint_filepaths_dict[
    "intermediate_embedding_weight_filepaths"
][-1]
last_embedding_weights = np.load(
    last_embedding_weights_filepath,
    mmap_mode="r",
).astype(np.float64)

In [None]:
# Read the training words (newline separated) and build the word -> index lookup
train_words_filepath = checkpoint_filepaths_dict["train_words_filepath"]
with open(train_words_filepath, "r") as file:
    words = np.array(file.read().split("\n"))
word_to_int = dict(zip(words, range(len(words))))

# Number of leading words/embedding rows used in the cells below
vocab_size = 10000

In [None]:
# Precompute cosine distance matrix
# Only the first `vocab_size` rows of the embedding matrix take part in the analysis.
word_embeddings_to_precompute = last_embedding_weights[:vocab_size]
# NOTE(review): `pairwise_cosine_distances` is a project helper; the result is later passed
# to scikit-learn/UMAP as a precomputed distance matrix, so it is presumably square
# (vocab_size x vocab_size) — confirm against the helper.
word_embeddings_distances = pairwise_cosine_distances(word_embeddings_to_precompute)

In [None]:
# Fit one agglomerative clustering per linkage criterion and keep its linkage matrix
agglomerative_clusterings = {}
for linkage in ["complete", "average", "single"]:

    # n_clusters=None combined with distance_threshold=0 builds the full merge tree
    # on top of the precomputed distance matrix.
    agglomerative_clustering = AgglomerativeClustering(
        distance_threshold=0,
        linkage=linkage,
        affinity="precomputed",
        n_clusters=None,
    ).fit(word_embeddings_distances)

    # Linkage matrix in the form required by the fcluster function
    agglomerative_clustering_linkage_matrix = create_linkage_matrix(agglomerative_clustering)

    # Store the fitted model together with its linkage matrix
    agglomerative_clusterings[linkage] = dict(
        clustering=agglomerative_clustering,
        linkage_matrix=agglomerative_clustering_linkage_matrix,
    )

In [None]:
def agglomerative_cluster_number_search(
    cluster_numbers: list,
    clusterings: dict,
    linkages: list,
    word_embeddings_distances: np.ndarray,
    output_filepath_suffix: str,
    output_dir: str = None,
    model_name: str = None,
    dataset_name: str = None
) -> dict:
    """
    Searches over numbers of clusters for agglomerative clusterings and scores
    every resulting labelling with the silhouette score.

    For each linkage in `linkages`, the linkage matrix stored in `clusterings`
    is cut into at most k flat clusters (fcluster with criterion "maxclust")
    for every k in `cluster_numbers`. Each labelling is scored using the
    silhouette score on the precomputed distance matrix.

    Parameters
    ----------
    cluster_numbers : list
        Numbers of clusters (k) to evaluate; any iterable of ints works.
    clusterings : dict
        Maps linkage name to a dict containing (at least) the key
        "linkage_matrix", holding the linkage matrix passed to fcluster.
    linkages : list
        Linkage names (keys of `clusterings`) to evaluate.
    word_embeddings_distances : np.ndarray
        Precomputed pairwise distance matrix used for the silhouette score.
    output_filepath_suffix : str
        Suffix of the output pickle file name.
    output_dir : str, optional
        Output directory; it is created if it does not exist (defaults to None).
    model_name : str, optional
        Model name used in the output file name (defaults to None).
    dataset_name : str, optional
        Dataset name used in the output file name (defaults to None).

    Returns
    -------
    cluster_labels : dict
        Maps linkage name to a dict with the keys
        - "labels": list with one label array per evaluated k,
        - "metric_values": list with the silhouette score per evaluated k,
        - "best_labels_idx": index (into the two lists above) of the highest
          silhouette score, or -1 if `cluster_numbers` is empty.

    Notes
    -----
    The result is pickled to
    `{output_dir}/{model_name}-{dataset_name}-{output_filepath_suffix}.pkl`
    only if `output_dir`, `model_name` and `dataset_name` are all specified.
    """
    # Ensure output directory exists (only if one is given; makedirs(None) raises TypeError)
    if output_dir is not None:
        makedirs(output_dir, exist_ok=True)

    # Fit and predict cluster labels
    cluster_labels = {}
    print("-- Fitting and predicting cluster labels for agglomerative clustering --")
    for linkage in linkages:
        print(f"Linkage: {linkage}")

        # The linkage matrix does not depend on k, so look it up once per linkage
        linkage_matrix = clusterings[linkage]["linkage_matrix"]

        labels = []
        metric_values = []
        for k in tqdm(cluster_numbers):

            # Cut the tree into at most k flat clusters
            cluster_labels_pred = fcluster(Z=linkage_matrix, criterion="maxclust", t=k)
            labels.append(cluster_labels_pred)

            # Score the labelling on the precomputed distances
            cluster_metric_value = silhouette_score(word_embeddings_distances, cluster_labels_pred, metric="precomputed")
            metric_values.append(cluster_metric_value)

        cluster_labels[linkage] = {
            "labels": labels,
            "metric_values": metric_values,
            # Keep the -1 sentinel when nothing was evaluated (np.argmax fails on empty input)
            "best_labels_idx": np.argmax(metric_values) if metric_values else -1
        }

    # Save to output dir
    if output_dir is not None and model_name is not None and dataset_name is not None:
        output_path = join(output_dir, f"{model_name}-{dataset_name}-{output_filepath_suffix}.pkl")
        with open(output_path, "wb") as file:
            pickle.dump(cluster_labels, file)

    return cluster_labels

In [None]:
# Either run the search over the candidate cluster numbers or reuse the pickled result
should_pred_cluster_labels = True
ks = [
    2, 3, 4, 5, 10, 50, 100, 150, 200, 300, 400, 500, 750,
    1000, 1500, 2000, 3000, 4000, 5000, 6000, 7000, 8000,
]
if not should_pred_cluster_labels:
    # Reuse the labels written by an earlier run
    with open("../output/word2vec_cluster_analysis/word2vec-enwiki-agglomerative_labels.pkl", "rb") as file:
        pred_cluster_labels = pickle.load(file)
else:
    # Evaluate every fitted linkage for every k in `ks`
    pred_cluster_labels = agglomerative_cluster_number_search(
        cluster_numbers=ks,
        clusterings=agglomerative_clusterings,
        linkages=list(agglomerative_clusterings),
        word_embeddings_distances=word_embeddings_distances,
        output_filepath_suffix="agglomerative_labels",
        output_dir="../output/word2vec_cluster_analysis",
        model_name="word2vec",
        dataset_name="enwiki",
    )

In [None]:
# Pass the silhouette scores of every linkage to the plotting helper
for linkage in agglomerative_clusterings:
    print(f"Linkage: {linkage}")
    linkage_silhouette_scores = pred_cluster_labels[linkage]["metric_values"]
    plot_silhouette_scores(ks, linkage_silhouette_scores)

In [None]:
# Zoom in at 3000-6000 clusters: 100 evenly spaced (integer) cluster numbers,
# evaluated for the complete and average linkages only.
should_pred_cluster_labels_zoomed = True
ks_zoomed = np.linspace(3000, 6000, num=100, dtype=int)
if not should_pred_cluster_labels_zoomed:
    # Reuse the labels written by an earlier run
    with open("../output/word2vec_cluster_analysis/word2vec-enwiki-agglomerative_labels_zoomed.pkl", "rb") as file:
        pred_cluster_labels_zoomed = pickle.load(file)
else:
    pred_cluster_labels_zoomed = agglomerative_cluster_number_search(
        cluster_numbers=ks_zoomed,
        clusterings=agglomerative_clusterings,
        linkages=["complete", "average"],
        word_embeddings_distances=word_embeddings_distances,
        output_filepath_suffix="agglomerative_labels_zoomed",
        output_dir="../output/word2vec_cluster_analysis",
        model_name="word2vec",
        dataset_name="enwiki",
    )

In [None]:
# Pick, per linkage, the labelling stored at `best_labels_idx` of the zoomed search
best_cluster_labels = {}
for linkage in ["complete", "average"]:
    print(f"Linkage: {linkage}")
    zoomed_result = pred_cluster_labels_zoomed[linkage]
    silhouette_scores, best_labels_idx = (
        zoomed_result["metric_values"],
        zoomed_result["best_labels_idx"],
    )

    # Translate the index back into the evaluated number of clusters
    best_num_clusters = ks_zoomed[best_labels_idx]
    print(f"Best number of clusters: {best_num_clusters}")

    best_cluster_labels[linkage] = zoomed_result["labels"][best_labels_idx]
    plot_silhouette_scores(ks_zoomed, silhouette_scores)

In [None]:
# Report, per linkage, the cluster size ratio (largest cluster size / smallest cluster size)
for linkage, labels in best_cluster_labels.items():
    print(f"Linkage: {linkage}")

    # Count how many words carry each cluster label
    labels_unique, labels_counts = np.unique(labels, return_counts=True)
    num_clusters = len(labels_unique)
    max_cluster_size, min_cluster_size = labels_counts.max(), labels_counts.min()
    cluster_size_ratio = max_cluster_size / min_cluster_size
    print(f"{num_clusters} clusters: max={max_cluster_size}, min={min_cluster_size}, ratio={cluster_size_ratio}")

    # Distribution of cluster sizes, using as many bins as the largest cluster has members
    sns.histplot(labels_counts, bins=max_cluster_size)
    plt.show()

    print("---")

In [None]:
# Look at the words corresponding to the different clusters (biggest, smallest, etc.)

In [None]:
# Group the first `vocab_size` words by their complete-linkage cluster label.
# NOTE(review): `words_in_clusters` is a project helper; from the way the results are
# indexed in the following cells, both return values appear to be NumPy arrays with
# one entry per cluster (words of the cluster / size of the cluster) — confirm.
cluster_words, cluster_sizes = words_in_clusters(
    cluster_labels=best_cluster_labels["complete"],
    words=words[:vocab_size]
)

In [None]:
# Position of the largest entry in `cluster_sizes`
biggest_cluster_idx = np.argmax(cluster_sizes)
# Position of the smallest entry among clusters with at least 5 words.
# The index refers to the filtered array `cluster_sizes[cluster_sizes >= 5]`, not to
# `cluster_sizes` itself, so the same mask has to be applied wherever it is used.
smallest_cluster_idx = np.argmin(cluster_sizes[cluster_sizes >= 5])

In [None]:
# Display the words of the biggest cluster
cluster_words[biggest_cluster_idx]

In [None]:
# TODO: What happens to the other numbers? Are they in "neighbouring" clusters?

In [None]:
# Display the words of the smallest cluster with at least 5 words; the `>= 5` mask is
# applied first because `smallest_cluster_idx` indexes the filtered array.
cluster_words[cluster_sizes >= 5][smallest_cluster_idx]

In [None]:
# TODO: Look at the histogram and inspect clusters (numbers) that occur frequently.

In [None]:
# Visualizing the first 1000 words using UMAP

In [None]:
# Display the shape of the embedding slice that the distance matrix was computed from
word_embeddings_to_precompute.shape

In [None]:
# Embed the words in two dimensions with UMAP, fed with the precomputed distance matrix
umap_reducer = UMAP(metric="precomputed", n_components=2)
word_embeddings_transformed = umap_reducer.fit_transform(word_embeddings_distances)

In [None]:
# Plot the first 1000 words using their 2D UMAP coordinates; the complete-linkage
# cluster labels of the same 1000 words are passed as `word_colors`.
plot_word_vectors(
    words_to_plot=words[:1000],
    transformed_word_embeddings=word_embeddings_transformed,
    word_to_int=word_to_int,
    word_colors=best_cluster_labels["complete"][:1000]
)

In [None]:
# TODO:
# - UMAP of only numbers?