In [None]:
# Imports
from os import makedirs
from os.path import join
import re
import numpy as np
import pandas as pd
rng_seed = 399
np.random.seed(rng_seed)
from scipy.spatial.distance import pdist, cdist, squareform
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()
from tqdm.auto import tqdm

import plotly.express as px
import plotly.offline as pyo
pyo.init_notebook_mode()

from umap import UMAP
from sklearn.decomposition import PCA

from sklearn.model_selection import ParameterGrid
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import pairwise_distances

import gudhi as gd

# Directory constants
# All paths are relative, so they resolve against the kernel's working directory.
analysis_of_embeddings_dir = ".."
root_code_dir = join(analysis_of_embeddings_dir, "..")
output_dir = join(root_code_dir, "output")
word2vec_training_dir = join(output_dir, "word2vec_training")

# Extend sys path for importing custom Python files
# (this has to run before the project imports that follow, which are presumably
# resolved from these two directories — verify if the notebook is moved)
import sys
sys.path.extend([analysis_of_embeddings_dir, root_code_dir])

from utils import get_model_checkpoint_filepaths, pairwise_cosine_distances
from analysis_utils import words_in_clusters
from word_embeddings.word2vec import load_model_training_output
from preprocess_analysis_data import preprocess_text

## Prepare data

In [None]:
# Read the word2vec training output stored under "word2vec_enwiki_sept_2020_word2phrase"
w2v_training_output = load_model_training_output(
    model_training_output_dir=join(
        word2vec_training_dir, "word2vec_enwiki_sept_2020_word2phrase"
    ),
    model_name="word2vec",
    dataset_name="enwiki",
)

# Pull out the three entries the rest of the notebook works with
last_embedding_weights, words, word_to_int = (
    w2v_training_output[key]
    for key in ("last_embedding_weights", "words", "word_to_int")
)

In [None]:
# Stringify list of numbers to list of words
# Every integer in [0, max_num] is passed through preprocess_text as a string and the
# resulting tokens are collected; the token "and" is dropped since it is not a number word.
max_num = 1000
numbers_set = {
    num_word
    for number in range(max_num + 1)
    for num_word in preprocess_text(str(number))
    if num_word != "and"
}

# Sort before building the array: the iteration order of a set of strings changes
# between kernel sessions (string hashing is randomised per process). Without sorting,
# the row order of every downstream array would differ from run to run, and the seeded
# clustering/embedding results would not be reproducible.
number_words_in_vocab = np.array(
    sorted(num_word for num_word in numbers_set if num_word in word_to_int)
)
print(f"Total {len(numbers_set)} number words, of them {len(number_words_in_vocab)} in vocabulary.")

In [None]:
def cluster_category_words(
    words: np.ndarray,
    embedding_weights: np.ndarray,
    word_to_int: dict,
    cluster_sizes: list,
    embedders: list,
    random_state: int = None,
):
    """
    Looks up the vectors of a category of words, clusters them with k-means and
    projects them with each of the given embedders.

    Parameters
    ----------
    words : np.ndarray
        Words of the category. Every word must be a key of `word_to_int`.
    embedding_weights : np.ndarray
        2D array of word embeddings; one row per word in the vocabulary.
    word_to_int : dict
        Maps a word to its row index in `embedding_weights`.
    cluster_sizes : list
        Numbers of clusters to use; one KMeans model is fitted per entry.
    embedders : list
        List of (name, embedder) tuples, where each embedder exposes `fit_transform`.
    random_state : int, optional
        Seed passed to every KMeans model (defaults to None, in which case KMeans
        draws from NumPy's global random state, as it did before this parameter existed).

    Returns
    -------
    word_vecs : np.ndarray
        float64 array with one row per word in `words`, in the same order.
    cluster_labels : dict
        Maps each entry of `cluster_sizes` to the labels predicted by KMeans.
    embeddings : dict
        Maps each embedder name to the output of its `fit_transform` on `word_vecs`.

    Raises
    ------
    KeyError
        If a word is missing from `word_to_int`.
    """
    # Gather all word vectors with a single indexing operation. The explicit index
    # dtype keeps the lookup valid for an empty word list, and the result is
    # float64 regardless of the dtype of the embedding matrix.
    row_indices = np.array([word_to_int[word] for word in words], dtype=np.intp)
    word_vecs = np.asarray(embedding_weights[row_indices], dtype=np.float64)

    # Compute cluster labels
    cluster_labels = {
        k: KMeans(n_clusters=k, random_state=random_state).fit_predict(word_vecs)
        for k in cluster_sizes
    }

    # Create embeddings (the loop variable must not reuse the name `embedders`,
    # otherwise the argument is rebound while it is being iterated)
    embeddings = {
        embedding_name: embedder.fit_transform(word_vecs)
        for embedding_name, embedder in embedders
    }

    return word_vecs, cluster_labels, embeddings

In [None]:
# Cluster the number words with k-means (5 clusters) and project their vectors
# to 2D with PCA and with UMAP (cosine metric); both embedders use the notebook seed.
number_word_vecs, number_cluster_labels, number_embeddings = cluster_category_words(
    words=number_words_in_vocab,
    embedding_weights=last_embedding_weights,
    word_to_int=word_to_int,
    cluster_sizes=[5],
    embedders=[
        ("PCA", PCA(n_components=2, random_state=rng_seed)),
        ("UMAP", UMAP(n_neighbors=10, n_components=2, metric="cosine", random_state=rng_seed)),
    ],
)

In [None]:
def visualize_category_embedding(
    cluster_labels: dict,
    word_vector_embeddings: np.ndarray,
    coordinate_system: str,
    category_words: np.ndarray,
    print_words_in_clusters: bool = False
):
    """
    Shows one scatter plot of a 2D word embedding per clustering, with the points
    coloured by cluster label and the word shown on hover.

    Parameters
    ----------
    cluster_labels : dict
        Maps a number of clusters to the cluster label of every word.
    word_vector_embeddings : np.ndarray
        Embedded word vectors; column 0 is plotted on the x-axis and column 1
        on the y-axis.
    coordinate_system : str
        Name of the embedding (used in the plot title only).
    category_words : np.ndarray
        Words belonging to the plotted points.
    print_words_in_clusters : bool, optional
        Whether to also print the words of each cluster, as grouped by
        `words_in_clusters` (defaults to False).
    """
    for num_clusters, labels in cluster_labels.items():

        # Scatter plot for this clustering
        scatter_kwargs = dict(
            x=word_vector_embeddings[:, 0],
            y=word_vector_embeddings[:, 1],
            title=f"Embedding of words in {coordinate_system} coordinates with {num_clusters} clusters",
            labels={"x": "x1", "y": "x2"},
            color=labels,
            hover_data={"word": category_words},
        )
        px.scatter(**scatter_kwargs).show()

        if not print_words_in_clusters:
            continue

        # Only the first return value of words_in_clusters is needed here
        words_per_cluster, _ = words_in_clusters(labels, category_words)
        print("-- Words in clusters --")
        for cluster_members in words_per_cluster:
            print("Words", cluster_members)

In [None]:
# Number words in PCA coordinates, coloured by k-means cluster
visualize_category_embedding(
    number_cluster_labels,
    number_embeddings["PCA"],
    coordinate_system="PCA",
    category_words=number_words_in_vocab,
)

In [None]:
# TODO: Cluster with 100 clusters

In [None]:
# Number words in UMAP coordinates, coloured by k-means cluster
visualize_category_embedding(
    number_cluster_labels,
    number_embeddings["UMAP"],
    coordinate_system="UMAP",
    category_words=number_words_in_vocab,
)

In [None]:
# TODO: Cluster with ~100 clusters

In [None]:
# Precompute cosine distance matrix
# (the result is handed to gd.RipsComplex as its `distance_matrix` in the next cell;
# presumably entry (i, j) is the cosine distance between number words i and j —
# confirm against utils.pairwise_cosine_distances)
word_embeddings_distances_number_words = pairwise_cosine_distances(number_word_vecs)

In [None]:
# Vietoris-Rips complex built straight from the precomputed distance matrix.
# No edge-length cap is passed; max_edge_length=0.8 is kept here as an option
# for restricting the complex.
skeleton_word2vec = gd.RipsComplex(distance_matrix=word_embeddings_distances_number_words)

# Expand to simplices of dimension <= 2, compute persistence and plot the diagram
simplex_tree = skeleton_word2vec.create_simplex_tree(max_dimension=2)
barcodes = simplex_tree.persistence()
gd.plot_persistence_diagram(barcodes)
plt.show()