In [None]:
# Imports
from os import makedirs
from os.path import join
import joblib
import numpy as np
# Fixed seed: seeds NumPy's global RNG here and is reused as UMAP's `random_state` further down
rng_seed = 399
np.random.seed(rng_seed)
from scipy.spatial.distance import pdist, cdist, squareform
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()  # apply seaborn's default plot theme
from tqdm.auto import tqdm
import pandas as pd

from sklearn.model_selection import ParameterGrid
from sklearn.metrics.pairwise import pairwise_distances
from scipy.cluster.hierarchy import dendrogram, fcluster

from umap import UMAP
import plotly.offline as pyo
pyo.init_notebook_mode()  # initialise plotly's offline mode for use inside the notebook

# Directory constants (all relative paths, rooted at "..")
analysis_of_embeddings_dir = ".."
root_code_dir = join(analysis_of_embeddings_dir, "..")
output_dir = join(root_code_dir, "output")

# Data folders of the embedding analysis
analysis_of_embeddings_data_dir = join(analysis_of_embeddings_dir, "data")
analysis_of_embeddings_custom_data_dir = join(analysis_of_embeddings_dir, "custom_data")

# Output folders below `output_dir`
word2vec_training_dir = join(output_dir, "word2vec_training")
word2vec_cluster_analysis_dir = join(output_dir, "word2vec_cluster_analysis")

# Put both code directories on sys.path so the custom Python files can be imported
import sys
sys.path += [analysis_of_embeddings_dir, root_code_dir]

from utils import get_model_checkpoint_filepaths, pairwise_cosine_distances
from analysis_utils import (plot_cluster_metric_scores, words_in_clusters,
                            plot_cluster_sizes, inspect_word_clusters,
                            load_word_cluster_group_words, visualize_word_cluster_groups)
from word_embeddings.eval_utils import plot_word_vectors
from word_embeddings.word2vec import load_model_training_output

# Prepare data

In [None]:
# Read the output of the word2vec training run
w2v_training_output = load_model_training_output(
    model_training_output_dir=join(
        word2vec_training_dir, "word2vec_enwiki_sept_2020_word2phrase"
    ),
    model_name="word2vec",
    dataset_name="enwiki",
)

# Pull out the entries the rest of the notebook works with
last_embedding_weights, words, word_to_int = (
    w2v_training_output["last_embedding_weights"],
    w2v_training_output["words"],
    w2v_training_output["word_to_int"],
)

In [None]:
# Number of leading vocabulary entries kept for the analysis
vocab_size = 10_000

In [None]:
# Cosine distance matrix of the first `vocab_size` word embeddings,
# computed once here and reused when fitting UMAP
word_embeddings_to_precompute = last_embedding_weights[0:vocab_size]
word_embeddings_distances = pairwise_cosine_distances(
    word_embeddings_to_precompute
)

# Visualizing word cluster groups

In [None]:
# Embed the words in 2D with UMAP, fitted on the precomputed distance matrix
umap_reducer = UMAP(n_components=2, metric="precomputed", random_state=rng_seed)
word_embeddings_transformed = umap_reducer.fit_transform(word_embeddings_distances)

In [None]:
# Load the word groups used by the visualizations below, then show the available keys
word_groups_data = load_word_cluster_group_words(
    word_to_int=word_to_int,
    data_dir=analysis_of_embeddings_data_dir,
    custom_data_dir=analysis_of_embeddings_custom_data_dir,
)
word_groups_data.keys()

## Countries/capitals

In [None]:
# Visualize the "countries" (green) and "country_capitals" (cyan) word groups
# on the 2D UMAP coordinates of the restricted vocabulary
visualize_word_cluster_groups(
    transformed_word_embeddings=word_embeddings_transformed,
    words=words[:vocab_size],
    word_groups=dict(
        countries=dict(words=word_groups_data["countries"], color="green"),
        country_capitals=dict(words=word_groups_data["country_capitals"], color="cyan"),
    ),
    visualize_non_group_words=True,
    interactive=True,
    alpha=1,
    xlabel="UMAP 1",
    ylabel="UMAP 2",
)

TODO: Comment on result/perform deeper analysis.

## Names

In [None]:
# Visualize the "male_names" (green) and "female_names" (cyan) word groups
# on the 2D UMAP coordinates of the restricted vocabulary.
# (Stray markdown text that had been pasted into the argument list, making the
# cell a SyntaxError, has been removed.)
visualize_word_cluster_groups(
    transformed_word_embeddings=word_embeddings_transformed,
    words=words[:vocab_size],
    word_groups={
        "male_names": {
            "words": word_groups_data["male_names"],
            "color": "green",
        },
        "female_names": {
            "words": word_groups_data["female_names"],
            "color": "cyan",
        },
    },
    visualize_non_group_words=True,
    xlabel="UMAP 1",
    ylabel="UMAP 2",
    alpha=1,
    interactive=True,
)

TODO: Comment on result/perform deeper analysis.

## Numbers

In [None]:
# Visualize the "numbers" word group (green) on the 2D UMAP coordinates
# of the restricted vocabulary
visualize_word_cluster_groups(
    transformed_word_embeddings=word_embeddings_transformed,
    words=words[:vocab_size],
    word_groups=dict(
        numbers=dict(words=word_groups_data["numbers"], color="green"),
    ),
    visualize_non_group_words=True,
    interactive=True,
    alpha=1,
    xlabel="UMAP 1",
    ylabel="UMAP 2",
)

TODO: Comment on result/perform deeper analysis.

## Video games

In [None]:
# Visualize the "video_games" word group (green) on the 2D UMAP coordinates
# of the restricted vocabulary.
# (Stray markdown text that had been pasted into the dict literal, making the
# cell a SyntaxError, has been removed.)
visualize_word_cluster_groups(
    transformed_word_embeddings=word_embeddings_transformed,
    words=words[:vocab_size],
    word_groups={
        "video_games": {
            "words": word_groups_data["video_games"],
            "color": "green",
        },
    },
    visualize_non_group_words=True,
    xlabel="UMAP 1",
    ylabel="UMAP 2",
    alpha=1,
    interactive=True,
)

TODO: Comment on result/perform deeper analysis.