In [None]:
%load_ext nb_black

In [None]:
# Imports
# NOTE: a few configuration calls (RNG seeding, seaborn theme, plotly notebook
# mode) are interleaved with the imports below; they run at import time of this
# cell, in the order written.
from os import makedirs
from os.path import join
import joblib
import numpy as np

# Seed NumPy's global random number generator with a fixed value
rng_seed = 399
np.random.seed(rng_seed)
from scipy.stats import pearsonr
from matplotlib import pyplot as plt
from matplotlib.offsetbox import AnnotationBbox, TextArea
import seaborn as sns

# Activate seaborn's theme for subsequent matplotlib figures
sns.set_theme()
from tqdm.auto import tqdm
import pandas as pd
import gudhi as gd
from gudhi.wasserstein import wasserstein_distance

from umap import UMAP
from sklearn.decomposition import PCA
from nltk.corpus import wordnet as wn
import annoy
from sklearn.metrics.pairwise import euclidean_distances

import plotly.offline as pyo

# Set up plotly's offline mode for rendering inside the notebook
pyo.init_notebook_mode()
import plotly.express as px

# Directory constants
# -- Paths relative to the notebook's working directory
analysis_of_embeddings_data_dir = "data"
wme_word2vec_enwiki_dir = join("raw_data", "wme_word2vec_enwiki")
output_plots_dir = "output_plots"

# -- Paths below the output directory of the parent code directory
root_code_dir = ".."
output_dir = join(root_code_dir, "output")
(
    word2vec_training_dir,
    word2vec_ann_indices_dir,
    word2vec_cluster_analysis_dir,
    tps_experimentation_dir,
) = (
    join(output_dir, output_subdir_name)
    for output_subdir_name in (
        "word2vec_training",
        "word2vec_ann_indices",
        "word2vec_cluster_analysis",
        "topological_polysemy_experimentation",
    )
)

# Figures produced by this notebook are written here; create it up front
makedirs(output_plots_dir, exist_ok=True)

# Extend sys path for importing custom Python files
import sys

# root_code_dir must be on sys.path before the project-local imports below,
# otherwise those modules cannot be resolved from this notebook's directory.
sys.path.append(root_code_dir)

from utils import (
    get_model_checkpoint_filepaths,
    pairwise_cosine_distances,
    words_to_vectors,
)
from word_embeddings.word2vec import load_model_training_output
from vis_utils import plot_word_vectors, configure_plotting_for_thesis
from topological_data_analysis.geometric_anomaly_detection import compute_gad
from analysis_utils import transform_word_embeddings

# Apply the project's plotting configuration from vis_utils
# NOTE(review): the exact settings it changes are not visible in this notebook.
configure_plotting_for_thesis()

## Load and prepare data

In [None]:
# Load output from training word2vec (model "word2vec", dataset "enwiki")
w2v_training_output_dir = join(
    word2vec_training_dir, "word2vec_enwiki_jan_2021_word2phrase"
)
w2v_training_output = load_model_training_output(
    model_training_output_dir=w2v_training_output_dir,
    model_name="word2vec",
    dataset_name="enwiki",
)

# Expose the entries used throughout the notebook as top-level names
# (presumably: embedding matrix, vocabulary, and word -> row index lookup;
# verify against load_model_training_output)
last_embedding_weights, words, word_to_int = (
    w2v_training_output[output_key]
    for output_key in ("last_embedding_weights", "words", "word_to_int")
)

In [None]:
# Build (or load from cache) a dict mapping each vocabulary word to its number
# of WordNet synsets. Words with zero synsets are left out of the dict.
# Set compute_words_to_num_meanings to True to recompute and overwrite the cache.
compute_words_to_num_meanings = False

# Single source of truth for the cache location, shared by dump and load
words_to_num_meanings_filepath = join(
    analysis_of_embeddings_data_dir, "word2vec-enwiki-wordnet-dict.joblib"
)
if compute_words_to_num_meanings:
    # Make sure the cache directory exists *before* looping over the whole
    # vocabulary, so the dump at the end cannot fail on a missing directory
    # after all the lookups have already been done.
    makedirs(analysis_of_embeddings_data_dir, exist_ok=True)

    print("Finding words in vocabulary with #Wordnet synsets > 0")
    words_to_num_meanings = {}
    for word in tqdm(words):
        num_synsets = len(wn.synsets(word))
        if num_synsets > 0:
            words_to_num_meanings[word] = num_synsets
    joblib.dump(words_to_num_meanings, words_to_num_meanings_filepath)
else:
    words_to_num_meanings = joblib.load(words_to_num_meanings_filepath)

# Words with at least one synset (in dict order) and, for each of them, the
# integer it maps to in word_to_int
data_words = np.array(list(words_to_num_meanings.keys()))
data_words_to_full_vocab_ints = np.array([word_to_int[word] for word in data_words])

In [None]:
# Load ID estimation result from supervised task
# One .npy file per estimator, named "<estimator key>_<knn_size>.npy"
knn_size = 200
id_estimator_to_human_readable = {"lpca": "lPCA", "twonn": "TwoNN", "tle": "TLE"}
id_estimator_keys = list(id_estimator_to_human_readable)

estimated_ids_dir = join(wme_word2vec_enwiki_dir, "estimated_ids")
id_estimators = {}
for id_estimator_key in id_estimator_keys:
    estimated_ids_filepath = join(
        estimated_ids_dir, f"{id_estimator_key}_{knn_size}.npy"
    )
    id_estimators[id_estimator_key] = np.load(estimated_ids_filepath)

In [None]:
# Plot estimated ID vs. number of word meanings
# One scatter panel per ID estimator; each panel's legend shows the Pearson
# correlation between the estimated IDs and the WordNet synset counts.

# The synset counts are the same for every panel, so build the list once
# instead of re-materialising it for every pearsonr/scatter call.
# NOTE(review): assumes each array in id_estimators is ordered like
# words_to_num_meanings (same words, same order) -- confirm against the code
# that produced the estimated_ids files.
num_meanings_per_word = list(words_to_num_meanings.values())

_, axes = plt.subplots(ncols=3, figsize=(5 * 3, 5))
ax_chars = "abc"

for ax, ax_char, id_estimator_key in zip(axes.ravel(), ax_chars, id_estimator_keys):
    estimated_ids = id_estimators[id_estimator_key]

    # pearsonr returns (correlation, p-value); only the correlation is shown
    estimated_id_num_synsets_corr, _ = pearsonr(
        x=estimated_ids,
        y=num_meanings_per_word,
    )
    ax_scatter_handle = ax.scatter(
        x=estimated_ids,
        y=num_meanings_per_word,
        s=10,
        label=f"Correlation: {estimated_id_num_synsets_corr:.3f}",
    )
    ax.set_xlabel("Estimated ID")
    ax.set_ylabel("Synsets in WordNet")
    ax.set_title(
        f"({ax_char}) Estimated ID w/{id_estimator_to_human_readable[id_estimator_key]}"
    )
    ax.legend()

    # Rasterize the scatter points only; text and axes stay vector graphics
    ax_scatter_handle.set_rasterized(True)

# Plot/save
# When save_to_pgf is True the figure is written as a PDF using matplotlib's
# pgf backend; otherwise it is only shown.
save_to_pgf = True
plt.tight_layout()
if save_to_pgf:
    plt.savefig(
        join(
            output_plots_dir,
            "intrinsic-dimension-estimation-vs-wordnet-synsets.pdf",
        ),
        backend="pgf",
    )
else:
    plt.show()