In [None]:
# Imports
from os import makedirs
from os.path import join
import joblib
import numpy as np
rng_seed = 399
np.random.seed(rng_seed)
from scipy.stats import pearsonr
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()
from tqdm.auto import tqdm
import pandas as pd
import gudhi as gd
from gudhi.wasserstein import wasserstein_distance

from umap import UMAP
from sklearn.decomposition import PCA
from nltk.corpus import wordnet as wn
import annoy
from sklearn.metrics.pairwise import euclidean_distances

import plotly.offline as pyo
pyo.init_notebook_mode()
import plotly.express as px

# Directory constants: everything except the local data directory lives under
# the parent directory (root_code_dir).
topological_data_analysis_data_dir = "data"
root_code_dir = ".."
analysis_of_embeddings_dir = join(root_code_dir, "analysis_of_embeddings")
output_dir = join(root_code_dir, "output")
(
    word2vec_training_dir,
    word2vec_ann_indices_dir,
    word2vec_cluster_analysis_dir,
) = (
    join(output_dir, output_subdir)
    for output_subdir in (
        "word2vec_training",
        "word2vec_ann_indices",
        "word2vec_cluster_analysis",
    )
)

# Make the custom Python modules in these directories importable
import sys
sys.path.extend((root_code_dir, analysis_of_embeddings_dir))

from utils import get_model_checkpoint_filepaths, pairwise_cosine_distances, words_to_vectors
from word_embeddings.word2vec import load_model_training_output
from vis_utils import plot_word_vectors
from topological_data_analysis.geometric_anomaly_detection import (
    GeometricAnomalyDetection, grid_search_prepare_word_ints_within_radii
)
from analysis_utils import transform_word_embeddings

# Prepare data

In [None]:
# Read the saved word2vec training output and unpack the entries used below
w2v_training_output = load_model_training_output(
    model_training_output_dir=join(word2vec_training_dir, "word2vec_enwiki_sept_2020_word2phrase"),
    model_name="word2vec",
    dataset_name="enwiki",
)
last_embedding_weights, words, word_to_int, word_counts = (
    w2v_training_output[output_key]
    for output_key in ("last_embedding_weights", "words", "word_to_int", "word_counts")
)

In [None]:
# Scale every embedding (row) to unit L2 norm; keepdims keeps the norms as a
# column so the division broadcasts row-wise.
last_embedding_weights_normalized = last_embedding_weights / np.linalg.norm(
    last_embedding_weights, axis=1, keepdims=True
)

# Geometric anomaly detection in word embeddings

## Grid search to find best set of inner/outer annulus radii

In [None]:
# Grid search configuration
num_radii_per_parameter = 20
grid_search_manifold_dimension = 2

# The grid search is restricted to the word ints 0, ..., vocab_size - 1
vocab_size = 10_000
vocabulary_word_ints = np.arange(vocab_size)

In [None]:
# Pairwise Euclidean distances between the normalized embeddings of the words
# selected by vocabulary_word_ints (reused by the grid search below)
word_embeddings_pairwise_dists_grid_search = euclidean_distances(
    X=last_embedding_weights_normalized[vocabulary_word_ints]
)

In [None]:
# Precompute the word ints within each radius, together with the radii space
# itself; both results are handed to the grid search below.
# The distance callback is a plain lookup into the pairwise distance matrix
# computed above, which is also passed to the helper as-is.
word_ints_within_radii, radii_space = grid_search_prepare_word_ints_within_radii(
    word_ints=vocabulary_word_ints,
    num_radii_per_parameter=num_radii_per_parameter,
    word_vector_distance=lambda i, j: word_embeddings_pairwise_dists_grid_search[i, j],
    word_embeddings_pairwise_dists=word_embeddings_pairwise_dists_grid_search,
)

In [None]:
# Create the geometric anomaly detection (GAD) instance on the normalized
# word embeddings (the full embedding matrix, not only the grid search subset)
gad_instance = GeometricAnomalyDetection(word_embeddings=last_embedding_weights_normalized)

In [None]:
# Run the grid search over the inner/outer annulus radii.
# vocabulary_word_ints is reused (instead of rebuilding np.arange(vocab_size))
# so the search always runs on exactly the word ints that the precomputed
# word_ints_within_radii and pairwise distances were built from.
best_gad_result_idx, gad_results, P_man_counts = gad_instance.grid_search_radii(
    word_ints=vocabulary_word_ints,
    manifold_dimension=grid_search_manifold_dimension,
    num_radii_per_parameter=num_radii_per_parameter,
    outer_inner_radii_max_diff=0.25,
    word_ints_within_radii=word_ints_within_radii,
    radii_space=radii_space,
    word_embeddings_pairwise_dists=word_embeddings_pairwise_dists_grid_search,
)

In [None]:
# Pick out the GAD result at the index the grid search returned as best
best_gad_result = gad_results[best_gad_result_idx]

## Visualize best result

In [None]:
# Create vector with word colors: each word int is labelled with the key of the
# GAD result entry that contains it. Words contained in no entry stay None, and
# if a word appears under several keys the last key wins.
word_colors = np.empty(vocab_size, dtype=object)
for key in best_gad_result.keys():
    # One vectorized membership test per key replaces a Python-level
    # `i in best_gad_result[key]` check for every single word.
    key_word_ints = np.asarray(list(best_gad_result[key]))
    in_key_mask = np.isin(vocabulary_word_ints, key_word_ints)
    word_colors[vocabulary_word_ints[in_key_mask]] = key

In [None]:
# Reduce the normalized word embeddings to 2D with PCA and UMAP; both embedders
# are seeded with rng_seed.
transformed_word_embeddings = transform_word_embeddings(
    embedders=[
        ("PCA", PCA(n_components=2, random_state=rng_seed)),
        ("UMAP", UMAP(n_components=2, metric="euclidean", random_state=rng_seed)),
    ],
    word_embeddings=last_embedding_weights_normalized,
    words_vocabulary=vocabulary_word_ints,
    word_to_int=word_to_int,
)

In [None]:
# One scatter plot per 2D embedding (PCA/UMAP), colored by word_colors and with
# the word shown on hover
for embedding_key, embedding_2d in transformed_word_embeddings.items():
    # PCA axes are named PC1/PC2; every other embedder uses its own key as prefix
    if embedding_key == "PCA":
        axis_labels = {"x": "PC1", "y": "PC2"}
    else:
        axis_labels = {"x": f"{embedding_key}1", "y": f"{embedding_key}2"}

    fig = px.scatter(
        x=embedding_2d[:, 0],
        y=embedding_2d[:, 1],
        color=word_colors,
        hover_data={"word": words[vocabulary_word_ints]},
        labels=axis_labels,
        title=embedding_key,
    )
    fig.show()