In [None]:
# Load the nb_black IPython extension for this notebook session
%load_ext nb_black

In [None]:
# Imports
from os.path import join
import numpy as np
from tqdm.auto import tqdm

# Seed NumPy's global random number generator so that code relying on
# `np.random` gives the same results on every run of the notebook
rng_seed = 399
np.random.seed(rng_seed)

from matplotlib import pyplot as plt
from scipy.stats import pearsonr, poisson
from scipy.optimize import minimize

import plotly.offline as pyo

# Initialise plotly's offline mode for use inside the notebook
pyo.init_notebook_mode()

import skdim
from umap import UMAP

# Directory constants (all paths are built relative to this notebook's directory)
analysis_of_embeddings_dir = "."
analysis_of_embeddings_data_dir = join(analysis_of_embeddings_dir, "data")
analysis_of_embeddings_custom_data_dir = join(analysis_of_embeddings_dir, "custom_data")
root_code_dir = join(analysis_of_embeddings_dir, "..")
topological_data_analysis_dir = join(root_code_dir, "topological_data_analysis")
output_dir = join(root_code_dir, "output")
word2vec_training_dir = join(output_dir, "word2vec_training")
word2vec_cluster_analysis_dir = join(output_dir, "word2vec_cluster_analysis")

# Extend sys path for importing custom Python files
# (must run before the project-local imports below)
import sys

sys.path.extend(
    [analysis_of_embeddings_dir, root_code_dir, topological_data_analysis_dir]
)

# Project-local modules, resolved through the sys.path entries added above
from utils import get_model_checkpoint_filepaths, pairwise_cosine_distances
from analysis_utils import (
    plot_cluster_metric_scores,
    transform_word_embeddings,
    words_in_clusters,
    plot_cluster_sizes,
    inspect_word_clusters,
    load_word_cluster_group_words,
    visualize_word_cluster_groups,
)
from cluster_analysis_utils import (
    cluster_analysis,
    visualize_cluster_analysis_result,
    plot_word_embeddings_clustered,
)
from word_embeddings.word2vec import load_model_training_output
from vis_utils import plot_word_vectors
from topological_polysemy import tps

# Prepare data

In [None]:
# Read the saved word2vec training output from the
# "word2vec_enwiki_jan_2021_word2phrase" directory
w2v_training_output = load_model_training_output(
    model_name="word2vec",
    dataset_name="enwiki",
    model_training_output_dir=join(
        word2vec_training_dir, "word2vec_enwiki_jan_2021_word2phrase"
    ),
    return_normalized_embeddings=True,
)

# Pull the entries used by the rest of the notebook out of the loaded output
(
    last_embedding_weights,
    last_embedding_weights_normalized,
    words,
    word_to_int,
) = (
    w2v_training_output[output_key]
    for output_key in (
        "last_embedding_weights",
        "last_embedding_weights_normalized",
        "words",
        "word_to_int",
    )
)

In [None]:
# Limit the analysis to the first `vocab_size` rows of the embedding weights
vocab_size = 10_000
vocabulary = [*range(vocab_size)]
last_embedding_weights_in_vocab = np.array(last_embedding_weights[:vocab_size])

In [None]:
# Compute pairwise cosine distances between the in-vocabulary word embeddings
word_embeddings_pairwise_dists = pairwise_cosine_distances(
    last_embedding_weights_in_vocab
)

# Intrinsic dimension estimation

In [None]:
def MLERegression(betas: np.ndarray, *params: np.ndarray) -> float:
    """
    Negative log-likelihood of an identity-link Poisson regression.

    Intended as an objective function for `scipy.optimize.minimize`, which
    passes the current coefficients first and the fixed data afterwards.

    Parameters
    ----------
    betas : np.ndarray
        Regression coefficients; the Poisson rate is `X_d_k @ betas`.
    *params : np.ndarray
        Exactly two arrays, in order: the design matrix `X_d_k` and the
        observed counts `y`.

    Returns
    -------
    negative_log_likelihood : float
        Negated sum of the Poisson log-PMF of `y` under rates `X_d_k @ betas`.
    """
    design_matrix, observed_counts = params

    # Identity link: the linear predictor is used directly as the Poisson rate
    poisson_rates = design_matrix @ betas
    log_pmf_values = poisson.logpmf(observed_counts, mu=poisson_rates)

    return -log_pmf_values.sum()

In [None]:
def local_id_estimation(
    word_embeddings: np.ndarray, pairwise_dists: np.ndarray = None
) -> tuple:
    """
    Estimates a local intrinsic dimension (ID) for every word embedding by
    fitting Poisson regressions to epsilon-ball neighbour counts.

    For each point, `m = min(ceil(n / 5), 100)` random radii are drawn, where
    radius j is sampled uniformly between 0 and the (j + 1)-th smallest remaining
    distance of that point. The response `y` is the number of remaining distances
    that are at most the radius, plus one. For every candidate dimension
    k = 1, ..., d the counts are regressed on the two features
    (radius^k, radius^(k + 2)) by minimising `MLERegression` with
    `scipy.optimize.minimize`; the candidate with the largest fitted Poisson
    log-likelihood is taken as the estimated ID of the point.

    Parameters
    ----------
    word_embeddings : np.ndarray
        Word embeddings of shape (n, d); only the shape is used.
    pairwise_dists : np.ndarray
        Pairwise distances, where row i holds the distances from point i to the
        points. The smallest entry of each row is discarded (presumably the
        zero distance from the point to itself -- confirm against the caller).
        Required; the `None` default only exists for interface compatibility.

    Returns
    -------
    result : tuple
        Tuple `(estimated_ids, estimated_ids_mean)`: a list with one estimated
        ID per point and the mean of that list.

    Raises
    ------
    ValueError
        If `pairwise_dists` is None or if fewer than two points are given.
    """
    if pairwise_dists is None:
        raise ValueError("pairwise_dists must be provided (got None)")

    n, d = word_embeddings.shape
    if n < 2:
        raise ValueError(
            f"At least two points are required to estimate local IDs (got {n})"
        )

    # np.ceil returns a float; cast so m is usable as an array size / loop bound
    m = int(min(np.ceil(n / 5), 100))

    # Candidate dimensions 1..d as a column, for broadcasting against the radii
    candidate_dims = np.arange(1, d + 1)[:, np.newaxis]

    # Estimate the intrinsic dimension for each data point
    estimated_ids = []
    for i in tqdm(range(n)):
        # Ascending distances from point i, without the smallest one
        sorted_distances = np.sort(pairwise_dists[i])[1:]

        # Draw the radii one by one, in order, so the global RNG stream is
        # consumed exactly as a plain loop over j would consume it
        epsilon_radii = np.array(
            [np.random.uniform(low=0, high=sorted_distances[j]) for j in range(m)]
        )

        # y[j]: number of distances <= radius j (distances are sorted), plus one
        y = np.searchsorted(sorted_distances, epsilon_radii, side="right") + 1

        # X_d[k - 1, j] = [radius_j ** k, radius_j ** (k + 2)], shape (d, m, 2)
        X_d = np.stack(
            [epsilon_radii ** candidate_dims, epsilon_radii ** (candidate_dims + 2)],
            axis=-1,
        )

        # Perform maximum likelihood estimation using Poisson distribution
        # NOTE(review): with x0 = [0, 0] the initial Poisson rate is zero, so the
        # initial objective is presumably infinite for positive counts -- confirm
        # that the optimiser actually moves away from the starting point.
        log_likelihoods = []
        for X_d_k in X_d:
            mle_res = minimize(
                fun=MLERegression,
                x0=[0, 0],
                args=(X_d_k, y),
                options={"maxiter": 2000},
            )

            # Use optimal betas to compute likelihood
            opt_log_likelihood = poisson.logpmf(k=y, mu=X_d_k @ mle_res.x).sum()
            log_likelihoods.append(opt_log_likelihood)

        # Candidate dimensions start at 1, hence the +1 on the argmax index
        estimated_ids.append(np.argmax(log_likelihoods) + 1)

    return estimated_ids, np.mean(estimated_ids)

In [None]:
# Estimate the local intrinsic dimension of every in-vocabulary word embedding,
# reusing the pairwise distances computed earlier in the notebook
estimated_ids, estimated_ids_mean = local_id_estimation(
    word_embeddings=last_embedding_weights_in_vocab,
    pairwise_dists=word_embeddings_pairwise_dists,
)