In [None]:
# Imports
from os import makedirs
from os.path import join
import joblib
import numpy as np
rng_seed = 399
np.random.seed(rng_seed)
from scipy.stats import pearsonr
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()
from tqdm.auto import tqdm
import pandas as pd
import gudhi as gd
from gudhi.wasserstein import wasserstein_distance

from nltk.corpus import wordnet as wn
import annoy

import plotly.offline as pyo
pyo.init_notebook_mode()

# Directory constants
root_code_dir = ".."
topological_data_analysis_data_dir = "data"
output_dir = join(root_code_dir, "output")

# word2vec artefact directories, each a direct child of the output directory
(
    word2vec_training_dir,
    word2vec_ann_indices_dir,
    word2vec_cluster_analysis_dir,
) = (
    join(output_dir, word2vec_subdir)
    for word2vec_subdir in (
        "word2vec_training",
        "word2vec_ann_indices",
        "word2vec_cluster_analysis",
    )
)

# Extend sys path for importing custom Python files
import sys
sys.path.append(root_code_dir)

from utils import get_model_checkpoint_filepaths, pairwise_cosine_distances, words_to_vectors
from word_embeddings.word2vec import load_model_training_output
from vis_utils import plot_word_vectors
from topological_data_analysis.tda_utils import plot_persistence_diagram, tps

# Prepare data

In [None]:
# Load output from training word2vec
w2v_training_output = load_model_training_output(
    model_training_output_dir=join(word2vec_training_dir, "word2vec_enwiki_sept_2020_word2phrase"),
    model_name="word2vec",
    dataset_name="enwiki",
)

# Unpack the entries of the training output that later cells read
last_embedding_weights, words, word_to_int, word_counts = (
    w2v_training_output[output_key]
    for output_key in ("last_embedding_weights", "words", "word_to_int", "word_counts")
)

In [None]:
# Normalize word embeddings: divide every row by its own norm.
# The norms are reshaped into a column so the division broadcasts row-wise.
embedding_row_norms = np.linalg.norm(last_embedding_weights, axis=1).reshape(-1, 1)
last_embedding_weights_normalized = last_embedding_weights / embedding_row_norms

In [None]:
# Load SemEval data (three joblib files stored in the TDA data directory)
(
    semeval_2010_14_vocabulary,
    semeval_2010_14_wordnet_senses,
    semeval_2010_14_word_senses,
) = (
    joblib.load(join(topological_data_analysis_data_dir, semeval_filename))
    for semeval_filename in (
        "semeval_2010_14_vocabulary.joblib",
        "semeval_2010_14_wordnet_senses.joblib",
        "semeval_2010_14_word_senses.joblib",
    )
)

In [None]:
# Load the Annoy index stored in the word2vec ANN indices directory.
# The index dimensionality is taken from the second axis of the embedding matrix.
annoy_index_filepath = join(word2vec_ann_indices_dir, "word2vec_enwiki_annoy_index.ann")
annoy_index = annoy.AnnoyIndex(
    f=last_embedding_weights.shape[1],
    metric="euclidean",
)
annoy_index.load(fn=annoy_index_filepath, prefault=True)

# Topological polysemy

In [None]:
# Neighbourhood sizes n for which TPS_n is computed
tps_neighbourhood_sizes = [10, 40, 50, 60, 100, 150, 200]

# Columns of "Table 1": the correlation columns start out empty (one fresh
# list each) and are filled in by the cells further down.
table_1_dict = {
    "n": tps_neighbourhood_sizes,
    **{
        correlation_column: []
        for correlation_column in (
            "TPS_n vs. GS",
            "TPS_n vs. synsets",
            "TPS_n vs. frequency",
        )
    },
}

## TPS for 100 SemEval target words

In [None]:
# Compute TPS for 100 SemEval target words
semeval_target_word_tps_scores = {}

# Start from an empty column so that re-running this cell replaces its
# "Table 1" entries instead of appending duplicates; a column longer than
# "n" would make the later pd.DataFrame(table_1_dict) call fail.
table_1_dict["TPS_n vs. GS"] = []

# Gold-standard values, in the same iteration order as the target words below
semeval_gs_clusters = list(semeval_2010_14_word_senses["all"].values())
for tps_neighbourhood_size in tps_neighbourhood_sizes:
    print(f"Neighbourhood size: {tps_neighbourhood_size}")

    # Compute TPS scores (only the target words are needed, so iterate over the keys)
    semeval_tps_scores = []
    for semeval_target_word in tqdm(semeval_2010_14_word_senses["all"].keys()):
        tps_score = tps(
            target_word=semeval_target_word,
            word_embeddings=last_embedding_weights,
            words_vocabulary=None,
            word_to_int=word_to_int,
            neighbourhood_size=tps_neighbourhood_size,
            word_embeddings_normalized=last_embedding_weights_normalized,
            word_embeddings_pairwise_dists=None,
            annoy_index=annoy_index
        )
        semeval_tps_scores.append(tps_score)

    # Compute correlation (the p-value returned by pearsonr is discarded)
    semeval_tps_score_gs_corr, _ = pearsonr(
        x=semeval_tps_scores,
        y=semeval_gs_clusters
    )

    # Set result
    semeval_target_word_tps_scores[tps_neighbourhood_size] = {
        "tps_scores": semeval_tps_scores,
        "gs_tps_correlation": semeval_tps_score_gs_corr
    }
    table_1_dict["TPS_n vs. GS"].append(semeval_tps_score_gs_corr)

    # Plot TPS scores to GS
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.scatter(
        x=semeval_tps_scores,
        y=semeval_gs_clusters
    )
    ax.set(
        xlabel="TPS",
        ylabel="Clusters in GS",
        title=f"Correlation: {semeval_tps_score_gs_corr:.5f}",
    )
    plt.show()

## TPS for Wordnet synsets that are in vocabulary

In [None]:
# Find words in vocabulary that have synsets in Wordnet
# (result maps each such word to its number of synsets)
vocab_synset_counts = ((word, len(wn.synsets(word))) for word in tqdm(words))
wordnet_synsets_in_vocab = {
    word: num_synsets_word
    for word, num_synsets_word in vocab_synset_counts
    if num_synsets_word > 0
}

In [None]:
# Split the word -> synset-count mapping into two index-aligned lists
wordnet_synsets_words_in_vocab = list(wordnet_synsets_in_vocab)
wordnet_synsets_words_in_vocab_meanings = [
    wordnet_synsets_in_vocab[word] for word in wordnet_synsets_words_in_vocab
]

In [None]:
# Compute TPS for all vocabulary words that have Wordnet synsets and
# correlate the scores with the number of synsets per word.

# Start from an empty column so that re-running this cell replaces its
# "Table 1" entries instead of appending duplicates; a column longer than
# "n" would make the later pd.DataFrame(table_1_dict) call fail.
table_1_dict["TPS_n vs. synsets"] = []

for tps_neighbourhood_size in tps_neighbourhood_sizes:
    print(f"Neighbourhood size: {tps_neighbourhood_size}")

    # Compute TPS scores
    tps_scores = []
    for word in tqdm(wordnet_synsets_words_in_vocab):
        tps_score = tps(
            target_word=word,
            word_embeddings=last_embedding_weights,
            words_vocabulary=None,
            word_to_int=word_to_int,
            neighbourhood_size=tps_neighbourhood_size,
            word_embeddings_normalized=last_embedding_weights_normalized,
            word_embeddings_pairwise_dists=None,
            annoy_index=annoy_index
        )
        tps_scores.append(tps_score)

    # Compute correlation (the p-value returned by pearsonr is discarded)
    tps_score_synsets_corr, _ = pearsonr(
        x=tps_scores,
        y=wordnet_synsets_words_in_vocab_meanings
    )
    table_1_dict["TPS_n vs. synsets"].append(tps_score_synsets_corr)

    # Plot TPS scores to Wordnet synsets
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.scatter(
        x=tps_scores,
        y=wordnet_synsets_words_in_vocab_meanings
    )
    ax.set(
        xlabel="TPS",
        ylabel="Synsets in Wordnet",
        title=f"Correlation: {tps_score_synsets_corr:.5f}",
    )
    plt.show()

## TPS for top 10k words (vs. word frequencies)

In [None]:
# Counts of the first `num_top_k_words` entries of `word_counts`, as plain Python ints
# NOTE(review): presumably the vocabulary is ordered by descending frequency — confirm
num_top_k_words = 10000
top_k_frequencies = list(map(int, word_counts[:num_top_k_words]))

In [None]:
# Compute TPS for the first `num_top_k_words` vocabulary words and
# correlate the scores with the corresponding word counts.

# Start from an empty column so that re-running this cell replaces its
# "Table 1" entries instead of appending duplicates; a column longer than
# "n" would make the later pd.DataFrame(table_1_dict) call fail.
table_1_dict["TPS_n vs. frequency"] = []

for tps_neighbourhood_size in tps_neighbourhood_sizes:
    print(f"Neighbourhood size: {tps_neighbourhood_size}")

    # Compute TPS scores
    tps_scores = []
    for word in tqdm(words[:num_top_k_words]):
        tps_score = tps(
            target_word=word,
            word_embeddings=last_embedding_weights,
            words_vocabulary=None,
            word_to_int=word_to_int,
            neighbourhood_size=tps_neighbourhood_size,
            word_embeddings_normalized=last_embedding_weights_normalized,
            word_embeddings_pairwise_dists=None,
            annoy_index=annoy_index
        )
        tps_scores.append(tps_score)

    # Compute correlation (the p-value returned by pearsonr is discarded)
    tps_score_frequency_corr, _ = pearsonr(
        x=tps_scores,
        y=top_k_frequencies
    )
    table_1_dict["TPS_n vs. frequency"].append(tps_score_frequency_corr)

    # Plot TPS scores to word frequencies
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.scatter(
        x=tps_scores,
        y=top_k_frequencies
    )
    ax.set(
        xlabel="TPS",
        ylabel="Word frequency",
        title=f"Correlation: {tps_score_frequency_corr:.5f}",
    )
    plt.show()

In [None]:
# Show "Table 1": one row per neighbourhood size n, one column per correlation.
# Built with a chained, non-mutating set_index rather than inplace=True.
table_1_df = pd.DataFrame(table_1_dict).set_index("n")
table_1_df