In [None]:
%load_ext nb_black

In [None]:
# Imports
from os import makedirs
from os import listdir
from os.path import join
import numpy as np
import joblib
from nltk.corpus import wordnet as wn
from scipy.stats import pearsonr
from tqdm.auto import tqdm

# Seed NumPy's global random number generator with a fixed value
rng_seed = 399
np.random.seed(seed=rng_seed)

# Directory constants (relative paths; everything below `output_dir` and the
# word embeddings data live under the root code directory)
root_code_dir = ".."
topological_data_analysis_data_dir = "data"
output_dir = join(root_code_dir, "output")
word_embeddings_data_dir = join(root_code_dir, "word_embeddings", "data")
word2vec_training_dir, topological_polysemy_experimentation_dir = (
    join(output_dir, output_subdir_name)
    for output_subdir_name in (
        "word2vec_training",
        "topological_polysemy_experimentation",
    )
)

# Extend sys path for importing custom Python files
import sys

sys.path.append(root_code_dir)

from word_embeddings.word2vec import load_model_training_output  # noqa: E402

In [None]:
# Load the SemEval word senses that are stored as a joblib file in the
# topological data analysis data directory
semeval_word_senses_filepath = join(
    topological_data_analysis_data_dir, "semeval_2010_14_word_senses.joblib"
)
semeval_2010_14_word_senses = joblib.load(semeval_word_senses_filepath)

In [None]:
# Split the "all" entry of the SemEval data into two aligned arrays: its keys
# (target words) and its values (referred to as GS clusters in this notebook)
semeval_target_word_tps_scores = dict()
semeval_target_words = np.array([*semeval_2010_14_word_senses["all"].keys()])
semeval_gs_clusters = np.array([*semeval_2010_14_word_senses["all"].values()])

In [None]:
# Constants
tps_neighbourhood_sizes = [10, 40, 50, 60, 100]

# One row per word embedding model:
# (name, model name, is external?, directory path)
tps_word_embeddings_specs = [
    (
        "enwiki",
        "enwiki",
        False,
        join(word2vec_training_dir, "word2vec_enwiki_jan_2021_word2phrase"),
    ),
    (
        "semeval_2010_task_14",
        "semeval_2010_task_14",
        False,
        join(word2vec_training_dir, "word2vec_semeval_2010_task_14"),
    ),
    (
        "fasttext_cc_300d",
        "cc.en.300.vec",
        True,
        join(word_embeddings_data_dir, "fastText"),
    ),
    (
        "glove_cc_840b_300d",
        "glove.840B.300d",
        True,
        join(word_embeddings_data_dir, "GloVe"),
    ),
    (
        "google_news_3m",
        "GoogleNews-vectors-negative300",
        True,
        join(word_embeddings_data_dir, "GoogleNews"),
    ),
]

# Unzip the rows into the four parallel lists used by the rest of the notebook
(
    tps_word_embeddings_names,
    tps_word_embeddings_model_names,
    tps_word_embeddings_is_external,
    tps_word_embeddings_paths,
) = map(list, zip(*tps_word_embeddings_specs))

# Column names of the result tables
tps_vs_gs_key = "TPS_n vs. GS"
tps_vs_synsets_key = "TPS_n vs. synsets"
tps_vs_frequency_key = "TPS_n vs. frequency"

# Number of leading entries of `word_counts` used in the frequency correlation
num_top_k_words_frequencies = 10000

In [None]:
# Create tables
# For every word embedding model, build one dictionary with the Pearson
# correlation between precomputed TPS_n scores (loaded from .npy files) and
#   - the SemEval GS clusters of the target words found in the vocabulary,
#   - the number of WordNet synsets of the vocabulary words that have any,
#   - the word counts (only for the non-external models, which provide them).
# NOTE(review): the score files are read from the same directory for every
# embedding model; presumably they should be model-specific — confirm.
tps_experiment_table_dicts = []
for (
    word_embeddings_name,
    word_embeddings_model_name,
    word_embeddings_path,
    word_embeddings_is_external,
) in zip(
    tps_word_embeddings_names,
    tps_word_embeddings_model_names,
    tps_word_embeddings_paths,
    tps_word_embeddings_is_external,
):
    # Load data
    if word_embeddings_is_external:
        # External models only come with a text file holding one word per line
        model_words_filepath = join(
            word_embeddings_path, f"{word_embeddings_model_name}_words.txt"
        )
        with open(model_words_filepath, "r") as words_file:
            model_words = np.array(words_file.read().split("\n"))
        word_to_int = {word: i for i, word in enumerate(model_words)}
    else:
        print(f"Loading {word_embeddings_name} word embeddings...")
        w2v_training_output = load_model_training_output(
            model_training_output_dir=word_embeddings_path,
            model_name="word2vec",
            dataset_name=word_embeddings_model_name,
        )
        model_words = w2v_training_output["words"]
        word_to_int = w2v_training_output["word_to_int"]
        word_counts = w2v_training_output["word_counts"]
        print("Done!")

    # Filter SemEval words in vocabulary
    semeval_target_words_in_vocab_filter = [
        i for i, word in enumerate(semeval_target_words) if word in word_to_int
    ]
    semeval_target_words_in_vocab = semeval_target_words[
        semeval_target_words_in_vocab_filter
    ]
    semeval_gs_clusters_in_vocab = semeval_gs_clusters[
        semeval_target_words_in_vocab_filter
    ]
    num_semeval_words = len(semeval_gs_clusters_in_vocab)

    # Find words in vocabulary that have synsets in Wordnet
    wordnet_synsets_words_in_vocab_meanings = []
    print("Find words in vocabulary that have synsets in Wordnet...")
    for word in tqdm(model_words):
        num_synsets_word = len(wn.synsets(word))
        if num_synsets_word > 0:
            wordnet_synsets_words_in_vocab_meanings.append(num_synsets_word)

    result_dict: dict = {
        "n": tps_neighbourhood_sizes,
        tps_vs_gs_key: [],
        tps_vs_synsets_key: [],
    }
    if not word_embeddings_is_external:
        # Word counts are only available for the non-external models
        result_dict[tps_vs_frequency_key] = []

    # Fill in dictionary
    for n_size in tps_neighbourhood_sizes:

        # TPS_n score vs. GS
        tps_scores_semeval = np.load(
            join(topological_polysemy_experimentation_dir, f"tps_{n_size}_vs_gs.npy")
        )
        tps_score_vs_gs_correlation, _ = pearsonr(
            x=tps_scores_semeval, y=semeval_gs_clusters_in_vocab
        )
        result_dict[tps_vs_gs_key].append(tps_score_vs_gs_correlation)

        # TPS_n score vs. Synsets
        tps_scores_wordnet_synsets = np.load(
            join(
                topological_polysemy_experimentation_dir, f"tps_{n_size}_vs_synsets.npy"
            )
        )
        tps_score_vs_wordnet_synsets_correlation, _ = pearsonr(
            x=tps_scores_wordnet_synsets, y=wordnet_synsets_words_in_vocab_meanings
        )
        result_dict[tps_vs_synsets_key].append(tps_score_vs_wordnet_synsets_correlation)

        # TPS_n score vs. frequency
        if not word_embeddings_is_external:
            # Fix: `tps_score_word_frequencies` was read here without ever being
            # assigned, which raised a NameError. It is now loaded from disk
            # like the two score arrays above.
            # NOTE(review): the file name follows the `tps_{n}_vs_<target>.npy`
            # pattern of the sibling files — confirm it matches the name the
            # experiment script actually writes.
            tps_score_word_frequencies = np.load(
                join(
                    topological_polysemy_experimentation_dir,
                    f"tps_{n_size}_vs_frequency.npy",
                )
            )
            tps_score_vs_word_frequency_correlation, _ = pearsonr(
                x=tps_score_word_frequencies,
                y=word_counts[:num_top_k_words_frequencies],
            )
            result_dict[tps_vs_frequency_key].append(
                tps_score_vs_word_frequency_correlation
            )
    tps_experiment_table_dicts.append(result_dict)

In [None]:
# Imports
from os import makedirs
from os.path import join
import joblib
import numpy as np
rng_seed = 399
np.random.seed(rng_seed)
from scipy.stats import pearsonr
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()
from tqdm.auto import tqdm
import pandas as pd
import gudhi as gd
from gudhi.wasserstein import wasserstein_distance

from nltk.corpus import wordnet as wn

import plotly.offline as pyo
pyo.init_notebook_mode()

# Directory constants (relative paths)
root_code_dir = ".."
topological_data_analysis_data_dir = "data"
output_dir = join(root_code_dir, "output")

# Sub-directories of the output directory
(
    word2vec_training_dir,
    word2vec_ann_indices_dir,
    word2vec_cluster_analysis_dir,
) = (
    join(output_dir, output_subdir_name)
    for output_subdir_name in (
        "word2vec_training",
        "word2vec_ann_indices",
        "word2vec_cluster_analysis",
    )
)

# Extend sys path for importing custom Python files
import sys
sys.path.append(root_code_dir)

from utils import get_model_checkpoint_filepaths, pairwise_cosine_distances, words_to_vectors
from word_embeddings.word2vec import load_model_training_output
from vis_utils import plot_word_vectors
from topological_data_analysis.tda_utils import plot_persistence_diagram
from topological_polysemy import tps

# Prepare data

In [None]:
# Load the output of the enwiki word2vec training run, additionally asking
# for the normalized embeddings and the scann instance
enwiki_w2v_training_output_dir = join(
    word2vec_training_dir, "word2vec_enwiki_jan_2021_word2phrase"
)
w2v_training_output = load_model_training_output(
    model_training_output_dir=enwiki_w2v_training_output_dir,
    model_name="word2vec",
    dataset_name="enwiki",
    return_normalized_embeddings=True,
    return_scann_instance=True,
)

# Pull the entries used by the rest of the notebook out of the output dictionary
(
    last_embedding_weights_normalized,
    last_embedding_weights_scann_instance,
    words,
    word_to_int,
    word_counts,
) = (
    w2v_training_output[output_key]
    for output_key in (
        "last_embedding_weights_normalized",
        "last_embedding_weights_scann_instance",
        "words",
        "word_to_int",
        "word_counts",
    )
)

In [None]:
# Load the SemEval word senses that are stored as a joblib file in the
# topological data analysis data directory
semeval_word_senses_filepath = join(
    topological_data_analysis_data_dir, "semeval_2010_14_word_senses.joblib"
)
semeval_2010_14_word_senses = joblib.load(semeval_word_senses_filepath)

# Topological polysemy

In [None]:
# Neighbourhood sizes (n) for which TPS_n is computed
tps_neighbourhood_sizes = [10, 40, 50, 60, 100, 150, 200, 500, 1000, 1500]

# "Table 1": the "n" column plus one (initially empty) list per correlation
# column; the lists are filled in by the cells below
table_1_dict = {
    "n": tps_neighbourhood_sizes,
    **{
        correlation_column: []
        for correlation_column in (
            "TPS_n vs. GS",
            "TPS_n vs. synsets",
            "TPS_n vs. frequency",
        )
    },
}

## TPS for 100 SemEval target words

In [None]:
# Prepare data: keep only the SemEval target words (and their GS clusters)
# that are present in the word2vec vocabulary
semeval_target_word_tps_scores = dict()

# Keys and values of the "all" entry, as two aligned arrays
semeval_target_words = np.array([*semeval_2010_14_word_senses["all"].keys()])
semeval_gs_clusters = np.array([*semeval_2010_14_word_senses["all"].values()])

# Positions of the target words that have an entry in `word_to_int`
semeval_target_words_in_vocab_filter = [
    word_index
    for word_index in range(len(semeval_target_words))
    if semeval_target_words[word_index] in word_to_int
]
semeval_target_words_in_vocab = semeval_target_words[
    semeval_target_words_in_vocab_filter
]
semeval_gs_clusters_in_vocab = semeval_gs_clusters[
    semeval_target_words_in_vocab_filter
]

num_semeval_words = len(semeval_gs_clusters_in_vocab)

In [None]:
# Compute TPS for 100 SemEval target words
# For every neighbourhood size: compute the TPS score of each in-vocabulary
# SemEval target word, correlate the scores with the GS clusters (Pearson),
# store the result and show a scatter plot.
semeval_target_word_tps_scores = {}

# Start from an empty column so that re-running this cell does not append a
# second set of correlations (which would break building the table later on)
table_1_dict["TPS_n vs. GS"] = []

for tps_neighbourhood_size in tps_neighbourhood_sizes:
    print(f"Neighbourhood size: {tps_neighbourhood_size}")

    # Compute TPS scores
    semeval_tps_scores = []
    for semeval_target_word in tqdm(semeval_target_words_in_vocab):
        tps_score = tps(
            target_word=semeval_target_word,
            word_to_int=word_to_int,
            neighbourhood_size=tps_neighbourhood_size,
            word_embeddings_normalized=last_embedding_weights_normalized,
            ann_instance=last_embedding_weights_scann_instance,
        )
        semeval_tps_scores.append(tps_score)

    # Compute correlation
    semeval_tps_score_gs_corr, semeval_tps_score_gs_corr_p_value = pearsonr(
        x=semeval_tps_scores,
        y=semeval_gs_clusters_in_vocab,
    )

    # Set result
    semeval_target_word_tps_scores[tps_neighbourhood_size] = {
        "tps_scores": semeval_tps_scores,
        "gs_tps_correlation": semeval_tps_score_gs_corr,
    }
    table_1_dict["TPS_n vs. GS"].append(semeval_tps_score_gs_corr)

    # Plot TPS scores to GS
    plt.figure(figsize=(10, 5))
    plt.scatter(
        x=semeval_tps_scores,
        y=semeval_gs_clusters_in_vocab,
    )
    plt.xlabel("TPS")
    plt.ylabel("Clusters in GS")
    plt.title(
        f"Correlation: {semeval_tps_score_gs_corr:.5f}, "
        f"p-value: {semeval_tps_score_gs_corr_p_value:.5f}"
    )
    plt.show()

## TPS for Wordnet synsets that are in vocabulary

In [None]:
# Map every vocabulary word that has at least one synset in Wordnet to its
# number of synsets (words without synsets are left out)
vocab_word_synset_counts = ((word, len(wn.synsets(word))) for word in tqdm(words))
wordnet_synsets_in_vocab = {
    vocab_word: num_synsets
    for vocab_word, num_synsets in vocab_word_synset_counts
    if num_synsets > 0
}

In [None]:
# Split the word -> number-of-synsets mapping into two aligned lists
wordnet_synsets_words_in_vocab = [*wordnet_synsets_in_vocab.keys()]
wordnet_synsets_words_in_vocab_meanings = [*wordnet_synsets_in_vocab.values()]

In [None]:
# For every neighbourhood size: compute the TPS score of each vocabulary word
# that has Wordnet synsets, correlate the scores with the number of synsets
# (Pearson), store the correlation in the table and show a scatter plot.

# Start from an empty column so that re-running this cell does not append a
# second set of correlations (which would break building the table later on)
table_1_dict["TPS_n vs. synsets"] = []

for tps_neighbourhood_size in tps_neighbourhood_sizes:
    print(f"Neighbourhood size: {tps_neighbourhood_size}")

    # Compute TPS scores
    tps_scores = []
    for word in tqdm(wordnet_synsets_words_in_vocab):
        # Fix: this call used to pass `word_embeddings=last_embedding_weights`,
        # a name that is not defined in any visible cell of this notebook
        # (NameError). It now uses the same arguments as the `tps` call in the
        # SemEval cell; the explicit `None` arguments were dropped as well.
        # NOTE(review): assumes the dropped parameters default to None — confirm
        # against the `tps` signature.
        tps_score = tps(
            target_word=word,
            word_to_int=word_to_int,
            neighbourhood_size=tps_neighbourhood_size,
            word_embeddings_normalized=last_embedding_weights_normalized,
            ann_instance=last_embedding_weights_scann_instance,
        )
        tps_scores.append(tps_score)

    # Compute correlation
    tps_score_synsets_corr, _ = pearsonr(
        x=tps_scores,
        y=wordnet_synsets_words_in_vocab_meanings,
    )
    table_1_dict["TPS_n vs. synsets"].append(tps_score_synsets_corr)

    # Plot TPS scores to Wordnet synsets
    plt.figure(figsize=(10, 5))
    plt.scatter(
        x=tps_scores,
        y=wordnet_synsets_words_in_vocab_meanings,
    )
    plt.xlabel("TPS")
    plt.ylabel("Synsets in Wordnet")
    plt.title(f"Correlation: {tps_score_synsets_corr:.5f}")
    plt.show()

## TPS for top 10k words (vs. word frequencies)

In [None]:
# Use the first `num_top_k_words` entries of the word counts for the
# frequency correlation, and (re)start the table column as an empty list
num_top_k_words = 10_000
top_k_frequencies = word_counts[0:num_top_k_words]
table_1_dict["TPS_n vs. frequency"] = list()

In [None]:
# For every neighbourhood size: compute the TPS score of the first
# `num_top_k_words` vocabulary words, correlate the scores with their word
# counts (Pearson), store the correlation in the table and show a scatter plot.

# Start from an empty column so that re-running this cell does not append a
# second set of correlations (which would break building the table later on)
table_1_dict["TPS_n vs. frequency"] = []

for tps_neighbourhood_size in tps_neighbourhood_sizes:
    print(f"Neighbourhood size: {tps_neighbourhood_size}")

    # Compute TPS scores
    tps_scores = []
    for word in tqdm(words[:num_top_k_words]):
        # Fix: this call used to pass `word_embeddings=last_embedding_weights`,
        # a name that is not defined in any visible cell of this notebook
        # (NameError). It now uses the same arguments as the `tps` call in the
        # SemEval cell; the explicit `None` arguments were dropped as well.
        # NOTE(review): assumes the dropped parameters default to None — confirm
        # against the `tps` signature.
        tps_score = tps(
            target_word=word,
            word_to_int=word_to_int,
            neighbourhood_size=tps_neighbourhood_size,
            word_embeddings_normalized=last_embedding_weights_normalized,
            ann_instance=last_embedding_weights_scann_instance,
        )
        tps_scores.append(tps_score)

    # Compute correlation
    tps_score_frequency_corr, _ = pearsonr(
        x=tps_scores,
        y=top_k_frequencies,
    )
    table_1_dict["TPS_n vs. frequency"].append(tps_score_frequency_corr)

    # Plot TPS scores to word frequencies
    plt.figure(figsize=(10, 5))
    plt.scatter(
        x=tps_scores,
        y=top_k_frequencies,
    )
    plt.xlabel("TPS")
    plt.ylabel("Word frequency")
    plt.title(f"Correlation: {tps_score_frequency_corr:.5f}")
    plt.show()

In [None]:
# Show "Table 1": one row per neighbourhood size n, one column per correlation
table_1_df = pd.DataFrame(table_1_dict).set_index("n")
table_1_df