In [None]:
# Imports
from os import makedirs
from os.path import join
import re
import pickle
import numpy as np
import pandas as pd
rng_seed = 399
np.random.seed(rng_seed)
from scipy.spatial.distance import pdist, cdist, squareform
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()
from tqdm.auto import tqdm
import sys
sys.path.append("..")

import plotly.express as px
import plotly.offline as pyo
pyo.init_notebook_mode()

from umap import UMAP
from sklearn.decomposition import PCA

from sklearn.model_selection import ParameterGrid
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import pairwise_distances

import gudhi as gd

from utils import get_model_checkpoint_filepaths, pairwise_cosine_distances
from analysis_utils import words_in_clusters, plot_silhouette_scores
from text_preprocessing_utils import preprocess_text

## Prepare data

In [None]:
# Look up the checkpoint file paths of the word2vec training run
checkpoint_filepaths_dict = get_model_checkpoint_filepaths(
    model_name="word2vec",
    dataset_name="enwiki",
    output_dir="../output/word2vec_training/03-Nov-2020_11-01-00",
)

# Take the last entry of the intermediate embedding weight files and load it
# (memory-mapped read, then converted to a float64 array)
last_embedding_weights_filepath = checkpoint_filepaths_dict[
    "intermediate_embedding_weight_filepaths"
][-1]
last_embedding_weights = np.load(
    last_embedding_weights_filepath,
    mmap_mode="r",
).astype(np.float64)

In [None]:
# Read the training words (one per line) and map every word to its position in the array
with open(checkpoint_filepaths_dict["train_words_filepath"], "r") as file:
    words = np.array(file.read().split("\n"))
word_to_int = dict(zip(words, range(len(words))))

In [None]:
# Load country-capital data; the cells below read its "country", "capital",
# "region", "longitude" and "latitude" columns
country_capital_df = pd.read_csv("data/country-info.csv")

In [None]:
# Keep only the rows where both the country word and the capital word occur in `words`.
# `.all(axis=1)` is the vectorized row-wise reduction (replaces the slower `.apply(all, axis=1)`).
country_capital_pairs_in_vocab = country_capital_df[["country", "capital"]].isin(words).all(axis=1)
country_capital_in_vocab_df = country_capital_df[country_capital_pairs_in_vocab]
print(f"Total {len(country_capital_df)} country/capital pairs, of them {len(country_capital_in_vocab_df)} in vocabulary.")

In [None]:
# Display the country/capital rows that remain after the vocabulary filter
country_capital_in_vocab_df

## Visualize countries with continent as label

In [None]:
# Scatter plot with longitude on the x-axis and latitude on the y-axis,
# colored by the "region" column; country and capital are shown on hover
fig = px.scatter(
    data_frame=country_capital_in_vocab_df,
    x="longitude",
    y="latitude",
    color="region",
    hover_data=["country", "capital"],
    labels={"longitude": "Longitude", "latitude": "Latitude"},
    title="Capitals of countries of the world in lat/lng coordinates",
)
fig.show()

## Compute cluster labels using country words and capital words

In [None]:
def cluster_country_capital_words(
    words: np.ndarray,
    embedding_weights: np.ndarray,
    word_to_int: dict,
    cluster_sizes: list,
    embedders: list,
    random_state: int = None,
):
    """
    Looks up the word vectors of `words`, clusters them with k-means for every
    requested number of clusters, and projects them with every given embedder.

    Parameters
    ----------
    words : np.ndarray
        Words to look up; every word must be a key of `word_to_int`.
    embedding_weights : np.ndarray
        2D array of word embeddings, indexed by the values of `word_to_int`.
    word_to_int : dict
        Maps a word to its row index in `embedding_weights`.
    cluster_sizes : list
        Numbers of clusters (k) to run KMeans with.
    embedders : list
        List of `(name, embedder)` tuples; each embedder must expose
        `fit_transform(X)`. Note that the embedder objects are fitted in place.
    random_state : int, optional
        Seed forwarded to KMeans. The default (None) keeps the previous
        behaviour, where KMeans falls back to NumPy's global random state.

    Returns
    -------
    word_vecs : np.ndarray
        float64 array of shape (len(words), embedding dimension).
    cluster_labels : dict
        Maps each k in `cluster_sizes` to the KMeans labels of `word_vecs`.
    embeddings : dict
        Maps each embedder name to the result of its `fit_transform(word_vecs)`.

    Raises
    ------
    KeyError
        If one or more of `words` are not present in `word_to_int`.
    """
    # Fail early with a message that names every missing word
    missing_words = [word for word in words if word not in word_to_int]
    if missing_words:
        raise KeyError(f"Words missing from word_to_int: {missing_words}")

    # Gather all word vectors with a single fancy-indexing operation
    word_indices = np.array([word_to_int[word] for word in words], dtype=np.intp)
    word_vecs = np.array(embedding_weights[word_indices], dtype=np.float64)

    # Compute cluster labels
    cluster_labels = {
        k: KMeans(n_clusters=k, random_state=random_state).fit_predict(word_vecs)
        for k in cluster_sizes
    }

    # Create embeddings (the loop variable no longer shadows the `embedders` argument)
    embeddings = {}
    for embedding_name, embedder in embedders:
        embeddings[embedding_name] = embedder.fit_transform(word_vecs)

    return word_vecs, cluster_labels, embeddings

In [None]:
# Country words and capital words (same row order as country_capital_in_vocab_df)
countries = country_capital_in_vocab_df["country"].values
country_capitals = country_capital_in_vocab_df["capital"].values

# Constants
country_capital_cluster_sizes = [5]  # disabled alternative: [5, 6, 7]

# (name, embedder) pairs used to project the word vectors to 2 dimensions.
# UMAP's n_neighbors / min_dist are left at their defaults
# (n_neighbors=20 and min_dist=0.15 are disabled alternatives).
country_capital_embedders = [
    ("PCA", PCA(n_components=2, random_state=rng_seed)),
    ("UMAP", UMAP(n_components=2, metric="cosine", random_state=rng_seed)),
]

# Arguments shared by both clustering runs
country_capital_clustering_kwargs = dict(
    embedding_weights=last_embedding_weights,
    word_to_int=word_to_int,
    cluster_sizes=country_capital_cluster_sizes,
    embedders=country_capital_embedders,
)

# -- Do clustering --
# - Country words
(
    country_word_vecs,
    country_cluster_labels,
    country_embeddings,
) = cluster_country_capital_words(words=countries, **country_capital_clustering_kwargs)

# - Capital words
(
    country_capital_word_vecs,
    country_capital_cluster_labels,
    country_capital_embeddings,
) = cluster_country_capital_words(words=country_capitals, **country_capital_clustering_kwargs)

In [None]:
def visualize_country_capital_embedding(
    cluster_labels: dict,
    word_vector_embeddings: np.ndarray,
    coordinate_system: str,
    countries: np.ndarray,
    capitals: np.ndarray,
    print_countries_in_clusters: bool = False,
    country_info_df: pd.DataFrame = None,
):
    """
    Shows one scatter plot per clustering, with points colored by cluster label.

    Parameters
    ----------
    cluster_labels : dict
        Maps a number of clusters to the cluster label of every point.
    word_vector_embeddings : np.ndarray
        2D coordinates of the points; column 0 is plotted on the x-axis and
        column 1 on the y-axis.
    coordinate_system : str
        Name of the coordinate system, used in the plot title.
    countries : np.ndarray
        Country names, shown on hover and used when printing clusters.
    capitals : np.ndarray
        Capital names, shown on hover.
    print_countries_in_clusters : bool, optional
        If True, prints the countries of every cluster together with their
        regions (defaults to False).
    country_info_df : pd.DataFrame, optional
        Data frame with "country" and "region" columns used for the region
        lookup. If None, the notebook-level `country_capital_in_vocab_df` is
        used, as before.
    """
    # Only resolve the notebook-level fallback when the region lookup is needed
    region_lookup_df = country_info_df
    if print_countries_in_clusters and region_lookup_df is None:
        region_lookup_df = country_capital_in_vocab_df

    # Visualize embedding (the loop variable no longer shadows `cluster_labels`)
    for cluster_size, labels in cluster_labels.items():

        # Plot
        fig = px.scatter(
            x=word_vector_embeddings[:, 0],
            y=word_vector_embeddings[:, 1],
            title=f"Capitals of countries of the world in {coordinate_system} coordinates with {cluster_size} clusters",
            labels={"x": "x1", "y": "x2"},
            color=labels,
            # Use the `capitals` argument rather than a notebook-level variable
            hover_data={"country": countries, "capital": capitals},
        )
        fig.show()

        if print_countries_in_clusters:
            # The first element returned by words_in_clusters is iterated as
            # one group of country words per cluster
            cluster_words, _ = words_in_clusters(labels, countries)
            print("-- Countries (and regions) in clusters --")
            for word_cluster in cluster_words:
                print("Countries", word_cluster)

                in_cluster = region_lookup_df["country"].isin(word_cluster)
                regions = region_lookup_df.loc[in_cluster, "region"].values
                print("Regions", regions)
                print()

In [None]:
# Lat/lng visualization - using countries for clustering
# Points are placed at the longitude/latitude columns and labelled with the
# clusters computed from the country word vectors
visualize_country_capital_embedding(
    cluster_labels=country_cluster_labels,
    word_vector_embeddings=country_capital_in_vocab_df[["longitude", "latitude"]].values,
    coordinate_system="lat/lng",
    countries=countries,
    capitals=country_capitals,
    print_countries_in_clusters=False
)

In [None]:
# Lat/lng visualization - using capital of countries for clustering
# Same longitude/latitude positions, labelled with the clusters computed
# from the capital word vectors
visualize_country_capital_embedding(
    cluster_labels=country_capital_cluster_labels,
    word_vector_embeddings=country_capital_in_vocab_df[["longitude", "latitude"]].values,
    coordinate_system="lat/lng",
    countries=countries,
    capitals=country_capitals,
    print_countries_in_clusters=False
)

In [None]:
# UMAP visualization - using countries for clustering
# Points are placed at the UMAP embedding of the country word vectors and
# labelled with the clusters computed from the same vectors
visualize_country_capital_embedding(
    cluster_labels=country_cluster_labels,
    word_vector_embeddings=country_embeddings["UMAP"],
    coordinate_system="UMAP",
    countries=countries,
    capitals=country_capitals,
    print_countries_in_clusters=False
)

In [None]:
# UMAP visualization - using capital of countries for clustering
# NOTE(review): the points are positioned by the UMAP embedding of the *country*
# word vectors (country_embeddings), while the labels come from the capital
# word vectors. country_capital_embeddings["UMAP"] is computed earlier but is
# not used in the visible cells - confirm that this is intended.
visualize_country_capital_embedding(
    cluster_labels=country_capital_cluster_labels,
    word_vector_embeddings=country_embeddings["UMAP"],
    coordinate_system="UMAP",
    countries=countries,
    capitals=country_capitals,
    print_countries_in_clusters=False
)

In [None]:
# PCA visualization - using countries for clustering
# Points are placed at the PCA embedding of the country word vectors and
# labelled with the clusters computed from the same vectors
visualize_country_capital_embedding(
    cluster_labels=country_cluster_labels,
    word_vector_embeddings=country_embeddings["PCA"],
    coordinate_system="PCA",
    countries=countries,
    capitals=country_capitals,
    print_countries_in_clusters=False
)

In [None]:
# PCA visualization - using capital of countries for clustering
# NOTE(review): the points are positioned by the PCA embedding of the *country*
# word vectors (country_embeddings), while the labels come from the capital
# word vectors. country_capital_embeddings["PCA"] is computed earlier but is
# not used in the visible cells - confirm that this is intended.
visualize_country_capital_embedding(
    cluster_labels=country_capital_cluster_labels,
    word_vector_embeddings=country_embeddings["PCA"],
    coordinate_system="PCA",
    countries=countries,
    capitals=country_capitals,
    print_countries_in_clusters=False
)

## Vietoris-Rips (VR) complex and persistence diagrams of countries and capitals

In [None]:
# Pairwise cosine distances between the country word vectors and between
# the capital word vectors
word_embeddings_distances_country = pairwise_cosine_distances(country_word_vecs)
word_embeddings_distances_country_capital = pairwise_cosine_distances(country_capital_word_vecs)

# (display name, distance matrix) pairs
precomputed_distances_matrices = list(
    zip(
        ("Country", "Country capital"),
        (word_embeddings_distances_country, word_embeddings_distances_country_capital),
    )
)

In [None]:
# For every precomputed distance matrix: build a Vietoris-Rips complex,
# compute its persistence and show the persistence diagram
for name, distance_matrix in precomputed_distances_matrices:
    print(f"-- {name} --")
    
    # Building Vietoris-Rips complex from the precomputed distance matrix
    # (max_edge_length is left at the library default; the 0.8 value is disabled)
    skeleton_word2vec = gd.RipsComplex(
        distance_matrix=distance_matrix,
        # max_edge_length=0.8
    )
    
    # Create the simplex tree with max_dimension=2, compute its persistence
    # and plot the result
    simplex_tree = skeleton_word2vec.create_simplex_tree(max_dimension=2)
    barcodes = simplex_tree.persistence()
    gd.plot_persistence_diagram(barcodes)
    plt.show()