In [None]:
# Imports
from os import makedirs
from os.path import join
import pickle
import numpy as np
import pandas as pd
# Seed NumPy's global RNG up front; `rng_seed` is kept as a name so it can be
# reused wherever an explicit seed is needed.
rng_seed = 399
np.random.seed(rng_seed)
from scipy.spatial.distance import pdist, cdist, squareform
from matplotlib import pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to matplotlib figures
sns.set_theme()
from tqdm.auto import tqdm
import sys
# Add the parent directory to the import path; presumably `utils` and
# `analysis_utils` (imported at the bottom of this cell) live there -- TODO confirm
sys.path.append("..")

import plotly.express as px
import plotly.offline as pyo
# Initialise plotly's offline notebook mode
pyo.init_notebook_mode()

from umap import UMAP
from sklearn.decomposition import PCA

from sklearn.model_selection import ParameterGrid
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import pairwise_distances

# Project-local helpers
from utils import get_model_checkpoint_filepaths
from analysis_utils import words_in_clusters, plot_silhouette_scores

In [None]:
# Look up the checkpoint files of the word2vec training run and load the
# last entry in its list of intermediate embedding weight files
checkpoint_filepaths_dict = get_model_checkpoint_filepaths(
    model_name="word2vec",
    dataset_name="enwiki",
    output_dir="../output/word2vec_training/17-Oct-2020_01-00-56",
)
embedding_weight_filepaths = checkpoint_filepaths_dict["intermediate_embedding_weight_filepaths"]
last_embedding_weights_filepath = embedding_weight_filepaths[-1]
# Open the file memory-mapped (read-only), then copy it into memory as float64
last_embedding_weights = np.load(last_embedding_weights_filepath, mmap_mode="r").astype(np.float64)

In [None]:
# Read the training vocabulary (newline-separated) and map every word to its
# position in the `words` array
with open(checkpoint_filepaths_dict["train_words_filepath"], "r") as file:
    vocabulary_text = file.read()
words = np.array(vocabulary_text.split("\n"))
word_to_int = dict(zip(words, range(len(words))))

In [None]:
# Load country-capital data and keep only the pairs whose country AND capital
# both appear in the vocabulary (`words`)
country_capital_df = pd.read_csv("data/country_capitals.csv")
# Vectorised row-wise reduction: equivalent to `.apply(all, axis=1)` but does not
# call the Python builtin once per row
country_capital_pairs_in_vocab = country_capital_df[["country", "city"]].isin(words).all(axis=1)
country_capital_in_vocab_df = country_capital_df[country_capital_pairs_in_vocab]
print(f"Total {len(country_capital_df)} country/capital pairs, of them {len(country_capital_in_vocab_df)} in vocabulary.")

In [None]:
# Scatter plot of every capital in the CSV (not only the in-vocabulary ones)
# at its longitude/latitude; country and city names are shown on hover
fig = px.scatter(
    data_frame=country_capital_df,
    x="lng",
    y="lat",
    hover_data=["country", "city"],
    labels=dict(lng="Longitude", lat="Latitude"),
    title="Capitals of countries of the world in lat/lng coordinates",
)
fig.show()

In [None]:
# Get word vectors of country capitals
countries = country_capital_in_vocab_df["country"].values
country_capitals = country_capital_in_vocab_df["city"].values
# Row index of each capital in the embedding matrix. The original loop iterated
# over an undefined name `cities`; the capitals live in `country_capitals`.
# `country_capital_in_vocab_df` was filtered against the vocabulary, so every
# capital is expected to be a key of `word_to_int`.
country_capital_word_indices = [word_to_int[city] for city in country_capitals]
# A single fancy-index lookup copies the selected rows into a new
# (num_capitals, embedding_dim) array
city_word_vecs = last_embedding_weights[country_capital_word_indices]

In [None]:
# Compute K-means cluster labels of the capital word vectors, one labelling per
# number of clusters in `country_capital_cluster_sizes`
country_capital_cluster_sizes = [5, 6, 7]
country_capital_cluster_labels = []
for k in country_capital_cluster_sizes:
    # Seed each fit explicitly: without `random_state` the centroid initialisation
    # draws from the global NumPy RNG, so re-running this cell (or changing any
    # earlier cell that consumes random numbers) would produce different labels.
    cluster_labels = KMeans(n_clusters=k, random_state=rng_seed).fit_predict(city_word_vecs)
    country_capital_cluster_labels.append(cluster_labels)

In [None]:
# Compute 2D UMAP embedding of the capital word vectors (cosine metric)
city_word_vecs_umap_2d = UMAP(
    n_components=2,
    n_neighbors=20,
    min_dist=0.15,
    metric="cosine",
    # Fix the seed so the layout is identical on every run of this cell;
    # UMAP is stochastic and otherwise varies between runs.
    random_state=rng_seed,
).fit_transform(city_word_vecs)

In [None]:
# Draw one scatter plot of the 2D UMAP coordinates per clustering, colouring
# each capital by its cluster label and showing country/capital on hover
umap_x = city_word_vecs_umap_2d[:, 0]
umap_y = city_word_vecs_umap_2d[:, 1]
for cluster_size, cluster_labels in zip(country_capital_cluster_sizes, country_capital_cluster_labels):
    plot_title = f"Capitals of countries of the world in UMAP coordinates with {cluster_size} clusters"
    fig = px.scatter(
        x=umap_x,
        y=umap_y,
        color=cluster_labels,
        title=plot_title,
        labels=dict(x="UMAP 1", y="UMAP 2"),
        hover_data={"country": countries, "capital": country_capitals},
    )
    fig.show()