In [None]:
# Imports
from os import makedirs
from os.path import join
import re
import pickle
import numpy as np
import pandas as pd
rng_seed = 399
np.random.seed(rng_seed)
from scipy.spatial.distance import pdist, cdist, squareform
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()
from tqdm.auto import tqdm
import sys
sys.path.append("..")

import plotly.express as px
import plotly.offline as pyo
pyo.init_notebook_mode()

from umap import UMAP
from sklearn.decomposition import PCA

from sklearn.model_selection import ParameterGrid
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import pairwise_distances

import requests

from utils import get_model_checkpoint_filepaths
from analysis_utils import words_in_clusters, plot_silhouette_scores
from text_preprocessing_utils import preprocess_text

In [None]:
# Look up the checkpoint file paths of the word2vec training run
checkpoint_filepaths_dict = get_model_checkpoint_filepaths(
    model_name="word2vec",
    dataset_name="enwiki",
    output_dir="../output/word2vec_training/31-Oct-2020_14-45-28",
)

# Pick the last entry of the intermediate embedding weight files and load it.
# The file is opened memory-mapped (read-only) and then converted to float64.
embedding_weight_filepaths = checkpoint_filepaths_dict["intermediate_embedding_weight_filepaths"]
last_embedding_weights_filepath = embedding_weight_filepaths[-1]
last_embedding_weights = np.load(
    last_embedding_weights_filepath,
    mmap_mode="r",
).astype(np.float64)

In [None]:
# Read the training words (newline-separated) and map every word to its position
with open(checkpoint_filepaths_dict["train_words_filepath"], "r") as file:
    vocabulary_text = file.read()
words = np.array(vocabulary_text.split("\n"))
word_to_int = dict(zip(words, range(len(words))))

In [None]:
# Load country-capital data
# country_capital_df = pd.read_csv("data/country_capitals.csv")

In [None]:
def preprocess_name(name: str) -> str:
    """
    Normalizes a country or capital name into a single underscore-joined string.

    If the name contains a parenthesised part that is preceded by at least one
    character, the first such part is removed (text before and after it is
    kept and surrounding whitespace is stripped). The result is tokenized with
    `preprocess_text` and the tokens are joined using underscores.

    Parameters
    ----------
    name : str
        Raw name, e.g. "Korea (Republic of)".

    Returns
    -------
    name : str
        Preprocessed name where tokens are separated by underscores.
    """
    # Raw string literal: "\(" inside a plain string is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    parenthesis_match = re.match(r"^(.+?)\(.*?\)(.*?)$", name)
    if parenthesis_match is not None:
        # Glue together the text before and after the parenthesised part
        name = "".join(parenthesis_match.groups()).strip()
    name = " ".join(preprocess_text(name)).replace(" ", "_")
    return name

In [None]:
# Get DataFrame with countries, capitals, regions, lat and lng
# NOTE(review): the restcountries.eu host may no longer be served — confirm the endpoint is reachable.
# The timeout prevents the cell from hanging indefinitely on a dead connection, and
# raise_for_status surfaces HTTP errors directly instead of as a JSON decoding failure.
req = requests.get("https://restcountries.eu/rest/v2", timeout=30)
req.raise_for_status()
req_json = req.json()
country_capital_df_dict = {
    "country": [],
    "capital": [],
    "region": [],
    "latitude": [],
    "longitude": [],
}
for country in req_json:

    # Skip entries without a capital (key missing, None or empty string)
    if not country.get("capital"):
        continue

    # Use NaN coordinates when "latlng" is missing or does not hold two values
    latlng = country.get("latlng") or []
    latitude, longitude = latlng[:2] if len(latlng) >= 2 else (np.nan, np.nan)

    # Add to dict
    country_capital_df_dict["country"].append(preprocess_name(country["name"]))
    country_capital_df_dict["capital"].append(preprocess_name(country["capital"]))
    country_capital_df_dict["region"].append(country["region"])
    country_capital_df_dict["latitude"].append(latitude)
    country_capital_df_dict["longitude"].append(longitude)

country_capital_df = pd.DataFrame(country_capital_df_dict)

In [None]:
# Keep only the pairs where both the country and the capital are vocabulary words
in_vocab_flags = country_capital_df[["country", "capital"]].isin(words)
country_capital_pairs_in_vocab = in_vocab_flags.all(axis=1)
country_capital_in_vocab_df = country_capital_df.loc[country_capital_pairs_in_vocab]

num_pairs_total = len(country_capital_df)
num_pairs_in_vocab = len(country_capital_in_vocab_df)
print(f"Total {num_pairs_total} country/capital pairs, of them {num_pairs_in_vocab} in vocabulary.")

In [None]:
# Scatter plot of the lat/lng coordinates, coloured by region;
# country and capital names are shown on hover
hover_columns = ["country", "capital"]
axis_labels = {"longitude": "Longitude", "latitude": "Latitude"}
fig = px.scatter(
    data_frame=country_capital_df,
    x="longitude",
    y="latitude",
    color="region",
    hover_data=hover_columns,
    labels=axis_labels,
    title="Capitals of countries of the world in lat/lng coordinates",
)
fig.show()

In [None]:
# Collect the embedding rows of all in-vocabulary capitals
countries = country_capital_in_vocab_df["country"].values
country_capitals = country_capital_in_vocab_df["capital"].values

# Translate each capital to its vocabulary index, then fetch all rows in one lookup
capital_row_indices = np.array(
    [word_to_int[capital_name] for capital_name in country_capitals],
    dtype=np.intp,
)
country_capital_word_vecs = last_embedding_weights[capital_row_indices].astype(np.float64, copy=False)

In [None]:
# Compute cluster labels
# One k-means clustering of the capital word vectors is fitted per cluster size.
# KMeans is seeded explicitly with rng_seed (as the UMAP embedding is), so that
# re-running this cell on its own yields the same labels.
country_capital_cluster_sizes = [5, 6, 7]
country_capital_cluster_labels = [
    KMeans(n_clusters=k, random_state=rng_seed).fit_predict(country_capital_word_vecs)
    for k in country_capital_cluster_sizes
]

In [None]:
# Project the capital word vectors to 2D with UMAP (cosine metric, fixed seed)
# Options left disabled by the author: n_neighbors=20, min_dist=0.15
umap_2d_reducer = UMAP(
    n_components=2,
    metric="cosine",
    random_state=rng_seed,
)
country_capital_word_vecs_umap_2d = umap_2d_reducer.fit_transform(country_capital_word_vecs)

In [None]:
# Show the 2D UMAP embedding once per clustering, coloured by cluster label
umap_x = country_capital_word_vecs_umap_2d[:, 0]
umap_y = country_capital_word_vecs_umap_2d[:, 1]
for cluster_size, cluster_labels in zip(country_capital_cluster_sizes, country_capital_cluster_labels):

    # Plot (the hover dict is built fresh for every call)
    fig = px.scatter(
        x=umap_x,
        y=umap_y,
        color=cluster_labels,
        hover_data={"country": countries, "capital": country_capitals},
        labels={"x": "UMAP 1", "y": "UMAP 2"},
        title=f"Capitals of countries of the world in UMAP coordinates with {cluster_size} clusters",
    )
    fig.show()

    # List the countries grouped by the cluster they were assigned to
    cluster_words, _ = words_in_clusters(cluster_labels, countries)
    print("Countries in clusters")
    for word_cluster in cluster_words:
        print(word_cluster)