In [None]:
# Imports and notebook-wide setup (RNG seed, plotting theme, module search path,
# plotly notebook mode). Statement order is kept as-is because several of the
# statements below have side effects.
from os import makedirs
from os.path import join
import pickle
import numpy as np
import pandas as pd
# Seed NumPy's legacy global RNG so that code drawing from `np.random` is
# repeatable between runs of this notebook.
rng_seed = 399
np.random.seed(rng_seed)
from scipy.spatial.distance import pdist, cdist, squareform
from matplotlib import pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to figures created from here on.
sns.set_theme()
from tqdm.auto import tqdm
import sys
# Put the parent directory on the module search path. This has to run before
# the `utils` / `analysis_utils` imports at the bottom of this cell
# (presumably those modules live in the parent directory — TODO confirm).
sys.path.append("..")

import plotly.express as px
import plotly.offline as pyo
# Initialise plotly's offline notebook mode so figures can be shown inline.
pyo.init_notebook_mode()

from sklearn.model_selection import ParameterGrid
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import pairwise_distances

# Project-local helpers.
from utils import get_model_checkpoint_filepaths
from analysis_utils import words_in_clusters, plot_silhouette_scores

In [None]:
# Look up the checkpoint files of the word2vec training run and load the
# embedding matrix stored in the last listed intermediate checkpoint
# (taken to be the final one, assuming the list is in training order).
checkpoint_filepaths_dict = get_model_checkpoint_filepaths(
    model_name="word2vec",
    dataset_name="enwiki",
    output_dir="../output/word2vec_training/17-Oct-2020_01-00-56",
)
last_embedding_weights_filepath = checkpoint_filepaths_dict[
    "intermediate_embedding_weight_filepaths"
][-1]

# Open the file memory-mapped (read-only) first, then cast to float64; the
# cast produces a new array, and the memory-mapped handle is dropped by
# rebinding the name.
last_embedding_weights = np.load(last_embedding_weights_filepath, mmap_mode="r")
last_embedding_weights = last_embedding_weights.astype(np.float64)

In [None]:
# Load the training vocabulary (one word per line) and build a lookup from
# each word to its position in the vocabulary.
#
# The file is decoded explicitly as UTF-8: without `encoding=`, `open()` falls
# back to the platform's locale encoding, so non-ASCII vocabulary entries could
# be mis-decoded (or raise) on machines whose default is not UTF-8.
# NOTE(review): assumes the vocabulary file was written as UTF-8 — confirm
# against the training code that produced it.
with open(
    checkpoint_filepaths_dict["train_words_filepath"], "r", encoding="utf-8"
) as file:
    # Split on "\n" only (not `splitlines()`), so tokens containing other
    # Unicode line-boundary characters are kept intact.
    words = np.array(file.read().split("\n"))

# Presumably index i corresponds to row i of the embedding matrix — TODO confirm.
word_to_int = {word: i for i, word in enumerate(words)}

In [None]:
# Read the country/capital table from CSV into a DataFrame. Later cells in
# this notebook use its "lng", "lat", "country" and "city" columns.
country_capital_df = pd.read_csv(filepath_or_buffer="data/country_capitals.csv")

In [None]:
# Scatter every capital at its (longitude, latitude) position; hovering over a
# point additionally shows the country and city.
fig = px.scatter(
    data_frame=country_capital_df,
    x="lng",
    y="lat",
    hover_data=["country", "city"],
    labels={"lng": "Longitude", "lat": "Latitude"},
    title="Capitals of countries of the world in lat/lng coordinates",
)
fig.show()