In [None]:
# Imports and notebook-wide setup (RNG seed, plotly notebook mode, local module path)
import os
import pickle
import numpy as np
# Seed NumPy's global RNG; `rng_seed` is also passed as `random_state` to the
# KMeans / UMAP / PCA estimators created later in this notebook.
rng_seed = 399
np.random.seed(rng_seed)
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from umap import UMAP
from configparser import ConfigParser

from matplotlib import pyplot as plt
import plotly.offline as pyo
# Initialise plotly's offline notebook mode
pyo.init_notebook_mode()

import sys
# Add the parent directory to the module search path; presumably this is where the
# local modules imported below (eval_utils, utils, word2vec) live — verify the layout.
sys.path.append("..")

from importlib import reload
import eval_utils
import utils
# Reload the local helper modules so that edits made to them after the first import
# are picked up when this cell is re-run.
reload(eval_utils)
reload(utils)

from eval_utils import (
    get_word_vec,
    similar_words,
    create_embeddings_of_train_weight_checkpoints,
    visualize_embeddings_over_time,
    plot_word_relationships_2d,
    plot_word_vectors,
    evaluate_model_word_analogies
)
from utils import get_model_checkpoint_filepaths
from word2vec import Word2vec, load_model

In [None]:
# Constants
# `output_dir` points at the output directory of one specific training run.
# `checkpoint_filepaths_dict` is indexed later in this notebook with the keys
# "model_training_conf_filepath", "train_words_filepath" and
# "intermediate_embedding_weight_filepaths".
output_dir = "../output/word2vec_training/31-Oct-2020_14-45-28"
checkpoint_filepaths_dict = get_model_checkpoint_filepaths(
    output_dir=output_dir,
    model_name="word2vec",
    dataset_name="enwiki",
)

In [None]:
# Load model training configuration
# `ConfigParser.read` silently skips files it cannot open, so a wrong or missing path
# would only surface later as a missing "MODELCONFIG" section. Opening the file
# explicitly and using `read_file` makes the failure happen here, with a clear error.
model_training_conf = ConfigParser()
with open(checkpoint_filepaths_dict["model_training_conf_filepath"], "r") as conf_file:
    model_training_conf.read_file(conf_file)

In [None]:
# Read the vocabulary file (newline-separated words) into an array and build the
# reverse lookup from each word to its position in that array.
with open(checkpoint_filepaths_dict["train_words_filepath"], "r") as words_file:
    words = np.array(words_file.read().split("\n"))
word_to_int = dict(zip(words, range(len(words))))

In [None]:
# Get vocabulary size and embedding dimension used by the models, as recorded in the
# "MODELCONFIG" section of the training configuration (both parsed as integers).
vocab_size = model_training_conf["MODELCONFIG"].getint("vocab_size")
embedding_dim = model_training_conf["MODELCONFIG"].getint("embedding_dim")

In [None]:
# Get target embedding weights of last model
# The last entry of the intermediate checkpoint list is taken as the final model.
# NOTE(review): the file is opened memory-mapped, but `astype` returns a converted copy,
# so the whole matrix presumably ends up in memory as float64 — confirm this is intended.
last_embedding_weights_filepath = checkpoint_filepaths_dict["intermediate_embedding_weight_filepaths"][-1]
last_embedding_weights = np.load(last_embedding_weights_filepath, mmap_mode="r").astype(np.float64)

## Visualize training over the course of epochs

In [None]:
# Due to computational limitations, we only visualize the top 10000 most common words.
vis_embeddings_vocab_size = 10000

In [None]:
# Create embeddings of word embeddings from all train checkpoints
# Every intermediate checkpoint file is handed to the helper together with a seeded
# 10-cluster KMeans and a seeded 3-component UMAP. The two returned values are kept as
# the transformed embeddings and the cluster labels over time (how the helper applies
# the clusterer/transformer per checkpoint is defined in eval_utils — not visible here).
umap_embeddings_over_time, cluster_labels_over_time = create_embeddings_of_train_weight_checkpoints(
    model_weights_filepaths=checkpoint_filepaths_dict["intermediate_embedding_weight_filepaths"],
    vocab_size=vis_embeddings_vocab_size,
    embedding_dim=embedding_dim,
    clusterer=KMeans(n_clusters=10, random_state=rng_seed),
    transformer=UMAP(n_components=3, random_state=rng_seed)
)

In [None]:
# Visualize training
# Feeds the transformed embeddings and cluster labels of all checkpoints to the plotting
# helper, using the same vocabulary cut-off as when they were created.
# NOTE(review): `words` presumably supplies the point labels — confirm in eval_utils.
visualize_embeddings_over_time(
    transformed_word_embeddings=umap_embeddings_over_time,
    cluster_labels=cluster_labels_over_time,
    vocab_size=vis_embeddings_vocab_size,
    words=words
)

## Find similar words

In [None]:
# Query the top 10 words similar to "man" using the embeddings of the last model
# NOTE(review): vocab_size=100000 presumably limits the search to the first 100000
# words — confirm against eval_utils.similar_words.
similar_words(
    positive_words=["man"],
    weights=last_embedding_weights,
    word_to_int=word_to_int,
    words=words,
    top_n=10,
    vocab_size=100000
)

In [None]:
# Test similarities with an analogy-style query: "woman" and "king" are passed as
# positive words and "man" as negative word.
# NOTE(review): presumably this evaluates woman + king - man and is expected to rank
# "queen" highly — confirm against eval_utils.similar_words.
similar_words(
    positive_words=["woman", "king"],
    negative_words=["man"],
    weights=last_embedding_weights,
    word_to_int=word_to_int,
    words=words,
    top_n=10,
    vocab_size=100000
)

## Plot word relationships

In [None]:
# Create 2D PCA embeddings of last model
# The PCA is fitted on every row of the last embedding matrix (no vocabulary cut-off),
# so the result can be indexed with any value of `word_to_int`.
embedding_weights_2d_pca = PCA(n_components=2, random_state=rng_seed).fit_transform(last_embedding_weights)

In [None]:
# Word pairs whose relationship is drawn in the 2D PCA projection of the last model
pairs = [("man", "woman"), ("king", "queen")]
plot_word_relationships_2d(
    pairs, embedding_weights_2d_pca, word_to_int, x_label="PC 1", y_label="PC 2"
)

In [None]:
# Create 2D UMAP embeddings of last model
# Only the first `vis_embeddings_vocab_size` rows of the embedding matrix are embedded,
# reusing the notebook's visualization cut-off instead of repeating it as a literal.
# Words whose index lies beyond that cut-off have no row in the result.
embedding_weights_2d_umap = UMAP(n_components=2, random_state=rng_seed).fit_transform(
    last_embedding_weights[:vis_embeddings_vocab_size]
)

In [None]:
# Plot the number words zero through nine in the 2D UMAP space to check whether
# they form a circular shape
zero_to_nine = [
    "zero", "one", "two", "three", "four",
    "five", "six", "seven", "eight", "nine",
]
plot_word_vectors(
    zero_to_nine, embedding_weights_2d_umap, word_to_int, x_label="UMAP 1", y_label="UMAP 2"
)

## Country and Capital PCA plot

In [None]:
# (country, capital) word pairs of countries and capitals; the word vectors themselves
# are looked up when the pairs are plotted.
countries_to_capitals = [
    ("china", "beijing"),
    ("russia", "moscow"),
    ("japan", "tokyo"),
    ("turkey", "ankara"),
    ("poland", "warsaw"),
    ("germany", "berlin"),
    ("france", "paris"),
    ("italy", "rome"),
    ("greece", "athens"),
    ("spain", "madrid"),
    ("portugal", "lisbon")
]

In [None]:
# Draw every country and its capital in the 2D PCA projection and connect each pair
# with a dashed arrow that points from the country to the capital.
plt.figure(figsize=(10, 10))
ax = plt.axes()
for country_word, capital_word in countries_to_capitals:
    # Look up both 2D points first so a missing word fails before anything is drawn
    country_vec = embedding_weights_2d_pca[word_to_int[country_word]]
    capital_vec = embedding_weights_2d_pca[word_to_int[capital_word]]

    # Marker plus slightly offset label, country first and capital second
    for label, (x, y) in ((country_word, country_vec), (capital_word, capital_vec)):
        plt.scatter(x, y, marker="x")
        plt.text(x + 0.003, y + 0.003, label, fontsize=12)

    # Arrow from the country to the capital; its tip ends exactly on the capital point
    dx, dy = capital_vec - country_vec
    ax.arrow(
        country_vec[0],
        country_vec[1],
        dx,
        dy,
        head_width=0.003,
        length_includes_head=True,
        ls="--",
        color="#ddd",
    )

plt.title("Country and Capital Vectors Projected by PCA")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

In [None]:
# TODO: Cluster all countries into e.g. 5 clusters => continents?
# - countries
# - food
# - countries
# - languages
# - occupations
# - sports
# - programming languages (vs. languages)
# - music genres