In [None]:
# Imports
import os
import pickle
import numpy as np
# Seed NumPy's global RNG before anything else runs; rng_seed is also passed as
# random_state to the KMeans, UMAP and PCA estimators created in later cells.
rng_seed = 399
np.random.seed(rng_seed)
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from umap import UMAP

import plotly.offline as pyo
# Initialise Plotly's offline notebook mode so figures can render inline.
pyo.init_notebook_mode()

# Reload eval_utils before the from-import below so that re-running this cell
# picks up edits made to the module without restarting the kernel.
from importlib import reload
import eval_utils
reload(eval_utils)

from eval_utils import (
    get_word_vec,
    similar_words_vec,
    create_embeddings_of_train_checkpoints,
    visualize_embeddings_over_time,
    plot_word_relationships_2d,
    plot_word_vectors,
    evaluate_model_questions_words
)
from train_utils import get_model_checkpoint_filepaths
from word2vec import Word2vec

In [None]:
# Constants
checkpoints_dir = "checkpoints"
checkpoint_filepaths = get_model_checkpoint_filepaths(checkpoints_dir)

# Fail with an actionable message instead of a bare "list index out of range"
# when the checkpoints directory holds no saved models.
if len(checkpoint_filepaths) == 0:
    raise FileNotFoundError(
        "No model checkpoints found in '{}'; check the path or run training first.".format(
            checkpoints_dir
        )
    )

# NOTE(review): assumes the helper returns filepaths ordered so that the final
# entry is the most recently trained model — confirm in train_utils.
last_model_filepath = checkpoint_filepaths[-1]

In [None]:
# Read the pickled questions-words pairs from disk.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
questions_words_path = "data/questions-words.pickle"
with open(questions_words_path, "rb") as pickle_file:
    questions_words = pickle.load(pickle_file)

In [None]:
# Load last model
# Instantiate Word2vec with its default arguments, then hand load_model the
# filepath of the final entry in checkpoint_filepaths.
word2vec = Word2vec()
word2vec.load_model(last_model_filepath)

In [None]:
# Pull the vocabulary metadata out of the loaded model's tokenizer, and read the
# embedding dimension from the second axis of the weight matrix.
tokenizer = word2vec.tokenizer
vocab_size = tokenizer.vocab_size
word_to_int = tokenizer.word_to_int
words = tokenizer.words
embedding_dim = word2vec.embedding_weights.shape[1]

In [None]:
# Get target embedding weights of last model
# Notebook-level alias consumed by the similarity, PCA/UMAP projection and
# evaluation cells further down.
embedding_weights = word2vec.embedding_weights

## Visualize training over the course of epochs

In [None]:
# Build the per-checkpoint UMAP embeddings and cluster labels. The helper is
# given a 10-cluster KMeans and a 2-component UMAP, both seeded with rng_seed.
checkpoint_clusterer = KMeans(n_clusters=10, random_state=rng_seed)
checkpoint_reducer = UMAP(n_components=2, random_state=rng_seed)
umap_embeddings_over_time, cluster_labels_over_time = create_embeddings_of_train_checkpoints(
    checkpoint_filepaths,
    vocab_size,
    embedding_dim,
    checkpoint_clusterer,
    checkpoint_reducer
)

In [None]:
# Visualize training
# Passes the per-checkpoint UMAP embeddings and cluster labels, together with the
# vocabulary words, to the plotting helper imported from eval_utils.
visualize_embeddings_over_time(
    umap_embeddings_over_time,
    cluster_labels_over_time,
    words
)

## Find similar words

In [None]:
# Test similarities with the word analogy "king" - "man" + "woman"
a_vec = get_word_vec("king", word_to_int, embedding_weights)
b_vec = get_word_vec("man", word_to_int, embedding_weights)
c_vec = get_word_vec("woman", word_to_int, embedding_weights)

# Analogy query vector: subtract the "man" vector from "king", then add "woman".
d_vec = a_vec - b_vec + c_vec

# Use the notebook-level `words` alias, as the other cells do, rather than
# reaching back into word2vec.tokenizer. Left as the final bare expression so
# the result is displayed as the cell output.
similar_words_vec(d_vec, embedding_weights, words, top_n=20)

## Plot word relationships

In [None]:
# Fit a seeded 2-component PCA on the last model's embeddings and keep the projection
pca_2d = PCA(n_components=2, random_state=rng_seed)
embedding_weights_2d_pca = pca_2d.fit_transform(embedding_weights)

In [None]:
# Word pairs passed to the relationship plot over the 2D PCA projection
pairs = [("man", "woman"), ("king", "queen")]
plot_word_relationships_2d(
    pairs, embedding_weights_2d_pca, word_to_int, x_label="PC1", y_label="PC2"
)

In [None]:
# Fit a seeded 2-component UMAP on the last model's embeddings and keep the projection
umap_2d = UMAP(n_components=2, random_state=rng_seed)
embedding_weights_2d_umap = umap_2d.fit_transform(embedding_weights)

In [None]:
# Plot the number words zero through nine to check for a circular shape
zero_to_nine = [
    "zero", "one", "two", "three", "four",
    "five", "six", "seven", "eight", "nine",
]
plot_word_vectors(
    zero_to_nine, embedding_weights_2d_umap, word_to_int, x_label="UMAP 1", y_label="UMAP 2"
)

## Validate model on "questions-words" pairs

In [None]:
# Run the questions-words evaluation helper on the last model's embeddings.
# NOTE(review): top_n=5 presumably counts a prediction as correct when the expected
# word is among the 5 closest candidates — confirm in eval_utils.
evaluate_model_questions_words(
    questions_words,
    embedding_weights,
    word_to_int,
    words,
    top_n=5
)