In [1]:
# Imports
import requests
import collections
import re
import os
from os.path import join as join_path
import zipfile
import numpy as np
rng_seed = 399
np.random.seed(rng_seed)
import pickle
import pandas as pd
from tqdm.auto import tqdm
from typing import Union

import tensorflow as tf
tf.random.set_seed(rng_seed)
from tensorflow.keras import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, Dense, Reshape, dot, Embedding
from tensorflow.keras.preprocessing.sequence import skipgrams, make_sampling_table
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence, tokenizer_from_json
from tensorflow.keras.utils import plot_model, to_categorical, Sequence, Progbar
from tensorflow.keras.callbacks import ModelCheckpoint, Callback
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from matplotlib import pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Silence NumbaPerformanceWarning (for UMAP)
from numba.core.errors import NumbaPerformanceWarning
import warnings
warnings.filterwarnings("ignore", category=NumbaPerformanceWarning)

# Dimensionality reduction
import umap
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Clustering
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering
import hdbscan

# Custom files
from importlib import reload
import utils
reload(utils)

from utils import clean_sents
from models import build_word2vec_model_dense

[nltk_data] Downloading package stopwords to /Users/triki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/triki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/triki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/triki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Define constants
# ----------------
data_dir = 'data'
checkpoints_dir = 'checkpoints'
text8_data_tokenizer_config_path = join_path(data_dir, 'text8-tokenizer.json')
text8_data_sequences_path = join_path(data_dir, 'text8-seqs.p')

# We have over 30k articles with a large vocubulary. Due to computational
# restrictions, we limit ourselves to the top 1000 words from all texts.
max_vocab_size = 1000
sampling_window_size = 5
num_negative_samples = 15
embedding_dim = 100

# Training constants
epochs = 15
batch_size = 64
# ----------------

In [3]:
# Read tokenizer from file
with open(text8_data_tokenizer_config_path, 'r') as file:
    tokenizer = tokenizer_from_json(file.read())
vocab_size = np.minimum(max_vocab_size, len(tokenizer.word_index))
words = np.array([tokenizer.index_word[i] for i in range(1, vocab_size + 1)])

In [4]:
def get_weights(model: Model):
    '''
    TODO: Docs
    '''
    weights = model.layers[2].get_weights()[0]
    weights = weights[1:] # Exclude first "junk" row
    return weights

In [5]:
# Prepare matrix of word embeddings of all epochs
epochs_lst = np.arange(1, epochs + 1)
model_checkpoint_paths = [f'{checkpoints_dir}/word2vec-model-epoch-{epoch_nr}.model' for epoch_nr in epochs_lst]
embeddings = np.zeros((len(epochs_lst) * vocab_size, embedding_dim))
embeddings_cluster_labels = np.zeros(vocab_size)
for i, checkpoint_path in enumerate(model_checkpoint_paths):
    model = load_model(checkpoint_path)
    weights = get_weights(model)
    
    embeddings[i * vocab_size:(i + 1) * vocab_size] = weights
    
    # Use cluster labels from last embedding
    if i == epochs - 1:
        embeddings_cluster_labels = KMeans(n_clusters=10).fit_predict(weights)

# Create UMAP embedding of all epochs
umap_embedding_all_epochs = umap.UMAP(n_components=3, random_state=rng_seed).fit_transform(embeddings)


Graph is not fully connected, spectral embedding may not work as expected.



In [6]:
# Create Pandas DataFrame for Plotly animations
umap_embedding_df_dict = {
    'epoch': [],
    'x': [],
    'y': [],
    'z': [],
    'cluster_label': [],
    'word': []
}
for epoch_num in epochs_lst:
    weights = umap_embedding_all_epochs[(epoch_num - 1) * vocab_size:epoch_num * vocab_size]
    
    # Add to df
    umap_embedding_df_dict['epoch'].extend(np.repeat(epoch_num, vocab_size))
    umap_embedding_df_dict['x'].extend(weights[:, 0])
    umap_embedding_df_dict['y'].extend(weights[:, 1])
    umap_embedding_df_dict['z'].extend(weights[:, 2])
    umap_embedding_df_dict['cluster_label'].extend(embeddings_cluster_labels)
    umap_embedding_df_dict['word'].extend(words)

# Create df from dict
umap_embedding_df = pd.DataFrame(umap_embedding_df_dict)

In [7]:
# Visualize animation of UMAP embeddings
fig = px.scatter_3d(
    umap_embedding_df,
    x="x",
    y="y",
    z="z",
    range_x=[umap_embedding_all_epochs[:, 0].min(), umap_embedding_all_epochs[:, 0].max()],
    range_y=[umap_embedding_all_epochs[:, 1].min(), umap_embedding_all_epochs[:, 1].max()],
    range_z=[umap_embedding_all_epochs[:, 2].min(), umap_embedding_all_epochs[:, 2].max()],
    animation_frame="epoch",
    color="cluster_label",
    hover_name="word",
    title='UMAP embeddings over time'
)
fig.update_scenes({'aspectmode': 'cube'})

In [8]:
# Extract weights from last epoch and perform some more testing
last_weights = get_weights(load_model(model_checkpoint_paths[-1]))

In [9]:
def similar_words_vec(word_vec: np.ndarray, weights: np.ndarray, words: list, top_n: int = 10, skip_first: int = 0):
    cos_sims = word_vec @ weights.T / (np.linalg.norm(word_vec) * np.linalg.norm(weights, axis=1))
    cos_sims = np.clip(cos_sims, 0, 1) 
    sorted_indices = cos_sims.argsort()[::-1]
    top_words = list(words[sorted_indices - 1][skip_first:skip_first + top_n])
    top_sims = cos_sims[sorted_indices][skip_first:skip_first + top_n]
    
    # Create word similarity pairs
    pairs = list(zip(top_words, top_sims))
    
    return pairs

def get_word_vec(word: str, words: list, weights: np.ndarray) -> np.ndarray:
    return weights[np.where(words == word)[0][0]]

def similar_words(word: str, weights: np.ndarray, words: list, top_n: int = 10):
    return similar_words_vec(get_word_vec(word, words, weights), weights, words, top_n, skip_first=1)

In [10]:
similar_words('boy', last_weights, words)

[('get', 0.9987074),
 ('universe', 0.9983425),
 ('st.', 0.9983306),
 ('charge', 0.9982151),
 ('lake', 0.99817115),
 ('physical', 0.99817115),
 ('cost', 0.9981687),
 ('bridge', 0.99815),
 ('access', 0.99813884),
 ('formed', 0.9981262)]

In [11]:
# King - man + woman = ?
man_vec = get_word_vec('man', words, last_weights)
woman_vec = get_word_vec('woman', words, last_weights)
king_vec = get_word_vec('king', words, last_weights)
queen_vec_pred = king_vec - man_vec + woman_vec
similar_words_vec(queen_vec_pred, last_weights, words, top_n=25)

[('different', 0.9981661),
 ('element', 0.9973645),
 ('seen', 0.9968732),
 ('gave', 0.99678797),
 ('asia', 0.9967806),
 ('career', 0.9967389),
 ('received', 0.99671674),
 ('seat', 0.99669373),
 ('site', 0.99666077),
 ('health', 0.9966479),
 ('formed', 0.9965937),
 ('get', 0.996564),
 ('scientific', 0.99652684),
 ('team', 0.99651),
 ('best', 0.9964886),
 ('car', 0.99647665),
 ('artist', 0.99645734),
 ('provide', 0.9964526),
 ('digital', 0.9964477),
 ('construction', 0.99644053),
 ('speech', 0.99643475),
 ('married', 0.99640965),
 ('march', 0.99640065),
 ('length', 0.99639326),
 ('date', 0.9963609)]

In [12]:
def arrow_plot(relationship_pairs: list, word_embeddings_pca: np.ndarray, words: np.ndarray):
    '''
    TODO: Docs
    '''
    fig = go.Figure()
    for (from_word, to_word) in relationship_pairs:
        from_word_vec = get_word_vec(from_word, words, word_embeddings_pca)
        to_word_vec = get_word_vec(to_word, words, word_embeddings_pca)
        
        # Plot points in 3D
        fig.add_trace(go.Scatter(
            x=[from_word_vec[0], to_word_vec[0]],
            y=[from_word_vec[1], to_word_vec[1]],
            mode="markers+text",
            text=[from_word, to_word],
            textposition="bottom center",
            hovertext=[from_word, to_word]
        ))
        
        fig.update_layout(
            title="Plot of relationship pairs",
            xaxis_title="PC1",
            yaxis_title="PC2"
        )
        
        # Annotate points with arrows
        fig.add_annotation(
            ax=from_word_vec[0],
            ay=from_word_vec[1],
            axref='x',
            ayref='y',
            x=to_word_vec[0],
            y=to_word_vec[1],
            xref='x',
            yref='y',
            showarrow=True,
            arrowhead=2,
            arrowsize=1,
            arrowwidth=2,
            opacity=0.5
        )
    fig.update_layout(showlegend=False)
    fig.show()

In [13]:
word_embeddings_2d_pca = PCA(n_components=2, random_state=rng_seed).fit_transform(last_weights)

In [14]:
# Plot relationship between man <-> woman and king <-> queen
pairs = [
    ('man', 'woman'),
    ('king', 'queen')
]
arrow_plot(pairs, word_embeddings_2d_pca, words)

In [15]:
def word_plot(words_to_plot: list, word_embeddings: np.ndarray, words: np.ndarray):
    '''
    TODO: Docs
    '''
    fig = go.Figure()
    word_vecs = np.array([get_word_vec(word, words, word_embeddings) for word in words_to_plot])
    
    # Plot points
    fig.add_trace(go.Scatter(
        x=word_vecs[:, 0],
        y=word_vecs[:, 1],
        mode="markers+text",
        hovertext=words_to_plot,
        text=words_to_plot,
        textposition="bottom center"
    ))
        
    fig.update_layout(
        title="Plot of words projected to 2D",
        xaxis_title="Axis 1",
        yaxis_title="Axis 2"
    )
 
    fig.update_layout(showlegend=False)
    fig.show()

In [16]:
# Plot words one through nine to check for cirular shape
zero_to_nine = [
    'zero',
    'one',
    'two',
    'three',
    'four',
    'five',
    'six',
    'seven',
    'eight',
    'nine'
]
word_plot(zero_to_nine, word_embeddings_2d_pca, words)

## Validate Word2vec model
We validate the last fitted Word2Vec model using question-word pairs from Googles Word2Vec repo.

In [17]:
# Download data
fetch_questions_words_data = False
if fetch_questions_words_data:
    !wget https://raw.githubusercontent.com/svn2github/word2vec/master/questions-words.txt -O data/questions-words.txt

# Read data
with open('data/questions-words.txt', 'r') as file:
    questions_words_content = file.read()

In [18]:
def parse_question_words(content: str):
    '''
    TODO: Docs
    '''
    # Parse question words pairs for each section
    questions_words_sections = re.findall(r'(: .+)', questions_words_content)
    questions_words_delims = '|'.join(questions_words_sections)

    # Split question words content into list
    questions_words_content_splits = [
        content_split[1:len(content_split) - 1].split('\n')
        for content_split in re.split(questions_words_delims, questions_words_content)
        if len(content_split) > 0
    ]

    # Split string of words into list of words
    questions_words_content_splits = [
        [questions_words.split() for questions_words in content_split]
        for content_split in questions_words_content_splits
    ]

    # Construct dictionary with question-word entries
    questions_words = {
        questions_words_sections[i][2:]: questions_words_content_splits[i]
        for i in range(len(questions_words_sections))
    }
    
    return questions_words

In [19]:
# Parse question word pairs
questions_words = parse_question_words(questions_words_content)

In [22]:
def evaluate_model_qw(questions_words: dict, weights: np.ndarray, words: list):
    '''
    TODO: Docs
    '''
    num_sections = len(questions_words.keys())
    question_words_accs = np.zeros(num_sections)
    for i, (section_name, question_word_pairs) in zip(range(num_sections), questions_words.items()):
        print(f'-- Evaluating {section_name}... --')
        num_correct = 0
        total = len(question_word_pairs)
        for word_pairs in question_word_pairs:
            
            # Clean words (same as training) before evaluation
            word_pairs_clean = clean_sents(word_pairs)
            
            # Ensure all words are in vocabulary
            words_in_vocab = True
            for word in word_pairs_clean:
                if len(np.where(words == word)[0]) == 0:
                    words_in_vocab = False
                    break
            if not words_in_vocab:
                total = total - 1
                continue
            
            # Evaluate prediction
            a_word, b_word, c_word, d_word = word_pairs_clean
            a_vec = get_word_vec(a_word, words, last_weights)
            b_vec = get_word_vec(b_word, words, last_weights)
            c_vec = get_word_vec(c_word, words, last_weights)
            d_word_pred = similar_words_vec(a_vec - b_vec + c_vec, last_weights, words, top_n=1)[0][0]
            if d_word == d_word_pred:
                num_correct = num_correct + 1
            else:
                print(f'Incorrect prediction: {a_word} is to {b_word} as {c_word} is to {d_word} (predicted: {d_word_pred})')
        
        if total == 0:
            question_words_accs[i] = -1 # Meaning no predictions made
            print('All questions words missing from vocabulary')
        else:
            question_words_accs[i] = num_correct / total
            print(f'Accuracy: {(question_words_accs[i] * 100):.2f}%')
    
    return question_words_accs

In [23]:
evaluate_model_qw(questions_words, last_weights, words)

-- Evaluating capital-common-countries... --
Incorrect prediction: london is to england as rome is to italy (predicted: creation)
Incorrect prediction: rome is to italy as london is to england (predicted: men)
Accuracy: 0.00%
-- Evaluating capital-world... --
Incorrect prediction: london is to england as rome is to italy (predicted: creation)
Accuracy: 0.00%
-- Evaluating currency... --
All questions words missing from vocabulary
-- Evaluating city-in-state... --
All questions words missing from vocabulary
-- Evaluating family... --
Incorrect prediction: father is to mother as king is to queen (predicted: different)
Incorrect prediction: father is to mother as man is to woman (predicted: show)
Incorrect prediction: father is to mother as son is to daughter (predicted: best)
Incorrect prediction: father is to mother as son is to daughter (predicted: best)
Incorrect prediction: king is to queen as man is to woman (predicted: show)
Incorrect prediction: king is to queen as son is to daugh

Incorrect prediction: india is to indian as ireland is to irish (predicted: county)
Incorrect prediction: india is to indian as italy is to italian (predicted: race)
Incorrect prediction: india is to indian as japan is to japanese (predicted: basic)
Incorrect prediction: india is to indian as russia is to russian (predicted: largely)
Incorrect prediction: india is to indian as spain is to spanish (predicted: stage)
Incorrect prediction: india is to indian as australia is to australian (predicted: matter)
Incorrect prediction: india is to indian as china is to chinese (predicted: built)
Incorrect prediction: india is to indian as england is to english (predicted: sea)
Incorrect prediction: india is to indian as france is to french (predicted: value)
Incorrect prediction: india is to indian as germany is to german (predicted: england)
Incorrect prediction: ireland is to irish as italy is to italian (predicted: race)
Incorrect prediction: ireland is to irish as japan is to japanese (predi

Incorrect prediction: horse is to horse as machine is to machine (predicted: radio)
Incorrect prediction: horse is to horse as man is to men (predicted: show)
Incorrect prediction: horse is to horse as road is to road (predicted: longer)
Incorrect prediction: horse is to horse as woman is to woman (predicted: element)
Incorrect prediction: horse is to horse as building is to building (predicted: practice)
Incorrect prediction: horse is to horse as car is to car (predicted: difference)
Incorrect prediction: horse is to horse as child is to child (predicted: team)
Incorrect prediction: horse is to horse as color is to color (predicted: therefore)
Incorrect prediction: horse is to horse as computer is to computer (predicted: king)
Incorrect prediction: horse is to horse as hand is to hand (predicted: every)
Incorrect prediction: machine is to machine as man is to men (predicted: show)
Incorrect prediction: machine is to machine as road is to road (predicted: longer)
Incorrect prediction: 

array([ 0.,  0., -1., -1.,  0., -1., -1.,  0.,  0., -1.,  0.,  0.,  0.,
        0.])