# Training a Word2Vec model on the Wikipedia page about the *United States*

### Grab the webpage

In [1]:
# import requests

# r = requests.get("https://en.wikipedia.org/wiki/United_States")

In [2]:
# Load the snapshot of the page saved in September 2021.
# The encoding is passed explicitly so the read does not depend on the
# platform's default locale encoding (Wikipedia serves its pages as UTF-8).
with open("us-wiki-september-2021.html", "r", encoding="utf-8") as f:
    text = f.read()

### Preprocessing pipeline (repurposed from Lab-02)

In [3]:
import re
from tqdm import tqdm
from typing import List

import spacy
from spacy.language import Language


# Name under which the custom `preprocess` component is registered with
# spaCy (via @Language.component) and later added to the loaded pipeline.
pipeline_name = 'WikiUS'

@Language.component(pipeline_name)
def preprocess(doc):
    """
    Reduce a tokenized document to a normalized, space-joined string.

    Tokens flagged as punctuation or as stop words are dropped. The text of
    every remaining token is lower-cased and stripped of surrounding
    whitespace, and only results that are 1 to 12 characters long are kept.

    Note: the return value is a plain `str`, not a spaCy `Doc`.
    """
    kept = []
    for token in doc:
        # Same filter order as a punctuation pass followed by a stop-word pass
        if token.is_punct or token.is_stop:
            continue
        normalized = token.text.lower().strip()
        if 0 < len(normalized) <= 12:
            kept.append(normalized)
    return " ".join(kept)


class Pipeline:
    """
    Regex clean-up followed by the spaCy pipeline.

    `transform` applies every `(pattern, replacement)` pair in `replace`, in
    list order, with `re.sub`, and then hands the cleaned string to the spaCy
    pipeline loaded in `__init__` (the `en_core_web_sm` model with the custom
    component registered under `pipeline_name` appended).

    The patterns are applied without regex flags, so they are case-sensitive
    as written.
    """

    # http://emailregex.com/
    # The pattern is assembled from adjacent raw string literals so that it
    # can be wrapped over several source lines WITHOUT the line breaks and
    # indentation becoming part of the regex (it is used without re.VERBOSE,
    # so any whitespace inside the literal would have to match literally).
    email_re = (
        r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*"""
        r"""|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]"""
        r"""|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9]"""
        r"""(?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}"""
        r"""(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:"""
        r"""(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
    )
    # replace = [ (pattern-to-replace, replacement),  ...]
    # Order matters: each substitution runs on the output of the previous one.
    replace = [
        ("<[^>]*>", " "),                          # HTML/XML tags
        (email_re, " "),                           # Matches emails
        (r"(?<=\d),(?=\d)", ""),                   # Remove commas in numbers
        (r"\d+", " "),                             # Replace runs of digits with a space
        (r"[*\^\.$&@<>,\-/+{|}=?#:;'\"\[\]]", ""), # Punctuation and other junk
        (r"[\n\t\r]", " "),                        # Removes newlines, tabs, creturn
        (r"[^\x00-\x7F]+", ""),                    # Removes non-ascii chars
        (r"\\+", " "),                             # Replaces runs of backslashes with a space
        (r"\s+n\s+", " "),                         # 'n' leftover from \\n
        (r"\s+", " ")                              # Collapses whitespace runs to one space
    ]

    def __init__(self):
        """Load the spaCy model and append the custom component to it."""
        self.pipeline = spacy.load('en_core_web_sm')
        self.pipeline.add_pipe(pipeline_name)

    def __call__(self, *args, **kwargs):
        """Alias for `transform`, so an instance can be called directly."""
        return self.transform(*args, **kwargs)

    def transform(self, doc: str):
        """
        Clean `doc` with the regex rules in `replace`, then run the spaCy
        pipeline on the result and return whatever the pipeline returns.
        """
        for pattern, replacement in self.replace:
            doc = re.sub(pattern, replacement, doc)
        return self.pipeline(doc)


# Build the preprocessing pipeline once; Pipeline.__init__ loads the spaCy
# `en_core_web_sm` model and appends the custom component to it.
pipeline = Pipeline();

  from .autonotebook import tqdm as notebook_tqdm


### Normalize text

In [4]:
# Feed the raw page through the pipeline in fixed-size character windows and
# stitch the normalized pieces back together with single spaces.
# NOTE(review): a window boundary is a plain character offset, so it can fall
# inside an HTML tag or a word.
chunk_len = int(1e5)
docs = [
    pipeline(text[start:start + chunk_len])
    for start in range(0, len(text), chunk_len)
]
doc = " ".join(docs)

# Wiki articles have many citations: drop the leftover marker strings
for marker in ('citeref', "tmulti"):
    doc = doc.replace(marker, '')

### Excerpt

In [5]:
# Show a 5,000-character window of the normalized text
doc[65000:70000]

'ossing worldwide sales s bob dylan emerged folk revival americas celebrated songwriters james brown led development funk recent american creations include hip hop salsa techno house music mid thcentury american pop stars bing crosby frank sinatra elvis presley global celebrities artists late th century michael jackson prince madonna whitney houston popular artists mid s late s include mariah carey britney spears justin timberlake christina aguilera beyonc wellknown american singers s include katy perry bruno mars lady gaga taylor swift ariana grande cinema main article cinema united states hollywood sign los angeles california hollywood northern district los angeles california leaders motion picture production worlds commercial motion picture exhibition given new york city thomas edison s kinetoscope early th century film industry largely based hollywood st century increasing number films film companies subject forces director d w griffith american filmmaker silent film period central

### Construct a vocabulary

In [6]:
from collections import Counter

# Tokenize on single spaces and count how often each token occurs
doc = doc.split(" ")
words_histogram = Counter(doc)

# Keep non-empty words seen at least `freq_threshold` times; ids are handed
# out in sorted (lexicographic) word order, starting from 0
freq_threshold = 2
frequent_words = (
    token for token in sorted(words_histogram)
    if token and words_histogram[token] >= freq_threshold
)
vocab = {token: token_id for token_id, token in enumerate(frequent_words)}

### Remove OOV words

In [7]:
# Keep only tokens that were assigned a vocabulary id
doc = [token for token in doc if token in vocab]

In [8]:
# Vocabulary size (number of distinct words that were assigned an id)
N = len(vocab)

# Training set size (number of tokens left in `doc` after OOV removal)
M = len(doc)

# Displayed as the cell output, in the order (M, N)
M, N

(16317, 2364)

# Build a word2vec model

### Softmax (numerically stable version)

In [9]:
import numpy as np

def softmax(Z, axis=None) -> np.ndarray:
    """
    Numerically stable softmax.

    The maximum is subtracted before exponentiating so that large scores do
    not overflow `np.exp`; this does not change the result.

    Parameters
    ----------
    Z : array_like
        Scores (logits).
    axis : int or tuple of ints, optional
        Axis (or axes) along which the probabilities are normalized. The
        default, `None`, normalizes over the whole array, which is the
        behavior the 1-D score vectors in this notebook rely on. Pass e.g.
        `axis=1` to normalize each row of a 2-D batch independently.

    Returns
    -------
    np.ndarray
        Array with the same shape as `Z` whose entries are non-negative and
        sum to 1 along `axis`.
    """
    Z = np.asarray(Z)
    # keepdims=True keeps the reduced axes so the results broadcast against Z
    Z_exp = np.exp(Z - np.max(Z, axis=axis, keepdims=True))
    partition = np.sum(Z_exp, axis=axis, keepdims=True)
    return Z_exp / partition

### Hyperparameters

In [10]:
# Word embedding dimension (K << N, where N is the vocabulary size)
K = 15

# Number of full passes through the training tokens
epochs = 65

# Number of words on EACH side of the center word that count as its
# context, i.e. 2 * context_window context words per center word
context_window = 6

# Learning rate (step size) for the gradient updates
lr = 0.01

### Construct a projection matrix shape = (K x N)

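Note: W denotes a projection matrix (in the code below there are two of them: `U` for the word embeddings and `V` for the context embeddings), while w represents a single word (these are different things)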

In [11]:
def initialize_embedding(k=None, n=None, rng=None):
    """
    Make a randomly initialized embedding matrix of shape k x n.

    Entries are drawn uniformly from [0, 1).

    Parameters
    ----------
    k : int, optional
        Number of rows (embedding dimension). Defaults to the notebook-level
        `K`.
    n : int, optional
        Number of columns (vocabulary size). Defaults to the notebook-level
        `N`.
    rng : numpy.random.Generator, optional
        Source of randomness, e.g. `np.random.default_rng(seed)`, for a
        reproducible initialization. When omitted, NumPy's global random
        state is used, exactly as before.

    Returns
    -------
    np.ndarray
        Float matrix of shape (k, n).
    """
    rows = K if k is None else k
    cols = N if n is None else n
    if rng is None:
        return np.random.random((rows, cols))
    return rng.random((rows, cols))

### Training loop

In [12]:
from tqdm import tqdm

# Skip-gram style training: for every center word, score all vocabulary
# words with a softmax over u_w . V and take gradient steps that raise the
# probability of the words actually found in its context window.

# Word (center) embeddings, shape (K, N): column j is the vector of word id j
U = initialize_embedding()

# Context embeddings, shape (K, N)
V = initialize_embedding()

# One-hot encoded word representations: N x N identity, row j is the
# one-hot vector of word id j
X = np.eye(N)

# The progress bar counts one step per (epoch, center position) pair
with tqdm(total=int(epochs * (M - 2 * context_window))) as bar:
    for epoch in range(epochs):
            
        # Reset every epoch, so the means reported below are per-epoch
        ce_losses = []
        acc = []
        
        # Skip the first/last `context_window` tokens so that every center
        # word has a full window on both sides
        for i in range(context_window, M - context_window, 1):
            
            word_doc_idx = i
            word = doc[word_doc_idx]
            # .get falls back to id 0 for a word that is missing from vocab
            word_idx = vocab.get(word, 0)
            
            # Window of 2 * context_window + 1 tokens centered on i; np.delete
            # removes the center token, which sits at offset context_window
            context_words = doc[i - context_window: i + context_window + 1]
            context_words = np.delete(context_words, context_window)
            context_idxs = [vocab.get(context_word, 0) for context_word in context_words]        
            
            # print(word, context_words)
            
            # One-hot encoded center word, shape (N,); not used below
            x_w = X[word_idx]
            
            # One-hot encoded context words (2 * context window x N)
            X_c = X[context_idxs]
            
            # Embedding for word w, shape (K,). This is a VIEW into U, so the
            # in-place update of U[:, word_idx] further down also changes u_w
            u_w = U[:, word_idx]
            
            # Inner product between the embedding of word w and every context
            # embedding in V, shape (N,)
            Z = u_w.dot(V)
            
            # Probability distribution over all vocabulary words as context
            # of center word w, shape (N,)
            P = softmax(Z)
            
            # Cross entropy loss, summed over the 2 * context_window context
            # words of this center word
            ce = -np.sum(np.log(P[context_idxs]))
            ce_losses.append(ce)
            
            # Prediction errors (2 * context_window x N): P is broadcast
            # against X_c, giving one row per context word
            errors = P - X_c
            
            # One gradient step per context word. P and `errors` are NOT
            # recomputed inside this loop, while V and u_w change from step
            # to step, so later steps use the probabilities computed above.
            for error in errors:
                
                # Gradient w.r.t. u_w for this context word: V.dot(P - x_c)
                grad_U_w = V.dot(error.T) # shape (K,); .T is a no-op on 1-D

                # Gradient w.r.t. V for this context word: the outer product
                # of u_w and (P - x_c), shape (K, N)
                grad_V = np.expand_dims(u_w, axis=1).dot(np.expand_dims(error, axis=0))
                
                # Gradient updates (in place)
                U[:, word_idx] -= lr * grad_U_w.T
                V -= lr * grad_V
            
            # Accuracy: fraction of this word's context words whose id is
            # among the 50 highest-probability entries of P
            top = np.argsort(P)[-50:]
            acc.append(sum([1 for idx in context_idxs if idx in top]) / (2 * context_window))
            
            bar.update()
            
        # Report the epoch's mean loss and mean top-50 accuracy on the bar
        bar.set_description("epoch: %d, ce-loss: %.4f, acc: %.4f" %
                    (epoch + 1, np.mean(ce_losses), np.mean(acc)))
            

epoch: 65, ce-loss: 65.6998, acc: 0.5559: 100%|██████████| 1059825/1059825 [19:20<00:00, 913.58it/s] 


### Compute most similar words

In [13]:
# Reverse mapping: word id -> word
wordlookup = dict(zip(vocab.values(), vocab.keys()))

# Copy of U whose columns are scaled to unit Euclidean (L2) length
column_norms = np.linalg.norm(U, ord=2, axis=0, keepdims=True)
Unormed = U / column_norms

def compute_top_n_similar_words(word, n=20, method='dot'):
    """
    Return the `n` vocabulary words whose embeddings score highest against
    the embedding of `word`, best match first.

    Parameters
    ----------
    word : str
        Query word; must be a key of `vocab`.
    n : int
        Number of neighbors to return. Values <= 0 yield an empty list.
    method : {'dot', 'cosine'}
        'dot' scores with inner products of the raw embeddings `U`;
        'cosine' scores with inner products of the column-normalized
        embeddings `Unormed`.

    Returns
    -------
    list of str
        Neighbor words, highest score first. The query word itself is not
        filtered out.

    Raises
    ------
    KeyError
        If `word` is not in `vocab`.
    ValueError
        If `method` is neither 'dot' nor 'cosine'.
    """
    idx = vocab[word]
    if method == 'dot':
        embeddings = U
    elif method == 'cosine':
        embeddings = Unormed
    else:
        raise ValueError("method must be 'dot' or 'cosine', got %r" % (method,))
    # Guard: a slice of [-0:] would select every word instead of none
    if n <= 0:
        return []
    scores = embeddings[:, idx].dot(embeddings)
    neighbors_idxs = np.argsort(scores)[-n:][::-1]
    return [wordlookup[neighbor_idx] for neighbor_idx in neighbors_idxs]

In [14]:
# Query words to inspect
words = [
    'budget', 'cia', 'virginia', 'suffrage', 'civil',
    'war', 'election', 'declaration', 'president', 'obama',
    'trump', 'democratic', 'republican', 'baseball', 'west',
    'vice', 'speaker', 'white', 'iraq', 'games', 'supreme',
    'abortion', 'left', 'right', 'senate', 'house',
]

# Print every query word next to its 10 nearest neighbors (cosine similarity)
for query in words:
    neighbors = compute_top_n_similar_words(query, n=10, method='cosine')
    print(query, ": ", neighbors)

budget :  ['budget', 'fiscal', 'spent', 'office', 'federal', 'reserve', 'municipal', 'currency', 'department', 'amounted']
cia :  ['cia', 'factbook', 'federal', 'intelligence', 'agency', 'vehicle', 'clock', 'central', 'ciagov', 'accounting']
virginia :  ['virginia', 'gazette', 'williamsburg', 'jefferson', 'wisconsin', 'nbspjun', 'north', 'utah', 'al', 'peter']
suffrage :  ['suffrage', 'prohibition', 'womens', 'saw', 'movement', 'measures', 'led', 'abolition', 'legislation', 'prominent']
civil :  ['civil', 'progressive', 'charles', 'branch', 'rights', 'era', 'slavery', 'movement', 'ultimately', 'republicans']
war :  ['war', 'cold', 'vietnam', 'ii', 'gorbachev', 'ussoviet', 'terror', 'iraq', 'tallest', 'end']
election :  ['election', 'presidential', 'republican', 'joe', 'party', 'president', 'democrat', 'elected', 'biden', 'founded']
declaration :  ['declaration', 'independence', 'continental', 'draft', 'written', 'wilson', 'founding', 'committee', 'lee', 'britain']
president :  ['presid

### Compute analogies

In [15]:
def compute_analogy(word1, word2, word3, n=10, exclude_inputs=False):
    """
    Rank candidate answers to the analogy "word1 is to word2 as word3 is
    to ?".

    Desired behavior:
        word2 - word1 + word3 = word4
        king - man + woman = queen

    The target vector is built from the column-normalized embeddings
    `Unormed`, and every vocabulary word is scored by its inner product with
    that target.

    Parameters
    ----------
    word1, word2, word3 : str
        Query words; each must be a key of `vocab`.
    n : int
        Maximum number of candidates to return (default 10).
    exclude_inputs : bool
        When True, the three query words are removed from the candidates.
        The default (False) keeps them.

    Returns
    -------
    list of str
        Candidate words for word4, highest score first.

    Raises
    ------
    KeyError
        If any query word is not in `vocab`.
    """
    query_idxs = (vocab[word1], vocab[word2], vocab[word3])
    u1, u2, u3 = (Unormed[:, idx] for idx in query_idxs)
    u4 = u2 - u1 + u3
    # All word ids, best score first
    ranked = np.argsort(u4.dot(Unormed))[::-1]
    if exclude_inputs:
        skipped = set(query_idxs)
        ranked = [idx for idx in ranked if idx not in skipped]
    word4_candidates = [wordlookup[idx] for idx in ranked[:max(n, 0)]]
    return word4_candidates

In [16]:
# Each triple is (word1, word2, word3) for "word2 - word1 + word3"
analogies = [
    ('mitch', 'senate', 'nancy'),
    ('obama', 'president', 'kamala'),
    ('adams', 'england', 'jefferson'),
]

# Print the ranked candidates for every analogy
for first, second, third in analogies:
    print(compute_analogy(first, second, third))

['site', 'senate', 'house', 'v', 'nancy', 'chief', 'legislative', 'officers', 'px', 'pelosi']
['vice', 'leadership', 'kamala', 'pelosi', 'includes', 'nancy', 'senate', 'speaker', 'president', 'chief']
['jefferson', 'wrote', 'virginia', 'england', 'moylan', 'evidence', 'encounters', 'williamsburg', 'inhabitants', 'migrated']


# Visualize embeddings using Tensorboard Projector

https://projector.tensorflow.org/

### Save embeddings/vocab to disk

Use the cell below to save your embeddings and vocab to disk (`vectors.tsv`, `metadata.tsv`). The embedding projector expects data in the following format:



`vectors.tsv` (N=3 K=4 embeddings):

    0.1\t0.2\t0.5\t0.9
    0.2\t0.1\t5.0\t0.2
    0.4\t0.1\t7.0\t0.8


`metadata.tsv` (N=3 word vocabulary):

    three
    word
    vocabulary

You can use the helper functions in the cell below to save and load embeddings/vocab to/from disk.

In [17]:
def save_matrix(matrix, fpath):
    """
    Write a 2-D matrix to `fpath` as tab-separated values.

    Each matrix row becomes one line, with the `str()` of every entry
    separated by tabs. Lines are separated by '\\n' and the file has no
    trailing newline.

    Parameters
    ----------
    matrix : np.ndarray
        Two-dimensional array (unpacking `matrix.shape` into two values
        fails for any other rank).
    fpath : str
        Destination path; an existing file is overwritten.
    """
    D1, D2 = matrix.shape
    # Build the text with joins instead of repeated string concatenation,
    # which re-copied the whole accumulated text for every row
    rows = (
        '\t'.join(str(matrix[i, j]) for j in range(D2))
        for i in range(D1)
    )
    with open(fpath, "w") as fd:
        fd.write('\n'.join(rows))

        
def load_matrix(fpath):
    """
    Read a tab-separated matrix of floats from `fpath`.

    Every non-empty line becomes one row; its tab-separated fields are
    converted with `float`. Empty lines (for example the one produced by a
    trailing newline at the end of the file) are skipped instead of failing
    on `float('')`.

    Parameters
    ----------
    fpath : str
        Path of the TSV file to read.

    Returns
    -------
    np.ndarray
        Array built from the list of rows.

    Raises
    ------
    ValueError
        If a field cannot be converted to `float`.
    """
    matrix = []
    with open(fpath, 'r') as fd:
        for line in fd:
            line = line.rstrip('\r\n')
            if not line:
                continue
            matrix.append([float(value) for value in line.split('\t')])
    return np.array(matrix)

        
def save_vocab(vocab: dict, fpath):
    """
    Write the vocabulary to `fpath`, one word per line.

    Words are written in ascending order of their index (the dict values).
    Leading and trailing newlines are stripped from the text, so the file
    does not end with a newline.
    """
    ordered_words = sorted(vocab, key=vocab.get)
    contents = "\n".join(ordered_words).strip("\n")
    with open(fpath, "w") as fd:
        fd.write(contents)


def load_vocab(fpath):
    """
    Read a one-word-per-line vocabulary file into a `{word: index}` dict.

    The index of a word is its zero-based line number, so it lines up with
    the matching row of the vectors file. Surrounding whitespace is stripped
    from each line. Blank lines at the end of the file (e.g. from a trailing
    newline) are ignored, so an empty file yields an empty dict. If a word
    occurs more than once, its first line number is kept.

    Parameters
    ----------
    fpath : str
        Path of the vocabulary file to read.

    Returns
    -------
    dict
        Mapping from word to line index.
    """
    with open(fpath, "r") as fd:
        lines = fd.read().split('\n')
    # Drop trailing blank lines so they do not become a bogus '' entry
    while lines and not lines[-1].strip():
        lines.pop()
    vocab = {}
    for row, line in enumerate(lines):
        vocab.setdefault(line.strip(), row)
    return vocab

In [18]:
# U is (K x N); it is transposed so that every line of vectors.tsv holds the
# embedding of one word, in the same order as the words in metadata.tsv
save_matrix(U.T, "vectors.tsv")
save_vocab(vocab, "metadata.tsv")

# Round-trip check: reload both files and display the vocabulary size and
# the (transposed back) matrix shape
len(load_vocab("metadata.tsv")), load_matrix("vectors.tsv").T.shape

(2364, (15, 2364))

### Instructions for uploading data into the Projector

Click the *Load* button on the left side of the screen. You will then be given the option to upload two files (image below). Upload your `vectors.tsv` and `metadata.tsv` files from the last step and then click outside the pop-up to dismiss it (oddly, the pop-up does not close on its own once your files have been uploaded).

<img src="projector-load.png" alt="Embedding/Vocab upload" width="500" height="500">

### Instructions for using the Projector

There are three compression (dimensionality-reduction) algorithms that you can use; I recommend t-SNE with the default settings. After ~2000 iterations t-SNE should yield words clustered in a way that loosely reflects underlying semantic relationships. There is a search bar on the right; baked into the UI is an entity tagger that automatically tags words from `metadata.tsv` with entities (if one is recognized). When you type in the name of a state, for example, the Projector will highlight the other words tagged with `<STATE>`, and you will see a tight clustering of states in embedding space (below).

<img src="projector.png" alt="Embedding projector snapshot" width="1000" height="700">