In [None]:
import time
import torch
from torch.utils.data import DataLoader, TensorDataset
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import DATASETS
from torchtext.vocab import build_vocab_from_iterator
import torch.nn as nn
from tqdm import tqdm
import pickle
import random
import numpy as np
from collections import Counter, defaultdict
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
import gensim.downloader
from torch import FloatTensor as FT

# Get the interactive Tools for Matplotlib
%matplotlib notebook
%matplotlib inline

plt.style.use('ggplot')

### Information
- torchtext repo: https://github.com/pytorch/text/tree/main/torchtext
- torchtext documentation: https://pytorch.org/text/stable/index.html

In [None]:
# Where do I want to run my job. You can do "cuda" on linux machines
DEVICE = "cuda" if torch.cuda.is_available() else  "cpu"
# The batch size in Adam or SGD
BATCH_SIZE = 512
# Number of epochs
NUM_EPOCHS = 10
# Predict from 2 words the inner word for CBOW
# I.e. I'll have a window like ["a", "b", "c"] of continuous text (each is a word)
# We'll predict each of wc = ["a", "c"] from "b" = wc for Skip-Gram
# For CBOW, we'll use ["a", "c"] to predict "b" = wo
WINDOW = 1

# Negative samples
K = 4

The text8 Wikipedia corpus. 100M characters.

In [5]:
# Put the data in your Google Drive
from google.colab import drive
drive.mount('/content/drive')

!du -h text8

f = open('/content/drive/MyDrive/text8/text8', 'r')
text = f.read()
# One big string of size 100M.
print(len(text))

Mounted at /content/drive
100000000


In [6]:
punc = '!"#$%&()*+,-./:;<=>?@[\\]^_\'{|}~\t\n'

# Can do regular expressions here too.
for c in punc:
    if c in text:
        text.replace(c, ' ')

In [7]:
# A very crude tokenizer you get for free: lower case and also split on spaces.
TOKENIZER = get_tokenizer("basic_english")

In [8]:
words = TOKENIZER(text)
f = Counter(words)

In [9]:
len(words)

17005207

In [10]:
# Do a very crude filter on the text which remoes all words whuch
text = [word for word in words if f[word] > 5]

In [11]:
text[0:5]

['anarchism', 'originated', 'as', 'a', 'term']

In [12]:
VOCAB = build_vocab_from_iterator([text])

In [13]:
# word -> int hash map
stoi = VOCAB.get_stoi()
# int -> word hash map
itos = VOCAB.get_itos()

In [14]:
stoi['as']

11

In [15]:
# Total number of words
len(stoi)

63641

In [16]:
f = Counter(text)
# This is the probability that we pick a word in the corpus.
z = {word: f[word] / len(text) for word in f}

In [17]:
threshold = 1e-5
# Probability that word is kept while subsampling.
# This is explained here and sightly differet from the paper: http://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/
p_keep = {word: (np.sqrt(z[word] / 0.001) + 1)*(0.0001 / z[word]) for word in f}

In [18]:
# This is in the integer space.
train_dataset = [word for word in text if random.random() < p_keep[word]]

# Rebuild the vocabulary.
VOCAB = build_vocab_from_iterator([train_dataset])

In [19]:
len(train_dataset)

7846065

In [20]:
# word -> int mapping.
stoi = VOCAB.get_stoi()
# int -> word mapping.
itos = VOCAB.get_itos()

In [21]:
# The vocabulary size after we do all the filters.
len(VOCAB)

63641

In [22]:
# The probability we draw something for negative sampling.
f = Counter(train_dataset)
p = torch.zeros(len(VOCAB))

# Downsample frequent words and upsample less frequent.
s = sum([np.power(freq, 0.75) for word, freq in f.items()])

for word in f:
    p[stoi[word]] = np.power(f[word], 0.75) / s

In [23]:
# Map everything to integers.
train_dataset = [stoi[word] for word in text]

In [24]:
# This just gets the (wc, wo) pairs that are positive - they are seen together!
def get_tokenized_dataset(dataset, verbose=False):
    x_list = []

    for i, token in enumerate(dataset):
        m = 1

        start = max(0,i-m)
        left_tokens = dataset[start:i]

        end = min(i+m,len(dataset)-1)
        right_tokens = dataset[i+1:end+1]

        if len(left_tokens) == len(right_tokens):
            w_context = left_tokens + right_tokens

            wc = token

            x_list.extend(
                [w_context + [wc]]
            )

    return x_list

In [25]:
train_x_list = get_tokenized_dataset(train_dataset, verbose=False)

In [26]:
pickle.dump(train_x_list, open('train_x_list.pkl', 'wb'))

In [27]:
train_x_list = pickle.load(open('train_x_list.pkl', 'rb'))

In [28]:
# These are (wc, wo) pairs. All are y = +1 by design.
train_x_list[:10]

[[5233, 11, 3083],
 [3083, 6, 11],
 [11, 227, 6],
 [6, 1, 227],
 [227, 3133, 1],
 [1, 44, 3133],
 [3133, 56, 44],
 [44, 153, 56],
 [56, 132, 153],
 [153, 741, 132]]

In [29]:
# The number of things of BATCH_SIZE = 512.
len(train_x_list) // BATCH_SIZE

32579

### Set up the dataloader.

In [30]:
train_dl = DataLoader(
    TensorDataset(
        torch.tensor(train_x_list).to(DEVICE),
    ),
    batch_size=BATCH_SIZE,
    shuffle=True
)

In [31]:
for xb in train_dl:
    assert(xb[0].shape == (BATCH_SIZE, 3))
    break

### Words we'll use to asses the quality of the model ...

In [32]:
valid_ids = torch.tensor([
    stoi['money'],
    stoi['lion'],
    stoi['africa'],
    stoi['musician'],
    stoi['dance'],
])

### Get the model.

In [33]:
class CBOWNegativeSampling(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(CBOWNegativeSampling, self).__init__()
        self.A = nn.Embedding(vocab_size, embed_dim) # Context vectors - center word.
        self.B = nn.Embedding(vocab_size, embed_dim) # Output vectors - words around the center word.
        self.init_weights()

    def init_weights(self):
        # Is this the best way? Not sure.
        initrange = 0.5
        self.A.weight.data.uniform_(-initrange, initrange)
        self.B.weight.data.uniform_(-initrange, initrange)

    def forward(self, x):
        # N is the batch size.
        # x is (N, )

        # Context words are 2m things, m = 1 so w_context is (N, 2) while wc is (N, 1)
        w_context, wc = x[:, :-1], x[:, -1]

        # Each of these is (N, 2, D) since each context has 2 word.
        # We want this to be (N, 1, D)
        a_avg = self.A(w_context).mean(axis=1)

        # Each of these is (N, 1, D) since each target has 1 word.
        b = self.B(wc)

        # The product between each context and target vector.
        # Each of these is (N, 1, D) since each batch has 1 word.
        # The logits is now (N, 1) since we sum across the final dimension.
        logits = (a_avg * b).sum(axis=-1)

        return logits

In [34]:
@torch.no_grad()
def validate_embeddings(
    model,
    valid_ids,
    itos
):
    """ Validation logic """

    # We will use context embeddings to get the most similar words
    # Other strategies include: using target embeddings, mean embeddings after avaraging context/target
    embedding_weights = model.A.weight

    normalized_embeddings = embedding_weights.cpu() / np.sqrt(
        np.sum(embedding_weights.cpu().numpy()**2, axis=1, keepdims=True)
    )

    # Get the embeddings corresponding to valid_term_ids
    valid_embeddings = normalized_embeddings[valid_ids, :]

    # Compute the similarity between valid_term_ids (S) and all the embeddings (V).
    # We do S x d (d x V) => S x D and sort by negative similarity.
    top_k = 10 # Top k items will be displayed.
    similarity = np.dot(valid_embeddings.cpu().numpy(), normalized_embeddings.cpu().numpy().T)

    # Invert similarity matrix to negative
    # Ignore the first one because that would be the same word as the probe word
    similarity_top_k = np.argsort(-similarity, axis=1)[:, 1: top_k+1]

    # Print the output.
    for i, word_id in enumerate(valid_ids):
        # j >= 1 here since we don't want to include the word itself.
        similar_word_str = ', '.join([itos[j] for j in similarity_top_k[i, :] if j >= 1])
        print(f"{itos[word_id]}: {similar_word_str}")

    print('\n')

### Set up the model

In [35]:
LR = 10.0
NUM_EPOCHS = 10
EMBED_DIM = 300

In [36]:
model = CBOWNegativeSampling(len(VOCAB), EMBED_DIM).to(DEVICE)
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# The learning rate is lowered every epoch by 1/10.
# Is this a good idea?
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)

In [37]:
model

CBOWNegativeSampling(
  (A): Embedding(63641, 300)
  (B): Embedding(63641, 300)
)

In [38]:
validate_embeddings(model, valid_ids, itos)

money: festus, laon, mancha, kompiler, asean, verlag, necessitating, parr, constantine, disadvantaged
lion: diabetics, kiswahili, jamming, shelby, nationally, mediating, gardens, redonda, fptp, skyhawk
africa: plassey, ancus, taocp, kwon, preceding, rallying, substitutable, munro, manifesto, breaking
musician: obito, unsustainable, transfusions, steppenwolf, histoire, rescues, elam, trailing, conurbations, bamberg
dance: cosmonaut, cristian, gesta, jer, martyr, saya, maternally, houma, ellice, merry




### Train the model

In [39]:
ratios = []

def train(dataloader, model, optimizer, epoch):
    model.train()
    total_acc, total_count, total_loss, total_batches = 0, 0, 0.0, 0.0
    log_interval = 500

    for idx, x_batch in tqdm(enumerate(dataloader)):

        x_batch = x_batch[0]

        batch_size = x_batch.shape[0]

        optimizer.zero_grad()

        logits = model(x_batch)

        # Get the positive samples loss. Notice we use weights here.
        positive_loss = torch.nn.BCEWithLogitsLoss()(input=logits, target=torch.ones(batch_size).to(DEVICE).float())

        # For each batch, get some negative samples.
        # We need a total of batch_size * K samples across a batch.
        # We then reshape this batch.
        # These are effectively the output words.
        negative_samples = torch.multinomial(p, batch_size * K, replacement=True)

        # Context words are 2m things, m = 1 so w_context is (N, 2) while wc is (N, 1)
        w_context, wc = x_batch[:, :-1], x_batch[:, -1]

        """
        if w_context looks like below (batch_size = 3)
        [
        a,
        b,
        c
        ] and K = 2 we'd like to get:

        [
        a,
        a,
        b,
        b,
        c,
        c
        ]

        This will be batch_size * K rows.
        """

        # This should be (N * K, 2)
        w_context = torch.concat([
            w.repeat(K, 1) for w in torch.tensor(w_context).split(1)
        ])

        wc = negative_samples.unsqueeze(-1)

        # Get the negative samples. This shoould be (N * K, 3).
        x_batch_negative = torch.concat([w_context, wc.to(DEVICE)], axis=1)

        """
        Note the way we formulated the targets: they are all 0 since these are negative samples.
        We do the BCEWithLogitsLoss by hand basically here.
        Notice we sum across the negative samples, per positive word.

        This is literally the equation in the lecture notes.
        """

        # (N, K, D) -> (N, D) -> (N)
        negative_loss = model(x_batch_negative).neg().sigmoid().log().reshape(
            batch_size, K
        ).sum(1).mean().neg().to(DEVICE)

        loss = (positive_loss + negative_loss).mean()

        # Get the gradients via back propagation.
        loss.backward()

        # Clip the gradients? Generally a good idea.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

        # Used for optimization. This should be roughly 0.001, on average.
        # You can use this to see if your learning rate is right - you can also plot the loss performance.
        with torch.no_grad():
            r = [
                (LR * p.grad.std() / p.data.std()).log10().item() for _, p in model.named_parameters()
            ]
            ratios.append(r)

        # Do an optimization step. Update the parameters A and B.
        optimizer.step()
        total_loss += loss.item()
        total_batches += 1

        if idx % log_interval == 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| loss {:8.3f} ".format(
                    epoch,
                    idx,
                    len(dataloader),
                    total_loss / total_batches
                )
            )
            validate_embeddings(model, valid_ids, itos)
            total_loss, total_batches = 0.0, 0.0

### Some results from the run look like below:

Results?

In [None]:
for epoch in range(1, NUM_EPOCHS + 1):
    epoch_start_time = time.time()

    train(train_dl, model, optimizer, epoch)
    # We have a learning rate scheduler here.
    # Basically, given the state of the optimizer, this lowers the learning rate in a smart way.
    scheduler.step()

  w.repeat(K, 1) for w in torch.tensor(w_context).split(1)
1it [00:01,  1.87s/it]

| epoch   1 |     0/32580 batches | loss    4.136 
money: festus, laon, mancha, kompiler, asean, verlag, necessitating, parr, constantine, disadvantaged
lion: diabetics, kiswahili, jamming, shelby, nationally, mediating, gardens, redonda, fptp, skyhawk
africa: plassey, ancus, taocp, kwon, preceding, rallying, substitutable, munro, manifesto, breaking
musician: obito, unsustainable, transfusions, steppenwolf, histoire, rescues, elam, trailing, conurbations, bamberg
dance: cosmonaut, cristian, gesta, jer, martyr, saya, houma, maternally, ellice, merry




501it [02:11,  3.05it/s]

| epoch   1 |   500/32580 batches | loss    3.713 
money: festus, laon, asean, verlag, mancha, kompiler, necessitating, parr, constantine, disadvantaged
lion: diabetics, kiswahili, jamming, nationally, shelby, mediating, gardens, mandate, redonda, skyhawk
africa: plassey, taocp, ancus, kwon, preceding, rallying, substitutable, manifesto, munro, occur
musician: obito, histoire, transfusions, unsustainable, steppenwolf, trailing, rescues, elam, conurbations, bamberg
dance: cosmonaut, gesta, cristian, jer, martyr, saya, maternally, ellice, houma, merry




1001it [04:34,  2.99it/s]

| epoch   1 |  1000/32580 batches | loss    3.355 
money: festus, laon, asean, verlag, parr, mancha, necessitating, kompiler, constantine, disadvantaged
lion: diabetics, kiswahili, jamming, shelby, nationally, mediating, gardens, mandate, redonda, fptp
africa: taocp, plassey, kwon, ancus, rallying, preceding, substitutable, manifesto, munro, occur
musician: obito, transfusions, histoire, unsustainable, steppenwolf, rescues, trailing, master, bamberg, conurbations
dance: cosmonaut, gesta, cristian, jer, martyr, saya, maternally, earle, houma, ellice




1501it [06:55,  3.30it/s]

| epoch   1 |  1500/32580 batches | loss    3.022 
money: festus, asean, verlag, parr, laon, mancha, constantine, necessitating, kompiler, disadvantaged
lion: diabetics, kiswahili, jamming, shelby, nationally, mediating, gardens, redonda, skyhawk, mandate
africa: ancus, kwon, taocp, plassey, rallying, manifesto, substitutable, preceding, occur, munro
musician: obito, transfusions, histoire, unsustainable, rescues, steppenwolf, view, trailing, master, ewald
dance: cosmonaut, gesta, jer, cristian, martyr, earle, maternally, saya, houma, littoral




2001it [09:16,  3.34it/s]

| epoch   1 |  2000/32580 batches | loss    2.790 
money: asean, parr, festus, verlag, constantine, laon, mancha, disadvantaged, obedience, necessitating
lion: diabetics, kiswahili, jamming, nationally, shelby, mediating, gardens, redonda, skyhawk, fptp
africa: ancus, kwon, taocp, manifesto, plassey, rallying, occur, substitutable, preceding, munro
musician: obito, transfusions, histoire, ewald, unsustainable, rescues, steppenwolf, view, trailing, menander
dance: cosmonaut, gesta, jer, cristian, martyr, earle, maternally, european, maimed, littoral




2501it [11:37,  3.36it/s]

| epoch   1 |  2500/32580 batches | loss    2.621 
money: parr, asean, festus, constantine, where, laon, verlag, mancha, leo, were
lion: diabetics, kiswahili, nationally, jamming, shelby, gardens, mediating, redonda, skyhawk, fptp
africa: kwon, ancus, manifesto, taocp, plassey, occur, rallying, substitutable, over, breaking
musician: transfusions, obito, ewald, histoire, view, master, rescues, unsustainable, menander, trailing
dance: cosmonaut, gesta, martyr, jer, cristian, maimed, european, earle, houma, maternally




2523it [11:43,  3.72it/s]

### Some diagnostics.

Sometimes you want to add diagnostics to your NN.
- For each iteration, get the $L_2$ norm of the gradients* learning_rate and divide this by the parameters. This should be about -3 on the log scale, i.e. the ratio should be about 0.001. If the ratios are smaller, the learning rate might be too small, otherwise too large. Below we do this for the Skip-Gram model above.


In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(20, 4))
legends = []
for i, (name, p) in enumerate(model.named_parameters()):
    plt.plot([ratios[j][i] for j in range(len(ratios))])
    legends.append('param {}'.format(name))
plt.plot([0, len(ratios)], [-3, -3], 'r')
plt.legend(legends)

How do you make sure the learing rate is good? Look at the above plots and suppose we have $log(LR * ratio) = -5$. Then $LR * ratio = 10^{-5}$. Assuming the ratio does not vary much, this probably means we need to increase the learing rate $LR$. Similarly for a reverse direction.