# Import necessary package

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import torch.optim as optim

# Build the 2-gram dataset sample

In [2]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]


# Define the naive CBOW model

In [3]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return(log_probs)

# To transfer words to index

In [4]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


make_context_vector(data[0][0], word_to_ix)

tensor([46, 43,  0, 40])

# Basic setting before train

In [5]:
device = torch.device('cuda:0')
losses = []
loss_function = nn.NLLLoss()
model = CBOW(len(vocab), embedding_dim=10, context_size=CONTEXT_SIZE*2)
model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Training Loop

In [6]:
for epoch in range(10):
    total_loss = torch.Tensor([0])
    for context, target in data:
        context_ids = make_context_vector(context, word_to_ix)
        context_ids = context_ids.to(device)
        model.zero_grad()
        log_probs = model(context_ids)
        label = torch.tensor([word_to_ix[target]], dtype=torch.long)
        label = label.to(device)
        loss = loss_function(log_probs, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)

[tensor([241.0637]), tensor([142.7159]), tensor([50.8770]), tensor([5.9412]), tensor([1.8961]), tensor([1.3363]), tensor([1.0427]), tensor([0.8569]), tensor([0.7278]), tensor([0.6324])]


# Check the first word vector

In [8]:
model.embeddings(make_context_vector(data[0][0], word_to_ix).cuda())

tensor([[-0.7385, -0.7515, -0.3060,  1.8100,  0.1676, -0.5854,  2.3268, -0.6096,
          1.4381, -0.7245],
        [ 0.7227,  1.5373, -1.1604,  0.7701, -2.1556,  0.1742,  1.5158,  0.7121,
          1.2520,  1.0408],
        [-1.4066,  0.4095, -1.7305, -0.0594,  0.5031, -1.4818,  0.0935, -0.9810,
         -0.7171,  1.4907],
        [-0.3117, -1.6283, -1.0737, -0.0849,  0.0432, -1.1670,  0.7933, -0.9270,
         -1.0110,  0.8907]], device='cuda:0', grad_fn=<EmbeddingBackward>)