# Continuous Bag-of-Words

In [1]:
import pandas as pd

In [2]:
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Device configuration: prefer CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Report the environment the notebook is running in.
print(f'PyTorch version: {torch.__version__}')
print("GPU found :)" if device.type == 'cuda' else "No GPU :(")

PyTorch version: 1.7.1
GPU found :)


## Retrieve vocabulary

In [3]:
# Read the semicolon-separated annotation file into a DataFrame and print
# a summary of its columns.
df_vocab = pd.read_csv(
    filepath_or_buffer='./flickr8k/annotations/annotations_image_id.csv',
    sep=';',
)
df_vocab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40460 entries, 0 to 40459
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   image_id  40460 non-null  object
 1   caption   40460 non-null  object
dtypes: object(2)
memory usage: 632.3+ KB


In [4]:
# Second column of the annotation frame, as a plain Python list.
raw_sentences = list(df_vocab.iloc[:, 1])

# Build the token stream: join all sentences with single spaces and split on
# whitespace. A single join is linear in the corpus size (repeated `+=` in a
# loop can be quadratic) and also works when the frame has no rows.
raw_text = ' '.join(raw_sentences).split()

# Get vocabulary: the set of distinct tokens (no lower-casing or other
# normalisation is applied).
vocab = set(raw_text)
vocab_size = len(vocab)
print(vocab_size)

9629


## Build CBOW

In [13]:
# Size of the context of one word, i.e. the number of words kept on the left
# and on the right of the target word.
CONTEXT_SIZE = 2
EMBEDDING_DIM = 300

# Map each word to an index.
# Enumerate the vocabulary in sorted order: iteration order of a set of
# strings can change between interpreter runs (hash randomisation), which
# would make the word -> index mapping differ after a kernel restart and
# silently mismatch any previously saved embedding weights.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}

# Build the data to train the model: for every position that has
# CONTEXT_SIZE tokens on both sides, pair the surrounding tokens (left
# window followed by right window) with the token at that position.
# NOTE(review): raw_text is one flat token list, so windows are taken across
# whatever was concatenated into it — confirm that is intended.
data = [
    (
        raw_text[i - CONTEXT_SIZE:i] + raw_text[i + 1:i + CONTEXT_SIZE + 1],
        raw_text[i],
    )
    for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE)
]

print(data[:3])

[(['A', 'child', 'a', 'pink'], 'in'), (['child', 'in', 'pink', 'dress'], 'a'), (['in', 'a', 'dress', 'is'], 'pink')]
[(0, (['A', 'child', 'a', 'pink'], 'in')), (1, (['child', 'in', 'pink', 'dress'], 'a')), (2, (['in', 'a', 'dress', 'is'], 'pink'))]


In [6]:
class CBOW(nn.Module):
    """Predict a target word from the embeddings of its context words.

    The embeddings of the ``2 * context_size`` context words are concatenated
    and passed through a 128-unit ReLU hidden layer followed by a linear layer
    over the vocabulary; the output is a log-probability per vocabulary entry.

    Args:
        context_size: number of context words on each side of the target.
        embedding_dim: size of each word embedding.
        vocab_size: number of distinct words (embedding rows / output classes).
    """

    def __init__(self, context_size, embedding_dim, vocab_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * 2 * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary.

        Args:
            inputs: word-index tensor, either 1-D with ``2 * context_size``
                entries (a single example) or 2-D with shape
                ``(batch, 2 * context_size)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)``; ``batch`` is 1 for a
            1-D input.
        """
        # Treat a single 1-D example as a batch of one so both input forms
        # share the same code path.
        if inputs.dim() == 1:
            inputs = inputs.unsqueeze(0)

        # (batch, n_context, embedding_dim) -> (batch, n_context * embedding_dim)
        embeds = self.embeddings(inputs).flatten(start_dim=1)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [7]:
def make_context_vector(context, word_to_ix):
    """Convert a sequence of context words into a tensor of word indices.

    Args:
        context: iterable of words; each must be a key of ``word_to_ix``.
        word_to_ix: mapping from word to integer index.

    Returns:
        1-D ``torch.long`` tensor on the notebook-level ``device`` holding
        the index of each word, in order.
    """
    indices = []
    for word in context:
        indices.append(word_to_ix[word])
    return torch.tensor(indices, dtype=torch.long, device=device)

## Train the model

In [None]:
losses = []
num_epoch = 5
step_count = len(data)
loss_function = nn.NLLLoss()
model = CBOW(CONTEXT_SIZE, EMBEDDING_DIM, vocab_size).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001)

# Convert every (context, target) pair to word indices once, up front, and
# move them to the device as two tensors. The indices never change between
# epochs, so doing this inside the loop would repeat the same lookups and
# host-to-device copies on every step of every epoch.
context_indices = torch.tensor(
    [[word_to_ix[word] for word in context] for context, _ in data],
    dtype=torch.long,
    device=device,
)
target_indices = torch.tensor(
    [word_to_ix[target] for _, target in data],
    dtype=torch.long,
    device=device,
)

model.train()

for epoch in range(num_epoch):
    total_loss = 0
    for i in range(step_count):

        # Reset grad
        optimizer.zero_grad()

        # Run forward and get log probabilities over the word that matches the context
        log_probs = model(context_indices[i])

        # Compute loss; slicing keeps the target as a 1-element batch
        loss = loss_function(log_probs, target_indices[i:i + 1])

        # Do the backward pass and update the parameters
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        step_loss = loss.item()
        total_loss += step_loss

        if (i + 1) % 1000 == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epoch}]"
                f", step [{i + 1}/{step_count}]"
                f", loss: {step_loss:.4f}"
                f", total loss: {total_loss:.4f}"
            )

    losses.append(total_loss)

# Summed loss per epoch; inspect it to check that training is making progress.
print(losses)

Epoch [1/5], step [1000/476702], loss: 8.9400, total loss: 8816.3221
Epoch [1/5], step [2000/476702], loss: 0.5576, total loss: 16779.3860
Epoch [1/5], step [3000/476702], loss: 4.9743, total loss: 23815.3299
Epoch [1/5], step [4000/476702], loss: 2.8110, total loss: 30178.1256
Epoch [1/5], step [5000/476702], loss: 9.2805, total loss: 36445.2720
Epoch [1/5], step [6000/476702], loss: 7.0085, total loss: 42226.7163
Epoch [1/5], step [7000/476702], loss: 0.4405, total loss: 47824.1943
Epoch [1/5], step [8000/476702], loss: 6.1613, total loss: 53576.7170
Epoch [1/5], step [9000/476702], loss: 4.5952, total loss: 59313.2254
Epoch [1/5], step [10000/476702], loss: 8.4296, total loss: 64603.4270
Epoch [1/5], step [11000/476702], loss: 2.1138, total loss: 69429.6583
Epoch [1/5], step [12000/476702], loss: 0.0878, total loss: 74385.3465
Epoch [1/5], step [13000/476702], loss: 1.9168, total loss: 79567.2594
Epoch [1/5], step [14000/476702], loss: 1.3770, total loss: 84586.6596
Epoch [1/5], ste

In [None]:
# Save model
# torch.save needs a file path, not a directory; create the target
# directory first so the cell also works on a fresh checkout.
checkpoint_path = Path('./models/cbow.pt')
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)

In [None]:
# Load model for test
model = CBOW(CONTEXT_SIZE, EMBEDDING_DIM, vocab_size).to(device)
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
# machine (and vice versa) by remapping tensors to the current device.
model.load_state_dict(torch.load('./models/cbow.pt', map_location=device))
# Switch to inference mode before testing.
model.eval()