# Word embedding model using N-gram

## Imports

In [1]:
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

print(f'PyTorch version: {torch.__version__}')

# Device configuration: report whether CUDA is usable and pick the matching device.
if torch.cuda.is_available():
    print("GPU found :)")
    device = torch.device('cuda')
else:
    print("No GPU :(")
    device = torch.device('cpu')

from text_preprocessing import TextPreprocessor

PyTorch version: 1.7.1
GPU found :)


## Dimensions

In [2]:
# Number of consecutive words used as context when predicting the following word.
CONTEXT_SIZE = 4
# Width of each word vector in the embedding table.
EMBEDDING_SIZE = 512

## Preprocessing

In [3]:
# Build the text preprocessor from the ';'-separated training annotations file.
# The rest of the notebook reads `tp.raw_sentences`, `tp.word_to_idx` and
# `tp.vocab_size` from this object.
tp = TextPreprocessor(
    './flickr8k/annotations/annotations_image_id_train.csv',
    sep=';',
)

In [4]:
# Wrap every sentence in <start>/<stop> markers and flatten the whole corpus
# into a single list of whitespace-separated tokens.
raw_sentences = [
    token
    for sentence in tp.raw_sentences
    for token in ('<start> ' + sentence + ' <stop>').split()
]

# Slide a window over the token list: each sample pairs CONTEXT_SIZE
# consecutive tokens with the token that immediately follows them.
# NOTE(review): the window runs over the flattened corpus, so some contexts
# span a '<stop>' / '<start>' boundary between two sentences - confirm this
# is intended.
ngrams = [
    (raw_sentences[start:start + CONTEXT_SIZE], raw_sentences[start + CONTEXT_SIZE])
    for start in range(len(raw_sentences) - CONTEXT_SIZE)
]

print(ngrams[:3])

[(['<start>', 'A', 'child', 'in'], 'a'), (['A', 'child', 'in', 'a'], 'pink'), (['child', 'in', 'a', 'pink'], 'dress')]


In [5]:
class NgramsDataset(Dataset):
    """Dataset of n-gram samples encoded as vocabulary indices.

    Each item is built from one ``(context_words, target_word)`` pair and is
    returned as a dict of tensors, ready to be batched by a ``DataLoader``.

    Args:
        ngrams: indexable collection of ``(context_words, target_word)``
            pairs, where ``context_words`` is a list of words and
            ``target_word`` is the word that follows them.
        word_to_idx: optional callable mapping a word to its integer index.
            When omitted, the notebook-level ``tp.word_to_idx`` is looked up
            at item-access time, which is the original behaviour.
    """

    def __init__(self, ngrams, word_to_idx=None):
        self.ngrams = ngrams
        self.word_to_idx = word_to_idx

    def __len__(self):
        """Return the number of n-gram samples."""
        return len(self.ngrams)

    def __getitem__(self, index):
        """Return ``{'context': LongTensor, 'target': LongTensor}`` for one sample."""
        # Samplers may hand over the index as a tensor; convert it so it can
        # be used to index a plain Python list.
        if torch.is_tensor(index):
            index = index.tolist()

        # Resolve the encoder lazily so that the default keeps following the
        # notebook's current `tp` object.
        encode = self.word_to_idx if self.word_to_idx is not None else tp.word_to_idx

        entry = self.ngrams[index]
        context = [encode(word) for word in entry[0]]
        target = encode(entry[1])

        return {'context': torch.tensor(context), 'target': torch.tensor(target)}


In [6]:
# Number of n-gram samples per optimisation step.
batch_size = 1024

# Wrap the n-grams in a Dataset and serve them in shuffled mini-batches.
dataset = NgramsDataset(ngrams)
train_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
)

## Model

In [7]:
class NGram(nn.Module):
    """Feed-forward n-gram language model with a learned embedding table.

    The context word indices are embedded, the embeddings are concatenated,
    passed through one ReLU hidden layer and projected to log-probabilities
    over the vocabulary.

    Args:
        vocab_size: number of rows in the embedding table and size of the
            output distribution.
        embedding_dim: width of each word embedding.
        context_size: number of context words per sample.
        hidden_dim: width of the hidden layer (defaults to 128, the value
            that was previously hard-coded).
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim=128):
        super().__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities of the next word.

        Args:
            inputs: integer tensor of word indices, either a batch of shape
                ``(batch, context_size)`` or a single context of shape
                ``(context_size,)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` holding log-softmax
            scores; a single context yields a batch of one.
        """
        # Flatten to the first layer's input width instead of relying on
        # len(inputs), so an un-batched context is treated as a batch of one.
        embeds = self.embeddings(inputs).view(-1, self.linear1.in_features)
        hidden = F.relu(self.linear1(embeds))
        logits = self.linear2(hidden)

        return F.log_softmax(logits, dim=1)

## Model training

In [8]:
num_epoch = 10
step_count = len(train_loader)
# Log about ten times per epoch. Clamp the interval to at least 1 so a loader
# with fewer than 10 batches does not cause a modulo-by-zero below.
log_every = max(1, step_count // 10)
loss_function = nn.NLLLoss()
model = NGram(tp.vocab_size, EMBEDDING_SIZE, CONTEXT_SIZE).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)


for epoch in range(num_epoch):
    for i, sample in enumerate(train_loader):

        # Step 1. Move the batch to the device the model lives on.
        context = sample['context'].to(device)
        target = sample['target'].to(device)

        # Step 2. Torch *accumulates* gradients, so clear the ones left over
        # from the previous batch before computing new ones.
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words.
        log_probs = model(context)

        # Step 4. Compute the loss; NLLLoss expects log-probabilities and the
        # target word indices.
        loss = loss_function(log_probs, target)

        # Step 5. Do the backward pass and update the parameters.
        loss.backward()
        optimizer.step()

        # Progress report
        if (i + 1) % log_every == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epoch}]"
                f", step [{i + 1}/{step_count}]"
                f", loss: {loss.item():.4f}"
            )

Epoch [1/10], step [40/404], loss: 4.2476
Epoch [1/10], step [80/404], loss: 3.6838
Epoch [1/10], step [120/404], loss: 3.5214
Epoch [1/10], step [160/404], loss: 3.5556
Epoch [1/10], step [200/404], loss: 3.4874
Epoch [1/10], step [240/404], loss: 3.1855
Epoch [1/10], step [280/404], loss: 3.4051
Epoch [1/10], step [320/404], loss: 3.2361
Epoch [1/10], step [360/404], loss: 3.2790
Epoch [1/10], step [400/404], loss: 3.2117
Epoch [2/10], step [40/404], loss: 2.9381
Epoch [2/10], step [80/404], loss: 2.9675
Epoch [2/10], step [120/404], loss: 3.0198
Epoch [2/10], step [160/404], loss: 2.9371
Epoch [2/10], step [200/404], loss: 2.9130
Epoch [2/10], step [240/404], loss: 2.9320
Epoch [2/10], step [280/404], loss: 2.7778
Epoch [2/10], step [320/404], loss: 2.9541
Epoch [2/10], step [360/404], loss: 2.8595
Epoch [2/10], step [400/404], loss: 2.8264
Epoch [3/10], step [40/404], loss: 2.7966
Epoch [3/10], step [80/404], loss: 2.6855
Epoch [3/10], step [120/404], loss: 2.7185
Epoch [3/10], ste

In [None]:
# Save the trained weights. Create the target directory first so a missing
# './models' folder does not make the save fail after training has finished.
model_path = Path('./models/ngram')
model_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), model_path)