In [1]:
import os
import sys
sys.path.append('/home/animesh/demo/project/opt/cocoapi/PythonAPI')
from pycocotools.coco import COCO
import torch.utils.data as data
import numpy as np
import requests
import time
import torch
import torch.nn as nn
import torchvision.models as models
import math
from torchvision import transforms
from data_loader import get_loader

# Build two COCO API handles over the same split: one backed by the instance
# annotations and one backed by the caption annotations.
dataDir = '/home/animesh/demo/project/opt/cocoapi'
dataType = 'val2017'

# Annotation file locations, relative to dataDir, for the chosen split.
instances_relpath = 'annotations/instances_{}.json'.format(dataType)
captions_relpath = 'annotations/captions_{}.json'.format(dataType)
instances_annFile = os.path.join(dataDir, instances_relpath)
captions_annFile = os.path.join(dataDir, captions_relpath)

# Each COCO(...) call loads its annotation file into memory and builds an index.
coco = COCO(instances_annFile)
coco_caps = COCO(captions_annFile)

# Keys of the instance-annotation table, kept both as a view and as a list.
ckeys = coco.anns.keys()
ids = list(ckeys)

loading annotations into memory...
Done (t=0.49s)
creating index...
index created!
loading annotations into memory...
Done (t=0.05s)
creating index...
index created!


In [2]:
class EncoderCNN(nn.Module):
    """Image encoder: a frozen, pretrained ResNet-50 trunk plus a trainable linear projection.

    Args:
        embed_size: Length of the embedding vector produced for each image.
    """

    def __init__(self, embed_size):
        super().__init__()
        backbone = models.resnet50(pretrained=True)

        # The backbone is used as-is, so none of its weights should receive gradients.
        for weight in backbone.parameters():
            weight.requires_grad_(False)

        # Keep every child module of the ResNet except the last one (its `fc`
        # classification layer) and chain the survivors into one module.
        trunk_layers = list(backbone.children())[:-1]
        self.cnn = nn.Sequential(*trunk_layers)

        # Trainable projection into the embedding space. Its input width is the
        # input width of the removed `fc` layer, i.e. the size of the feature
        # vector the trunk hands over.
        self.embedding = nn.Linear(backbone.fc.in_features, embed_size)

    def forward(self, images):
        """Encode a batch of images.

        Args:
            images: Batch of image tensors accepted by the ResNet trunk.

        Returns:
            Tensor of shape (batch, embed_size).
        """
        trunk_out = self.cnn(images)
        # Collapse everything after the batch dimension into a single axis.
        flat = trunk_out.view(trunk_out.size(0), -1)
        return self.embedding(flat)
    
    
class DecoderRNN(nn.Module):
    """Caption decoder: word embedding -> LSTM -> linear layer over the vocabulary.

    Args:
        embed_size: Width of the word embeddings; must equal the width of the
            image feature vectors, since both are fed to the same LSTM.
        hidden_size: Width of the LSTM hidden state.
        vocab_size: Number of distinct word indices.
        num_layers: Number of stacked LSTM layers (default 1).
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super().__init__()

        # Lookup table from word index to embedding vector.
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)

        # batch_first=True: inputs and outputs are laid out as (batch, seq, feature).
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

        # Turns each hidden state into one unnormalised score per vocabulary word.
        self.linear = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, features, captions):
        """Score every vocabulary word at every step of the caption.

        Args:
            features: Image features of shape (batch, embed_size).
            captions: Integer word indices of shape (batch, seq_length).

        Returns:
            Tensor of shape (batch, seq_length, vocab_size) holding raw scores
            (no softmax is applied here).
        """
        # The final token of each caption is never fed in: with the image
        # feature prepended below, the input regains length seq_length.
        word_vectors = self.embed(captions[:, :-1])

        # (batch, embed_size) -> (batch, 1, embed_size) so the image acts as
        # the first element of the input sequence.
        image_step = features.unsqueeze(1)
        lstm_inputs = torch.cat((image_step, word_vectors), dim=1)

        hidden_states, _ = self.lstm(lstm_inputs)
        return self.linear(hidden_states)

In [3]:
# ---- Hyperparameters -------------------------------------------------------
batch_size = 128         # number of samples per batch
vocab_threshold = 5      # minimum word count threshold
vocab_from_file = True   # if True, load the existing vocab file
embed_size = 512         # dimensionality of image and word embeddings
hidden_size = 512        # number of features in the RNN decoder's hidden state
num_epochs = 3           # number of training epochs
print_every = 100        # a progress line is kept every `print_every` steps

# Per-channel normalisation constants (mean, std) for the pretrained model.
norm1 = (0.485, 0.456, 0.406)
norm2 = (0.229, 0.224, 0.225)

# ---- Image preprocessing / augmentation -----------------------------------
train_steps = [
    transforms.Resize(256),              # smaller image edge -> 256
    transforms.RandomCrop(224),          # random 224x224 crop
    transforms.RandomHorizontalFlip(),   # horizontal flip, default probability
    transforms.RandomVerticalFlip(0.1),  # vertical flip with probability 0.1
    transforms.ToTensor(),               # PIL image -> tensor
    transforms.Normalize(mean=norm1, std=norm2),
]
transform_train = transforms.Compose(train_steps)

# ---- Data loader -----------------------------------------------------------
loader_options = dict(
    transform=transform_train,
    mode='train',
    batch_size=batch_size,
    vocab_threshold=vocab_threshold,
    vocab_from_file=vocab_from_file,
)
data_loader = get_loader(**loader_options)

# Vocabulary size comes from the dataset behind the loader.
datas = data_loader.dataset
vocab_size = len(datas.vocab)

# ---- Model, loss, optimizer -----------------------------------------------
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

criterion = nn.CrossEntropyLoss()

# Only the decoder and the encoder's projection layer are optimised; the
# encoder's CNN trunk is not part of this list.
params = [*decoder.parameters(), *encoder.embedding.parameters()]
optimizer = torch.optim.Adam(params, lr=0.001)

# ---- Steps per epoch -------------------------------------------------------
# One step per batch: number of captions divided by the batch size, rounded up.
num_captions = len(datas.caption_lengths)
sampler_batch_size = data_loader.batch_sampler.batch_size
total_step = math.ceil(num_captions / sampler_batch_size)

Vocabulary successfully loaded from vocab.pkl file!
loading annotations into memory...
Done (t=1.03s)
creating index...


  0%|▎                                                                                                                                                            | 1004/591753 [00:00<00:58, 10035.61it/s]

index created!
Obtaining caption lengths...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 591753/591753 [00:36<00:00, 16030.26it/s]


In [None]:
# Train the decoder and the encoder's projection layer, writing one checkpoint
# per completed epoch into ./models.
num_epochs = 1

# torch.save does not create missing directories, so make sure the checkpoint
# directory exists before any training time is spent.
os.makedirs('./models', exist_ok=True)

# Epochs are numbered from 1: the progress line then reads "Epoch [1/1]" and
# every epoch, including the first, gets a checkpoint named after its number.
for epoch in range(1, num_epochs + 1):

    for i_step in range(1, total_step + 1):

        # Randomly sampling a caption length, and sample indices with that length.
        indices = data_loader.dataset.get_train_indices()

        # Swap in a sampler restricted to those indices, then pull one batch.
        new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
        data_loader.batch_sampler.sampler = new_sampler
        images, captions = next(iter(data_loader))

        # Clear gradients left over from the previous step.
        encoder.zero_grad()
        decoder.zero_grad()

        # Passing the inputs through the CNN-RNN model.
        features = encoder(images)
        outputs = decoder(features, captions)

        # Flatten (batch, seq, vocab) scores and (batch, seq) targets so the
        # criterion sees one row of scores per target word.
        loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))
        loss.backward()
        optimizer.step()

        # Perplexity is exp(loss); each label is followed by its own value.
        loss_value = loss.item()
        stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (
            epoch, num_epochs, i_step, total_step, loss_value, np.exp(loss_value))

        # Overwrite the same console line each step...
        print('\r' + stats, end="")
        # ...and keep a permanent line every `print_every` steps.
        if i_step % print_every == 0:
            print('\r' + stats)

    # Saving the weights after every epoch.
    torch.save(decoder.state_dict(), os.path.join('./models', 'decoder-%d.pkl' % epoch))
    torch.save(encoder.state_dict(), os.path.join('./models', 'encoder-%d.pkl' % epoch))
    torch.save(optimizer.state_dict(), os.path.join('./models', 'optim-%d.pkl' % epoch))

Epoch [5/5], Step [100/4624], Loss: 3.7198, Perplexity: 41.25523
Epoch [5/5], Step [200/4624], Loss: 3.3102, Perplexity: 27.39091
Epoch [5/5], Step [300/4624], Loss: 3.1293, Perplexity: 22.8578
Epoch [5/5], Step [400/4624], Loss: 3.0080, Perplexity: 20.24750
Epoch [5/5], Step [500/4624], Loss: 3.8884, Perplexity: 48.8309
Epoch [5/5], Step [600/4624], Loss: 3.0953, Perplexity: 22.0935
Epoch [5/5], Step [700/4624], Loss: 2.9158, Perplexity: 18.4628
Epoch [5/5], Step [800/4624], Loss: 2.7595, Perplexity: 15.7921
Epoch [5/5], Step [900/4624], Loss: 2.6937, Perplexity: 14.7869
Epoch [5/5], Step [1000/4624], Loss: 2.4682, Perplexity: 11.8010
Epoch [5/5], Step [1100/4624], Loss: 3.1929, Perplexity: 24.3601
Epoch [5/5], Step [1200/4624], Loss: 2.3458, Perplexity: 10.4418
Epoch [5/5], Step [1300/4624], Loss: 2.3857, Perplexity: 10.8669
Epoch [5/5], Step [1400/4624], Loss: 2.4690, Perplexity: 11.8107
Epoch [5/5], Step [1500/4624], Loss: 2.4626, Perplexity: 11.7348
Epoch [5/5], Step [1600/4624], 