### Install necessary libraries for the first run
Uncomment the cell below for installing the libraries. If this file is ran on Google Colab, only `transformers` library needs to be installed, rest all libraries are pre-installed in Google Colab 

In [None]:
# !pip install --upgrade torchvision
# !pip install transformers -qqq
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 -qqq

### Importing all necessary libraries 

In [3]:
# from google.colab import drive
import math
import random
import time
import torch
import os
import torchvision
from torchvision.models import resnet50
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tokenizers import Tokenizer
from PIL import Image
import numpy as np
from transformers import CLIPProcessor
import random
from torchvision import transforms

### Setting device in use for training the model

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Loading the tokenizer which was made for the Flickr dataset

In [4]:
tokenizer_path = "/home/ivlabs/Documents/Kshitij/archive/Flickr_tokenizer.json"
tokenizer = Tokenizer.from_file(tokenizer_path)
print(tokenizer.get_vocab_size())
tokenizer.enable_padding(pad_id=4)
# vars(Tokenizer)

2706


### Setting the seed for reproducing the results on multiple runs

In [5]:
SEED = 2424

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

### Dataset and Dataloader
For creating the tokenizer, comment out the `MyCollate()` class and run the next cell. After the tokenizer is created with the alias `tokenizer.json`, uncomment the `MyCollate()` class and rerun the cell.

In [34]:
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


class CaptioningDataset(Dataset):
  def __init__(self, split='train'):
    super().__init__()
    self.split=split
    
    data_path = "/home/ivlabs/Documents/Kshitij/archive/captions.txt"
    self.images_path = "/home/ivlabs/Documents/Kshitij/archive/Images/"

    with open(data_path) as f:
      lines = f.readlines()

    lines = lines[1:]
    random.shuffle(lines)
    
    images=[]
    captions=[]

    for some in lines:
      i,c = some.split(',',1)
      images.append(i)
      captions.append(c.rstrip('\n'))
      
    # images = images[1:]
    # captions = captions[1:]
    train_len = 30000

    test_len = (len(captions) - train_len)//2
    
    if self.split=='train':
      images = images[0:train_len]
      captions = captions[0:train_len]

    elif self.split=='test':
      images = images[train_len:train_len+test_len]
      captions = captions[train_len:train_len+test_len]

    elif self.split=='validation':
      images = images[train_len+test_len:train_len+(2*test_len)]
      captions = captions[train_len+test_len:train_len+(2*test_len)]

    self.images = images
    self.captions = captions

  def __len__(self):
    return len(self.images)


  def __getitem__(self, index):
    # print('here')
    want_caption = self.captions[index]
    want_image = self.images_path + self.images[index]
    want_img_location = want_image
    want_image = Image.open(want_image)
    # want_image = np.array(want_image.resize((224,224))).reshape(224,224,-1)
    # want_image = np.array(want_image.resize((224,224))).reshape(-1,224,224)
    want_image = preprocess(want_image)
    if want_image.shape[0]==1:
      want_image = np.concatenate((want_image,want_image,want_image),axis=0)
    # want_image = want_image.tolist()
    # print('here now')
    return want_image, want_caption, want_img_location

class MyCollate:
  def __init__(self):
    self.tokenizer = tokenizer

  def __call__(self,batch):
    images=[]
    captions=[]
    image_locations= []
    for i in batch:
      # print(i)
      images.append(i[0])
      captions.append(i[1])
      image_locations.append(i[2])
    
    # print(images)
    # print(captions)
    
    captions = self.tokenizer.encode_batch(captions)

    want_captions = []
    attn = []

    for i in captions:
      # print(i.ids)
      want_captions.append(i.ids)
      attn.append(i.attention_mask)
    # print(want_captions)
    want_captions = torch.Tensor(want_captions).int()
    attn = torch.Tensor(attn)
    # images = torch.Tensor(images)
    images = torch.stack(images)
    # print(want_captions.shape)
    return images, want_captions.T, attn.T, image_locations

### Uncomment the cell below for building the tokenizer


In [35]:
# from pathlib import Path
# from tokenizers import Tokenizer, processors
# from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
# from tokenizers.trainers import BpeTrainer, WordLevelTrainer,WordPieceTrainer, UnigramTrainer
# from tokenizers.pre_tokenizers import Whitespace, BertPreTokenizer
# dataset = CaptioningDataset()

# with open("/home/ivlabs/Documents/Kshitij/archive/dataset.txt",'a') as f:
#   for i in dataset.captions:
#     f.write(i)

# unk_token = "<UNK>"  # token for unknown words
# spl_tokens = ["<UNK>", "<SEP>", "<MASK>", "<CLS>","<PAD>"]  # special tokens


# tokenizer = Tokenizer(WordLevel(unk_token = unk_token))
# tokenizer.pre_tokenizer = BertPreTokenizer()
# trainer = WordLevelTrainer(special_tokens = spl_tokens,min_frequency=5)

# files = ["/home/ivlabs/Documents/Kshitij/archive/dataset.txt"]
# tokenizer.train(files, trainer)
# cls_token_id = tokenizer.token_to_id("<CLS>")
# sep_token_id = tokenizer.token_to_id("<SEP>")

# # tokenizer.post_processor = processors.TemplateProcessing(
# #     single=f"<CLS>:0 $A:0 <SEP>:0",
# #     pair=f"<CLS>:0 $A:0 <SEP>:0 $B:1 <SEP>:1",
# #     special_tokens=[("<CLS>", cls_token_id), ("<SEP>", sep_token_id)],
# # )
# tokenizer.save("/home/ivlabs/Documents/Kshitij/archive/Flickr_tokenizer.json")
# print("Tokenizer saved")


### Testing the dataloader 

In [37]:

dataset = CaptioningDataset(split='train')
trainloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=MyCollate())
DEVICE = 'cuda'
for i in trainloader:
  img = i[0]
  # print(i[0].pixel_values.shape)
  text = i[1]
  # print(text.shape)
  # print(text)
  break
print(img.shape)
print(text.shape)

### Building the Image encoder and Text decoder
The image encoder in this version of the model was chosen to be CNN (ResNet101) of which the last layer was removed to achieve a single 2048 dimensional embedding for each image. Different image encoders like ResNet10, ResNet50 etc. can also be used but in that case, the class variable `self.resnet_dim` has to be changed from 2048 to other respective dimensionalities. In this notebook, the text decoder is a Gated Reccurent Unit (GRU) 

In [39]:
# img_encoder_model = models.resnet18(weights=ResNet18_Weights.DEFAULT)
# img_encoder_model = torch.nn.Sequential(*(list(img_encoder_model.children())[:-1]))
# # for params in img_encoder_model.parameters():
# #         params.requires_grad = False

img_encoder_model = torchvision.models.resnet101(pretrained=True)
img_encoder_model = torch.nn.Sequential(*(list(img_encoder_model.children())[:-1]))

class Encoder(nn.Module):
  def __init__(self, img_dim, num_proj_layers):
    super().__init__()
    self.resnet_dim = 2048
    self.img_dim = img_dim
    self.img_enc = img_encoder_model

    self.num_proj_layers = num_proj_layers

    layers = []
    for i in range(num_proj_layers):
      if i==0:
        layers.append(nn.Sequential(nn.Linear(self.resnet_dim, self.img_dim),
                                    nn.ReLU()
                                    ))
      else:
        layers.append(nn.Sequential(nn.Linear(self.img_dim, self.img_dim),
                                    nn.ReLU()
                                    ))
    self.layers = nn.ModuleList(layers)  
  
  def forward(self, img):
    b = img.shape[0]
    img = self.img_enc(img)
    img = img.squeeze()
    # img = img.view(b,self.resnet_dim, -1).permute(0,2,1)
    # img = img.squeeze()
    for i in range(self.num_proj_layers):
      img = self.layers[i](img)
    return img.squeeze()

class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dec_dropout):
        super(Decoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.output_dim = vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, dropout=dec_dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dec_dropout)

    def forward(self, input=None, states=None, enc_output=None):
        if input is None:
          # print("input is NOne")
          embedding = enc_output
        else:
          # print("input is not NOne")
          embedding = self.dropout(self.embedding(input))                         # input = [1, batch_size]  embedding = [1, batch_size, embedding_dim]       
        
        if states is not None:
          # print(states[0].shape)
          # print(states[1].shape)
          output, states = self.rnn(embedding, states)                            # output = [seq_len+1, batch_size, num_directions*hidden_dim]   **Here number of directions is 1
        else:
          output, states = self.rnn(embedding) 
        output = self.fc(output).unsqueeze(0)                                   # output = [seq_len+1, batch_size, vocab_size]

        return output, states



### Combining the image encoder and text decoder into a single model

In [41]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, source, target, teacher_forcing_ratio):
        enc_output = self.encoder(source)                                       # source = [batch_size, ]  target = [batch_size, seq_len]  enc_output = [batch_size, img_dim
        # dec_states = enc_states
        batch_size = target.shape[1]                                                 
        seq_len = target.shape[0]
        predictions = torch.zeros(seq_len, batch_size, tokenizer.get_vocab_size()).to(device)
        input = enc_output.unsqueeze(0)                                       # input = [1, batch_size]
        # print(input.shape)
        for t in range(seq_len):
          # print(input)
          if t==0:
            # print("setting input None")
            output, dec_states = self.decoder(input=None, states=None, enc_output=input)
            # print(output.shape)
          else:
            # print("entered this")
            # print(input)
            output, dec_states = self.decoder(input, states=dec_states, enc_output=None) 
          output = output.squeeze()
          # print(output.shape)             
          predictions[t] = output.view(batch_size, self.decoder.output_dim)
          if random.random() < teacher_forcing_ratio:
              input = target[t].unsqueeze(0)
          else:
              input = output.argmax(-1).unsqueeze(0)

        return predictions

### Defining the training and other helper functions

In [42]:
def Train(iterator, model, criterion, optimizer, clip=1):
  model.train()
  epoch_loss=0
  for _, batch in enumerate(iterator):
    model.zero_grad()
    img = batch[0].to(device)
    text = batch[1].to(device)
    # img = img.pixel_values.to(device)
    # img = {'pixel_values':img}
    # print(text.shape)
    model_input_text = text[:-1,:]
    model_output_text = text[1:,:]
    outputs = model(img, model_input_text, teacher_forcing_ratio=0.8)
    outputs = outputs.view(-1, outputs.shape[-1])
    model_output_text = model_output_text.contiguous().view(-1)
    batch_loss = criterion(outputs, model_output_text.to(device).long())
    batch_loss.backward()
    # torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    epoch_loss += batch_loss.item()
    
  return epoch_loss/len(iterator)

def Epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return (elapsed_mins, elapsed_secs)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

### Defining the hyperparameters, loss function and optimizer

In [44]:
CLIP = 1
NUM_EPOCHS = 20
HIDDEN_DIM = 768
TRG_VOCAB_SIZE = tokenizer.get_vocab_size()
EMBEDDING_DIM = 768
NUM_LAYERS = 4
ENC_DROPOUT = 0.3
DEC_DROPOUT = 0.3
criterion = nn.CrossEntropyLoss(ignore_index = 4)
encoder = Encoder(img_dim=768, num_proj_layers=2).to(device)
decoder = Decoder(TRG_VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS, DEC_DROPOUT).to(device)
seq2seq = Seq2Seq(encoder, decoder).to(device)
print(f'The model has {count_parameters(seq2seq):,} trainable parameters.')
LR = 0.0001
optimizer = optim.Adam(seq2seq.parameters(), LR)

### Looping through the dataloader for training the model

In [48]:

MODEL_TYPE = "GRU"
OUTPUT_PATH = f"/home/ivlabs/Documents/Kshitij/thanmay/models/{MODEL_TYPE}"
MODEL_STORE_PATH = os.path.join(OUTPUT_PATH,f"{MODEL_TYPE}_checkpoint_epoch.pth")
EPOCH_SAVE = 4 # Save the model every EPOCH_SAVE epochs
outfile = open(os.path.join(OUTPUT_PATH, f"{MODEL_TYPE}_train_losses.txt"), "w")
outfile.write("Training Loss\tTraining PPL\n")

train_losses = []
valid_losses = []
min_losses = 100
prev_epoch = 1
# min_losses = [float('inf'), float('inf')]
NUM_EPOCHS = 40
start_time = time.time()
for epoch in range(1, NUM_EPOCHS + 1):
    train_loss = Train(iterator=trainloader, model=seq2seq, criterion=criterion, optimizer=optimizer, clip=1)
    train_losses.append(train_loss)
    if epoch % EPOCH_SAVE == 0:
        torch.save(seq2seq.state_dict(), MODEL_STORE_PATH.replace("epoch",str(epoch)))
    elapsed_time = Epoch_time(start_time, time.time())
    print(f"Time taken for epochs {prev_epoch} to {epoch}: {elapsed_time[0]}m {elapsed_time[1]}s")
    start_time = time.time()
    prev_epoch = epoch + 1
    print(f"Training Loss: {train_loss:.4f} ")
    print(f"Training PPL: {math.exp(train_loss):.4f} ")
    outfile.write(f"{train_loss:.4f}\t{math.exp(train_loss):.4f}\n")

outfile.close()

Time taken for epochs 1 to 1: 8m 18s
Training Loss: 4.9493 
Training PPL: 141.0814 
Time taken for epochs 2 to 2: 8m 5s
Training Loss: 4.5434 
Training PPL: 94.0084 
Time taken for epochs 3 to 3: 8m 7s
Training Loss: 4.3783 
Training PPL: 79.7019 
Time taken for epochs 4 to 4: 8m 6s
Training Loss: 4.2868 
Training PPL: 72.7315 
Time taken for epochs 5 to 5: 8m 7s
Training Loss: 4.2178 
Training PPL: 67.8809 
Time taken for epochs 6 to 6: 8m 7s
Training Loss: 4.1462 
Training PPL: 63.1940 
Time taken for epochs 7 to 7: 8m 7s
Training Loss: 4.0902 
Training PPL: 59.7536 
Time taken for epochs 8 to 8: 8m 8s
Training Loss: 4.0503 
Training PPL: 57.4174 
Time taken for epochs 9 to 9: 8m 6s
Training Loss: 4.0096 
Training PPL: 55.1252 
Time taken for epochs 10 to 10: 8m 6s
Training Loss: 3.9767 
Training PPL: 53.3415 
Time taken for epochs 11 to 11: 8m 7s
Training Loss: 3.9539 
Training PPL: 52.1371 
Time taken for epochs 12 to 12: 8m 6s
Training Loss: 3.9202 
Training PPL: 50.4091 
Time tak

### Loading the model for testing 

In [49]:
MODEL_TYPE = "GRU"
OUTPUT_PATH = f"/home/ivlabs/Documents/Kshitij/thanmay/models/{MODEL_TYPE}"
MODEL_STORE_PATH = os.path.join(OUTPUT_PATH,f"{MODEL_TYPE}_checkpoint_40.pth")
seq2seq.load_state_dict(
    torch.load(MODEL_STORE_PATH))

<All keys matched successfully>

In [50]:

test_dataset = CaptioningDataset(split='test')
testloader = DataLoader(test_dataset, batch_size=64, shuffle=True, collate_fn=MyCollate())

In [51]:
def testing(model, iterator, tokenizer):
    predictions = []
    locations = []
    captions = []
    model.eval()
    with torch.no_grad():
        for _, batch in enumerate(iterator):
            img = batch[0].to(device)
            text = batch[1].to(device) # shape = (trg len, batch_size)
            target = batch[1].to(device)
            batch_size = text.shape[1]
            model_input_text = text[:-1, :]
            model_output_text = text[1:, :]
            outputs = model(img, model_input_text, teacher_forcing_ratio=0.8)
            batch_locations = batch[-1]
            # print(locations)
            # print("===================")
            outputs = torch.softmax(outputs, dim=-1) # shape = (trg len, batch_size, vocab_size)
            outputs = torch.argmax(outputs, dim=-1) # shape = (batch_size, trg len)
            predictions.extend(tokenizer.decode_batch(outputs.T.tolist()))
            captions.extend(tokenizer.decode_batch(text.T.tolist()))
            locations.extend(batch_locations)
        return predictions, locations, captions


In [52]:
# !pip install evaluate -qqq
# !pip install rouge_score -qqq

import evaluate

meteor = evaluate.load('meteor')
rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')

2023-03-27 07:34:16.957731: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-27 07:34:18.074717: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/ros/noetic/lib::/home/ivlabs/.mujoco/mjpro150/bin
2023-03-27 07:34:18.074779: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/ros/noetic/lib::/home/ivlabs/.mujoco/mjpro150/bin
[nltk_data] Download

In [24]:
MODEL_TYPE = "GRU"
OUTPUT_PATH = f"/home/ivlabs/Documents/Kshitij/thanmay/models/{MODEL_TYPE}"
MODEL_STORE_PATH = os.path.join(OUTPUT_PATH,f"{MODEL_TYPE}_checkpoint_epoch.pth")
EPOCH_SAVE = 4 # Save the model every EPOCH_SAVE epochs
outfile = open(os.path.join(OUTPUT_PATH, f"{MODEL_TYPE}_scores.txt"), "w")
outfile.write("EPOCH\tBLEU\tMETEOR\tROUGE1\nROUGE2\tROUGE_L\tROUGE_Lsum\n")

NUM_EPOCHS = 40
for epoch in range(EPOCH_SAVE, NUM_EPOCHS + 1, EPOCH_SAVE):
    seq2seq.load_state_dict(torch.load(MODEL_STORE_PATH.replace("epoch",str(epoch))))
    predictions, locations, captions = testing(seq2seq,testloader,tokenizer)
    bleu_results = bleu.compute(predictions=predictions, references=captions)
    meteor_results = meteor.compute(predictions=predictions, references=captions)
    rouge_results = rouge.compute(predictions=predictions, references=captions)
    outfile.write(f"{epoch}\t{bleu_results['bleu']}\t{meteor_results['meteor']}\t{rouge_results['rouge1']}\t{rouge_results['rouge2']}\t{rouge_results['rougeL']}\t{rouge_results['rougeLsum']}\n")    
outfile.close()


In [25]:
# captions = tokenizer.decode_batch(captions)
# print(torch.Tensor(captions))
bleu_results = bleu.compute(predictions=predictions, references=captions)
meteor_results = meteor.compute(predictions=predictions, references=captions)
rouge_results = rouge.compute(predictions=predictions, references=captions)