In [3]:
import math
from model import EncoderCNN, DecoderRNN
from data_loader import get_loader
from data_loader_val import get_loader as val_get_loader
from pycocotools.coco import COCO
from torchvision import transforms 
from tqdm.notebook import tqdm
import torch.nn as nn
import torch
import torch.utils.data as data
from collections import defaultdict
import json
import os
import sys
import numpy as np
from nlp_utils import clean_sentence, bleu_score

%load_ext autoreload
%autoreload 2
    
# Define EncoderCNN class
class EncoderCNN(nn.Module):
    def __init__(self,embed_size):
        super(EncoderCNN, self).__init__()
        resnet= models.resnet50(pretrained=True )

# Disabled learning for perameters
        for param in resnet.parameters():
            param.requires_grad_(False)

        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        self.embed = nn.Linear(resnet.fc.in_features, embed_size)

    def Forward(self, images):
        features = self.resnet(images)
        features = features.view(features.size(0), -1)
        features = self.embed(features)
        return features

[nltk_data] Downloading package punkt to /home/hariom/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# This class define the decoder part CNN-RNN model for image caption
# Decoder

class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        """
        Args:
            embed_size: final embedding size of the CNN encoder
            hidden_size: hidden size of the LSTM
            vocab_size: size of the vocabulary
            num_layers: number of layers of the LSTM
        """
        super(DecoderRNN, self).__init__()

        # Assigning hidden dimensions
        self.hidden_dim = hidden_size 

        # Map each word index to a dense word embedding tensor of embed_size
        self.embed = nn.Embedding(vocab_size, embed_size)

        # Creating LSTM Layer
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)

        # Apply a linear layer to the output of the LSTM
        self.linear = nn.Linear(hidden_size, vocab_size)

        # Initialization of the hidden state
        self.hidden = (torch.zeros(1, 1, hidden_size), torch.zeros(1, 1, hidden_size))

    def forward(self, features, captions):
        """
        Args:
            features: features tensor. shape is (bs, embed_size)
            captions: captions tensor. shape is (bs, cap_length)
        Returns:
            outputs: scores of the linear layer
        """

        # Remove <end> from captions and embed captions
        cap_embedding = self.embed(captions[:, :-1])

        # Concatenate the features and the caption embeddings
        embeddings = torch.cat((features.unsqueeze(dim=1), cap_embedding), dim=1)

        # Pass the embeddings through the LSTM
        lstm_out, self.hidden = self.lstm(embeddings)

        # Apply the linear layer to the LSTM output
        outputs = self.linear(lstm_out)

        return outputs

    def sample(self, inputs, states=None, max_len=20):
        """
        Accepts pre-processed image tensor (inputs) and returns predicted
        sentence (list of tensor ids of length max_len)
        Args:
            inputs: shape is (1, 1, embed_size)
            states: initial hidden state of the LSTM
            max_len: maximum length of the predicted sentence
        Returns:
            res: list of predicted word indices
        """
        res = []

        # Now we feed the LSTM output
        for i in range(max_len):
            lstm_out, states = self.lstm(inputs, states)
            outputs = self.linear(lstm_out.squeeze(dim=1))
            _, predicted_idx = outputs.max(dim=1)
            res.append(predicted_idx.item())

            if predicted_idx == 1:  # Assuming 1 is the index for <end>
                break
            inputs = self.embed(predicted_idx)
            inputs = inputs.unsqueeze(1)

        return res

In [5]:
# We need to know the length of our dataset vocabulary
from data_loader import get_loader
from data_loader_val import get_loader as val_get_loader
from torchvision import transforms 

# Dataset directory path
cocoapi_dir = r"../cocoapi/"

# Configuration parameters
batch_size = 128  # Batch size
vocab_threshold = 5  # Minimum word count threshold
vocab_from_file = True  # If True, load existing vocab file
embed_size = 256  # Dimensionality of image and word embeddings
hidden_size = 512  # Number of features in hidden state of the RNN decoder
num_epochs = 3  # Number of training epochs
save_every = 1  # Determines frequency of saving model weights
print_every = 20  # Determines window for printing average loss
log_file = "training_log.txt"  # Name of file with saved training loss and perplexity

# Image transformations for training
transform_train = transforms.Compose([
    # Resize the smaller edge of the image to 256
    transforms.Resize(256),
    # Get a 224x224 crop from a random location
    transforms.RandomCrop(224),
    # Horizontally flip the image with probability=0.5
    transforms.RandomHorizontalFlip(),
    # Convert the PIL image to a tensor
    transforms.ToTensor(),
    transforms.Normalize(
        (0.485, 0.456, 0.406),
        (0.229, 0.224, 0.225),
    ),
])

# Build dataloader
data_loader = get_loader(
    transform=transform_train,
    mode="train",
    batch_size=batch_size,
    vocab_threshold=vocab_threshold,
    vocab_from_file=vocab_from_file,
    cocoapi_loc=cocoapi_dir,
)

# Get vocabulary size
vocab_size = len(data_loader.dataset.vocab)
print("vocab size is: ",vocab_size)

Vocabulary successfully loaded from vocab.pkl file!
loading annotations into memory...
Done (t=5.20s)
creating index...


KeyError: 'id'

In [7]:
import torch
import torch.nn as nn
import torchvision.models as models

# Define or import vocab_size
vocab_size = 11543  # Example value, replace with the actual size of your vocabulary
embed_size = 256    # Example value, ensure it's defined
hidden_size = 512   # Example value, ensure it's defined

# Initialize the encoder and decoder
encoder = EncoderCNN(embed_size)  # Make sure EncoderCNN is defined and imported if necessary
decoder = DecoderRNN(embed_size, vocab_size, hidden_size)  # Ensure DecoderRNN is defined and imported

# Move models to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

DecoderRNN(
  (embed): Embedding(512, 256)
  (lstm): LSTM(256, 11543, batch_first=True)
  (linear): Linear(in_features=11543, out_features=512, bias=True)
)

In [8]:
# Define parameters
embed_size = 256  # Dimensionality of word embeddings
vocab_size = 11543  # Number of unique words in vocabulary

# Instantiate an embedding layer
embed = nn.Embedding(vocab_size, embed_size)

# Example word index
word_index = 10  # Example index of a word

# Get the embedding for this word
word_embedding = embed(torch.tensor(word_index))

# Print the shape of the embedding for a single word
print("Shape of the embedding for a single word:", word_embedding.shape)

# Get the whole embedding matrix
embedding_matrix = embed.weight.data

# Print the shape of the embedding matrix
print("Shape of the embedding matrix:", embedding_matrix.shape)

Shape of the embedding for a single word: torch.Size([256])
Shape of the embedding matrix: torch.Size([11543, 256])
