# Import Libraries

In [10]:
import torch
import torch.nn as nn
import torchvision.models as models

# Encoder

In [11]:
# Build a torchvision ResNet-152 and request its pretrained weights
# (pretrained=True); the weights are fetched on first use if not cached.
resnet152_model = models.resnet152(pretrained=True)

In [12]:
# Persist only the parameters/buffers (state_dict) to 'resnet152_model.pth'
# in the current working directory so they can be reloaded later with
# load_state_dict (e.g. through Encoder's model_weight_path argument).
torch.save(resnet152_model.state_dict(), 'resnet152_model.pth')

In [13]:
# Display the final fully connected layer; the recorded cell output shows
# 2048 input features and 1000 outputs. This is the layer the Encoder
# replaces with an embed_size-wide projection.
resnet152_model.fc

Linear(in_features=2048, out_features=1000, bias=True)

In [14]:
class Encoder(nn.Module):
    """Image encoder: a frozen ResNet-152 backbone with a trainable projection head."""

    # constructor
    def __init__(self, embed_size, pretrained=True, model_weight_path=None):
        """
        Encoder is the first part of our model.
        The main purpose of the encoder is to extract useful features from an image.
        We use the ResNet-152 architecture pre-trained on the ImageNet dataset.
        Parameters
        ----------
        :param embed_size (int): size of the encoder output; it must match the input size of the decoder
        :param pretrained (bool): if True, let torchvision provide the pre-trained weights;
            if False, load them from ``model_weight_path`` instead
        :param model_weight_path (string): path to a saved ResNet-152 ``state_dict``;
            required when ``pretrained`` is False, ignored otherwise
        :raises ValueError: if ``pretrained`` is False and no ``model_weight_path`` is given
        """
        super().__init__()
        # Load resnet152 pre-trained on ImageNet
        if pretrained:
            self.resnet152 = models.resnet152(pretrained=True)
        else:
            # Fail early with a clear message instead of an obscure error
            # raised from inside torch.load(None).
            if model_weight_path is None:
                raise ValueError(
                    "model_weight_path must be provided when pretrained=False"
                )
            self.resnet152 = models.resnet152(pretrained=False)
            # map_location="cpu" lets weights saved on a GPU be loaded on a
            # CPU-only machine; load_state_dict copies them into the model.
            state_dict = torch.load(model_weight_path, map_location="cpu")
            self.resnet152.load_state_dict(state_dict)

        # Freeze the parameters of the pre-trained model
        for param in self.resnet152.parameters():
            param.requires_grad_(False)

        # Replace the last fully connected layer so it outputs embed_size
        # features. The new layer is created after the freeze above, so it is
        # the only trainable part of the backbone.
        self.resnet152.fc = nn.Linear(self.resnet152.fc.in_features, embed_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, images):
        """
        Compute the feature vectors of a batch of images.
        :param images (Tensor): image batch accepted by torchvision's ResNet
            (presumably (batch, 3, H, W) - confirm against the data loader)
        :return (Tensor): features of shape (batch, embed_size) after ReLU and dropout
        """
        features = self.resnet152(images)
        return self.dropout(self.relu(features))
    

# Decoder

In [15]:
class Decoder(nn.Module):
    """LSTM caption decoder fed with an image feature vector followed by word embeddings."""

    # constructor
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        """
        Decoder is the second part of our model.
        Decoder takes as input the outputs of the encoder: the image feature vectors.
        The input of the decoder and the output of the encoder must be the same size.
        Parameters
        ----------
        :param embed_size (int) : Dimensionality of image and word embeddings
        :param hidden_size (int) : number of features in hidden state of the RNN decoder
        :param vocab_size  (int) : The size of vocabulary or output size
        :param num_layers (int) : Number of layers

        """
        super().__init__()

        self.hidden_size = hidden_size

        self.embed = nn.Embedding(vocab_size, embed_size)

        # batch_first=True: the LSTM consumes and produces tensors laid out
        # as (batch, seq_len, features).
        self.lstm = nn.LSTM(input_size=embed_size,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=True)

        self.linear = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(0.5)

        self.init_weights()

    def forward(self, features, captions):
        """
        Predict vocabulary scores for every step of the caption sequence.
        :param features (Tensor): image features of shape (batch, embed_size)
        :param captions (LongTensor): token indices of shape (batch, seq_len)
        :return (Tensor): scores of shape (batch, seq_len + 1, vocab_size);
            the extra step is the image feature prepended to the sequence
        """
        embeddings = self.dropout(self.embed(captions))
        # Prepend the image feature as the first time step. The LSTM is
        # batch_first, so the sequence axis is dim 1 (not dim 0).
        embeddings = torch.cat((features.unsqueeze(1), embeddings), dim=1)
        lstm_out, _ = self.lstm(embeddings)
        outputs = self.linear(lstm_out)
        return outputs

    def init_weights(self):
        """Initialize the embedding and output-layer weights."""
        # embedding layer
        self.embed.weight.data.uniform_(-0.1, 0.1)
        # fully connected layer
        torch.nn.init.xavier_uniform_(self.linear.weight)
        self.linear.bias.data.fill_(0.01)

# Model

In [16]:
class Model(nn.Module):
    """Image captioning model: an Encoder whose output feeds a Decoder."""

    # constructor
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, pretrained=True, model_weight_path=None):
        """
        The Image Captioning model.
        It passes the images to the encoder, whose output is fed to the decoder.
        Parameters
        ----------
        :param embed_size (int) : Dimensionality of image and word embeddings
        :param hidden_size (int) : number of features in hidden state of the RNN decoder
        :param vocab_size  (int) : The size of vocabulary or output size
        :param num_layers (int) : Number of layers
        :param pretrained (bool): if we want to load the pre-trained weights or not
        :param model_weight_path (string): path to the pre-trained weights
        """
        super().__init__()
        # Forward the caller's choices to the encoder unchanged.
        self.encoder = Encoder(embed_size, pretrained=pretrained, model_weight_path=model_weight_path)
        self.decoder = Decoder(embed_size, hidden_size, vocab_size, num_layers)

    def forward(self, images, captions):
        """
        Encode the images, then decode them together with the captions.
        :param images (Tensor): image batch passed to the encoder
        :param captions (Tensor): caption token indices passed to the decoder
        :return (Tensor): the decoder outputs
        """
        features = self.encoder(images)
        outputs = self.decoder(features, captions)
        return outputs