# In [None]:
import torch.nn as nn
import torchvision.models as models
import torch
import math

class Encoder(nn.Module):
    """
        Computes the visual features for the model.

        A pretrained ResNet-50 with its last child module removed is used as
        a feature extractor; its output is flattened and projected to
        ``hidden_size`` by a trainable linear layer followed by batch
        normalisation.
    """
    def __init__(self, hidden_size):
        """
            Args:
                hidden_size: size of the encoder output
        """
        super(Encoder, self).__init__()
        # torchvision names the constructor ``resnet50`` (all lower case).
        res50_model = models.resnet50(pretrained=True)
        # Keep every child module except the last one (the ``fc`` classifier).
        layers = list(res50_model.children())[:-1]
        self.resNet50_model = nn.Sequential(*layers)
        self.linear = nn.Linear(res50_model.fc.in_features, hidden_size)
        self.batchNorm = nn.BatchNorm1d(hidden_size, momentum=0.01)

    def forward(self, x):
        """
            Forward pass computation.

            Args:
                x: batch of images in the layout expected by ResNet-50
            Returns:
                Tensor with one row of ``hidden_size`` features per input.
        """
        # The backbone runs without gradient tracking, so only ``linear`` and
        # ``batchNorm`` receive gradients.
        with torch.no_grad():
            x1 = self.resNet50_model(x)
        # Flatten everything after the batch dimension.
        x1 = x1.reshape(x1.size(0), -1)
        # Trainable
        x1 = self.linear(x1)
        x1 = self.batchNorm(x1)

        return x1

class Embedder(nn.Module):
    """ Thin wrapper around a learned embedding lookup table """
    def __init__(self, vocab_size, embed_size):
        """ Build a table with vocab_size rows of embed_size values each """
        super().__init__()
        table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.embedding = table

    def forward(self, x):
        """ Look up the embedding row for every index in x """
        vectors = self.embedding(x)
        return vectors

class positionalEncoder(nn.Module):
    """
        Implements the positional encoder used in Attention is All You Need paper.

        A fixed table of sinusoids is precomputed once and added to the
        (scaled) input embeddings:

            PE(pos, 2j)     = sin(pos / 10000 ** (2j / embed_size))
            PE(pos, 2j + 1) = cos(pos / 10000 ** (2j / embed_size))
    """
    def __init__(self, embed_size, max_caption_length=80):
        """
            Args:
                embed_size: size of the last dimension of the inputs
                max_caption_length: number of positions to precompute
        """
        super(positionalEncoder, self).__init__()
        self.embed_size = embed_size
        # Column vector of positions: (max_caption_length, 1).
        position = torch.arange(max_caption_length, dtype=torch.float).unsqueeze(1)
        # One inverse frequency per sin/cos column pair, computed in log space.
        inv_freq = torch.exp(
            torch.arange(0, embed_size, 2, dtype=torch.float)
            * (-math.log(10000.0) / embed_size)
        )
        angles = position * inv_freq
        posEncoder = torch.zeros(max_caption_length, embed_size)
        posEncoder[:, 0::2] = torch.sin(angles)
        # An odd embed_size has one fewer odd column than even columns.
        posEncoder[:, 1::2] = torch.cos(angles[:, :embed_size // 2])

        # Add a leading axis so the table broadcasts over the batch.
        posEncoder = posEncoder.unsqueeze(0)
        # A buffer is saved and moved with the module but is not a parameter,
        # so it is never trained.
        self.register_buffer('posEncoder', posEncoder)

    def forward(self, x):
        """
            Scale the embeddings and add the positional table.

            Args:
                x: tensor whose dim 1 is the sequence position and whose last
                   dim is embed_size
            Returns:
                Tensor of the same shape as x.
            Raises:
                ValueError: if x has more positions than were precomputed.
        """
        seq_len = x.size(1)
        if seq_len > self.posEncoder.size(1):
            raise ValueError(
                "sequence length {} exceeds max_caption_length {}".format(
                    seq_len, self.posEncoder.size(1)))
        # Make embeddings larger
        x = x * math.sqrt(self.embed_size)
        # The buffer carries no gradient; match the input's device and dtype.
        table = self.posEncoder[:, :seq_len].to(device=x.device, dtype=x.dtype)
        return x + table

    # Kept so existing callers of the capitalised name keep working.
    Forward = forward
    
# TODO: Create the attention masks.

class multiHeadAttention(nn.Module):
    """
        multiHeadAttention model.

        Projects queries, keys and values with separate linear layers, splits
        them into ``heads`` heads of size ``embed_size // heads``, applies
        attention per head, concatenates the heads and applies a final linear
        layer.
    """
    def __init__(self, heads, embed_size,dropout_percent=0.1,\
                 attentionType="scaledDotProduct", masked = False):
        """
            Args:
                heads: number of attention heads
                embed_size: size of the last dimension of q, k and v
                dropout_percent: dropout probability applied to the attention weights
                attentionType: name of the attention function used by Attention()
                masked: when True, a lower-triangular mask is applied so that
                        query position i only attends to key positions <= i
            Raises:
                ValueError: if embed_size is not divisible by heads.
        """
        super(multiHeadAttention, self).__init__()
        if embed_size % heads != 0:
            raise ValueError(
                "embed_size ({}) must be divisible by heads ({})".format(
                    embed_size, heads))
        self.masked = masked

        self.attentionType = attentionType
        self.embed_size = embed_size
        self.d_k = embed_size // heads
        self.h = heads

        self.q_linear = nn.Linear(embed_size, embed_size)
        self.v_linear = nn.Linear(embed_size, embed_size)
        self.k_linear = nn.Linear(embed_size, embed_size)

        self.dropout = nn.Dropout(dropout_percent)
        self.out = nn.Linear(embed_size, embed_size)

    def Attention(self, q, k, v, d_k,attentionType="scaledDotProduct", mask=None,
                  dropout=None):
        """
            Based on the type of attention network, we compute the values and return the output.

            Args:
                q, k, v: tensors laid out as (batch, heads, length, d_k)
                d_k: per-head size used to scale the dot products
                attentionType: only "scaledDotProduct" is implemented
                mask: optional tensor; positions equal to 0 are excluded. A head
                      axis is inserted at dim 1 before it is broadcast against
                      the score matrix.
                dropout: optional callable applied to the attention weights
            Raises:
                NotImplementedError: for the known but unimplemented types.
                ValueError: for any other attentionType.
        """
        if attentionType == "scaledDotProduct":
            scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
            if mask is not None:
                mask = mask.unsqueeze(1)
                # A large negative score becomes ~0 weight after the softmax.
                scores = scores.masked_fill(mask == 0, -1e9)
            scores = torch.softmax(scores, dim=-1)
            if dropout is not None:
                scores = dropout(scores)
            return torch.matmul(scores, v)
        if attentionType in ("contentBased", "locationBased", "dotProduct"):
            raise NotImplementedError("{} not implemented".format(attentionType))
        raise ValueError("Unknown attentionType: {!r}".format(attentionType))

    def forward(self, q, k, v, mask=None):
        """
            Compute the forward pass result of the network.

            Args:
                q, k, v: tensors whose first dim is the batch and whose last
                         dim is embed_size
                mask: optional mask forwarded to Attention()
            Returns:
                Tensor of shape (batch, query_length, embed_size).
        """
        batch_size = q.size(0)
        # Linear projection, split into h heads, then move the head axis
        # forward: (batch, h, length, d_k).
        k = self.k_linear(k).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        q = self.q_linear(q).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)

        if self.masked:
            # Lower-triangular mask: a query may not look at later keys.
            causal = torch.tril(torch.ones(
                q.size(-2), k.size(-2), dtype=torch.bool, device=q.device
            )).unsqueeze(0)
            mask = causal if mask is None else (mask != 0) & causal

        scores = self.Attention(q, k, v, self.d_k, self.attentionType,
                                mask, self.dropout)

        # Concatenate the heads and put them through the final linear layer
        concat = scores.transpose(1, 2).contiguous().view(batch_size, -1, self.embed_size)
        return self.out(concat)

    # Kept so existing callers of the capitalised name keep working.
    Forward = forward

class FeedForwardLayer(nn.Module):
    """
        Implements the feed forward layer of the Transformer model:
        linear -> ReLU -> dropout -> linear.
    """
    def __init__(self, embed_size, hidden_layer_units=2048, dropout_percent=0.1):
        """
            Args:
                embed_size: size of the input and of the output
                hidden_layer_units: width of the hidden layer
                dropout_percent: dropout probability applied after the ReLU
        """
        # The class name is FeedForwardLayer (capital F).
        super(FeedForwardLayer, self).__init__()
        self.linear_1 = nn.Linear(embed_size, hidden_layer_units)
        self.dropout = nn.Dropout(dropout_percent)
        self.linear_2 = nn.Linear(hidden_layer_units, embed_size)

    def forward(self, x):
        """
            Forward pass on the input.

            Args:
                x: tensor whose last dim is embed_size
            Returns:
                Tensor of the same shape as x.
        """
        x = self.dropout(torch.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x

    # Kept so existing callers of the capitalised name keep working.
    Forward = forward
    
class Norm(nn.Module):
    """
        Implements the normalization in the original paper.

        Each vector along the last dimension is centred, divided by its
        standard deviation and then scaled by ``alpha`` and shifted by
        ``bias`` (both learned, one value per feature).
    """
    def __init__(self, embed_size, eps=1e-6):
        """
            Args:
                embed_size: size of the last dimension of the inputs
                eps: small constant added to the standard deviation so the
                     division is defined when a vector is constant
        """
        super(Norm, self).__init__()
        self.size = embed_size
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        """
            Normalise x along its last dimension.

            Args:
                x: tensor whose last dim is embed_size
            Returns:
                Tensor of the same shape as x.
        """
        mean = torch.mean(x, dim=-1, keepdim=True)
        std = torch.std(x, dim=-1, keepdim=True)
        # Without eps a constant vector gives 0 / 0 = NaN.
        norm = self.alpha * (x - mean) / (std + self.eps) + self.bias
        return norm

    # Kept so existing callers of the capitalised name keep working.
    Forward = forward
    
class Decoder(nn.Module):
    """
        Implements the baseline model for the decoder.

        The constructor builds the parts of one Transformer decoder layer
        (two attention blocks, a feed-forward block and three layer norms),
        plus the token embedding, the positional encoder and an output
        projection followed by a softmax.
    """
    def __init__(self, vocab_size, embed_size, N, heads, \
                 hidden_units, attentionType="scaledDotProduct", dropout_percent=0.1):
        """
            Args:
                vocab_size - Number of entries in the token embedding table
                             and size of the output distribution
                embed_size - Size of each token embedding
                N - Stored on the instance; not used by this class.
                    NOTE(review): presumably the number of stacked decoder
                    layers, but only one layer's modules are built here - confirm.
                heads - Number of attention heads in each attention block
                hidden_units - Width of the hidden layer of the feed-forward block
                attentionType - Attention function name given to the attention blocks
                dropout_percent - Dropout probability given to the sub-modules
        """
        # Initialise nn.Module before any attribute is assigned.
        super(Decoder, self).__init__()
        self.N = N
        # Initializing the data members
        self.attentionType = attentionType
        self.dropout_percent = dropout_percent
        self.hidden_units_ff = hidden_units
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # TODO: Check whether the maximum caption length is actually 80?
        self.positionalEncoder = positionalEncoder(embed_size) #, max_caption_length=80)

        # One layer norm after each of the three sub-blocks.
        self.layerNorm_1 = nn.LayerNorm(embed_size, eps=1e-06)
        self.layerNorm_2 = nn.LayerNorm(embed_size, eps=1e-06)
        self.layerNorm_3 = nn.LayerNorm(embed_size, eps=1e-06)

        # TODO: Compute the mask for the attention layer
        self.attention_1 = multiHeadAttention(heads, embed_size, dropout_percent,
                                              attentionType=attentionType)
        self.attention_2 = multiHeadAttention(heads, embed_size, dropout_percent,
                                              attentionType=attentionType)
        self.ff = FeedForwardLayer(embed_size, hidden_units, dropout_percent)

        # Project decoder states to one score per vocabulary entry and
        # normalise over that (last) axis.
        # NOTE(review): the previous code projected to embed_size and took the
        # softmax over dim 1; confirm the vocabulary projection is intended.
        self.linear = nn.Linear(embed_size, vocab_size)
        self.output = nn.Softmax(dim=-1)

    def forward(self, features, targets, features_mask=None, targets_mask=None):
        """
            Computes the forward pass of the given model.

            Args:
                features: encoder output used as keys and values of the second
                          attention block.
                          NOTE(review): assumes its last dim equals embed_size - confirm.
                targets: integer token indices, (batch, seq_len)
                features_mask: optional mask for the second attention block
                targets_mask: optional mask for the first attention block
            Returns:
                Tensor (batch, seq_len, vocab_size) of probabilities over the
                vocabulary for every target position.
        """
        # The sub-modules in this file expose their computation as ``Forward``
        # (capital F), so it is invoked by that name.
        x = self.embedding(targets)
        x = self.positionalEncoder.Forward(x)
        # Attention over the targets, residual connection, then layer norm.
        x = self.layerNorm_1(x + self.attention_1.Forward(x, x, x, targets_mask))
        # Attention from the targets onto the encoder features.
        x = self.layerNorm_2(
            x + self.attention_2.Forward(x, features, features, features_mask))
        # Position-wise feed-forward block.
        x = self.layerNorm_3(x + self.ff.Forward(x))
        return self.output(self.linear(x))

    # Kept so existing callers of the capitalised name keep working.
    Forward = forward

    def Attention(self, q, k, v, d_k, attentionType="scaledDotProduct", mask=None,
                  dropout=None):
        """
            Based on the type of attention network, we compute the values and return the output.

            Args:
                q, k, v: tensors laid out as (batch, heads, length, d_k)
                d_k: per-head size used to scale the dot products
                attentionType: only "scaledDotProduct" is implemented
                mask: optional tensor; positions equal to 0 are excluded. A head
                      axis is inserted at dim 1 before it is broadcast against
                      the score matrix.
                dropout: optional callable applied to the attention weights
            Raises:
                NotImplementedError: for the known but unimplemented types.
                ValueError: for any other attentionType.
        """
        if attentionType == "scaledDotProduct":
            scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
            if mask is not None:
                mask = mask.unsqueeze(1)
                # A large negative score becomes ~0 weight after the softmax.
                scores = scores.masked_fill(mask == 0, -1e9)
            scores = torch.softmax(scores, dim=-1)
            if dropout is not None:
                scores = dropout(scores)
            return torch.matmul(scores, v)
        if attentionType in ("contentBased", "locationBased", "dotProduct"):
            raise NotImplementedError("{} not implemented".format(attentionType))
        raise ValueError("Unknown attentionType: {!r}".format(attentionType))
            
        