In [1]:
import torch
import torch.nn as nn

In [None]:
"""
import torch
import torch.nn as nn
    ## Define the Embedding layer
embedding = nn.Embedding(num_embeddings=8, embedding_dim=4)

    ## Sample input tensor with token indices
indices = torch.tensor([[1, 2, 4, 5]])

## indices = torch.tensor([[1, 5, 2, 9]])       ## Error code => IndexError: index out of range

    ## Retrieve the embeddings
embedded = embedding(indices[0])

print(indices.shape)    ## [1, 4]
print("Embedding Matrix:\n", embedding.weight)
print("Embedded Indices:\n", embedded)
print(embedded.shape)       ## [4, 4]
"""
###########################################
"""
Embedding Matrix:
 Parameter containing:
tensor([[ 0.1194,  1.5258, -0.5091,  1.0211],
        [ 0.0738,  0.6269, -1.3611,  1.4003],
        [ 0.1989, -0.6284, -1.0179,  0.4309],
        [ 0.6177, -0.3000, -0.7234,  1.5775],
        [-0.7160, -0.7790, -1.5383, -3.1813],
        [-0.4399, -0.6546,  0.4862, -1.0826],
        [ 0.3113,  0.4481,  0.3684, -1.6997],
        [-0.6261,  1.3856, -0.0625, -0.0418]], requires_grad=True)
        
Embedded Indices:
 tensor([[ 0.0738,  0.6269, -1.3611,  1.4003],
        [ 0.1989, -0.6284, -1.0179,  0.4309],
        [-0.7160, -0.7790, -1.5383, -3.1813],
        [-0.4399, -0.6546,  0.4862, -1.0826]], grad_fn=<EmbeddingBackward0>)
torch.Size([4, 4])
"""
##########################################

"""
##############  => embedding = nn.Embedding(num_embeddings=8, embedding_dim=4)

=>  The nn.Embedding layer with num_embeddings=8 means it can handle indices from 0 to 7 (a total of 8 valid indices).
    If you try to pass an index that is not within this range, you will get this error.

=>  Make sure the indices you are using are within the valid range of (0 to num_embeddings - 1).
    Check the values in your input tensor to ensure they fall within this range.

=>  num_embeddings=8: This specifies the number of unique tokens (or indices) that the embedding layer will support. 
    In other words, the embedding layer will have 8 rows, each corresponding to a different token.

=>  embedding_dim=4: This specifies the dimensionality of the embedding vectors. Each token will be represented by a 4-dimensional vector.

=>  This line initializes an embedding layer that can handle up to 8 distinct tokens, each mapped to a 4-dimensional vector. 
    The internal matrix of this layer will have the shape (8, 4), where each of the 8 rows contains a 4-dimensional vector.

########## =>  embedded = embedding(indices)

=>  indices: This is a tensor containing integer indices that correspond to tokens in the embedding layer’s vocabulary.
    The tensor should contain indices that fall within the range [0, num_embeddings - 1] (i.e., [0, 7] in this case).

=>  embedding(indices): This line retrieves the embedding vectors for the specified indices from the embedding layer.
    The indices tensor is used to look up the rows in the embedding matrix.

=>  Effect: For each index in the indices tensor, the embedding layer retrieves the corresponding row from the internal matrix.
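    The resulting tensor embedded has the shape of indices plus one extra trailing dimension of size embedding_dim
    (e.g. indices of shape (4,) give an output of shape (4, 4)), because each index is replaced by its corresponding embedding vector.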
"""

In [79]:


import torch
import torch.nn as nn


class SelfAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The embedding dimension is split evenly into ``heads`` slices of size
    ``head_dim = embed_size // heads``; attention is computed independently
    per head and the heads are concatenated and passed through ``fc_out``.

    Args:
        embed_size: size of the last dimension of the inputs and the output.
        heads: number of attention heads; must divide ``embed_size`` exactly.
    """

    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads     ## e.g. 512 // 8 = 64

        ## Fails with AssertionError when embed_size is not a multiple of heads
        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        ## All projections map embed_size -> embed_size, so they keep the input shape
        self.values = nn.Linear(embed_size, embed_size)
        self.keys = nn.Linear(embed_size, embed_size)
        self.queries = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)         ## output projection after the heads are merged

    def forward(self, values, keys, query, mask):
        """Compute attention of ``query`` over ``keys`` / ``values``.

        Args:
            values: (N, value_len, embed_size)
            keys: (N, key_len, embed_size); key_len must equal value_len
            query: (N, query_len, embed_size)
            mask: ``None`` or a tensor broadcastable to
                (N, heads, query_len, key_len). Positions where ``mask == 0``
                are excluded from attention.

        Returns:
            Tensor of shape (N, query_len, embed_size).
        """
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(query)

        ## Split the embedding into self.heads pieces:
        ## (N, len, embed_size) -> (N, len, heads, head_dim)
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = queries.reshape(N, query_len, self.heads, self.head_dim)

        ## Dot product of every query with every key, per example and per head.
        ## queries: (N, query_len, heads, head_dim)
        ## keys:    (N, key_len, heads, head_dim)
        ## energy:  (N, heads, query_len, key_len)
        energy = torch.einsum("nqhd,nkhd->nhqk", queries, keys)

        ## Scale by sqrt(d_k), where d_k is the per-head key dimension (head_dim),
        ## not the full embedding size.
        energy = energy / (self.head_dim ** 0.5)

        if mask is not None:
            ## Use the most negative finite value of the dtype instead of -inf:
            ## masked weights still become 0 after softmax, but a row in which
            ## every key is masked yields uniform weights rather than NaN.
            energy = energy.masked_fill(mask == 0, torch.finfo(energy.dtype).min)

        ## Normalise over the key dimension so the weights of each query sum to 1.
        ## attention: (N, heads, query_len, key_len)
        attention = torch.softmax(energy, dim=3)

        ## Weighted sum of the values, then merge the heads back together.
        ## attention: (N, heads, query_len, key_len)
        ## values:    (N, value_len, heads, head_dim)
        ## result:    (N, query_len, heads, head_dim) -> (N, query_len, embed_size)
        out = torch.einsum("nhql,nlhd->nqhd", attention, values).reshape(
            N, query_len, self.heads * self.head_dim
        )

        ## fc_out keeps the shape: (N, query_len, embed_size)
        out = self.fc_out(out)
        return out



        ## This transformer block can be used in both the encoder and the decoder.
        ## It contains: 1 self-attention module => add & normalize => 1 feed-forward layer => add & normalize
class TransformerBlock(nn.Module):
    """Attention sub-layer followed by a position-wise feed-forward sub-layer.

    Each sub-layer is wrapped as ``dropout(layer_norm(sublayer_output + residual))``.
    Layer normalization normalizes over the feature (last) dimension of each
    sample, unlike batch normalization which normalizes over the batch.

    Args:
        embed_size: feature dimension of the inputs and the output.
        heads: number of attention heads passed to ``SelfAttention``.
        dropout: dropout probability used after both normalizations.
        forward_expansion: the hidden width of the feed-forward network is
            ``forward_expansion * embed_size``.
    """

    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super().__init__()
        self.attention = SelfAttention(embed_size, heads)

        ## One LayerNorm per sub-layer, both over the last (embed_size) dimension
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        ## Expand -> ReLU -> project back, so input and output shapes match
        hidden_size = forward_expansion * embed_size
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        """Return a tensor with the same shape as ``query``: (N, query_len, embed_size)."""
        ## Attention output has shape (N, query_len, embed_size)
        attended = self.attention(value, key, query, mask)

        ## First residual connection: add the query, normalize, then dropout
        hidden = self.dropout(self.norm1(attended + query))

        ## Second residual connection around the feed-forward network
        expanded = self.feed_forward(hidden)
        return self.dropout(self.norm2(expanded + hidden))


class Encoder(nn.Module):
    """Stack of ``num_layers`` TransformerBlocks on top of token + position embeddings.

    Args:
        src_vocab_size: number of rows in the token embedding table.
        embed_size: embedding / model dimension.
        num_layers: number of stacked ``TransformerBlock`` layers.
        heads: number of attention heads per block.
        device: stored on ``self.device`` for backward compatibility; the
            forward pass follows the device of its input instead.
        forward_expansion: feed-forward expansion factor of each block.
        dropout: dropout probability.
        max_length: number of rows in the position embedding table, i.e. the
            longest sequence the encoder accepts.
    """

    def __init__(self, src_vocab_size, embed_size, num_layers, heads, device,
                forward_expansion, dropout, max_length,):
        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)      ## table: (src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)      ## table: (max_length, embed_size)

        ## num_layers encoder blocks applied one after another
        self.layers = nn.ModuleList(
            [
                TransformerBlock(embed_size, heads, dropout=dropout, forward_expansion=forward_expansion)
                for _ in range(num_layers)
            ]
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Encode a batch of token indices.

        Args:
            x: integer tensor of shape (N, seq_length).
            mask: attention mask forwarded unchanged to every layer.

        Returns:
            Tensor of shape (N, seq_length, embed_size).

        Raises:
            IndexError: if ``seq_length`` exceeds the size of the position
                embedding table (``max_length``).
        """
        N, seq_length = x.shape

        max_length = self.position_embedding.num_embeddings
        if seq_length > max_length:
            raise IndexError(
                f"sequence length {seq_length} exceeds max_length {max_length} "
                "of the position embedding"
            )

        ## Build the position indices directly on the input's device so the
        ## module keeps working after model.to(...), whatever `device` was
        ## passed at construction time. expand broadcasts (seq_length,) to
        ## (N, seq_length) without copying.
        positions = torch.arange(seq_length, device=x.device).expand(N, seq_length)

        ## Token embedding + position embedding, then dropout: (N, seq_length, embed_size)
        out = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        ## In the encoder the same tensor is used as value, key and query;
        ## the output of one block is the input of the next.
        for layer in self.layers:
            out = layer(out, out, out, mask)                ## (N, seq_length, embed_size)

        return out


class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, then a TransformerBlock.

    The self-attention output (with residual, LayerNorm and dropout) becomes
    the query of the TransformerBlock, whose values and keys are supplied by
    the caller.

    Args:
        embed_size: model dimension.
        heads: number of attention heads.
        forward_expansion: feed-forward expansion factor of the inner block.
        dropout: dropout probability.
        device: accepted for interface compatibility; not used by this block.
    """

    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super().__init__()
        self.norm = nn.LayerNorm(embed_size)
        self.attention = SelfAttention(embed_size, heads=heads)
        self.transformer_block = TransformerBlock(
            embed_size, heads, dropout, forward_expansion
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        """Return a tensor of shape (N, trg_seq_len, embed_size).

        ``x`` is (N, trg_seq_len, embed_size); ``trg_mask`` is applied to the
        self-attention over ``x`` and ``src_mask`` to the attention over
        ``value`` / ``key``.
        """
        ## Self-attention over the target sequence: (N, trg_seq_len, embed_size)
        self_attended = self.attention(x, x, x, trg_mask)

        ## Residual connection + LayerNorm + dropout (add & norm step, not a feed-forward layer)
        query = self.dropout(self.norm(self_attended + x))

        ## Attend over the supplied values/keys and apply the feed-forward sub-layer
        return self.transformer_block(value, key, query, src_mask)


class Decoder(nn.Module):
    """Stack of ``num_layers`` DecoderBlocks followed by a projection to the vocabulary.

    Args:
        trg_vocab_size: number of rows in the token embedding table and size
            of the output projection.
        embed_size: embedding / model dimension.
        num_layers: number of stacked ``DecoderBlock`` layers.
        heads: number of attention heads per block.
        forward_expansion: feed-forward expansion factor of each block.
        dropout: dropout probability.
        device: stored on ``self.device`` for backward compatibility; the
            forward pass follows the device of its input instead.
        max_length: number of rows in the position embedding table, i.e. the
            longest sequence the decoder accepts.
    """

    def __init__( self, trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)      ## table: (trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)      ## table: (max_length, embed_size)

        ## num_layers decoder blocks applied one after another
        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
                for _ in range(num_layers)
            ]
        )
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        """Decode a batch of target token indices against the encoder output.

        Args:
            x: integer tensor of shape (N, trg_seq_length).
            enc_out: encoder output, used as both value and key in every layer.
            src_mask: mask applied when attending over ``enc_out``.
            trg_mask: mask applied to the self-attention over the target.

        Returns:
            Tensor of shape (N, trg_seq_length, trg_vocab_size).

        Raises:
            IndexError: if ``trg_seq_length`` exceeds the size of the position
                embedding table (``max_length``).
        """
        N, seq_length = x.shape

        max_length = self.position_embedding.num_embeddings
        if seq_length > max_length:
            raise IndexError(
                f"sequence length {seq_length} exceeds max_length {max_length} "
                "of the position embedding"
            )

        ## Build the position indices directly on the input's device so the
        ## module keeps working after model.to(...), whatever `device` was
        ## passed at construction time. expand broadcasts (seq_length,) to
        ## (N, seq_length) without copying.
        positions = torch.arange(seq_length, device=x.device).expand(N, seq_length)

        ## Token embedding + position embedding, then dropout: (N, trg_seq_length, embed_size)
        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)              ## (N, trg_seq_length, embed_size)

        ## Project every position to vocabulary scores: (N, trg_seq_length, trg_vocab_size)
        out = self.fc_out(x)
        return out


class Transformer(nn.Module):
    """Encoder-decoder Transformer operating on batches of token indices.

    Args:
        src_vocab_size: source vocabulary size (encoder embedding rows).
        trg_vocab_size: target vocabulary size (decoder embedding rows and
            output dimension).
        src_pad_idx: source token index treated as padding by ``make_src_mask``.
        trg_pad_idx: target padding index; stored but not used by the masks.
        embed_size: model dimension.
        num_layers: number of encoder layers and of decoder layers.
        forward_expansion: feed-forward expansion factor.
        heads: number of attention heads.
        dropout: dropout probability.
        device: forwarded to the encoder/decoder and stored on ``self.device``;
            the masks follow the device of their input tensors instead.
        max_length: maximum supported sequence length.
    """

    def __init__( self, src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, embed_size=512, 
                 num_layers=6, forward_expansion=4, heads=8, dropout=0, device="cpu", max_length=100,):
        super(Transformer, self).__init__()

        self.encoder = Encoder( src_vocab_size, embed_size, num_layers, heads, 
                               device, forward_expansion, dropout, max_length,)

        self.decoder = Decoder( trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length)

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        """Return a boolean mask of shape (N, 1, 1, src_len), False at padding tokens.

        The comparison result already lives on ``src``'s device, so the mask
        always matches the tensors it is later combined with.
        """
        return (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

    def make_trg_mask(self, trg):
        """Return a lower-triangular (causal) mask of shape (N, 1, trg_len, trg_len).

        Entry [i, j] is 1 when position i may attend to position j (j <= i)
        and 0 otherwise. The mask is allocated on ``trg``'s device.
        """
        N, trg_len = trg.shape
        causal = torch.tril(torch.ones((trg_len, trg_len), device=trg.device))
        ## expand broadcasts the single (trg_len, trg_len) mask over the batch without copying
        return causal.expand(N, 1, trg_len, trg_len)

    def forward(self, src, trg):
        """Run the encoder on ``src`` and the decoder on ``trg``.

        Args:
            src: integer tensor (N, src_len).
            trg: integer tensor (N, trg_len).

        Returns:
            The decoder output for ``trg``.
        """
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out


if __name__ == "__main__":
    ## Smoke test: push a tiny batch through the model and print the output shape.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    src_pad_idx = 0             ## source padding id
    trg_pad_idx = 0             ## target padding id
    src_vocab_size = 10         ## source vocab size
    trg_vocab_size = 10         ## target vocab size

    ## Two source sentences of 9 tokens and two target sentences of 8 tokens
    x = torch.tensor(
        [[1, 5, 6, 4, 3, 9, 5, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]],
        device=device,
    )
    trg = torch.tensor(
        [[1, 7, 4, 3, 5, 9, 2, 0], [1, 5, 6, 2, 4, 7, 6, 2]],
        device=device,
    )

    model = Transformer(
        src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, device=device
    )
    model = model.to(device)

    ## The decoder is fed the target without its last token
    out = model(x, trg[:, :-1])
    print(out.shape)

cuda
torch.Size([2, 7, 10])


In [76]:
## Step-by-step view of how the source padding mask gets its (N, 1, 1, src_len) shape
demo_mask = torch.tensor([[1, 5, 6, 4, 3, 9, 5, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]]) != 0
print(demo_mask.shape)                              ## comparison against pad id 0
print(demo_mask.unsqueeze(1).shape)                 ## add one singleton axis
demo_mask_4d = demo_mask.unsqueeze(1).unsqueeze(2)  ## add a second singleton axis
print(demo_mask_4d.shape)
print(demo_mask_4d)

torch.Size([2, 9])
torch.Size([2, 1, 9])
torch.Size([2, 1, 1, 9])
tensor([[[[ True,  True,  True,  True,  True,  True,  True,  True, False]]],


        [[[ True,  True,  True,  True,  True,  True,  True,  True,  True]]]])
