In [2]:
import torch
import torch.nn as nn
import math

In [None]:
class inputEmbedding(nn.Module):
    """Token-embedding lookup whose output is scaled by sqrt(d_model).

    Args:
        d_model: Width of each embedding vector.
        vocab_size: Number of rows in the embedding table.
    """

    def __init__(self, d_model:int, vocab_size:int):
        super().__init__()
        self.d_model, self.vocab_size = d_model, vocab_size
        # One d_model-wide row per vocabulary id.
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Look up the embeddings for the ids in ``x`` and multiply them by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        embedded = self.embedding(x)
        return embedded * scale


In [None]:
class positionalEmbedding(nn.Module):
    """Add a fixed sinusoidal positional encoding to the input, then apply dropout.

    The encoding table is computed once in the constructor and stored as a
    buffer named ``pe`` of shape ``[1, seq_len, d_model]``; it is not a
    trainable parameter.

    Args:
        d_model: Size of the last dimension of the encoding table. Both even
            and odd values are supported.
        seq_len: Number of positions (rows) in the encoding table.
        dropout: Probability passed to ``nn.Dropout``.
    """

    def __init__(self, d_model, seq_len, dropout) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(seq_len, d_model)

        # Position index [0, 1, ..., seq_len - 1] as a column vector.
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)

        # One frequency per even column index: exp(-ln(10000) * i / d_model).
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        angles = position * div_term  # [seq_len, ceil(d_model / 2)]

        pe[:, 0::2] = torch.sin(angles)  # sine on even column indices
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the angles to fit; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # cosine on odd column indices

        # Leading dimension of size 1 so the table broadcasts over the batch.
        pe = pe.unsqueeze(0)  # [1, seq_len, d_model]

        # A buffer is saved in the state dict and moved with the module by
        # .to()/.cuda(), but it is not returned by parameters() and is
        # therefore never updated by an optimizer.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.size(1)])``.

        ``x`` is indexed on dimension 1 for its length, so it is assumed to be
        laid out as (batch, sequence, d_model) -- TODO confirm against callers.
        The table only has ``seq_len`` rows, so longer inputs cannot be encoded.
        """
        # Buffers do not require grad, so no explicit requires_grad_(False) is needed.
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)


In [None]:
class layerNormalization(nn.Module):
    """Normalize the last dimension of the input with a learnable scale and shift.

    Computes ``alpha * (x - mean) / (std + esp) + beta`` where ``mean`` and
    ``std`` are taken over the last dimension.

    Args:
        esp: Small constant added to the standard deviation so the division
            never hits zero.
    """

    def __init__(self, esp:float = 10**-6):
        # Must be *called* before any nn.Parameter is assigned to the module.
        super().__init__()
        self.esp = esp
        self.alpha = nn.Parameter(torch.ones(1))  # multiplicative scale
        self.beta = nn.Parameter(torch.zeros(1))  # additive shift

    def forward(self, x):
        """Return ``x`` normalized over its last dimension, then scaled and shifted."""
        # keepdim=True keeps the reduced dimension (size 1) so the statistics
        # broadcast against x.
        mean = torch.mean(x, dim=-1, keepdim=True)
        std = torch.std(x, dim=-1, keepdim=True)
        return self.alpha * (x - mean) / (std + self.esp) + self.beta
    

In [None]:
class feedForwardBlock(nn.Module):
    """Two linear layers with a ReLU and dropout in between.

    Computes ``fc2(dropout(relu(fc1(x))))``.

    Args:
        d_model: Input and output feature size.
        d_ff: Feature size of the hidden layer.
        dropout: Probability passed to ``nn.Dropout``.
    """

    def __init__(self, d_model, d_ff, dropout):
        # Must be *called*: submodules cannot be assigned before
        # nn.Module.__init__() has run.
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Apply the block to ``x``; the last dimension goes d_model -> d_ff -> d_model."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(self.dropout(hidden))

In [None]:
class multiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Projects the inputs with ``w_q``/``w_k``/``w_v``, splits the last
    dimension into ``h`` heads of width ``d_k = d_model // h``, runs
    :meth:`attention` per head, merges the heads and applies ``w_o``.

    Args:
        d_model: Feature size of the inputs and of the output.
        h: Number of heads; must divide ``d_model`` evenly.
        dropout: Probability passed to ``nn.Dropout``, applied to the
            attention weights.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``h``.
    """

    def __init__(self, d_model, h, dropout) -> None:
        super().__init__()
        self.h = h
        self.d_model = d_model
        assert d_model % h == 0, 'd_model is not divisible by h'
        # Stored on the instance because forward() needs it to split the heads.
        self.d_k = d_model // h

        self.w_q = nn.Linear(d_model, d_model)  # query projection
        self.w_k = nn.Linear(d_model, d_model)  # key projection
        self.w_v = nn.Linear(d_model, d_model)  # value projection

        self.w_o = nn.Linear(d_model, d_model)  # output projection
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout):
        """Compute ``softmax(Q K^T / sqrt(d_k)) V``.

        Args:
            query, key, value: Tensors whose last dimension is ``d_k``;
                forward() passes them as (batch, h, seq_len, d_k).
            mask: Optional tensor broadcastable to the score matrix; positions
                where ``mask == 0`` are set to ``-1e9`` before the softmax.
            dropout: Optional callable applied to the attention weights.

        Returns:
            A tuple ``(output, attention_weights)``.
        """
        d_k = query.shape[-1]

        # (batch, h, seq_len, d_k) @ (batch, h, d_k, seq_len) -> (batch, h, seq_len, seq_len)
        attention_score = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # A very negative score becomes ~0 after the softmax.
            attention_score = attention_score.masked_fill(mask == 0, -1e9)
        attention_score = attention_score.softmax(dim=-1)  # (batch, h, seq_len, seq_len)
        if dropout is not None:
            attention_score = dropout(attention_score)

        return (attention_score @ value), attention_score

    def _split_heads(self, t):
        """Reshape (batch, seq_len, d_model) into (batch, h, seq_len, d_k)."""
        return t.view(t.shape[0], t.shape[1], self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask):
        """Run multi-head attention and return a (batch, seq_len, d_model) tensor.

        The attention weights of the last call are kept in
        ``self.attention_score``.
        """
        # Each projection keeps the shape (batch, seq_len, d_model).
        query = self._split_heads(self.w_q(q))
        key = self._split_heads(self.w_k(k))
        value = self._split_heads(self.w_v(v))

        x, self.attention_score = multiHeadAttention.attention(query, key, value, mask, self.dropout)

        # (batch, h, seq_len, d_k) -> (batch, seq_len, h, d_k) -> (batch, seq_len, d_model)
        # contiguous() is needed because view() cannot merge the transposed dims.
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        return self.w_o(x)

<!-- assert: The assert keyword checks whether the given condition is true. If the condition is false, the assert statement raises an AssertionError exception.

The string after the condition is the error message displayed when the assertion fails, i.e., when d_model is not divisible by h without a remainder (d_model % h != 0). -->

<!-- The masked_fill_() method then replaces the values in the attention_score tensor at the positions where the mask is False (i.e., 0) with the value -1e9 (a large negative number).
This effectively "masks out" the attention scores for the positions that should be ignored, by assigning them a very low value. -->

<!-- contiguous() and non-contiguous memory:

A newly created PyTorch tensor is stored in memory as one contiguous block of data: its elements are stored one after the other.
Certain operations, such as transposing, return a view that reuses the same memory with different strides, so the resulting tensor may no longer be contiguous.
The contiguous() method returns a tensor with the same data laid out in a contiguous block of memory (copying only if necessary). This is important for operations such as view(), which requires a compatible memory layout.
If you call view() on a tensor whose memory layout is not compatible with the requested shape, such as the transposed tensor above, you will get a RuntimeError; calling contiguous() first avoids it. -->

<!-- view(x.shape[0], -1, self.h * self.d_k) vs view(x.shape[0], -1):

view(x.shape[0], -1, self.h * self.d_k) explicitly specifies the target shape: the first dimension is the batch size (x.shape[0]), the last dimension is the product of the number of heads (self.h) and the dimensionality of each head (self.d_k), and the middle dimension is given as -1, which means PyTorch infers it (here, seq_len) from the total number of elements and the other two dimensions.
view(x.shape[0], -1) is shorter, but it is not equivalent: PyTorch would fold everything after the batch dimension into a single dimension, giving a 2-D tensor of shape (batch_size, seq_len * d_model).
Only the first version produces the (batch_size, seq_len, d_model) tensor that w_o expects, and it also makes the intended output shape explicit for the reader. -->


In [None]:
class residualConnection(nn.Module):
    """Wrap a sublayer as ``x + dropout(sublayer(norm(x)))``.

    Args:
        dropout: Probability passed to ``nn.Dropout``, applied to the
            sublayer output before it is added back to the input.
    """

    def __init__(self, dropout):
        # Must be *called*: submodules cannot be assigned before
        # nn.Module.__init__() has run.
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = layerNormalization()

    def forward(self, x, sublayer):
        """Normalize ``x``, run ``sublayer`` on it, apply dropout and add ``x`` back.

        Args:
            x: Input tensor.
            sublayer: Callable taking the normalized tensor and returning a
                tensor that can be added to ``x``.
        """
        return x + self.dropout(sublayer(self.norm(x)))


In [None]:
class encoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in a residual wrapper.

    Args:
        self_attention_block: Attention module called as ``(q, k, v, mask)``.
        feed_forward_block: Module applied to the attention output.
        dropout: Dropout probability handed to each ``residualConnection``.
    """

    def __init__(self, self_attention_block : multiHeadAttention, feed_forward_block : feedForwardBlock, dropout):
        super().__init__()
        # Store the instances that were passed in, not the classes.
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        # forward() uses two residual wrappers: [0] attention, [1] feed-forward.
        self.residual_connection = nn.ModuleList([residualConnection(dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        """Apply self-attention (query = key = value = x) and the feed-forward block.

        Args:
            x: Input tensor.
            src_mask: Mask forwarded to the self-attention block.
        """
        # The lambda adapts the 4-argument attention call to the 1-argument
        # sublayer interface expected by residualConnection.
        x = self.residual_connection[0](x, lambda x : self.self_attention_block(x, x, x, src_mask))
        x = self.residual_connection[1](x, self.feed_forward_block)
        return x

In [None]:
class encoder(nn.Module):
    """Run the input through a list of layers, then a final layer normalization.

    Args:
        layers: Modules that are each called as ``layer(x, mask)``.
    """

    def __init__(self, layers : nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        # Instantiate the normalization; assigning the class itself would make
        # self.norm(x) try to construct a new module instead of normalizing x.
        self.norm = layerNormalization()

    def forward(self, x, mask):
        """Apply every layer in order with the same ``mask`` and normalize the result."""
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)


In [None]:
class decoderBlock(nn.Module):
    """One decoder layer: self-attention, cross-attention and feed-forward.

    Each of the three sublayers runs inside its own ``residualConnection``.

    Args:
        self_attention: Attention module used with query = key = value = x.
        cross_attention: Attention module whose keys and values are the
            encoder output.
        feed_forward_block: Module applied after the two attention steps.
        dropout: Dropout probability handed to each ``residualConnection``.
    """

    def __init__(self, self_attention : multiHeadAttention, cross_attention : multiHeadAttention, feed_forward_block : feedForwardBlock, dropout):
        super().__init__()
        # Attribute names match the ones forward() reads.
        self.self_attention_block = self_attention
        self.cross_attention_block = cross_attention
        self.feed_forward_block = feed_forward_block
        # One residual wrapper per sublayer:
        # [0] self-attention, [1] cross-attention, [2] feed-forward.
        self.residual_connections = nn.ModuleList([residualConnection(dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Apply the three sublayers in order and return the result.

        Args:
            x: Decoder input tensor.
            encoder_output: Tensor used as key and value in cross-attention.
            src_mask: Mask forwarded to the cross-attention block.
            tgt_mask: Mask forwarded to the self-attention block.
        """
        x = self.residual_connections[0](x, lambda x : self.self_attention_block(x, x, x, tgt_mask))
        x = self.residual_connections[1](x, lambda x : self.cross_attention_block(x, encoder_output, encoder_output, src_mask))
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x

<!-- src_mask: A mask to prevent the decoder from attending to padding tokens in the encoder output.
tgt_mask: A mask to prevent the decoder from attending to future tokens during self-attention. -->



In [None]:
class decoder(nn.Module):
    """Run the input through a list of decoder layers, then a final layer normalization.

    Args:
        layers: Modules that are each called as
            ``layer(x, encoder_output, src_mask, tgt_mask)``.
    """

    def __init__(self, layers : nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        # Instantiate the normalization; assigning the class itself would make
        # self.norm(x) try to construct a new module instead of normalizing x.
        self.norm = layerNormalization()

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Apply every layer in order with the same arguments and normalize the result."""
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x)
