In [2]:
# Notebook-level model width. None of the classes defined in this notebook read
# this global: each one receives `d_model` through its constructor arguments.
# NOTE(review): 3 is only divisible by num_heads in {1, 3} -- confirm the
# intended value before passing it to the attention layers.
d_model = 3 

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F


In [13]:
class MultiHeadSelfAttention(nn.Module):
    """Scaled dot-product attention split across several heads.

    With a single input, queries, keys and values are all projected from that
    input (self-attention). When ``context`` is supplied, queries come from
    ``x`` while keys and values come from ``context`` (cross-attention).

    Args:
        d_model: Size of the last dimension of the input and of the output.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Fail at construction time instead of with an opaque view() error
        # during the first forward pass.
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                "d_model ({}) must be divisible by a positive num_heads ({})".format(
                    d_model, num_heads
                )
            )
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads  # width of each individual head

        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)
        self.Wv = nn.Linear(d_model, d_model)
        self.dense = nn.Linear(d_model, d_model)

    def split_heads(self, x):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, num_heads, seq, depth)``."""
        x = x.view(x.size(0), x.size(1), self.num_heads, self.depth)
        return x.permute(0, 2, 1, 3)

    def forward(self, x, context=None, mask=None):
        """Apply multi-head attention.

        Args:
            x: Tensor of shape ``(batch, seq_q, d_model)`` providing the queries.
            context: Optional tensor of shape ``(batch, seq_k, d_model)``
                providing keys and values. Defaults to ``x``.
            mask: Optional boolean tensor broadcastable to
                ``(batch, num_heads, seq_q, seq_k)``. Positions that are
                ``True`` are excluded from attention. A query row that is
                masked everywhere produces NaN after the softmax.

        Returns:
            Tensor of shape ``(batch, seq_q, d_model)``.
        """
        source = x if context is None else context

        Q = self.split_heads(self.Wq(x))
        K = self.split_heads(self.Wk(source))
        V = self.split_heads(self.Wv(source))

        # Scale by sqrt(depth) so the logits do not grow with the head width.
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.depth ** 0.5)
        if mask is not None:
            # Masking happens before the softmax so that excluded positions
            # receive exactly zero weight.
            blocked = mask.to(device=attn_scores.device, dtype=torch.bool)
            attn_scores = attn_scores.masked_fill(blocked, float('-inf'))
        attn_weights = F.softmax(attn_scores, dim=-1)

        output = torch.matmul(attn_weights, V)
        # Move the head axis back next to depth and merge them into d_model.
        output = output.permute(0, 2, 1, 3).contiguous()
        output = output.view(x.size(0), x.size(1), self.d_model)
        return self.dense(output)
        

In [15]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied to the last dimension of the input.

    Computes ``linear2(dropout(relu(linear1(x))))``.

    Args:
        d_model: Size of the input and output feature dimension.
        d_ff: Size of the hidden layer between the two linear maps.
        dropout_rate: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout_rate):
        super().__init__()

        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout_rate)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Return a tensor with the same shape as ``x`` (last dim ``d_model``)."""
        hidden = F.relu(self.linear1(x))
        return self.linear2(self.dropout(hidden))


                            
        
        

The input is first passed through `linear1`, followed by a ReLU activation (`F.relu`), which introduces non-linearity.
After the ReLU, dropout regularizes the network by randomly setting some elements to zero during training.
Finally, the result is passed through `linear2`, which projects it from `d_ff` back to the original `d_model` dimensions.

In [16]:
class LayerNorm(nn.Module):
    """Thin wrapper around ``nn.LayerNorm`` over the last ``d_model`` features.

    Args:
        d_model: Size of the trailing dimension that is normalized.
        eps: Value forwarded to ``nn.LayerNorm`` as its ``eps`` argument.
    """

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.layer_norm = nn.LayerNorm(d_model, eps=eps)

    def forward(self, x):
        """Return ``x`` normalized by the wrapped ``nn.LayerNorm``."""
        return self.layer_norm(x)
        

- `eps`: a small constant added to the denominator for numerical stability, so the normalization never divides by zero.

In each encoder or decoder layer, normalization is applied so that the values passed into subsequent layers have a stable distribution.

In [19]:
class EmbeddingLayer(nn.Module):
    """Sum of a token embedding and a learned position embedding.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Width of each embedding vector.
        max_len: Number of rows in the position embedding table, i.e. the
            longest sequence this layer can embed.
    """

    def __init__(self, vocab_size, d_model, max_len):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, d_model)
        self.position_embeddings = nn.Embedding(max_len, d_model)

    def forward(self, input_sequence):
        """Embed a batch of token ids.

        Args:
            input_sequence: Integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, seq_len, d_model)``.

        Raises:
            IndexError: If ``seq_len`` exceeds the size of the position table.
        """
        seq_len = input_sequence.size(1)
        max_len = self.position_embeddings.num_embeddings
        if seq_len > max_len:
            # Same exception type the embedding lookup would raise, but with a
            # message that names the actual cause.
            raise IndexError(
                "sequence length {} exceeds max_len {}".format(seq_len, max_len)
            )
        # Shape (1, seq_len): broadcast over the batch dimension when added.
        positions = torch.arange(
            0, seq_len, dtype=torch.long, device=input_sequence.device
        ).unsqueeze(0)
        return self.word_embeddings(input_sequence) + self.position_embeddings(positions)



In [20]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each followed by
    dropout, a residual connection and layer normalization.

    Args:
        d_model: Feature width of the input and output.
        num_heads: Number of heads passed to ``MultiHeadSelfAttention``.
        d_ff: Hidden width passed to ``PositionwiseFeedForward``.
        dropout_rate: Dropout probability used in the feed-forward network and
            on both sub-layer outputs.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout_rate):
        super().__init__()
        self.attention = MultiHeadSelfAttention(d_model, num_heads)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout_rate)
        self.layer_norm1 = LayerNorm(d_model)
        self.layer_norm2 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return the transformed tensor; the residual additions require the
        sub-layer outputs to have the same shape as ``x``."""
        # Sub-layer 1: self-attention with residual connection, then norm.
        attn_output = self.attention(x)
        x = self.layer_norm1(x + self.dropout1(attn_output))
        # Sub-layer 2: feed-forward with residual connection, then norm.
        ff_output = self.feed_forward(x)
        return self.layer_norm2(x + self.dropout2(ff_output))


In [21]:
class Encoder(nn.Module):
    """Embedding layer followed by a stack of ``EncoderLayer`` blocks.

    Args:
        vocab_size: Vocabulary size passed to ``EmbeddingLayer``.
        num_layers: Number of ``EncoderLayer`` blocks in the stack.
        d_model: Feature width used throughout the stack.
        num_heads: Number of attention heads per layer.
        d_ff: Hidden width of each layer's feed-forward network.
        dropout_rate: Dropout probability passed to every layer.
        max_len: Maximum sequence length passed to ``EmbeddingLayer``.
    """

    def __init__(self, vocab_size, num_layers, d_model, num_heads, d_ff, dropout_rate, max_len):
        super().__init__()
        self.embedding = EmbeddingLayer(vocab_size, d_model, max_len)
        # ModuleList (not a plain list) so every layer's parameters are registered.
        self.layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout_rate) for _ in range(num_layers)
        ])

    def forward(self, input_sequence):
        """Embed ``input_sequence`` and pass it through every layer in order."""
        x = self.embedding(input_sequence)
        for layer in self.layers:
            x = layer(x)
        return x

In [22]:
class MaskedMultiHeadSelfAttention(nn.Module):
    """Causal self-attention: position ``i`` attends only to positions ``<= i``.

    The projection layers live in a wrapped ``MultiHeadSelfAttention`` stored
    as ``self.attention``.

    Args:
        d_model: Feature width of the input and output.
        num_heads: Number of attention heads.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.attention = MultiHeadSelfAttention(d_model, num_heads)

    def forward(self, x):
        """Apply causally masked self-attention.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``.

        Returns:
            Tensor of shape ``(batch, seq_len, d_model)``.
        """
        inner = self.attention
        seq_len = x.size(1)
        # True strictly above the diagonal marks the future positions to hide.
        future = torch.triu(
            torch.ones(seq_len, seq_len, device=x.device), diagonal=1
        ).bool()

        Q = inner.split_heads(inner.Wq(x))
        K = inner.split_heads(inner.Wk(x))
        V = inner.split_heads(inner.Wv(x))

        scores = torch.matmul(Q, K.transpose(-2, -1)) / (inner.depth ** 0.5)
        # The mask has to hit the scores *before* the softmax: filling the
        # attention output with -inf would corrupt activations rather than
        # hide future tokens. The diagonal stays unmasked, so every row keeps
        # at least one finite score.
        scores = scores.masked_fill(future, float('-inf'))
        weights = F.softmax(scores, dim=-1)

        merged = torch.matmul(weights, V).permute(0, 2, 1, 3).contiguous()
        merged = merged.view(x.size(0), seq_len, inner.d_model)
        return inner.dense(merged)

In [23]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, attention over the encoder
    output, and a feed-forward network, each followed by dropout, a residual
    connection and layer normalization.

    Args:
        d_model: Feature width of the input and output.
        num_heads: Number of heads in both attention sub-layers.
        d_ff: Hidden width passed to ``PositionwiseFeedForward``.
        dropout_rate: Dropout probability used in the feed-forward network and
            on all three sub-layer outputs.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout_rate):
        super().__init__()
        self.masked_attention = MaskedMultiHeadSelfAttention(d_model, num_heads)
        self.attention = MultiHeadSelfAttention(d_model, num_heads)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout_rate)
        self.layer_norm1 = LayerNorm(d_model)
        self.layer_norm2 = LayerNorm(d_model)
        self.layer_norm3 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.dropout3 = nn.Dropout(dropout_rate)

    def _cross_attention(self, x, encoder_output):
        """Attend from ``x`` (queries) over ``encoder_output`` (keys/values).

        The projection layers of ``self.attention`` are applied directly
        rather than through its ``forward``, because queries and keys/values
        come from two different tensors here, whereas a single-input call to
        ``forward`` projects all three from one tensor.
        """
        attn = self.attention
        Q = attn.split_heads(attn.Wq(x))
        K = attn.split_heads(attn.Wk(encoder_output))
        V = attn.split_heads(attn.Wv(encoder_output))

        scores = torch.matmul(Q, K.transpose(-2, -1)) / (attn.depth ** 0.5)
        weights = F.softmax(scores, dim=-1)

        merged = torch.matmul(weights, V).permute(0, 2, 1, 3).contiguous()
        # Output length follows the query sequence, not the encoder sequence.
        merged = merged.view(x.size(0), x.size(1), attn.d_model)
        return attn.dense(merged)

    def forward(self, x, encoder_output):
        """Run the three sub-layers.

        Args:
            x: Decoder-side tensor of shape ``(batch, tgt_len, d_model)``.
            encoder_output: Tensor of shape ``(batch, src_len, d_model)``.

        Returns:
            Tensor of shape ``(batch, tgt_len, d_model)``.
        """
        masked_attn_output = self.masked_attention(x)
        x = self.layer_norm1(x + self.dropout1(masked_attn_output))
        attn_output = self._cross_attention(x, encoder_output)
        x = self.layer_norm2(x + self.dropout2(attn_output))
        ff_output = self.feed_forward(x)
        return self.layer_norm3(x + self.dropout3(ff_output))


In [24]:
class Decoder(nn.Module):
    """Embedding layer followed by a stack of ``DecoderLayer`` blocks.

    Args:
        vocab_size: Vocabulary size passed to ``EmbeddingLayer``.
        num_layers: Number of ``DecoderLayer`` blocks in the stack.
        d_model: Feature width used throughout the stack.
        num_heads: Number of attention heads per layer.
        d_ff: Hidden width of each layer's feed-forward network.
        dropout_rate: Dropout probability passed to every layer.
        max_len: Maximum sequence length passed to ``EmbeddingLayer``.
    """

    def __init__(self, vocab_size, num_layers, d_model, num_heads, d_ff, dropout_rate, max_len):
        super().__init__()
        self.embedding = EmbeddingLayer(vocab_size, d_model, max_len)
        # ModuleList (not a plain list) so every layer's parameters are registered.
        self.layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout_rate) for _ in range(num_layers)
        ])

    def forward(self, input_sequence, encoder_output):
        """Embed ``input_sequence`` and pass it through every layer in order.

        The same ``encoder_output`` is handed to each layer.
        """
        x = self.embedding(input_sequence)
        for layer in self.layers:
            x = layer(x, encoder_output)
        return x


In [25]:
class Transformer(nn.Module):
    """Encoder-decoder model built from ``Encoder`` and ``Decoder``.

    Both halves are constructed with the same hyperparameters, including the
    same ``vocab_size`` and ``max_len``.

    Args:
        vocab_size: Vocabulary size for both the encoder and the decoder.
        num_layers: Number of layers in each of the two stacks.
        d_model: Feature width used throughout the model.
        num_heads: Number of attention heads per layer.
        d_ff: Hidden width of each feed-forward network.
        dropout_rate: Dropout probability passed to every layer.
        max_len: Maximum sequence length for both embedding layers.
    """

    def __init__(self, vocab_size, num_layers, d_model, num_heads, d_ff, dropout_rate, max_len):
        super().__init__()
        self.encoder = Encoder(vocab_size, num_layers, d_model, num_heads, d_ff, dropout_rate, max_len)
        self.decoder = Decoder(vocab_size, num_layers, d_model, num_heads, d_ff, dropout_rate, max_len)

    def forward(self, source, target):
        """Encode ``source`` and decode ``target`` against the encoding.

        Returns:
            The decoder's output as-is; no projection onto the vocabulary is
            applied here.
        """
        encoder_output = self.encoder(source)
        return self.decoder(target, encoder_output)
