In [2]:
%pip install torch

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import torch

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertModel
import math

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
class attention(nn.Module):
    def __init__(self, embed_size, num_heads):
        super(attention, self).__init__()
        self.embed_size=embed_size
        self.num_heads=num_heads
        self.head_dim= embed_size // num_heads

        assert(self.head_dim * num_heads == embed_size), "Embed size must be completely divisible by number of heads"

        self.query=nn.Linear(self.head_dim,self.head_dim,bias=False)
        self.key=nn.Linear(self.embed_size,self.head_dim,bias=False)
        self.value=nn.Linear(self.embed_size,self.head_dim,bias=False)
        self.output=nn.Linear(num_heads*self.head_dim,embed_size)

    def forward(self,query,key,value,mask):
        batch_size=query.shape[0]
        q_len,k_len,v_len=query.shape[1],key.shape[1],value.shape[1]
        print("\n\n\nThe shapes before permutetion")
        query=query.reshape(batch_size,q_len,self.num_heads,self.head_dim)
        print("The shape of query is "+str(query.shape))
        key=key.reshape(batch_size,k_len,self.num_heads,self.head_dim)
        print("The shape of key is "+str(key.shape))
        value=value.reshape(batch_size,v_len,self.num_heads,self.head_dim)
        print("The shape of value is "+str(value.shape))
        # Transpose to perform batch matrix multiplication
        query = query.permute(0, 2, 1, 3)  # (batch_size, num_heads, q_len, head_dim)
        key = key.permute(0, 2, 1, 3)  # (batch_size, num_heads, k_len, head_dim)
        value = value.permute(0, 2, 1, 3)  # (batch_size, num_heads, v_len, head_dim)
        print("\n\n\nThe shapes after permutetion")
        print("The shape of query is "+str(query.shape))
        print("The shape of key is "+str(key.shape))
        print("The shape of value is "+str(value.shape))
        # Calculate attention scores
        scores = torch.matmul(query, key.transpose(-2,-1)) / torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))

        # Apply mask if provided
        if mask is not None:
            mask1=torch.full(scores.size(),float('-inf'))
            #print(mask1)
            mask1=torch.triu(mask1,diagonal=1)
            #print(mask1)
            #print(scores)
            scores += mask1
            #print(scores)
        
        # Apply softmax to obtain attention weights
        attention_weights = torch.softmax(scores, dim=-1)

        # Apply dropout if needed (you can add this if desired)
        # attention_weights = self.dropout(attention_weights)

        # Apply attention weights to the values
        output = torch.matmul(attention_weights, value)
        print("The initial shape of output is "+str(output.shape))
        # Reshape and concatenate heads
        output = output.permute(0, 2, 1, 3).contiguous()  # (batch_size, q_len, num_heads, head_dim)
        output = output.reshape(batch_size, q_len, self.num_heads * self.head_dim)
        
        # Linear transformation to get the final output
        output = self.output(output)
        print("The final shape of output is "+str(output.shape))
        return output        
         

In [3]:
import torch
import torch.nn as nn

# Instantiate the Attention class
embed_size = 8
num_heads = 2
attention_model = attention(embed_size, num_heads)

# Define a sample input
batch_size = 2
q_len = 3
k_len = 3
v_len = 3

# Create random tensors as input
query = torch.randn(batch_size, q_len, embed_size)
key = torch.randn(batch_size, k_len, embed_size)
value = torch.randn(batch_size, v_len, embed_size)

# Define a sample mask (you can modify this based on your use case)
mask = 1  # Assuming a fully-connected attention

# Call the forward method
output = attention_model.forward(query, key, value, mask)

# Print the output shape
#print("Output Shape:", output)





The shapes before permutetion
The shape of query is torch.Size([2, 3, 2, 4])
The shape of key is torch.Size([2, 3, 2, 4])
The shape of value is torch.Size([2, 3, 2, 4])



The shapes after permutetion
The shape of query is torch.Size([2, 2, 3, 4])
The shape of key is torch.Size([2, 2, 3, 4])
The shape of value is torch.Size([2, 2, 3, 4])
The initial shape of output is torch.Size([2, 2, 3, 4])
The final shape of output is torch.Size([2, 3, 8])


In [4]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_size,num_heads,dropout,foward_expansion):
        super(TransformerBlock,self).__init__()
        self.attention=attention(embed_size,num_heads=num_heads)
        self.norm1=nn.LayerNorm(embed_size)
        self.norm2=nn.LayerNorm(embed_size)

        self.feed_foward=nn.Sequential(
            nn.Linear(embed_size,foward_expansion*embed_size),
            nn.ReLU(),
            nn.Linear(foward_expansion*embed_size,embed_size)
        )

        self.dropout=nn.Dropout(dropout)

    def forward(self,query,key,value,mask):
        multi_head_attention=self.attention(query,key,value,mask)
        x=self.dropout(self.norm1(multi_head_attention+query))
        foward=self.feed_foward(x)  
        out=self.dropout(self.norm2(foward + x))
        return out


In [5]:
class PositionalEncoding1(nn.Module):
    def __init__(self, max_len, embed_size):
        super(PositionalEncoding1, self).__init__()
        self.embed_size = embed_size

        # Create constant positional encoding matrix
        pe = torch.zeros(max_len, embed_size)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_size, 2).float() * (-math.log(10000.0) / embed_size))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # Add batch dimension
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Ensure that positional encoding has the same shape as input tensor x
        batch_size, seq_len = x.shape[:2]
        pe = self.pe[:, :seq_len, :].expand(batch_size, -1, -1)
        return pe


In [23]:
class Encoder(nn.Module):
    def __init__(self,
                 src_vocab_size,
                 embed_size,
                 num_layers,
                 num_heads,
                 device,
                 foward_expansion,
                 dropout,
                 max_length 
                ):
        super(Encoder,self).__init__()
        self.embed_size=embed_size
        self.device=device
        self.input_embedding=nn.Embedding(src_vocab_size,embed_size)
        self.positional_encoding=PositionalEncoding1(max_length,embed_size)

        self.layers=nn.ModuleList(
            [
                TransformerBlock(embed_size,num_heads,dropout,foward_expansion)
                for _ in range(num_layers)
            ]
        )
        self.dropout=nn.Dropout(dropout)
    
    def forward(self,x,mask):
        batch_size,seq_len=x.shape
        print("Batch size is : "+str(batch_size))
        print("Seq length is : "+str(seq_len)),

        embedding=self.input_embedding(x)
        #print("input embedding is :\n\n\n "+str(embedding)),
        print("\n\n\nInput embedding shape is :\n\n\n "+str(embedding.shape)),
#        positions=torch.arange(0,seq_len).expand(batch_size,seq_len).to(device=self.device)
        #print("Position is :\n\n\n "+str(positions)),
        #print("\n\n\nPosition shape is :\n\n\n "+str(positions.shape)),
        # Generate positional encoding dynamically based on the input sequence length
        positional_encoding = self.positional_encoding(x)
        #positional_encoding = positional_encoding.unsqueeze(0).expand(1,batch_size, embed_size, embed_size)
        #print("Positional encoding is :\n\n\n "+str(positional_encoding)),
        print("\n\n\nPositional encoding shape is :\n\n\n "+str(positional_encoding.shape)),
        
        #out=self.dropout(self.input_embedding(x)+self.positional_encoding(positions))
        out=self.dropout(embedding+positional_encoding)

        for layer in self.layers:
            out=layer(out,out,out,mask)
        return out    




In [7]:
# Toy dataset (replace with your dataset)
src_vocab_size = 200  # Example vocabulary size
max_length = 20  # Example maximum sequence length
num_layers = 8
embed_size = 128
num_heads = 8
forward_expansion = 4
dropout = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the encoder
encoder = Encoder(src_vocab_size, embed_size, num_layers, num_heads, device, forward_expansion, dropout, max_length)

# Generate some sample input data
sample_input = torch.randint(0, src_vocab_size, (2, max_length))  # Batch size 2, sequence length 20
sample_mask = torch.ones_like(sample_input)

# Pass the input data through the encoder
encoder_output = encoder(sample_input, sample_mask)

# Check the output shape
print("Encoder output shape:", encoder_output.shape)

Batch size is : 2
Seq length is : 20



Input embedding shape is :


 torch.Size([2, 20, 128])



Positional encoding shape is :


 torch.Size([2, 20, 128])



The shapes before permutetion
The shape of query is torch.Size([2, 20, 8, 16])
The shape of key is torch.Size([2, 20, 8, 16])
The shape of value is torch.Size([2, 20, 8, 16])



The shapes after permutetion
The shape of query is torch.Size([2, 8, 20, 16])
The shape of key is torch.Size([2, 8, 20, 16])
The shape of value is torch.Size([2, 8, 20, 16])
The initial shape of output is torch.Size([2, 8, 20, 16])
The final shape of output is torch.Size([2, 20, 128])



The shapes before permutetion
The shape of query is torch.Size([2, 20, 8, 16])
The shape of key is torch.Size([2, 20, 8, 16])
The shape of value is torch.Size([2, 20, 8, 16])



The shapes after permutetion
The shape of query is torch.Size([2, 8, 20, 16])
The shape of key is torch.Size([2, 8, 20, 16])
The shape of value is torch.Size([2, 8, 20, 16])
The initial shape of 

In [8]:
sample_input.shape

torch.Size([2, 20])

In [9]:
class Decoder_block(nn.Module):
    def __init__(self,embed_size,num_heads,forward_expansion,dropout,device):
        super(Decoder_block,self).__init__()
        self.multi_head_attention=attention(embed_size,num_heads)
        self.norm=nn.LayerNorm(embed_size)
        self.transformer_block=TransformerBlock(embed_size,num_heads,dropout,forward_expansion)
        self.dropout=nn.Dropout(dropout)
    
    def forward(self,x,key,value,src_mask,targ_mask):
        multi_head_attention=self.multi_head_attention(x,x,x,targ_mask)
        query=self.dropout(self.norm(x+multi_head_attention))
        out=self.transformer_block(query,key,value,src_mask)
        return out




In [27]:
class decoder(nn.Module):
    def __init__(
            self,
            target_vocab_size,
            embed_size,
            num_heads,
            num_layers,
            forward_expansion,
            dropout,
            device,
            max_length

    ):
        super(decoder,self).__init__()
        self.device=device
        self.word_embedding=nn.Embedding(target_vocab_size,embed_size)
        self.positional_encoding=PositionalEncoding1(max_length,embed_size)

        self.layers=nn.ModuleList(
            [
                Decoder_block(embed_size,num_heads,forward_expansion,dropout,device)
                for _ in range(num_layers)
            ]
        )
        self.fc_out=nn.Linear(embed_size,target_vocab_size)
        self.dropout=nn.Dropout(dropout)
    
    def forward(self,x,enc_out,src_mask,trg_mask):
        input_embedding=self.word_embedding(x)
        positional_encoding=self.positional_encoding(x)

        x=self.dropout(input_embedding+positional_encoding)

        for layer in self.layers:
            x=layer(x,enc_out,enc_out,src_mask,trg_mask)

            out=self.fc_out(x)
        return out
        


In [14]:
class Transformer(nn.Module):
    def __init__(
            self,
            src_vocab_size,
            trg_vocab_size,
            #src_pad_index,
            #trg_pad_index,
            embed_size=256,
            num_layers=6,
            forward_expansion=4,
            num_heads=8,
            dropout=0,
            device="cuda",
            max_length=100,
            enc_mask=None,
            dec_mask=1
            

    ):
        super(Transformer,self).__init__()
        self.encoder=Encoder(src_vocab_size,embed_size,num_layers,num_heads,device,forward_expansion,dropout,max_length)
        self.decoder=decoder(trg_vocab_size,embed_size,num_heads,num_layers,forward_expansion,dropout,device,max_length)
        #self.src_pad_index=src_pad_index
        #self.trg_pad_index=trg_pad_index
        self.device=device
    

    def forward(self,src,trg):
        enc_src=self.encoder(src,None)
        out=self.decoder(trg,enc_src,None,1)
        return out

In [28]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

x=torch.tensor([[1,5,6,4,3,9,5,2,8],[1,8,7,3,4,5,6,7,2]])
trg=torch.tensor([[1,7,4,3,5,9,2,0],[1,5,6,2,4,7,6,2]]).to(device)

model=Transformer(10,10)

In [29]:
output=model(x,trg[:,:-1])

Batch size is : 2
Seq length is : 9



Input embedding shape is :


 torch.Size([2, 9, 256])



Positional encoding shape is :


 torch.Size([2, 9, 256])



The shapes before permutetion
The shape of query is torch.Size([2, 9, 8, 32])
The shape of key is torch.Size([2, 9, 8, 32])
The shape of value is torch.Size([2, 9, 8, 32])



The shapes after permutetion
The shape of query is torch.Size([2, 8, 9, 32])
The shape of key is torch.Size([2, 8, 9, 32])
The shape of value is torch.Size([2, 8, 9, 32])
The initial shape of output is torch.Size([2, 8, 9, 32])
The final shape of output is torch.Size([2, 9, 256])



The shapes before permutetion
The shape of query is torch.Size([2, 9, 8, 32])
The shape of key is torch.Size([2, 9, 8, 32])
The shape of value is torch.Size([2, 9, 8, 32])



The shapes after permutetion
The shape of query is torch.Size([2, 8, 9, 32])
The shape of key is torch.Size([2, 8, 9, 32])
The shape of value is torch.Size([2, 8, 9, 32])
The initial shape of output is torch.S

In [31]:
x

tensor([[1, 5, 6, 4, 3, 9, 5, 2, 8],
        [1, 8, 7, 3, 4, 5, 6, 7, 2]])

In [30]:
output

tensor([[[ 0.9224,  0.1032,  0.5054,  1.0900, -0.1421, -0.4101, -0.0539,
          -0.8155,  0.2523,  0.0597],
         [ 0.3218,  0.0696,  0.7126,  1.1054, -0.0599,  0.0030, -0.1660,
          -0.6135,  0.1673,  0.0881],
         [ 0.7556, -0.3133, -0.1340,  0.5877, -0.2255, -0.1921, -0.6670,
          -1.3984,  0.5521,  0.2703],
         [ 1.0037, -0.5307,  0.3663,  0.6399, -0.4201, -0.4295, -0.3950,
          -1.3853,  0.4437,  0.3500],
         [ 0.6039, -0.4387,  0.4118,  0.5748, -0.1682, -0.4041, -0.3428,
          -1.2708,  0.1850,  0.2073],
         [ 0.8007, -0.4222, -0.0172,  0.6110, -0.2957, -0.3175, -0.1543,
          -1.0654,  0.2906,  0.1814],
         [ 0.6212, -0.1921, -0.2053,  1.1558, -0.2308, -0.5708,  0.0368,
          -1.0455, -0.0085,  0.5203]],

        [[ 0.9226, -0.0695,  0.2646,  1.2175, -0.0387, -0.6737, -0.4489,
          -1.2014,  0.5484,  0.0328],
         [ 0.7070, -0.3851,  0.0673,  0.6884,  0.0889, -0.6300, -0.4935,
          -1.5010,  0.4247, -0.1440],