<a href="https://colab.research.google.com/github/Jay-hv7/Transformer--deeplearning.ai/blob/main/SelfAttention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F # for softmax

In [2]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Computes ``softmax(Q @ K^T / sqrt(d_k)) @ V`` where Q, K and V are
    bias-free linear projections of the same token encodings
    (word embedding + positional encoding).

    Args:
        d_model: Number of encoding values per token. It is also the input
            and output size of the query, key and value projections.
        row_dim: Dimension of the input that indexes tokens.
        col_dim: Dimension of the input that indexes encoding values.

    The defaults ``row_dim=0, col_dim=1`` match an unbatched input of shape
    ``(seq_len, d_model)``. For a batched ``(batch, seq_len, d_model)`` input
    pass ``row_dim=1, col_dim=2`` so that the transpose and the softmax act
    on the token/feature dimensions rather than on the batch dimension.
    """

    def __init__(self, d_model: int = 2, row_dim: int = 0, col_dim: int = 1):
        super().__init__()
        # nn.Linear stores its weight as (out_features, in_features) and
        # computes x @ weight.T, i.e. the weight is kept in transposed form.
        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)

        self.row_dim = row_dim
        self.col_dim = col_dim

    def forward(self, token_encodings):
        """Return the self-attention values for ``token_encodings``.

        Args:
            token_encodings: Tensor whose ``row_dim`` indexes tokens and whose
                ``col_dim`` holds the ``d_model`` encoding values per token.

        Returns:
            Tensor of the same shape as ``token_encodings`` in which every
            token is replaced by an attention-weighted sum of the values.
        """
        q = self.W_q(token_encodings)
        k = self.W_k(token_encodings)
        v = self.W_v(token_encodings)

        # Dot-product similarity between every query and every key.
        sims = torch.matmul(q, k.transpose(dim0=self.row_dim, dim1=self.col_dim))

        # Scale by sqrt(d_k). A plain Python float is used on purpose:
        # wrapping it in torch.tensor() would create a float32 scalar, which
        # rounds the divisor when the module runs in float64 and allocates a
        # new tensor on every call.
        scaled_sims = sims / (k.size(self.col_dim) ** 0.5)

        # Each row becomes a distribution over the keys (sums to 1).
        attention_percents = F.softmax(scaled_sims, dim=self.col_dim)

        # Weighted sum of the values gives the attention output.
        return torch.matmul(attention_percents, v)





In [3]:
# Example input: three tokens (rows), each described by two encoding
# values (columns), i.e. word embedding + positional encoding.
encodings_matrix = torch.tensor(
    [
        [1.16, 0.23],
        [0.57, 1.36],
        [4.41, -2.16],
    ]
)

In [4]:
# Seed PyTorch's random number generator so that the randomly initialised
# layer weights created afterwards are the same on every run.
torch.manual_seed(42)

<torch._C.Generator at 0x7ef6b0b24d90>

In [5]:
# Build the attention layer for 2-value token encodings laid out as
# (tokens, values) and run it on the example tokens; the cell displays the
# resulting attention values. The layer weights are randomly initialised, so
# the numbers shown depend on the seed set before this cell runs.
selfAttention = SelfAttention(d_model=2,row_dim=0,col_dim=1)
selfAttention(encodings_matrix)


tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<MmBackward0>)

In [6]:
# Check the results by hand: show the query weights. nn.Linear stores its
# weight as (out_features, in_features) and applies x @ weight.T, so the
# transpose is the matrix the token encodings are actually multiplied by.
selfAttention.W_q.weight.transpose(0,1)

tensor([[ 0.5406, -0.1657],
        [ 0.5869,  0.6496]], grad_fn=<TransposeBackward0>)

In [7]:
# Key weights, transposed into the form the token encodings are multiplied by.
selfAttention.W_k.weight.transpose(0,1)

tensor([[-0.1549, -0.3443],
        [ 0.1427,  0.4153]], grad_fn=<TransposeBackward0>)

In [8]:
# Value weights, transposed into the form the token encodings are multiplied by.
selfAttention.W_v.weight.transpose(0,1)

tensor([[ 0.6233,  0.6146],
        [-0.5188,  0.1323]], grad_fn=<TransposeBackward0>)

In [9]:
# Queries for every token: the example encodings passed through the query
# projection (one row per token).
selfAttention.W_q(encodings_matrix)

tensor([[ 0.7621, -0.0428],
        [ 1.1063,  0.7890],
        [ 1.1164, -2.1336]], grad_fn=<MmBackward0>)

In [None]:
# Scratch cell cleared: it held only the bare, undefined name `yes`, which
# raises NameError when the notebook is executed with Restart & Run All.