In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Attention(nn.Module): #inherits from nn.Module
    def __init__(self, d_in, d_out): # contructor of the class
        super().__init__() # intialize the parent class
        # keyword self in a classs refers to the instance of the class
        self.d_in = d_in
        self.d_out = d_out
        # create a layer that applies an affine transformation to the input
        # y = Ax + b, where A is a weight matrix and b is a bias vector
        # Weights intialized with a uniform distribution
        # its weights and biases are stored as torch.nn.Parameter objects.
        # This makes them part of the model’s .parameters() 
        # returns the parameters of the model when called
        self.Q = nn.Linear(d_in, d_out) 
        self.K = nn.Linear(d_in, d_out)
        self.V = nn.Linear(d_in, d_out)

    def forward(self, x):
        queries = self.Q(x) # apply the affine transformation to the input x
        keys = self.K(x)
        values = self.V(x)
        # Compute the attention scores, bmm is batch matrix multiplication
        # scores = queries * keys^T / sqrt(d_out)
        scores = torch.bmm(queries, keys.transpose(1, 2)) 
        # keys.transpose(1, 2) transposes the last two dimensions
        # (batch_size, seq_len, d_out) -> (batch_size, d_out, seq_len)
        scores = scores / (self.d_out ** 0.5)
        attention = F.softmax(scores, dim=2)
        # converts the attention scores into probabilities along the last dimension, 
        # so each set of scores sums to 1 for every query in the batch.
        hidden_states = torch.bmm(attention, values)
        return hidden_states


In [None]:
SOS_token = 0
EOS_token = 1

index2words = {
    SOS_token: 'SOS',
    EOS_token: 'EOS',
}

words = "How are you doing ? I am good and you ?"
words_list = set(words.lower().split(' '))
for word in words_list:
    index2words[len(index2words)] = word

print(index2words)

words2index = {w: i for i, w in index2words.items()}
print(words2index)

{0: 'SOS',
 1: 'EOS',
 2: '?',
 3: 'i',
 4: 'doing',
 5: 'you',
 6: 'and',
 7: 'how',
 8: 'good',
 9: 'am',
 10: 'are'}

In [None]:
def convert2tensor(sentence):
    words_list = sentence.lower().split(' ')
    indexes = [words2index[word] for word in words_list]
    # .view(1, -1) reshapes the tensor to have a single row
    return torch.tensor(indexes, dtype=torch.long).view(1, -1)

sentence = "How are you doing ?"
indexes = convert2tensor(sentence)

print(indexes.size())

torch.Size([1, 5])

In [33]:
HIDDEN_SIZE = 10
VOCAB_SIZE = len(words2index)

# Create an embedding layer that maps words to vectors of size HIDDEN_SIZE
# The embedding layer is initialized with a uniform distribution
embedding = nn.Embedding(VOCAB_SIZE, HIDDEN_SIZE)
# Create an instance of the Attention class
# creates an object with three learnable linear layers (for Q, K, V),
# ready to compute self-attention on input data.
attention = Attention(HIDDEN_SIZE, HIDDEN_SIZE)

sentence = "How are you doing ?"
input_tensor = convert2tensor(sentence)

# for each word index in input_tensor, 
# embedding layer looks up its vector of size HIDDEN_SIZE
# and returns a tensor of shape (1, seq_len, HIDDEN_SIZE)
embedded = embedding(input_tensor)
print(embedded.size())

# Pass the embedded tensor through the attention layer
# The attention layer computes the attention scores and returns the hidden states
# The output hidden_states will have the same shape as embedded
hidden_states = attention(embedded)
print(hidden_states.size())

torch.Size([1, 5, 10])
torch.Size([1, 5, 10])


In [34]:
d_in = HIDDEN_SIZE
d_out = HIDDEN_SIZE
Q = nn.Linear(d_in, d_out)
K = nn.Linear(d_in, d_out)
V = nn.Linear(d_in, d_out)

queries, keys, values = Q(embedded), K(embedded), V(embedded)
print(queries.size(), keys.size(), values.size())

scores = torch.bmm(queries, keys.transpose(1, 2))
print(scores.size())

sccores = scores / (d_out ** 0.5)
attention = F.softmax(scores, dim=2)
print(attention.size())
print(attention.sum(dim=2))

hidden_states = torch.bmm(attention, values)
print(hidden_states.size())

torch.Size([1, 5, 10]) torch.Size([1, 5, 10]) torch.Size([1, 5, 10])
torch.Size([1, 5, 5])
torch.Size([1, 5, 5])
tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000]], grad_fn=<SumBackward1>)
torch.Size([1, 5, 10])


#### Simplied Self-Attention

In [1]:
# STEP 0
import torch
inputs = torch.tensor(
[[0.43, 0.15, 0.89], # Your (x^1)
[0.55, 0.87, 0.66], # journey (x^2)
[0.57, 0.85, 0.64], # starts (x^3)
[0.22, 0.58, 0.33], # with (x^4)
[0.77, 0.25, 0.10], # one (x^5)
[0.05, 0.80, 0.55]] # step (x^6)
)
print(inputs.shape) # With 6 Tokens of each 3 dimensions.

# STEP 1 - Attention Scores
query_2 = inputs[1] # second input token as query
attn_scores_2 = torch.empty(inputs.shape[0]) # same shape as input sequence
for i, x_i in enumerate(inputs):
    attn_scores_2[i] = torch.dot(x_i, query_2)
    # concise way to multiply two vectors element wise.
    # measure of similarity on how close two vectors.
    # determine the extend each element "attends to" any other element
print(attn_scores_2)

# STEP 2 - Normalize Attention Scores.
# To use Softamx, better at managing extreme values & 
# offer more favourable gradient  properties.
# the output is not negative & interepretable as probabilities
# OPTION A
attn_weights_temp_2 = attn_scores_2 / attn_scores_2.sum()
print("Attention Wieghts", attn_weights_temp_2)
print("Sum of Attention Weights", attn_weights_temp_2.sum()
      )
# OPTION B
def softmax_naive(x):
    return torch.exp(x) / torch.exp(x).sum(dim=0)

attn_weights_naive_2 = softmax_naive(attn_scores_2)
print("Attention Wights with Softmax", attn_weights_naive_2)
print("Sum of Attention Weights", attn_weights_naive_2.sum())

# OPTION C
# Softmax naive may have underflow or overflow issues, (num stability)
# Advisable to use PyTorch's Softmax
attn_weights_2 = torch.softmax(attn_scores_2, dim=0)
print("Attention Weights with PyTorch", attn_weights_2)
print("Sum of Attention Weights", attn_weights_2.sum())

# STEP 3 - Calculating Context Vectors
context_vect_2 = torch.zeros(query_2.shape) # second input token
for i, x_i in enumerate(inputs):
    context_vect_2 += attn_weights_2[i] * x_i
print("Context Vector", context_vect_2)

torch.Size([6, 3])
tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])
Attention Wieghts tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
Sum of Attention Weights tensor(1.0000)
Attention Wights with Softmax tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum of Attention Weights tensor(1.)
Attention Weights with PyTorch tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum of Attention Weights tensor(1.)
Context Vector tensor([0.4419, 0.6515, 0.5683])


#### Self Attention For All Token

In [2]:
# SLOWER FOR LOOP
# attn_scores = torch.empty(6, 6)
# for i, x_i in enumerate(inputs):
#     for j, x_j in enumerate(inputs):
#         attn_scores[i, j] = torch.dot(x_i, x_j)
# print("Attention Scores", attn_scores)

# STEP 1 - ATENTION SCORES
attn_scores = inputs @ inputs.T # Matric Multiplications
print(attn_scores)

# STEP 2 - NORMALIZE ATTENTION SCORES - ATTENTION WEIGHTS
attn_weights = torch.softmax(attn_scores, dim=-1)
# apply normalization along last dimension
# it will normalize along columns, so that values in each row sum upto 1
print(attn_weights)

# STEP 3 - CONTEXT VECTORS COMPUTED
context_vecs = attn_weights @ inputs
print("Context Vectors", context_vecs)
# Each row contains three dimensional context vectors

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])
tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])
Context Vectors tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


#### Self Attention with Trainable Weights


In [3]:
# Trainable weights are crucial so that model can learn
# to produce good context vectors
x_2 = inputs[1] # Second element
d_in = inputs.shape[1] # The embeddings, d=3
d_out = 2 # output embedding, d = 2
# In GPT MOdels, input & Output are same.

torch.manual_seed(123)
# Why Query, Key & Values ?
# -- Search (Query in DB), Key is like a DB Indexing, Searching
# -- Values is the actual content as in key-valu pair in db
# Matrices used to prpject embedded input tokens.
# Would use requires_grad=True, to update matrices during training
W_q = torch.nn.Parameter(torch.randn(d_in, d_out), requires_grad=False)
W_k = torch.nn.Parameter(torch.randn(d_in, d_out), requires_grad=False)
W_v = torch.nn.Parameter(torch.randn(d_in, d_out), requires_grad=False)

query_2 = x_2 @ W_q # (1, 3) @ (3, 2)
# key_2 = x_2 @ W_k
# value_2 = x_2 @ W_v

# need to obtain all keys & values
keys = inputs @ W_k
values = inputs @ W_v
# Projected to (6, 2) i.e. on to 2 dimensional embedding space.
print("Keys Shape", keys.shape)
print("Values Shape", values.shape)

# Attention Score
attn_scores_2 = query_2 @ keys.T # all attention score for given query
print(attn_scores_2)

# Attention Wieghts
# By dividing them by square root of the embedding dimension of keys
# large dot product - small gradients during backpropogation, due to softmax
# As dot product increases, softmax function behaves like a step function

d_k = keys.shape[-1]
attn_weights_2 = torch.softmax(attn_scores_2 / d_k ** 0.5, dim=-1)
context_vec_2 = attn_weights_2 @ values
print(context_vec_2)


Keys Shape torch.Size([6, 2])
Values Shape torch.Size([6, 2])
tensor([ 0.2172,  0.1376,  0.1730, -0.0491,  0.7616, -0.3809])
tensor([0.2854, 0.4081])


#### A compact self-attention class

In [12]:
import torch.nn as nn
class SelfAttention_v1(nn.Module): # a class derived from nn.Module
    def __init__(self, d_in, d_out):
        super().__init__()
        # intializes trainable weights, tranforming input d_in to d_out
        self.W_q = nn.Parameter(torch.rand(d_in, d_out))
        self.W_k = nn.Parameter(torch.rand(d_in, d_out))
        self.W_v = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        # transform input data into Queries, Keys, values
        queries = x @ self.W_q
        keys = x @ self.W_k
        values = x @ self.W_v

        attn_scores = queries @ keys.T # (6, 2) @ (2, 6) --> (6, 6)
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1] ** 0.5, dim=-1 
        )
        context_vec = attn_weights @ values # (6, 6) @ (6, 2) --> (6, 2)
        return context_vec

torch.manual_seed(123)
sa_v1 = SelfAttention_v1(3, 2)
print(sa_v1(inputs))

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)


#### Self Attention class using PyTorch's Linear Layers


In [5]:
# Using nn.Linear has optimized weight initialization scheme,
# leading better model training  (effective & Stable)

class SelfAttention_v2(nn.Module):
    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.W_q = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_k = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_v = nn.Linear(d_in, d_out, bias=qkv_bias)

    def forward(self, x):
        queries = self.W_q(x)
        keys = self.W_k(x)
        values = self.W_v(x)

        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1] ** 0.5, dim=-1
        )
        context_vec = attn_weights @ values
        return context_vec

torch.manual_seed(123)
sa_v2 = SelfAttention_v2(3, 2)
print(sa_v2(inputs))
    


tensor([[-0.5337, -0.1051],
        [-0.5323, -0.1080],
        [-0.5323, -0.1079],
        [-0.5297, -0.1076],
        [-0.5311, -0.1066],
        [-0.5299, -0.1081]], grad_fn=<MmBackward0>)


#### Comparing SelfAttention_v1 and SelfAttention_v2

Transfer the weight matrices from a SelfAttention_v2 object to a Self-Attention_v1, such that both objects then produce the same results. Your task is to correctly assign the weights from an instance of SelfAttention_v2 to an instance of SelfAttention_v1. To do this, you need to understand the relationship between the weights in both versions.

In [None]:
sa_v2.W_k.weight.data.T
sa_v1.W_k.data



tensor([[0.1366, 0.1025],
        [0.1841, 0.7264],
        [0.3153, 0.6871]])