In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BertTokenizer, BertModel
from transformers import pipeline
import torch

# Token Embeddings

In [2]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer.tokenize("This is an example of the bert tokenizer")
print(tokens)

['this', 'is', 'an', 'example', 'of', 'the', 'bert', 'token', '##izer']


In [3]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)

[2023, 2003, 2019, 2742, 1997, 1996, 14324, 19204, 17629]


In [4]:
model = BertModel.from_pretrained("bert-base-uncased")

# get the embedding vector for the word "example"
example_token_id = tokenizer.convert_tokens_to_ids(["example"])[0]
example_embedding = model.embeddings.word_embeddings(torch.tensor([example_token_id]))

print(example_embedding.shape)
print(example_embedding)

torch.Size([1, 768])
tensor([[ 7.0699e-03,  3.9590e-02, -6.2164e-02, -8.4340e-02, -1.2362e-02,
          1.0582e-02, -1.2302e-01, -6.6595e-03, -6.5421e-02,  2.0174e-03,
         -6.1219e-03, -3.7570e-02, -1.0751e-01,  6.5124e-02, -8.2510e-03,
         -6.3290e-02, -1.6745e-02,  7.7046e-02, -5.7412e-03, -3.5633e-02,
         -1.3281e-02,  1.0091e-02, -2.9987e-02, -1.9298e-02, -7.6704e-02,
         -7.6498e-03,  1.4793e-02,  1.8764e-02, -7.9018e-02, -1.6882e-02,
          3.9476e-02, -5.0676e-02,  2.0185e-02, -8.1285e-02, -1.0244e-02,
         -1.2035e-02, -1.6211e-02, -1.5720e-03, -4.7858e-02,  8.2827e-03,
         -4.4718e-03,  4.8962e-02, -9.9693e-03,  2.4308e-02,  6.6937e-02,
         -7.0327e-02, -1.2011e-02,  2.0608e-02, -3.3565e-02, -5.4982e-03,
          4.4249e-02,  2.8569e-02, -5.2312e-02, -2.2065e-04, -2.1409e-02,
         -1.3903e-02, -5.2360e-02,  1.5349e-02, -1.7530e-02, -1.1394e-02,
          1.3658e-02, -5.4222e-02,  2.2562e-02, -4.7320e-02, -3.4117e-02,
         -8.0298e

In [5]:
# BPE tokenizer vocab has 50,257 words, and GPT3 has embedding size of 12,288
vocab_size = 6
output_dim = 3

# we can instantiate an embedding layer in Pytorch 
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

'''
The weight matrix of the embedding layer contains small, random values. 
These values are optimized during LLM training as part of the LLM optimization itself. 
Moreover, we can see that the weight matrix has six rows and three columns. 
There is one row for each of the six possible tokens in the vocabulary, and there is one column for each of the three embedding dimensions.
'''

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


'\nThe weight matrix of the embedding layer contains small, random values. \nThese values are optimized during LLM training as part of the LLM optimization itself. \nMoreover, we can see that the weight matrix has six rows and three columns. \nThere is one row for each of the six possible tokens in the vocabulary, and there is one column for each of the three embedding dimensions.\n'

In [6]:
# Apply it to tokenID to obtain the embedding vector
print(embedding_layer(torch.tensor([3])))

'''
If we compare the embedding vector for token ID 3 to the previous embedding matrix, 
we see that it is identical to the fourth row (Python starts with a zero index, so it’s the row corresponding to index 3). 
In other words, the embedding layer is essentially a lookup operation that retrieves rows from the embedding layer’s weight matrix via a token ID.
'''

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


'\nIf we compare the embedding vector for token ID 3 to the previous embedding matrix, \nwe see that it is identical to the fourth row (Python starts with a zero index, so it’s the row corresponding to index 3). \nIn other words, the embedding layer is essentially a lookup operation that retrieves rows from the embedding layer’s weight matrix via a token ID.\n'

# Attention mechanism

In [7]:
# In self-attention, our goal is to calculate context vectors z(i) for each element x(i) in the input sequence. 
# A context vector can be interpreted as an enriched embedding vector.
# Context vectors purpose is to create enriched representations of each element in an input sequence by incorporating information from all other elements in the sequence.
# This is essential in LLMs, which need to understand the relationship and relevance of words in a sentence to each other.

inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

# ex: z(2) is an embedding that contains information about x(2) and all other input elements x(1) to x(T).

In [8]:
# The first step of self-attention is to compute the intermediate values, w, referred as attention scores. 
# We determine these scores by computing the dot product of the query, x(2), with every other input token

# Example: Compute attention score for "journey x(2)"
query = inputs[1]
attn_score2 = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
    print(i,x_i)
    attn_score2[i] = torch.dot(x_i, query)
print(attn_score2)

'''
Beyond viewing the dot product operation as a mathematical tool that combines two vectors to yield a scalar value, 
the dot product is a measure of similarity because it quantifies how closely two vectors are aligned: 
a higher dot product indicates a greater degree of alignment or similarity between the vectors. 
In the context of self-attention mechanisms, the dot product determines the extent to which each element in a sequence focuses on, or “attends to,” any other element: 
the higher the dot product, the higher the similarity and attention score between two elements.
'''

0 tensor([0.4300, 0.1500, 0.8900])
1 tensor([0.5500, 0.8700, 0.6600])
2 tensor([0.5700, 0.8500, 0.6400])
3 tensor([0.2200, 0.5800, 0.3300])
4 tensor([0.7700, 0.2500, 0.1000])
5 tensor([0.0500, 0.8000, 0.5500])
tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


'\nBeyond viewing the dot product operation as a mathematical tool that combines two vectors to yield a scalar value, \nthe dot product is a measure of similarity because it quantifies how closely two vectors are aligned: \na higher dot product indicates a greater degree of alignment or similarity between the vectors. \nIn the context of self-attention mechanisms, the dot product determines the extent to which each element in a sequence focuses on, or “attends to,” any other element: \nthe higher the dot product, the higher the similarity and attention score between two elements.\n'

In [9]:
'''
In the next step, we normalize each of the attention scores we computed previously. 
The main goal behind the normalization is to obtain attention weights that sum up to 1. 
This normalization is a convention that is useful for interpretation and maintaining training stability in an LLM. 
'''

attn_weights_2_tmp = attn_score2 / attn_score2.sum()
print("Attention weights:", attn_weights_2_tmp)
print("Sum:", attn_weights_2_tmp.sum())

'''
In practice, it’s more common and advisable to use the softmax function for normalization. 
This approach is better at managing extreme values and offers more favorable gradient properties during training. 
The following is a basic implementation of the softmax function for normalizing the attention scores using Pytorch:
'''

attn_weights_2 = torch.softmax(attn_score2, dim=0)
print("Attention weights:", attn_weights_2)
print("Sum:", attn_weights_2.sum())

Attention weights: tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
Sum: tensor(1.0000)
Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum: tensor(1.)


In [10]:
'''
Now that we have computed the normalized attention weights, we are ready for the final step: 
calculating the context vector z(2) by multiplying the embedded input tokens, x(i), with the corresponding attention weights and then summing the resulting vectors.
Thus, context vector z(2) is the weighted sum of all input vectors, obtained by multiplying each input vector by its corresponding attention weight:
'''

query = inputs[1]         
context_vec_2 = torch.zeros(query.shape)
for i,x_i in enumerate(inputs):
    context_vec_2 += attn_weights_2[i]*x_i
print(context_vec_2)

tensor([0.4419, 0.6515, 0.5683])


In [11]:
# Computing attention weights for all input tokens

attn_scores = torch.empty(6, 6)
for i, x_i in enumerate(inputs):
    for j, x_j in enumerate(inputs):
        attn_scores[i, j] = torch.dot(x_i, x_j)
print(attn_scores)
print()
'''
When computing the preceding attention score tensor, we used for loops in Python. 
However, for loops are generally slow, and we can achieve the same results using matrix multiplication:
'''

# Attention matrix: the matrix of attention scores computed in a Transformer's self-attention mechanism. 
# It determines how much focus each word gives to each word in a sequence.
attn_scores = inputs @ inputs.T
print(attn_scores)
print()

# Compute attention weights: normalization
attn_weights = torch.softmax(attn_scores, dim=-1)
print(attn_weights)
print()

# Compute context vectors
all_context_vecs = attn_weights @ inputs
print(all_context_vecs)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0

# Implementing self-attention with trainable weights

In [12]:
# Computing the attention weights 
'''
We will implement the self-attention mechanism step by step by introducing the three trainable weight matrices Wq, Wk, and Wv. 
These three matrices are used to project the embedded input tokens, x(i), into query, key, and value vectors, respectively.

Note that in GPT-like models, the input and output dimensions are usually the same, but to better follow the computation, 
we’ll use different input (d_in=3) and output (d_out=2) dimensions here.
'''
x_2 = inputs[1]     #1
d_in = inputs.shape[1]      #2
d_out = 2         #3

#1 The second input element
#2 The input embedding size, d=3
#3 The output embedding size, d_out=2

In [13]:
# Then we initialize the thre weight matrices Wq, Wk, and Wv. 3x2 dim
torch.manual_seed(123)
W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_key   = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

# Then we compute the query, key, and value vectors
query_2 = x_2 @ W_query # 1x3 x 3x2 => 1x2 
key_2 = x_2 @ W_key 
value_2 = x_2 @ W_value
print(query_2, key_2, value_2)

tensor([0.4306, 1.4551]) tensor([0.4433, 1.1419]) tensor([0.3951, 1.0037])


In [14]:
'''
Even though our temporary goal is only to compute the one context vector, z(2), 
we still require the key and value vectors for all input elements as they are involved in computing the attention weights with respect to the query q (2)
'''

keys = inputs @ W_key 
values = inputs @ W_value
print("keys.shape:", keys.shape)
print("values.shape:", values.shape)

keys.shape: torch.Size([6, 2])
values.shape: torch.Size([6, 2])


In [15]:
# Step2: Compute Attention Scores
attn_scores_2 = query_2 @ keys.T       #1
print(attn_scores_2)

tensor([1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440])


In [None]:
'''
Now, we want to go from the attention scores to the attention weights, as illustrated in figure 3.16. 
We compute the attention weights by scaling the attention scores and using the softmax function. 
However, now we scale the attention scores by dividing them by the square root of the embedding dimension of the keys 
(taking the square root is mathematically the same as exponentiating by 0.5):
'''

d_k = keys.shape[-1]
attn_weights_2 = torch.softmax(attn_scores_2 / d_k**0.5, dim=-1)
print(attn_weights_2)

2
tensor([0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820])


In [None]:
# Final step: compute context vectors

'''
Similar to when we computed the context vector as a weighted sum over the input vectors (see section 3.3), 
we now compute the context vector as a weighted sum over the value vectors. 
Here, the attention weights serve as a weighting factor that weighs the respective importance of each value vector. 
Also as before, we can use matrix multiplication to obtain the output in one step:
'''

context_vec_2 = attn_weights_2 @ values
print(context_vec_2)

'''
So far, we’ve only computed a single context vector, z(2). 
Next, we will generalize the code to compute all context vectors in the input sequence, z(1) to z(T).
'''

# input: 6x3
# Wq, Wk, Wv: 3x2 each 
# q(t) = x(t) x Wq: 1x3 x 3x2 = 1x2
# key = input x Wk = 6x3 x 3x2 = 6x2 (same for value)
# attention score for x(t): query_t @ key.T = 1x2 x 2x6 = 1x6 
# attention weights for x(t): softmax(attention score of x(t) / embedded dim of key = 2) = 1x6
# context vector z(t): attention weights of x(t) @ values = 1x6 @ 6x2 => 1x2

# Query: the current item the model focuses on or tries to understand
# Key: each item in the input sequence has an associated key, these keys are used to match the query 
# Value: It represents the actual content or representation of the input items. Once the model determines which keys are most relevant to the query, it retrieves the corresponding values. 

tensor([0.3061, 0.8210])
