In [1]:
# Fix path to be able to import classes
import sys
from pathlib import Path

# Add the src folder to the Python path.
# The path is relative to the notebook's working directory; adjust it if the
# notebook lives somewhere else.
src_path = Path("../src").resolve()

# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

<b><u>Exercise 3.1</u></b>: Comparing SelfAttention_v1 and SelfAttention_v2.

nn.Linear in SelfAttention_v2 uses a different weight initialisation scheme than the nn.Parameter(torch.rand(d_in, d_out)) used in SelfAttention_v1, which causes the two mechanisms to produce different results. To check that both implementations are otherwise equivalent, we can transfer the weight matrices from the v2 object to the v1 object, so that both objects then produce the same results.

Correctly assign the weights from an instance of SelfAttention_v2 to SelfAttention_v1.

In [2]:
import torch
from Chapter03 import SelfAttention_v1, SelfAttention_v2, inputs, d_in, d_out

# Seed the global RNG before construction so the module's initial weights
# (and therefore the printed output) are reproducible.
torch.manual_seed(42)
sa_v1 = SelfAttention_v1(d_in, d_out)
v1_context = sa_v1(inputs)
print(v1_context)

tensor([[1.3751, 0.8610],
        [1.4201, 0.8892],
        [1.4198, 0.8890],
        [1.3533, 0.8476],
        [1.3746, 0.8606],
        [1.3620, 0.8532]], grad_fn=<MmBackward0>)


In [3]:
# Same seed as for sa_v1; the outputs still differ because the two classes
# initialise their weights differently.
torch.manual_seed(42)
sa_v2 = SelfAttention_v2(d_in, d_out)
v2_context = sa_v2(inputs)
print(v2_context)

tensor([[0.3755, 0.2777],
        [0.3761, 0.2831],
        [0.3761, 0.2833],
        [0.3768, 0.2763],
        [0.3754, 0.2836],
        [0.3772, 0.2746]], grad_fn=<MmBackward0>)


In [4]:
# Inspect the key projection of each module side by side. v1 exposes the
# Parameter directly, while v2 wraps it in a layer with a .weight attribute.
v1_key_weight = sa_v1.W_key
v2_key_weight = sa_v2.W_key.weight
print("V1 class weight:", v1_key_weight)
print("\nV2 class weight:", v2_key_weight)

V1 class weight: Parameter containing:
tensor([[0.2566, 0.7936],
        [0.9408, 0.1332],
        [0.9346, 0.5936]], requires_grad=True)

V2 class weight: Parameter containing:
tensor([[-0.2811,  0.3391,  0.5090],
        [-0.4236,  0.5018,  0.1081]], requires_grad=True)


In [5]:
# Transposing v2's key weight gives it the same orientation as v1's W_key.
v2_key_transposed = sa_v2.W_key.weight.T
print("V2 class weight transposed:\n", v2_key_transposed)

V2 class weight transposed:
 tensor([[-0.2811, -0.4236],
        [ 0.3391,  0.5018],
        [ 0.5090,  0.1081]], grad_fn=<PermuteBackward0>)


In [6]:
# Copy v2's projection weights into v1.
# v2 keeps each weight in the transposed orientation relative to v1's
# parameters (compare the shapes printed earlier), hence the .T.
# detach().clone() gives v1 its own storage: wrapping the transposed view
# directly in nn.Parameter would leave both modules pointing at the same
# memory, so an in-place change to one (e.g. an optimizer step) would
# silently alter the other.
for weight_name in ("W_key", "W_query", "W_value"):
    v2_weight = getattr(sa_v2, weight_name).weight
    setattr(sa_v1, weight_name, torch.nn.Parameter(v2_weight.T.detach().clone()))

In [7]:
# Re-run v1 now that it carries v2's weights; the printed values should
# match the sa_v2 output shown earlier.
torch.manual_seed(42)
v1_transferred_context = sa_v1(inputs)
print(v1_transferred_context)

tensor([[0.3755, 0.2777],
        [0.3761, 0.2831],
        [0.3761, 0.2833],
        [0.3768, 0.2763],
        [0.3754, 0.2836],
        [0.3772, 0.2746]], grad_fn=<MmBackward0>)


In [8]:
# Confirm programmatically that both modules now yield identical tensors
# (same shape and element-wise equal values).
v1_result = sa_v1(inputs)
v2_result = sa_v2(inputs)
equal = torch.equal(v1_result, v2_result)
print("Tensors are equal:", equal)

Tensors are equal: True


<b><u>Exercise 3.2</u></b>: Returning two-dimensional embedding vectors

Change the input arguments for the MultiHeadAttentionWrapper(..., num_heads=2) call such that the output context vectors are two-dimensional instead of four-dimensional, while keeping num_heads=2. You don't have to modify the class implementation, just change one of the other input arguments.

In [3]:
# Build a small batch by stacking two copies of the input sequence along a
# new leading dimension.
batch = torch.stack([inputs, inputs], dim=0)
print(batch.shape)

torch.Size([2, 6, 3])


In [10]:
# Before: baseline configuration with d_out=2 and two heads, which yields
# four-dimensional context vectors (see the printed shape).
from Chapter03 import MultiHeadAttentionWrapper

torch.manual_seed(123)
d_in, d_out = 3, 2
context_length = batch.shape[1]  # number of tokens

# The fourth positional argument (0.0) is presumably the dropout rate;
# confirm against the class definition in Chapter03.
mha = MultiHeadAttentionWrapper(d_in, d_out, context_length, 0.0, num_heads=2)
context_vecs = mha(batch)

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tensor([[[-0.4519,  0.2216,  0.4772,  0.1063],
         [-0.5874,  0.0058,  0.5891,  0.3257],
         [-0.6300, -0.0632,  0.6202,  0.3860],
         [-0.5675, -0.0843,  0.5478,  0.3589],
         [-0.5526, -0.0981,  0.5321,  0.3428],
         [-0.5299, -0.1081,  0.5077,  0.3493]],

        [[-0.4519,  0.2216,  0.4772,  0.1063],
         [-0.5874,  0.0058,  0.5891,  0.3257],
         [-0.6300, -0.0632,  0.6202,  0.3860],
         [-0.5675, -0.0843,  0.5478,  0.3589],
         [-0.5526, -0.0981,  0.5321,  0.3428],
         [-0.5299, -0.1081,  0.5077,  0.3493]]], grad_fn=<CatBackward0>)
context_vecs.shape: torch.Size([2, 6, 4])


In [11]:
# Just change d_out to 1: with num_heads=2 unchanged, the resulting context
# vectors are two-dimensional instead of four-dimensional.
d_in = 3
d_out = 1

mha = MultiHeadAttentionWrapper(d_in, d_out, context_length, 0.0, num_heads=2)
context_vecs = mha(batch)

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tensor([[[0.0189, 0.2729],
         [0.2181, 0.3037],
         [0.2804, 0.3125],
         [0.2830, 0.2793],
         [0.2476, 0.2541],
         [0.2748, 0.2513]],

        [[0.0189, 0.2729],
         [0.2181, 0.3037],
         [0.2804, 0.3125],
         [0.2830, 0.2793],
         [0.2476, 0.2541],
         [0.2748, 0.2513]]], grad_fn=<CatBackward0>)
context_vecs.shape: torch.Size([2, 6, 2])


<b><u>Exercise 3.3</u></b>: Initialising GPT-2 size attention modules

Using the MultiHeadAttention class, initialise a multi-head attention module that has the same number of heads as the smallest GPT-2 model (12 attention heads). Ensure you use the respective input and output embedding sizes (768 dimensions). 

In [5]:
from Chapter03 import MultiHeadAttention

torch.manual_seed(42)

# Sizes requested by the exercise: 12 attention heads with 768-dimensional
# input and output embeddings.
num_heads = 12
d_in = 768
d_out = 768
context_length = 1_024

mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads)
# Leave the module as the last expression so the notebook displays its layers.
mha

MultiHeadAttention(
  (W_query): Linear(in_features=768, out_features=768, bias=False)
  (W_key): Linear(in_features=768, out_features=768, bias=False)
  (W_value): Linear(in_features=768, out_features=768, bias=False)
  (out_proj): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)