Exercise 3.1 Comparing SelfAttention_v1 and SelfAttention_v2
Note that nn.Linear in SelfAttention_v2 uses a different weight initialization scheme than the nn.Parameter(torch.rand(d_in, d_out)) used in SelfAttention_v1, which causes both mechanisms to produce different results. To check that both implementations, SelfAttention_v1 and SelfAttention_v2, are otherwise similar, we can transfer the weight matrices from a SelfAttention_v2 object to a SelfAttention_v1 object, such that both objects then produce the same results.
Your task is to correctly assign the weights from an instance of SelfAttention_v2 to an instance of SelfAttention_v1. To do this, you need to understand the relationship between the weights in both versions. (Hint: nn.Linear stores the weight matrix in a transposed form.) After the assignment, you should observe that both instances produce the same outputs.

In [58]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [59]:
# Set random seed for reproducibility: the weight initialisation and the
# random input drawn further down all consume the global torch RNG.
torch.manual_seed(123)

# Input and output feature sizes shared by both attention variants
d_in = 4
d_out = 3

# Define SelfAttention_v1
class SelfAttention_v1(nn.Module):
    """Single-head scaled dot-product self-attention using raw weight matrices.

    The query, key and value projections are ``nn.Parameter`` matrices of
    shape ``(d_in, d_out)`` initialised with ``torch.rand``.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the queries, keys, values and
            of the returned context vectors.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        """Compute context vectors for ``x``.

        Args:
            x: Tensor whose last dimension has size ``d_in`` and whose
                second-to-last dimension indexes the sequence positions.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``d_out``.
        """
        queries = x @ self.W_query
        keys = x @ self.W_key
        values = x @ self.W_value

        # Scale by the width of the keys themselves. Reading a module-level
        # ``d_out`` here would tie every instance to whatever value that
        # global currently holds instead of the instance's own size.
        scale = keys.shape[-1] ** 0.5
        attn_weights = F.softmax(queries @ keys.transpose(-2, -1) / scale, dim=-1)
        return attn_weights @ values

# Get SelfAttention_v2 from nn.Linear
class SelfAttention_v2(nn.Module):
    """Single-head scaled dot-product self-attention built from ``nn.Linear``.

    The query, key and value projections are bias-free ``nn.Linear`` layers
    mapping ``d_in`` features to ``d_out`` features.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the queries, keys, values and
            of the returned context vectors.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias=False)
        self.W_key = nn.Linear(d_in, d_out, bias=False)
        self.W_value = nn.Linear(d_in, d_out, bias=False)

    def forward(self, x):
        """Compute context vectors for ``x``.

        Args:
            x: Tensor whose last dimension has size ``d_in`` and whose
                second-to-last dimension indexes the sequence positions.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``d_out``.
        """
        queries = self.W_query(x)
        keys = self.W_key(x)
        values = self.W_value(x)

        # Scale by the width of the keys themselves. Reading a module-level
        # ``d_out`` here would tie every instance to whatever value that
        # global currently holds instead of the instance's own size.
        scale = keys.shape[-1] ** 0.5
        attn_weights = F.softmax(queries @ keys.transpose(-2, -1) / scale, dim=-1)
        return attn_weights @ values

# Build one module of each flavour (v1 first, then v2)
sa_v1 = SelfAttention_v1(d_in=d_in, d_out=d_out)
sa_v2 = SelfAttention_v2(d_in=d_in, d_out=d_out)

# Copy sa_v2's projections into sa_v1. nn.Linear stores its weight
# transposed relative to the (d_in, d_out) matrices that SelfAttention_v1
# multiplies by, so each weight is transposed and cloned before it is
# wrapped in a fresh nn.Parameter.
sa_v1.W_query = nn.Parameter(sa_v2.W_query.weight.transpose(0, 1).clone())
sa_v1.W_key = nn.Parameter(sa_v2.W_key.weight.transpose(0, 1).clone())
sa_v1.W_value = nn.Parameter(sa_v2.W_value.weight.transpose(0, 1).clone())

# Random test input: 2 sequences of 5 tokens, d_in features per token
x = torch.randn((2, 5, d_in))




In [60]:
# -----------------------
# Check: both modules should now agree on the same input
# -----------------------
out1, out2 = sa_v1(x), sa_v2(x)

print(
    "Is the output the same？",
    torch.allclose(out1, out2, atol=1e-6),
)  # True

Is the output the same？ True


Exercise 3.2 Returning two-dimensional embedding vectors
Change the input arguments for the MultiHeadAttentionWrapper(..., num_heads=2) call such that the output context vectors are two-dimensional instead of four-dimensional while keeping the setting num_heads=2. Hint: You don’t have to modify the class implementation; you just have to change one of the other input arguments.

In [61]:
# Implement Single-Head Attention
class SelfAttentionHead(nn.Module):
    """One head of scaled dot-product self-attention (no masking).

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of this head's output.
        block_size: Stored on the instance as ``block_size``; not used by
            ``forward``.
    """

    def __init__(self, d_in, d_out, block_size):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias=False)
        self.W_key = nn.Linear(d_in, d_out, bias=False)
        self.W_value = nn.Linear(d_in, d_out, bias=False)
        self.block_size = block_size

    def forward(self, x):
        """Compute this head's context vectors.

        Args:
            x: Tensor of shape ``(..., T, d_in)``, e.g. ``(B, T, d_in)`` for
                a batch or ``(T, d_in)`` for a single sequence.

        Returns:
            Tensor of shape ``(..., T, d_out)``.
        """
        # The input shape is deliberately not unpacked: none of its sizes
        # are needed, and unpacking would reject anything that is not 3-D.
        Q = self.W_query(x)  # (..., T, d_out)
        K = self.W_key(x)    # (..., T, d_out)
        V = self.W_value(x)  # (..., T, d_out)

        # Scale the dot products by sqrt(d_out) before the softmax
        attn_scores = Q @ K.transpose(-2, -1) / (Q.shape[-1] ** 0.5)
        attn_weights = F.softmax(attn_scores, dim=-1)
        return attn_weights @ V  # (..., T, d_out)

# Implement Multi-Head Attention
class MultiHeadAttention(nn.Module):
    """Run several SelfAttentionHead modules and concatenate their outputs.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Output size of EACH head; the concatenated result therefore
            has a last dimension of ``num_heads * d_out``.
        block_size: Passed through to every head.
        dropout: Dropout probability applied to the concatenated output.
        num_heads: Number of heads to create.
    """

    def __init__(self, d_in, d_out, block_size, dropout, num_heads):
        super().__init__()
        self.heads = nn.ModuleList()
        for _ in range(num_heads):
            self.heads.append(SelfAttentionHead(d_in, d_out, block_size))
        # NOTE: this projection is registered but forward() never applies it,
        # so the output keeps the concatenated width num_heads * d_out.
        self.proj = nn.Linear(num_heads * d_out, d_in)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the dropout-regularised concatenation of all head outputs."""
        per_head = [head(x) for head in self.heads]
        return self.dropout(torch.cat(per_head, dim=-1))


In [62]:
# Reference setup ("from P84" - presumably page 84 of the book; confirm)
torch.manual_seed(123)
context_length = batch.shape[1]  # second dimension of batch: the number of tokens
d_in = 3
d_out = 2
mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads=2)
context_vecs = mha(batch)
print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tensor([[[-0.0686, -0.0920,  0.0610,  0.1280],
         [-0.0782, -0.0955, -0.0194,  0.1680],
         [-0.1404, -0.0605, -0.0017,  0.1665],
         [ 0.2054, -0.2619,  0.0702,  0.0758],
         [-0.1637, -0.1159, -0.1336,  0.2007]],

        [[ 0.2873, -0.0841, -0.3160,  0.0147],
         [ 0.4631, -0.1349, -0.3368,  0.0221],
         [ 0.3711, -0.2083, -0.2452,  0.1234],
         [ 0.0191, -0.2982, -0.0577,  0.3268],
         [ 0.2049, -0.1847, -0.2338,  0.0999]]], grad_fn=<CatBackward0>)
context_vecs.shape: torch.Size([2, 5, 4])


In [63]:
# Two heads of width 1 each, so the concatenated context vectors are 2-D
d_in, d_out = 4, 1
num_heads = 2
block_size = 5

# NOTE(review): MultiHeadAttentionWrapper is not defined in this part of the
# file - presumably it comes from an earlier cell; confirm.
mha = MultiHeadAttentionWrapper(d_in, d_out, block_size, 0.0, num_heads)
x = torch.randn((1, block_size, d_in))  # (batch=1, seq_len=5, d_in=4)

y = mha(x)
print(y.shape)


torch.Size([1, 5, 2])


Exercise 3.3 Initializing GPT-2 size attention modules
Using the MultiHeadAttention class, initialize a multi-head attention module that has the same number of attention heads as the smallest GPT-2 model (12 attention heads). Also ensure that you use the respective input and output embedding sizes similar to GPT-2 (768 dimensions). Note that the smallest GPT-2 model supports a context length of 1,024 tokens.

In [64]:
# Smallest GPT-2 configuration: 1,024-token context, 768-dimensional
# embeddings, 12 attention heads.
block_size = 1024
d_in, d_out = 768, 768
num_heads = 12

# The MultiHeadAttention class defined in this file takes the size of EACH
# head as its second argument and concatenates the heads, so its output width
# is num_heads * (per-head size). Passing 768 per head would produce
# 12 * 768 = 9216-dimensional outputs; splitting d_out across the heads
# (768 // 12 = 64 per head) yields the intended 768-dimensional output.
head_dim = d_out // num_heads

mha = MultiHeadAttention(d_in, head_dim, block_size, 0.0, num_heads)
