In [1]:
# 1. Self-attention by hand
# 2. Self-attention block in pytorch
# 3. GPT, piece-by-piece
# 4. GPU goes rrrr!

### Step 1: Self-attention by hand

In [2]:
import numpy as np

In [3]:
#  -- Write the scaled dot product self attention
  # 1. Compute queries, keys, and values
  # 2. Compute dot products
  # 3. Scale the dot products
  # 4. Apply softmax to calculate attentions
  # 5. Weight values by attentions
  # 6. Compute attention weighted features

# Choose values for the parameters
# Do not modify this code

X = np.array([[2,0,0,2],[0,1,0,0],[0,2,1,0],[0,0,1,1],[2,0,0,0],[1,0,1,1]], dtype=float)
W_Q = np.array([[1,1,0,0,0,0],[0,1,0,1,0,0],[0,0,1,0,1,1]], dtype=float)
W_K = np.array([[0,0,1,0,0,0],[0,1,0,0,0,0],[1,0,0,0,0,-1]], dtype=float)
W_V = np.array([[10,0,0,0,0,0],[0,0,0,10,0,0],[0,10,0,0,0,0]], dtype=float)

In [4]:
# This is given for you. Do not modify this code.
def softmax_cols(data_in):
  # Exponentiate all of the values
  exp_values = np.exp(data_in)
  # Sum over columns
  denom = np.sum(exp_values, axis = 0)
  # Replicate denominator to N rows
  denom = np.matmul(np.ones((data_in.shape[0],1)), denom[np.newaxis,:])
  # Compute softmax
  softmax = exp_values / denom
  # return the answer
  return softmax

In [5]:
# Computing the queries, keys, and values
Q = W_Q @ X
K = W_K @ X
V = W_V @ X

# Scaled dot product
d_k = W_K.shape[0]
scaled_dot_product = K.T @ Q / np.sqrt(d_k)

In [6]:
# What does the first dimension of matrices Q and K correspond to?
# The first dimension of matrices Q and K correspond to the number of queries and keys, respectively.

In [7]:
# compute the weighted attention matrix S
S = softmax_cols(scaled_dot_product) 

In [8]:
# compute the self-attention matrix A
A = V @ S

In [9]:
# Sanity check. This should return True.
np.allclose(A, np.array([[10.30759701, 10.10551833, 15.03361159,  3.06082018],
       [ 2.83283874,  2.97334971,  4.13169018,  1.53041009],
       [ 4.59026201,  4.50027071,  2.10990693,  7.70438486]]))

True

### Step 2: Self-attention block in pytorch

In [10]:
import torch
import torch.nn as nn
from torch.functional import F

In [11]:
# do not modify this code

batch_size = 3 # B
block_size = 2 # T
n_embd = 3     # C

In [12]:
torch.set_printoptions(precision=8)

In [13]:
# Build a scaled self-attention head without masked attention and without dropout (i.e. just key, query and values)
# A matrix multiplication is implemented using the nn.Linear() operator with no bias.
class Head(nn.Module):

    def __init__(self, head_size):
        super().__init__()
        self.K = nn.Linear(n_embd, head_size, bias=False)
        self.Q = nn.Linear(n_embd, head_size, bias=False)
        self.V = nn.Linear(n_embd, head_size, bias=False)

    def forward (self, x):
        B, T, C = x.shape
        K = self.K(x)
        Q = self.Q(x)
        V = self.V(x)
        # scaled dot product
        dot_product = Q @ K.transpose(-2, -1) * C**-0.5
        S = F.softmax(dot_product, dim=-1)
        out = S @ self.V(x)
        return out

In [14]:
# Unit test. Do not modify this code
torch.manual_seed(123) # do not remove this line
h = Head(2)
torch.manual_seed(123) # do not remove this line
x = torch.rand((batch_size, block_size, n_embd))
out = h(x)
out

tensor([[[-0.46653441,  0.03306435],
         [-0.47224301,  0.04610375]],

        [[-0.38105938,  0.02397407],
         [-0.39453450,  0.02482043]],

        [[-0.29578221,  0.12158970],
         [-0.30042297,  0.12526262]]], grad_fn=<UnsafeViewBackward0>)

In [15]:
# Sanity check. This should return True.
torch.allclose(out, torch.tensor([[[-0.46653444,  0.03306433],
         [-0.47224304,  0.04610372]],
        [[-0.38105938,  0.02397406],
         [-0.39453450,  0.02482042]],
        [[-0.29578221,  0.12158968],
         [-0.30042297,  0.12526260]]]))

True

In [16]:
# Add weighted masked attention and dropout. Dropout comes after the softmax and before the multiplication with the value matrix.
# Copy the Head class from the previous exercise and expand upon it.

class Head(nn.Module):

    def __init__(self, head_size):
        super().__init__()
        self.K = nn.Linear(n_embd, head_size, bias=False)
        self.Q = nn.Linear(n_embd, head_size, bias=False)
        self.V = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size))) # store a persistent buffer for the forward pass

    def forward (self, x):
        B, T, C = x.shape
        K = self.K(x)
        Q = self.Q(x)
        V = self.V(x)
        dot_product = Q @ K.transpose(-2, -1) * C**-0.5
        dot_product = dot_product.masked_fill(self.tril == 0, float('-inf')) # apply the mask
        S = F.softmax(dot_product, dim=-1)
        out = S @ V
        return out

In [17]:
# Unit test. Do not modify this code
torch.manual_seed(123) # do not remove this line
h = Head(2)
torch.manual_seed(123) # do not remove this line
x = torch.rand((batch_size, block_size, n_embd))
out = h(x)
out

tensor([[[-0.37939817, -0.16596892],
         [-0.47224301,  0.04610375]],

        [[-0.14184164,  0.00894912],
         [-0.39453450,  0.02482043]],

        [[-0.17301908,  0.02442870],
         [-0.30042297,  0.12526262]]], grad_fn=<UnsafeViewBackward0>)

In [18]:
# Sanity check. This should return True.
torch.allclose(out, torch.tensor([[[-0.37939820, -0.16596894],
         [-0.47224304,  0.04610372]],
        [[-0.14184165,  0.00894911],
         [-0.39453450,  0.02482042]],
        [[-0.17301908,  0.02442869],
         [-0.30042297,  0.12526260]]]))


True

In [26]:
# A multi-head attention module contains a list of heads and a linear projection layer.
# The heads are applied to the input and then concatenated along the last dimension, then
# the linear layer is applied. Look at the unit test below to determine the dimensions of
# the linear layer.

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.linear = nn.Linear(num_heads * head_size, n_embd, bias=False)

    def forward (self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return out

In [27]:
# do not modify
num_heads = 3
head_size = 2
n_embd = 6

In [28]:
# Unit test. Do not modify this code
torch.manual_seed(123) # do not remove this line
sa = MultiHeadAttention(num_heads=3, head_size=head_size)
torch.manual_seed(123) # do not remove this line
x = torch.rand((batch_size, block_size, n_embd))
out = sa(x)

In [29]:
# Sanity check. This should return True.
torch.allclose(out, torch.tensor([[[-0.03730504, -0.07006130, -0.27096999,  0.13144857, -0.45049590,
          -0.33217290],
         [-0.06794342, -0.04509801, -0.34738648,  0.15599491, -0.45456851,
          -0.33087400]],
        [[-0.08914752, -0.03846309, -0.36569631,  0.09802882, -0.39963537,
          -0.29225215],
         [-0.04709741,  0.01406255, -0.25430590,  0.08396727, -0.41786054,
          -0.30781299]],
        [[ 0.15234883, -0.08591781, -0.10099770,  0.19886394, -0.49236685,
          -0.43605998],
         [ 0.15425430, -0.01792544, -0.00511202,  0.14046597, -0.48078871,
          -0.40730378]]]))


False

In [30]:
# Add a classical feedforward module: linear -> ReLU -> linear
# The hidden dimension is four times bigger than the input dimension (see Section 3.3 of Attention is All You Need)

class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity """
    def __init__(self, n_embd):
        super().__init__()
        self.linear1 = nn.Linear(n_embd, 4 * n_embd)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(4 * n_embd, n_embd)

    def forward(self, x):
        out = self.linear1(x)
        out = self.relu(out)
        out = self.linear2(out)
        return out

In [31]:
# Unit test. Do not modify this code
torch.manual_seed(123) # do not remove this line
ff = FeedForward(n_embd)
torch.manual_seed(123) # do not remove this line
x = torch.rand((3,n_embd))
out = ff(x)
out

tensor([[-0.58034134,  0.04641047, -0.10707694,  0.21581651, -0.30361828,
         -0.07352637],
        [-0.48917407,  0.07879594, -0.15972009,  0.17862341, -0.37070656,
         -0.07852858],
        [-0.48530391,  0.09604470, -0.06524836,  0.16611031, -0.35499069,
         -0.08964305]], grad_fn=<AddmmBackward0>)

In [32]:
# Sanity check. This should return True.
torch.allclose(out, torch.tensor([[-0.58034140,  0.04641046, -0.10707694,  0.21581653, -0.30361831,
         -0.07352637],
        [-0.48917407,  0.07879593, -0.15972012,  0.17862344, -0.37070659,
         -0.07852858],
        [-0.48530388,  0.09604470, -0.06524836,  0.16611034, -0.35499069,
         -0.08964306]]))

True

In [33]:
# Build a self-attention block
#
#   in -----> LayerNorm -------> multi-head attention -- + ----> LayerNorm -----> FeedForward --- + -----> out
#         |                                              |   |                                    |
#          ----------------------------------------------     ------------------------------------                       
#
# This architecture is slightly different from Attention is All You Need (or the UDL textbook):
# the layer norm comes before (not after) the attention or feed-forward
#
class Block(nn.Module):

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.norm1 = nn.LayerNorm(n_embd)
        self.norm2 = nn.LayerNorm(n_embd)
        self.attn = MultiHeadAttention(n_head, head_size)
        self.ff = FeedForward(n_embd)

    def forward(self, x):
        out = self.norm1(x)
        out = self.attn(out)
        out = out + x
        out = self.norm2(out)
        out = self.ff(out)
        out = out + x
        return out

In [34]:
# Unit test. Do not modify this code
torch.manual_seed(123) # do not remove this line
bk = Block(n_embd, num_heads)
torch.manual_seed(123) # do not remove this line
x = torch.rand((batch_size,block_size,n_embd))
out = bk(x)
out

tensor([[[0.55253643, 0.92662269, 0.14265607, 0.49135742, 0.44488862,
          0.40691286],
         [0.37060875, 0.53596079, 0.07363462, 0.50188196, 0.74524838,
          0.24173236]],

        [[0.33136606, 0.58603227, 0.19803919, 0.23582187, 0.55994618,
          0.30247170],
         [0.66611248, 1.00561023, 0.77635646, 0.41822353, 1.05300403,
          0.47236234]],

        [[0.05448158, 0.88543195, 0.24299024, 0.63907611, 1.21257365,
          0.26945171],
         [0.58008689, 0.91462666, 0.77791780, 0.65896595, 1.03971529,
          0.55926639]]], grad_fn=<AddBackward0>)

In [35]:
# Sanity check. This should return True.
torch.allclose(out, torch.tensor([[[-0.05278997, -0.10863629, -0.09458938,  0.97590691, -0.55101192,
           0.57085067],
         [-0.17928867, -0.44799608, -0.26547045,  1.11293721, -0.34837404,
           0.40728986]],
        [[-0.41515028, -0.30126408, -0.11399293,  0.64651299, -0.51579159,
           0.57017863],
         [-0.02734703,  0.08873296,  0.65776664,  0.70304352,  0.05667025,
           0.70008957]],
        [[ 0.52881187,  0.34458166,  0.31130391,  1.11564195,  0.37998506,
          -0.02971917],
         [ 1.39032197,  0.58906519,  0.97761846,  0.38604784,  0.63349819,
           0.50254500]]]))

False

In [None]:
## Step 3: Build a mini GPT
#
# - Start from the gpt-problem.py file
# - Add your Head, MultiHeadAttention, FeedForward and Block classes
# - Fill in the GPT class (__init__ and forward methods)
# - Train the network on CPU
# - Train the network on GPU

# For __init__, the GPT model parameters are:
#   - a token embedding table
#   - a positional embedding table
#   - a sequence of Blocks
#   - a layer norm
#   - a linear layer
#
# For forward(), the model consists in:
#   - applying the token embedding table and positional embedding table to the input tensor
#   - adding the two together
#   - applying the blocks, layer norm and linear layer (in that order)
#
# The code comes from hyperparameters that should work well on GPU.  On CPU, you 
# will need to reduce the model size significantly.
#
# In pytorch, an learnable embedding table is implemented with nn.Embedding(...)
#
# The token embedding table learns an embedding for each item of the vocabulary. The 
# positional embedding table does not depend on the input and learns an embedding
# for each position in the context.