In [1]:
#| default_exp GPT

In [2]:
#| echo false
#| export
from httpx import get as hget
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.set_printoptions(linewidth=150)

In [3]:
# with open('input.txt', 'r', encoding='utf-8') as f:
#     text = f.read()

In [4]:
#| export
# Hyperparameters shared by get_batch, the model classes and the training loop below.
batch_size = 64 # how many independent sequences will be processed in parallel (the small demo further down uses 4)
block_size = 256 # maximum context length for predictions (the small demo further down uses 8)
max_iters = 5_000 # number of optimisation steps in the training loop
eval_interval = 500 # report train/val loss every this many steps
lr = 3e-4 # learning rate passed to the AdamW optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200 # number of batches averaged per split when estimating the loss
n_embd = 384 # embedding dimension
n_heads = 6 # 384 // 6 ---> every head is 64 dimensional
n_layer = 6 # number of transformer blocks
dropout = 0.2 # dropout probability used by every nn.Dropout in the model
# ---------
# fix the RNG seed so runs are reproducible; trailing `;` suppresses the cell output
torch.manual_seed(1337);

Here we download the Tiny Shakespeare dataset, which is a collection of all Shakespeare texts. Its size is approximately 1 million characters.

In [5]:
#| export
# Download the Tiny Shakespeare corpus as a single string.
f = hget("https://raw.githubusercontent.com/karpathy/ng-video-lecture/refs/heads/master/input.txt")
f.raise_for_status() # fail loudly on an HTTP error instead of tokenizing an error page as the dataset
text = f.text

In [6]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [7]:
# let's look at the first 500 characters
print(text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


There are in total 65 unique characters that our model can see and work with.

In [8]:
#| export
# sorted vocabulary of every distinct character that occurs in the corpus
chars = sorted(set(text))
vocab_size = len(chars)

In [9]:
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


Models work with numbers, so let's create a mapping between characters and integers: an encoder and a decoder. Here we use a very simple character-level tokenizer that maps each character to its position in the sorted vocabulary. For a more sophisticated tokenizer, see tiktoken (used by OpenAI).

In [10]:
#| export
stoi = {ch: i for i, ch in enumerate(chars)} # character -> integer id
itos = {i: ch for ch, i in stoi.items()} # integer id -> character

def encode(s):
    "Encoder: take a string and return the list of integer ids of its characters"
    return [stoi[c] for c in s]

def decode(l):
    "Decoder: take an iterable of integer ids and return the corresponding string"
    return ''.join(itos[o] for o in l)

In [11]:
print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


Let's encode the whole text dataset and store it into a torch.Tensor `data`. Currently data is simply a tensor stretched in a row. We also split our data into training (90% of the data) and validation (10% of the data) sets to calculate accuracy of our model and avoid overfitting. Without testing the model on a hold-out validation set we risk our model just memorizing the whole training set and having no actual predictive / creative power.

In [12]:
#| export
# the whole corpus as one flat 1-D tensor of integer character ids
data = torch.tensor(encode(text), dtype=torch.long)

In [13]:
print(data.shape, data.dtype)
print(data[:100]) # the first 100 characters of the text we looked at earlier will look to GPT like this

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44, 53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52,
        63,  1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1, 57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43,
        39, 49,  6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [14]:
#| export
# split point: the first 90% of the tokens are used for training, the remainder for validation
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

In [52]:
print(len(train_data))
print(len(val_data))

1003854
111540


We will pass the data (Shakespeare texts) into the transformer model in batches, because feeding the whole text at once would be computationally prohibitive. The idea is to pass random blocks (sequences) of text into the model and train it to predict the next character. We illustrate this below. Within each sequence we train the transformer on every context size from 1 to `block_size`. This allows the model to see contexts of different lengths when predicting the next character in a sequence. When generating text, the model starts with a context of 1 and can grow it up to `block_size` characters; after that the context is truncated to the most recent `block_size` characters, because that is the longest context the model can handle.

In [15]:
block_sz = 8 # context length
x = train_data[:block_sz]
y = train_data[1:block_sz+1] # same window shifted one position to the right: the next-character targets
# every prefix of x is one training example whose label is the character that follows it
for t, target in enumerate(y):
    context = x[:t+1]
    print(f"When input is {context} the target : {target}")

When input is tensor([18]) the target : 47
When input is tensor([18, 47]) the target : 56
When input is tensor([18, 47, 56]) the target : 57
When input is tensor([18, 47, 56, 57]) the target : 58
When input is tensor([18, 47, 56, 57, 58]) the target : 1
When input is tensor([18, 47, 56, 57, 58,  1]) the target : 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) the target : 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target : 58


We want to utilize GPU's power of parallel calculations and feed multiple sequences in a batch.

In [16]:
#| export
def get_batch(split):
    "Sample `batch_size` random windows of `block_size` tokens (inputs x) and their one-step-shifted targets y"
    # anything other than "train" selects the validation data
    source = train_data if split == "train" else val_data
    starts = torch.randint(0, len(source) - block_size, (batch_size,)) # random window offsets
    inputs, targets = [], []
    for s in starts:
        inputs.append(source[s:s+block_size])
        targets.append(source[s+1:s+block_size+1]) # same window, shifted by one token
    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

In [55]:
torch.manual_seed(1337)
# Use tiny sizes so the batch is readable, but restore the exported hyperparameters afterwards:
# get_batch and the model read `batch_size` / `block_size` as globals, so leaving them at 4 / 8
# would silently shrink the model and the training batches on a top-to-bottom run.
full_batch_size, full_block_size = batch_size, block_size
batch_size = 4 # number of independent sequences we process in parallel
block_size = 8 # maximum context length for predictions
try:
    xb, yb = get_batch('train')
finally:
    batch_size, block_size = full_batch_size, full_block_size

print('inputs:')
print(xb.shape)
print(xb,'\n')
print('targets:')
print(yb.shape)
print(yb)

print('----')

# iterate over the demo batch's own shape, not the (restored) global sizes
demo_batch, demo_block = xb.shape
for b in range(demo_batch):
    for t in range(demo_block):
        context = xb[b,:t+1]
        target = yb[b,t]
        print(f"When input is {context.tolist()} the target : {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]], device='cuda:0') 

targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]], device='cuda:0')
----
When input is [24] the target : 43
When input is [24, 43] the target : 58
When input is [24, 43, 58] the target : 5
When input is [24, 43, 58, 5] the target : 57
When input is [24, 43, 58, 5, 57] the target : 1
When input is [24, 43, 58, 5, 57, 1] the target : 46
When input is [24, 43, 58, 5, 57, 1, 46] the target : 43
When input is [24, 43, 58, 5, 57, 1, 46, 43] the target : 39
When input is [44] the target : 53
When input is [44, 53] the target : 56
When input is [44, 53, 56] the target : 1
When input is [44, 53, 56, 1] the target : 58
When input is [44, 53,

So each batch of 4 random sequences yields 32 examples (4 * 8) that will be fed into a neural net.

In [56]:
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]], device='cuda:0')


In [57]:
torch.manual_seed(1337);

In [17]:
#|export
@torch.no_grad()
def estimate_loss():
    "Return {'train': ..., 'val': ...}: the loss of `model` averaged over `eval_iters` random batches per split"
    model.eval() # evaluation mode while measuring
    averages = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            _, loss = model(*get_batch(split))
            batch_losses[k] = loss.item()
        averages[split] = batch_losses.mean()
    model.train() # back to training mode for the caller
    return averages

In [18]:
#| export
class Head(nn.Module):
    "A single head of causal (masked) self-attention"

    def __init__(self, head_size):
        super().__init__()
        # bias-free projections of every token embedding into key / query / value space
        for name in ('key', 'query', 'value'):
            setattr(self, name, nn.Linear(n_embd, head_size, bias=False))
        # lower-triangular matrix of ones, stored as a buffer rather than a parameter
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: (batch, time-step, channels) ----> result: (batch, time-step, head size)
        _, T, _ = x.shape
        q, k, v = self.query(x), self.key(x), self.value(x) # each (B, T, head_size)
        # scaled affinities between every query and every key
        scale = k.shape[-1] ** -0.5
        scores = (q @ k.transpose(-2, -1)) * scale # (B, T, T)
        # a position must not attend to positions after it
        is_future = self.tril[:T, :T] == 0
        scores = scores.masked_fill(is_future, float('-inf'))
        attn = self.dropout(F.softmax(scores, dim=-1)) # (B, T, T)
        # weighted aggregation of the values
        return attn @ v # (B, T, head_size)

In [19]:
#| export
class MultiHeadAttention(nn.Module):
    "Several self-attention heads run side by side, then mixed by a linear projection"

    def __init__(self, num_heads, head_size):
        super().__init__()
        # multiple independent communication channels between tokens
        heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(heads)
        # maps the concatenated head outputs to n_embd channels so they can be added back onto the residual path
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1) # concatenate along the channel dimension
        return self.dropout(self.proj(merged))


The feed-forward layer (FFWL) basically allows nodes to "think" about the information that they have accumulated through attention.

In [20]:
#| export
class FeedForward(nn.Module):
    "Per-token MLP: widen to 4*n_embd, ReLU, project back to n_embd, dropout"
    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd # the factor 4 follows the paper "Attention is all you need"
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # the linear layers act on each token individually: tokens do not communicate here,
        # communication already happened in the self-attention stage
        return self.net(x)

In [21]:
#| export 
class Block(nn.Module):
    "Transformer block: self-attention (communication) then feed-forward (computation), each behind a LayerNorm and a residual connection"

    def __init__(self, n_embd, n_heads):
        # n_embd: embedding dimension, n_heads: the number of attention heads
        super().__init__()
        # head_size is chosen so that concatenating all heads gives n_embd channels again
        self.sa = MultiHeadAttention(n_heads, n_embd // n_heads)
        self.ffwd = FeedForward(n_embd)
        # layer norms normalize each token across its n_embd features
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        # the `x + ...` residual connections give gradients a direct path through deep stacks;
        # both sub-layers return tensors of the same shape as x, so the additions line up
        attended = x + self.sa(self.ln1(x)) # communication
        return attended + self.ffwd(self.ln2(attended)) # computation

In [22]:
#| export
class GPTLanguageModel(nn.Module):
    "Decoder-only transformer language model: token + position embeddings, a stack of Blocks, final LayerNorm and a linear head"

    def __init__(self):
        super().__init__()
        # lookup table turning each token id into an n_embd-dimensional vector
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # we also introduce an embedding for the position of each token
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_heads=n_heads) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # better init, not covered in the original GPT video, but important
        self.apply(self._init_weights)

    def _init_weights(self, module):
        "Initialise Linear/Embedding weights from N(0, 0.02) and zero the Linear biases"
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when `targets` is given, the cross-entropy loss.

        idx and targets are both (B, T) tensors of integer token ids.
        Returns `(logits, loss)`: without targets, logits is (B, T, vocab_size) and loss is None;
        with targets, logits is flattened to (B*T, vocab_size) and loss is a scalar tensor.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx) # (B, T, C). C is n_embd
        # positions 0..T-1, created on the same device as the input rather than on the
        # module-level `device`, so the model keeps working if it is moved elsewhere
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions) # (T, C)
        x = tok_emb + pos_emb # (B, T, C) via broadcasting
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy wants (N, C) logits and (N,) targets
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            target = targets.view(B*T)
            loss = F.cross_entropy(logits, target)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after the (B, T) context `idx`.

        Returns a (B, T + max_new_tokens) tensor of token ids. Sampling runs without
        gradient tracking and in eval mode (dropout off); the previous train/eval mode
        is restored before returning.
        """
        # the longest context the model supports is the size of its own position table
        context_len = self.position_embedding_table.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # crop idx to the last context_len tokens
                idx_cond = idx[:, -context_len:]
                # get the predictions
                logits, _ = self(idx_cond)
                # focus only on the last time step (without targets the logits keep their (B, T, C) shape)
                logits = logits[:, -1, :] # becomes (B, C)
                # apply softmax to get probabilities
                probs = F.softmax(logits, dim=-1) # (B, C)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                # append sampled index to the running sequence
                idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        finally:
            self.train(was_training)
        return idx

# build the model and move its parameters and buffers to the selected device
model = GPTLanguageModel().to(device)

In [23]:
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

10.788929 M parameters


Function to estimate the loss by averaging over `eval_iters`

In [28]:
logits, loss = model(xb, yb)
print(f'{logits.shape=}')
print(f'{loss=}')

print(decode(model.generate(torch.zeros((1, 1), dtype=torch.long).to(device), max_new_tokens=100)[0].tolist()))

logits.shape=torch.Size([32, 65])
loss=tensor(4.5092, device='cuda:0', grad_fn=<NllLossBackward0>)

HB?JfmNmSqqUWwiukZwxOFiDc'v!3YmGVvBDsdzYPgPX?z$IAGWUU-q-AN
-:$gndPevmAILXAInHcCDfD'naxXkFhWrS'lvWi,m


Create a PyTorch optimizer, using AdamW

In [29]:
#| export
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

In [30]:
#| export
# `iter_num` rather than `iter`, so the builtin iter() is not shadowed at module level
for iter_num in range(max_iters):

    # evaluate loss on train and val sets once in a while (and on the very last step)
    if iter_num % eval_interval == 0 or iter_num == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    # clear the gradients of the previous step, backpropagate, then update the weights
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.3530, val loss 4.3515
step 500: train loss 2.5136, val loss 2.5088
step 1000: train loss 2.3843, val loss 2.4123
step 1500: train loss 2.3205, val loss 2.3258
step 2000: train loss 2.2629, val loss 2.2871
step 2500: train loss 2.2076, val loss 2.2520
step 3000: train loss 2.1958, val loss 2.2514
step 3500: train loss 2.1857, val loss 2.2466
step 4000: train loss 2.1749, val loss 2.2614
step 4500: train loss 2.1348, val loss 2.2355
step 4999: train loss 2.1825, val loss 2.2096


In [24]:
#| export
# start generation from a single token with id 0
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=1000)[0].tolist()))
# write a longer sample to disk; the context manager guarantees the file is flushed and closed
with open('more.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(decode(model.generate(context, max_new_tokens=10000)[0].tolist()))

KeyboardInterrupt: 

## The mathematical trick is self-attention

We want our tokens to have a way of communicating with each other. The simplest way to achieve that is to average the channels corresponding to the __previous__ tokens. Note that we want each token to be able to communicate only with the past, not the future.

In [34]:
# consider the following toy example
torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

$$ x+y=2 $$

So basically we want ` xbow[b, t] = mean_{i<=t} x[b,i] ` One way to achieve this, is  to use loops and average across channels. As an aside, note that when slicing a torch array, we can ignore the trailing `:` to be more concise.

In [35]:
# version 1: using loops
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t,C)
        xbow[b,t] = xprev.mean(0)

Let's take first batch. Each element in `xbow` is an average of the preceding values in `x`. So first element in `xbow` and `x` are equal, second element in `xbow` is an average of the first two elements in `x` and so on.

In [36]:
x[0],xbow[0]

(tensor([[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

Now we want to be able to do this efficiently. Let's consider a toy example below. `torch.tril` returns the lower-triangular part of a matrix (the diagonal and the values below it).

In [37]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3,3))
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = a @ b 
print(f'{a=}')
print('---\n')
print(f'{b=}')
print('---\n')
print(f'{c=}')

a=tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
---

b=tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
---

c=tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


Now we can use this trick to vectorize our calculation of `xbow` using batch matrix multiply. Triangular matrix allows for each token to ignore information that goes after its position (future) - they are not allowed to communicate

In [38]:
# version 2: using tril and batch matrix multiply
wei = torch.tril(torch.ones(T,T))
wei /= wei.sum(dim=1, keepdim = True)
xbow2 = wei @ x # (T, T) @ (B, T, C) ---(broadcasting)--> (B, T, T) @ (B, T, C) = (B, T, C)

In [39]:
torch.allclose(xbow, xbow2, atol=1e-7)

True

In [40]:
xbow[0][6], xbow2[0][6]

(tensor([ 0.0927, -0.0682]), tensor([ 0.0927, -0.0682]))

In [41]:
# version 3: using Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

In [42]:
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

### Self-attention 

Currently our code calculates a simple average over the previous tokens because `wei` is initialized uniformly with zeros. In practice we don't want the weights to be uniform, because different preceding tokens have different predictive power for the next token. The weights of preceding tokens should be data-dependent (different for different inputs), and this is what self-attention solves. `Key` is basically information the token advertises about itself, `Query` is the information the token is looking for (for example, vowels at a particular position), and `Value` is the actual information the token passes on.

In [43]:
# version 4: using Softmax
torch.manual_seed(1337)
B,T,C = 4, 8, 32 # batch, time, channels
x = torch.randn(B, T, C)

# let's see a single Head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

# each token now has key and query but currently they are independent
k = key(x)   # (B, T, head_size)
q = query(x) # (B, T, head_size)

# to introduce interactivity we use matrix multiply
wei = q @ k.transpose(-2,-1) # (B, T, head_size) @ (B, head_size, T) ----> (B, T, T)

tril = torch.tril(torch.ones(T, T))
# # wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1) # softmax allows to treat those interaction in a probabilistic manner

# # instead of aggregating raw x (tokens' C) we aggregate v's that are propagated x through a linear layer 
v = value(x) # (B, T, head_size)
out = wei @ v
# # out = wei @ x

out.shape

torch.Size([4, 8, 16])

In [44]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]], grad_fn=<SelectBackward0>)

Notes:

* Attention is a communication mechanism. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
* There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
* Each example across batch dimension is of course processed completely independently and never "talk" to each other
* In an "encoder" attention block just delete the single line that does masking with tril, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
* "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
* "Scaled" attention additionally multiplies wei by 1/sqrt(head_size). This makes it so that when the inputs Q, K are unit variance, wei will be unit variance too, and Softmax will stay diffuse and not saturate too much. Illustration below

Normalization of wei to avoid extreme values at initialization and convergence to a one-hot

In [45]:
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2,-1) # * head_size**-0.5

In [46]:
k.var(), q.var(), wei.var()

(tensor(1.0449), tensor(1.0700), tensor(17.4690))

We don't want values fed into softmax to be too extreme especially at initialization, because softmax would converge onto a one-hot with its peak at the most extreme value. This would lead to aggregating all information from a single node and that's not what we are trying to achieve. See below

In [47]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [48]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1)

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

## Layer Norm

This is our code from makemore P3, where we implemented a batch norm layer. A BN layer normalizes columns (each feature is normalized across the examples in the batch), whereas LN normalizes rows (each example is normalized across its own features).

In [49]:
class LayerNorm1d:
    "Minimal layer norm: normalizes every example across its last (feature) dimension, then applies gamma and beta"

    def __init__(self, dim ,eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum # not used by layer norm; kept so the signature matches BatchNorm1d
        self.beta = torch.zeros(dim)
        self.gamma = torch.ones(dim)

    def __call__(self, x):
        # Reduce over the last dimension so that (N, dim) and (B, T, dim) inputs are both
        # normalized per example / per token; for 2-D input this is the same as dim 1.
        xmean = x.mean(-1, keepdim=True) # per-example feature mean
        xvar = x.var(-1, keepdim=True) # per-example feature variance (torch's default unbiased estimate)
        xhat = (x-xmean)/torch.sqrt(xvar + self.eps) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        return self.out

    def parameters(self): return [self.beta, self.gamma]

In [50]:
class BatchNorm1d:
    "Minimal batch norm: normalizes every feature across the batch (and time) dimensions of a 2-D or 3-D input"

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters (trained with backprop)
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # buffers (updated with a running 'momentum update', not by backprop)
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        # calculate the forward pass
        if self.training:
            if x.ndim not in (2, 3):
                # previously this fell through and failed later with an UnboundLocalError
                raise ValueError(f"BatchNorm1d expects a 2-D or 3-D input, got {x.ndim}-D")
            # reduce over every dimension except the last (feature) one: (0,) or (0, 1)
            reduce_dims = tuple(range(x.ndim - 1))
            xmean = x.mean(reduce_dims, keepdim=True) # batch mean
            xvar = x.var(reduce_dims, keepdim=True) # batch variance
        else:
            # at inference time use the statistics accumulated during training
            xmean = self.running_mean
            xvar = self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        # update the buffers
        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

In [51]:
torch.manual_seed(1337)
BN = BatchNorm1d(100)
LN = LayerNorm1d(100)

x = torch.randn(32, 100)
x_bn = BN(x)
x_ln = LN(x)

In [52]:
x_bn[:,0].mean(), x_bn[:,0].std() # mean, std of one feature across all batch inputs

(tensor(7.4506e-09), tensor(1.0000))

In [53]:
x_bn[0,:].mean(), x_bn[0,:].std() # mean, std of a single input from the batch, of its features

(tensor(0.0411), tensor(1.0431))

In [54]:
x_ln[:,0].mean(), x_ln[:,0].std() # mean, std of one feature across all batch inputs

(tensor(0.1469), tensor(0.8803))

In [55]:
x_ln[0,:].mean(), x_ln[0,:].std() # mean, std of a single input from the batch, of its features

(tensor(-9.5367e-09), tensor(1.0000))

In [25]:
from nbdev.export import nb_export
nb_export('GPT_dev1.ipynb')