# Read Tinyshakespeare data

In [7]:
# If you are using a Conda environment generated from scratch, you will need to run
# `conda install jupyter` and `conda install wget`
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-05-22 18:13:32--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: 'input.txt'


2024-05-22 18:13:32 (13.0 MB/s) - 'input.txt' saved [1115394/1115394]



In [1]:
# Load the whole corpus into memory as a single string
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Create vocabulary

In [3]:
# The vocabulary is the sorted collection of distinct characters occurring in the text
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print("Vocabulary size: ", vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocabulary size:  65


# Encoder and Decoder for the Vocabulary
Here we use the simplest possible scheme for encoding/decoding: we simply map a string to a list of the indices of its characters. There are much more complex schemes. For instance:

- Google uses [SentencePiece](https://github.com/google/sentencepiece), which is a **sub-word** tokenizer.
- OpenAI uses [Tiktoken](https://github.com/openai/tiktoken) which is a fast Byte-Pair-Encoding tokenizer. This has `50257` tokens in the vocabulary. To use it, do `import tiktoken` and then `enc = tiktoken.get_encoding('gpt2')` and then `enc.n_vocab`.

Here we use a simple character-level tokenizer, which means that our **codebook** (here called `chars`) has a very small size, only `65` possible characters.

In [4]:
# Lookup tables: character -> index, and index -> character
str_to_int = {ch: idx for idx, ch in enumerate(chars)}
int_to_str = dict(enumerate(chars))

In [5]:
# Encoder and Decoder functions
def encode(string):
    """Map a string to the list of vocabulary indices of its characters (string --> list(int))."""
    return [str_to_int[character] for character in string]


def decode(intlist):
    """Map a list of vocabulary indices back to the string they spell (list(int) --> string)."""
    return ''.join(int_to_str[integer] for integer in intlist)

In [6]:
# Round-trip check: decoding the encoded string should print the original string again
test_text = "Hello, World!"
encoded_test = encode(test_text)
print(encoded_test)
print(decode(encoded_test))

[20, 43, 50, 50, 53, 6, 1, 35, 53, 56, 50, 42, 2]
Hello, World!


# Tokenize TinyShakespeare
This requires not only pytorch but also numpy.

In [7]:
import torch

# Encode the full text and keep the token ids in a 1-D integer tensor
data = torch.tensor(encode(text), dtype=torch.int64)

print("Data shape: ", data.shape, " Data dtype: ", data.dtype)
print(data[0:50])

Data shape:  torch.Size([1115394])  Data dtype:  torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56])


# Train and Validation Split

In [8]:
# Split point: the first 90% of the tokens are for training, the rest for validation
n = int(0.9 * data.shape[0])
train_data, val_data = data[:n], data[n:]

# Context size and training examples
Andrej Karpathy calls the `context size` by a different name: the `block_size`. Notice that, this being a sequence model, a sequence of length `m` contains `m-1` training examples (one per prefix, each predicting the token that follows it). We take full advantage of this when training transformers.

In [9]:
block_size = 8  # context size: number of tokens in one training block

This can be easily spelled out by printing the training examples contained in a small training block.

In [10]:
x = train_data[0:block_size]      # tokens 0, ..., block_size-1 (context)
y = train_data[1:block_size + 1]  # tokens 1, ..., block_size (targets), off-set by one

# Position t yields one training example: predict y[t] from the prefix x[0], ..., x[t]
for t, target in enumerate(y):
    context = x[:t + 1]
    print(f"Context: {context.tolist()} Target: {target}")

Context: [18] Target: 47
Context: [18, 47] Target: 56
Context: [18, 47, 56] Target: 57
Context: [18, 47, 56, 57] Target: 58
Context: [18, 47, 56, 57, 58] Target: 1
Context: [18, 47, 56, 57, 58, 1] Target: 15
Context: [18, 47, 56, 57, 58, 1, 15] Target: 47
Context: [18, 47, 56, 57, 58, 1, 15, 47] Target: 58


# Batching
In practice we want to perform operations in parallel, and this requires the notion of a batch size.

In [11]:
torch.manual_seed(1337)

batch_size = 4  # Number of independent sequences processed in parallel
block_size = 8  # Maximum context length for predictions

def get_batch(split):
    """Sample a random batch of contexts and their next-token targets.

    `split` selects the source: "train" reads `train_data`, any other value reads `val_data`.
    Returns `(context, targets)`, both of shape `(batch_size, block_size)`; each row of
    `targets` is the same window as the matching row of `context`, shifted by one position.
    """
    source = val_data if split != "train" else train_data
    # One random start offset per sequence. The upper bound of `randint` is exclusive, so the
    # largest offset still leaves room for the target window, which ends one position later.
    starts = torch.randint(len(source) - block_size, (batch_size, ))
    context = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return context, targets

In [12]:
xb, yb = get_batch('train')
for label, batch in (("Context: ", xb), ("Targets: ", yb)):
    print(label)
    print(batch.shape)
    print(batch)

print("---")

# Spell out every (context, target) training example packed into the batch
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b][:t + 1]
        target = yb[b][t]
        print(f"Context: {context.tolist()} Target: {target}")

Context: 
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Targets: 
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
---
Context: [24] Target: 43
Context: [24, 43] Target: 58
Context: [24, 43, 58] Target: 5
Context: [24, 43, 58, 5] Target: 57
Context: [24, 43, 58, 5, 57] Target: 1
Context: [24, 43, 58, 5, 57, 1] Target: 46
Context: [24, 43, 58, 5, 57, 1, 46] Target: 43
Context: [24, 43, 58, 5, 57, 1, 46, 43] Target: 39
Context: [44] Target: 53
Context: [44, 53] Target: 56
Context: [44, 53, 56] Target: 1
Context: [44, 53, 56, 1] Target: 58
Context: [44, 53, 56, 1, 58] Target: 46
Context: [44, 53, 56, 1, 58, 46] Target: 39
Context: [44, 53, 56, 1, 58, 46, 39] Target: 58
Context: [44, 53, 56, 1, 58, 46, 39, 58] Tar

Therefore, our input to the transformer would be the `(batch_size, block_size)` array `xb` shown below

In [13]:
# One row of token ids per sequence in the batch
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


# Bigram Language Model

In [15]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: the logits for the next token depend only on the current token."""

    def __init__(self, vocab_size):
        """Bigram model, see Karpathy's previous series of videos.

        Creates a `(vocab_size, vocab_size)` lookup table of next-token logits.
        """
        super().__init__()
        # Row `i` of the table holds the logits over the next token given that the current
        # token is `i`; e.g. an input token 24 simply reads off row 24.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when `targets` is given, the cross-entropy loss.

        `idx` and `targets` are `(B, T)` tensors of integers, where `B` is the batch size and
        `T` the block/context length.

        Returns `(logits, loss)`:
        - without `targets`: `logits` has shape `(B, T, C)` and `loss` is `None`;
        - with `targets`: `logits` is flattened to `(B*T, C)` and `loss` is a scalar tensor.
        """
        # The embedding lookup returns one row of scores per input token
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy wants the class dimension second, i.e. (B, C, T) for
            # multi-dimensional inputs. Flattening batch and time into (B*T, C) is simpler.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend every sequence in `idx` (shape `(B, T)`) by `max_new_tokens` sampled tokens.

        Returns the extended index tensor of shape `(B, T + max_new_tokens)`.
        """
        # Sampling never needs gradients, so skip building the autograd graph
        with torch.no_grad():
            for _ in range(max_new_tokens):
                logits, _loss = self(idx)   # calls forward(idx, targets=None): (B, T, C)
                logits = logits[:, -1, :]   # keep only the last time step: (B, C)
                probs = F.softmax(logits, dim=-1)  # probabilities over the next token (B, C)
                idx_next = torch.multinomial(probs, num_samples=1)  # one sample per row (B, 1)
                idx = torch.cat((idx, idx_next), dim=1)  # append to the running sequence (B, T+1)
        return idx


# Run one batch through the model. Because targets are supplied, the logits come back
# flattened to (B*T, C) together with the cross-entropy loss.
bigram = BigramLanguageModel(vocab_size)
logits, loss = bigram(xb, yb)
print(logits.shape)
print(loss)

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)


In [18]:
# Sample 100 new tokens from the model, starting from the single token with index 0
print(
    decode(
        bigram.generate(
            idx=torch.zeros((1, 1), dtype=torch.long),
            max_new_tokens=100
        )[0].tolist()   # use [0] to pluck out the single batch dimension (we are sending in [[0]], i.e. B=T=1)
    )
)


SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


Of course, this is silly. We are feeding in an entire context but really just using the last token to predict the next (see `logits[:, -1, :]`). 

# Optimizer

In [20]:
# AdamW over all parameters of the bigram model, with learning rate 1e-3
optimizer = torch.optim.AdamW(bigram.parameters(), lr=1e-3)

In [27]:
batch_size = 32

for steps in range(10000):
    # Clear the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # Forward pass on a fresh random training batch
    xb, yb = get_batch('train')
    logits, loss = bigram(xb, yb)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

# Loss on the last training batch only
print(loss.item())

2.4955925941467285


# Generate After Optimization

In [29]:
# Sample 400 new tokens from the model, starting from the single token with index 0
print(
    decode(
        bigram.generate(
            idx=torch.zeros((1, 1), dtype=torch.long),
            max_new_tokens=400
        )[0].tolist()   # use [0] to pluck out the single batch dimension (we are sending in [[0]], i.e. B=T=1)
    )
)




BOMy is, mumot me bthenindsoferlle cardethe le h. sps t theleriny hacl fougarke,

Angllll a hieald lo d,
Thade bof jak rend
Thid weme w, aithefithe h tes s tomeeseanmpotl'de y l, t,
Goutandos t tof al o ad, prsthoneirermenicisull gsewd o re, myofary pef s'HNORicepim th, thit y h,
Cothor nsethat bre, lly, farotodis f vel cld, minounged tithit:
O:

And;
DAppr.

VI cerer e: t
Wipe ght cou FLOLOF ve


# Mathematical Trick of Self-Attention

In [30]:
torch.manual_seed(1337)

B, T, C = 4, 8, 2   # batch, time, channels

# Toy input: standard-normal features for B sequences, T positions and C channels
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

We only want information to flow forward: we don't want a token at position 5 to get information from the token at position 6, since we are trying to predict that. Instead we want it to receive information only from itself and the tokens before it, i.e. positions 5, 4, 3, 2, 1. A simple way to transfer information is just the mean of the current and previous tokens. This is not self-attention, but at least we will get used to not having information flowing backwards.

In [31]:
# Goal: xbow[b, t] = mean_{i <= t} x[b, i]
# "bow" stands for "bag of words", the usual name for plain averaging of token features
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]            # (t+1, C): positions 0, ..., t of sequence b
        xbow[b, t] = xprev.mean(dim=0)  # average over time, i.e. over the context


This is very inefficient, there is a better way of doing this.

In [35]:
torch.manual_seed(42)
a = torch.ones(3, 3)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b  # multiplying by all-ones: every row of `c` is the sum of all rows of `b`

print("a = ", a, "--", "b = ", b, "--", "c = ", c, sep="\n")

a = 
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
--
b = 
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c = 
tensor([[14., 16.],
        [14., 16.],
        [14., 16.]])


In [37]:
# With a lower-triangular matrix of ones, row t of `c` is the running sum of rows 0..t of `b`
torch.manual_seed(42)
a = torch.ones(3, 3).tril()
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b

print("a = ", a, "--", "b = ", b, "--", "c = ", c, sep="\n")

a = 
tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
--
b = 
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c = 
tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])


To get the average, we simply normalise the rows beforehand

In [38]:
# Dividing each row of the lower-triangular matrix by its row sum turns the running sums
# into running averages
torch.manual_seed(42)
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b

print("a = ", a, "--", "b = ", b, "--", "c = ", c, sep="\n")

a = 
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b = 
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c = 
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


Now we can do the initial calculation much more quickly

In [43]:
#### THIS
# xbow = torch.zeros((B, T, C))
# for b in range(B):
#     for t in range(T):
#         xprev = x[b, :t+1]
#         xbow[b, t] = torch.mean(xprev, 0)

#### BECOMES THIS
# Row t of `wei` puts weight 1/(t+1) on positions 0, ..., t and weight 0 on later positions
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
# PyTorch broadcasts the (T, T) matrix `wei` over the batch dimension, so it multiplies
# each of the B matrices of shape (T, C) in `x`.
# (The original line used the undefined name `weights`, which fails on a fresh kernel.)
xbow2 = wei @ x  # (T, T) @ (B, T, C) --> (B, T, C)
torch.allclose(xbow, xbow2)

True

There is a different version of doing this: with softmax. Softmax does the normalization, but the advantage of this point of view is that while here we are choosing a zero `wei` matrix ourselves, in practice this would be a matrix of interactions that would be learned from the data.

In [44]:
tril = torch.tril(torch.ones(T, T))
# Start from all-zero affinities, set the "future" entries (where tril == 0) to -inf, and
# let softmax normalise each row: the surviving entries of a row get equal weight.
wei = F.softmax(torch.zeros((T, T)).masked_fill(tril == 0, float('-inf')), dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

# Self Attention

Each token will emit two vectors: a **query** and a **key**.

- Query: "What am I looking for?"
- Key: "What do I contain?"

To obtain affinities, we do dot products between keys and queries. Basically, my query times all the keys, will become our `wei`.

In [49]:
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# A single head of self-attention
head_size = 16
# Bias-free linear layers, i.e. plain matrix multiplies by a matrix of size (C, head_size)
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

# Every token produces its key, query and value on its own: no communication has happened yet.
# (B, T, C) @ (C, head_size) = (B, T, head_size)
k = key(x)    # (B, T, head_size)
q = query(x)  # (B, T, head_size)
v = value(x)  # (B, T, head_size)

# Affinities: dot product of every query with every key of the same sequence. The transpose
# swaps only the last two dimensions, so the batched product computes q[b] @ k[b].T for each
# b = 0, ..., B-1: (B, T, hs) @ (B, hs, T) = (B, T, T)
wei = torch.matmul(q, k.transpose(-2, -1))  # (B, T, T), one affinity matrix per sequence

# Mask the future positions with -inf, then normalise each row with softmax
tril = torch.tril(torch.ones(T, T))
wei = F.softmax(wei.masked_fill(tril == 0, float('-inf')), dim=-1)

# Aggregate the values `v = value(x)`, not the raw `x`, so the output has the head size
# as its last dimension: (B, T, T) @ (B, T, head_size) = (B, T, head_size)
out = wei @ v

out.shape

torch.Size([4, 8, 16])

Intuition: `x` contains the information and position about tokens, this is private information. 

- Query: what I am interested in
- Key: what I have
- Value: This is what I will communicate to you, if you find it interesting

Notice that there is no concept of space. Self-attention behaves on a set of vectors, but contains no positional information. That is why we do the positional encoding.

Attention is just a **communication mechanism**. 

> The elements across the batch dimension (which are independent examples) never talk to each other. We process them all independently.

**Encoder block**:

- Encoder block simply means we remove the line `wei = wei.masked_fill(tril == 0, float('-inf'))` and we allow all the nodes to completely talk to each other. 

**Decoder block**:

- It's the one we have implemented here, with the masking. It is called decoder because it is "decoding" language and has this autoregressive structure.

**Cross attention**:

- The reason why what we have implemented earlier is called **self-attention** is that we compute key query and values from the vector itself, i.e. `key(x)`, `query(x)` and `value(x)`. For instance, in a **encoder-decoder** transformer, you have that the `queries` are computed from `x` but `keys` and `values` come from a separate source. Sometimes from an encoder block which encodes some context that we would like to condition on.

SCALED ATTENTION

> In the "Attention is all you need paper" they use
$$
\frac{QK^\top}{\sqrt{d_k}}
$$
> where $d_k$ is the `head_size`. This is called `scaled_attention`. We'd implement it as `wei = q @ k.transpose(-2, -1) * head_size**-0.5`


The reason why this is done is that if `q` and `k` are initialized as unit-variance Gaussians, then the variance of the unscaled `wei` will be on the order of `head_size`.

In [52]:
# Standard-normal keys and queries, with unscaled dot products as affinities
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1)

In [53]:
# Variance of the keys, of the queries, and of the unscaled affinities
k.var(), q.var(), wei.var()

(tensor(1.0966), tensor(0.9416), tensor(16.1036))

Whereas if we normalize, this is no longer true

In [54]:
# Same experiment, but the dot products are multiplied by 1/sqrt(head_size)
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1) * head_size**-0.5

In [55]:
# Variance of the keys, of the queries, and of the scaled affinities
k.var(), q.var(), wei.var()

(tensor(1.0104), tensor(1.0204), tensor(1.1053))

Since `wei` feeds into a softmax, it is important that at initialization it is fairly diffuse, otherwise softmax will converge to one-hot vectors.

# Layer Norm

### Recall Batch norm

In [63]:
class BatchNorm1d:
    """Hand-written batch normalisation over dimension 0 (the batch) of its input."""

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps, self.momentum = eps, momentum
        # PyTorch layers carry a `training` flag too, because many of them behave
        # differently during training and during inference
        self.training = True
        # Parameters (trained with backprop): scale and shift
        self.gamma, self.beta = torch.ones(dim), torch.zeros(dim)
        # Buffers (updated with a running momentum average, not with backprop)
        self.running_mean, self.running_var = torch.zeros(dim), torch.ones(dim)

    def __call__(self, x):
        """Normalise `x`, store the result in `self.out` and return it."""
        use_batch_stats = self.training
        if use_batch_stats:
            # Training: estimate mean and variance from the current batch
            xmean, xvar = x.mean(0, keepdim=True), x.var(0, keepdim=True)
        else:
            # Inference: fall back on the running estimates
            xmean, xvar = self.running_mean, self.running_var

        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma*xhat + self.beta

        if use_batch_stats:
            # Move the running estimates towards the batch statistics, outside of autograd
            with torch.no_grad():
                self.running_mean = (1 - self.momentum)*self.running_mean + self.momentum*xmean
                self.running_var = (1 - self.momentum)*self.running_var + self.momentum*xvar
        return self.out

    def parameters(self):
        """Return the parameters trained with backprop: `[gamma, beta]`."""
        return [self.gamma, self.beta]

In [64]:
torch.manual_seed(1337)
module = BatchNorm1d(100)
x = torch.randn(32, 100)   # batch=32, dim=100
x = module(x)  # note: `x` now holds the normalised output, not the raw input
x.shape

torch.Size([32, 100])

In [65]:
# Batch norm normalises over the batch, so each column (feature) has mean 0 and std 1
x[:, 0].mean(), x[:, 0].std()

(tensor(1.4901e-08), tensor(1.0000))

In [66]:
# Rows (individual examples) are not normalized!
x[0, :].mean(), x[0, :].std()

(tensor(0.0411), tensor(1.0431))

### Layer Norm
We literally just change 
```
xmean = x.mean(0, keepdim=True)
xvar = x.var(0, keepdim=True)
```
to 
```
xmean = x.mean(1, keepdim=True)
xvar = x.var(1, keepdim=True)
```

IMPORTANTLY, since this is done independently for each example and does not compute its mean and std across the batch, we can remove the running buffers and the `torch.no_grad()` update. There is no distinction between training and test time!

In [76]:
class LayerNorm1d:
    """Hand-written layer normalisation: standardise each example over its features.

    The statistics are taken over the last dimension of the input, so the layer works for
    `(batch, dim)` inputs as well as for inputs with extra leading dimensions such as
    `(batch, time, dim)`. For 2-D inputs this is the same as normalising over dimension 1.
    """

    def __init__(self, dim, eps=1e-5):
        self.eps = eps
        # Parameters (trained with backprop): scale and shift, one entry per feature
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        """Normalise `x` over its last dimension, store the result in `self.out` and return it."""
        # The statistics are computed per example, so there are no running buffers and the
        # forward pass is the same during training and inference.
        xmean = x.mean(-1, keepdim=True)
        xvar = x.var(-1, keepdim=True)
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma*xhat + self.beta
        return self.out

    def parameters(self):
        """Return the parameters trained with backprop: `[gamma, beta]`."""
        return [self.gamma, self.beta]

In [77]:
torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100)   # batch=32, dim=100
x = module(x)  # note: `x` now holds the normalised output, not the raw input
x.shape

torch.Size([32, 100])

In [78]:
# Now the rows (individual examples) are normalized: mean 0 and std 1 over the features
x[0, :].mean(), x[0, :].std()

(tensor(-3.5763e-09), tensor(1.0000))