# Developing GPT

## Setup

In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Seed PyTorch's global RNG so the random tensors created below are reproducible.
# The trailing semicolon keeps the notebook from echoing the returned Generator object.
torch.manual_seed(1337);

<torch._C.Generator at 0x10d9a9d90>

In [3]:
"""Download Shakespeare training dataset"""
# One-time setup: uncomment the line below to fetch the corpus. wget saves it under the
# URL's final path component, `input.txt`.
# Presumably left commented out so that re-running the notebook does not download it again.
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

'Download Shakespeare training dataset'

In [4]:
# Load the whole Shakespeare corpus from disk into a single string
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

## Mathematical Trick in Self-Attention
- We want the 8 tokens along the T (time) dimension to communicate with each other in a specific way
- Specifically, we want each token to communicate only with the tokens that come before it, and not with those that come after it
- This way, information flows only from previous context to the current time step, and never the other way around

In [5]:
# Toy dimensions: 4 sequences (batch) x 8 positions (time) x 2 features (channels)
batch = 4
time = 8
channels = 2

# Random tensor of shape (batch, time, channels) used as the running example
x = torch.randn((batch, time, channels))
x.shape

torch.Size([4, 8, 2])

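- We want `x_bag_of_words[b, t] = mean_{i <= t} x[b, i]`, i.e. the average of every token vector up to and including position `t`
- There's a word (token) stored at each of the 8 time locations
- `Bag-of-words` is a term for averaging a set of token vectors without regard to their order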

In [6]:
# Version 1 (explicit loops): entry [b, t] becomes the mean of rows 0..t of sequence b.
x_bag_of_words = torch.zeros(batch, time, channels)

for b in range(batch):
    for t in range(time):
        # Everything up to and including position t: shape (t + 1, channels)
        x_prev = x[b, : t + 1]
        # Average over the time axis, leaving one value per channel
        x_bag_of_words[b, t] = x_prev.mean(dim=0)

In [7]:
# Sequence 0 of the batch: one row per time step, one column per channel
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [8]:
# Running averages for sequence 0.
# Row 0 equals row 0 of x[0] shown above (the mean of a single row is that row);
# row 1 is the mean of rows 0 and 1 of x[0], row 2 the mean of rows 0-2, and so on.
x_bag_of_words[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

---

### Applying Matrix Multiplication
Version 1
- For the token at every position t, we want the average of the vectors of all previous tokens and the current token
- Unfortunately, computing this with nested loops is very inefficient. The trick is to do the same work with a matrix multiplication
- In the code below, each row of matrix c is a running sum of the rows of matrix b

In [9]:
# Toy example: matrix multiplication used as a 'weighted aggregation'
torch.manual_seed(42)

# tril keeps the lower-left triangle (diagonal included) and zeroes the upper triangle,
# so row i of `a` picks out rows 0..i of `b` and adds them up in `c`.
a = torch.ones(3, 3).tril()

b = torch.randint(low=0, high=10, size=(3, 2)).float()

# torch.matmul is the function form of the @ operator
c = torch.matmul(a, b)

# One call, newline-separated: prints each label and tensor on its own line(s)
print('a=', a, '--', 'b=', b, '--', 'c=', c, sep='\n')

a=
tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])


- Once we apply the following line to matrix a, `a = a / torch.sum(a, 1, keepdim=True)`, we get a running _average_ instead of a running sum
- This allows us to calculate the running incremental average of all the vectors in all previous tokens and the current token much more efficiently

In [10]:
# Toy example: matrix multiplication used as a 'weighted aggregation' (running average)
torch.manual_seed(42)

# tril keeps the lower-left triangle (diagonal included) and zeroes the upper triangle,
# so row i of `a` picks out rows 0..i of `b`.
a = torch.ones(3, 3).tril()
# Divide every row by its own sum; keepdim=True keeps the sums as a (3, 1) column so the
# division broadcasts across each row. Afterwards each row of `a` sums to 1.
a = a / a.sum(dim=1, keepdim=True)

b = torch.randint(low=0, high=10, size=(3, 2)).float()

# torch.matmul is the function form of the @ operator
c = torch.matmul(a, b)

# One call, newline-separated: prints each label and tensor on its own line(s)
print('a=', a, '--', 'b=', b, '--', 'c=', c, sep='\n')

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


---

### Recreating X-Bag-of-Words Using a Weights Matrix
Version 2

In [11]:
# Row t of `weights` says how much of each position contributes to the average at step t.
# Start from a lower-triangular matrix of ones: position t may only use positions 0..t.
weights = torch.tril(torch.ones(time, time))
# Normalize so that each row sums to 1 (keepdim=True keeps the row sums as a column
# so the division broadcasts across each row)
weights = weights / weights.sum(1, keepdim=True)
weights

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [12]:
# Batched matrix multiply via broadcasting:
#   weights is (T, T) and x is (B, T, C). The shapes do not match as plain 2-D matrices,
#   so PyTorch broadcasts weights across the batch dimension of x, as if computing
#   (B, T, T) @ (B, T, C).
#   The (T, T) @ (T, C) product is applied to every batch element independently,
#   giving a (B, T, C) result.
x_bag_of_words_2 = torch.matmul(weights, x)

In [13]:
# allclose is True when every pair of corresponding elements agrees within a tolerance.
# True here means the weights matmul reproduced the loop-based x_bag_of_words.
x_bag_of_words.allclose(x_bag_of_words_2)

True

In [15]:
# Show sequence 0 from both versions side by side to confirm they match
(x_bag_of_words[0], x_bag_of_words_2[0])

(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

### Summary: Weights Matrix Method
- We used batched matrix multiplication to perform a weighted aggregation
- We took weighted sums using the weights matrix, which has a lower-triangular form
- The triangular form means that the token at position t only gets information from itself and the tokens preceding it

---

### Softmax
Version 3 <br><br>
As a review, Softmax converts a vector of numbers into a probability distribution. <br>
It does this by taking the exponential of each element in the vector,
then dividing each result by the sum of these exponentials. <br>
This ensures that the output values are between 0 and 1 and sum to 1,
making them interpretable as probabilities.

In [28]:
# Start with an all-zero (time, time) weights matrix and a lower-triangular matrix of ones
weights = torch.zeros(time, time)
tril = torch.ones(time, time).tril()

In [29]:
# Wherever tril is 0 (the upper triangle), overwrite the weight with negative infinity.
# masked_fill returns a new tensor, which `weights` is rebound to.
weights = weights.masked_fill(tril.eq(0), -float('inf'))
weights

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [30]:
# Softmax along every row: exp(-inf) is 0, so the masked entries become 0 and the
# remaining entries of each row are normalized to sum to 1.
# NOTE: `weights` is rebound to its own softmax, so re-running this cell on its own
# would apply softmax a second time; re-run the masking cell first.
weights = weights.softmax(dim=1)
weights

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

### Summary: Softmax Method
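- This form is helpful for self-attention: here all the weights begin at 0 (which gives a uniform average), but they can later be replaced by data-dependent affinities between tokens
- After the softmax, the weights tell us how much of each past token to aggregate
- The masked fill (setting the weight to `-inf` wherever `tril == 0`) prevents a token from receiving information from tokens that come after it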
- 