In [None]:
import torch

# Pick the best available accelerator: CUDA GPU first, then Apple-silicon MPS,
# and fall back to the CPU. The device used to be overwritten with 'mps'
# unconditionally, which breaks every machine that has no MPS backend.
if torch.cuda.is_available():
    device = "cuda"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(device)

# Loading data

## Initial Preparation

First, define where the training data is located.

In [None]:
import os 
# Parent directory of the current working directory (only needed for the
# alternative, repository-relative data location kept below for reference).
cwd = os.getcwd()
path_parts = cwd.split('/')
repository_dir = '/'.join(path_parts[:-1])
# DATA_DIR = f'{repository_dir}/data/data/preprocessed/preprocessed.txt'
DATA_DIR = 'input.txt'

# Fail early, with a pointer to the setup instructions, if the corpus is missing.
assert os.path.exists(DATA_DIR), 'Make sure you follow the steps of README.md in data repository'

print(f'Training data located at {DATA_DIR}')

Then load the data

In [None]:
# Read the whole corpus into memory as a single string.
with open(DATA_DIR, mode='r', encoding='utf-8') as train_file:
    raw_data = train_file.read()

# Inspection: corpus size and a short preview.
n_characters = len(raw_data)
preview = raw_data[:100]
print(f'Loaded {n_characters} characters')
print(f'First 100 characters:\n{preview}')

## Tokenization

Tokenization is essentially turning the data into 'words' that the model understands. The vocabulary defined below is character-based: every distinct character in the data is one token.

TODO: check out other tokenizers:
tiktoken - https://github.com/openai/tiktoken
sentencepiece - https://github.com/google/sentencepiece

In [None]:
# The vocabulary is the sorted set of distinct characters in the corpus.
chars = sorted(set(raw_data))
VOCAB_SIZE = len(chars)
print(f'Vocabulary ({VOCAB_SIZE} characters): {"".join(chars)}')

# Two lookup tables: token id -> character and character -> token id.
digit_to_char = dict(enumerate(chars))
char_to_digit = {char: digit for digit, char in digit_to_char.items()}


def encode(string):
    """Map a string to the list of token ids of its characters."""
    return [char_to_digit[char] for char in string]


def decode(digits):
    """Map an iterable of token ids back to the string they spell."""
    return ''.join(digit_to_char[digit] for digit in digits)

In [None]:
import torch # we use PyTorch: https://pytorch.org
# Encode the full corpus once and keep it as a 1-D tensor of token ids.
encoded_corpus = encode(raw_data)
data = torch.tensor(encoded_corpus, dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])  # this is how the first 1000 characters of the corpus look to the model

## Train/Val Split

In [None]:
# The first 90% of the tokens are used for training, the remaining 10% for validation.
TRAIN_PROPORTION = 0.9
idx = int(len(data) * TRAIN_PROPORTION)  # index of the first validation token
train_data, val_data = data[:idx], data[idx:]

## Chunking data

It is important to note that it is computationally infeasible to train on the **whole** dataset at once, due to the large size of the training data. This is why the data is separated into *chunks*, which are smaller random samples of the whole dataset. The size of the chunks is defined by the variable `BLOCK_SIZE`.

### Blocks

In [None]:
# Chunk length used for this illustration only; the batching cell further down
# reassigns BLOCK_SIZE to the value that is actually used for training.
BLOCK_SIZE = 64
# One extra token is shown because the target of the last input position is the token after it.
train_data[:BLOCK_SIZE+1]


The way training is carried out is that for every character `c` in a chunk, `c` is the label and the inputs are all characters before `c`. 
This approach is useful, because the model can start predicting from as little as a single character, and predict everything up until block size
#### Example:

In [None]:
example = "Hello, there"

# Table header.
header = f"Input{' ' * 7}| Output"
print(header)
print('=' * 22)

# Every proper prefix of the string is an input; the character right after it is the label.
for t, output in enumerate(example[1:], start=1):
    inputs = example[:t]
    print(f"{inputs:<11} | {output}")


### Batch

Besides the time dimension (blocks) there is an additional dimension: the batch. Multiple blocks are sampled and stacked on top of each other to create a batch, so that multiple samples can be processed in parallel.

In [None]:
BATCH_SIZE = 64   # number of blocks sampled per batch
BLOCK_SIZE = 128  # tokens per block; replaces the value of 64 set in the "Blocks" cell above
N_EMBD = 64       # width of the token/position embeddings used by the attention models below
SEED = 1338       # seed for PyTorch's global random number generator

torch.manual_seed(SEED)

def get_batch(split = 'train'):
    """Sample a random batch of input blocks and their next-token targets.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        Tuple `(x, y)` of tensors with shape (BATCH_SIZE, BLOCK_SIZE), both moved
        to `device`; `y` holds the same positions as `x` shifted one token to the right.
    """
    source = train_data if split == 'train' else val_data

    # One random start offset per block; the upper bound leaves room for the
    # shifted targets at the end of the data.
    starts = torch.randint(len(source) - BLOCK_SIZE, (BATCH_SIZE,))
    offsets = torch.arange(0, BLOCK_SIZE, dtype=torch.int)

    # Broadcasting (BATCH_SIZE, 1) + (BLOCK_SIZE,) yields every index of every block.
    positions = starts.unsqueeze(1) + offsets

    inputs = source[positions]
    targets = source[positions + 1]
    return inputs.to(device), targets.to(device)

# Smoke test: print one (inputs, targets) training batch.
print(get_batch())


## Loss Estimation

In [None]:
@torch.no_grad()
def estimate_loss(model, eval_iters = 10):
    """Estimate the mean loss of `model` on the training and validation splits.

    Args:
        model: Module for which `model(x, y)` returns a scalar loss tensor.
        eval_iters: Number of random batches averaged per split.

    Returns:
        Dict mapping "train" and "validation" to the mean loss (a Python float).
    """
    out = {}
    was_training = model.training
    model.eval()

    # get_batch only recognises the name 'train'; every other name selects the
    # validation data. The first split was previously called "test", which made
    # both reported numbers validation losses.
    for split in ["train", "validation"]:
        losses = torch.zeros(eval_iters, device=device)
        for k in range(eval_iters):
            x, y = get_batch(split)
            losses[k] = model(x, y)

        out[split] = losses.mean().item()

    # Restore whichever mode the caller had the model in.
    model.train(was_training)
    return out

# Bigram Language Model

In [None]:
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram model: the logits for the next token are looked up from the current token alone."""

    def __init__(self, vocab_size):
        super().__init__()
        # Row i of the table holds the next-token logits for token i.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def predict(self, input):
        """Return the logits for every position of `input` (one row of the table per token)."""
        return self.token_embedding_table(input)

    def forward(self, input, targets):
        """Return the cross-entropy loss between the predicted logits and `targets`."""
        logits = self.predict(input)  # (batch, time, channels); channels == vocabulary size
        batch, time, channels = logits.shape

        # F.cross_entropy expects (N, C) logits and (N,) targets, so batch and time are merged.
        flat_logits = logits.view(batch * time, channels)
        flat_targets = targets.view(batch * time)
        return F.cross_entropy(flat_logits, flat_targets)

    def generate(self, input, max_characters=20):
        """Sample `max_characters` tokens after `input`; return the first sequence of the batch."""
        tokens = input
        for _step in range(max_characters):
            last_logits = self.predict(tokens)[:, -1, :]  # only the last position matters
            probabilities = F.softmax(last_logits, dim=-1)
            sampled = torch.multinomial(probabilities, num_samples=1)
            tokens = torch.cat((tokens, sampled), dim=1)
        return tokens[0]

# model = BigramLanguageModel(VOCAB_SIZE).to(device)
# x, y = get_batch()

# context = torch.zeros((1, 1), dtype=torch.long).to(device)
# res = model.generate(context)
# print(decode(res.tolist()))


## Training the model

In [None]:
import time

def train(model, epochs = 10000, learning_rate = 1e-3, eval_step = 1000):
    """Optimise `model` with AdamW on random training batches.

    Args:
        model: Module for which `model(x, y)` returns a scalar loss tensor.
        epochs: Number of optimisation steps (one random batch per step).
        learning_rate: AdamW learning rate.
        eval_step: The estimated losses and elapsed time are printed every `eval_step` steps.
    """
    # AdamW: https://arxiv.org/abs/1711.05101v3
    #        https://paperswithcode.com/method/adamw
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    start = time.time()

    for i in range(epochs):
        x, y = get_batch()
        loss = model(x, y)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        if i % eval_step != 0:
            continue

        # Periodic report; the timer restarts after every report.
        print(f'Epoch {i} - {estimate_loss(model)}')
        elapsed = time.time() - start
        print(f"Time taken: {elapsed}")
        start = time.time()

# Self-attention 
Taking into consideration all previous characters.

Let's start by just taking the average of the previous characters.

In [None]:
import torch

# Toy dimensions: batch, time (sequence length), channels.
B = 4
T = 8
C = 2

x = torch.randn((B, T, C))

print(x.shape)

In [None]:
# Version 1

import torch

# Lower-triangular matrix of ones: row t selects positions 0..t.
mask = torch.tril(torch.ones(T, T))
# Divide every row by its sum so that row t averages positions 0..t.
mask = mask / mask.sum(axis=1, keepdim=True)

print(mask @ x)

In [None]:
# Version 2
# Using softmax to normalize the mask in order to directly be able to use the embedding table

import torch
import torch.nn.functional as F

mask = torch.tril(torch.ones(T, T))  # lower triangle: which positions each row may look at
# Start from equal affinities and set the disallowed (future) positions to -inf ...
affinities = torch.ones(T, T).masked_fill(mask == 0, float('-inf'))
# ... so that softmax turns every row into a uniform average over the allowed positions.
affinities = F.softmax(affinities, dim=1)

print(affinities @ x)

## Single Head to perform self-attention

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

B, T, C = 4, 8, 2
x = torch.randn(B, T, C)

# Head of self-attention: three bias-free linear projections of the input.
head_size = 16
keys, query, value = (nn.Linear(C, head_size, bias=False) for _ in range(3))
# keys  - what is stored at a position
# query - what a position requires
# value - what is aggregated: the affinities are multiplied by v instead of x

k, q, v = keys(x), query(x), value(x)  # each (B, T, head_size)



An intuitive way to think about the query is that for example for a vowel a consonant 2 positions earlier might be more important and this is reflected using the query. 
In other words, it is a way to weight the positions in a data-dependent manner. The keys layer, on the other hand, describes what each position contains. In the previous example, if it is important that there is a consonant two letters earlier and there is in fact a consonant there, the dot product will yield a high affinity.

### Calculating the affinities

In [None]:
# Replace the tensor of ones with data-dependent affinities: entry [b, i, j] is
# the dot product of the key at position i with the query at position j.
# (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
affinities = torch.matmul(k, q.transpose(-2, -1))

The problem with simply multiplying the keys by the queries is that the resulting affinities have a variance of the order of
`head_size` when the weights are freshly initialized. Why?

(Attention is all you need https://arxiv.org/abs/1706.03762)

In [None]:
B, T, head_size = 4, 8, 15
sample_size = 100

# Running totals of the sample variances; averaged over the samples when printed.
k_var = q_var = affinities_var = 0

for _ in range(sample_size):
    # Unit-variance stand-ins for the keys and the queries.
    k, q = torch.randn(B, T, head_size), torch.randn(B, T, head_size)
    affinities = torch.matmul(k, q.transpose(-2, -1))

    k_var = k_var + k.var()
    q_var = q_var + q.var()
    affinities_var = affinities_var + affinities.var()

# Mean variance of the keys, of the queries, and of their unscaled dot products.
for total in (k_var, q_var, affinities_var):
    print(total / sample_size)

This is because `randn` initializes the tensor with numbers drawn from a standard Gaussian distribution (i.e. they have zero mean and unit variance).
Every entry of the product is therefore a sum of `head_size` independent products of zero-mean, unit-variance numbers. Each such product has variance 1, so the sum has variance `1 * head_size`. See the numerical check below.


In [None]:
# Numerical check: multiplying the dot products by head_size ** -0.5 brings
# their variance back to ~1.
sample_size = 100000
head_size = 3
n_positions = 3  # every sample is an (n_positions, n_positions) matrix of dot products

# All samples are drawn at once; the matrix product contracts over head_size.
a = torch.randn(sample_size, n_positions, head_size)
b = torch.randn(sample_size, head_size, n_positions)

scaled_products = (a @ b) * head_size ** -0.5
# Variance of the entries of each sample, averaged over all samples.
mean_scaled_var = scaled_products.flatten(start_dim=1).var(dim=1).mean()

print(mean_scaled_var)


# Why this works: every entry is s = sum_{i=1..head_size} a_i * b_i with a_i and b_i
# independent, zero-mean and unit-variance, so
#   Var(a_i * b_i) = E[a_i^2] * E[b_i^2] = 1
#   Var(s)         = head_size * 1 = head_size
# Scaling s by head_size ** -0.5 divides the variance by head_size, which gives 1.

#### Motivation

The reason why it is important to normalize the weights (especially in the initial training) is that the softmax function converges to one-hot encoding of the largest value when there are very positive and very negative values in the same vector. See example below. This means that without the normalization information would be aggregated only from the highest-valued node which loses a lot of information.

In [None]:
a = torch.randn(5)
sharp_a = 10 * a  # same values, ten times larger in magnitude

diffuse = F.softmax(a, dim=0)
peaked = F.softmax(sharp_a, dim=0, dtype=torch.float32)

print(diffuse)  # Values are fairly diffuse
print(torch.round(peaked * 10000) / 10000)  # Approximates 1-hot encoding (shown to 4 decimals)

In [None]:
mask = torch.tril(torch.ones(T, T))
# Hide the future positions, then normalise every row into attention weights.
masked_affinities = affinities.masked_fill(mask == 0, float('-inf'))
affinities = F.softmax(masked_affinities, dim=-1)

# Shapes of the values, of the attention weights and of the aggregated output.
for shape in (v.shape, affinities.shape, (affinities @ v).shape):
    print(shape)

### Bringing everything together

In [None]:
class Head(nn.Module):
    """A single head of causal (masked) scaled dot-product self-attention.

    Args:
        head_size: Output width of the key, query and value projections.

    `forward` takes a tensor of shape (B, T, N_EMBD) with T <= BLOCK_SIZE and
    returns a tensor of shape (B, T, head_size).
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(N_EMBD, head_size, bias=False)
        self.query = nn.Linear(N_EMBD, head_size, bias=False)
        self.value = nn.Linear(N_EMBD, head_size, bias=False)
        # The causal mask is not a trainable parameter, so it is registered as a buffer.
        self.register_buffer("mask", torch.tril(torch.ones(BLOCK_SIZE, BLOCK_SIZE)))

    def forward(self, inputs):
        """Return, for every position, the attention-weighted sum of the values at positions <= it."""
        _, T, _ = inputs.shape
        k = self.key(inputs)    # (B, T, head_size)
        q = self.query(inputs)  # (B, T, head_size)
        v = self.value(inputs)  # (B, T, head_size)

        # affinities[b, i, j] = q_i . k_j, scaled by 1 / sqrt(head_size) so that the
        # variance stays ~1 regardless of the head size (the embedding width is the
        # wrong scale as soon as head_size != N_EMBD).
        affinities = q @ k.transpose(-2, -1) * (k.shape[-1] ** -0.5)
        # Position i may only attend to positions j <= i.
        affinities = affinities.masked_fill(self.mask[:T, :T] == 0, float('-inf'))
        # Normalise over the attended positions (the last axis). Normalising over
        # dim=1 mixes scores of later positions into earlier rows, i.e. it leaks
        # information from the future.
        affinities = F.softmax(affinities, dim=-1)

        return affinities @ v

In [None]:
class BigramLanguageModelSelfAttention(nn.Module):
    """Language model: token + position embeddings, one self-attention head, linear output layer.

    Args:
        vocab_size: Number of distinct tokens.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, N_EMBD)
        self.position_embedding_table = nn.Embedding(BLOCK_SIZE, N_EMBD)
        self.sa_head = Head(N_EMBD)
        self.lm_head = nn.Linear(N_EMBD, vocab_size)

    def predict(self, input):
        """Return logits of shape (B, T, vocab_size) for token ids of shape (B, T)."""
        _, T = input.shape
        tok_embd = self.token_embedding_table(input)
        # Positions are created on the device of the tokens instead of a notebook global.
        pos_embd = self.position_embedding_table(torch.arange(T, device=input.device))
        x = tok_embd + pos_embd

        x = self.sa_head(x)
        x = self.lm_head(x)
        return x

    def forward(self, input, targets):
        """Return the cross-entropy loss of the predictions for `input` against `targets`."""
        logits = self.predict(input)
        B, T, C = logits.shape
        # F.cross_entropy expects (N, C) logits and (N,) targets.
        return F.cross_entropy(logits.view(B*T, C), targets.view(B*T))

    def generate(self, input, max_characters=20):
        """Sample `max_characters` tokens after `input`.

        Returns the first sequence of the batch, i.e. the prompt followed by all
        sampled tokens (the prompt alone when `max_characters` is 0).
        """
        # The model cannot look back further than it has position embeddings for.
        context_size = self.position_embedding_table.num_embeddings
        for _ in range(max_characters):
            # Only the model input is cropped; the full sequence is kept for the result.
            input_cond = input[:, -context_size:]

            logits = self.predict(input_cond)[:, -1, :]
            distribution = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(distribution, num_samples=1)
            input = torch.cat((input, next_token), dim=1)
        return input[0]

# Multi-Head attention

In [None]:
class MultiHeadAttention(nn.Module):
    """Several attention heads run on the same input; their outputs are concatenated."""

    def __init__(self, n_heads, head_size):
        super().__init__()
        heads = [Head(head_size) for _ in range(n_heads)]
        self.heads = nn.ModuleList(heads)

    def forward(self, inputs):
        """Return the head outputs concatenated along the last (channel) axis."""
        _batch, _time, _channels = inputs.shape  # the input has to be 3-dimensional
        head_outputs = [head(inputs) for head in self.heads]
        return torch.cat(head_outputs, dim=-1)

In [None]:
class BigramLanguageModelMultiHeadAttention(nn.Module):
    """Language model: token + position embeddings, multi-head self-attention, linear output layer.

    Args:
        vocab_size: Number of distinct tokens.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, N_EMBD)
        self.position_embedding_table = nn.Embedding(BLOCK_SIZE, N_EMBD)
        # 4 heads of size N_EMBD / 4 each; their concatenation is N_EMBD wide again.
        # Works like a group convolution, where each head corresponds to a part of an image.
        self.sa_heads = MultiHeadAttention(4, N_EMBD // 4)

        self.lm_head = nn.Linear(N_EMBD, vocab_size)

    def predict(self, input):
        """Return logits of shape (B, T, vocab_size) for token ids of shape (B, T)."""
        _, T = input.shape
        tok_embd = self.token_embedding_table(input)
        # Positions are created on the device of the tokens instead of a notebook global.
        pos_embd = self.position_embedding_table(torch.arange(T, device=input.device))
        x = tok_embd + pos_embd

        x = self.sa_heads(x)
        x = self.lm_head(x)
        return x

    def forward(self, input, targets):
        """Return the cross-entropy loss of the predictions for `input` against `targets`."""
        logits = self.predict(input)
        B, T, C = logits.shape
        # F.cross_entropy expects (N, C) logits and (N,) targets.
        return F.cross_entropy(logits.view(B*T, C), targets.view(B*T))

    def generate(self, input, max_characters=20):
        """Sample `max_characters` tokens after `input`.

        Returns the first sequence of the batch, i.e. the prompt followed by all
        sampled tokens (the prompt alone when `max_characters` is 0).
        """
        # The model cannot look back further than it has position embeddings for.
        context_size = self.position_embedding_table.num_embeddings
        for _ in range(max_characters):
            # Only the model input is cropped; the full sequence is kept for the result.
            input_cond = input[:, -context_size:]

            logits = self.predict(input_cond)[:, -1, :]
            distribution = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(distribution, num_samples=1)
            input = torch.cat((input, next_token), dim=1)
        return input[0]

# Feed-forward 

In [None]:
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the feature width, apply ReLU, project back."""

    def __init__(self, n_features):
        super().__init__()
        hidden_features = 4 * n_features
        # (A variant with nn.GELU() in place of nn.ReLU() was tried here as well.)
        layers = [
            nn.Linear(n_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, n_features),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, inputs):
        """Apply the MLP to the last axis of `inputs`."""
        return self.net(inputs)

In [None]:
class BigramLanguageModelMultiHeadAttentionFeedForward(nn.Module):
    """Language model: embeddings, multi-head self-attention, feed-forward network, linear output layer.

    Args:
        vocab_size: Number of distinct tokens.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, N_EMBD)
        self.position_embedding_table = nn.Embedding(BLOCK_SIZE, N_EMBD)

        self.sa_heads = MultiHeadAttention(4, N_EMBD // 4)
        self.ff = FeedForward(N_EMBD)
        self.lm = nn.Linear(N_EMBD, vocab_size)

    def predict(self, input):
        """Return logits of shape (B, T, vocab_size) for token ids of shape (B, T)."""
        _, T = input.shape
        tok_embd = self.token_embedding_table(input)
        # Positions are created on the device of the tokens instead of a notebook global.
        pos_embd = self.position_embedding_table(torch.arange(T, device=input.device))
        x = tok_embd + pos_embd

        x = self.sa_heads(x)
        x = self.ff(x)
        x = self.lm(x)
        return x

    def forward(self, input, targets):
        """Return the cross-entropy loss of the predictions for `input` against `targets`."""
        logits = self.predict(input)
        B, T, C = logits.shape
        # F.cross_entropy expects (N, C) logits and (N,) targets.
        return F.cross_entropy(logits.view(B*T, C), targets.view(B*T))

    def generate(self, input, max_characters):
        """Sample `max_characters` tokens after `input`.

        Returns the first sequence of the batch, i.e. the prompt followed by all
        sampled tokens (the prompt alone when `max_characters` is 0).
        """
        # The model cannot look back further than it has position embeddings for.
        context_size = self.position_embedding_table.num_embeddings
        for _ in range(max_characters):
            # Only the model input is cropped; the full sequence is kept for the result.
            input_cond = input[:, -context_size:]

            logits = self.predict(input_cond)[:, -1, :]
            distribution = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(distribution, num_samples=1)
            input = torch.cat((input, next_token), dim=1)
        return input[0]

# Blocks
Blocks are a combination of attention heads and a feed-forward network. The idea is to chain multiple blocks to create a deep neural network.

In [None]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel, followed by an output projection and dropout

    NOTE: this redefines the `MultiHeadAttention` class declared earlier in the
    notebook; models instantiated after this cell has run use this version.

    Args:
        num_heads: Number of attention heads.
        head_size: Output width of each head.
        dropout: Dropout probability applied after the projection (0.0 disables it).
    """

    def __init__(self, num_heads, head_size, dropout=0.0):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated heads are num_heads * head_size wide; project them to
        # the embedding width (the undefined lowercase `n_embd` was used here before).
        self.proj = nn.Linear(num_heads * head_size, N_EMBD)
        # `forward` relies on this layer, which was never created before.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected (and dropped-out) concatenation of all head outputs."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out
    
class MultiHeadAttentionProjection(nn.Module):
    """Several attention heads in parallel, followed by a linear output projection."""

    def __init__(self, n_heads, head_size):
        super().__init__()
        heads = [Head(head_size) for _ in range(n_heads)]
        self.heads = nn.ModuleList(heads)
        self.proj = nn.Linear(N_EMBD, N_EMBD)

    def forward(self, inputs):
        """Concatenate the head outputs along the channel axis and project the result."""
        head_outputs = [head(inputs) for head in self.heads]
        concatenated = torch.cat(head_outputs, dim=-1)
        return self.proj(concatenated)

In [None]:
class Block(nn.Module):
    """ Transformer block: communication (self-attention) followed by computation (feed-forward) """

    def __init__(self, n_embd, n_head):
        """
        Args:
            n_embd: embedding dimension.
            n_head: the number of attention heads; each head is n_embd // n_head wide.
        """
        super().__init__()
        self.sa = MultiHeadAttentionProjection(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        # No layer normalisation is applied yet; a pre-norm variant would wrap the
        # inputs of `sa` and `ffwd` in nn.LayerNorm(n_embd).

    def forward(self, input):
        """Apply attention and the feed-forward network, each with a residual connection."""
        attended = input + self.sa(input)
        return attended + self.ffwd(attended)

In [None]:
class BigramLanguageModelBlocks(nn.Module):
    """Language model: token + position embeddings, three transformer blocks, linear output layer.

    Args:
        vocab_size: Number of distinct tokens.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, N_EMBD)
        self.position_embedding_table = nn.Embedding(BLOCK_SIZE, N_EMBD)

        # Three chained blocks with 4 attention heads each.
        self.blocks = nn.Sequential(*[Block(N_EMBD, 4) for _ in range(3)])
        self.lm = nn.Linear(N_EMBD, vocab_size)

    def predict(self, input):
        """Return logits of shape (B, T, vocab_size) for token ids of shape (B, T)."""
        _, T = input.shape
        tok_embd = self.token_embedding_table(input)
        # Positions are created on the device of the tokens instead of a notebook global.
        pos_embd = self.position_embedding_table(torch.arange(T, device=input.device))
        x = tok_embd + pos_embd

        x = self.blocks(x)
        x = self.lm(x)
        return x

    def forward(self, input, targets):
        """Return the cross-entropy loss of the predictions for `input` against `targets`."""
        logits = self.predict(input)
        B, T, C = logits.shape
        # F.cross_entropy expects (N, C) logits and (N,) targets.
        return F.cross_entropy(logits.view(B*T, C), targets.view(B*T))

    def generate(self, input, max_characters):
        """Sample `max_characters` tokens after `input`.

        Returns the first sequence of the batch, i.e. the prompt followed by all
        sampled tokens (the prompt alone when `max_characters` is 0).
        """
        # The model cannot look back further than it has position embeddings for.
        context_size = self.position_embedding_table.num_embeddings
        for _ in range(max_characters):
            # Only the model input is cropped; the full sequence is kept for the result.
            input_cond = input[:, -context_size:]

            logits = self.predict(input_cond)[:, -1, :]
            distribution = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(distribution, num_samples=1)
            input = torch.cat((input, next_token), dim=1)
        return input[0]

# Testing out all models

In [None]:
# models = [BigramLanguageModelSelfAttention, BigramLanguageModelMultiHeadAttention, BigramLanguageModelMultiHeadAttentionFeedForward]
models = [BigramLanguageModelBlocks]
# Generation prompt: a single sequence that holds only token id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)

# Train a fresh instance of every listed class; `model` keeps the last one trained.
for model_cls in models:
    model = model_cls(VOCAB_SIZE)
    model = model.to(device)
    train(model, epochs=1000, eval_step=100)

In [None]:
# Sample 200 tokens from the trained model and print them as text.
res = model.generate(context, max_characters=200)
generated_text = decode(res.tolist())
print(generated_text)