# Artificial Peter Drury

In [1]:
#import necessary libraries
import torch
import torch.nn as nn
from torch.nn import functional as F

## Hyperparameters

In [2]:
#hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu' # use GPU if available - for faster training!
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0


torch.manual_seed(1337)

<torch._C.Generator at 0x7f5fdbc919b0>

## Dataset Loading

In [17]:
# load the data, Drury dataset
with open('data/drury.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
vocab_size

79

## Tokenization
Tokenize the text - convert each character to a unique integer ID - here we are using character-level tokenization
There are several methods for tokenization:
- character-level tokenization
- word-level tokenization
- sub-word level tokenization
 You can also use a pre-trained tokenizer such as [SentencePiece](https://github.com/google/sentencepiece), [TikToken](https://github.com/openai/tiktoken) etc

In [6]:
# Tokenize the text - convert each character to a unique integer ID - here we are using character-level tokenization
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
itos

{0: '\n',
 1: ' ',
 2: '!',
 3: '&',
 4: "'",
 5: '(',
 6: ')',
 7: ',',
 8: '-',
 9: '.',
 10: '/',
 11: '0',
 12: '1',
 13: '2',
 14: '3',
 15: '4',
 16: '5',
 17: '6',
 18: '7',
 19: '8',
 20: '9',
 21: ':',
 22: '?',
 23: 'A',
 24: 'B',
 25: 'C',
 26: 'D',
 27: 'E',
 28: 'F',
 29: 'G',
 30: 'H',
 31: 'I',
 32: 'J',
 33: 'K',
 34: 'L',
 35: 'M',
 36: 'N',
 37: 'O',
 38: 'P',
 39: 'Q',
 40: 'R',
 41: 'S',
 42: 'T',
 43: 'U',
 44: 'V',
 45: 'W',
 46: 'Y',
 47: 'a',
 48: 'b',
 49: 'c',
 50: 'd',
 51: 'e',
 52: 'f',
 53: 'g',
 54: 'h',
 55: 'i',
 56: 'j',
 57: 'k',
 58: 'l',
 59: 'm',
 60: 'n',
 61: 'o',
 62: 'p',
 63: 'q',
 64: 'r',
 65: 's',
 66: 't',
 67: 'u',
 68: 'v',
 69: 'w',
 70: 'x',
 71: 'y',
 72: 'z',
 73: '–',
 74: '‘',
 75: '’',
 76: '“',
 77: '”',
 78: '…'}

## Splitting the data into training and test sets
We are splitting the data into training and test sets. The training set will be used to train the model and the test set will be used to evaluate the model. We will use 90% of the data for training and 10% for testing.

The reason for splitting the dataset is because we do not want a perfect memorization of the dataset. We want the model to generalize well to unseen data. If we do not split the dataset, the model will memorize the training data and will not perform well on the test data.

In [15]:
# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # 90/10 train/test split
train_data = data[:n]
val_data = data[n:]
train_data

tensor([35, 47, 66,  ..., 55, 49, 66])

## Data Loading

In [8]:
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

## Attention mechanism

In [9]:


class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x



## Model

In [10]:
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx



## Optimizer

In [11]:
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)



0.211535 M parameters


## Training

In [12]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.6513, val loss 4.6271
step 100: train loss 2.6839, val loss 2.7591
step 200: train loss 2.4990, val loss 2.5412
step 300: train loss 2.4067, val loss 2.4650
step 400: train loss 2.2812, val loss 2.3409
step 500: train loss 2.1703, val loss 2.2600
step 600: train loss 2.0559, val loss 2.1685
step 700: train loss 1.9590, val loss 2.1566
step 800: train loss 1.8649, val loss 2.0971
step 900: train loss 1.7874, val loss 2.0827
step 1000: train loss 1.7011, val loss 2.0660
step 1100: train loss 1.6297, val loss 2.0298
step 1200: train loss 1.5460, val loss 2.0384
step 1300: train loss 1.4744, val loss 2.0804
step 1400: train loss 1.4092, val loss 2.0697
step 1500: train loss 1.3533, val loss 2.1194
step 1600: train loss 1.3045, val loss 2.1105
step 1700: train loss 1.2273, val loss 2.1128
step 1800: train loss 1.1798, val loss 2.1820
step 1900: train loss 1.1269, val loss 2.2044
step 2000: train loss 1.0743, val loss 2.2805
step 2100: train loss 1.0123, val loss 2.2053


## Generate Drury-like commentary

In [14]:
context = torch.zeros((2, 2), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=5000)[0].tolist()))




Match: Argentina vs. Mexico
Tournament: 2022 Frefies parist for the unfannited pressure parown champions the lear unfinaldired is hard the match extring forceling the wearlitics


Match: Chelsess, vs. Liverpool
Tournament: 2022 FIFA World Cup, Qatar 25
Commentary:
Drink it is Manchester City’s fantasy footballers bout sted to this final peak. Lionel Messi has conquered his final peak. Lionel - Grounst Man City’s to mainst goal. Whuthernam ia fashion B
Match: Senta Roma vs. Barcelona
Tournament: 2018 World Cup mitanout hal his final pencedsive conto on theing a moments throughout

Match: Chelsea France Etihas Fe it nears wenders of Europe notho lenst dae on of bre behing the opener was a millippe the Belfthing that wasn’t lhead stort of dis football called “Depressing, sther City’s man of menst in the brace to on the weeeko league has not won. He’s scome it in the nation to esu who a the fulless of ‘Wayne Rooney, out of differ risen. A blue.
The bruary This of bring career, but one d