<a href="https://colab.research.google.com/github/HolgerMolin/HolgerGPT/blob/main/GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import math
from torch.nn import init
import random
import numpy as np
from tokenizer import Tokenizer
from architecture import HolgerGPT

TODO:
    - RMS norm
    - Spicy attention
    - GeLU
    

In [2]:
dataset = open('tokenizer_training.txt').read().lower()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
vocab = sorted(list(set(dataset)))
dataset = open('training.txt').read().lower()

vocab_size = len(vocab)
tokenizer = Tokenizer(vocab)
dataset = tokenizer.encode(dataset)
tokenizer.train(dataset, vocab_size - len(vocab))
text_length = len(dataset)
print(f'Number of tokens: {text_length:_}')
print(tokenizer.vocab)

Number of tokens: 1_115_394
['\n', ' ', '!', '#', '$', '&', "'", '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?', '\\', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [3]:
context_length = 32
model = HolgerGPT(3, 64, 2, 64, context_length, vocab_size)
print(f'Number of parameters: {sum(p.numel() for p in model.parameters()):_}')
model = model.to(device)

Number of parameters: 200_502


In [4]:
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

class TransformerLRScheduler:
    def __init__(self, optimizer, d_model, warmup_steps=4000):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.step_num = 1

    def step(self):
        self.step_num += 1
        lr = self._get_lr()
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

    def _get_lr(self):
        arg1 = self.step_num ** -0.5
        arg2 = self.step_num * (self.warmup_steps ** -1.5)
        return (self.d_model ** -0.5) * min(arg1, arg2)

scheduler = TransformerLRScheduler(optim, 256)

In [5]:
for transformer_block in model.transformer_blocks:
    for attention_head in transformer_block.attention.heads:
        attention_head.w_k = attention_head.w_k.to(device)
        attention_head.w_q = attention_head.w_q.to(device)
        attention_head.mask = attention_head.mask.to(device)

**PRETRAINING

In [6]:
dataset = open('ptbdataset/ptb.train.txt').read().lower()
dataset = tokenizer.encode(dataset)
text_length = len(dataset)

max_index = text_length - context_length - 1
num_steps = 20_000
batch_size = 256
display_frequency = 50
period_loss = 0
start_indices = np.random.randint(0, max_index, size=(num_steps, batch_size))
for step in range(num_steps):
    optim.zero_grad()

    x = [dataset[start_idx: start_idx + context_length] for start_idx in start_indices[step]]
    x = torch.tensor(x)
    y_true = [dataset[start_idx + context_length] for start_idx in start_indices[step]]
    y_true = torch.tensor(y_true)

    y_pred = model.forward(x)
    loss = criterion(y_pred, y_true)
    period_loss += loss.item()
    
    loss.backward()
    if step % display_frequency == 0:
        print(f'{scheduler.step_num} {period_loss/display_frequency:4f} {scheduler._get_lr():4e}')
        period_loss = 0
    optim.step()
    scheduler.step()

1 0.080578 2.470529e-07
51 3.860074 1.259970e-05
101 3.307584 2.495235e-05
151 3.026561 3.730499e-05
201 2.983659 4.965764e-05
251 2.971040 6.201029e-05
301 2.957855 7.436294e-05
351 2.941096 8.671558e-05
401 2.889591 9.906823e-05
451 2.822158 1.114209e-04
501 2.698100 1.237735e-04
551 2.581423 1.361262e-04
601 2.520832 1.484788e-04
651 2.448850 1.608315e-04
701 2.383521 1.731841e-04
751 2.350719 1.855368e-04
801 2.323594 1.978894e-04
851 2.294322 2.102421e-04
901 2.265074 2.225947e-04
951 2.249390 2.349473e-04
1001 2.227370 2.473000e-04
1051 2.168285 2.596526e-04
1101 2.142169 2.720053e-04
1151 2.157745 2.843579e-04
1201 2.103859 2.967106e-04
1251 2.115675 3.090632e-04
1301 2.088366 3.214159e-04
1351 2.083470 3.337685e-04
1401 2.054857 3.461212e-04
1451 2.036747 3.584738e-04
1501 2.031680 3.708265e-04


In [None]:
sample_text = dataset[:context_length]
indices = list(sample_text)
output = []
for _ in range(500):
    x = torch.tensor(indices).reshape(1, -1).to(device)
    y = model.forward(x)
    y = F.softmax(y, dim=1)
    y = torch.multinomial(y, 1)
    y = y.item()
    indices.pop(0)
    indices.append(y)
    output.append(y)
    
print(''.join(tokenizer.decode(output)))

In [None]:
sample_text = dataset[:context_length]
indices = list(sample_text)
output = []
for _ in range(500):
    x = torch.tensor(indices).reshape(1, -1).to(device)
    y = model.forward(x)
    y = F.softmax(y, dim=1)
    y = torch.multinomial(y, 1)
    y = y.item()
    indices.pop(0)
    indices.append(y)
    output.append(y)
    
print(''.join(tokenizer.decode(output)))

In [None]:
torch.save(model.state_dict(), 'intermediate_model.pth')