In [None]:
!pip install tiktoken

In [None]:
from transformers import GPT2LMHeadModel

In [None]:
model_hf = GPT2LMHeadModel.from_pretrained('gpt2')
sd_hf = model_hf.state_dict()


for k, v in sd_hf.items():
    print(k, v.shape)

In [None]:
with open('asimov.txt', 'r') as f:
    asimov = f.read()
asimov = asimov[:1000]
print(asimov[37:565])

In [None]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
tokenizer.encode("Hello Hari Seldon!")

tokenizer.decode(tokenizer.encode("Hello Hari Seldon!"))

In [None]:
## 

# transformer.wte.weight torch.Size([50257, 768])
# transformer.wpe.weight torch.Size([1024, 768])

# transformer.h.0.ln_1.weight torch.Size([768])
# transformer.h.0.ln_1.bias torch.Size([768])
# transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
# transformer.h.0.attn.c_attn.bias torch.Size([2304])
# transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
# transformer.h.0.attn.c_proj.bias torch.Size([768])
# transformer.h.0.ln_2.weight torch.Size([768])
# transformer.h.0.ln_2.bias torch.Size([768])
# transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
# transformer.h.0.mlp.c_fc.bias torch.Size([3072])
# transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
# transformer.h.0.mlp.c_proj.bias torch.Size([768])

# transformer.h.11.ln_1.weight torch.Size([768])
# transformer.h.11.ln_1.bias torch.Size([768])
# transformer.h.11.attn.c_attn.weight torch.Size([768, 2304])
# transformer.h.11.attn.c_attn.bias torch.Size([2304])
# transformer.h.11.attn.c_proj.weight torch.Size([768, 768])
# transformer.h.11.attn.c_proj.bias torch.Size([768])
# transformer.h.11.ln_2.weight torch.Size([768])
# transformer.h.11.ln_2.bias torch.Size([768])
# transformer.h.11.mlp.c_fc.weight torch.Size([768, 3072])
# transformer.h.11.mlp.c_fc.bias torch.Size([3072])
# transformer.h.11.mlp.c_proj.weight torch.Size([3072, 768])
# transformer.h.11.mlp.c_proj.bias torch.Size([768])

# transformer.ln_f.weight torch.Size([768])
# transformer.ln_f.bias torch.Size([768])
# lm_head.weight torch.Size([50257, 768])


In [None]:
768*4

In [None]:
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F


# ---- #

batch_size = 64
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
dropout = 0.2


In [36]:
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
@dataclass
class GPTConfig:
    block_size: int = 1024
    vocab_size: int = 50257
    n_layers: int = 12
    n_heads: int = 6
    n_embed: int = 768

# @dataclass
# class GPTConfig:
#     block_size: int = 1024
#     vocab_size: int = 50257
#     n_layers: int = 2
#     n_heads: int = 2
#     n_embed: int = 100

config = GPTConfig()

class MultiLayerPerceptron(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embed, config.n_embed * 4)
        self.c_proj = nn.Linear(config.n_embed * 4, config.n_embed)

    def forward(self, x):
        x = self.c_fc(x)
        x = F.gelu(x)
        x = self.c_proj(x)
        return x

class Attention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.n_heads = config.n_heads
        self.head_size = config.n_embed // config.n_heads
        self.c_attn = nn.Linear(config.n_embed, 3 * config.n_embed) # Why is matrix shaped ([2304, 768])
        self.c_proj = nn.Linear(config.n_embed, config.n_embed)

        self.register_buffer('tril', torch.tril(torch.ones(config.block_size, config.block_size)))

    def forward(self, x):
        B, T, C = x.shape
        x = self.c_attn(x)

        proj = x.view(B, self.n_heads, T, self.head_size*3)
        k, q, v = proj.split(self.head_size, dim=-1)

        wei = k @ q.transpose(-2, -1) * self.head_size**-0.5
        # Add triangle

        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        out = wei @ v
        out = out.view(B, T, C)

        return out

class Block(nn.Module):
    """Multiple head attention"""
    def __init__(self, config):
        super().__init__()
        self.n_heads = config.n_heads
        self.ln_1 = nn.LayerNorm(config.n_embed)
        self.attn = Attention(config)
        self.ln_2 = nn.LayerNorm(config.n_embed)
        self.mlp = MultiLayerPerceptron(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embed),
            wpe = nn.Embedding(config.block_size, config.n_embed),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layers)]),
            ln_f = nn.LayerNorm(config.n_embed),
        ))
        self.lm_head = nn.Linear(config.n_embed, config.vocab_size, bias=False),

    def forward(self, x, targets=None):
        # x: (B, T, C)
        B, T = x.shape
        tok_emb = self.transformer.wte(x)
        pos_emb = self.transformer.wpe(torch.arange(T, device=device))
        x = tok_emb + pos_emb

        for block in self.transformer.h:
            x = block(x)

        logits = self.transformer.ln_f(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -config.block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :] # becomes (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
       

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'

model = GPT(config).to(device)
print(f'{sum(p.numel() for p in model.parameters())/1e6:.2f} Million parameters')

context = torch.randint(0, config.vocab_size, (1, 2), device=device)
print(tokenizer.decode(model.generate(context, max_new_tokens=200)[0].tolist()))

124.44 Million parameters
خ him2ʎ whatction not� just can Ain�icoitionnt off��ind�Dly whose hin�ces).ice shigc5 ag so d..fan alberredect� mayonsormforeim see scXon� Eob� kces eromvery 6 yearall�t am?usm�� kroizom aboutasport with Eockul Theond���quick butantusachTam otherownastance con�b�� but�threatine� t@�


5.37 Million parameters


In [20]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
tokens = tokenizer.encode("What the dog doin¿")
x = torch.tensor(tokens).view(1, -1)
out = model(x)
print(out.shape)
decode

torch.Size([1, 1])


In [114]:
# for (key, param), (key_hf, v_hf) in zip(model.named_parameters(), sd_hf.items()):
#     # print(f'{key}: {param.shape} vs {key_hf}: {v_hf.shape}')
#     assert key == key_hf, f'{key} != {key_hf}'
#     if param.shape == v_hf.shape:
#         pass
#     elif param.shape[0] == v_hf.shape[1] and param.shape[1] == v_hf.shape[0]:
#         print(f'{key}: {param.shape} is transposed')
#     else:
#         raise ValueError(f'{param.shape} != {v_hf.shape}, key: {key}')


In [None]:
config.n_embed

In [None]:
# transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
# transformer.h.0.attn.c_attn.bias torch.Size([2304])
# transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
# transformer.h.0.attn.c_proj.bias torch.Size([768])


attn = Attention(config)
x = torch.randn(3, 20, 768)
attn(x).shape

In [None]:
config.n_embed, config.n_heads
head_size = config.n_embed // config.n_heads
head_size*18

In [None]:
x = torch.randn(3, 20, 2304)
# (3, 20, 2304), (B, T, head_size*n_heads*3)
# -> (B, n_heads, T, head_size*3)
print(f'x.shape: {x.shape}')
B, T, C = x.shape

In [None]:
  print(B, T, self.n_heads, self.head_size*3)
    k, q, v = x.view(B, T, self.n_heads, self.head_size*3), .split(self.head_size, dim=-1)