In [2]:
'''
nn.Module gives access to 
- parameter tracking
- forward()
- to(device)
- train() / eval()

module(x) -> module.__call__(x) -> module.forward(x)
This way, it
- tracks gradients automatically
- enables hooks
- supports JIT
- handles distributed and mixed-precision training
'''

'\nnn.Module gives access to \n- parameter tracking\n- forward()\n- to(device)\n- train() / eval()\n\nmodule(x) -> module__call__(x) -> module.forward(x)\nThis way, it\n- tracks gradients automatically\n- enables hooks\n- supports JIT\n- handles distributed and mixed-precision training\n'

In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed PyTorch's RNG so that batch sampling, weight initialisation, dropout
# and text sampling are repeatable after Restart Kernel -> Run All.
seed = 1337
torch.manual_seed(seed)

block_size = 8        # context length: number of tokens per training sequence
batch_size = 4        # sequences drawn per batch
max_iters = 1000      # optimisation steps in the training loop
learning_rate = 3e-3  # AdamW learning rate
eval_iters = 250      # batches averaged per loss estimate; also the evaluation interval
n_embd = 384          # embedding width
n_head = 4            # attention heads per block
n_layer = 4           # number of transformer blocks
dropout = 0.2         # dropout probability used throughout the model


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.4.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/runpy.py", line 198, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/runpy.py", line 88, in _run_code
    exec(code, run_globals)
  File "/Users/lemi/gpt/.venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/lemi/gpt/.venv/lib/python3.12/site-packages/traitlets/config/application.py", l

In [4]:
# Read the entire training corpus into memory as a single string.
with open('wizard.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# The vocabulary is every distinct character of the corpus, in sorted order.
chars = list(set(text))
chars.sort()
vocab_size = len(chars)

In [5]:
# Lookup tables between each vocabulary character and its integer id
# (the id is the character's position in the sorted `chars` list).
char_to_int = dict(zip(chars, range(len(chars))))
int_to_char = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string `s`."""
    return [char_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the sequence of integer ids `l`."""
    return ''.join(int_to_char[i] for i in l)


# The whole corpus as one 1-D tensor of token ids.
data = torch.tensor(encode(text), dtype=torch.long)

In [6]:
# First 80% of the encoded corpus is used for training, the rest for validation.
n = int(0.8 * len(data))
train_data, val_data = data[:n], data[n:]

def get_batch(split):
    """Sample a random batch of (input, target) sequences.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair `(x, y)` of `(batch_size, block_size)` tensors on `device`,
        where `y` is `x` shifted one position to the right in the source data.
    """
    source = train_data if split == 'train' else val_data
    # Random start offsets; the upper bound leaves room for the shifted target.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    labels = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), labels.to(device)

In [7]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the training and validation splits.

    Puts `model` in eval mode, averages the loss over `eval_iters` random
    batches per split, then puts `model` back in train mode.

    Returns:
        A dict with keys 'train' and 'val', each a 0-dim tensor holding the
        mean loss for that split.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, labels = get_batch(split)
            _, batch_loss = model(inputs, labels)
            per_batch[step] = batch_loss.item()
        averages[split] = per_batch.mean()
    model.train()
    return averages

In [17]:
# B = batch size, T = sequence length (at most block_size), C = channel dimension (n_embd for embeddings, vocab_size for logits)
class Head(nn.Module):
    """A single head of causal (masked) scaled dot-product self-attention."""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones; stored as a buffer so it moves with
        # the module but is not a trainable parameter.
        lower_triangle = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', lower_triangle)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over `x` of shape (B, T, n_embd); returns (B, T, head_size)."""
        _, T, _ = x.shape
        q = self.query(x)  # (B, T, head_size)
        k = self.key(x)    # (B, T, head_size)
        v = self.value(x)  # (B, T, head_size)

        # Scale the dot products by 1/sqrt(head_size).
        scale = k.shape[-1] ** -0.5
        scores = (q @ k.transpose(-2, -1)) * scale  # (B, T, T)

        # Positions above the diagonal are in the future: mask them out so
        # they receive zero weight after the softmax.
        future = self.tril[:T, :T] == 0
        scores = scores.masked_fill(future, float('-inf'))

        weights = self.dropout(F.softmax(scores, dim=-1))
        return weights @ v  # (B, T, head_size)


class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, concatenated and projected."""

    def __init__(self, n_head, head_size):
        super().__init__()
        self.heads = nn.ModuleList(Head(head_size) for _ in range(n_head))
        # Maps the concatenated head outputs back to the embedding width.
        self.proj = nn.Linear(n_head * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to `x` and merge the results; returns (B, T, n_embd)."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)  # (B, T, head_size * n_head)
        projected = self.proj(merged)         # (B, T, n_embd)
        return self.dropout(projected)
        

class FeedForward(nn.Module):
    """Position-wise multilayer perceptron: expand 4x, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        expand = nn.Linear(n_embd, hidden)    # (B, T, 4*n_embd)
        contract = nn.Linear(hidden, n_embd)  # (B, T, n_embd)
        self.net = nn.Sequential(expand, nn.ReLU(), contract, nn.Dropout(dropout))

    def forward(self, x):
        """Run `x` through the MLP; the last dimension is unchanged."""
        out = self.net(x)
        return out
        

class Block(nn.Module):
    """Transformer block: self-attention then feed-forward, each followed by
    a residual add and a LayerNorm (normalisation applied after the add)."""

    def __init__(self, n_embd, n_head):
        super().__init__()
        # Splitting the width evenly means the concatenated heads have width
        # n_head * (n_embd // n_head), which equals n_embd when it divides evenly.
        per_head_width = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head_width)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply both sub-layers to `x`; the output has the same shape as `x`."""
        x = self.ln1(x + self.sa(x))
        return self.ln2(x + self.ffwd(x))
        

class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over integer token ids.

    Token embeddings and learned position embeddings are summed, passed
    through `n_layer` transformer blocks and a final LayerNorm, and projected
    to one logit per vocabulary entry at every position.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)]) # * unpacks a list into separate arguments
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights) # call _init_weights for every sub-module in the model

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            index: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            `(logits, loss)`. Without targets, logits is (B, T, C) and loss is
            None. With targets, logits is flattened to (B*T, C) and loss is a
            scalar tensor.
        """
        B, T = index.shape
        tok_emb = self.token_embedding_table(index) # (B, T, n_embd)
        # Build the position ids on the same device as the input rather than
        # on a module-level global, so the model works wherever it is moved.
        positions = torch.arange(T, device=index.device)
        pos_emb = self.position_embedding_table(positions) # (T, n_embd)
        x = tok_emb + pos_emb # (B, T, n_embd), pos_emb broadcasts over the batch
        x = self.blocks(x) # (B, T, n_embd)
        x = self.ln_f(x) # (B, T, n_embd)
        logits = self.lm_head(x) # (B, T, C)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, C) logits and (N,) class indices.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, index, max_new_tokens):
        """Sample `max_new_tokens` tokens autoregressively.

        Sampling runs without gradient tracking and with the model in eval
        mode (so dropout is disabled); the previous train/eval mode is
        restored before returning.

        Args:
            index: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append to each row.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled tokens.
        """
        # The position table has one row per usable position, so its size is
        # the longest context the model can be fed.
        context_len = self.position_embedding_table.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                index_cond = index[:, -context_len:] # crop to the last context_len tokens
                # Call the module (not .forward) so registered hooks still run.
                logits, _ = self(index_cond) # logits = (B, T, C), loss is None
                logits = logits[:, -1, :] # keep only the last position: (B, C)
                probs = F.softmax(logits, dim=-1) # (B, C)
                index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                index = torch.cat((index, index_next), dim=1) # (B, T + 1)
        finally:
            self.train(was_training)

        return index

# Build the model and move its parameters and buffers to the chosen device.
# `m` is a second name for the same module object.
model = GPTLanguageModel(vocab_size).to(device)
m = model

In [18]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# model.parameters() -> all trainable weights in the model
# Each parameter tensor has .data and .grad

# `step` (not `iter`) so the builtin iter() is not shadowed for later cells.
for step in range(max_iters):
    # Periodically report the averaged train/val loss.
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {step}, train loss {losses['train']:.3f}, val loss {losses['val']:.3f}")

    xb, yb = get_batch('train')

    # Call the module rather than .forward() so that hooks are honoured.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True) # parameter.grad is None
    loss.backward()
    optimizer.step()
# Loss of the final training batch only (a single noisy sample, not an average).
print(loss.item())

step: 0, train loss 4.449, val loss 4.454
step: 250, train loss 3.206, val loss 3.195
step: 500, train loss 3.188, val loss 3.210
step: 750, train loss 3.158, val loss 3.156
2.848365306854248


In [19]:
# Seed the generation with a single token of id 0, as a batch of one sequence.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Sample 500 further tokens, take the only row of the batch and decode it to text.
generated_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(generated_ids)
print(generated_chars)


ooastas d eeusw fsyfbietsooioOdiorg 
 rn  "' Ia rhl sd  i astnnfneePdiSrnr  srlil wgm rdte w_iice mu emhtC o a  ed,m l m a"  f yo Ospiaaat rIn t ,S sltsmB ey  o
f rn behf    rfiseip
hJwWntpae ewq's
 f o 
irOeeleoemtgmie ol
rnetef  hswfo   aaetotf 
sdnv"rc.oa, eeeeconle oh dcd doaqhae n  haad  bc emcnlacd ustgofhn dmaiees"ar ad d ef  oolete,ba 
n"bAos e'i"   bso "d"e d eirruoos atrnr,tgeero m e  hseodre i m
 okeo  anmAt
 nhlanaerooaWe
r d- t
gl
ntlmee liobt sna  ,e tp lehutnat"icnWh  e gr   ecob 
