# GPT2 dummy architecture

In [1]:
# Hyperparameters read by the model classes in this notebook.
# NOTE(review): "124M" presumably refers to the parameter count of the
# smallest GPT-2 variant — confirm.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # rows of the token embedding and width of the output head
    context_length=1024,   # rows of the positional embedding
    emb_dim=768,           # width of every embedding vector
    n_heads=12,            # not read by the code in this notebook so far; presumably attention heads
    n_layers=12,           # number of transformer blocks that get stacked
    drop_rate=0.1,         # value handed to the dropout layer
    qkv_bias=False,        # not read by the code in this notebook so far
)


In [2]:
import tiktoken
import torch

# GPT-2 encoding from tiktoken; reused by later cells.
tokenizer = tiktoken.get_encoding('gpt2')

# Two prompts; torch.stack below needs them to encode to equally long id lists.
lines = [
    "Every effort moves you",
    "Every day holds a",
]

# One row of token ids per prompt.
batch = torch.stack([torch.tensor(token_ids) for token_ids in map(tokenizer.encode, lines)])

print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [3]:
import torch
import torch.nn as nn


class DummyGPTModel(nn.Module):
    """Skeleton of a GPT-style model built from placeholder sub-modules.

    Data flow: token embedding + positional embedding -> dropout ->
    ``n_layers`` x ``DummyTransformerBlock`` -> ``DummyLayerNorm`` ->
    bias-free linear output head.

    Args:
        cfg: Configuration mapping. The keys read here are ``vocab_size``,
            ``context_length``, ``emb_dim``, ``drop_rate`` and ``n_layers``.
    """

    def __init__(self, cfg: dict[str, object]):
        super().__init__()

        # Embeddings (created first so the weight-init order, and therefore
        # the seeded random weights, stay the same)
        self.emb_layer = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_layer = nn.Embedding(cfg['context_length'], cfg['emb_dim'])
        self.dropout = nn.Dropout(cfg["drop_rate"])

        # Transformer blocks
        self.trf_blocks = nn.Sequential(*(DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])))

        # Final normalisation and projection back to vocabulary size
        self.final_norm = DummyLayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx: torch.Tensor) -> torch.Tensor:
        """Map token ids to per-position vocabulary logits.

        Args:
            in_idx: Token ids with exactly two dimensions ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, seq_len, vocab_size)``.

        Raises:
            ValueError: If ``in_idx`` does not have exactly two dimensions
                (raised by the shape unpacking).
        """
        # Only the sequence length is needed; the unpacking still enforces a 2-D input.
        _, seq_len = in_idx.shape

        tok_embeds = self.emb_layer(in_idx)
        # Positions 0..seq_len-1, created on the same device as the input ids.
        pos_embeds = self.pos_layer(torch.arange(seq_len, device=in_idx.device))

        # (batch, seq_len, emb_dim) + (seq_len, emb_dim) broadcasts over the batch.
        x = tok_embeds + pos_embeds
        x = self.dropout(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        return self.out_head(x)


class DummyTransformerBlock(nn.Module):
    """Stand-in for a transformer block that performs no computation.

    The constructor accepts ``cfg`` so it can be built like a configured
    block, but never reads it. ``forward`` hands back the very object it
    receives.
    """

    def __init__(self, cfg) -> None:
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged (same object, no copy)."""
        return x


class DummyLayerNorm(nn.Module):
    """Stand-in for a layer-normalisation module that performs no computation.

    ``normalized_shape`` and ``eps`` are accepted but neither is stored nor
    used. ``forward`` hands back the very object it receives.
    """

    def __init__(self, normalized_shape, eps=1e-5) -> None:
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged (same object, no copy)."""
        return x


# Seed first so the randomly initialised weights are reproducible.
torch.manual_seed(123)

# The model is left in its default mode here (no .eval() call).
gpt_dummy = DummyGPTModel(GPT_CONFIG_124M)
logits = gpt_dummy(in_idx=batch)

print(logits, logits.size())

tensor([[[-1.2034,  0.3201, -0.7130,  ..., -1.5548, -0.2390, -0.4667],
         [-0.1192,  0.4539, -0.4432,  ...,  0.2392,  1.3469,  1.2430],
         [ 0.5307,  1.6720, -0.4695,  ...,  1.1966,  0.0111,  0.5835],
         [ 0.0139,  1.6754, -0.3388,  ...,  1.1586, -0.0435, -1.0400]],

        [[-1.0908,  0.1798, -0.9484,  ..., -1.6047,  0.2439, -0.4530],
         [-0.7860,  0.5581, -0.0610,  ...,  0.4835, -0.0077,  1.6621],
         [ 0.3567,  1.2698, -0.6398,  ..., -0.0162, -0.1296,  0.3717],
         [-0.2407, -0.7349, -0.5102,  ...,  2.0057, -0.3694,  0.1814]]],
       grad_fn=<UnsafeViewBackward0>) torch.Size([2, 4, 50257])


Kurzformel für LayerNorm (pro Sample über die Feature-Dimension)

Gegeben ein Vektor $x \in \mathbb{R}^H$:
- Mittelwert: $\mu = \frac{1}{H}\sum_{i=1}^{H} x_i$
- Varianz: $\sigma^2 = \frac{1}{H}\sum_{i=1}^{H} (x_i - \mu)^2$ (unkorrigierte Varianz: Division durch $H$, nicht $H-1$; entspricht `unbiased=False` in PyTorch)
- Normalisierung: $\hat{x}_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \varepsilon}}$
- Optionale affine Transformation: $y_i = \gamma_i \hat{x}_i + \beta_i$

Hinweis:
- Bei Tensoren mit Shape [Batch, …, Features] werden $\mu$ und $\sigma^2$ für jede Zeile/Position über die letzte Dimension (Features) berechnet.
- $\varepsilon$ ist eine kleine Konstante für die numerische Stabilität; $\gamma,\beta \in \mathbb{R}^H$ sind lernbar.

In [4]:
# Layer normalisation written out by hand on a small random batch.
t = torch.randn(2, 5)

# Per-row statistics over the last dimension; keepdim=True keeps them
# broadcastable against t. unbiased=False divides by the feature count.
mu = torch.mean(t, dim=-1, keepdim=True)
var = torch.var(t, dim=-1, keepdim=True, unbiased=False)
t_hat = (t - mu) / (var + 1e-5).sqrt()

# Affine parameters start as the identity transform (scale 1, shift 0).
gamma = nn.Parameter(torch.ones(5))
beta = nn.Parameter(torch.zeros(5))
out = gamma * t_hat + beta

print(out)
print(torch.mean(out, dim=-1, keepdim=True))
print(torch.var(out, dim=-1, unbiased=False, keepdim=True))



tensor([[ 0.0455,  0.9326, -1.0036, -1.2364,  1.2619],
        [ 0.8868,  0.9411, -1.8051, -0.1976,  0.1748]], grad_fn=<AddBackward0>)
tensor([[ 0.0000e+00],
        [-2.3842e-08]], grad_fn=<MeanBackward1>)
tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [6]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale and shift.

    Each vector along the last dimension is normalised to zero mean and
    (up to ``eps``) unit variance, then transformed as
    ``scale * x_hat + shift``.

    Args:
        emb_dim: Size of the last dimension of the inputs; also the length
            of the ``scale`` and ``shift`` parameters.
        eps: Constant added to the variance before the square root so the
            division stays finite for zero-variance inputs. Defaults to
            ``1e-5``.
    """

    def __init__(self, emb_dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps                                    # Epsilon
        self.scale = nn.Parameter(torch.ones(emb_dim))    # Gamma
        self.shift = nn.Parameter(torch.zeros(emb_dim))   # Beta

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalise ``x`` over its last dimension and apply scale and shift.

        Args:
            x: Tensor whose last dimension has size ``emb_dim``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False: divide by the feature count, not by count - 1.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

# Run the module on the tensor `t` created in an earlier cell.
ln = LayerNorm(emb_dim=5)
out_ln = ln(t)

print(out_ln)
# Per-row statistics of the normalised output.
print("Mean:\n", torch.mean(out_ln, dim=-1, keepdim=True))
print("Variance:\n", torch.var(out_ln, dim=-1, unbiased=False, keepdim=True))



tensor([[ 0.0455,  0.9326, -1.0036, -1.2364,  1.2619],
        [ 0.8868,  0.9411, -1.8051, -0.1976,  0.1748]], grad_fn=<AddBackward0>)
Mean:
 tensor([[ 0.0000e+00],
        [-2.3842e-08]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [7]:
import tinygrad
import tinygrad.nn


class TinyLayerNorm:
    """tinygrad layer normalisation over the last axis with scale and shift.

    Each vector along the last axis is normalised to zero mean and (up to
    ``eps``) unit variance, then transformed as ``scale * x_hat + shift``.

    Args:
        emb_dim: Length of the ``scale`` and ``shift`` tensors.
        eps: Constant added to the variance before the square root.
            Defaults to ``1e-5``.
    """

    def __init__(self, emb_dim: int, eps: float = 1e-5):
        self.eps = eps                                                     # Epsilon
        self.scale = tinygrad.Tensor.ones(emb_dim, requires_grad=True)     # Gamma
        self.shift = tinygrad.Tensor.zeros(emb_dim, requires_grad=True)    # Beta

    def forward(self, x: tinygrad.Tensor) -> tinygrad.Tensor:
        """Normalise ``x`` over its last axis and apply scale and shift."""
        mean = x.mean(axis=-1, keepdim=True)
        # correction=0 divides by the element count (the previous
        # correction=False only worked because False == 0).
        var = x.var(axis=-1, keepdim=True, correction=0)
        norm_x = (x - mean) / (var + self.eps).sqrt()
        return self.scale * norm_x + self.shift

    def __call__(self, x: tinygrad.Tensor) -> tinygrad.Tensor:
        """Allow ``layer(x)`` in addition to ``layer.forward(x)``."""
        return self.forward(x)


class TinyGPTModel:
    """tinygrad counterpart of the dummy GPT skeleton.

    Data flow: token embedding + positional embedding -> dropout ->
    ``n_layers`` placeholder blocks -> ``TinyLayerNorm`` -> bias-free linear
    output head.

    Args:
        cfg: Configuration mapping. The keys read here are ``vocab_size``,
            ``context_length``, ``emb_dim``, ``drop_rate`` and ``n_layers``.
    """

    def __init__(self, cfg: dict[str, object]):
        # Embedding
        self.emb_layer = tinygrad.nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_layer = tinygrad.nn.Embedding(cfg['context_length'], cfg['emb_dim'])
        self.drop_rate = cfg["drop_rate"]

        # Transformer blocks: reuses the torch-based DummyTransformerBlock
        # placeholder, which returns its input unchanged.
        self.trf_blocks = [DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]

        # Final normalisation and projection back to vocabulary size
        self.final_norm = TinyLayerNorm(cfg["emb_dim"])
        self.out_head = tinygrad.nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx: tinygrad.Tensor) -> tinygrad.Tensor:
        """Map token ids to per-position vocabulary logits.

        Args:
            in_idx: Token ids with exactly two dimensions ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, seq_len, vocab_size)``.
        """
        # Only the sequence length is needed; the unpacking still enforces a 2-D input.
        _, seq_len = in_idx.shape

        tok_embeds = self.emb_layer(in_idx)
        pos_embeds = self.pos_layer(tinygrad.Tensor.arange(seq_len))

        # (batch, seq_len, emb_dim) + (seq_len, emb_dim) broadcasts over the batch.
        x = tok_embeds + pos_embeds
        # NOTE(review): tinygrad is understood to apply dropout only while
        # Tensor.training is enabled — confirm against the tinygrad docs.
        x = x.dropout(self.drop_rate)
        for block in self.trf_blocks:
            x = block(x)
        x = self.final_norm.forward(x)
        return self.out_head(x)

    def __call__(self, in_idx: tinygrad.Tensor) -> tinygrad.Tensor:
        """Allow ``model(in_idx)`` in addition to ``model.forward(in_idx)``."""
        return self.forward(in_idx)


# Encode the same prompts again, this time as tinygrad tensors.
tiny_batch = tinygrad.Tensor.stack(
    [tinygrad.Tensor(token_ids) for token_ids in map(tokenizer.encode, lines)]
)

# Seed before constructing the model so the random weights are reproducible.
tinygrad.Tensor.manual_seed(42)
tiny = TinyGPTModel(GPT_CONFIG_124M)
logits = tiny.forward(in_idx=tiny_batch)

print(logits.numpy(), logits.shape)

[[[-0.519526    0.7537327   0.2005472  ...  1.0668471  -1.5136696
   -0.33381417]
  [-0.34302607 -0.42789376 -0.25507692 ... -0.4442997  -0.13493417
    0.87539715]
  [-0.13321133  0.28134     0.40881827 ... -0.44780132 -0.11065756
    0.1352929 ]
  [ 0.10996917  1.0887383   0.5327068  ... -0.424065   -0.12587118
   -0.37713853]]

 [[-0.519526    0.7537327   0.2005472  ...  1.0668471  -1.5136696
   -0.33381417]
  [-0.02714658 -0.44134513 -0.16352235 ... -0.32524362 -0.22037949
    0.8112958 ]
  [-0.24537109  0.26612917  0.45883688 ... -0.4342     -0.00584923
   -0.17356916]
  [ 0.04278945  0.97872955  0.4680154  ... -0.31337032 -0.01501034
   -0.2654793 ]]] (2, 4, 50257)
