## Implementing a GPT model from scratch to generate text

### 1) Coding an LLM architecture

In [1]:
# Hyperparameters consumed by the model classes defined in this notebook.
GPT_CONFIG_124M = {
    # Rows of the token-embedding table and width of the output head;
    # equals the n_vocab value printed for the o200k_base tokenizer.
    "vocab_size": 200019,
    # Rows of the positional-embedding table (one per token position).
    "context_length": 1024,
    # Width of every token / positional embedding vector.
    "emb_dim": 768,
    # Presumably the number of attention heads -- not read by the code
    # visible here; TODO confirm against the attention implementation.
    "n_heads": 12,
    # Number of transformer blocks stacked in the model.
    "n_layers": 12,
    # Dropout probability applied to the summed embeddings.
    "drop_rate": 0.1,
    # Presumably toggles the bias on query/key/value projections -- not
    # read by the code visible here; TODO confirm.
    "qkv_bias": False,
}

In [2]:
import torch
import torch.nn as nn

class DummyGPTModel(nn.Module):
    """Skeleton GPT-style model wired together from placeholder components.

    The data flow is: token embedding + positional embedding -> dropout ->
    stack of ``DummyTransformerBlock`` -> ``DummyLayerNorm`` -> linear output
    head producing one logit per vocabulary entry.
    """

    def __init__(self, cfg):
        """Create the layers described by ``cfg``.

        Args:
            cfg: Mapping providing "vocab_size", "context_length", "emb_dim",
                "drop_rate" and "n_layers". The whole mapping is also handed
                to every ``DummyTransformerBlock``.
        """
        super().__init__()
        self.token_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )

        self.final_norm = DummyLayerNorm(cfg["emb_dim"])
        # No bias on the head: it only projects embeddings to vocabulary logits.
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        """Compute vocabulary logits for a batch of token ids.

        Args:
            in_idx: 2-D integer tensor of token ids, ``(batch, seq_len)``.

        Returns:
            Tensor of logits with shape ``(batch, seq_len, vocab_size)``.

        Raises:
            IndexError: If ``seq_len`` exceeds the number of positions in the
                positional-embedding table.
        """
        # Only the sequence length is needed; the batch size is implied.
        _, seq_len = in_idx.shape

        # Fail early with a readable message instead of an opaque
        # out-of-range error from inside the embedding lookup.
        max_positions = self.pos_emb.num_embeddings
        if seq_len > max_positions:
            raise IndexError(
                f"sequence length {seq_len} exceeds the supported context "
                f"length {max_positions}"
            )

        tok_embeds = self.token_emb(in_idx)
        # Positions 0..seq_len-1, created on the same device as the input.
        positions = torch.arange(seq_len, device=in_idx.device)
        pos_embeds = self.pos_emb(positions)

        # pos_embeds is (seq_len, emb_dim) and broadcasts over the batch axis.
        x = self.drop_emb(tok_embeds + pos_embeds)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        return self.out_head(x)


class DummyTransformerBlock(nn.Module):
    """Stand-in for a transformer block that passes its input through."""

    def __init__(self, cfg):
        """Initialise the module; ``cfg`` is accepted but not used."""
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged."""
        return x


class DummyLayerNorm(nn.Module):
    """Stand-in for layer normalisation that passes its input through."""

    def __init__(self, normalized_shape, eps=1e-5):
        """Initialise the module; both arguments are accepted but not used."""
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged."""
        return x

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [3]:
import tiktoken

# Tokenizer backed by the "o200k_base" encoding.
tokenizer = tiktoken.get_encoding("o200k_base")

txt1 = "Pierwszy dzień wiosny jest"
txt2 = "Kazdy dzień jest bardzo dobry na"

# Encode each prompt to a 1-D tensor of token ids and stack them into a
# (2, num_tokens) batch. torch.stack needs both prompts to encode to the
# same number of tokens.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)
# Check the number of tokens in the tokenizer's vocabulary.
print(tokenizer.n_vocab)

tensor([[152687,   8811,   3705, 155653,    286,   2453,   3008,  12637],
        [197105,   7593, 155653,  12637,  49755,  25148,   1102,    898]])
200019


In [4]:
# Seed the RNG before building the model so the randomly initialised
# weights are reproducible.
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)

# Forward pass over the tokenised batch: one logit per vocabulary entry
# for every token position.
logits = model(batch)
print("Output shape:", logits.shape)
print(logits)

Output shape: torch.Size([2, 8, 200019])
tensor([[[-7.9670e-01, -1.5191e+00, -2.3169e+00,  ...,  1.8177e-01,
           7.0826e-01, -4.7331e-01],
         [ 6.9835e-01,  6.0731e-01,  3.3680e-01,  ...,  1.7759e+00,
          -5.1157e-01, -6.6683e-01],
         [-2.8688e-01, -7.9203e-01,  3.5718e-02,  ...,  1.1765e+00,
           7.8765e-01, -2.7011e-01],
         ...,
         [-1.1749e+00, -1.0568e+00,  8.8069e-01,  ...,  2.0276e-01,
           9.7955e-01,  6.6893e-01],
         [-1.0532e+00,  1.8222e-01,  2.5785e-01,  ..., -3.7386e+00,
          -1.0138e+00, -1.7017e+00],
         [ 5.6126e-01,  4.8769e-01, -8.2236e-01,  ...,  3.1312e-02,
          -4.7627e-01, -3.1223e-01]],

        [[-2.2843e-01, -4.6201e-01, -1.6914e+00,  ...,  1.1486e+00,
          -3.3998e-01, -4.6595e-02],
         [ 1.0330e-01,  2.3047e-01, -1.7211e-01,  ...,  1.0148e-01,
          -1.2830e+00, -9.1505e-02],
         [ 3.5997e-01, -2.3105e+00, -1.6541e-01,  ...,  1.3573e+00,
           6.0159e-01, -6.1696e-01]

### 2) Normalizing activations with layer normalization

In [5]:
# Seed the RNG so the sampled example below is reproducible.
torch.manual_seed(123)

# Two example rows with five features each, drawn from a standard normal.
batch_example = torch.randn(2, 5)
# Bare expression: displays the tensor as the notebook cell's output.
batch_example

tensor([[-0.1115,  0.1204, -0.3696, -0.2404, -1.1969],
        [ 0.2093, -0.9724, -0.7550,  0.3239, -0.1085]])

In [6]:
# Small example layer: linear projection from 5 to 6 features, then ReLU.
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())
# Activations for the two example rows; ReLU clamps negatives to zero.
out = layer(batch_example)
# Bare expression: displays the tensor as the notebook cell's output.
out

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)

In [20]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale/shift.

    Each vector along the last axis is shifted to zero mean and scaled to
    unit (population) variance, then multiplied by ``scale`` and offset by
    ``shift``.
    """

    def __init__(self, emb_dim, eps=1e-5):
        """Create the learnable parameters.

        Args:
            emb_dim: Size of the last dimension of the inputs to normalise.
            eps: Small constant added to the variance before the square
                root to avoid division by zero.
        """
        super().__init__()
        self.eps = eps
        # Initialised to the identity transform: scale of 1, shift of 0.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalise ``x`` along its last dimension.

        Args:
            x: Tensor whose last dimension has size ``emb_dim``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # Population variance (divide by n, no Bessel correction).
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [21]:
# Normalise the 6-feature activations from the example layer.
ln = LayerNorm(6)
outputs_normed = ln(out)
# Bare expression: displays the tensor as the notebook cell's output.
outputs_normed

tensor([[ 0.6745,  1.5470, -0.9549,  0.6431, -0.9549, -0.9549],
        [-0.0207,  0.1228, -1.1913,  1.6619,  0.6186, -1.1913]],
       grad_fn=<AddBackward0>)

In [22]:
# Sanity check of the normalised rows: the mean should be ~0.
# NOTE: Tensor.var applies Bessel's correction by default, while LayerNorm
# normalises with unbiased=False, so the variance shown here is about
# n / (n - 1) = 6 / 5 = 1.2 rather than 1. Pass unbiased=False to see ~1.
outputs_normed.mean(dim=-1, keepdim=True), outputs_normed.var(dim=-1, keepdim=True)

(tensor([[ 0.0000e+00],
         [-1.9868e-08]], grad_fn=<MeanBackward1>),
 tensor([[1.1994],
         [1.1996]], grad_fn=<VarBackward0>))