In [1]:
# Hyperparameters consumed by build_gpt_model (the "124M" configuration).
GPT_CONFIG_124M = {
    "vocab_size": 50_257,     # rows in the token embedding / width of the output logits
    "context_length": 1_024,  # rows in the positional embedding and size of the causal mask
    "emb_dim": 768,           # width of every token representation
    "n_heads": 12,            # attention heads per transformer block (must divide emb_dim)
    "n_layers": 12,           # number of stacked transformer blocks
    "drop_rate": 0.0,         # dropout probability (0.0 disables dropout)
    "qkv_bias": False,        # whether the query/key/value projections carry a bias term
}

In [7]:
import torch
import torch.nn as nn


def build_gpt_model(cfg):
    """Build a GPT-style, decoder-only transformer from a configuration dict.

    All building blocks (layer norm, GELU, feed-forward, causal multi-head
    attention, transformer block) are defined locally, so every call creates
    fresh class objects as well as a fresh model.

    Args:
        cfg: Mapping providing the keys ``vocab_size``, ``context_length``,
            ``emb_dim``, ``n_heads``, ``n_layers``, ``drop_rate`` and
            ``qkv_bias``.

    Returns:
        A newly initialised ``GPTModel`` (an ``nn.Module``). Its forward pass
        takes token ids of shape (batch, seq_len) and returns logits of shape
        (batch, seq_len, vocab_size).

    Raises:
        KeyError: If ``cfg`` is missing one of the keys listed above.
        AssertionError: If ``emb_dim`` is not divisible by ``n_heads``.
    """
    # ---- Submodules ----

    class LayerNorm(nn.Module):
        """Layer normalisation over the last dimension with learnable scale/shift."""

        def __init__(self, emb_dim):
            super().__init__()
            self.eps = 1e-5  # keeps the division stable when the variance is ~0
            self.scale = nn.Parameter(torch.ones(emb_dim))
            self.shift = nn.Parameter(torch.zeros(emb_dim))

        def forward(self, x):
            mean = x.mean(dim=-1, keepdim=True)
            # Biased (population) variance: divide by N, not N - 1.
            var = x.var(dim=-1, keepdim=True, unbiased=False)
            norm_x = (x - mean) / torch.sqrt(var + self.eps)
            return self.scale * norm_x + self.shift

    class GELU(nn.Module):
        """GELU activation, computed with the tanh approximation."""

        def forward(self, x):
            return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * x.pow(3))))

    class FeedForward(nn.Module):
        """Position-wise MLP: expand to 4 * emb_dim, apply GELU, project back."""

        def __init__(self, cfg):
            super().__init__()
            self.layers = nn.Sequential(
                nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
                GELU(),
                nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
            )

        def forward(self, x):
            return self.layers(x)

    class MultiHeadAttention(nn.Module):
        """Causal multi-head self-attention."""

        def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
            super().__init__()
            assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
            self.num_heads = num_heads
            self.head_dim = d_out // num_heads
            self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
            self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
            self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
            self.out_proj = nn.Linear(d_out, d_out)
            self.dropout = nn.Dropout(dropout)
            # Ones strictly above the diagonal mark the "future" positions that
            # each query must not attend to.
            self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

        def forward(self, x):
            b, num_tokens, _ = x.shape
            # Project, then split the feature dimension into heads:
            # (b, num_tokens, d_out) -> (b, num_heads, num_tokens, head_dim)
            q = self.W_query(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
            k = self.W_key(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
            v = self.W_value(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
            # Scaled dot-product scores: (b, num_heads, num_tokens, num_tokens)
            attn_scores = q @ k.transpose(2, 3) / self.head_dim ** 0.5
            # -inf scores become zero weight after the softmax.
            mask = self.mask[:num_tokens, :num_tokens].bool()
            attn_scores = attn_scores.masked_fill(mask, float("-inf"))
            attn_weights = torch.softmax(attn_scores, dim=-1)
            attn_weights = self.dropout(attn_weights)
            # Merge the heads back: (b, num_tokens, num_heads * head_dim)
            context = (attn_weights @ v).transpose(1, 2).contiguous().view(b, num_tokens, -1)
            return self.out_proj(context)

    class TransformerBlock(nn.Module):
        """Pre-norm transformer block: attention and feed-forward, each with a residual."""

        def __init__(self, cfg):
            super().__init__()
            self.att = MultiHeadAttention(
                d_in=cfg["emb_dim"],
                d_out=cfg["emb_dim"],
                context_length=cfg["context_length"],
                num_heads=cfg["n_heads"],
                dropout=cfg["drop_rate"],
                qkv_bias=cfg["qkv_bias"]
            )
            self.ff = FeedForward(cfg)
            self.norm1 = LayerNorm(cfg["emb_dim"])
            self.norm2 = LayerNorm(cfg["emb_dim"])
            self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

        def forward(self, x):
            # Normalise first, then apply the sub-layer, then add the shortcut.
            x = x + self.drop_shortcut(self.att(self.norm1(x)))
            x = x + self.drop_shortcut(self.ff(self.norm2(x)))
            return x

    # ---- GPT Model ----

    class GPTModel(nn.Module):
        """Token + positional embeddings, a stack of transformer blocks, and a logit head."""

        def __init__(self, cfg):
            super().__init__()
            self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
            self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
            self.drop_emb = nn.Dropout(cfg["drop_rate"])

            self.trf_blocks = nn.Sequential(
                *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
            )

            self.final_norm = LayerNorm(cfg["emb_dim"])
            self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

        def forward(self, in_idx):
            """Map token ids (batch, seq_len) to logits (batch, seq_len, vocab_size).

            Raises:
                IndexError: If ``seq_len`` exceeds the configured context length.
            """
            _, seq_len = in_idx.shape
            # The positional table has exactly context_length rows. Without this
            # check an over-long input fails inside the embedding lookup with an
            # opaque "index out of range" error. IndexError is kept as the
            # exception type because that is what the lookup itself raised on CPU.
            max_len = self.pos_emb.num_embeddings
            if seq_len > max_len:
                raise IndexError(
                    f"Input sequence length {seq_len} exceeds the model's "
                    f"context_length {max_len}; crop the input to the last "
                    f"{max_len} tokens."
                )
            tok_embeds = self.tok_emb(in_idx)
            pos_ids = torch.arange(seq_len, device=in_idx.device)
            pos_embeds = self.pos_emb(pos_ids)
            # pos_embeds is (seq_len, emb_dim) and broadcasts over the batch dimension.
            x = self.drop_emb(tok_embeds + pos_embeds)
            x = self.trf_blocks(x)
            x = self.final_norm(x)
            return self.out_head(x)

    return GPTModel(cfg)


In [9]:
import torch
import tiktoken

# GPT-2 byte-pair-encoding tokenizer; later cells reuse it for encoding/decoding.
tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each text to a 1-D tensor of token ids and stack them along a new
# leading batch dimension. torch.stack needs equal-length sequences.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [13]:
# Seed before building the model so the random weight initialisation (and the
# random batch drawn afterwards) are reproducible.
torch.manual_seed(123)
model = build_gpt_model(GPT_CONFIG_124M)

# Two random token sequences, each as long as the full context window.
batch = torch.randint(
    low=0,
    high=GPT_CONFIG_124M["vocab_size"],
    size=(2, GPT_CONFIG_124M["context_length"]),
)

# Forward pass: one logit vector per input position.
out = model(batch)

print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)  # Expected: (2, context_length, vocab_size) = (2, 1024, 50257)
print(out)

Input batch:
 tensor([[25371, 42188, 47556,  ..., 24530, 23973, 42738],
        [10319, 13677, 15405,  ..., 46829, 43675, 36998]])

Output shape: torch.Size([2, 1024, 50257])
tensor([[[-0.1979, -0.9429, -0.1899,  ...,  0.5057,  0.0779, -0.8494],
         [ 0.1766, -0.6579,  0.3999,  ..., -0.8292,  0.3827, -0.5175],
         [ 2.0439, -0.7966, -0.7330,  ...,  0.0813, -0.1084, -1.0077],
         ...,
         [ 0.6889,  0.5231,  0.2079,  ...,  0.3870, -0.9431,  0.1925],
         [-0.1019, -0.6580, -1.0914,  ..., -0.1600, -0.4862,  0.8976],
         [-0.3582,  0.1140, -0.3821,  ...,  0.0150, -1.5644,  0.7577]],

        [[ 0.0855, -0.2615,  0.0312,  ...,  0.7823, -0.1748, -1.0757],
         [ 0.6213, -0.4445,  0.9343,  ..., -0.5286, -0.1273, -0.4402],
         [ 0.9910, -0.6334,  0.0214,  ..., -0.1111, -0.6266, -0.2978],
         ...,
         [ 0.1882, -0.0144,  0.0783,  ...,  0.3538,  0.3164, -0.0981],
         [-0.5850, -0.7166, -0.4520,  ...,  0.2429, -1.0128,  0.7518],
         [ 0.2

In [15]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Extend a batch of token sequences by greedy decoding.

    At every step the model sees at most the last ``context_size`` tokens, and
    the highest-probability token at the final position is appended.

    Args:
        model: Callable mapping token ids of shape (batch, n_tokens) to logits
            of shape (batch, n_tokens, vocab_size). It is called under
            ``torch.no_grad()``; switching it to eval mode is left to the caller.
        idx: Tensor of token ids with shape (batch, n_tokens).
        max_new_tokens: Number of tokens to append to each sequence.
        context_size: Maximum number of trailing tokens passed to the model.

    Returns:
        Tensor of shape (batch, n_tokens + max_new_tokens) holding the original
        ids followed by the generated ones.
    """
    for _step in range(max_new_tokens):
        # Keep only the most recent tokens the model can handle.
        window = idx[:, -context_size:]

        # Inference only; no autograd graph is needed.
        with torch.no_grad():
            all_logits = model(window)

        # Only the final position predicts the next token:
        # (batch, n_tokens, vocab_size) -> (batch, vocab_size)
        last_logits = all_logits[:, -1, :]

        # Convert to probabilities and pick the most likely entry per sequence.
        probs = last_logits.softmax(dim=-1)               # (batch, vocab_size)
        next_token = probs.argmax(dim=-1, keepdim=True)   # (batch, 1)

        # Grow the running sequence by one column.
        idx = torch.cat([idx, next_token], dim=1)         # (batch, n_tokens + 1)

    return idx

In [17]:
# Switch the model to evaluation mode before generating text. This disables the
# dropout layers (a no-op while drop_rate is 0.0, but correct in general).
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_feature

In [19]:
start_context = "Hello, I am"

# Text -> list of token ids.
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)

# Add a leading batch dimension so the shape is (1, n_tokens).
encoded_tensor = torch.tensor(encoded)[None, :]
print("encoded_tensor.shape:", encoded_tensor.shape)

encoded: [15496, 11, 314, 716]
encoded_tensor.shape: torch.Size([1, 4])


In [21]:
# Greedily append six tokens to the encoded prompt; the model's full context
# window is used as the cropping limit.
out = generate_text_simple(
    model=model,
    idx=encoded_tensor,
    max_new_tokens=6,
    context_size=GPT_CONFIG_124M["context_length"],
)

print("Output:", out)
# Number of tokens in the (single) generated sequence: prompt + new tokens.
print("Output length:", out.shape[1])

Output: tensor([[15496,    11,   314,   716, 27018, 24086, 47843, 30961, 42348,  7267]])
Output length: 10


In [23]:
# Drop the batch dimension, convert the token ids to a plain Python list, and
# decode them back into text. No trained weights are loaded in the cells shown
# here, so the continuation is expected to be incoherent.
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

Hello, I am Featureiman Byeswickattribute argue
