# BUILD & Train GPT-decoder-only-transformer.

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F


In [None]:
# ----------------------------------------------------------------------
# Hyperparameters
# Activations flow through the model with shape (B, T, C):
#   B = batch_size, T = sequence length (<= block_size), C = n_embd.
# ----------------------------------------------------------------------

# --- batch geometry ---
batch_size = 16  # B: independent training sequences per forward/backward pass
block_size = 32  # T: maximum context length, in tokens, used for a prediction
# block_size is the key capacity knob of a decoder-only model: every attention
# head builds a (block_size x block_size) causal mask, and the positional
# embedding table has shape (block_size, n_embd). GPT-2 small uses 1024.

# --- optimisation ---
max_iters = 5000      # number of gradient updates performed by the training loop
learning_rate = 1e-3  # above the ~3e-4 commonly used for Transformers; small models tolerate it

# --- evaluation ---
eval_interval = 100  # estimate train/val loss every `eval_interval` steps
eval_iters = 200     # batches averaged per split for each loss estimate

# --- hardware ---
# Tensors and the model are moved to the GPU when one is available.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# --- model size ---
# Width of every token / position embedding vector; with a vocabulary of V
# tokens the token embedding table is (V, n_embd).
n_embd = 64

''' Token embedding table → (vocab_size, 64)
		Positional embedding table → (block_size, 64)
		Linear projections in Q, K, V → nn.Linear(64, head_size)
		Multi-head concat → (B, T, 64)
		Feedforward network W1 → (64, 4×64 = 256)
		Feedforward network W2 → (256, 64)
    LayerNorm → operates over dimension 64
  '''

# Attention heads per transformer block; each head has width n_embd / n_head = 16.
n_head = 4

# Transformer blocks stacked one after another.
n_layer = 4
'''
The block:
Input
 → LayerNorm
 → MultiHeadAttention
 → Residual Add
 → LayerNorm
 → FeedForward
 → Residual Add
 → Output
'''

# Overall pipeline:
#   Embedding → Block → Block → Block → Block → LayerNorm → LM Head
# i.e. a 4-layer GPT-like decoder-only transformer.

# Dropout probability shared by every nn.Dropout layer in the model.
dropout = 0.0

# ------------
'''
Dropout used inside:
	•	Multi-head attention output projection
	•	FeedForward network output

Since you set it to 0, dropout is disabled.
'''

'\nDropout used inside:\n\t•\tMulti-head attention output projection\n\t•\tFeedForward network output\n\nSince you set it to 0, dropout is disabled.\n'

### You are training a GPT-2–style decoder-only transformer with:
	•	4 transformer layers
	•	4 attention heads per layer
	•	Embedding dim = 64
	•	Context length = 32
	•	Batch size = 16
	•	About ~0.21M (≈210k) parameters

### This is small enough to:
	•	train on CPU
	•	fit in 2–4GB VRAM
	•	converge in minutes

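#### But architecturally very close to GPT-2, just scaled down (it uses ReLU instead of GELU in the feed-forward layers and character-level tokens instead of BPE).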

In [None]:
import os
import urllib.request

# Fix the RNG seed so batch sampling and weight initialisation are reproducible.
torch.manual_seed(1337)

# Tiny Shakespeare corpus (about 1.1 MB of plain text).
DATA_URL = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
DATA_PATH = 'input.txt'

# Download only when the file is missing, using the standard library so the
# cell also runs outside IPython (a `! wget` shell escape is not valid Python
# and fetches a fresh copy every time the cell is re-run).
if not os.path.exists(DATA_PATH):
    partial_path = DATA_PATH + '.part'
    with urllib.request.urlopen(DATA_URL) as response, open(partial_path, 'wb') as out_file:
        out_file.write(response.read())
    # Rename only after the whole body was written, so an interrupted download
    # never leaves a truncated input.txt behind.
    os.replace(partial_path, DATA_PATH)

with open(DATA_PATH, 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(set(text))
vocab_size = len(chars)

# create a mapping from characters to integers, and back
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Encoder: take a string, output a list of integer token ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer token ids, output a string."""
    return ''.join([itos[i] for i in l])


# Train and validation splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# get random indices batches for block size
# data loading
def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        Tuple ``(x, y)`` of tensors moved to ``device``, each built by stacking
        ``batch_size`` windows of length ``block_size``. ``y`` is ``x`` shifted
        one position to the right, i.e. the next-token targets.
    """
    source = train_data if split == 'train' else val_data

    # Random window start offsets; the upper bound leaves room for the extra
    # token that the shifted target window needs.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs = []
    targets = []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])

    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

--2025-11-20 03:38:12--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-11-20 03:38:13 (21.2 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [None]:
# will be used in training.
@torch.no_grad()  # forward passes only: no autograd bookkeeping during evaluation
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    Switches the global ``model`` to eval mode, averages the loss over
    ``eval_iters`` random batches per split, then switches the model back to
    train mode.

    Returns:
        Dict with keys 'train' and 'val', each mapping to a 0-dim tensor
        holding the mean loss for that split.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        batch_losses = torch.tensor(
            [model(*get_batch(split))[1].item() for _ in range(eval_iters)]
        )
        averages[split] = batch_losses.mean()
    model.train()
    return averages


In [None]:
# Setting up a Head Class.
# we are structuring Head class in such a way that it can be reused to compute multiple heads, where each head has its own Wq, Wk, Wv.
# Each head object can run in parallel to learn multiple attention patterns in parallel

class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Projects the input into query, key and value spaces of width ``head_size``,
    computes scaled dot-product attention in which every position may attend
    only to itself and to earlier positions, and returns the attention-weighted
    sum of the values. Several heads are combined by ``MultiHeadAttention``.

    Reads the module-level hyperparameters ``n_embd`` (input width),
    ``block_size`` (side length of the causal mask) and ``dropout``
    (probability applied to the attention weights).

    Args:
        head_size: Width of the query/key/value projections and of the output.

    Shapes (B = batch, T = sequence length, C = n_embd):
        x    : (B, T, C)
        q, k : (B, T, head_size)
        v    : (B, T, head_size)
        wei  : (B, T, T)
        out  : (B, T, head_size)
    """

    def __init__(self, head_size):
        super().__init__()
        # Each head owns its projection matrices (no bias terms):
        # (B, T, n_embd) @ (n_embd, head_size) -> (B, T, head_size)
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular causal mask. Registered as a buffer so it follows
        # the module across devices without becoming a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        # Dropout on the attention weights (regularisation).
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (B, T, C).

        Returns:
            Tensor of shape (B, T, head_size).
        """
        _, T, _ = x.shape
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)

        # Attention scores ("affinities"), scaled by 1/sqrt(d_k) where d_k is
        # the key/query width (head_size), not the embedding width of x.
        # (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
        # Hide future positions so that softmax assigns them zero weight.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)

        # Weighted aggregation of the values.
        v = self.value(x)  # (B, T, head_size)
        out = wei @ v      # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
        return out


In [None]:
# multi-head-attention class: This class essentially does: Consumed in the transformer block. Why does this also inherit the nn module ?
# it contains learnable layers (multiple Head objects + linear layer), it needs to register parameters in PyTorch’s graph, it needs .to(device), .parameters(), .train(), etc.
# Every neural component → must be a module.

class MultiHeadAttention(nn.Module):
    """Several ``Head`` modules applied to the same input, then recombined.

    Every head is applied to the same input and yields (B, T, head_size). The
    head outputs are concatenated along the last dimension and mapped to width
    ``n_embd`` by a learned output projection, followed by dropout.

    Reads the module-level hyperparameters ``n_embd`` and ``dropout``.

    Args:
        num_heads: Number of attention heads.
        head_size: Output width of each individual head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        # nn.ModuleList (not a plain Python list) so that the heads' parameters
        # are registered with this module and therefore get trained.
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # Output projection. Its input width is the concatenated head width,
        # which equals n_embd only when n_embd is divisible by num_heads;
        # sizing it from the heads keeps the layer valid in either case.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run all heads on ``x`` and project the concatenated result.

        Returns:
            Tensor of shape (B, T, n_embd).
        """
        # (B, T, head_size) per head -> (B, T, num_heads * head_size)
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


In [None]:
# having feed-forward as an class.
# Attention handles communication between tokens.
# The FFN then handles computation at each token separately.
class FeedFoward(nn.Module):
    """Position-wise feed-forward network used inside each transformer block.

    The layers, applied to every token independently, are:

        x1  = x @ W1 + b1     (B, T, 4 * n_embd)   expansion
        x2  = relu(x1)        (B, T, 4 * n_embd)   non-linearity
        x3  = x2 @ W2 + b2    (B, T, n_embd)       projection back down
        out = dropout(x3)     (B, T, n_embd)

    The dropout probability is read from the module-level ``dropout``.

    Args:
        n_embd: Width of the input and output vectors.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        expand = nn.Linear(n_embd, hidden_width)    # W1: (n_embd, 4 * n_embd), b1: (4 * n_embd)
        activation = nn.ReLU()                      # lets the model learn non-linear functions
        contract = nn.Linear(hidden_width, n_embd)  # back to the embedding width
        regularise = nn.Dropout(dropout)
        self.net = nn.Sequential(expand, activation, contract, regularise)

    def forward(self, x):
        """Apply the feed-forward stack to ``x`` and return the result."""
        return self.net(x)

## One full Transformer decoder block, using the same pre-LayerNorm residual layout as GPT-2

In [None]:
class Block(nn.Module):
    """Transformer block: communication (attention) followed by computation (MLP).

    Both sub-layers are applied to a layer-normalised copy of their input and
    their result is added back onto the un-normalised input (pre-LayerNorm
    residual layout).

    Args:
        n_embd: Embedding width of the block's input and output.
        n_head: Number of attention heads; each head gets ``n_embd // n_head``
            dimensions (64 // 4 = 16 with the notebook's settings).
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        per_head_width = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head_width)  # token-to-token mixing
        self.ffwd = FeedFoward(n_embd)                        # per-token MLP
        self.ln1 = nn.LayerNorm(n_embd)  # normalises the input of the attention sub-layer
        self.ln2 = nn.LayerNorm(n_embd)  # normalises the input of the feed-forward sub-layer

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers, each with a residual add."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

"""
Transformer Block Forward Pass

Input:
    x : (B, T, n_embd)


The block performs two major operations:
1) Communication across tokens (Self-Attention)
2) Computation at each token (Feed-Forward Network)

Both operations use residual connections and layer norms.

===
Step-by-step:

1. a1 = ln1(x)
   - Apply LayerNorm to x.
   - Shape: (B, T, n_embd)

2. sa_out = sa(a1)
   - Run Multi-Head Self-Attention on the normalized input.
   - This mixes information across tokens.
   - Shape: (B, T, n_embd)

3. x = x + sa_out
   - First residual connection: the attention output is added back onto the un-normalized input (pre-LN layout, so no norm follows the add).
   - Shape: (B, T, n_embd)

4. a2 = ln2(x)
   - Apply LayerNorm again before feedforward.
   - Shape: (B, T, n_embd)

5. ff_out = ffwd(a2)
   - Run the 2-layer feedforward MLP:
        (n_embd → 4*n_embd → n_embd)
   - Applied independently to each token.
   - Shape: (B, T, n_embd)

6. x = x + ff_out
   - Second residual connection.
   - Shape: (B, T, n_embd)

Output:
    x : (B, T, n_embd)


          ┌─────────────────┐
x ── LN ─►│ MultiHead-Attention  │───┐
          └─────────────────┘   │
                                ▼
                    Residual Add (+)

          ┌─────────────────┐
x ── LN ─►│ FeedForward MLP │───┐
          └─────────────────┘   │
                                ▼
                    Residual Add (+)
"""

'\nTransformer Block Forward Pass \n\nInput:\n    x : (B, T, n_embd)\n      \n\nThe block performs two major operations:\n1) Communication across tokens (Self-Attention)\n2) Computation at each token (Feed-Forward Network)\n\nBoth operations use residual connections and layer norms.\n\n===\nStep-by-step:\n\n1. a1 = ln1(x)\n   - Apply LayerNorm to x.\n   - Shape: (B, T, n_embd)\n\n2. sa_out = sa(a1)\n   - Run Multi-Head Self-Attention on the normalized input.\n   - This mixes information across tokens.\n   - Shape: (B, T, n_embd)\n\n3. x = x + sa_out\n   - First residual connection ("Add & Norm").\n   - Shape: (B, T, n_embd)\n\n4. a2 = ln2(x)\n   - Apply LayerNorm again before feedforward.\n   - Shape: (B, T, n_embd)\n\n5. ff_out = ffwd(a2)\n   - Run the 2-layer feedforward MLP:\n        (n_embd → 4*n_embd → n_embd)\n   - Applied independently to each token.\n   - Shape: (B, T, n_embd)\n\n6. x = x + ff_out\n   - Second residual connection.\n   - Shape: (B, T, n_embd)\n\nOutput:\n    x :

In [None]:
# Decoder-only Transformer (GPT-style) language model; the class keeps the name BigramLanguageModel.
class BigramLanguageModel(nn.Module):
    """GPT-style decoder-only Transformer language model.

    Pipeline: token embedding + position embedding -> ``n_layer`` transformer
    ``Block``s -> final LayerNorm -> linear head producing next-token logits.

    The constructor takes no arguments and reads the module-level
    hyperparameters ``vocab_size``, ``n_embd``, ``block_size``, ``n_head`` and
    ``n_layer``.
    """

    def __init__(self):
        super().__init__()
        # Trainable lookup table: token id -> vector of width n_embd.
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # Trainable lookup table: position 0..block_size-1 -> vector of width n_embd.
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # Each block maps (B, T, C) -> (B, T, C), so they can be chained.
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        # Final layer norm applied after the last block, before the output head.
        self.ln_f = nn.LayerNorm(n_embd)
        # Projects each hidden vector (n_embd) to logits over the vocabulary.
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: Optional (B, T) integer tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, vocab_size) and ``loss`` is None. With targets, ``logits``
            is flattened to (B*T, vocab_size) and ``loss`` is the
            cross-entropy over all B*T positions.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        # Position indices 0..T-1, created on the same device as the input so
        # the model works wherever it has been moved, independent of any
        # module-level device setting.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (T, C)

        x = tok_emb + pos_emb     # (T, C) broadcasts over the batch -> (B, T, C)
        x = self.blocks(x)        # (B, T, C)
        x = self.ln_f(x)          # (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, classes) logits and (N,) targets.
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building autograd graphs
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: Number of tokens to append.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled tokens.
        """
        # The position table only has entries for this many positions, so the
        # model can never be fed a longer window. Reading the limit from the
        # table keeps generation consistent with how this instance was built.
        context_len = self.position_embedding_table.num_embeddings
        for _ in range(max_new_tokens):
            # Keep only the most recent tokens that fit into the context window.
            idx_cond = idx[:, -context_len:]
            logits, _ = self(idx_cond)  # (B, T_cond, vocab_size)
            # Only the last time step predicts the next token.
            logits = logits[:, -1, :]  # (B, vocab_size)
            probs = F.softmax(logits, dim=-1)  # (B, vocab_size)
            # Sample rather than take the argmax, so the output is stochastic.
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

In [None]:
# Build the language model and move its parameters to the selected device.
# `m` and `model` refer to the same module object.
model = BigramLanguageModel()
m = model.to(device)

# Report the parameter count, in millions.
print(sum(weight.numel() for weight in m.parameters()) / 1e6, 'M parameters')

# AdamW: Adam with decoupled weight decay, over all model parameters.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)


0.209729 M parameters


In [None]:
# Training loop: `max_iters` gradient updates with periodic loss reports.
# The loop variable is named `step` rather than `iter` so the builtin `iter()`
# stays usable in later cells.
for step in range(max_iters):

    # every once in a while (and on the final step) evaluate the loss on the
    # train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then clear old gradients, backpropagate and update
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


step 0: train loss 4.4116, val loss 4.4022
step 100: train loss 2.6568, val loss 2.6670
step 200: train loss 2.5090, val loss 2.5058
step 300: train loss 2.4194, val loss 2.4334
step 400: train loss 2.3501, val loss 2.3568
step 500: train loss 2.2963, val loss 2.3129
step 600: train loss 2.2410, val loss 2.2501
step 700: train loss 2.2057, val loss 2.2191
step 800: train loss 2.1633, val loss 2.1860
step 900: train loss 2.1242, val loss 2.1498
step 1000: train loss 2.1027, val loss 2.1298
step 1100: train loss 2.0692, val loss 2.1183
step 1200: train loss 2.0386, val loss 2.0797
step 1300: train loss 2.0276, val loss 2.0652
step 1400: train loss 1.9925, val loss 2.0370
step 1500: train loss 1.9702, val loss 2.0302
step 1600: train loss 1.9645, val loss 2.0487
step 1700: train loss 1.9421, val loss 2.0143
step 1800: train loss 1.9091, val loss 1.9953
step 1900: train loss 1.9085, val loss 1.9874
step 2000: train loss 1.8861, val loss 1.9957
step 2100: train loss 1.8731, val loss 1.9765


In [None]:
# generate from the model
# Start from a single token with id 0, sample 2000 further tokens, and print
# the decoded text of the first (only) sequence in the batch.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = m.generate(context, max_new_tokens=2000)
print(decode(generated[0].tolist()))



FlY BOLINGHARD:
Nay, humbract; it contes too
must encleming and the second; and say life;
In enter all I are and those it;
Give out of your I'll tom them nither,
One these is news it cy rege;
What Naying well and Burryres an fear?

OXITVOHN MONFIUS:
O is my mily.

LEONTES:
Geve worman:
But guontt not; do spost I vour have well;
Not and go the rivisher's become,
And alight, upon Crame be with the On man.

Roman:
What I would and Capolicioual;
And wife must he awour,
Butcousins the solle with he twomment. Gefore hild you sure
That state my not.

DUKE OF YORK:
My surnt not I have too gentle men
Comily comport's that him; I cannot this your
house. But as bathol! and now your and;
Which-suppy will to coursein to shall her spersend,
That you holk all gentled to plartes no mune in en slaicsion,
But
Thmal, but terruly friend
Ristom with the rigess and wilt tentry:
I dry that kisspy guase, we mine! crut while with up,
I som fries that neish he pray, if,
Thom the hre seinged fleby devir begom a