In [1]:
from importlib.metadata import version

# Report the installed version of each third-party package named below.
for pkg_name in ("matplotlib", "torch", "tiktoken"):
    print(f"{pkg_name} version:", version(pkg_name))

matplotlib version: 3.10.8
torch version: 2.5.1
tiktoken version: 0.12.0


### In the next chapter I will implement GPT from scratch, so here I only complete the building blocks of the transformer architecture

In [2]:
# Hyperparameters for the 124M-parameter GPT-2 configuration.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # size of the token vocabulary
    context_length=1024,  # maximum number of input tokens
    emb_dim=768,          # dimensionality of each embedding vector
    n_heads=12,           # number of attention heads
    n_layers=12,          # number of transformer blocks
    drop_rate=0.1,        # dropout rate
    qkv_bias=False,       # whether query/key/value layers use a bias
)

Configuration details for the 124-million-parameter GPT-2 model:

We use short variable names to keep the code concise. The configuration parameters are defined as follows:

* **`vocab_size`**: 50,257 (BPE tokenizer, Chapter 2).
* **`context_length`**: Maximum input token count (positional embeddings, Chapter 2).
* **`emb_dim`**: Embedding size for token inputs (768-dimensional vector).
* **`n_heads`**: Number of attention heads (Chapter 3).
* **`n_layers`**: Number of transformer blocks.
* **`drop_rate`**: Dropout intensity (e.g., 0.1 for 10%) to mitigate overfitting (Chapter 3).
* **`qkv_bias`**: Determines if Linear layers include a bias vector for Q, K, and V. While modern LLMs often disable this, we'll revisit it in Chapter 5 for GPT-2 weight compatibility.

In [3]:
import torch
import torch.nn as nn


class DummyGPTModel(nn.Module):
    """Skeleton GPT model whose transformer blocks and final norm are placeholders.

    Only the token embedding, positional embedding, dropout and output
    projection are real layers; the placeholder modules pass data through
    unchanged.

    Args:
        cfg: Mapping providing the keys ``"vocab_size"``, ``"context_length"``,
            ``"emb_dim"``, ``"n_layers"`` and ``"drop_rate"``.
    """

    def __init__(self, cfg):
        super().__init__()
        # NOTE: keep the creation order of the layers below unchanged; seeded
        # weight initialisation depends on the order in which they are built.
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        # Use a placeholder for TransformerBlock
        self.trf_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        # Use a placeholder for LayerNorm
        self.final_norm = DummyLayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        """Turn a batch of token ids into logits over the vocabulary.

        Args:
            in_idx: 2-D tensor of token ids laid out as ``(batch, seq_len)``.

        Returns:
            Tensor of logits with shape ``(batch, seq_len, vocab_size)``.
        """
        # Only the sequence length is needed; the two-name unpacking still
        # rejects inputs that are not 2-D.
        _, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        # Position ids 0..seq_len-1, created on the same device as the input.
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        # (seq_len, emb_dim) positional embeddings broadcast over the batch.
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits


class DummyTransformerBlock(nn.Module):
    """Stand-in for a transformer block that acts as an identity module."""

    def __init__(self, cfg):
        # cfg is accepted but not used by this placeholder.
        super().__init__()

    def forward(self, x):
        """Hand the input back unchanged."""
        return x


class DummyLayerNorm(nn.Module):
    """Stand-in for layer normalisation that acts as an identity module."""

    def __init__(self, normalized_shape, eps=1e-5):
        # Both arguments only mirror the LayerNorm signature; neither is used.
        super().__init__()

    def forward(self, x):
        """Hand the input back unchanged."""
        return x

In [4]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each text into a 1-D tensor of token ids and stack the results
# along a new leading dimension to form the batch.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [5]:
# Seed the RNG before the model is constructed so the randomly initialised
# weights are reproducible.
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)

# NOTE(review): model.eval() is never called, so the dropout layer is
# presumably active during this forward pass — confirm this is intended.
logits = model(batch)
print("Output shape:", logits.shape)
print(logits)

Output shape: torch.Size([2, 4, 50257])
tensor([[[-0.9289,  0.2748, -0.7557,  ..., -1.6070,  0.2702, -0.5888],
         [-0.4476,  0.1726,  0.5354,  ..., -0.3932,  1.5285,  0.8557],
         [ 0.5680,  1.6053, -0.2155,  ...,  1.1624,  0.1380,  0.7425],
         [ 0.0447,  2.4787, -0.8843,  ...,  1.3219, -0.0864, -0.5856]],

        [[-1.5474, -0.0542, -1.0571,  ..., -1.8061, -0.4494, -0.6747],
         [-0.8422,  0.8243, -0.1098,  ..., -0.1434,  0.2079,  1.2046],
         [ 0.1355,  1.1858, -0.1453,  ...,  0.0869, -0.1590,  0.1552],
         [ 0.1666, -0.8138,  0.2307,  ...,  2.5035, -0.3055, -0.3083]]],
       grad_fn=<UnsafeViewBackward0>)


### Layer normalization

In [11]:
import torch

# A single row with three values, used as the running layer-norm example.
x = torch.tensor([[1.0, 2.0, 3.0]])

print(x.shape, "--------------", "batch=1, features=3", sep="\n")


torch.Size([1, 3])
--------------
batch=1, features=3


### Step 1: Compute the mean (per row)

In [12]:
# Mean over the last dimension; keepdim=True keeps that dimension with size 1.
mean = torch.mean(x, dim=-1, keepdim=True)
print(mean)


tensor([[2.]])


### $\mu = \frac{1 + 2 + 3}{3} = 2$

### Step 2: Compute the variance (per row)

In [13]:
# Variance over the last dimension, computed with unbiased=False.
var = torch.var(x, dim=-1, keepdim=True, unbiased=False)
print(var)


tensor([[0.6667]])


Calculation

In [22]:
from IPython.display import display, Math

# Worked example: variance of the values 1, 2, 3 around their mean of 2.
display(
    Math(
        r"""
\sigma^2
= \frac{(1-2)^2 + (2-2)^2 + (3-2)^2}{3}
= \frac{2}{3}
\approx 0.6667
"""
    )
)


<IPython.core.display.Math object>

### Step 3: Normalise (subtract the mean, divide by the standard deviation)

In [24]:
# eps is added to the variance before taking the square root.
eps = 1e-5
x_norm = (x - mean) / (var + eps).sqrt()
print(x_norm)


tensor([[-1.2247,  0.0000,  1.2247]])


### Step 4: Check mean and variance

In [25]:
# Print the mean and the variance (unbiased=False) of the normalised row.
print(torch.mean(x_norm, dim=-1))
print(torch.var(x_norm, dim=-1, unbiased=False))


tensor([0.])
tensor([1.0000])


### Step 5: Add scale (γ) and shift (β)

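Now we add the scale and shift parameters. Here they are plain tensors; they become trainable once they are wrapped in `nn.Parameter` in Step 7.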

In [26]:
# Scale of all ones and shift of all zeros.
gamma = torch.ones(3)   # scale
beta = torch.zeros(3)   # shift

y = x_norm * gamma + beta
print(y)


tensor([[-1.2247,  0.0000,  1.2247]])


### Step 6: See why scale and shift matter

Change them:

In [27]:
# One scale value and one shift value per feature.
gamma = torch.tensor([2.0, 0.5, 1.0])  # scale
beta = torch.tensor([1.0, -1.0, 0.0])  # shift

y = x_norm * gamma + beta
print(y)


tensor([[-1.4495, -1.0000,  1.2247]])


Now each feature:

1. Is scaled differently

2. Is shifted differently

The model learns these values during training.

### Step 7: Wrap it into a class (from scratch)

In [28]:
import torch.nn as nn

class SimpleLayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale and shift.

    Args:
        features: Size of the last (feature) dimension of the inputs.
        eps: Constant added to the variance before the square root, so the
            division stays finite when a row has zero variance.
    """

    def __init__(self, features, eps=1e-5):
        super().__init__()
        # Scale starts at one and shift at zero.
        self.gamma = nn.Parameter(torch.ones(features))
        self.beta = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then scale and shift it.

        Args:
            x: Tensor whose last dimension has size ``features``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by the number of features rather than n - 1.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        x_hat = (x - mean) / torch.sqrt(var + self.eps)
        return self.gamma * x_hat + self.beta


##### Test it:

In [29]:
# Apply a freshly constructed layer to the example input x.
ln = SimpleLayerNorm(features=3)
print(ln(x))


tensor([[-1.2247,  0.0000,  1.2247]], grad_fn=<AddBackward0>)


LayerNorm is always:

for each sample:
   1. mean over features
   2. variance over features
   3. normalise
   4. scale
   5. shift


## Residual (Shortcut / Skip) Connection


A residual (skip) connection just means:

“Take the input, do something to it, then add the original input back.”


In [32]:
from IPython.display import display, Math

# Residual connection formula: the output is the input plus f applied to it.
display(
    Math(
        r"""
y = x + f(x)
"""
    )
)


<IPython.core.display.Math object>

In [33]:
x = torch.tensor([1.0, 2.0, 3.0])


def f(x):
    """Toy transformation for the residual demo: doubles its input."""
    return x * 2


# Plain path: the output is only the transformed input.
y = f(x)
print("with out residual connection:", y)

# Residual path: the original input is added to the transformed input.
y = f(x) + x
print("with residual connection:", y)


with out residual connection: tensor([2., 4., 6.])
with residual connection: tensor([3., 6., 9.])


### Why this matters (intuition, not hype)
If $f(x)$ learns something useless or noisy:
*   **The model can simply ignore it**: The identity path ($x$) still survives.
*   **Increased Robustness**: Residuals make deep networks significantly harder to "break" or degrade during training.

-------------------------------------------------

## Residual + LayerNorm (Transformer-style)

In [34]:
class ResidualLayerNorm(nn.Module):
    """Linear layer wrapped in a skip connection, followed by layer normalisation."""

    def __init__(self, dim):
        super().__init__()
        self.linear = nn.Linear(dim, dim)
        self.ln = SimpleLayerNorm(dim)

    def forward(self, x):
        """Add the linear layer's output to ``x`` and normalise the sum."""
        summed = x + self.linear(x)
        return self.ln(summed)


# Why Transformers Need Residual Connections

### ❌ Without Residuals
*   **Vanishing Gradients:** Gradients shrink exponentially as they propagate backward, making updates negligible.
*   **Early Layer Stagnation:** Initial layers stop learning because they receive little to no signal for optimization.
*   **Training Instability:** Deep architectures become extremely difficult to converge, leading to performance degradation.

### ✅ With Residuals
*   **Unimpeded Information Flow:** Input features bypass transformations, ensuring the core signal is preserved across the network.
*   **Gradient Highway:** Provides a direct "shortcut" for gradients to flow backward to earlier layers without distortion.
*   **Deep Scalability:** Enables the training of massive models by maintaining signal strength regardless of depth.
