In [1]:

import math
import torch

# Step 1 - hyperparameters. Deliberately tiny so every tensor below can be
# printed in full and checked by eye.
d_model = 4  # width of each embedding / positional-encoding vector
seq_len = 5  # number of positions the encoding table is built for



In [2]:
# Step 2 - allocate the positional-encoding table: one row per position,
# one column per embedding dimension, all zeros until the sin/cos fill below.
pe = torch.zeros((seq_len, d_model))
print("Step 2 - pe shape:", pe.size())
print(pe)

Step 2 - pe shape: torch.Size([5, 4])
tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])


In [3]:
# Step 3 - positions 0 .. seq_len-1 as a float column vector of shape
# (seq_len, 1), so that it broadcasts against the 1-D frequency vector later.
position = torch.arange(seq_len, dtype=torch.float32)[:, None]
print("\nStep 3 - position shape:", position.size())
print(position)


Step 3 - position shape: torch.Size([5, 1])
tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.]])


In [4]:
# Step 4 - per-dimension frequencies.
# For each even index k in 0, 2, ..., the entry is exp(k * -ln(10000) / d_model),
# i.e. 10000 ** (-k / d_model): 1.0 for k = 0, shrinking as k grows.
div_term = (
    torch.arange(0, d_model, 2, dtype=torch.float32)
    * (-math.log(10000.0) / d_model)
).exp()
print("\nStep 4 - div_term shape:", div_term.size())
print(div_term)


Step 4 - div_term shape: torch.Size([2])
tensor([1.0000, 0.0100])


In [5]:
# The even dimension indices that div_term is built from.
torch.arange(start=0, end=d_model, step=2)

tensor([0, 2])

In [6]:
# The scalar that multiplies each even index inside the exponent of div_term.
-(math.log(10000.0) / d_model)

-2.302585092994046

In [8]:
# Show both operands, then their broadcast product:
# (seq_len, 1) * (d_model/2,) -> one angle per (position, frequency) pair.
print(position, div_term, sep="\n")
torch.mul(position, div_term)

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.]])
tensor([1.0000, 0.0100])


tensor([[0.0000, 0.0000],
        [1.0000, 0.0100],
        [2.0000, 0.0200],
        [3.0000, 0.0300],
        [4.0000, 0.0400]])

In [9]:
# Step 5 - sine of every (position, frequency) angle; these values are
# destined for the even columns (0, 2, ...) of pe.
sin_part = (position * div_term).sin()  # one row per position, one column per frequency
print("\nStep 5 - sin_part shape:", sin_part.size())
print(sin_part)


Step 5 - sin_part shape: torch.Size([5, 2])
tensor([[ 0.0000,  0.0000],
        [ 0.8415,  0.0100],
        [ 0.9093,  0.0200],
        [ 0.1411,  0.0300],
        [-0.7568,  0.0400]])


In [10]:
# Step 6 - cosine of the same angles; these values are destined for the
# odd columns (1, 3, ...) of pe.
cos_part = (position * div_term).cos()
print("\nStep 6 - cos_part shape:", cos_part.size())
print(cos_part)


Step 6 - cos_part shape: torch.Size([5, 2])
tensor([[ 1.0000,  1.0000],
        [ 0.5403,  0.9999],
        [-0.4161,  0.9998],
        [-0.9900,  0.9996],
        [-0.6536,  0.9992]])


In [11]:
# Display pe before the sin/cos fill below.
pe

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

In [12]:
# Step 7a - write the sine values into the even columns (0, 2, ...) of pe.
# This is an in-place assignment into a strided view of pe.
pe[:, ::2] = sin_part
print(pe)

tensor([[ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.8415,  0.0000,  0.0100,  0.0000],
        [ 0.9093,  0.0000,  0.0200,  0.0000],
        [ 0.1411,  0.0000,  0.0300,  0.0000],
        [-0.7568,  0.0000,  0.0400,  0.0000]])


In [13]:

# Step 7b - write the cosine values into the odd columns (1, 3, ...) of pe.
# cos_part has one column per even index (ceil(d_model / 2) of them), but pe
# only has floor(d_model / 2) odd columns. Trim cos_part to the number of odd
# columns so the assignment also works when d_model is odd; for an even
# d_model the slice keeps every column and nothing changes.
pe[:, 1::2] = cos_part[:, : pe.shape[1] // 2]
print("\nStep 7 - pe filled with sin/cos, shape:", pe.shape)
print(pe)



Step 7 - pe filled with sin/cos, shape: torch.Size([5, 4])
tensor([[ 0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0100,  0.9999],
        [ 0.9093, -0.4161,  0.0200,  0.9998],
        [ 0.1411, -0.9900,  0.0300,  0.9996],
        [-0.7568, -0.6536,  0.0400,  0.9992]])


In [16]:
import torch

# Side demo: what adding a size-1 axis does to a 1-D tensor.
t = torch.tensor([1, 2, 3])
print(t.size())          # torch.Size([3])

# New leading axis -> a single row.
t0 = t[None, :]
print(t0.size())         # torch.Size([1, 3])
print(t0)

# New trailing axis -> a single column.
t1 = t[:, None]
print(t1.size())         # torch.Size([3, 1])
print(t1)


torch.Size([3])
torch.Size([1, 3])
tensor([[1, 2, 3]])
torch.Size([3, 1])
tensor([[1],
        [2],
        [3]])


In [14]:
# 8. Add batch dimension so shape becomes (1, seq_len, d_model)
# The rebinding `pe = pe.unsqueeze(0)` is self-referential: every re-run of
# this cell would prepend another size-1 axis and break the slicing and
# addition further down. Only unsqueeze while pe is still the 2-D table, so
# running the cell again leaves pe unchanged.
if pe.dim() == 2:
    pe = pe.unsqueeze(0)
print("\nStep 8 - pe with batch dim, shape:", pe.shape)
print(pe)


Step 8 - pe with batch dim, shape: torch.Size([1, 5, 4])
tensor([[[ 0.0000,  1.0000,  0.0000,  1.0000],
         [ 0.8415,  0.5403,  0.0100,  0.9999],
         [ 0.9093, -0.4161,  0.0200,  0.9998],
         [ 0.1411, -0.9900,  0.0300,  0.9996],
         [-0.7568, -0.6536,  0.0400,  0.9992]]])


In [17]:
# Step 9 - a stand-in for token embeddings, used to watch the addition happen.
# Shape is (batch_size=2, seq_len=3, d_model=4); every token vector is a
# constant (10s, 20s, ...) so the positional offsets are easy to spot.
x = torch.tensor(
    [
        [[value] * 4 for value in batch]
        for batch in ((10., 20., 30.), (40., 50., 60.))
    ]
)
print("\nStep 9 - x shape:", x.size())
print(x)


Step 9 - x shape: torch.Size([2, 3, 4])
tensor([[[10., 10., 10., 10.],
         [20., 20., 20., 20.],
         [30., 30., 30., 30.]],

        [[40., 40., 40., 40.],
         [50., 50., 50., 50.],
         [60., 60., 60., 60.]]])


In [18]:

# Step 10a - keep only as many positions from pe as x has tokens
# (x.size(1) == 3 here), leaving the batch and feature axes untouched.
pe_slice = pe[:, : x.size(1)]   # shape: (1, 3, 4)
print("\nStep 10 - pe_slice shape:", pe_slice.size())
print(pe_slice)


Step 10 - pe_slice shape: torch.Size([1, 3, 4])
tensor([[[ 0.0000,  1.0000,  0.0000,  1.0000],
         [ 0.8415,  0.5403,  0.0100,  0.9999],
         [ 0.9093, -0.4161,  0.0200,  0.9998]]])


In [20]:
# Step 10b - add the encodings to the embeddings. pe_slice's leading size-1
# axis broadcasts over the batch, so both sequences get the same offsets.
x_with_pos = torch.add(x, pe_slice)   # result shape: (2, 3, 4)
print("\nStep 10 - x_with_pos shape:", x_with_pos.size())
print(x_with_pos)


Step 10 - x_with_pos shape: torch.Size([2, 3, 4])
tensor([[[10.0000, 11.0000, 10.0000, 11.0000],
         [20.8415, 20.5403, 20.0100, 21.0000],
         [30.9093, 29.5839, 30.0200, 30.9998]],

        [[40.0000, 41.0000, 40.0000, 41.0000],
         [50.8415, 50.5403, 50.0100, 51.0000],
         [60.9093, 59.5839, 60.0200, 60.9998]]])
