### Using Numpy Only

In [33]:
import numpy as np

def softmax(x):
    # assuming x.size() = [batch, numheads, sequence_length, dimension]
    return (np.exp(x).transpose(-2, -1) / np.sum(np.exp(x), axis=-1)).transpose(-2, -1)

# Scaled Dot-Product Attention
def scaled_dot_product_attention(q, k, v, mask=None):
    d_k = q.size()[-1] # query vector dimension
    scaled = np.matmul(q, k.transpose(-2, -1)) / np.sqrt(d_k)
    if mask is not None:
        scaled = scaled + mask
    attention = softmax(scaled)
    out = np.matmul(attention, v)
    return out, attention

### Using `torch` - Multi-Head Attention

In [108]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

# Scaled Dot-Product Attention
def scaled_dot_product_attention(q, k, v, mask=None):
    d_k = q.size()[-1] # query vector dimension
    scaled = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scaled = scaled + mask
    attention = F.softmax(scaled, dim=-1)
    out = torch.matmul(attention, v)
    return out, attention

In [109]:
batch_size = 1
sequence_length = 4 # My name is Goro
input_dim = 512
d_model = 512

x = torch.randn(batch_size, sequence_length, input_dim)
x.shape

torch.Size([1, 4, 512])

### Query vector: what I am looking for?
### Key vector: what I can offer
### Value vector: what I actually offer

In [110]:
# Create the query, key, value vectors
qkv_layer = nn.Linear(input_dim, 3*d_model)
qkv = qkv_layer(x)
qkv.shape

torch.Size([1, 4, 1536])

In [111]:
num_heads = 8
head_dim = d_model // num_heads # Purpose of //: Floor division: returns the largest integer <= result. (7//2 = 3)
qkv = qkv.reshape(batch_size, sequence_length, num_heads, 3*head_dim) 
qkv.shape


torch.Size([1, 4, 8, 192])

In [24]:
qkv = qkv.permute(0, 2, 1, 3) # [batch_size, num_heads, sequence_length, 3*head_dim]
qkv.shape

torch.Size([1, 8, 4, 192])

In [25]:
# Divide each tensor into 3 chunks on the last dimension
q, k, v = qkv.chunk(3, dim=-1)
q.shape, k.shape, v.shape

(torch.Size([1, 8, 4, 64]),
 torch.Size([1, 8, 4, 64]),
 torch.Size([1, 8, 4, 64]))

### Self-Attention for Multiple Heads
 
- Scaled by sqrt(dk) is to **stablize the training** by lowering the **variance** of the dot product.
- **Mask**: used in **decoder** only. This is to prevent prediction of word `i` by looking at future predictions `i+1` ~ ... `n`.
- **Value** matrix: the **multiplier**.



<math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
  <mtext>self attention</mtext>
  <mo>=</mo>
  <mi>s</mi>
  <mi>o</mi>
  <mi>f</mi>
  <mi>t</mi>
  <mi>m</mi>
  <mi>a</mi>
  <mi>x</mi>
  <mrow data-mjx-texclass="ORD">
    <mo minsize="2.047em" maxsize="2.047em">(</mo>
  </mrow>
  <mfrac>
    <mrow>
      <mi>Q</mi>
      <mo>.</mo>
      <msup>
        <mi>K</mi>
        <mi>T</mi>
      </msup>
    </mrow>
    <msqrt>
      <msub>
        <mi>d</mi>
        <mi>k</mi>
      </msub>
    </msqrt>
  </mfrac>
  <mo>+</mo>
  <mi>M</mi>
  <mrow data-mjx-texclass="ORD">
    <mo minsize="2.047em" maxsize="2.047em">)</mo>
  </mrow>
</math>

<math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
  <mtext>new V</mtext>
  <mo>=</mo>
  <mtext>self attention</mtext>
  <mo>.</mo>
  <mi>V</mi>
</math>

In [28]:
d_k = q.size()[-1]
scaled = torch.matmul(q, k.transpose(-2,-1))
mask = torch.full(scaled.size(), float('-inf'))
mask = torch.triu(mask, diagonal=1)

# For decoder
attention = F.softmax(scaled + mask, dim=-1)
values = torch.matmul(attention, v)
values.shape 

torch.Size([1, 8, 4, 64])

In [43]:

class MultiHeadAttention(nn.Module):

    def __init__(self, input_dim, d_model, num_heads):
        super().__init__()
        self.input_dim = input_dim
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = nn.Linear(input_dim, 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)
    def forward(self, x, mask=None):
        batch_size, sequence_length, input_dim = x.size()
        print(f"x.size(): {x.size()}")
        qkv = self.qkv_layer(x)
        qkv = qkv.reshape(batch_size, sequence_length, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)
        q, k, v, = qkv.chunk(3, dim=-1)
        print(f"    q size: {q.size()}\n    k size: {k.size()}\n    v size: {v.size()}, ")
        values, attention = scaled_dot_product_attention(q, k, v, mask)
        print(f"values.size(): {values.size()}, attention.size:{ attention.size()} ")
        # Concatenate values
        values = values.reshape(batch_size, sequence_length, num_heads * self.head_dim)
        out = self.linear_layer(values)
        print(f"out.size() {out.size()}")
        return out

# Input
input_dim = 1024
d_model = 512
num_heads = 8

batch_size = 30
sequence_length = 10
x = torch.randn(batch_size, sequence_length, input_dim)

model = MultiHeadAttention(input_dim, d_model, num_heads)
out = model.forward(x)


x.size(): torch.Size([30, 10, 1024])
    q size: torch.Size([30, 8, 10, 64])
    k size: torch.Size([30, 8, 10, 64])
    v size: torch.Size([30, 8, 10, 64]), 
values.size(): torch.Size([30, 8, 10, 64]), attention.size:torch.Size([30, 8, 10, 10]) 
out.size() torch.Size([30, 10, 512])


### Positional Encoding

In [46]:
max_sequence_length = 10
d_model = 6 # dimension of embedding. Typically 512

even_i = torch.arange(0, d_model, 2).float()
even_denominator = torch.pow(10000, even_i/d_model)

odd_i = torch.arange(1, d_model, 2).float()
odd_denominator = torch.pow(10000, odd_i/d_model)

In [47]:
# denominator = even_denominator
position = torch.arange(max_sequence_length, dtype=torch.float).reshape(max_sequence_length, 1)
even_PE = torch.sin(position/even_denominator)
odd_PE = torch.sin(position/odd_denominator)

In [51]:
stacked = torch.stack([even_PE, odd_PE], dim=2)
stacked.shape

torch.Size([10, 3, 2])

In [53]:
PE = torch.flatten(stacked, start_dim=1, end_dim=2)
PE

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00],
        [ 8.4147e-01,  2.1378e-01,  4.6399e-02,  9.9998e-03,  2.1544e-03,
          4.6416e-04],
        [ 9.0930e-01,  4.1768e-01,  9.2698e-02,  1.9999e-02,  4.3089e-03,
          9.2832e-04],
        [ 1.4112e-01,  6.0226e-01,  1.3880e-01,  2.9995e-02,  6.4633e-03,
          1.3925e-03],
        [-7.5680e-01,  7.5900e-01,  1.8460e-01,  3.9989e-02,  8.6176e-03,
          1.8566e-03],
        [-9.5892e-01,  8.8064e-01,  2.3000e-01,  4.9979e-02,  1.0772e-02,
          2.3208e-03],
        [-2.7942e-01,  9.6157e-01,  2.7491e-01,  5.9964e-02,  1.2926e-02,
          2.7850e-03],
        [ 6.5699e-01,  9.9804e-01,  3.1922e-01,  6.9943e-02,  1.5080e-02,
          3.2491e-03],
        [ 9.8936e-01,  9.8836e-01,  3.6285e-01,  7.9915e-02,  1.7235e-02,
          3.7133e-03],
        [ 4.1212e-01,  9.3298e-01,  4.0570e-01,  8.9879e-02,  1.9389e-02,
          4.1774e-03]])

In [54]:

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model
    
    def forward(self):
        even_i = torch.arange(0, self.d_model, 2).float()
        even_denominator = torch.pow(10000, even_i/self.d_model)
        position = torch.arange(self.max_sequence_length).reshape(self.max_sequence_length, 1)
        even_PE = torch.sin(position/even_denominator)
        odd_i = torch.arange(1, d_model, 2).float()
        odd_denominator = torch.pow(10000, odd_i/d_model)        
        odd_PE = torch.cos(position/odd_denominator)
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        return PE
    

In [56]:
pe = PositionalEncoding(d_model=6, max_sequence_length=10)
pe.forward()

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.9769,  0.0464,  0.9999,  0.0022,  1.0000],
        [ 0.9093,  0.9086,  0.0927,  0.9998,  0.0043,  1.0000],
        [ 0.1411,  0.7983,  0.1388,  0.9996,  0.0065,  1.0000],
        [-0.7568,  0.6511,  0.1846,  0.9992,  0.0086,  1.0000],
        [-0.9589,  0.4738,  0.2300,  0.9988,  0.0108,  1.0000],
        [-0.2794,  0.2746,  0.2749,  0.9982,  0.0129,  1.0000],
        [ 0.6570,  0.0627,  0.3192,  0.9976,  0.0151,  1.0000],
        [ 0.9894, -0.1522,  0.3629,  0.9968,  0.0172,  1.0000],
        [ 0.4121, -0.3599,  0.4057,  0.9960,  0.0194,  1.0000]])

### Layer Normalization

In [58]:
inputs = torch.Tensor([  # batch size: 1
    [[0.2, 0.1, 0.3],    # 2 words
     [0.5, 0.1, 0.1]]    # each word - 3 dimensinoal embedding
])

B, S, E = inputs.size() # Batch size, Sequence length, Embedding dimension
inputs = inputs.reshape(S, B, E)
inputs.size()

torch.Size([2, 1, 3])

In [59]:
parameter_shape =inputs.size()[-2:] # batch x embedding dimension
gamma = nn.Parameter(torch.ones(parameter_shape))
beta = nn.Parameter(torch.zeros(parameter_shape))

In [62]:
dims = [-(i+1) for i in range(len(parameter_shape))]
dims

[-1, -2]

In [69]:
mean = inputs.mean(dim=dims, keepdim=True)
print(f"mean.size() {mean.size()}")
var = ((inputs-mean)**2).mean(dim=dims, keepdim=True)
epsilon = 1e-5
std = (var + epsilon).sqrt()
std

mean.size() torch.Size([2, 1, 1])


tensor([[[0.0817]],

        [[0.1886]]])

In [70]:
y = (inputs-mean) / std
y.shape

torch.Size([2, 1, 3])

In [72]:
out = gamma * y + beta
out

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]], grad_fn=<AddBackward0>)

In [100]:
class LayerNormalization():
    def __init__(self, parameters_shape, eps=1e-5):
        self.parameters_shape = parameters_shape
        print(f"self.parameters_shape: {self.parameters_shape}")
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(self.parameters_shape))
        self.beta = nn.Parameter(torch.zeros(self.parameters_shape))
    
    def forward(self, inputs):
        dims = [-(i+1) for i in range(len(self.parameters_shape))]
        mean = inputs.mean(dim=dims, keepdim=True)
        print(f"mean size: {mean.size()} one mean for each word")
        var = ((inputs-mean)**2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        y = (inputs - mean) / std
        out = self.gamma * y + self.beta
        return out

In [102]:
batch_size = 3
sequence_length = 5 # 5 words
embedding_dim = 8
inputs = torch.randn(batch_size, sequence_length, embedding_dim)
inputs = inputs.permute(1, 0, 2) # S, B, E
# print(f"inputs \n ({inputs.size()}) = \n {inputs}")

layer_norm = LayerNormalization(inputs.size()[-2:])
out = layer_norm.forward(inputs)
out.size()


self.parameters_shape: torch.Size([3, 8])
mean size: torch.Size([5, 1, 1]) one mean for each word


torch.Size([5, 3, 8])

In [104]:
# out

## Encoder

The goal of **encoder** is to transform the input sequence (collection of words) into embeddings (vectors) that better **encapsulate** the **context of the words**.

Better representations of the **meaning** of the words.

Be used in the decoder to assist in **translation**

### How? -> Encoder layer

### Residual/skip Connections (to Add & Norm): Avoid the vanishing gradient problem
- For **very deep networks**, the backpropagation of values will eventually be **very small gradients**. (ReLU, ... -> near 0 activation)
- The network **stop learning** if gradients become super small.
- To prevent the vanishing gradients problem: Use **skip connection**

### Why Layer Normalization? Want to preform more stable training
- Naturally, the values after positional encoding would have scattered means and large standard deviations.
- LayerNorm ensure values are centered around 0 with std ~ 1. 
    - Backprop: gets more even steps during the learning process
- Training becomes more stable, easier, and faster

Output after the **encoder block**: Better contextual awareness.

After vectors are passed through the network encompassing attention
1. Preserve signals via skip connections.
2. Much more stable values via layer normalizations.

Overall, each vector better represents the word compared to the original input vector.

Original transformer paper uses a stack of **N=6** encoder blocks cascaded one after the other.

## Decoder