## implementation of Attention in PyTorch

# 1.  Basics: Softmax

In [12]:
# softmax
import numpy as np

def softmax(X): 
    # Computing element wise exponential value
    X_exp = np.exp(X)
    
    # Computing sum of these values
    partition = np.sum(X_exp, axis=1, keepdims=True)
    
    # Returing the softmax output.
    return X_exp / partition

In [13]:
x = np.array([[1,2,3],[4,5,6]])
softmax(x)

array([[0.09003057, 0.24472847, 0.66524096],
       [0.09003057, 0.24472847, 0.66524096]])

### Practical issues: Numeric stability
- https://cs231n.github.io/linear-classify/#softmax
- https://stackoverflow.com/questions/50170011/adapting-pytorch-softmax-function

In [14]:
x = np.array([[10000000000,200000000000,3000000000],[4,5,6]])
softmax(x)

  X_exp = np.exp(X)
  return X_exp / partition


array([[       nan,        nan,        nan],
       [0.09003057, 0.24472847, 0.66524096]])

In [15]:
x

array([[ 10000000000, 200000000000,   3000000000],
       [           4,            5,            6]])

### Overcome the numerical stability issue, overflow

In [27]:
def softmax_np(x):
    # get the max value for each row
    maxes = np.max(x, axis=-1, keepdims=True)
    # Computing element wise exponential value
    x_exp = np.exp(x-maxes)
    x_exp_sum = np.sum(x_exp, axis=-1, keepdims=True)
    print(x_exp_sum)
    probs = x_exp/x_exp_sum
    return probs 

softmax_np(x)

[[1.        ]
 [1.50321472]]


array([[0.        , 1.        , 0.        ],
       [0.09003057, 0.24472847, 0.66524096]])

In [28]:
import torch

def softmax_torch(x): # Assuming x has atleast 2 dimensions
    maxes = torch.max(x, dim=-1, keepdim=True)[0]
    x_exp = torch.exp(x-maxes)
    x_exp_sum = torch.sum(x_exp, dim=-1, keepdim=True)
    probs = x_exp/x_exp_sum
    return probs 

softmax_torch(torch.tensor(x))

tensor([[0.0000, 1.0000, 0.0000],
        [0.0900, 0.2447, 0.6652]])

In [29]:
xx = torch.tensor([[10000000,2,3],[4,5,6]])
softmax_torch(xx)

tensor([[1.0000, 0.0000, 0.0000],
        [0.0900, 0.2447, 0.6652]])

### Note

In [30]:
np.max(x, axis=1, keepdims=True)

array([[200000000000],
       [           6]])

In [31]:
torch.max(torch.tensor(x), dim=1, keepdim=True)

torch.return_types.max(
values=tensor([[200000000000],
        [           6]]),
indices=tensor([[1],
        [2]]))

# 2. Self Attention

### Assume Q, K, V are two dimensional matrices for simplicity

self attention: Softmax((Q*K.T)/sqrt(m)) * V

- Q: Query, K: Key, V: Value

- Shape of Q and K: n,q (Q and K have the same dimension)
- Shape of V: n,v

<div>
<img src="images/ScaledDotProductAttention.png" width="500"/>
</div>

In [32]:
def attention_1(Q, K, V):
    # Q: [n, q];  n: length of sequence, q: dimension of embedding 
    # K: [n, k];  n: length of sequence, k: dimension of embedding 
    # V: [n, v]   n: length of sequence, v: dimension of embedding 
    # in practice, q=k=m
    n, m = Q.shape
    # Compute the attention weights
    attention_weights = softmax_np(Q@K.T/np.sqrt(m)) # n,n
    # Compute the output
    output = attention_weights@V # [n,n]*[n,v] = [n,v]
    return output # n,v

def attention_2(Q, K, V):
    # Q: [n, q];  n: length of sequence, q: dimension of embedding 
    # K: [n, k];  n: length of sequence, k: dimension of embedding 
    # V: [n, v]   n: length of sequence, v: dimension of embedding 
    # in practice, q=k=m
    n, m = Q.shape
    # calculate the dot product of Q and K
    attention_score = np.dot(Q, K.T).astype(float) # n,n
    print(attention_score)
    # Scale the attention score
    attention_score /= np.sqrt(m) # n,n
    # Apply softmax to get the attention weights
    attention_weights = softmax_np(attention_score) # n,n
    # Use attention weights to weigh the values V
    output = np.dot(attention_weights, V) # [n,n]*[n,v] = [n,v]
    return output # n,v

In [36]:
# Example usage
Q = np.array([[1, 2, 3], [4, 5, 6]])
K = np.array([[1, 0, 0], [0, 1, 1]])
V = np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]])

output = attention_1(Q, K, V)
print("Output:")
print(output)


output = attention_2(Q, K, V)
print("Output:")
print(output)


[[1.09932072]
 [1.01757194]]
Output:
[[0.46386106 0.56386106 0.66386106 0.76386106]
 [0.4930926  0.5930926  0.6930926  0.7930926 ]]
[[ 1.  5.]
 [ 4. 11.]]
[[1.09932072]
 [1.01757194]]
Output:
[[0.46386106 0.56386106 0.66386106 0.76386106]
 [0.4930926  0.5930926  0.6930926  0.7930926 ]]


### Adding Batch_size as an additional dimension


In [37]:
# batched scaled dot product attention
# Shape of Q: [batch_size, length of sequence or num_patches, num_features]
# Shape of K: [batch_size, length of sequence or num_patches, num_features]
# Shape of V: [batch_size, length of sequence or num_patches, num_features_d]

def scaled_dot_product_attention(Q, K, V):
    batch_size, num_patches, num_features = Q.shape
    
    # 1. dot product Q and K^T to compute similarity scores
    # Kt = K.transpose(1,2)     # PyTorch code, out shape: [batch_size, num_features, num_patches]
    Kt = np.transpose(K, (0,2,1)) # numpy code, out shape: [batch_size, num_features, num_patches]
    # print(Kt.shape)
    # Computer attention score
    attention_score = Q@Kt / np.sqrt(num_features) # [batch_size, num_patches, num_patches]
    
    # 2. Apply softmax to get the attention weights
    attention_weights = softmax_np(attention_score) # [batch_size, num_patches, num_patches]
    
    # 3. Use attention weights to weigh the values V
    output = attention_weights@V # (b,n,n) (b,n,d) = (b,n,d)
    return output # [batch_size, num_patches, num_features_d]

In [38]:
# Example usage
Q = np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])
K = np.array([[[1, 0, 0], [0, 1, 1]], [[1, 0, 0], [0, 1, 1]]])
V = np.array([[[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]], [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]])

print(Q.shape, K.shape, V.shape)
output = scaled_dot_product_attention(Q, K, V)
output

(2, 2, 3) (2, 2, 3) (2, 2, 4)
(2, 3, 2)
[[[1.09932072]
  [1.01757194]]

 [[1.09932072]
  [1.01757194]]]


array([[[0.46386106, 0.56386106, 0.66386106, 0.76386106],
        [0.4930926 , 0.5930926 , 0.6930926 , 0.7930926 ]],

       [[0.46386106, 0.56386106, 0.66386106, 0.76386106],
        [0.4930926 , 0.5930926 , 0.6930926 , 0.7930926 ]]])

# 3. MultiHead attention

<div>
<img src="images/MultiHeadAttention_v2.png" width="600"/>
</div>

In [40]:
import torch.nn as nn
class ScaledDotProductAttention_torch(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention_torch, self).__init__()
    
    def forward(self, Q, K, V):
        # input is 4 dimensional tensor
        # [batch_size, n_heads, length, head_dim]
        batch_size, n_heads, length, head_dim = Q.shape
        
        # 1. dot product Q and K^T to compute similarity scores
        Kt = K.transpose(2,3) # PyTorch (batch_size, n_heads, head_dim, length)
        # Computer attention score
        attention_score = Q@Kt / np.sqrt(head_dim) # (batch_size, n_heads, length, length)
        # 2. Apply softmax to get the attention weights
        attention_weights = softmax_torch(attention_score) # (batch_size, n_heads, length, length)
        # 3. Use attention weights to weigh the values V
        output = attention_weights@V # (b,n,length,length) x (b,n,length,head_dim)
        return output # (batch_size, n_heads, length, head_dim)


class MultiHeadAttention(nn.module):
    def __init__(self, n_heads, n_features):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.n_features = n_features # d_model
        self.head_dim = n_features // n_heads
        self.Wq = nn.Linear(n_features, n_features)
        self.Wk = nn.Linear(n_features, n_features)
        self.Wv = nn.Linear(n_features, n_features)
        self.Wo = nn.Linear(n_features, n_features)
    
    def forward(self, Q, K, V): #todo: why not use the same input for Q, K, V?
        # input is three dimensional tensor
        # Q: [batch_size, length, n_features]
        # K: [batch_size, length, n_features]
        # V: [batch_size, length, n_features]
        b, n, m = Q.shape
        # 1. Linearly project Q, K, V
        Q = self.Wq(Q)
        K = self.Wk(K)
        V = self.Wv(V)
        # 2. Split into multiple heads
        Q = Q.reshape(b, n, self.n_heads, self.head_dim)
        K = K.reshape(b, n, self.n_heads, self.head_dim)
        V = V.reshape(b, n, self.n_heads, self.head_dim)
        # 3. Transpose to get dimensions (b, n_heads, n, head_dim)
        Q = Q.transpose(1,2) # b,n_heads,n,head_dim
        K = K.transpose(1,2)
        V = V.transpose(1,2)
        # 4. Apply scaled dot product attention
        # (batch_size, n_heads, length, head_dim)
        output = ScaledDotProductAttention_torch(Q, K, V)
        # 5. Concatenate the heads
        output = output.transpose(1,2).reshape(b, n, self.n_features) #
        # 6. Linearly project the concatenated heads
        output = self.Wo(output) # b,n,n_features
        return output #b,n,n_features

(2, 2)

# 4. Notes

In [51]:
import torch.nn as nn

input = torch.randn(3, 5, 2)
aa = nn.Linear(2, 3)
aa(input) # 3,5,3

tensor([[[ 1.3248e+00,  4.1775e-01, -1.1600e+00],
         [-9.8029e-01, -3.9965e-01, -8.1531e-01],
         [ 9.1944e-01,  5.5010e-01, -6.3888e-01],
         [ 1.6931e+00,  1.3687e+00,  1.5330e-01],
         [-1.0919e+00, -2.9230e-01, -5.5353e-01]],

        [[ 6.2891e-01,  1.3182e+00,  8.5776e-01],
         [ 5.3094e-01,  3.6238e-01, -6.6410e-01],
         [ 9.9441e-04,  2.4564e-02, -8.3489e-01],
         [ 1.3581e+00,  4.3562e-01, -1.1550e+00],
         [-1.3422e-01,  4.4799e-02, -7.0092e-01]],

        [[ 6.8076e-01,  4.8757e-01, -5.6631e-01],
         [ 1.2461e+00,  9.2266e-01, -2.5946e-01],
         [ 1.6625e-01,  9.5595e-01,  5.9633e-01],
         [ 2.6951e-01,  7.1117e-01,  1.1147e-01],
         [ 1.7943e+00,  1.3612e+00,  6.5730e-02]]], grad_fn=<ViewBackward0>)

In [53]:
input@aa.weight.T + aa.bias

tensor([[[ 1.3248e+00,  4.1775e-01, -1.1600e+00],
         [-9.8029e-01, -3.9965e-01, -8.1531e-01],
         [ 9.1944e-01,  5.5010e-01, -6.3888e-01],
         [ 1.6931e+00,  1.3687e+00,  1.5330e-01],
         [-1.0919e+00, -2.9230e-01, -5.5353e-01]],

        [[ 6.2891e-01,  1.3182e+00,  8.5776e-01],
         [ 5.3094e-01,  3.6238e-01, -6.6410e-01],
         [ 9.9441e-04,  2.4564e-02, -8.3489e-01],
         [ 1.3581e+00,  4.3562e-01, -1.1550e+00],
         [-1.3422e-01,  4.4799e-02, -7.0092e-01]],

        [[ 6.8076e-01,  4.8757e-01, -5.6631e-01],
         [ 1.2461e+00,  9.2266e-01, -2.5946e-01],
         [ 1.6625e-01,  9.5595e-01,  5.9633e-01],
         [ 2.6951e-01,  7.1117e-01,  1.1147e-01],
         [ 1.7943e+00,  1.3612e+00,  6.5730e-02]]], grad_fn=<AddBackward0>)

In [54]:
aa.weight.shape

torch.Size([3, 2])

In [55]:
aa(input).shape

torch.Size([3, 5, 3])

## Matrix Multiplication

In [63]:
Q = np.array([[1, 2], [3, 4], [5, 6]])
K = np.array([[1, 0, 0], [0, 1, 0]])
print(Q)
print(K)

[[1 2]
 [3 4]
 [5 6]]
[[1 0 0]
 [0 1 0]]


In [64]:
np.dot(Q, K)

array([[1, 2, 0],
       [3, 4, 0],
       [5, 6, 0]])

In [66]:
Q@K

array([[1, 2, 0],
       [3, 4, 0],
       [5, 6, 0]])

In [68]:
Qt = torch.tensor(Q)
Kt = torch.tensor(K)

torch.matmul(Qt, Kt)

tensor([[1, 2, 0],
        [3, 4, 0],
        [5, 6, 0]], dtype=torch.int32)

In [69]:
torch.mm(Qt, Kt)

tensor([[1, 2, 0],
        [3, 4, 0],
        [5, 6, 0]], dtype=torch.int32)

In [70]:
Qt @ Kt

tensor([[1, 2, 0],
        [3, 4, 0],
        [5, 6, 0]], dtype=torch.int32)

In [1]:
criterion = nn.CrossEntropyLoss(ignore_index=0)

ModuleNotFoundError: No module named 'torchtext'