## Objective: Implement self-attention from scratch

### 1. Using NumPy

In [1]:
import numpy as np

In [2]:
def stable_softmax(x):
    """Softmax along the last axis, computed in a numerically stable way.

    softmax(x)_i = exp(x_i) / sum_j exp(x_j). The maximum along the last
    axis is subtracted before exponentiating; the shift cancels between
    numerator and denominator, and it keeps every exponent <= 0.

    Args:
        x: Array of scores; normalisation runs over its last axis.

    Returns:
        Array with the same shape as ``x`` whose last-axis slices sum to 1.
    """
    shifted = x - np.max(x, axis=-1, keepdims=True)
    numerator = np.exp(shifted)
    denominator = np.sum(numerator, axis=-1, keepdims=True)
    return numerator / denominator

def naive_softmax(x):
    """Textbook softmax along the last axis, without the max-subtraction trick.

    Kept for comparison with ``stable_softmax``: because the raw scores are
    exponentiated directly, large inputs can overflow to ``inf`` and produce
    ``nan`` after the division.

    Args:
        x: Array of scores; normalisation runs over its last axis.

    Returns:
        Array with the same shape as ``x`` whose last-axis slices sum to 1
        (for inputs small enough not to overflow).
    """
    # Exponentiate once and reuse the result for numerator and denominator.
    exp_x = np.exp(x)
    # Normalise per row (last axis) rather than over the whole array, so a
    # 2-D score matrix yields one probability distribution per row.
    return exp_x / exp_x.sum(axis=-1, keepdims=True)

In [3]:
# Toy inputs for the attention demo: 2 queries attending over 3 key/value pairs.
queries = np.array([
    [1, 0, 1],
    [0, 1, 1],
])  # shape: 2 x 3

keys = np.array([
    [1, 0, 1],
    [1, 1, 0],
    [0, 1, 1],
])  # shape: 3 x 3

values = np.array([
    [10, 0],
    [0, 10],
    [5, 5],
])  # shape: 3 x 2

In [5]:
# Unscaled dot-product attention: softmax(Q @ K^T) @ V  (no 1/sqrt(d_k) factor)
# scores: (2 x 3) @ (3 x 3) -> (2 x 3), one row of query-key dot products per query
attention_scores = queries.dot(keys.T)
attention_weights = stable_softmax(attention_scores)

# context: (2 x 3) @ (3 x 2) -> (2 x 2), softmax-weighted average of the value rows
context = attention_weights.dot(values)
print(context.shape)
context

(2, 2)


array([[6.82087664, 3.17912336],
       [5.        , 5.        ]])

In [6]:
# Scaled dot-product attention: softmax(Q @ K^T / sqrt(d_k)) @ V
# scores: (2 x 3) @ (3 x 3) -> (2 x 3), then divided by sqrt(d_k)
d_k = queries.shape[-1]  # size of the last axis of the query matrix
attention_scores = queries.dot(keys.T) / np.sqrt(d_k)

print('d_k:', d_k)
attention_weights = stable_softmax(attention_scores)

# context: (2 x 3) @ (3 x 2) -> (2 x 2), softmax-weighted average of the value rows
context = attention_weights.dot(values)
print(context.shape)
context

d_k: 3
(2, 2)


array([[6.03312308, 3.96687692],
       [5.        , 5.        ]])

### 2. Using PyTorch

In [7]:
import torch
import torch.nn.functional as F

In [8]:
# Same toy inputs as the NumPy section, as floating-point tensors.
queries = torch.tensor([
    [1.0, 0.0, 1.0],
    [0.0, 1.0, 1.0],
])  # shape: 2 x 3

keys = torch.tensor([
    [1.0, 0.0, 1.0],
    [1.0, 1.0, 0.0],
    [0.0, 1.0, 1.0],
])  # shape: 3 x 3

values = torch.tensor([
    [10.0, 0.0],
    [0.0, 10.0],
    [5.0, 5.0],
])  # shape: 3 x 2

In [9]:
# Unscaled dot-product attention: softmax(Q @ K^T) @ V
attention_scores = queries @ keys.T  # (2 x 3) @ (3 x 3) -> (2 x 3)
attention_weights = F.softmax(attention_scores, dim=-1)  # normalise over the key axis
context = attention_weights @ values  # (2 x 3) @ (3 x 2) -> (2 x 2)
context

tensor([[6.8209, 3.1791],
        [5.0000, 5.0000]])

In [10]:
# Scaled dot-product attention: softmax(Q @ K^T / sqrt(d_k)) @ V
d_k = queries.shape[-1]  # size of the last axis of the query matrix
scale = torch.sqrt(torch.tensor(d_k))  # 0-dim tensor holding sqrt(d_k)
attention_scores = (queries @ keys.T) / scale  # (2 x 3) @ (3 x 3) -> (2 x 3)
attention_weights = F.softmax(attention_scores, dim=-1)  # normalise over the key axis
context = attention_weights @ values  # (2 x 3) @ (3 x 2) -> (2 x 2)
context

tensor([[6.0331, 3.9669],
        [5.0000, 5.0000]])