# 1. Simplified self-attention
To compute the "attention score" between the input x[1] and the query x[2], take the dot product of their embeddings:
<div align="center">
Attention_score = Dot_product(x[1], x[2])
</div>

<div align="center">
    <img src="images/books/chap_3/attention_score_example.png">
</div>

In [None]:
import torch

# Toy embeddings: one 3-dimensional row vector per token of the sentence
# "Your journey starts with one step".
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # Your     (x^1)
    [0.55, 0.87, 0.66],  # journey  (x^2)
    [0.57, 0.85, 0.64],  # starts   (x^3)
    [0.22, 0.58, 0.33],  # with     (x^4)
    [0.77, 0.25, 0.10],  # one      (x^5)
    [0.05, 0.80, 0.55],  # step     (x^6)
])

Take "journey" as query x[2]

In [None]:
# Use the second token ("journey") as the query.
query = inputs[1]
print(query) # [0.55, 0.87, 0.66]

# Score every token against the query with a dot product, one score per row.
attn_scores_2 = torch.stack(
    [torch.dot(token_vec, query) for token_vec in inputs]
)
print(attn_scores_2)

Then normalize the computed attention scores by dividing each score by the sum of all scores:
<div align="center">
Normalized_attention_score[i] = attention_score[i] / sum(attention_score)
</div>

<div align="center">
    <img src="images/books/chap_3/normalize_attention_score.png">
</div>

In [None]:
# Simple normalization: divide every score by the sum of all scores.
attn_weights_2_tmp = torch.div(attn_scores_2, attn_scores_2.sum())
print("Attention weights:", attn_weights_2_tmp)
print("Sum:", torch.sum(attn_weights_2_tmp))

Alternatively, we can use softmax to normalize the attention scores:
<div align="center">
Normalized_attention_score = softmax(attention_score)
</div>

In [None]:
# Hand-written softmax: exponentiation makes every weight positive.
def softmax_naive(x):
    """Return the softmax of ``x`` taken along dimension 0.

    The maximum along dimension 0 is subtracted before exponentiating.
    This leaves the mathematical result unchanged but keeps ``exp`` from
    overflowing to ``inf`` (and the result from becoming ``nan``) when the
    scores are large.

    Args:
        x: Tensor of scores; normalization runs over its first dimension.

    Returns:
        Tensor of the same shape as ``x`` whose entries along dimension 0
        are positive and sum to 1.
    """
    if x.numel() == 0:
        # Nothing to normalize; max() is undefined on an empty dimension.
        return torch.exp(x)
    shifted = x - x.max(dim=0, keepdim=True).values
    exp_x = torch.exp(shifted)  # computed once and reused below
    return exp_x / exp_x.sum(dim=0)
# Normalize the attention scores with the hand-written softmax.
attn_weights_2_naive = softmax_naive(x=attn_scores_2)
print("Attention weights:", attn_weights_2_naive)
print("Sum:", torch.sum(attn_weights_2_naive))

In [None]:
# Same normalization, using PyTorch's built-in softmax.
attn_weights_2 = attn_scores_2.softmax(dim=0)
print("Attention weights:", attn_weights_2)
print("Sum:", torch.sum(attn_weights_2))

<div align="center">
    <img src="images/books/chap_3/multiply_attention_score.png">
</div>

After obtaining the normalized attention scores (attention weights) for x[2], multiply each weight by its corresponding input vector and sum the results to get the context vector z[2] of x[2].

In [None]:
# Context vector z[2]: attention-weighted sum of all input vectors.
query = inputs[1]  # the 2nd input token is the query
context_vec_2 = torch.zeros(query.shape)
for weight, token_vec in zip(attn_weights_2, inputs):
    context_vec_2 += weight * token_vec
print(context_vec_2)

Now apply the same steps to compute the attention scores for all inputs.

<table>
  <tr>
    <td><img src="images/books/chap_3/Screenshot 2025-09-19 232100.png" alt="Image 1" style="max-width:100%; height:auto;"></td>
    <td><img src="images/books/chap_3/Screenshot 2025-09-19 232341.png" alt="Image 2" style="max-width:100%; height:auto;"></td>
  </tr>
</table>

In [None]:
# STEP 1: Compute attention scores
# Explicit version: dot product of every pair of input vectors.
# The score matrix is sized from the data rather than hard-coded to 6 x 6,
# so the cell keeps working if the number of tokens changes.
num_tokens = inputs.shape[0]
attn_scores = torch.empty(num_tokens, num_tokens)
for i, x_i in enumerate(inputs):
    for j, x_j in enumerate(inputs):
        attn_scores[i, j] = torch.dot(x_i, x_j)
#  OR
# Equivalent matrix form: one matmul produces all pairwise dot products.
attn_scores = inputs @ inputs.T
print(attn_scores)

In [None]:
# STEP 2: Normalize attention scores
# Softmax over the last dimension normalizes each row of scores separately.
attn_weights = attn_scores.softmax(dim=-1)
print(attn_weights)

In [None]:
# STEP 3: Compute context vector
# Show both shapes first so the matrix product below is easy to follow.
print(attn_weights.shape, inputs.shape)
# Each output row is the attention-weighted sum of all input rows.
all_context_vecs = torch.matmul(attn_weights, inputs)
print(all_context_vecs)

# 2. Self-attention
In self-attention, three new weight matrices are introduced: Query (Q), Key (K), and Value (V). They are used to project each input embedding into query, key, and value vectors.
<table>
  <tr>
    <td><img src="images/books/chap_3/Screenshot 2025-09-20 010042.png" alt="Image 1" style="max-width:100%; height:auto;"></td>
    <td><img src="images/books/chap_3/Screenshot 2025-09-20 014032.png" alt="Image 2" style="max-width:100%; height:auto;"></td>
  </tr>
</table>

In [None]:
torch.manual_seed(123)

x_2 = inputs[1]         # second input token
d_in = inputs.shape[1]  # input embedding size
d_out = 2               # size of the projected vectors

# Projection matrices for queries, keys and values. They are drawn in that
# order so the seeded random stream is consumed exactly as before.
W_query, W_key, W_value = (
    torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
    for _ in range(3)
)

# Query, key and value vectors for the single token x_2.
query_2 = torch.matmul(x_2, W_query)
key_2 = torch.matmul(x_2, W_key)
value_2 = torch.matmul(x_2, W_value)
print(query_2)

# Keys and values for every input token.
keys = torch.matmul(inputs, W_key)
values = torch.matmul(inputs, W_value)
print("keys.shape:", keys.shape)
print("values.shape:", values.shape)

In [None]:
# STEP 1: Attention scores of query 2 against every key
attn_scores_2 = torch.matmul(query_2, keys.T)

# STEP 2: Scale by sqrt(d_k), then softmax to get the attention weights
d_k = keys.shape[-1]
attn_weights_2 = (attn_scores_2 / d_k**0.5).softmax(dim=-1)
print(attn_weights_2)

# STEP 3: Context vector = attention-weighted sum of the values
context_vec_2 = torch.matmul(attn_weights_2, values)

In [None]:
import torch.nn as nn
class SelfAttention_v1(nn.Module):
    """Scaled dot-product self-attention with raw ``nn.Parameter`` weights.

    Args:
        d_in: Size of each input token embedding.
        d_out: Size of the projected query, key and value vectors.
    """

    def __init__(self, d_in: int, d_out: int):
        super().__init__()
        self.d_out = d_out
        # Stored as (d_in, d_out) so that ``x @ W`` projects the input.
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute one context vector per input token.

        Args:
            x: Tensor of shape ``(num_tokens, d_in)``, optionally with
                leading batch dimensions ``(..., num_tokens, d_in)``.

        Returns:
            Tensor of shape ``(..., num_tokens, d_out)``.
        """
        keys = x @ self.W_key
        queries = x @ self.W_query
        values = x @ self.W_value
        # Swap only the last two dims: unlike ``.T`` this also works when
        # ``x`` carries leading batch dimensions.
        attn_scores = queries @ keys.transpose(-2, -1)  # omega
        # Scale by sqrt(d_k) before the softmax over the key dimension.
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1] ** 0.5, dim=-1
        )
        context_vec = attn_weights @ values
        return context_vec

# nn.Linear stores its weight matrix in transposed form:
# nn.Parameter(torch.rand(2, 3)) -> shape: (2, 3)
# nn.Linear(2, 3).weight         -> shape: (3, 2)
class SelfAttention_v2(nn.Module):
    """Scaled dot-product self-attention using ``nn.Linear`` projections.

    Args:
        d_in: Size of each input token embedding.
        d_out: Size of the projected query, key and value vectors.
        qkv_bias: Whether the query/key/value projections include a bias.
    """

    def __init__(self, d_in: int, d_out: int, qkv_bias: bool = False):
        super().__init__()
        self.d_out = d_out
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute one context vector per input token.

        Args:
            x: Tensor of shape ``(num_tokens, d_in)``, optionally with
                leading batch dimensions ``(..., num_tokens, d_in)``.

        Returns:
            Tensor of shape ``(..., num_tokens, d_out)``.
        """
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)
        # Swap only the last two dims: unlike ``.T`` this also works when
        # ``x`` carries leading batch dimensions.
        attn_scores = queries @ keys.transpose(-2, -1)
        # Scale by sqrt(d_k) before the softmax over the key dimension.
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1] ** 0.5, dim=-1
        )
        context_vec = attn_weights @ values
        return context_vec

# 3. Hidden future words
Use masking to hide the next (future) words.

<table>
  <tr>
    <td><img src="images/books/chap_3/Screenshot 2025-09-20 014622.png" alt="Image 1" style="max-width:100%; height:auto;"></td>
    <td><img src="images/books/chap_3/Screenshot 2025-09-20 014839.png" alt="Image 2" style="max-width:100%; height:auto;"></td>
  </tr>
</table>