In [None]:
import math
import numpy as np

In [10]:
# HYPERPARAMETERS:
# - sequence_length: the number of tokens in the sentence.
# - embedding_dimension: the size of each token vector, i.e. each word is described by 8 traits.
#   These vectors are stored in the token embeddings.
# - head_dimension: the size of the vectors inside one attention head. Attention can use
#   multiple heads side by side, which is like having parallel brains.
# NOTE(review): vocabulary_size, pad_token_index and batch_size are not referenced by the
# cells visible in this part of the notebook - confirm they are used further down.

vocabulary_size = 100
pad_token_index = 0
batch_size = 2
sequence_length = 4
embedding_dimension = 8
head_dimension = 4

# Seed NumPy's global random generator so the random matrices drawn after this
# point come out the same on every run.
np.random.seed(0)

In [11]:
# TOKEN EMBEDDINGS:
# One row per token and one column per trait: shape (sequence_length, embedding_dimension).
# The values are random draws from a standard normal distribution (np.random.randn).
token_embeddings = np.random.randn(sequence_length, embedding_dimension)

In [12]:
# CREATING 3 DIFFERENT PERSPECTIVES FOR EACH TOKEN
# Query, Key, Value:
# These are weight matrices. In this notebook they are random, but in a transformer model
# they are learned during training.
# Each has shape (embedding_dimension, head_dimension). They act like linear layers
# (y = x @ W) that project the embeddings into a new space.
# Query (Q): What am I looking for?
# Key (K): What can I offer?
# Value (V): What is my actual content?

# Why three? Because in conversations:
# You ask questions (Q)
# Others have answers (K)
# Everyone has their own story (V)

# NOTE: the three randn calls below consume the global random stream in this order
# (W_q, then W_k, then W_v); reordering them would change the generated values.

# Each word becomes a question asker
W_q = np.random.randn(embedding_dimension, head_dimension)
# Each word becomes an answer giver
W_k = np.random.randn(embedding_dimension, head_dimension)
# Each word carries its actual content
W_v = np.random.randn(embedding_dimension, head_dimension)
# This is like asking, for each word:
# - its query vector: what does it want from others?
# - its key vector: what can it offer?
# - its value vector: what useful information does it hold?

# Why in attention?
# We compare queries to keys to find matches, like a search engine.
# Values are what we collect from the good matches.
# Projecting with a separate W lets the model learn different aspects:
# Q/K for similarity, V for the actual content.

# Project every token embedding into the three spaces; each result has one row per token.
Q = token_embeddings @ W_q
K = token_embeddings @ W_k
V = token_embeddings @ W_v


In [13]:
# ATTENTION SCORES:
# Measures how much each token should attend to every other token, including itself;
# attending to tokens of the same sequence is what makes this *self*-attention.
#
# The score is the dot product between one token's query and another token's key:
# word 1 asks with Q1, word 2 answers with K2, and the product is their match score.
# A higher score means the two "understand each other" well.
#
# The dot products are divided by sqrt(head_dimension) - the length of each query/key
# vector, not the number of heads - to keep the scores from getting too big.
scale = np.sqrt(head_dimension)
attention_scores = np.matmul(Q, K.T) / scale

# Intuition:
#   Word 1 query: "I like cats"
#   Word 2 key:   "I have cat stories"
#   Word 3 key:   "I like dogs"
#   Word 4 key:   "The weather is nice"
#
# Example scores: word 1 with word 2 = 0.9 (high), word 1 with word 3 = 0.5,
# word 1 with word 4 = 0.1.

In [14]:
# ATTENTION WEIGHTS:
# Here the raw scores are turned into probabilities, like saying "pay X% of your attention here".
def softmax(x, axis=-1):
    """Return the softmax of `x` along `axis`.

    The maximum along `axis` is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps `np.exp` from overflowing on
    large inputs.

    Args:
        x: Array-like of scores.
        axis: Axis along which the outputs should sum to 1 (default: last axis).

    Returns:
        Array with the same shape as `x` whose entries along `axis` sum to 1.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    numerators = np.exp(shifted)
    denominators = np.sum(numerators, axis=axis, keepdims=True)
    return numerators / denominators

# Normalise along the last axis so that each row of attention_weights sums to 1.
attention_weights = softmax(attention_scores, axis=-1)

# Softmax is differentiable, and it focuses on the high scores while all but ignoring the low ones.

In [15]:
# CREATING NEW UNDERSTANDING:
# Each word builds a new version of itself by mixing in the information (values)
# of the other words, weighted by how much attention it pays to each of them.
# This is how the model learns contextual meaning: every word becomes "smarter"
# by taking its neighbours into account.
# Example: the word "bank" can pick up meaning from "account" in "bank account".
attention_output = np.matmul(attention_weights, V)

In [None]:
# Sources: 
# https://medium.com/%40wangdk93/multihead-attention-from-scratch-6fd6f99b9651

In [17]:
# Report cell: prints the shape (and, where useful, the values) of every
# intermediate array of the single-head self-attention computation.

# Show arrays with 3 decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True)

# Header banner.
print("=" * 60, "SELF-ATTENTION MECHANISM - SHAPES AND VALUES", "=" * 60, sep="\n")

# 1. Input embeddings.
print(
    "\n1. TOKEN EMBEDDINGS:",
    f" Shape: {token_embeddings.shape} → (sequence_length={sequence_length}, embedding_dim={embedding_dimension})",
    f"Values:\n{token_embeddings}\n",
    sep="\n",
)

# 2. Projection matrices (all three share one shape, so only W_q is inspected).
print(
    "2. WEIGHT MATRICES:",
    f"W_q, W_k, W_v shape: {W_q.shape} → (embedding_dim={embedding_dimension}, head_dim={head_dimension})",
    sep="\n",
)

# 3. Projected queries (K and V have the same shape as Q).
print(
    "\n3. QUERY, KEY, VALUE PROJECTIONS:",
    f"Q, K, V shape: {Q.shape} → (sequence_length={sequence_length}, head_dim={head_dimension})",
    f"Q (what each token looks for):\n{Q}\n",
    sep="\n",
)

# 4. Scaled dot-product scores.
print(
    "4. ATTENTION SCORES (before softmax):",
    f"Shape: {attention_scores.shape} → (seq_len × seq_len)",
    f"Values:\n{attention_scores}\n",
    sep="\n",
)

# 5. Softmax-normalised weights, plus a row-sum sanity check.
print(
    "5. ATTENTION WEIGHTS (after softmax):",
    f"Shape: {attention_weights.shape}",
    f"Values (each row sums to 1.0):\n{attention_weights}",
    f"Row sums (verification): {attention_weights.sum(axis=-1)}\n",
    sep="\n",
)

# 6. Final context-aware token representations, then the closing banner.
print(
    "6. ATTENTION OUTPUT:",
    f"Shape: {attention_output.shape} → (sequence_length={sequence_length}, head_dim={head_dimension})",
    f"Values (context-aware representations):\n{attention_output}",
    "=" * 60,
    sep="\n",
)

# Provenance: this reporting code came from Qwen Max, used for better clarity and visibility.

SELF-ATTENTION MECHANISM - SHAPES AND VALUES

1. TOKEN EMBEDDINGS:
 Shape: (4, 8) → (sequence_length=4, embedding_dim=8)
Values:
[[ 1.764  0.4    0.979  2.241  1.868 -0.977  0.95  -0.151]
 [-0.103  0.411  0.144  1.454  0.761  0.122  0.444  0.334]
 [ 1.494 -0.205  0.313 -0.854 -2.553  0.654  0.864 -0.742]
 [ 2.27  -1.454  0.046 -0.187  1.533  1.469  0.155  0.378]]

2. WEIGHT MATRICES:
W_q, W_k, W_v shape: (8, 4) → (embedding_dim=8, head_dim=4)

3. QUERY, KEY, VALUE PROJECTIONS:
Q, K, V shape: (4, 4) → (sequence_length=4, head_dim=4)
Q (what each token looks for):
[[-5.592 -4.286 -7.37   4.027]
 [-1.781 -0.435 -3.429  0.881]
 [ 2.871 -2.977  2.418  0.502]
 [-7.225 -8.378 -1.89   1.252]]

4. ATTENTION SCORES (before softmax):
Shape: (4, 4) → (seq_len × seq_len)
Values:
[[ 31.588   2.933  12.022  -9.08 ]
 [ 11.106   1.093   3.98    0.299]
 [ -8.946  -4.35    5.684  -5.326]
 [ 33.206   5.506  -4.183 -22.69 ]]

5. ATTENTION WEIGHTS (after softmax):
Shape: (4, 4)
Values (each row sums to 1.0)

<h2><center>Multi-Head Self-Attention and Masking from Scratch</center></h2>

In [None]:
# Here I am going to build multi-head attention from scratch. A multi-head self-attention mechanism is like having many brains.
# From what I understand, it helps the model focus on different parts of the input in parallel.
# In single-head attention there is one set of Q, K, V per token; here there will be multiple sets of Q, K, V per token.
# In my LSTM time-series work I tried to forecast a multivariate series (zone1, zone2, zone3), and zone3 gave bad results.
# While looking on the internet for ways to handle that issue, multi-head attention was suggested.

In [None]:
# I don't need to define the hyperparameters again; they are already defined above.
# Just one small addition: the per-head dimension used by the multi-head version.
# NOTE(review): the original note here said "each head gets 4 traits to work with",
# but with head_dimension = 4 (as set in the hyperparameters cell) this expression
# evaluates to 2. Confirm which is intended - embedding_dimension // 2 would give 4.
multi_head_dimension = head_dimension // 2
# Re-seed the global generator so the random draws that follow are reproducible.
np.random.seed(0)
