In [1]:
import numpy as np

np.set_printoptions(suppress=True, precision=3)

# 1. A toy "sentence" made of 3 token embeddings
seq_len, d_model = 3, 4  # number of words, embedding dimension

# Seed the global NumPy RNG immediately before drawing so this cell yields
# the same embeddings every time it is executed.
np.random.seed(0)
E = np.random.randn(seq_len, d_model)  # (3, 4)
print("Embeddings E (seq_len x d_model):", E, sep="\n")
print("Shape:", E.shape, "\n")

Embeddings E (seq_len x d_model):
[[ 1.764  0.4    0.979  2.241]
 [ 1.868 -0.977  0.95  -0.151]
 [-0.103  0.411  0.144  1.454]]
Shape: (3, 4) 



In [2]:
# Query/key width and value width are both set to the embedding size.
d_k = d_model
d_v = d_model

In [3]:
# Randomly initialised projection matrices for queries, keys and values.
# The generator is consumed left to right, so the three draws are taken from
# NumPy's global random stream in the order W_Q, W_K, W_V.
W_Q, W_K, W_V = (
    np.random.randn(d_model, width) for width in (d_k, d_k, d_v)
)

In [4]:
# Project the embeddings into query, key and value space.
# Q and K are (seq_len, d_k); V is (seq_len, d_v).
Q, K, V = E @ W_Q, E @ W_K, E @ W_V

In [5]:
# Print each projected matrix followed by its shape.
for _header, _matrix in (
    ("Q (queries):\n", Q),
    ("K (keys):\n", K),
    ("V (values):\n", V),
):
    print(_header, _matrix, "\nShape:", _matrix.shape, "\n")

Q (queries):
 [[ 4.528 -2.487  1.857 -0.899]
 [-2.808  1.269  1.337  0.781]
 [ 3.468 -2.118  0.274 -0.764]] 
Shape: (3, 4) 

K (keys):
 [[ 1.203 -0.206 -4.069  4.805]
 [ 5.058  6.037  0.52  -0.029]
 [-1.87  -2.857 -2.696  2.819]] 
Shape: (3, 4) 

V (values):
 [[-1.896 -1.336 -4.017  1.133]
 [ 0.13  -1.778 -1.395  1.536]
 [-0.587  0.228 -1.165 -0.387]] 
Shape: (3, 4) 



In [6]:
# Scaled dot-product scores: every query row is compared with every key row,
# and the raw dot products are divided by sqrt(d_k).
scores = Q @ K.T / np.sqrt(d_k)  # (seq_len, seq_len)
# The label reads "Skalade" (scaled) because the sqrt(d_k) division has
# already been applied to the values printed here.
print("Skalade attention-scorer (QK^T / sqrt(d_k)):\n", scores, "\n")

Oskalade attention-scorer (QK^T / sqrt(d_k)):
 [[-2.958  4.439 -4.452]
 [-2.664 -2.935  0.111]
 [-0.089  2.46  -1.664]] 



In [7]:
def softmax(x, axis=-1):
    """Return the softmax of ``x`` taken along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating, so the
    largest exponent is ``exp(0)``; this leaves the result unchanged while
    avoiding overflow for large inputs.

    Parameters
    ----------
    x : array_like
        Input scores.
    axis : int, optional
        Axis along which the values are normalised (default: last axis).

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose entries along ``axis`` sum to 1.
    """
    peak = np.max(x, axis=axis, keepdims=True)
    weights = np.exp(x - peak)
    normaliser = np.sum(weights, axis=axis, keepdims=True)
    return weights / normaliser

In [8]:
# Turn every row of scores into weights that sum to 1.
A = softmax(scores, axis=-1)  # (seq_len, seq_len)
print(
    "Attention-vikter (softmax på varje rad):\n",
    A,
    "\n",
)
# Sanity check: print the per-row totals.
print("Varje rad summerar till:", np.sum(A, axis=-1), "\n")

Attention-vikter (softmax på varje rad):
 [[0.001 0.999 0.   ]
 [0.056 0.043 0.901]
 [0.071 0.914 0.015]] 

Varje rad summerar till: [1. 1. 1.] 



In [9]:
# Attention output: each row is the attention-weighted combination of the rows of V.
O = np.matmul(A, V)  # (seq_len, d_v)

In [10]:
# Bare trailing expression: the notebook displays the attention output O as this cell's result.
O

array([[ 0.129, -1.777, -1.397,  1.535],
       [-0.63 ,  0.054, -1.335, -0.219],
       [-0.025, -1.716, -1.579,  1.479]])

In [12]:
import torch

# Report whether CUDA is usable and, if it is, describe device 0.
has_cuda = torch.cuda.is_available()
print("GPU available:", has_cuda)
if has_cuda:
    bytes_per_gib = 1024**3  # converts the byte counts below to the "GB" figure printed
    print("GPU name:", torch.cuda.get_device_name(0))
    print("Memory allocated:", torch.cuda.memory_allocated(0) / bytes_per_gib, "GB")
    print("Memory reserved:", torch.cuda.memory_reserved(0) / bytes_per_gib, "GB")


GPU available: False
