In [None]:
import numpy as np

class SelfAttention:
    """Single-head scaled dot-product self-attention with random projections.

    Parameters
    ----------
    embedding_size : int
        Size of the last axis of the inputs given to ``attention_output``.
    d_k : int, optional
        Width of the query/key/value projections. Defaults to 64.
    init_scale : float, optional
        Multiplier applied to the standard-normal initial weights. The default
        of 1.0 leaves the weights unscaled. With unscaled weights and a large
        ``embedding_size`` the attention scores become very large in magnitude,
        so each softmax row tends towards one-hot; a smaller value (for example
        0.01) keeps the attention weights softer.
    """

    def __init__(self, embedding_size, d_k=64, init_scale=1.0):
        self.d_k = d_k
        # Drawn in query, key, value order from the global NumPy RNG.
        self.w_q = np.random.randn(embedding_size, d_k) * init_scale
        self.w_k = np.random.randn(embedding_size, d_k) * init_scale
        self.w_v = np.random.randn(embedding_size, d_k) * init_scale

    def softmax(self, z):
        """Return the softmax of ``z`` along its last axis.

        The per-row maximum is subtracted before exponentiating so that large
        scores do not overflow ``np.exp``.
        """
        exp_z = np.exp(z - np.max(z, axis=-1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=-1, keepdims=True)

    def attention_output(self, X):
        """Apply self-attention to ``X``.

        Parameters
        ----------
        X : array_like, shape (..., seq_len, embedding_size)
            Input embeddings; converted to ``float64``. Leading batch axes are
            optional.

        Returns
        -------
        numpy.ndarray, shape (..., seq_len, d_k)
            For every position, the attention-weighted sum of the value vectors.
        """
        X = np.asarray(X, dtype=np.float64)

        q = X @ self.w_q    # (..., seq_len, d_k)
        k = X @ self.w_k    # (..., seq_len, d_k)
        v = X @ self.w_v    # (..., seq_len, d_k)

        # Swap only the last two axes so that batched input works as well;
        # for 2-D input this is the same as k.T.
        attention_scores = q @ np.swapaxes(k, -1, -2)                    # (..., seq_len, seq_len)
        # Scale by sqrt of the projection width actually used.
        scaled_attention_scores = attention_scores / np.sqrt(q.shape[-1])  # (..., seq_len, seq_len)
        attention_weights = self.softmax(scaled_attention_scores)        # rows sum to 1
        weighted_sum = attention_weights @ v                             # (..., seq_len, d_k)

        return weighted_sum

# Seed the global NumPy RNG so the random weights and inputs below (and hence
# the attention output) are the same on every Restart & Run All.
np.random.seed(0)

self_attention_layer = SelfAttention(embedding_size=512)

X = np.random.randn(100, 512)    # 100 rows of 512-dimensional embeddings
attention_output = self_attention_layer.attention_output(X)    # (100, 64)

In [11]:
# Show the attention output computed in the previous cell; NumPy abbreviates
# the middle rows and columns of a large array with "...".
print(attention_output)

[[  4.30778279  10.42221294 -36.50902976 ... -23.80877106  19.16665023
  -24.2314227 ]
 [ -6.3773666   -0.87922991   9.52779687 ...  28.72339927  16.35779464
    5.98561188]
 [-23.0373316   29.06103622  17.40501214 ...  24.24781546  -7.43527149
  -19.07495084]
 ...
 [-15.37574694 -12.04698284  -1.17155791 ...  28.19857678  24.95907417
  -25.05887593]
 [  4.30778279  10.42221294 -36.50902976 ... -23.80877106  19.16665023
  -24.2314227 ]
 [ 15.66345455  13.39679568   5.6871774  ...  24.70714439  11.84435309
   -6.18882984]]


Below is a rough implementation of the Encoder layer. It is not thorough, so please revise it. It also does not yet include the Add + LayerNorm step.

In [None]:
import numpy as np

# ---- Step 1: Self-Attention Layer ----
class SelfAttention:
    """Single-head scaled dot-product self-attention.

    The query, key and value projections are random matrices of shape
    ``(embedding_size, d_k)`` scaled by 0.01.
    """

    def __init__(self, embedding_size, d_k=64):
        """Store ``d_k`` and draw the three projection matrices."""
        self.d_k = d_k
        # The generator is consumed left to right, so the draws happen in
        # query, key, value order.
        self.W_q, self.W_k, self.W_v = (
            np.random.randn(embedding_size, d_k) * 0.01 for _ in range(3)
        )

    def softmax(self, z):
        """Softmax along the last axis, shifted by the row maximum for stability."""
        row_max = np.max(z, axis=-1, keepdims=True)
        unnormalised = np.exp(z - row_max)
        row_total = np.sum(unnormalised, axis=-1, keepdims=True)
        return unnormalised / row_total

    def forward(self, X):
        """Run attention over ``X`` and return ``(output, attention_weights)``.

        ``X`` is multiplied by each projection matrix; the key matrix is
        transposed with ``.T``, so a 2-D ``(seq_len, embedding_size)`` input
        yields an output of shape ``(seq_len, d_k)`` and weights of shape
        ``(seq_len, seq_len)``.
        """
        queries, keys, values = (
            X @ projection for projection in (self.W_q, self.W_k, self.W_v)
        )
        scale = np.sqrt(self.d_k)
        attn_weights = self.softmax((queries @ keys.T) / scale)
        return attn_weights @ values, attn_weights

# ---- Step 2: Linear Classifier ----
class Classifier:
    """Linear layer followed by a softmax over the classes.

    Parameters
    ----------
    input_dim : int
        Size of the last axis of the inputs given to ``forward``.
    num_classes : int
        Number of output classes.
    """

    def __init__(self, input_dim, num_classes):
        # Small random weights and a zero bias, drawn from the global NumPy RNG.
        self.W = np.random.randn(input_dim, num_classes) * 0.01
        self.b = np.zeros((1, num_classes))

    def softmax(self, z):
        """Return the softmax of ``z`` over its last (class) axis.

        Normalising over ``axis=-1`` rather than ``axis=1`` gives the same
        result for 2-D logits, and also handles 1-D logits and logits with
        extra leading axes, matching ``SelfAttention.softmax``.
        """
        exp_z = np.exp(z - np.max(z, axis=-1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=-1, keepdims=True)

    def forward(self, X):
        """Compute class probabilities for ``X``.

        Parameters
        ----------
        X : numpy.ndarray, shape (..., input_dim)

        Returns
        -------
        probs : numpy.ndarray, shape (..., num_classes)
            Softmax of the logits; each row sums to 1.
        logits : numpy.ndarray, shape (..., num_classes)
            ``X @ W + b`` before normalisation.
        """
        logits = X @ self.W + self.b
        probs = self.softmax(logits)
        return probs, logits

# ---- Step 3: Cross-Entropy Loss ----
def cross_entropy_loss(probs, y_true, eps=1e-12):
    """Mean negative log-likelihood of the true classes.

    Parameters
    ----------
    probs : array_like, shape (m, num_classes)
        Row ``i`` holds the predicted class probabilities for sample ``i``.
    y_true : array_like of int, shape (m,)
        Index of the correct class for each sample.
    eps : float, optional
        Lower bound applied to the selected probabilities before taking the
        log, so a predicted probability of exactly 0 gives a large finite
        loss instead of ``inf``. Probabilities of at least ``eps`` are
        unaffected.

    Returns
    -------
    float
        The average of ``-log(probs[i, y_true[i]])`` over the ``m`` samples.
    """
    probs = np.asarray(probs)
    y_true = np.asarray(y_true)
    m = y_true.shape[0]
    # Pick, for each row, the probability assigned to its true class.
    true_class_probs = probs[np.arange(m), y_true]
    log_likelihood = -np.log(np.clip(true_class_probs, eps, None))
    return np.sum(log_likelihood) / m

# ---- Step 4: Dummy Data ----
# Seed the global NumPy RNG so the data here and the random weights created in
# Step 5 are reproducible.
np.random.seed(42)
# NOTE(review): X was described as "10 samples", but the forward pass below
# attends across its rows and mean-pools them into a single vector, i.e. it
# treats the rows as 10 tokens of one sequence. Confirm which is intended.
X = np.random.randn(10, 512)     # 10 rows of 512-dim embeddings
y = np.random.randint(0, 3, 10)  # 10 integer labels in {0, 1, 2}; only y[0] is used below

# ---- Step 5: Forward Pass ----
attn = SelfAttention(embedding_size=512)
cls = Classifier(input_dim=64, num_classes=3)

# Get attention output (the attention weights are discarded)
attn_out, _ = attn.forward(X)          # (10, 64)
avg_out = np.mean(attn_out, axis=0, keepdims=True)  # mean over the 10 rows -> (1, 64)

# Classifier forward
probs, logits = cls.forward(avg_out)   # both (1, 3)

# Compute loss for the single pooled prediction against the first label only
loss = cross_entropy_loss(probs, np.array([y[0]]))
print("Initial Loss:", loss)
