Name: Pavan Kusampudi

700#: 700762366

In [1]:
import torch
import torch.nn.functional as F
import math

In [2]:
def scaled_dot_product_attention(Q, K, V):
    """
    Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Args:
        Q, K, V: query, key and value tensors, intended to be shaped
            (batch, seq_len, d_k). d_k is taken from the last dimension
            of Q, and K is transposed over its last two dimensions.

    Returns:
        A 4-tuple ``(output, attn_weights, scores, scaled_scores)``:
            output        -- attn_weights multiplied by V
            attn_weights  -- softmax of scaled_scores over the last dim
            scores        -- raw Q K^T products
            scaled_scores -- scores divided by sqrt(d_k)
        The two score tensors are returned so callers can inspect the
        effect of the scaling step.
    """
    key_dim = Q.shape[-1]

    # Raw similarity of every query with every key (Q K^T).
    raw = Q @ K.transpose(-2, -1)

    # Divide by sqrt(d_k) before the softmax.
    scaled = raw / math.sqrt(key_dim)

    # Normalise along the last dimension so each row sums to 1.
    weights = scaled.softmax(dim=-1)

    # Weighted combination of the value vectors.
    context = weights @ V

    return context, weights, raw, scaled


In [3]:
# Seed the RNG so that Restart Kernel -> Run All regenerates the same
# Q, K, V (and therefore the same printed results) on every run.
SEED = 0
torch.manual_seed(SEED)

# Problem size for the demo: one sequence of 5 tokens with 4-dim vectors.
batch = 1
seq_len = 5
d_k = 4

# random Q, K, V (deterministic given SEED)
Q = torch.randn(batch, seq_len, d_k)
K = torch.randn(batch, seq_len, d_k)
V = torch.randn(batch, seq_len, d_k)

# Keep the raw and scaled scores as well; a later cell compares them.
output, attn_weights, scores, scaled_scores = scaled_dot_product_attention(Q, K, V)


A. Attention Weight Matrix

In [9]:
# Each row is one query position's softmax distribution over the keys.
print(
    "Attention Weights (Softmax):\n",
    attn_weights,
)


Attention Weights (Softmax):
 tensor([[[0.1754, 0.0321, 0.6192, 0.0745, 0.0988],
         [0.4575, 0.0577, 0.3475, 0.0235, 0.1138],
         [0.0585, 0.2715, 0.0448, 0.5126, 0.1125],
         [0.3727, 0.1286, 0.2421, 0.1057, 0.1510],
         [0.5947, 0.2099, 0.0651, 0.0273, 0.1030]]])


B. Output Vectors

In [10]:
# Attention output: the attention weights multiplied by V.
print(
    "\nOutput Vectors:\n",
    output,
)



Output Vectors:
 tensor([[[-0.0563,  0.2386, -0.8007,  0.5123],
         [ 0.1545,  0.4383, -0.8074,  0.3550],
         [-0.1621,  0.4103,  0.0902, -0.5871],
         [ 0.1178,  0.3792, -0.6335,  0.1302],
         [ 0.1419,  0.5325, -0.5786,  0.0183]]])


C. Softmax Stability Check

In [11]:
# Compare the largest score magnitude before and after dividing by
# sqrt(d_k), i.e. the range of values that is fed into the softmax.
print("\nSoftmax Stability Check:")
print(
    "Max |raw scores| before scaling :",
    scores.abs().max().item(),
)
print(
    "Max |scaled scores| after scaling:",
    scaled_scores.abs().max().item(),
)



Softmax Stability Check:
Max |raw scores| before scaling : 3.8929433822631836
Max |scaled scores| after scaling: 1.9464716911315918
