<a href="https://colab.research.google.com/github/HarikrishnanK9/DataSymphony/blob/main/Multi_Head_Attention_From_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

In [6]:
class MultiHeadAttention:
    """
    Multi-head scaled dot-product attention implemented with NumPy.

    Parameters:
    num_hiddens: int
        Number of hidden units (size of the last axis of every input and
        of the output). Must be divisible by num_heads.
    num_heads: int
        Number of attention heads.
    dropout: float
        Dropout rate. Stored as ``self.dropout`` for reference only; this
        implementation never applies dropout to the attention weights.
    bias: bool
        Whether to include (randomly initialised) bias parameters in the
        projections. When False the biases are zero vectors.

    Raises:
    ValueError
        If num_heads is not positive or does not divide num_hiddens.
    """

    def __init__(self, num_hiddens, num_heads, dropout=0.0, bias=False):
        # Fail early with a clear message instead of a reshape error
        # deep inside transpose_qkv.
        if num_heads <= 0 or num_hiddens % num_heads != 0:
            raise ValueError(
                "num_hiddens must be divisible by a positive num_heads, "
                f"got num_hiddens={num_hiddens}, num_heads={num_heads}"
            )
        self.num_heads = num_heads
        self.num_hiddens = num_hiddens
        self.dropout = dropout
        self.d_k = self.d_v = num_hiddens // num_heads
        # Draw order (W_q, W_k, W_v, W_o, then biases) is kept stable so
        # that seeded runs produce the same parameters as before.
        self.W_q = np.random.rand(num_hiddens, num_hiddens)
        self.W_k = np.random.rand(num_hiddens, num_hiddens)
        self.W_v = np.random.rand(num_hiddens, num_hiddens)
        self.W_o = np.random.rand(num_hiddens, num_hiddens)
        if bias:
            self.b_q = np.random.rand(num_hiddens)
            self.b_k = np.random.rand(num_hiddens)
            self.b_v = np.random.rand(num_hiddens)
            self.b_o = np.random.rand(num_hiddens)
        else:
            # One array per bias: a chained assignment would make all four
            # names refer to the same buffer, so an in-place edit of one
            # would silently change the others.
            self.b_q = np.zeros(num_hiddens)
            self.b_k = np.zeros(num_hiddens)
            self.b_v = np.zeros(num_hiddens)
            self.b_o = np.zeros(num_hiddens)

    def transpose_qkv(self, X):
        """
        Split the hidden axis into heads and fold the heads into the batch.

        Parameters:
        X: np.ndarray
            Tensor of shape (batch, steps, num_hiddens).

        Returns:
        np.ndarray
            Tensor of shape (batch * num_heads, steps,
            num_hiddens / num_heads), ordered batch-major (all heads of
            batch element 0 first, then batch element 1, ...).
        """
        X = X.reshape(X.shape[0], X.shape[1], self.num_heads, -1)
        X = X.transpose(0, 2, 1, 3)
        return X.reshape(-1, X.shape[2], X.shape[3])

    def transpose_output(self, X):
        """
        Reverse transpose_qkv: unfold the heads and concatenate them.

        Parameters:
        X: np.ndarray
            Tensor of shape (batch * num_heads, steps, head_dim).

        Returns:
        np.ndarray
            Tensor of shape (batch, steps, num_heads * head_dim).
        """
        X = X.reshape(-1, self.num_heads, X.shape[1], X.shape[2])
        X = X.transpose(0, 2, 1, 3)
        return X.reshape(X.shape[0], X.shape[1], -1)

    def scaled_dot_product_attention(self, Q, K, V, valid_lens=None):
        """
        Scaled dot-product attention with optional key-length masking.

        Parameters:
        Q: np.ndarray
            Query tensor of shape (n, num_queries, d).
        K: np.ndarray
            Key tensor of shape (n, num_keys, d).
        V: np.ndarray
            Value tensor of shape (n, num_keys, d_v).
        valid_lens: np.ndarray or None
            Number of leading key positions that may be attended to.
            Shape (n,) applies one length to every query of a sequence;
            shape (n, num_queries) gives one length per query. None
            disables masking.

        Returns:
        np.ndarray
            Output tensor of shape (n, num_queries, d_v). Rows whose
            valid length is 0 attend to nothing and produce zeros.
        """
        d_k = Q.shape[-1]
        scores = np.matmul(Q, K.transpose(0, 2, 1)) / np.sqrt(d_k)
        if valid_lens is not None:
            valid_lens = np.asarray(valid_lens)
            key_positions = np.arange(scores.shape[-1])
            if valid_lens.ndim == 1:
                # (n, num_keys) -> (n, 1, num_keys): same mask for every query.
                mask = (key_positions < valid_lens[:, None])[:, None, :]
            else:
                # (n, num_queries, num_keys): one mask per query.
                mask = key_positions < valid_lens[:, :, None]
            scores = np.where(mask, scores, -np.inf)
        # The softmax runs whether or not a mask was applied.
        row_max = np.max(scores, axis=-1, keepdims=True)
        # A fully masked row has max == -inf; subtracting it would give
        # -inf - (-inf) = NaN, so shift such rows by 0 instead.
        row_max = np.where(np.isneginf(row_max), 0.0, row_max)
        attention_weights = np.exp(scores - row_max)
        denom = attention_weights.sum(axis=-1, keepdims=True)
        # Fully masked rows have denom == 0; leave their weights at zero
        # rather than dividing 0 by 0.
        attention_weights = np.divide(
            attention_weights,
            denom,
            out=np.zeros_like(attention_weights),
            where=denom > 0,
        )
        return np.matmul(attention_weights, V)

    def forward(self, queries, keys, values, valid_lens=None):
        """
        Forward pass of multi-head attention.

        Parameters:
        queries: np.ndarray
            Query tensor of shape (batch, num_queries, num_hiddens).
        keys: np.ndarray
            Key tensor of shape (batch, num_keys, num_hiddens).
        values: np.ndarray
            Value tensor of shape (batch, num_keys, num_hiddens).
        valid_lens: np.ndarray or None
            Valid key lengths of shape (batch,) or (batch, num_queries),
            or None to attend to every key.

        Returns:
        np.ndarray
            Output tensor of shape (batch, num_queries, num_hiddens).
        """
        queries = self.transpose_qkv(np.dot(queries, self.W_q) + self.b_q)
        keys = self.transpose_qkv(np.dot(keys, self.W_k) + self.b_k)
        values = self.transpose_qkv(np.dot(values, self.W_v) + self.b_v)
        if valid_lens is not None:
            # transpose_qkv lays heads out batch-major, so each batch
            # entry's length must be repeated num_heads times in a row.
            valid_lens = np.repeat(valid_lens, self.num_heads, axis=0)
        output = self.scaled_dot_product_attention(queries, keys, values, valid_lens)
        output_concat = self.transpose_output(output)
        return np.dot(output_concat, self.W_o) + self.b_o

# New Section

In [7]:
# Model configuration: hidden width and number of attention heads
num_hiddens = 100
num_heads = 5
attention = MultiHeadAttention(num_hiddens, num_heads, dropout=0.5, bias=False)
# Sizes of the toy problem and, per batch element, how many leading
# key positions may be attended to
batch_size = 2
num_queries = 4
num_kvpairs = 6
valid_lens = np.array([3, 2])
print(valid_lens)

[3 2]


In [9]:
# Random stand-in for the input queries
X = np.random.random_sample(size=(batch_size, num_queries, num_hiddens))
# Random stand-in for the key-value pairs
Y = np.random.random_sample(size=(batch_size, num_kvpairs, num_hiddens))
print("Query data shape:", X.shape)
print("Key-value data shape:", Y.shape)

Query data shape: (2, 4, 100)
Key-value data shape: (2, 6, 100)


In [11]:
# Run the attention layer; Y serves as both the keys and the values
output = attention.forward(queries=X, keys=Y, values=Y, valid_lens=valid_lens)
# Expected shape: (batch_size, num_queries, num_hiddens)
print("Output shape:", output.shape)
# Show the output vector for the first query of the first batch element
print(output[0, 0])

Output shape: (2, 4, 100)
[1352.90077426 1253.23139856 1556.6128199  1513.80032417 1305.43769393
 1365.82303938 1512.2497699  1529.52131061 1512.74313702 1443.56795073
 1451.7070232  1446.87191905 1422.55158685 1370.9700987  1256.41455512
 1335.76729195 1367.02833798 1409.54372477 1550.95294193 1262.36758339
 1387.6066865  1338.37699346 1515.92284845 1458.18847239 1478.65040157
 1306.09947677 1291.69807397 1303.39347217 1295.94267472 1416.55255069
 1257.38662215 1257.33406508 1393.12622816 1428.17794078 1302.9198136
 1364.64700271 1443.89411916 1426.6393788  1367.66520473 1309.34922966
 1483.18764286 1383.47380866 1441.52139262 1405.62452993 1231.76690432
 1345.83482005 1357.48128796 1297.79420729 1421.10264642 1390.83237925
 1478.86977488 1347.31935295 1500.22107762 1594.54308112 1588.83641395
 1336.54909021 1390.43472962 1305.64747901 1531.57811718 1287.63190405
 1372.14942575 1290.28548264 1487.52634649 1313.54870997 1341.55787612
 1343.99477134 1451.44780452 1483.40890087 1453.6412