# **1. Install Necessary Libraries**

In [None]:
!pip install numpy torch seaborn matplotlib nltk

# **2. Define Self-Attention Mechanism**

In [None]:
import torch
import torch.nn.functional as F

class SelfAttention(torch.nn.Module):
    """Multi-head scaled dot-product attention.

    The embedding axis is split into ``heads`` chunks of ``head_dim``
    features. Each chunk is projected to values/keys/queries, attention
    weights are computed per head, and the per-head results are concatenated
    and passed through a final linear layer.

    Args:
        embed_size: Size of the embedding (last) axis of the inputs.
        heads: Number of attention heads; must divide ``embed_size`` evenly.

    Raises:
        AssertionError: If ``embed_size`` is not divisible by ``heads``.
    """

    def __init__(self, embed_size, heads):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size must be divisible by heads"

        # The projections act on the trailing head_dim axis, so one set of
        # weights is shared by every head.
        self.values = torch.nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = torch.nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = torch.nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = torch.nn.Linear(heads * self.head_dim, embed_size)

        # NOTE: this layer is not applied anywhere in forward(); the attribute
        # is kept so existing code that references it keeps working.
        self.dropout = torch.nn.Dropout(0.1)

        # sqrt(head_dim) divisor for the attention logits. Registered as a
        # buffer so it follows the module through .to()/.cuda()/.double();
        # a plain tensor attribute would stay on the CPU in float32.
        # persistent=False keeps it out of state_dict, so checkpoints saved
        # before this change still load.
        self.register_buffer(
            "scale",
            torch.sqrt(torch.tensor([float(self.head_dim)])),
            persistent=False,
        )

    def forward(self, values, keys, query, mask):
        """Compute attention output and attention weights.

        Args:
            values: Tensor reshapeable to ``(N, value_len, heads, head_dim)``,
                i.e. ``(N, value_len, embed_size)``.
            keys: Tensor of shape ``(N, key_len, embed_size)``; ``key_len``
                must equal ``value_len`` for the weighted sum to be defined.
            query: Tensor of shape ``(N, query_len, embed_size)``.
            mask: ``None`` or a tensor broadcastable to
                ``(N, heads, query_len, key_len)``. Entries equal to 1 have
                ``1e10`` subtracted from their logit, which drives their
                attention weight to ~0; entries equal to 0 are unaffected.

        Returns:
            A tuple ``(out, attention)`` where ``out`` has shape
            ``(N, query_len, embed_size)`` and ``attention`` has shape
            ``(N, heads, query_len, key_len)`` with rows summing to 1 over
            the last axis.
        """
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding axis into (heads, head_dim), then project.
        values = self.values(values.reshape(N, value_len, self.heads, self.head_dim))
        keys = self.keys(keys.reshape(N, key_len, self.heads, self.head_dim))
        queries = self.queries(query.reshape(N, query_len, self.heads, self.head_dim))

        # Dot product of every query with every key, per batch item and head.
        energy = torch.einsum("nqhd,nkhd->nhqk", queries, keys) / self.scale
        if mask is not None:
            # Large negative offset on masked positions (mask == 1).
            energy -= 1e10 * mask

        # Normalise over the key axis.
        attention = F.softmax(energy, dim=3)

        # Weighted sum of values, then merge the heads back into one axis.
        out = torch.einsum("nhql,nlhd->nqhd", attention, values).reshape(
            N, query_len, self.heads * self.head_dim
        )

        return self.fc_out(out), attention

# **3. Prepare Sample Data**



In [None]:
# Model hyper-parameters: 16-dimensional embeddings split across 4 heads.
heads = 4
embed_size = 16

# Demo input, tokenised by splitting on whitespace.
sentence = "The cat sat on the mat"
tokens = sentence.split()
seq_length = len(tokens)

def create_random_embeddings(tokens, embed_size):
    """Return one random embedding row per token.

    Args:
        tokens: Sized collection of tokens; only its length is used.
        embed_size: Number of features per embedding.

    Returns:
        Tensor of shape ``(len(tokens), embed_size)`` drawn from
        ``torch.rand`` (uniform on ``[0, 1)``).
    """
    num_tokens = len(tokens)
    return torch.rand(num_tokens, embed_size)

# Self-attention: values, keys and queries all start from the same
# embeddings. A leading batch axis of size 1 is added.
values = torch.unsqueeze(create_random_embeddings(tokens, embed_size), 0)
keys, queries = values.clone(), values.clone()

# No positions are masked out.
mask = None

# **4. Run Self-Attention**

In [None]:
# Build the attention layer and run it once on the sample inputs.
self_attention = SelfAttention(embed_size=embed_size, heads=heads)

# The module returns the projected output and the per-head attention weights.
output, attention_scores = self_attention(
    values=values,
    keys=keys,
    query=queries,
    mask=mask,
)

# **5. Create Word Labels and Plot Heatmap**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One tick label per token, used on both heatmap axes.
word_labels = tokens

# Keep only batch item 0, head 0 and convert it to NumPy for seaborn.
# .cpu() is a no-op for CPU tensors and makes the conversion work when the
# scores live on another device (Tensor.numpy() only accepts CPU tensors).
# NOTE: this rebinds `attention_scores` to an ndarray, so re-running this
# cell requires re-running the attention cell first.
attention_scores = attention_scores[0, 0].detach().cpu().numpy()

plt.figure(figsize=(8, 6))
sns.heatmap(
    attention_scores,
    cmap="viridis",
    annot=True,
    fmt=".2f",
    xticklabels=word_labels,
    yticklabels=word_labels
)
plt.title("Self-Attention Scores Heatmap")
# Rows index queries and columns index keys (attention is laid out as
# [batch, head, query, key]).
plt.xlabel("Keys (Words)")
plt.ylabel("Queries (Words)")
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.show()