In [None]:
!pip install torch


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math


In [None]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(q @ k^T / temperature) @ v``.

    Args:
        temperature: Divisor applied to ``q`` before the dot product.
        attn_dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k`` / ``v``.

        Args:
            q: Queries, shape ``(..., len_q, d_k)``.
            k: Keys, shape ``(..., len_k, d_k)``.
            v: Values, shape ``(..., len_k, d_v)``.
            mask: Optional tensor broadcastable to ``(..., len_q, len_k)``.
                Positions where ``mask == 0`` receive zero attention weight.

        Returns:
            Tuple ``(output, attn)``: ``output`` has shape
            ``(..., len_q, d_v)`` and ``attn`` holds the (post-dropout)
            attention weights of shape ``(..., len_q, len_k)``.
        """
        # Swap the last two axes instead of hard-coding (2, 3) so that both
        # (batch, len, dim) and (batch, head, len, dim) inputs are accepted.
        attn = torch.matmul(q / self.temperature, k.transpose(-2, -1))

        if mask is not None:
            # Fill with the most negative finite value of the score dtype:
            # a literal -1e9 is not representable in float16.
            attn = attn.masked_fill(mask == 0, torch.finfo(attn.dtype).min)

        attn = torch.softmax(attn, dim=-1)
        attn = self.dropout(attn)
        output = torch.matmul(attn, v)

        return output, attn

class MultiHeadAttention(nn.Module):
    """Multi-head attention sublayer with output projection, residual and LayerNorm.

    Args:
        n_head: Number of attention heads.
        d_model: Size of the last dimension of the inputs and of the output.
        d_k: Per-head size of the query/key projections.
        d_v: Per-head size of the value projection.
        dropout: Dropout probability applied to the projected output.
    """

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False)
        self.fc = nn.Linear(n_head * d_v, d_model, bias=False)

        self.attention = ScaledDotProductAttention(temperature=d_k ** 0.5)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, q, k, v, mask=None):
        """Run multi-head attention and add the result to ``q`` (post-norm).

        Args:
            q: Queries, shape ``(batch, len_q, d_model)``.
            k: Keys, shape ``(batch, len_k, d_model)``.
            v: Values, shape ``(batch, len_v, d_model)``.
            mask: Optional mask; zeros mark key positions to ignore. A 3-D
                mask is interpreted as batch-first (``(batch, 1, len_k)`` or
                ``(batch, len_q, len_k)``) and is shared by all heads. Masks
                of any other rank are passed through unchanged and must
                broadcast against ``(batch, n_head, len_q, len_k)``.

        Returns:
            Tuple ``(output, attn)``: ``output`` has shape
            ``(batch, len_q, d_model)`` and ``attn`` has shape
            ``(batch, n_head, len_q, len_k)``.
        """
        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
        sz_b, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1)

        residual = q

        # Project, then split the feature axis into (n_head, d_*).
        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)

        # Move the head axis next to batch: (batch, n_head, len, d_*).
        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)

        if mask is not None and mask.dim() == 3:
            # Insert a head axis so a per-batch mask broadcasts over heads.
            # Without it the batch axis of the mask lines up with the head
            # axis of the scores, which either fails or masks the wrong rows.
            mask = mask.unsqueeze(1)

        output, attn = self.attention(q, k, v, mask=mask)

        # Merge heads back: (batch, len_q, n_head * d_v).
        output = output.transpose(1, 2).contiguous().view(sz_b, len_q, -1)
        output = self.dropout(self.fc(output))
        output = self.layer_norm(output + residual)

        return output, attn

class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward sublayer followed by dropout, residual add and LayerNorm.

    Args:
        d_in: Size of the last dimension of the input (and of the output).
        d_hid: Size of the hidden layer between the two linear maps.
        dropout: Dropout probability applied after the second linear map.
    """

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_in, d_hid)  # expand: d_in -> d_hid
        self.w_2 = nn.Linear(d_hid, d_in)  # contract: d_hid -> d_in
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_in)

    def forward(self, x):
        """Return ``LayerNorm(x + Dropout(w_2(relu(w_1(x)))))``."""
        hidden = self.w_1(x).relu()
        projected = self.dropout(self.w_2(hidden))
        return self.layer_norm(projected + x)

class TransformerEncoderLayer(nn.Module):
    """One encoder layer: self-attention sublayer, then a feed-forward sublayer."""

    def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
        super().__init__()
        self.slf_attn = MultiHeadAttention(
            n_head=n_head, d_model=d_model, d_k=d_k, d_v=d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(d_in=d_model, d_hid=d_inner, dropout=dropout)

    def forward(self, enc_input, slf_attn_mask=None):
        """Apply self-attention (query = key = value = ``enc_input``), then the FFN.

        Returns:
            Tuple ``(enc_output, enc_slf_attn)``: the layer output and the
            attention weights returned by the self-attention sublayer.
        """
        attended, enc_slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        return self.pos_ffn(attended), enc_slf_attn

class TransformerEncoder(nn.Module):
    """Stack of encoder layers on top of token embeddings plus positional encoding.

    Args:
        n_src_vocab: Number of rows in the token embedding table.
        d_model: Embedding / hidden size.
        n_layers: Number of ``TransformerEncoderLayer`` blocks.
        n_head: Number of attention heads per layer.
        d_k: Per-head query/key size.
        d_v: Per-head value size.
        d_inner: Hidden size of each layer's feed-forward sublayer.
        dropout: Dropout probability used on the embeddings and in every layer.
        n_position: Longest sequence length the positional table covers.
    """

    def __init__(self, n_src_vocab, d_model, n_layers, n_head, d_k, d_v, d_inner,
                 dropout=0.1, n_position=512):
        super().__init__()

        self.src_word_emb = nn.Embedding(n_src_vocab, d_model)
        # Fixed (non-trainable) sinusoidal table. An all-zeros table would add
        # nothing to the embeddings, leaving the encoder blind to token order.
        self.position_enc = nn.Parameter(
            self._sinusoid_table(n_position, d_model), requires_grad=False)
        self.dropout = nn.Dropout(dropout)

        self.layer_stack = nn.ModuleList([
            TransformerEncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)])

    @staticmethod
    def _sinusoid_table(n_position, d_model):
        """Build a ``(1, n_position, d_model)`` table of sin/cos positional encodings."""
        position = torch.arange(n_position, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float32)
            * (-math.log(10000.0) / d_model))
        angles = position * div_term

        table = torch.zeros(n_position, d_model)
        table[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns.
        table[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        return table.unsqueeze(0)

    def forward(self, src_seq, src_mask):
        """Encode a batch of token-id sequences.

        Args:
            src_seq: Integer token ids, shape ``(batch, seq_len)``.
            src_mask: Mask forwarded to every layer's self-attention, or ``None``.

        Returns:
            Tuple ``(enc_output, enc_slf_attn_list)``: the final hidden states
            of shape ``(batch, seq_len, d_model)`` and a list with one
            attention tensor per layer.

        Raises:
            ValueError: If ``seq_len`` exceeds the positional table length.
        """
        seq_len = src_seq.size(1)
        max_len = self.position_enc.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum supported "
                f"length {max_len}")

        enc_output = self.src_word_emb(src_seq) + self.position_enc[:, :seq_len, :]
        enc_output = self.dropout(enc_output)

        enc_slf_attn_list = []

        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(enc_output, slf_attn_mask=src_mask)
            enc_slf_attn_list.append(enc_slf_attn)

        return enc_output, enc_slf_attn_list


In [None]:
# Sample sentences (note: they do not all have the same number of words)
sentences = [
    "This is a sample sentence",
    "Transformers are very powerful models",
    "This is another example"
]

# Reserved token used to right-pad shorter sentences to a common length
PAD_TOKEN = "<pad>"

# Build a simple vocabulary. The padding token takes index 0 and the remaining
# words are sorted so that indices are identical on every kernel restart
# (iterating a set of strings is not order-stable between Python processes).
vocab = [PAD_TOKEN] + sorted(set(" ".join(sentences).split()) - {PAD_TOKEN})
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}

# Convert sentences to token indices
def sentences_to_tensor(sentences, pad_idx=None):
    """Convert whitespace-tokenised sentences to a padded LongTensor.

    Args:
        sentences: Iterable of strings; every word must be a key of ``word2idx``.
        pad_idx: Index used to right-pad shorter sentences. Defaults to
            ``word2idx[PAD_TOKEN]``.

    Returns:
        Tensor of shape ``(num_sentences, max_sentence_length)`` and dtype
        ``torch.long``.

    Raises:
        KeyError: If a word is not in the vocabulary.
    """
    if pad_idx is None:
        pad_idx = word2idx[PAD_TOKEN]

    token_ids = [[word2idx[word] for word in sentence.split()] for sentence in sentences]
    max_len = max((len(ids) for ids in token_ids), default=0)
    # torch.tensor needs a rectangular nested list, so pad every row first.
    padded = [ids + [pad_idx] * (max_len - len(ids)) for ids in token_ids]
    # The reshape keeps the result 2-D even when there are no sentences.
    return torch.tensor(padded, dtype=torch.long).reshape(len(token_ids), max_len)

# Prepare data. Padded positions hold word2idx[PAD_TOKEN]; an attention mask
# such as `data != word2idx[PAD_TOKEN]` can be used to ignore them.
data = sentences_to_tensor(sentences)


In [None]:
# Model hyper-parameters. The per-head sizes are derived so that
# n_head * d_k == n_head * d_v == d_model.
n_src_vocab = len(vocab)
d_model, n_layers, n_head, d_inner = 512, 6, 8, 2048
d_k = d_v = d_model // n_head

model = TransformerEncoder(
    n_src_vocab=n_src_vocab,
    d_model=d_model,
    n_layers=n_layers,
    n_head=n_head,
    d_k=d_k,
    d_v=d_v,
    d_inner=d_inner,
)


In [None]:
# No attention mask is used in this simple case.
src_mask = None

# One forward pass: final hidden states plus one attention tensor per layer.
output, attn_list = model(data, src_mask=src_mask)

print("Output shape:", output.shape)
print("Attention shapes:", [layer_attn.shape for layer_attn in attn_list])
