# In [None]:
# Transformer from Scratch in PyTorch - Google Colab Version
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# Seed PyTorch's global random number generator so that the randomly
# initialised layer weights and the random dummy input created further down
# are the same on every run of this script.
torch.manual_seed(42)

# Positional Encoding
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    A table of shape ``(1, max_len, d_model)`` is precomputed once: even
    feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``, with frequencies decaying geometrically from 1 down
    towards 1/10000. The table is stored as a buffer, so it follows the
    module across ``.to(...)`` calls and is saved in the state dict, but it is
    not a trainable parameter.

    Args:
        d_model: Size of the feature dimension. Both even and odd sizes are
            supported.
        max_len: Number of positions to precompute, i.e. the longest sequence
            ``forward`` can accept.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        # One angle per (position, frequency): shape (max_len, ceil(d_model / 2)).
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the last frequency is dropped for the cosine half. For an even
        # d_model this slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading axis of size 1 lets the table broadcast over the batch.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` with the encodings of its first ``x.size(1)`` positions added.

        Args:
            x: Tensor indexed as ``(batch, seq_len, d_model)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the ``max_len`` the table was
                built with.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            # Without this check the addition below fails with an opaque
            # broadcasting error (or silently broadcasts when max_len == 1).
            raise ValueError(
                f"sequence length {seq_len} exceeds the maximum supported length {max_len}"
            )
        return x + self.pe[:, :seq_len]

# Scaled Dot-Product Attention
def attention(query, key, value, mask=None):
    """Compute scaled dot-product attention.

    Scores are ``query @ key^T / sqrt(d_k)`` where ``d_k`` is the size of the
    last dimension of ``query``; they are soft-maxed over the key axis and
    used to take a weighted sum of ``value``.

    Args:
        query: Tensor of shape ``(..., q_len, d_k)``.
        key: Tensor of shape ``(..., k_len, d_k)``.
        value: Tensor of shape ``(..., k_len, d_v)``.
        mask: Optional tensor broadcastable to the score shape
            ``(..., q_len, k_len)``. Positions where ``mask == 0`` are
            excluded from attention.

    Returns:
        Tensor of shape ``(..., q_len, d_v)``.
    """
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Fill with the most negative finite value of the score dtype rather
        # than a fixed -1e9, which is not representable in float16. A finite
        # fill (instead of -inf) keeps a fully masked row as uniform weights
        # rather than NaN.
        scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
    weights = F.softmax(scores, dim=-1)
    return torch.matmul(weights, value)

# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on the module-level ``attention`` function.

    Queries, keys and values are each linearly projected, split into
    ``num_heads`` heads of width ``d_k = d_model // num_heads``, attended
    independently, concatenated back to ``d_model`` features and passed
    through a final output projection.

    Args:
        d_model: Feature size of the inputs and of the output.
        num_heads: Number of attention heads; must be positive and divide
            ``d_model`` evenly.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Explicit check instead of `assert`, which is stripped under -O.
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive num_heads ({num_heads})"
            )
        self.d_k = d_model // num_heads
        self.num_heads = num_heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)

    def _split_heads(self, x, bs):
        """Reshape ``(bs, len, d_model)`` into ``(bs, num_heads, len, d_k)``."""
        return x.view(bs, -1, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v`` and return the projected result.

        Args:
            q: Query tensor indexed as ``(batch, q_len, d_model)``.
            k: Key tensor indexed as ``(batch, k_len, d_model)``.
            v: Value tensor indexed as ``(batch, k_len, d_model)``.
            mask: Optional mask forwarded unchanged to ``attention``. It is
                applied to per-head scores of shape
                ``(batch, num_heads, q_len, k_len)`` and must broadcast
                against that shape.

        Returns:
            Tensor of shape ``(batch, q_len, d_model)``.
        """
        bs = q.size(0)

        # linear projections, then split the feature axis into heads
        q = self._split_heads(self.q_linear(q), bs)
        k = self._split_heads(self.k_linear(k), bs)
        v = self._split_heads(self.v_linear(v), bs)

        # apply attention independently for every head
        x = attention(q, k, v, mask)

        # concatenate heads; .contiguous() is needed because transpose returns
        # a non-contiguous view that .view() cannot reshape directly
        x = x.transpose(1, 2).contiguous().view(bs, -1, self.num_heads * self.d_k)
        return self.out(x)

# Feed Forward
class FeedForward(nn.Module):
    """Two-layer MLP applied along the last dimension of its input.

    The input is expanded from ``d_model`` to ``d_ff`` features, passed
    through a ReLU, and projected back down to ``d_model`` features.

    Args:
        d_model: Size of the input and output feature dimension.
        d_ff: Size of the hidden (expanded) feature dimension.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        # Expansion layer first, projection second.
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model)

    def forward(self, x):
        """Return ``linear2(relu(linear1(x)))``."""
        hidden = self.linear1(x)
        activated = F.relu(hidden)
        return self.linear2(activated)

# Encoder Layer
class TransformerEncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each followed by
    dropout, a residual connection and layer normalisation (in that order).

    Args:
        d_model: Feature size of the input and output.
        num_heads: Number of heads for the self-attention sub-layer.
        d_ff: Hidden size of the feed-forward sub-layer.
        dropout: Drop probability applied to the output of both sub-layers.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        # Sub-layers
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        # One LayerNorm per residual branch
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # A single dropout module is shared by both branches
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is passed through to self-attention."""
        # Self-attention branch: query, key and value are all the input itself.
        attn_out = self.self_attn(x, x, x, mask)
        attended = self.norm1(x + self.dropout(attn_out))

        # Feed-forward branch operates on the normalised attention output.
        ff_out = self.feed_forward(attended)
        return self.norm2(attended + self.dropout(ff_out))

# Transformer Encoder
class TransformerEncoder(nn.Module):
    """Token-level Transformer encoder with a sequence classification head.

    Token ids are embedded, combined with positional encodings, passed
    through ``N`` encoder layers and a final layer norm. The representation
    at sequence position 0 is then fed to a linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Feature size used throughout the encoder.
        N: Number of stacked encoder layers.
        num_heads: Attention heads per encoder layer.
        d_ff: Hidden size of each layer's feed-forward network.
        max_len: Number of positions handed to ``PositionalEncoding``.
        num_classes: Number of output logits produced by the classifier.
            Defaults to 2 (binary classification).
    """

    def __init__(self, vocab_size, d_model, N, num_heads, d_ff, max_len=100, num_classes=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pe = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList(
            [TransformerEncoderLayer(d_model, num_heads, d_ff) for _ in range(N)]
        )
        self.norm = nn.LayerNorm(d_model)
        # Output width is configurable; the default keeps the binary head.
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, src, mask=None):
        """Return classification logits for a batch of token-id sequences.

        Args:
            src: Integer tensor of token ids indexed as ``(batch, seq_len)``.
            mask: Optional attention mask handed unchanged to every layer.

        Returns:
            Tensor of shape ``(batch, num_classes)``.
        """
        x = self.embed(src)
        x = self.pe(x)
        for layer in self.layers:
            x = layer(x, mask)
        x = self.norm(x)
        # Only the first position is classified ([CLS]-like first token).
        out = self.classifier(x[:, 0, :])
        return out

# Dummy example run: build a small encoder and push one random batch through
# it to confirm that the output has the expected shape.
vocab_size = 1000
model = TransformerEncoder(
    vocab_size=vocab_size,
    d_model=128,
    N=2,
    num_heads=4,
    d_ff=256,
)

# Random token ids in [0, vocab_size), laid out as (batch_size, seq_len).
dummy_input = torch.randint(low=0, high=vocab_size, size=(8, 20))
output = model(dummy_input)

# Expected: one pair of logits per sequence, i.e. [8, 2].
print("Output shape:", output.shape)
