In [None]:
import torch
import math
from torch import nn
import torch.nn.functional as F

In [None]:
def scaled_dot_product_attention(q,k,v,mask=None):
  """Compute scaled dot-product attention.

  Args:
    q: Query tensor of shape (..., seq_q, d_k).
    k: Key tensor of shape (..., seq_k, d_k).
    v: Value tensor of shape (..., seq_k, d_v).
    mask: Optional additive mask broadcastable to (..., seq_q, seq_k).
      Use 0 for positions that may be attended to and -inf (or a very
      large negative number) for positions that must be ignored.

  Returns:
    Tensor of shape (..., seq_q, d_v): for every query position, the
    attention-weighted sum of the value vectors.
  """
  d_k = q.size(-1)
  # Dividing by sqrt(d_k) keeps the magnitude of the scores independent of
  # the key dimension.
  scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
  if mask is not None:
    # The mask has to be applied to the raw scores *before* softmax so that
    # masked positions end up with zero attention weight. Out-of-place add
    # so that the mask is free to broadcast.
    scores = scores + mask
  # Normalise over the key axis explicitly; calling softmax without `dim`
  # is deprecated and picks the axis implicitly.
  weights = F.softmax(scores, dim=-1)
  return torch.matmul(weights, v)

In [None]:
class MultiHeadAttention(nn.Module):
  """Multi-head self-attention.

  The input is projected to queries, keys and values with a single linear
  layer, split into `num_heads` heads of size `d_model // num_heads`,
  passed through `scaled_dot_product_attention`, and the concatenated head
  outputs are projected back to `d_model` features.
  """

  def __init__(self,d_model,num_heads):
    """
    Args:
      d_model: Size of the feature (last) dimension of the input.
      num_heads: Number of attention heads; must divide `d_model`.

    Raises:
      ValueError: If `d_model` is not divisible by `num_heads`.
    """
    # nn.Module must be initialised before any submodule is assigned.
    super().__init__()
    if d_model % num_heads != 0:
      raise ValueError(
        f"d_model ({d_model}) must be divisible by num_heads ({num_heads})")
    self.d_model = d_model
    self.num_heads = num_heads
    self.qkv_layer = nn.Linear(d_model,3*d_model)
    self.head_dim = d_model // num_heads
    self.linear_layer = nn.Linear(d_model,d_model)

  def forward(self,x,mask=None):
    """Apply self-attention to `x`.

    Args:
      x: Tensor of shape (..., sequence_length, d_model). Any number of
        leading batch dimensions (including none) is accepted.
      mask: Optional mask forwarded unchanged to
        `scaled_dot_product_attention`.

    Returns:
      Tensor with the same shape as `x`.
    """
    batch_shape = x.shape[:-2]
    sequence_length = x.size(-2)
    qkv = self.qkv_layer(x)  # (..., seq, 3*d_model)
    qkv = qkv.reshape(*batch_shape,sequence_length,self.num_heads,3*self.head_dim)
    # Put the head axis in front of the sequence axis so that the attention
    # matmuls run over sequence positions independently for every head.
    qkv = qkv.transpose(-2,-3)  # (..., heads, seq, 3*head_dim)
    # Split the per-head features (last axis) into q, k and v.
    q,k,v = qkv.chunk(3,dim=-1)
    values = scaled_dot_product_attention(q,k,v,mask)  # (..., heads, seq, head_dim)
    # Undo the head/sequence swap and concatenate the heads.
    values = values.transpose(-2,-3)
    values = values.reshape(*batch_shape,sequence_length,self.num_heads*self.head_dim)
    return self.linear_layer(values)

In [None]:
class LayerNormalization(nn.Module):
  """Layer normalisation with a learnable scale (`gamma`) and shift (`beta`).

  Every sample is normalised to zero mean and unit variance over its
  trailing dimensions (as many as `parameter_shape` has), then scaled by
  `gamma` and shifted by `beta`.
  """

  def __init__(self,parameter_shape,epsilon=1e-5):
    """
    Args:
      parameter_shape: Shape of the trailing dimensions to normalise over,
        e.g. `[d_model]`.
      epsilon: Small constant added to the variance for numerical stability.
    """
    # nn.Module must be initialised before any parameter is assigned.
    super().__init__()
    self.parameter_shape = parameter_shape
    self.epsilon = epsilon
    # Scale starts at 1 and shift at 0, so the layer initially returns the
    # plain normalised input.
    self.gamma = nn.Parameter(torch.ones(parameter_shape))
    self.beta = nn.Parameter(torch.zeros(parameter_shape))

  def forward(self,inputs):
    """Normalise `inputs` over the trailing dimensions.

    Args:
      inputs: Tensor whose trailing dimensions match `parameter_shape`.

    Returns:
      Tensor of the same shape as `inputs`.
    """
    # Reduce only over the normalised (trailing) axes; keepdim lets the
    # statistics broadcast back against `inputs`.
    dims = tuple(range(-self.gamma.dim(),0))
    mean = inputs.mean(dim=dims,keepdim=True)
    var = ((inputs-mean)**2).mean(dim=dims,keepdim=True)
    std = (var + self.epsilon).sqrt()
    y = (inputs-mean)/std
    out = y*self.gamma + self.beta
    return out

In [None]:
class FeedForward(nn.Module):
  """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

  def __init__(self,d_model,hidden,drop_prob):
    """
    Args:
      d_model: Size of the input and output feature dimension.
      hidden: Size of the intermediate (hidden) feature dimension.
      drop_prob: Dropout probability applied after the ReLU activation.
    """
    # nn.Module must be initialised before any submodule is assigned.
    super().__init__()
    self.linear1 = nn.Linear(d_model,hidden)
    self.linear2 = nn.Linear(hidden,d_model)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(p=drop_prob)

  def forward(self,x):
    """Apply the feed-forward network to the last dimension of `x`.

    Args:
      x: Tensor whose last dimension has size `d_model`.

    Returns:
      Tensor of the same shape as `x`.
    """
    hidden = self.dropout(self.relu(self.linear1(x)))
    return self.linear2(hidden)

In [None]:
class EncoderLayer(nn.Module):
  """One Transformer encoder layer.

  Consists of two sub-layers, self-attention and a feed-forward network.
  Each is wrapped in a residual connection followed by its own layer
  normalisation.
  """

  def __init__(self,d_model,ffn_hidden,num_heads,drop_prob):
    """
    Args:
      d_model: Size of the feature dimension.
      ffn_hidden: Hidden size of the feed-forward sub-layer.
      num_heads: Number of attention heads.
      drop_prob: Dropout probability used inside the feed-forward sub-layer.
    """
    # nn.Module must be initialised before any submodule is assigned.
    super().__init__()
    self.attention = MultiHeadAttention(d_model=d_model,num_heads=num_heads)
    self.norm1 = LayerNormalization(parameter_shape=[d_model])
    self.norm2 = LayerNormalization(parameter_shape=[d_model])
    self.ffn = FeedForward(d_model,ffn_hidden,drop_prob)

  def forward(self,x):
    """Run the layer on `x` and return a tensor of the same shape."""
    # Sub-layer 1: self-attention + residual, normalised by norm1.
    residual_x = x
    x = self.attention(x)
    x = self.norm1(x+residual_x)
    # Sub-layer 2: feed-forward + residual. This sub-layer has its own
    # normalisation (norm2); reusing norm1 here would share one scale/shift
    # between both sub-layers and leave norm2 untrained.
    res_x = x
    x = self.ffn(x)
    x = self.norm2(x+res_x)
    return x

In [None]:
class Encoder(nn.Module):
  """Stack of `num_encoders` identical `EncoderLayer` modules applied in sequence."""

  def __init__(self,d_model,ffn_hidden,num_heads,drop_prob,num_encoders):
    """
    Args:
      d_model: Size of the feature dimension.
      ffn_hidden: Hidden size of each layer's feed-forward sub-layer.
      num_heads: Number of attention heads in each layer.
      drop_prob: Dropout probability passed to each layer.
      num_encoders: Number of encoder layers in the stack.
    """
    # nn.Module must be initialised before any submodule is assigned.
    super().__init__()
    # Stored on self so that the layers are registered as submodules and
    # are reachable from forward().
    self.layers = nn.Sequential(*[EncoderLayer(d_model,ffn_hidden,num_heads,drop_prob) for _ in range(num_encoders)])

  def forward(self,x):
    """Pass `x` through every encoder layer in order and return the result."""
    x = self.layers(x)
    return x