In [4]:
import torch
import torch.nn as nn

Layer normalization stabilizes model training (it mitigates the exploding/vanishing gradient problem) and reduces internal covariate shift.

In [30]:
class LayerNormalization(nn.Module):
    """Layer normalization over the last dimension with a learnable affine map.

    Every slice along the last dimension of the input is shifted to zero mean
    and scaled to unit (biased) variance, then multiplied by ``scale`` and
    offset by ``shift``, both of which are learnable.

    Args:
        emd_dim: Size of the last dimension of the inputs; ``scale`` and
            ``shift`` each hold this many elements.
        eps: Constant added to the variance before taking the square root so
            that the division stays finite when the variance is zero.
            Defaults to ``1e-5``.
    """

    def __init__(self, emd_dim, eps=1e-5):
        super().__init__()
        # Initialised to the identity transform: scale = 1, shift = 0.
        self.scale = nn.Parameter(torch.ones(emd_dim))
        self.shift = nn.Parameter(torch.zeros(emd_dim))
        self.eps = eps

    def forward(self, x):
        """Normalize ``x`` along its last dimension and apply scale/shift.

        Args:
            x: Tensor whose last dimension broadcasts against ``scale``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by N (not N - 1) when computing the variance.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [31]:
# Seed first so both the random batch and the Linear initialisation repeat.
torch.manual_seed(123)
batch_example = torch.randn(2, 5)

# Linear(5 -> 6) followed by ReLU, applied to the random batch.
layer = nn.Sequential(
    nn.Linear(5, 6),
    nn.ReLU(),
)
output = layer(batch_example)
print(output)

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)


In [38]:
# Normalise the raw batch over its last dimension (5 features per row).
ln = LayerNormalization(5)
# Call the module itself instead of .forward() so nn.Module hooks are run.
normalized = ln(batch_example)
# Mean over all elements of the normalised batch; expected to be close to 0.
normalized.mean()

tensor(-4.7684e-08, grad_fn=<MeanBackward0>)

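GELU is smooth and differentiable at 0 (unlike ReLU), and it mitigates the dead-neuron problem. It is defined as $\text{GELU}(x) = x \cdot \Phi(x)$, where $\Phi(x)$ is the CDF of the standard normal distribution.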

Gaussian Error Linear Unit

In [39]:
class GELU(nn.Module):
    """Gaussian Error Linear Unit, computed with the tanh approximation.

    forward(x) = 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
    """

    # sqrt(2 / pi), computed once as a Python float instead of building a new
    # tensor on every forward pass.
    _SQRT_2_OVER_PI = (2.0 / torch.pi) ** 0.5

    def forward(self, x):
        """Apply the activation element-wise.

        Args:
            x: Input tensor.

        Returns:
            Tensor with the same shape as ``x``.
        """
        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))



In [42]:
# Configuration consumed by FeedForward: 'Emb_dim' is the input/output width.
CFG = {'Emb_dim': 768}

In [47]:
class FeedForward(nn.Module):
    """Feed-forward block: Linear expansion -> GELU -> Linear contraction.

    The hidden width is four times ``CFG['Emb_dim']``; the output width is
    ``CFG['Emb_dim']`` again, so the last dimension of the input is preserved.

    Args:
        CFG: Mapping that provides the integer entry ``'Emb_dim'``.
    """

    def __init__(self, CFG):
        super().__init__()
        emb_dim = CFG['Emb_dim']
        hidden_dim = 4 * emb_dim
        expand = nn.Linear(emb_dim, hidden_dim)    # expansion
        activation = GELU()                        # gelu
        contract = nn.Linear(hidden_dim, emb_dim)  # contraction
        self.layers = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Pass ``x`` through the expansion, GELU and contraction layers."""
        out = self.layers(x)
        return out

In [49]:
# Shape check: the feed-forward block should preserve the input shape.
ffn = FeedForward(CFG)
# Take the last dimension from CFG so this cell stays in sync with the config.
x = torch.rand((2, 3, CFG['Emb_dim']))
# Call the module itself instead of .forward() so nn.Module hooks are run.
op = ffn(x)
print(op.shape)

torch.Size([2, 3, 768])
