In [1]:
import torch

# Now we are going to take a look at Multi-Head Attention
# It applies multiple attention heads in parallel, then concatenates their results

In [3]:
class BatchNorm1d:
  """Batch normalization over the trailing feature dimension.

  In training mode the statistics are computed from the current batch:
  over dim 0 for 2D inputs and over dims (0, 1) for 3D inputs. In
  evaluation mode (`self.training = False`) the running estimates are
  used instead.

  Args:
    dim: number of features; size of `gamma`, `beta` and the running buffers.
    eps: constant added to the variance before the square root.
    momentum: weight given to the current batch statistics when the
      running buffers are updated.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.momentum = momentum
    self.training = True
    # Params trained with backprop
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
    # Buffers trained with a running momentum update
    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)

  def __call__(self, x):
    """Normalize `x`, apply scale and shift, and return the result.

    The result is also stored on `self.out`.

    Raises:
      ValueError: in training mode, if `x` is neither 2D nor 3D.
    """
    # Calculate the forward pass
    if self.training:
      if x.ndim == 2:
        dim = 0
      elif x.ndim == 3:
        dim = (0, 1)
      else:
        # Previously `dim` was left unassigned here, which surfaced as an
        # UnboundLocalError on the next line.
        raise ValueError(
          f"BatchNorm1d expects a 2D or 3D input in training mode, got {x.ndim}D"
        )
      xmean = x.mean(dim, keepdim=True)
      xvar = x.var(dim, keepdim=True)
    else:
      xmean = self.running_mean
      xvar = self.running_var
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
    self.out = self.gamma * xhat + self.beta
    # Update the buffers
    if self.training:
      # The running statistics are not part of the autograd graph.
      with torch.no_grad():
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
    return self.out

  def parameters(self):
    """Return the trainable tensors `[gamma, beta]`."""
    return [self.gamma, self.beta]

# Reproducible demo: push a random (32, 100) batch through BatchNorm1d
torch.manual_seed(1337)
module = BatchNorm1d(100)
x = module(torch.randn(32, 100))
x.shape


torch.Size([32, 100])

# Looking at the zeroth column, it has zero mean - tensor(7.4506e-09) and unit standard deviation - tensor(1.0000),
# So it's normalizing every single column of this input
* But the rows are not normalized by default - the first row has mean tensor(0.0411) - so we need to implement the layer norm

In [4]:
# Mean and std of column 0 of x, taken across all rows
x[:, 0].mean(), x[:, 0].std()

(tensor(7.4506e-09), tensor(1.0000))

In [5]:
# Mean and std of row 0 of x, taken across all of its columns
x[0, :].mean(), x[0, :].std()

(tensor(0.0411), tensor(1.0431))

In [6]:
class LayerNorm:
  """Layer normalization over the last (feature) dimension.

  Each feature vector is normalized independently, so all leading
  dimensions act as batch dimensions. There is no training/eval
  distinction and there are no running buffers.

  Args:
    dim: number of features; size of `gamma` and `beta`.
    eps: constant added to the variance before the square root.
    momentum: unused; accepted only so the signature matches BatchNorm1d.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)

  def __call__(self, x):
    """Normalize `x` along its last dimension, then scale and shift.

    The result is also stored on `self.out`.
    """
    # Reduce over the last dim rather than a hard-coded dim 1: identical for
    # 2D inputs, and for 3D inputs it normalizes the features of each
    # position instead of reducing across dim 1.
    xmean = x.mean(-1, keepdim=True)
    xvar = x.var(-1, keepdim=True)
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    """Return the trainable tensors `[gamma, beta]`."""
    return [self.gamma, self.beta]

  # Backward-compatible alias for the original (misnamed) accessor.
  __parameters__ = parameters

# Reproducible demo: push a random (32, 100) batch through LayerNorm
torch.manual_seed(1337)
module = LayerNorm(100)
x = module(torch.randn(32, 100))
x.shape

torch.Size([32, 100])

# And now columns are not going to be normalized, but rows are normalized

In [7]:
# Mean and std of column 0 of x, taken across all rows
x[:, 0].mean(), x[:, 0].std()

(tensor(0.1469), tensor(0.8803))

In [8]:
# Mean and std of row 0 of x, taken across all of its columns
x[0, :].mean(), x[0, :].std()

(tensor(-9.5367e-09), tensor(1.0000))

# Here we introduce Layer-Norm, which is similar to Batch Normalization, so we can borrow the code from the Batch-Norm
* We just normalize across rows instead of columns
* Also, we don't distinguish between training and test, and we don't need the running buffers
* Because the computation does not span across the examples

# Size of Layer-Norm is n_embed, so 32
* When Layer-Norm is normalizing our features, the mean and variance are taken over 32 numbers
* So batch and time act as batch dimensions; this is like a per-token transformation that normalizes the features
* And makes them zero mean and unit variance (unit Gaussian) at initialization