In [9]:
GPT_CONFIG_124M={
    "vocab_size":50257,
    "context_length": 1024,
    "emb_dim":768,
    "n_head":12,
    "n_layers":12,
    "drop_rate":0.5,
    "qkv_bias":False
}

In [10]:
import torch.nn as nn
import torch

In [46]:
class Layer_Norm(nn.Module):
  def __init__(self,emb_dim):
    super().__init__()
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.ones(emb_dim))
    self.eps = 1e-5

  def forward(self,x):
    mean = x.mean(dim=-1,keepdim=True)
    var = x.var(dim=-1,keepdim=True, unbiased=True)
    return self.scale*(x-mean)/torch.sqrt(var+self.eps) + self.shift


class GELU(nn.Module):
  def __init__(self):
    super().__init__()

  def forward(self,x):
    return 0.5*x*(1+torch.tanh(torch.tensor((2/torch.pi))**0.5)*(x+0.044715*x**3))


class FeedForward(nn.Module):
  def __init__(self,cfg):
        super().__init__()
        self.layers =  nn.Sequential(
        nn.Linear(cfg['emb_dim'],4*cfg['emb_dim']),    #expansion
        GELU(),   #non linear activation
        nn.Linear(4*cfg['emb_dim'],cfg['emb_dim'])  #compression
    )
  def forward(self,x):
    return self.layers(x)

In [47]:
class MultiheadAttention(nn.Module):
  def __init__(self,din,dout,context_length,dropout,num_heads,qkv_bias=False):
    super().__init__()
    self.w_queries = nn.Linear(din,dout,qkv_bias)
    self.w_keys = nn.Linear(din,dout,qkv_bias)
    self.w_values = nn.Linear(din,dout,qkv_bias)
    self.dropout=nn.Dropout(dropout)
    self.context_length=context_length
    self.register_buffer('mask',torch.triu(torch.ones(context_length,context_length)))
    self.num_heads=num_heads

  def forward(self,x):
    b,contextlength, emd_size = x.shape
    num_heads=self.num_heads
    head_dim = emd_size//num_heads

    queries = self.w_queries(x)
    keys = self.w_keys(x)
    values = self.w_values(x)

    queries = queries.view(b,contextlength,num_heads,head_dim)
    keys = queries.view(b,contextlength,num_heads,head_dim)
    values = queries.view(b,contextlength,num_heads,head_dim)

    queries = queries.transpose(1,2)
    keys = keys.transpose(1,2)
    values = values.transpose(1,2)

    attention_scores = queries @ keys.transpose(2,3)
    attention_scores.masked_fill(self.mask.bool()[:contextlength, :contextlength],-torch.inf)
    attention_weights = torch.softmax(attention_scores/num_heads**0.5,dim=-1)
    attention_weights = self.dropout(attention_weights)
    context_vectors = (attention_weights @ values).transpose(1,2)
    context_vectors = context_vectors.contiguous().view(b,contextlength,emd_size)
    return context_vectors


In [50]:
class Transformer(nn.Module):

  def __init__(self,cfg):
      super().__init__()
      self.norm1 = Layer_Norm(cfg['emb_dim'])
      self.norm2 = Layer_Norm(cfg['emb_dim'])
      self.att = MultiheadAttention(din=cfg['emb_dim'],
                                    dout=cfg['emb_dim'],
                                    context_length=cfg['context_length'],
                                    dropout=cfg['drop_rate'],
                                    num_heads=cfg['n_head'])
      self.ff = FeedForward(cfg)
      self.drop_shortcut = nn.Dropout(cfg['drop_rate'])

  def forward(self,x):

      shortcut = x
      x = self.norm1(x)
      x = self.att(x)
      x = self.drop_shortcut(x)
      x = x+shortcut
      shortcut = x
      x = self.norm2(x)
      x = self.ff(x)
      x = self.drop_shortcut(x)
      x = x+shortcut

      return x


In [52]:
torch.manual_seed(123)
x = torch.rand(1,2,768)
block = Transformer(GPT_CONFIG_124M)
op=block.forward(x)
print(op.shape)
print(op)

torch.Size([1, 2, 768])
tensor([[[-0.4187,  0.5166,  0.2517,  ...,  0.9541,  0.8567,  0.6279],
         [-0.3046,  0.4029,  0.3019,  ..., -0.1490,  0.6203,  0.7598]]],
       grad_fn=<AddBackward0>)
