In [1]:
import torch
import torch.nn as nn

In [2]:
# Hyperparameters shared by the model components defined in this notebook.
GPT_CONFIG_124M = {
    # Vocabulary size
    "vocab_size": 50_257,
    # Context length
    "context_length": 1_024,
    # Embedding dimension
    "emb_dim": 768,
    # Number of attention heads
    "n_heads": 12,
    # Number of layers
    "n_layers": 12,
    # Dropout rate
    "drop_rate": 0.1,
    # Query-Key-Value bias
    "qkv_bias": False,
}

In [3]:
class GELU(nn.Module):
    """Tanh approximation of the GELU activation.

    Element-wise, this evaluates::

        0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))

    The ``sqrt(2 / pi)`` factor is stored once as a Python float instead of
    being rebuilt as a float32 tensor on every forward pass. That avoids a
    per-call tensor allocation and keeps the constant at full precision when
    the input is float64 (a float32 constant limited accuracy to ~1e-8).
    """

    # sqrt(2 / pi), computed once. ``torch.pi`` is a plain Python float, so
    # this is ordinary float arithmetic and creates no tensor.
    _SQRT_2_OVER_PI = (2.0 / torch.pi) ** 0.5

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation element-wise.

        Args:
            x: Input tensor of any shape.

        Returns:
            A tensor with the same shape as ``x``.
        """
        # Multiplying by a Python scalar leaves a floating-point input's
        # dtype and device unchanged.
        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

In [4]:
class FeedForward(nn.Module):
    """Two-layer feed-forward block: Linear -> GELU -> Linear.

    The first linear layer widens the features from ``emb_dim`` to
    ``4 * emb_dim``; the second maps them back to ``emb_dim``, so the output
    has the same feature width as the input.

    Args:
        cfg: Mapping that provides the embedding width under ``"emb_dim"``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden_width = 4 * width
        # Built in the original order so parameter initialisation and the
        # ``layers.<index>`` state-dict keys stay the same.
        stages = [
            nn.Linear(width, hidden_width),
            GELU(),
            nn.Linear(hidden_width, width),
        ]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the expand -> activate -> project pipeline."""
        projected = self.layers(x)
        return projected

In [5]:
# Seed PyTorch's global RNG before the first random draw so that a
# Restart & Run All reproduces the same layer weights here and the same
# random demo input in the following cell.
torch.manual_seed(123)
ff = FeedForward(GPT_CONFIG_124M)

In [6]:
# Demo batch: 3 random vectors whose width is read from the config, so the
# example keeps matching the model if "emb_dim" is changed.
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because the following cells refer to it.
input = torch.randn(3, GPT_CONFIG_124M["emb_dim"])

In [7]:
# Sanity check: 3 rows, each as wide as the embedding dimension.
input.shape

torch.Size([3, 768])

In [8]:
# Peek at the random values; PyTorch abbreviates the wide rows with "...".
input

tensor([[ 8.2706e-02,  7.4583e-01,  1.4808e-02,  ...,  7.1245e-01,
         -3.0544e+00, -7.3611e-02],
        [-5.2431e-01, -1.0271e+00,  9.5360e-01,  ...,  9.1839e-01,
          2.8037e-01, -6.1168e-01],
        [ 2.2656e-03, -3.5256e-01,  3.7641e-01,  ...,  1.9870e-01,
         -5.3920e-02, -1.2445e+00]])

In [9]:
# Forward pass: features are widened to 4 * emb_dim, passed through GELU,
# then projected back to emb_dim.
output = ff(input)

In [10]:
# Inspect the result; the grad_fn in the repr shows the forward pass was
# recorded by autograd.
output

tensor([[-0.2693,  0.0182, -0.1524,  ...,  0.1779,  0.0662,  0.0666],
        [-0.1008,  0.0821,  0.3188,  ..., -0.0782, -0.0099,  0.1486],
        [-0.0554,  0.1528,  0.3607,  ...,  0.2012, -0.1556,  0.1543]],
       grad_fn=<AddmmBackward0>)

In [11]:
# Apply the block twice. Starting from `input` rather than from the previous
# `output` keeps this cell idempotent: re-running it always yields
# ff(ff(input)) instead of stacking one more application per run.
output = ff(ff(input))

In [12]:
# The feature width is unchanged, which is why the block's output can be fed
# back into the block.
output.shape

torch.Size([3, 768])