Workflow:
$$
\text{Text} \xrightarrow{\text{Tokenize}} \text{Token IDs} \xrightarrow{\text{Linear}} \text{Embedding} \xrightarrow{\text{Multi-Head Attention}} \text{Attention} \xrightarrow{\text{Feed Forward}} \text{Output}
$$

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers import AutoTokenizer, AutoConfig

In [2]:
# Load the configuration published for the "bert-base-uncased" checkpoint;
# its size fields are reused below to dimension the embedding layer.
config = AutoConfig.from_pretrained("bert-base-uncased")

In [3]:
class InputEmbedding(nn.Module):
    """Embed token ids as the sum of a token and an absolute-position embedding.

    Args:
        vocab_size: Number of rows in the token embedding table.
        block_size: Number of rows in the position embedding table, i.e. the
            longest sequence length this module can embed.
        n_embd: Width of every embedding vector.
    """

    def __init__(self, vocab_size: int, block_size: int, n_embd: int) -> None:
        super().__init__()
        self.tok_embedder = nn.Embedding(vocab_size, n_embd)
        self.pos_embedder = nn.Embedding(block_size, n_embd)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Return token + position embeddings for a batch of token ids.

        Args:
            input_ids: Integer tensor indexed as ``(batch, seq_length)``.

        Returns:
            Tensor of shape ``(batch, seq_length, n_embd)``.

        Raises:
            IndexError: If ``seq_length`` exceeds the size of the position
                embedding table.
        """
        seq_length = input_ids.size(1)
        max_length = self.pos_embedder.num_embeddings
        if seq_length > max_length:
            # Fail early with a readable message instead of an opaque
            # out-of-range lookup (or a device-side assert on accelerators).
            raise IndexError(
                f"sequence length {seq_length} exceeds the maximum supported "
                f"length {max_length}"
            )

        # Create the positions on the same device as the ids so the lookup
        # also works when the module has been moved off the CPU.
        position_ids = torch.arange(
            seq_length, dtype=torch.long, device=input_ids.device
        ).unsqueeze(0)

        tok_emb = self.tok_embedder(input_ids)
        # Shape (1, seq_length, n_embd); broadcasts over the batch dimension.
        pos_emb = self.pos_embedder(position_ids)
        return tok_emb + pos_emb

In [4]:
# Size the embedding layer from the loaded config: vocabulary size, maximum
# number of positions, and hidden width.
input_embedding = InputEmbedding(config.vocab_size, config.max_position_embeddings, config.hidden_size)
# Sample sentence used for the walkthrough in the following cells.
text = "Time flies like an arrow."
# Tokenizer loaded from the same checkpoint name as the config.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [5]:
# Tokenize the sample sentence into PyTorch tensors. Special tokens are
# switched off, so only the ids for the sentence itself are returned.
tokens = tokenizer(text, return_tensors='pt', add_special_tokens=False)
# Display the shape of the id tensor.
tokens.input_ids.size()

torch.Size([1, 6])

In [6]:
# Display the raw token ids produced by the tokenizer.
tokens.input_ids

tensor([[ 2051, 10029,  2066,  2019,  8612,  1012]])

In [7]:
# Run the ids through the embedding layer and display the output shape;
# each token id becomes one embedding vector.
input_embedding(tokens.input_ids).size()

torch.Size([1, 6, 768])