In [1]:
# Example sentence used throughout the walkthrough
text = "Transformers are powerful models"
# Word-level tokens: lowercase the sentence, then split on whitespace
tokens = str.lower(text).split()
print("Text   :", text)
print("Tokens :", tokens)

Text   : Transformers are powerful models
Tokens : ['transformers', 'are', 'powerful', 'models']


In [2]:
# Vocabulary: the distinct tokens, sorted so that ID assignment does not
# depend on set iteration order.
vocab = sorted({*tokens})
# Forward map (token -> integer ID) and its inverse (ID -> token);
# an ID is simply the token's position in the sorted vocabulary.
token_to_id = dict(zip(vocab, range(len(vocab))))
id_to_token = dict(enumerate(vocab))

print("Vocabulary :", vocab)
print("Token -> ID:", token_to_id)

Vocabulary : ['are', 'models', 'powerful', 'transformers']
Token -> ID: {'are': 0, 'models': 1, 'powerful': 2, 'transformers': 3}


In [3]:
# Map every token of the sentence, in order, to its vocabulary ID
token_ids = list(map(token_to_id.__getitem__, tokens))
print("Token IDs:", token_ids)

Token IDs: [3, 0, 2, 1]


In [5]:
# Create random embeddings for each vocab token
import torch

# Fix the RNG seed so the "random" embeddings are identical on every
# Restart & Run All. Output saved from an earlier unseeded run will not
# match these values until the notebook is re-executed.
torch.manual_seed(0)

embedding_dim = 4  # length of each token's embedding vector
# One row per vocabulary entry, filled with standard-normal samples;
# row i is the embedding for token ID i.
embedding_matrix = torch.randn(len(vocab), embedding_dim)

print("Embedding matrix shape:", embedding_matrix.shape)
print(embedding_matrix)

Embedding matrix shape: torch.Size([4, 4])
tensor([[-0.8427,  2.0724, -0.5994,  0.3365],
        [ 0.8138, -0.0234, -0.5669, -0.2270],
        [ 0.6936, -0.6392, -0.2089, -1.0178],
        [ 1.5710, -0.9381, -1.1020, -1.8505]])


In [None]:
# Gather one embedding row per token ID, in sentence order: output row k
# is the embedding_matrix row whose index is token_ids[k].
sentence_embeddings = embedding_matrix[torch.tensor(token_ids, dtype=torch.long)]

print("Sentence embedding shape:", sentence_embeddings.shape)
print(sentence_embeddings)

Sentence embedding shape: torch.Size([4, 4])
tensor([[ 1.5710, -0.9381, -1.1020, -1.8505],
        [-0.8427,  2.0724, -0.5994,  0.3365],
        [ 0.6936, -0.6392, -0.2089, -1.0178],
        [ 0.8138, -0.0234, -0.5669, -0.2270]])
