In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
import matplotlib.pyplot as plt
import numpy as np

# Load the pretrained tokenizer and model once, at module level.
# Both are created from the same checkpoint name so the two stay in sync;
# get_token_embeddings() below reads `tokenizer` and `model` as globals.
model_name = "bert-base-uncased"  # You can try other models like "roberta-base", "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_token_embeddings(sentence: str) -> dict:
    """
    Get the contextual embedding of each token in a sentence.

    The sentence is tokenized with the module-level ``tokenizer`` and run
    through the module-level ``model``; each token's embedding is its row of
    the model's last hidden state.

    Args:
        sentence (str): Input sentence

    Returns:
        dict: Dictionary mapping token strings to their embeddings (1-D numpy
        arrays), in sentence order, including any special tokens the
        tokenizer adds. The first occurrence of a token is stored under the
        token string itself; a token that appears again is stored under
        ``"<token>_<k>"`` (k = 1, 2, ...) so that repeated tokens no longer
        overwrite one another.
    """
    # Tokenize the sentence and recover the token strings for the single
    # sequence in the batch
    inputs = tokenizer(sentence, return_tensors="pt", padding=True)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)

    # Convert the whole (tokens x hidden) matrix to numpy once; iterating it
    # below yields one row per token. `.cpu()` leaves a CPU tensor unchanged
    # and makes `.numpy()` valid if the output lives on another device.
    hidden_states = outputs.last_hidden_state[0].cpu().numpy()

    token_embeddings = {}
    for token, embedding in zip(tokens, hidden_states):
        # The same token string gets a different embedding at each position,
        # so a repeated token needs its own key instead of replacing the
        # earlier entry.
        key = token
        suffix = 0
        while key in token_embeddings:
            suffix += 1
            key = f"{token}_{suffix}"
        token_embeddings[key] = embedding

    return token_embeddings

# Demo: embed a single sentence and inspect the result
sentence = "Natural language processing is fascinating and powerful."
token_embeddings = get_token_embeddings(sentence)

# List every token together with the shape of its embedding vector
print("Tokens in the sentence:")
for token, embedding in token_embeddings.items():
    print(f"Token: '{token}', Embedding shape: {embedding.shape}")

# Show the leading values of the embedding stored under the third key
example_token = list(token_embeddings)[2]
print(f"\nFirst 10 values of embedding for '{example_token}':")
print(token_embeddings[example_token][:10])


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Tokens in the sentence:
Token: '[CLS]', Embedding shape: (768,)
Token: 'natural', Embedding shape: (768,)
Token: 'language', Embedding shape: (768,)
Token: 'processing', Embedding shape: (768,)
Token: 'is', Embedding shape: (768,)
Token: 'fascinating', Embedding shape: (768,)
Token: 'and', Embedding shape: (768,)
Token: 'powerful', Embedding shape: (768,)
Token: '.', Embedding shape: (768,)
Token: '[SEP]', Embedding shape: (768,)

First 10 values of embedding for 'language':
[-0.46853647  0.32274535  0.01619487 -0.8015434   0.2168786   0.31478682
  0.06109725  1.2818849  -0.7566536  -0.13486609]
