# Hands-On with Tokenization and Sentence Embeddings

This notebook covers two fundamental concepts in Large Language Models (LLMs):

1. Tokenization: The process of splitting text into smaller units called tokens, which can be words or subwords.

2. Embeddings: Dense vector representations of words or sentences that capture their semantic meaning.

## Tokenization

Interactive playground for exploring how a model splits text into tokens: https://tiktokenizer.vercel.app/?model=gpt-4-1106-preview

In [None]:
# importing libraries
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Reproducibility: one seed value, shared by every random number generator used below
random_seed = 42

# Python's built-in `random` module
random.seed(random_seed)

# PyTorch's default generator, plus every CUDA device when a GPU is present
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

In [None]:
# Authenticate with the Hugging Face Hub from inside the notebook.
# `notebook_login()` renders an interactive widget in the cell output.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load the BERT tokenizer for the 'bert-base-cased' checkpoint
# (its vocabulary and config files are downloaded from the Hub if not cached)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Example input: a French sentence
text = "Il était une fois dans un village lointain, niché au cœur des montagnes"

# Split the raw string into tokens; pieces prefixed with "##" continue the previous token
tokenized_text = tokenizer.tokenize(text)
# Show the resulting list of token strings
print(f"tokenized Text: {tokenized_text}")



tokenized Text: ['Il', 'é', '##tai', '##t', 'une', 'f', '##ois', 'dans', 'un', 'village', 'lo', '##int', '##ain', ',', 'ni', '##ch', '##é', 'au', 'c', '##œ', '##ur', 'des', 'mon', '##tag', '##nes']


In [None]:
# Tokenize and encode the text by calling the tokenizer directly.
# Calling the tokenizer is the recommended entry point; for a list of strings it
# performs the same batch encoding as the legacy `batch_encode_plus` method.
# The result behaves like a dictionary containing the token IDs and attention mask.
encoding = tokenizer(
    [text],                   # List of input texts (here a batch of one)
    padding=True,             # Pad to the longest sequence in the batch
    truncation=True,          # Truncate to the model's maximum length if necessary
    return_tensors='pt',      # Return PyTorch tensors
    add_special_tokens=True,  # Add the special tokens [CLS] and [SEP]
)

input_ids = encoding['input_ids']  # Token IDs, one row per input text
# Print the input IDs
print(f"Input ID: {input_ids}")


Input ID: tensor([[  101,  9190,   255, 13564,  1204, 25731,   175,  8586, 22463,  8362,
          1491, 25338, 10879,  8104,   117, 11437,  1732,  2744, 12686,   172,
         28241,  2149,  3532, 19863, 21365,  3965,   102]])


In [None]:
# The attention mask has the same shape as the input IDs: 1 marks a real token, 0 marks padding.
# Only one sentence was encoded, so nothing had to be padded.
attention_mask = encoding['attention_mask'] # Attention mask
# Print the attention mask
print(f"Attention mask: {attention_mask}")

Attention mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])


In [None]:
# Decode the token IDs of the first (and only) sequence back to text;
# skip_special_tokens=True leaves the special tokens out of the result
decoded_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
# Print the decoded text
print(f"Decoded Text: {decoded_text}")

Decoded Text: Il était une fois dans un village lointain, niché au cœur des montagnes


In [None]:
# Load the pretrained BERT model for the same 'bert-base-cased' checkpoint as the tokenizer
model = BertModel.from_pretrained('bert-base-cased')

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
# Forward pass through BERT; gradient tracking is switched off because we only read the outputs
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# The last hidden state holds one contextual vector per input position
word_embeddings = outputs.last_hidden_state

# Report the shape of the embedding tensor
print(f"Shape of Word Embeddings: {word_embeddings.shape}")


Shape of Word Embeddings: torch.Size([1, 27, 768])


In [None]:
# Print the embeddings of the first two input positions.
#
# `word_embeddings[0]` has one row per input ID, and the input IDs begin with the
# special [CLS] token, which `tokenized_text` does not contain. Zipping the rows
# with `tokenized_text` would therefore label every vector with the wrong token
# (shifted by one position). Recover the token strings from the IDs themselves so
# that each token is paired with its own embedding.
tokens_with_special = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

for token, embedding in zip(tokens_with_special[:2], word_embeddings[0][:2]):
    print(f"Token: {token}")
    print(f"Embedding: {embedding}")
    print("\n")

Token: il
Embedding: tensor([-4.3385e-01, -1.1419e-01, -3.7230e-01, -2.2298e-01,  3.1748e-01,
        -1.9876e-01,  2.8873e-01,  3.7331e-01, -4.1483e-02, -2.8517e-01,
        -2.6976e-01, -2.9488e-01,  8.4574e-03,  2.1081e-01,  1.5625e-01,
         7.3450e-01, -5.2006e-01, -1.2086e-01,  4.3528e-01, -1.3087e-01,
        -2.9477e-01, -6.8831e-01, -2.9657e-01, -6.3729e-02,  2.1001e-01,
         1.5923e-01, -2.2835e-01,  2.0979e-01,  1.2235e-01,  6.7449e-01,
        -1.7216e-01,  4.7418e-01, -3.5390e-01, -2.0796e-01, -4.8897e-02,
         2.3675e-01,  5.4990e-01, -3.7203e-01, -2.3419e-02,  1.3248e-01,
        -1.6161e-01,  3.3405e-01,  7.1490e-02, -3.7327e-01, -2.1441e-01,
        -3.1276e-01, -2.9607e+00,  4.5754e-01, -4.4658e-01, -8.3060e-02,
         1.0084e-01, -5.5038e-01,  2.1839e-02,  1.6186e-01,  4.2944e-02,
         2.1028e-01, -7.0925e-01,  5.1507e-01,  5.8491e-01, -6.7190e-03,
         1.5842e-02,  8.6203e-02,  6.5102e-01,  6.2059e-01, -5.9952e-01,
         2.1237e-01,  1.8645e-

In [None]:
# Collapse the per-token vectors into a single sentence vector by averaging
# over the sequence-length dimension (dim=1)
sentence_embedding = torch.mean(word_embeddings, dim=1)

# Show the pooled vector on its own line, followed by its shape
print("Sentence Embedding:", sentence_embedding, sep="\n")
print(f"Shape of Sentence Embedding: {sentence_embedding.shape}")

Sentence Embedding:
tensor([[-7.3910e-01,  2.0420e-01, -2.6139e-01, -1.4901e-01,  1.4463e-01,
          6.2943e-02, -2.8401e-02,  6.8353e-01,  1.7715e-01, -8.4263e-01,
         -3.5896e-01, -1.8445e-01, -1.4119e-01,  7.7652e-01, -3.6239e-01,
          1.0560e+00,  4.0020e-01, -2.4790e-01, -1.3497e-02, -2.3319e-01,
         -3.0819e-01, -1.4808e-03, -7.1733e-01,  1.6808e-01,  3.4885e-01,
          7.8839e-02, -7.5386e-02,  2.9300e-01,  1.1218e-01,  2.3872e-01,
          2.6815e-01,  5.7935e-01, -3.2794e-01, -8.7232e-01,  1.3450e-02,
          1.3123e-01,  4.0642e-02, -1.6628e-01, -4.5108e-02,  9.9873e-02,
         -5.6600e-01, -6.6684e-01, -1.2269e-01, -1.7988e-01, -4.3719e-01,
         -7.7284e-02,  4.6787e-01, -4.6250e-02,  4.1949e-01,  1.3054e-01,
         -3.5245e-01,  2.1646e-01, -1.8957e-01, -6.9770e-01,  2.4868e-01,
          9.9290e-01, -4.2214e-01, -7.8526e-01,  1.1436e-01, -1.7558e-01,
         -4.5981e-01,  3.5836e-01,  8.0702e-01,  7.6851e-02, -1.6821e-01,
          6.3287e-

In [None]:
# Example sentences for similarity comparison: each one is scored against the first
sentences = ["That is a happy person", "Today is a sunny day", "That is a very happy person", "That is a happy dog"]

# Tokenize and encode all sentences as one batch; shorter sentences are padded
# so that every row has the same length
example_encoding = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors='pt',
    add_special_tokens=True
)
example_input_ids = example_encoding['input_ids']
example_attention_mask = example_encoding['attention_mask']

# Generate token embeddings for the whole batch
with torch.no_grad():
    example_outputs = model(example_input_ids, attention_mask=example_attention_mask)

# Mask-aware mean pooling: a plain `.mean(dim=1)` would also average the vectors
# at padded positions, distorting the embedding of every sentence that is shorter
# than the longest one. Weight each position by its attention-mask value instead
# and divide by the number of real tokens.
token_embeddings = example_outputs.last_hidden_state
pooling_mask = example_attention_mask.unsqueeze(-1).to(token_embeddings.dtype)
example_sentence_embedding = (token_embeddings * pooling_mask).sum(dim=1) / pooling_mask.sum(dim=1).clamp(min=1e-9)

# Compute the cosine similarity between the first sentence and each of the others.
# scikit-learn expects array-like input, so convert the tensor to NumPy once.
example_embeddings_np = example_sentence_embedding.detach().cpu().numpy()
for i in range(1, len(sentences)):
    similarity = cosine_similarity(example_embeddings_np[0:1], example_embeddings_np[i:i + 1])[0][0]
    print(f"Cosine similarity between : '{sentences[0]}' and '{sentences[i]}': {similarity:.3f}")

Cosine similarity between : 'That is a happy person' and 'Today is a sunny day': 0.824
Cosine similarity between : 'That is a happy person' and 'That is a very happy person': 0.970
Cosine similarity between : 'That is a happy person' and 'That is a happy dog': 0.949


## Embedding with all-MiniLM-L6-v2

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, ignoring masked-out positions.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings (expected layout: batch x tokens x hidden).
        attention_mask: Mask with one entry per token (batch x tokens);
            positions with value 0 do not contribute to the average.

    Returns:
        Tensor with one pooled vector per sequence (batch x hidden).
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the hidden dimension so it can weight every feature
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = torch.sum(hidden_states * weights, 1)
    # Lower bound on the divisor avoids dividing by zero for fully masked rows
    token_counts = torch.clamp(weights.sum(1), min=1e-9)
    return masked_total / token_counts


# Sentences to embed (the same four used in the BERT comparison)
sentences = [
    "That is a happy person",
    "Today is a sunny day",
    "That is a very happy person",
    "That is a happy dog",
]

# Fetch the tokenizer and the model from the Hugging Face Hub.
# NOTE: this rebinds `tokenizer` and `model`, which referred to BERT up to this cell.
minilm_checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(minilm_checkpoint)
model = AutoModel.from_pretrained(minilm_checkpoint)

# Encode all sentences as one padded batch of PyTorch tensors
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Forward pass without gradient tracking to obtain the token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Mask-aware mean pooling, then L2-normalize each sentence vector
sentence_embeddings = F.normalize(
    mean_pooling(model_output, encoded_input['attention_mask']),
    p=2,
    dim=1,
)

print("Sentence embeddings:", sentence_embeddings.shape, sep="\n")

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Sentence embeddings:
torch.Size([4, 384])


In [None]:
# Score every sentence against the first one.
# scikit-learn expects array-like input, so convert the tensor to NumPy once
# instead of handing it lists of torch tensors; the loop bound follows the
# length of `sentences` rather than a hard-coded count.
sentence_embeddings_np = sentence_embeddings.detach().cpu().numpy()
for i in range(1, len(sentences)):
    similarity = cosine_similarity(sentence_embeddings_np[0:1], sentence_embeddings_np[i:i + 1])[0][0]
    print(f"Cosine similarity between : '{sentences[0]}' and '{sentences[i]}': {similarity:.3f}")

Cosine similarity between : 'That is a happy person' and 'Today is a sunny day': 0.257
Cosine similarity between : 'That is a happy person' and 'That is a very happy person': 0.943
Cosine similarity between : 'That is a happy person' and 'That is a happy dog': 0.695
