In [4]:
# Install the Hugging Face `transformers` library if you haven't already.
# It provides the BertTokenizer and BertModel classes imported in later cells.
# NOTE(review): `torch` is also imported in a later cell but is not installed by
# any visible cell - presumably it is preinstalled in this runtime; confirm.
%pip install transformers




In [14]:
# Interactive demo: read one sentence from the user and run it through all
# three steps (word tokens, sub-word tokens, sentence embedding).
#
# NOTE(review): perform_word_tokenization, perform_subword_tokenization and
# generate_sentence_embedding are defined in cells that appear further down in
# this notebook. On a fresh kernel with "Run All", this cell is reached before
# those definitions and the first call raises NameError. A cell with the same
# steps appears again after the definitions; run that one, or move this cell
# below them.
print("--- Combined Tokenization and Embedding --- ")
# input() blocks until the user types a sentence and presses Enter.
sentence_for_all = input("Enter a sentence to process: ")

# Perform Word Tokenization
perform_word_tokenization(sentence_for_all)

# Perform Sub-word Tokenization
perform_subword_tokenization(sentence_for_all)

# Generate and Display Embedding
# (the returned tensor is not captured here; the function prints its own summary)
generate_sentence_embedding(sentence_for_all)

print("------------------------------------------")

--- Combined Tokenization and Embedding --- 
Enter a sentence to process: Every moment is a beginning.

Original Sentence (Word Tokenization): Every moment is a beginning.
Word Tokens: ['Every', 'moment', 'is', 'a', 'beginning', '.']

Original Sentence (Sub-word Tokenization): Every moment is a beginning.
Sub-word Tokens: ['every', 'moment', 'is', 'a', 'beginning', '.']

Original Sentence (Embedding): Every moment is a beginning.
Shape of sentence embedding: torch.Size([768])
Sentence Embedding (first 10 dimensions): tensor([ 0.0380,  0.3059, -0.2016,  0.1223, -0.6078, -0.3073,  0.2931,  0.5669,
         0.3546, -0.7233])
------------------------------------------


### What is Tokenization?

Tokenization is the process of breaking down a text into smaller units called "tokens." These tokens can be words, subwords, or even characters, depending on the tokenization method. It's a fundamental step in Natural Language Processing (NLP).



### 1. Word Tokenization

Word tokenization involves splitting text into individual words. This is often the simplest form of tokenization. We will use the `nltk` library for this.


In [1]:
# Install NLTK if you haven't already.
# It provides the `word_tokenize` function used for word tokenization in a later cell.
%pip install nltk




In [6]:
import nltk

# Download the 'punkt' tokenizer models, which are necessary for word tokenization
nltk.download('punkt')
# This call is the last expression in the cell, so its return value (True) is
# what the notebook displays as the cell output.
nltk.download('punkt_tab') # Download 'punkt_tab' to resolve the LookupError (presumably raised by word_tokenize without it; confirm)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

### 2. Sub-word Tokenization

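Sub-word tokenization breaks words into smaller units (sub-words). This is useful for handling out-of-vocabulary words, reducing vocabulary size, and capturing morphological information. For example, an unfamiliar or misspelled word such as `begiinning` is split into `beg`, `##ii`, `##nni`, `##ng`, where the `##` prefix marks a piece that continues the previous one. We will use a pre-trained tokenizer from the `transformers` library (specifically, a BERT tokenizer) for this.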


In [10]:
from nltk.tokenize import word_tokenize

def perform_word_tokenization(sentence: str) -> list:
    """Split ``sentence`` into word-level tokens with NLTK and print them.

    Args:
        sentence: The text to tokenize.

    Returns:
        The list of tokens produced by ``word_tokenize``. The tokens are
        returned as well as printed so callers can reuse them, mirroring
        ``generate_sentence_embedding``, which also returns its result.
    """
    tokens = word_tokenize(sentence)
    print("\nOriginal Sentence (Word Tokenization):", sentence)
    print("Word Tokens:", tokens)
    return tokens

# The function will be called from the combined cell.

In [11]:
from transformers import BertTokenizer

# Load the pre-trained BERT tokenizer for the 'bert-base-uncased' checkpoint.
# The recorded outputs in this notebook show its tokens lower-cased (e.g. 'every').
# Other tokenizer classes such as RobertaTokenizer or XLNetTokenizer can be used
# instead; note these are tokenizer classes (not model names), so the checkpoint
# name passed to from_pretrained must be changed to match.
# This module-level `tokenizer` is also used by generate_sentence_embedding.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def perform_subword_tokenization(sentence: str) -> list:
    """Split ``sentence`` into sub-word tokens and print them.

    Uses the module-level ``tokenizer`` loaded in this cell.

    Args:
        sentence: The text to tokenize.

    Returns:
        The list of sub-word tokens produced by ``tokenizer.tokenize``. The
        tokens are returned as well as printed so callers can reuse them,
        mirroring ``generate_sentence_embedding``, which also returns its result.
    """
    tokens = tokenizer.tokenize(sentence)

    print("\nOriginal Sentence (Sub-word Tokenization):", sentence)
    print("Sub-word Tokens:", tokens)
    return tokens

# The function will be called from the combined cell.

### 3. Creating Embeddings

Embeddings are numerical representations of text that capture semantic meaning. We'll use the pre-trained BERT model to generate these for a given sentence, taking the final-layer vector of the `[CLS]` token (the first token) as the sentence embedding. These embeddings are often used as input for various downstream NLP tasks like sentiment analysis, text classification, or semantic search.

In [15]:
from transformers import BertModel
import torch

# Load a pre-trained BERT model
# 'output_hidden_states=True' asks the model to also return the hidden states of every layer.
# NOTE(review): generate_sentence_embedding below only reads `output.last_hidden_state`,
# so this flag looks unnecessary unless another cell reads `output.hidden_states` - confirm.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
model.eval() # Set the model to evaluation mode

def generate_sentence_embedding(sentence, preview_dims=None):
    """Compute, print and return a BERT sentence embedding for ``sentence``.

    The embedding is the last-hidden-state vector of the first token
    (the [CLS] token). Uses the module-level ``tokenizer`` and ``model``.

    Args:
        sentence: The text to embed.
        preview_dims: If ``None`` (default), the full embedding tensor is
            printed, as before. If a non-negative int, only the first
            ``preview_dims`` values are printed, which keeps the cell output
            short. The returned tensor is always the full embedding.

    Returns:
        The embedding tensor: the first-token slice of the last hidden state,
        with size-1 dimensions squeezed out.

    Raises:
        ValueError: If ``preview_dims`` is negative.
    """
    if preview_dims is not None and preview_dims < 0:
        raise ValueError("preview_dims must be a non-negative integer or None")

    # Tokenize the sentence and get input IDs and attention mask
    encoded_input = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)

    with torch.no_grad(): # Disable gradient calculation for inference
        output = model(**encoded_input)

    # The last hidden state contains the contextualized embeddings for each token
    last_hidden_state = output.last_hidden_state

    # For sentence embedding, often the embedding of the [CLS] token (first token) is used
    # Alternatively, you could average all token embeddings or use other pooling strategies
    sentence_embedding = last_hidden_state[:, 0, :].squeeze()

    print("\nOriginal Sentence (Embedding):", sentence)
    print("Shape of sentence embedding:", sentence_embedding.shape)
    if preview_dims is None:
        print("Sentence Embedding (full tensor):", sentence_embedding)
    else:
        # Slice the last axis so the preview works whether or not a batch
        # dimension survived the squeeze.
        print(
            f"Sentence Embedding (first {preview_dims} dimensions):",
            sentence_embedding[..., :preview_dims],
        )

    return sentence_embedding

# The function will be called from the combined cell.

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [16]:
# Interactive demo: read one sentence and push it through every stage
# defined in the cells above.
print("--- Combined Tokenization and Embedding --- ")
sentence_for_all = input("Enter a sentence to process: ")

# Stages run in order: word tokens, sub-word tokens, then the sentence embedding.
# Each stage prints its own report; return values are not captured here.
for run_stage in (
    perform_word_tokenization,
    perform_subword_tokenization,
    generate_sentence_embedding,
):
    run_stage(sentence_for_all)

print("------------------------------------------")

--- Combined Tokenization and Embedding --- 
Enter a sentence to process: Every moment is a begiinning

Original Sentence (Word Tokenization): Every moment is a begiinning
Word Tokens: ['Every', 'moment', 'is', 'a', 'begiinning']

Original Sentence (Sub-word Tokenization): Every moment is a begiinning
Sub-word Tokens: ['every', 'moment', 'is', 'a', 'beg', '##ii', '##nni', '##ng']

Original Sentence (Embedding): Every moment is a begiinning
Shape of sentence embedding: torch.Size([768])
Sentence Embedding (full tensor): tensor([-1.5569e-01,  2.1537e-01,  1.0082e-01, -5.5937e-02, -2.8631e-01,
        -1.9818e-01,  1.3434e-01,  3.7674e-01,  2.1692e-02, -4.3172e-01,
        -7.1860e-02,  2.7242e-02,  1.5415e-01,  2.8584e-01,  1.6479e-01,
        -2.3501e-02,  4.6647e-02,  3.2918e-01,  4.3362e-02, -1.9999e-01,
        -1.0753e-01, -5.3640e-02,  8.6403e-02, -8.7758e-02, -2.5036e-02,
        -1.0619e-01, -5.5695e-03, -1.8419e-01,  2.7631e-02,  3.1525e-01,
         2.1008e-01,  1.8959e-01, -1.