<a href="https://colab.research.google.com/github/Net-AI-Git/LLMs-02---Implementing-Tokenization/blob/main/implementing_tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk
!pip install transformers==4.42.1
!pip install sentencepiece
!pip install spacy
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm
!pip install scikit-learn
!pip install torch==2.2.2
!pip install torchtext==0.17.2
!pip install numpy==1.26.0

In [None]:
import nltk
nltk.download("punkt")
nltk.download('punkt_tab')
import spacy
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.util import ngrams
from transformers import BertTokenizer
from transformers import XLNetTokenizer

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; stands in for ``warnings.warn``."""
    return None


# Silence warnings in two ways: swap out the function that emits them and tell the
# filter machinery to ignore anything that is still raised through other paths.
warnings.warn = warn
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /home/jupyterlab/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/jupyterlab/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


---


In [None]:
# Split a sentence into word-level tokens with NLTK's word_tokenize; the recorded
# output shows the trailing period coming out as its own token.
text = "Natural language processing helps computers understand human communication."
tokens = word_tokenize(text)
print(tokens)

['Natural', 'language', 'processing', 'helps', 'computers', 'understand', 'human', 'communication', '.']


In [None]:
# NLTK's word_tokenize on several sentences: punctuation becomes separate tokens and
# the contraction "Don't" is split into 'Do' and "n't" (see the recorded output).

text = "Models learn language patterns. Can they reason? Don't doubt it."
tokens = word_tokenize(text)
print(tokens)

['Models', 'learn', 'language', 'patterns', '.', 'Can', 'they', 'reason', '?', 'Do', "n't", 'doubt', 'it', '.']


In [None]:
# Tokenize with spaCy directly: load the small English pipeline and run it on the text.
# (torchtext's get_tokenizer is not involved in this cell.)

text = "GPT models can't process infinite contexts. They don't have unlimited memory."
nlp = spacy.load("en_core_web_sm")
print("nlp:", nlp)
doc = nlp(text)
print("doc:", doc)

# Make a list of the token strings and print it
token_list = [token.text for token in doc]
print("Tokens:", token_list)

# Show each token with its part-of-speech tag and dependency label
for token in doc:
    print(token.text, token.pos_, token.dep_)

nlp: <spacy.lang.en.English object at 0x75d5bd149a90>
doc: GPT models can't process infinite contexts. They don't have unlimited memory.
Tokens: ['GPT', 'models', 'ca', "n't", 'process', 'infinite', 'contexts', '.', 'They', 'do', "n't", 'have', 'unlimited', 'memory', '.']
GPT PROPN compound
models NOUN nsubj
ca AUX aux
n't PART neg
process VERB ROOT
infinite PROPN compound
contexts PROPN dobj
. PUNCT punct
They PRON nsubj
do AUX aux
n't PART neg
have VERB ROOT
unlimited ADJ amod
memory NOUN dobj
. PUNCT punct


In [None]:
# Word-level tokenization of the sentence that the BERT and XLNet cells below also tokenize.
# Note: despite its singular name, `token` holds the whole list of tokens.
text = "BERT uses bidirectional attention mechanisms."
token = word_tokenize(text)
print(token)

['BERT', 'uses', 'bidirectional', 'attention', 'mechanisms', '.']


In [None]:
# Load the pretrained bert-base-uncased tokenizer and tokenize the sentence into subwords.
# In the recorded output the text is lowercased and continuation pieces carry a '##' prefix.
# This binds the global name `tokenizer`, which later cells rebind to other tokenizers.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.tokenize("BERT uses bidirectional attention mechanisms.")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

['bert',
 'uses',
 'bid',
 '##ire',
 '##ction',
 '##al',
 'attention',
 'mechanisms',
 '.']

In [None]:
# Load the pretrained xlnet-base-cased tokenizer and tokenize the same sentence.
# In the recorded output, case is preserved and pieces that start a word carry a '▁' prefix.
# This rebinds the global name `tokenizer`.
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
tokenizer.tokenize("BERT uses bidirectional attention mechanisms.")

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

['▁B', 'ERT', '▁uses', '▁bi', 'directional', '▁attention', '▁mechanisms', '.']

In [None]:
# Toy corpus of (integer, text) pairs. The integer is presumably a class/category label
# (TODO confirm); the cells below only tokenize the text element.
dataset = [
    (1,"Introduction to NLP"),
    (2,"Basics of PyTorch"),
    (1,"NLP Techniques for Text Classification"),
    (3,"Named Entity Recognition with PyTorch"),
    (3,"Sentiment Analysis using PyTorch"),
    (3,"Machine Translation with PyTorch"),
    (1," NLP Named Entity,Sentiment Analysis,Machine Translation "),
    (1," Machine Translation with NLP "),
    (1," Named Entity vs Sentiment Analysis  NLP ")]

In [None]:
from torchtext.data.utils import get_tokenizer

In [None]:
# Rebind the global `tokenizer` to torchtext's "basic_english" tokenizer;
# yield_tokens (defined below) reads this global when it runs.
tokenizer = get_tokenizer("basic_english")

In [None]:
# Tokenize the text (tuple element 1) of the first dataset entry.
tokenizer(dataset[0][1])

['introduction', 'to', 'nlp']

In [None]:
def yield_tokens(data_iter):
    """Lazily tokenize the text of every two-element entry in ``data_iter``.

    The first element of each entry is ignored; the second is passed to the
    module-level ``tokenizer`` and the resulting token list is yielded.
    """
    yield from (tokenizer(sentence) for _, sentence in data_iter)

In [None]:
# Create a generator over the tokenized dataset; the next() calls in later cells consume it.
my_iterator = yield_tokens(dataset)

In [None]:
# Pull (and thereby consume) the next tokenized sentence from the generator.
next(my_iterator)

['introduction', 'to', 'nlp']

In [None]:
# Build the vocabulary from a fresh pass over the tokenized dataset, with '<unk>' as a
# special token, then make '<unk>' the index returned for out-of-vocabulary lookups.
vocab = build_vocab_from_iterator(yield_tokens(dataset), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [None]:
def get_tokenized_sentence_and_indices(iterator):
    """Take the next tokenized sentence from ``iterator`` and map it to vocabulary indices.

    Each token is looked up in the module-level ``vocab``. ``StopIteration`` from
    ``next`` propagates when ``iterator`` has nothing left.

    Returns:
        A ``(tokens, indices)`` tuple: the token list and the parallel list of indices.
    """
    sentence_tokens = next(iterator)
    indices = []
    for piece in sentence_tokens:
        indices.append(vocab[piece])
    return sentence_tokens, indices

# Consume the next sentence from the shared generator and look up its vocabulary indices.
tokenized_sentence, token_indices = get_tokenized_sentence_and_indices(my_iterator)
# NOTE(review): this advances the generator once more and discards that sentence; nothing is
# displayed because it is not the cell's last expression. Confirm whether it is intentional.
next(my_iterator)

print("Tokenized Sentence:", tokenized_sentence)
print("Token Indices:", token_indices)

Tokenized Sentence: ['basics', 'of', 'pytorch']
Token Indices: [11, 15, 2]


In [None]:
lines = ["Multimodal models process multiple formats",
         "They don't understand visual content",
         "Cross-modal attention connects different inputs."]

# Reserved tokens, in the order they should occupy the first vocabulary indices.
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

tokenizer_en = get_tokenizer('spacy', language='en_core_web_sm')

tokens = []
max_length = 0

# Tokenize each line, wrap it in begin/end-of-sequence markers and track the longest result.
for line in lines:
    tokenized_line = tokenizer_en(line)
    tokenized_line = ['<bos>'] + tokenized_line + ['<eos>']
    tokens.append(tokenized_line)
    max_length = max(max_length, len(tokenized_line))

# Right-pad every sequence with '<pad>' up to the longest sequence length.
tokens = [sequence + ['<pad>'] * (max_length - len(sequence)) for sequence in tokens]

print("Lines after adding special tokens:\n", tokens)

# Register every reserved symbol as a special token so that their indices are fixed by
# `special_symbols` rather than by how often each symbol happens to occur in the data.
vocab = build_vocab_from_iterator(tokens, specials=special_symbols)
# Out-of-vocabulary lookups return the '<unk>' index.
vocab.set_default_index(vocab["<unk>"])

# Vocabulary and Token Ids
print("Vocabulary:", vocab.get_itos())
# NOTE: despite the label, this prints the complete token -> ID mapping.
print("Token IDs for 'tokenization':", vocab.get_stoi())

Lines after adding special tokens:
 [['<bos>', 'Multimodal', 'models', 'process', 'multiple', 'formats', '<eos>', '<pad>', '<pad>', '<pad>'], ['<bos>', 'They', 'do', "n't", 'understand', 'visual', 'content', '<eos>', '<pad>', '<pad>'], ['<bos>', 'Cross', '-', 'modal', 'attention', 'connects', 'different', 'inputs', '.', '<eos>']]
Vocabulary: ['<unk>', '<pad>', '<bos>', '<eos>', '-', '.', 'Cross', 'Multimodal', 'They', 'attention', 'connects', 'content', 'different', 'do', 'formats', 'inputs', 'modal', 'models', 'multiple', "n't", 'process', 'understand', 'visual']
Token IDs for 'tokenization': {'visual': 22, 'understand': 21, 'process': 20, "n't": 19, 'multiple': 18, 'models': 17, 'modal': 16, 'inputs': 15, 'formats': 14, 'do': 13, '<unk>': 0, '<pad>': 1, '-': 4, '<bos>': 2, '<eos>': 3, 'connects': 10, '.': 5, 'Multimodal': 7, 'content': 11, 'Cross': 6, 'They': 8, 'attention': 9, 'different': 12}


In [None]:
new_line = "I learned about embeddings and attention mechanisms."

# Tokenize the new line and wrap it in the same begin/end-of-sequence markers
tokenized_new_line = tokenizer_en(new_line)
tokenized_new_line = ['<bos>'] + tokenized_new_line + ['<eos>']

# Right-pad up to max_length; a line that is already that long (or longer) gets no padding
new_line_padded = tokenized_new_line + ['<pad>'] * (max_length - len(tokenized_new_line))

# Convert tokens to IDs. Unknown words need no special-casing: the vocabulary's default
# index was set to '<unk>', so an out-of-vocabulary lookup already returns that ID.
new_line_ids = [vocab[token] for token in new_line_padded]

# Show the encoded sequence
print("Token IDs for new line:", new_line_ids)

Token IDs for new line: [2, 0, 0, 0, 0, 0, 9, 0, 5, 3]


In [None]:
# Sample passage that the tokenizer comparison cell below feeds to each tokenizer.
# NOTE(review): the passage ends with a stray double quote after "next!"; NLTK emits it as a
# separate "''" token in the recorded output. Confirm whether it is intended.
text = """
Going through the world of tokenization has been like walking through a huge maze made of words, symbols, and meanings. Each turn shows a bit more about the cool ways computers learn to understand our language. And while I'm still finding my way through it, the journey’s been enlightening and, honestly, a bunch of fun.
Eager to see where this learning path takes me next!"
"""

# Counting and displaying tokens and their frequency
from collections import Counter
def show_frequencies(tokens, method_name):
    """Print how often each token occurs, labelled with the tokenizer's name.

    Args:
        tokens: Iterable of hashable tokens to count.
        method_name: Label placed at the start of the printed line.
    """
    frequencies = dict(Counter(tokens))
    print(f"{method_name} Token Frequencies: {frequencies}\n")

In [None]:
import time
from datetime import datetime, timedelta

import nltk
import spacy
from transformers import BertTokenizer, XLNetTokenizer


def _timed_tokenize(tokenize, passage):
    """Run ``tokenize(passage)`` once and report how long the call took.

    Args:
        tokenize: Callable that takes a string and returns its tokens.
        passage: The text to tokenize.

    Returns:
        A ``(tokens, elapsed)`` tuple where ``elapsed`` is a ``timedelta``.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring short
    # durations; wall-clock datetime.now() can jump if the system time is adjusted.
    start = time.perf_counter()
    result = tokenize(passage)
    return result, timedelta(seconds=time.perf_counter() - start)


# Load the models up front so that only the tokenization call itself is timed.
nlp = spacy.load("en_core_web_sm")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
xlnet_tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

# NLTK Tokenization
nltk_tokens, nltk_time = _timed_tokenize(nltk.word_tokenize, text)

# SpaCy Tokenization
spacy_tokens, spacy_time = _timed_tokenize(
    lambda passage: [token.text for token in nlp(passage)], text
)

# BertTokenizer Tokenization
bert_tokens, bert_time = _timed_tokenize(bert_tokenizer.tokenize, text)

# XLNetTokenizer Tokenization
xlnet_tokens, xlnet_time = _timed_tokenize(xlnet_tokenizer.tokenize, text)

# Display tokens, time taken (in seconds) for each tokenizer, and token frequencies
for method_name, method_tokens, method_time in [
    ("NLTK", nltk_tokens, nltk_time),
    ("SpaCy", spacy_tokens, spacy_time),
    ("Bert", bert_tokens, bert_time),
    ("XLNet", xlnet_tokens, xlnet_time),
]:
    print(f"{method_name} Tokens: {method_tokens}\nTime Taken: {method_time.total_seconds():.6f} seconds\n")
    show_frequencies(method_tokens, method_name)

NLTK Tokens: ['Going', 'through', 'the', 'world', 'of', 'tokenization', 'has', 'been', 'like', 'walking', 'through', 'a', 'huge', 'maze', 'made', 'of', 'words', ',', 'symbols', ',', 'and', 'meanings', '.', 'Each', 'turn', 'shows', 'a', 'bit', 'more', 'about', 'the', 'cool', 'ways', 'computers', 'learn', 'to', 'understand', 'our', 'language', '.', 'And', 'while', 'I', "'m", 'still', 'finding', 'my', 'way', 'through', 'it', ',', 'the', 'journey', '’', 's', 'been', 'enlightening', 'and', ',', 'honestly', ',', 'a', 'bunch', 'of', 'fun', '.', 'Eager', 'to', 'see', 'where', 'this', 'learning', 'path', 'takes', 'me', 'next', '!', "''"]
Time Taken: 0:00:00.000506 seconds

NLTK Token Frequencies: {'Going': 1, 'through': 3, 'the': 3, 'world': 1, 'of': 3, 'tokenization': 1, 'has': 1, 'been': 2, 'like': 1, 'walking': 1, 'a': 3, 'huge': 1, 'maze': 1, 'made': 1, 'words': 1, ',': 5, 'symbols': 1, 'and': 2, 'meanings': 1, '.': 3, 'Each': 1, 'turn': 1, 'shows': 1, 'bit': 1, 'more': 1, 'about': 1, 'cool