In [1]:
!pip install -q --upgrade transformers torch torchvision torchaudio
!pip install -q tokenizers==0.13.3
!pip install -q bitsandbytes transformers accelerate gradio thread6

[0m

##### There are many types of tokenizers.
##### The first type we'll touch on is word-based tokenization.
##### It's generally easy to set up, has few rules, and yields decent results.
##### Below, the goal is to split the raw text into words and find a numerical representation for each of them.

In [21]:
from IPython.display import Image

# Diagram of word-based tokenization, hosted in the huggingface-course documentation-images dataset.
word_based_diagram_url = (
    "https://huggingface.co/datasets/huggingface-course/documentation-images/"
    "resolve/main/en/chapter2/word_based_tokenization-dark.svg"
)

print("Word Based Tokenization")
# The Image object is the cell's last expression, so the notebook renders it.
Image(url=word_based_diagram_url, width=900, height=900)

Word Based Tokenization


In [6]:
# There are different ways to split text. The diagram above splits on spaces and punctuation;
# this example uses str.split() with no arguments, which splits on whitespace only.
raw_text = "Jim Henson was a puppeteer"
tokenized_text = raw_text.split()

# Display the resulting list of words
tokenized_text

['Jim', 'Henson', 'was', 'a', 'puppeteer']

##### Neither of the tokenization approaches above is recommended, as they have many drawbacks, such as a very large vocabulary size.
##### Also, words like "dog" and "dogs" are treated as unrelated, even though they are similar.
##### A custom token is also needed to represent unknown words; it is typically "[UNK]".

#### One way to reduce the number of unknown tokens is to use 'character-based' tokenization

### Character-based tokenization splits the text into characters (letters, numbers, symbols) rather than words.

#### This provides two benefits:
  ##### - The vocabulary is much smaller
  ##### - There are much fewer out-of-vocabulary (unknown) tokens, since every word can be built from characters

#### Another thing to consider is that we'll end up with a very large number of tokens to be processed by our model

In [20]:
from IPython.display import Image

# Diagram of character-based tokenization, hosted in the huggingface-course documentation-images dataset.
character_based_diagram_url = (
    "https://huggingface.co/datasets/huggingface-course/documentation-images/"
    "resolve/main/en/chapter2/character_based_tokenization-dark.svg"
)

print("Character based tokenization")
# The Image object is the cell's last expression, so the notebook renders it.
Image(url=character_based_diagram_url, width=900, height=900)

Character based tokenization


### Sub-word Tokenization

#### It relies on the principle that frequently used words should not be split into smaller subwords, but rare words should be decomposed into meaningful subwords

   ##### - For example: "annoyingly" can be considered a rare word, so it is decomposed into "annoying" and "ly". These are both likely to appear often as standalone subwords, while at the same time the meaning of "annoyingly" is kept by the composite meaning of "annoying" and "ly"

In [11]:
from IPython.display import Image

# Diagram of subword tokenization, hosted in the huggingface-course documentation-images dataset.
subword_diagram_url = (
    "https://huggingface.co/datasets/huggingface-course/documentation-images/"
    "resolve/main/en/chapter2/bpe_subword-dark.svg"
)

print("subword tokenization")
# The Image object is the cell's last expression, so the notebook renders it.
Image(url=subword_diagram_url, width=900, height=900)

subword tokenization


##### Loading and saving tokenizers works the same way as with models, using the from_pretrained() and save_pretrained() methods
##### These methods will save the algorithm used by the tokenizer as well as the vocabulary

In [14]:
# Load the BERT tokenizer from the same checkpoint name used for the BERT model.
from transformers import BertTokenizer

bert_checkpoint = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

In [15]:
# Like AutoModel, the AutoTokenizer class picks the proper tokenizer class in the library
# from the checkpoint name, so it can be used directly with any checkpoint.
from transformers import AutoTokenizer

checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [16]:
# Let's use the tokenizer: call it directly on a sentence and display the returned encoding.
sample_sentence = "He had a vague sense that trees gave birth to dinosaurs."
tokenizer(sample_sentence)

{'input_ids': [101, 1124, 1125, 170, 14673, 2305, 1115, 2863, 1522, 3485, 1106, 23570, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [18]:
# Saving a tokenizer works the same way as saving a model: pass the target directory
# to save_pretrained(); its return value is displayed as the cell output.
output_dir = "/notebooks/NLP_huggingface/Chapter_2"
tokenizer.save_pretrained(output_dir)

('/notebooks/NLP_huggingface/Chapter_2/tokenizer_config.json',
 '/notebooks/NLP_huggingface/Chapter_2/special_tokens_map.json',
 '/notebooks/NLP_huggingface/Chapter_2/vocab.txt',
 '/notebooks/NLP_huggingface/Chapter_2/added_tokens.json',
 '/notebooks/NLP_huggingface/Chapter_2/tokenizer.json')

In [2]:
# Tokenize a sample sentence and display its tokenized form.
from transformers import AutoTokenizer

checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sequence = "Using a Transformer network is simple"

# The output is a list of strings; notice that it uses sub-word tokenization.
tokens = tokenizer.tokenize(sequence)
tokens

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']

In [3]:
# The conversion to input IDs is handled by the tokenizer's convert_tokens_to_ids() method.
# `tokens` is the list of sub-word strings produced by tokenizer.tokenize() in the previous cell.
ids = tokenizer.convert_tokens_to_ids(tokens)

# Display the resulting list of IDs
ids

[7993, 170, 13809, 23763, 2443, 1110, 3014]

In [5]:
# Exercise example

# Sample sentences to tokenize
example_sequence = [
    "I’ve been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# tokenize() is documented to take a single string, so tokenize each sentence on its own
# and flatten the results into one token list. Passing the whole list only happened to
# work for exactly two sentences (presumably because a two-item list is read as a
# text / text-pair input — confirm against the tokenizers documentation).
example_tokens = [
    token
    for sentence in example_sequence
    for token in tokenizer.tokenize(sentence)
]

print(example_tokens)

# Convert the tokens to input IDs
example_ids = tokenizer.convert_tokens_to_ids(example_tokens)

print(example_ids)

['I', '’', 've', 'been', 'waiting', 'for', 'a', 'Hu', '##gging', '##F', '##ace', 'course', 'my', 'whole', 'life', '.', 'I', 'hate', 'this', 'so', 'much', '!']
[146, 787, 1396, 1151, 2613, 1111, 170, 20164, 10932, 2271, 7954, 1736, 1139, 2006, 1297, 119, 146, 4819, 1142, 1177, 1277, 106]


In [9]:
# Decoding goes the other way: input IDs -> tokens -> text.
decoded_string = tokenizer.decode(example_ids)

# Note that decode() not only converts the IDs back to tokens, but also groups together the tokens that were part of the same word to produce a readable sentence.
decoded_string

'I ’ ve been waiting for a HuggingFace course my whole life. I hate this so much!'