# Tokenizers (PyTorch)

Install the Transformers and Datasets libraries to run this notebook.

In [1]:
! pip install datasets transformers[sentencepiece]



In [2]:
# Sample sentence reused by the tokenization cells in this notebook.

text = "Jim Henson was a puppeteer"

In [3]:
# Word-level tokenization: split the sample sentence into words.
# str.split() with no arguments splits on runs of whitespace, so any
# punctuation would stay attached to the neighbouring word.

tokenized_text = text.split()
print(tokenized_text)

['Jim', 'Henson', 'was', 'a', 'puppeteer']


In [4]:
# Subword tokenization with the tokenizer of the "bert-base-uncased" checkpoint.
# tokenize() returns token strings only; nothing is mapped to IDs here.
# In the output below the text comes back lower-cased, and the "##" prefix
# marks a piece that continues the previous one ("puppet" + "##eer").

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer.tokenize(text)
print(tokens)

['jim', 'henson', 'was', 'a', 'puppet', '##eer']


In [5]:
# Tokenizing the same text using an ALBERT tokenizer ("albert-base-v1").
# In the output below, pieces that start a word carry a leading "▁" marker,
# while the continuation piece ("eer") has none.
# Note: this rebinds `tokenizer` and `tokens` from the previous cell.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("albert-base-v1")
tokens = tokenizer.tokenize(text)
print(tokens)

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

['▁jim', '▁henson', '▁was', '▁a', '▁puppet', 'eer']


In [6]:
# Load the "bert-base-cased" tokenizer through the model-specific
# BertTokenizer class. Rebinds `tokenizer`.

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
# Load the tokenizer of the same checkpoint ("bert-base-cased") using the
# generic AutoTokenizer class instead of BertTokenizer. Rebinds `tokenizer`.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [8]:
# Call the tokenizer directly on a sentence to produce the model inputs.
# The result (displayed below) holds 'input_ids', 'token_type_ids' and
# 'attention_mask'. Its input IDs have two more entries (101 first, 102 last)
# than convert_tokens_to_ids yields for the same sentence later in this
# notebook — presumably the special tokens the tokenizer adds.

tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
# Saving a tokenizer locally.
# save_pretrained writes the tokenizer files into the given directory and
# returns the paths of those files (displayed below).

tokenizer.save_pretrained("directory_on_my_computer")

('directory_on_my_computer/tokenizer_config.json',
 'directory_on_my_computer/special_tokens_map.json',
 'directory_on_my_computer/vocab.txt',
 'directory_on_my_computer/added_tokens.json',
 'directory_on_my_computer/tokenizer.json')

In [10]:
# Encoding, step 1: use tokenizer.tokenize to split a sentence into subword
# token strings. Reloads the "bert-base-cased" tokenizer (rebinding
# `tokenizer`); `tokens` is consumed by the next cell.

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)

print(tokens)

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']


In [11]:
# Encoding, step 2: use tokenizer.convert_tokens_to_ids to map the token
# strings produced in the previous cell to input IDs.
# The result below has exactly one ID per token (7 tokens -> 7 IDs); no extra
# IDs are added at this stage.

ids = tokenizer.convert_tokens_to_ids(tokens)

print(ids)

[7993, 170, 13809, 23763, 2443, 1110, 3014]


In [12]:
# Decoding a list of token IDs back into a string (if a subword or character
# method was used, the pieces are grouped back into words by default).
# NOTE(review): these IDs are hard-coded and are not the `ids` computed in the
# previous cell (11303, 1200 here vs 13809, 23763 there); the decoded output
# below reads "transformer" with a lower-case "t". Confirm this is intentional.

decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])
print(decoded_string)

Using a transformer network is simple


In [13]:
# Putting it all together: tokenize -> convert to IDs -> add special tokens -> decode.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Token strings -> vocabulary IDs (no special tokens at this point).
tokens = tokenizer.tokenize(text)
input_ids = tokenizer.convert_tokens_to_ids(tokens)

# prepare_for_model returns the full set of model inputs built from the raw
# IDs; decoding its 'input_ids' makes the added special tokens visible.
final_input_ids = tokenizer.prepare_for_model(input_ids)
decoded_ids = tokenizer.decode(final_input_ids['input_ids'])

# Each section is printed as: heading, value, then two blank lines.
sections = (
    ('Input IDs: ', input_ids),
    ('Input IDs with special tokens: ', final_input_ids),
    ('Decoded IDs: ', decoded_ids),
)
for heading, value in sections:
    print(heading, value, '\n', sep='\n')

Input IDs: 
[3958, 27227, 2001, 1037, 13997, 11510]


Input IDs with special tokens: 
{'input_ids': [101, 3958, 27227, 2001, 1037, 13997, 11510, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


Decoded IDs: 
[CLS] jim henson was a puppeteer [SEP]


