# Tokenizer

In [2]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the tokenizer that belongs to the pretrained 'bert-base-uncased' checkpoint.
# Every cell below reuses this `tokenizer` object.
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
# The vocabulary maps each token string to an integer token ID.
# It holds 30522 entries, so printing the whole dict would flood the output;
# instead, look up a handful of tokens to show what the mapping looks like.
vocab = tokenizer.vocab  # read once and reuse for both the sample and the size
sample_tokens = ['[CLS]', '[SEP]', 'i', 'love', 'nl', '##p']
print({tok: vocab[tok] for tok in sample_tokens})
print(f'The vocabulary size is {len(vocab)}')

The vocabulary size is 30522


In [5]:
# Tokenize a single sentence step by step.
# The original sentence is "I love NLP so freaking much".

# bert-base-uncased uses WordPiece tokenization and lowercases the text, so the printed
# tokens are all lowercase. "NLP" is not kept as one token: it is split into 'nl' and '##p',
# where the '##' prefix marks a piece that continues the previous piece of the same word.

# encode() maps the tokens to their numerical IDs from the vocabulary and also adds the
# special tokens: [CLS] (ID 101) at the front and [SEP] (ID 102) at the back.
# decode() turns the IDs back into text, special tokens included.
sentence = 'I love NLP so freaking much'
print(sentence)

tokens = tokenizer.tokenize(sentence)
print(tokens)

ids = tokenizer.encode(sentence)
print(ids)
print(tokenizer.decode(ids))

I love NLP so freaking much
['i', 'love', 'nl', '##p', 'so', 'freaking', 'much']
[101, 1045, 2293, 17953, 2361, 2061, 13847, 2172, 102]
[CLS] i love nlp so freaking much [SEP]


In [6]:
# I can also tokenize two sentences together as one sentence pair.
first_sent = 'I like NLP'
second_sent = 'What about you'

# return_tensors='pt' means that I want PyTorch tensors back.
# NOTE(review): the name `input` shadows Python's built-in input(); it is left as is
# because the following cells index this variable by that name.
input = tokenizer(first_sent, second_sent, return_tensors='pt')
input

# The tokenizer returns three tensors: input_ids, token_type_ids and attention_mask.
# These are the inputs that can be fed into the BERT model.

{'input_ids': tensor([[  101,  1045,  2066, 17953,  2361,   102,  2054,  2055,  2017,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [7]:
# Token IDs of the sentence pair: 101 is [CLS], and 102 is [SEP], which closes each of the two sentences.
input['input_ids']

tensor([[  101,  1045,  2066, 17953,  2361,   102,  2054,  2055,  2017,   102]])

In [8]:
# Segment IDs: 0 for the first sentence (with [CLS] and its [SEP]), 1 for the second sentence and the final [SEP].
input['token_type_ids']

tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1]])

In [9]:
# All 1's here: a single sentence pair needs no padding, so every position holds a real token.
input['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [10]:
# Encode two sentences of different lengths as a batch: passing a list gives one row per
# sentence, unlike the sentence-pair call, which joins both sentences into a single row.
# NOTE(review): this rebinds first_sent, second_sent and input from the earlier cells, so
# re-running those earlier display cells afterwards shows this batch instead of the pair.
first_sent = 'I like NLP.'
second_sent = 'What are your thoughts on the subject?'

# padding=True pads the shorter sentence so that both rows have the same length.
input = tokenizer([first_sent, second_sent], padding=True, return_tensors='pt')
input['attention_mask']

# The 1's mark real tokens (including [CLS] and [SEP]) and the 0's mark the padding positions.

tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])