## Word based

In [4]:
# Naive word-level tokenisation: cut the sentence at every single space.
# Punctuation stays attached to the neighbouring word.
text = "Hello, how're you doing?"
text.split(sep=" ")

['Hello,', "how're", 'you', 'doing?']

## Character based
- Very long sequences, less meaningful individual tokens
- splits text into characters rather than words. 
- Fewer out-of-vocabulary tokens compared to word-based tokenisation

## Subword tokenisation
- Combines the best of word and character based.
- Frequent words are kept whole, while rare words are decomposed into meaningful subwords.
- Eg BPE

In [8]:
from transformers import BertTokenizer

# Load the pretrained tokenizer for the uncased BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Break the sentence into subword tokens. In the printed result a "##" prefix
# marks a piece that continues the previous token ('transform' + '##er').
sentence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sentence)
print(tokens)

['using', 'a', 'transform', '##er', 'network', 'is', 'simple']


In [7]:
from transformers import AutoTokenizer

# Load the tokenizer for the uncased BERT checkpoint through AutoTokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Calling the tokenizer on raw text yields input_ids, token_type_ids and
# attention_mask in a single step.
encoded_text = tokenizer(text)
print(encoded_text)

{'input_ids': [101, 7592, 1010, 2129, 1005, 2128, 2017, 2725, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}




## Encoding

In [9]:
# Look up the vocabulary id of each token string produced by tokenize().
# The result has exactly one id per token; nothing extra is added.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[2478, 1037, 10938, 2121, 2897, 2003, 3722]


In [10]:
# Turn the ids back into text; subword pieces are merged into whole words again.
decoded = tokenizer.decode(ids)
print(decoded)

using a transformer network is simple


## Multiple Sequence
- Pad smaller sentences to the length of the longest one
- Batching: Sending multiple sentences through the model all at once

In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# One checkpoint name drives both the tokenizer and the classifier.
checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

seq = "I've been waiting for a HuggingFace course my whole life."

# Manual path: text -> tokens -> ids. This route does not add the start/end
# ids that calling the tokenizer directly produces.
tokens = tokenizer.tokenize(seq)
ids = tokenizer.convert_tokens_to_ids(tokens)

# The model is fed a 2-D tensor, so add a leading batch dimension of size 1.
input_ids = torch.tensor(ids).unsqueeze(0)
print(f"Input ids: {input_ids}")

output = model(input_ids)
print(f"Logits: {output.logits}")

Input ids: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [19]:
# Two toy sequences of different lengths (the ids themselves are arbitrary).
seq1_ids = [[200, 200, 200]]
seq2_ids = [[200, 200]]

# Take the padding id from the tokenizer instead of hard-coding a number:
# in the BERT uncased vocabulary id 100 is [UNK], not the pad token.
padding = tokenizer.pad_token_id

# Right-pad every sequence to the length of the longest one.
sequences = [seq1_ids[0], seq2_ids[0]]
max_len = max(len(ids) for ids in sequences)
batched_ids = [ids + [padding] * (max_len - len(ids)) for ids in sequences]

# To tell the model to ignore the padding: 1 = real token, 0 = padding.
attention_mask = [
    [1] * len(ids) + [0] * (max_len - len(ids))
    for ids in sequences
]

output = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(f"Logits: {output.logits}")

Logits: tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


## All together

In [20]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Calling the tokenizer directly returns input_ids (with the start/end ids
# included) together with the matching attention_mask.
mod = tokenizer(seq)
print(mod)

{'input_ids': [101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [21]:
# A batch of two sentences of different lengths. With no padding requested,
# the tokenizer returns one list per sentence, each with its own length.
seq2 = [
    "I've been waiting for a HuggingFace course my whole life.",
    "So have I",
]
mod2 = tokenizer(seq2)
print(mod2)

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}


In [22]:
# Full pipeline in one call: tokenize, pad/truncate, and return PyTorch tensors.
# NOTE(review): `seq` is a single sentence, so padding=True has nothing to pad
# here; presumably a batch such as `seq2` was intended — confirm.
tokens = tokenizer(seq, padding=True, truncation=True, return_tensors="pt")
# Unpack the encoding's fields as keyword arguments to the model.
output = model(**tokens)
print(output.logits)

tensor([[-1.5607,  1.6123]], grad_fn=<AddmmBackward0>)
