# Imports

In [1]:
#!pip install transformers datasets pandas

In [4]:

import pandas as pd
import datasets
from sklearn.model_selection import train_test_split
from datasets import Dataset, Features, ClassLabel, Value

## Transformers - Real tokenizer example

In [5]:
from transformers import AutoTokenizer

# Name (or path) of the pretrained checkpoint whose tokenizer is loaded.
CHECKPOINT = "distilbert-base-uncased"

# Shared tokenizer instance; the cells below call it directly and via preprocess_function.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)


In [6]:
# Display the tokenizer's repr (class, vocabulary size, max length, special tokens).
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [7]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of a batch with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a ``"text"`` key holding the text(s) to encode
            (the cells in this notebook pass a list of strings).

    Returns:
        Whatever ``tokenizer`` returns for that input; in the recorded runs this
        contains ``input_ids`` and ``attention_mask``.

    Raises:
        KeyError: If ``examples`` has no ``"text"`` key.
    """
    texts = examples["text"]
    # Truncation and padding are both switched on so that every row of a batch
    # comes back with the same length.
    return tokenizer(texts, padding=True, truncation=True)

# Noisy social-media style text: a typographic apostrophe, an emoji, ":rocket:"
# shortcodes and "###" markers. The literal is stored as real Unicode characters
# (it had been mis-decoded into mojibake), which is what the recorded output
# reflects: one token for the apostrophe and one [UNK] for the emoji.
sentence = "100000 wasn’t a meme. GME 🚀:rocket::rocket: ### (Empty body)"
encoded = preprocess_function({"text": [sentence]})
print(sentence)
print(encoded)
# Only one sentence was passed in, so row 0 holds all of its token ids.
for token_id in encoded["input_ids"][0]:
    # Decode each id on its own to see the individual word piece it stands for.
    print(token_id, tokenizer.decode([token_id]))

100000 wasn’t a meme. GME 🚀:rocket::rocket: ### (Empty body)
{'input_ids': [[101, 6694, 8889, 2347, 1521, 1056, 1037, 2033, 4168, 1012, 13938, 2063, 100, 1024, 7596, 1024, 1024, 7596, 1024, 1001, 1001, 1001, 1006, 4064, 2303, 1007, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
101 [CLS]
6694 1000
8889 ##00
2347 wasn
1521 ’
1056 t
1037 a
2033 me
4168 ##me
1012 .
13938 gm
2063 ##e
100 [UNK]
1024 :
7596 rocket
1024 :
1024 :
7596 rocket
1024 :
1001 #
1001 #
1001 #
1006 (
4064 empty
2303 body
1007 )
102 [SEP]


In [8]:
# Rare words and the deliberate misspelling ("incorectly") are kept as written:
# the point of this cell is to see how they are broken into word pieces.
sentence = "If you use obscure, arcane, abstruse, recondite or incorectly spelled words, it will still work"
encoded = preprocess_function({"text": [sentence]})
print(sentence)
# Only one sentence was passed in, so row 0 holds all of its token ids.
first_row_ids = encoded["input_ids"][0]
for token_id in first_row_ids:
    piece = tokenizer.decode([token_id])
    print(token_id, piece)

If you use obscure, arcane, abstruse, recondite or incorectly spelled words, it will still work
101 [CLS]
2065 if
2017 you
2224 use
14485 obscure
1010 ,
8115 arc
7231 ##ane
1010 ,
14689 abs
16344 ##tr
8557 ##use
1010 ,
28667 rec
15422 ##ond
4221 ##ite
2030 or
4297 inc
5686 ##ore
6593 ##ct
2135 ##ly
11479 spelled
2616 words
1010 ,
2009 it
2097 will
2145 still
2147 work
102 [SEP]


In [9]:
# Non-English input: Dutch, French (with an accented "é") and Chinese.
# The literal is stored as real Unicode characters (it had been mis-decoded
# into mojibake), which is what the recorded output reflects: "hé" comes out
# as "he" and a few Chinese characters are known tokens while the rest are [UNK].
sentence = "Zelfs antweirpse woorden die in het stadstheatre zouden passen, ca marche hé alors. 即使是晦澀難懂的中文單詞和概念也可以"
encoded = preprocess_function({"text": [sentence]})
print(sentence)
# Only one sentence was passed in, so row 0 holds all of its token ids.
for token_id in encoded["input_ids"][0]:
    # Decode each id on its own to see the individual word piece it stands for.
    print(token_id, tokenizer.decode([token_id]))

Zelfs antweirpse woorden die in het stadstheatre zouden passen, ca marche hé alors. 即使是晦澀難懂的中文單詞和概念也可以
101 [CLS]
27838 ze
10270 ##lf
2015 ##s
14405 ant
19845 ##wei
14536 ##rp
3366 ##se
15854 woo
18246 ##rden
3280 die
1999 in
21770 het
2358 st
19303 ##ads
10760 ##the
4017 ##at
2890 ##re
1062 z
19224 ##oud
2368 ##en
3413 pass
2368 ##en
1010 ,
6187 ca
28791 marche
2002 he
2632 al
5668 ##ors
1012 .
100 [UNK]
100 [UNK]
100 [UNK]
100 [UNK]
100 [UNK]
100 [UNK]
100 [UNK]
1916 的
1746 中
1861 文
100 [UNK]
100 [UNK]
1796 和
100 [UNK]
100 [UNK]
1750 也
100 [UNK]
100 [UNK]
102 [SEP]


In [10]:
# Print the id / decoded-token pairs for whatever `encoded` currently holds
# (it is not reassigned in this cell, so it must already exist).
for token_id in encoded["input_ids"][0]:
    token_text = tokenizer.decode([token_id])
    print(token_id, token_text)

101 [CLS]
27838 ze
10270 ##lf
2015 ##s
14405 ant
19845 ##wei
14536 ##rp
3366 ##se
15854 woo
18246 ##rden
3280 die
1999 in
21770 het
2358 st
19303 ##ads
10760 ##the
4017 ##at
2890 ##re
1062 z
19224 ##oud
2368 ##en
3413 pass
2368 ##en
1010 ,
6187 ca
28791 marche
2002 he
2632 al
5668 ##ors
1012 .
100 [UNK]
100 [UNK]
100 [UNK]
100 [UNK]
100 [UNK]
100 [UNK]
100 [UNK]
1916 的
1746 中
1861 文
100 [UNK]
100 [UNK]
1796 和
100 [UNK]
100 [UNK]
1750 也
100 [UNK]
100 [UNK]
102 [SEP]
