## Chapter 2: Working with Text Data

### 1) Tokenization

In [13]:
import os 


# Read the whole corpus into memory. The encoding is passed explicitly so the
# Polish diacritics decode the same way on every platform instead of depending
# on the locale default used by open(). Assumes the file is stored as UTF-8.
with open("../data_brzechwa.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

In [14]:
import re

# Toy sentence used to illustrate the splitting step.
text = "Hello world. This is a test"

# The capturing group makes re.split keep every whitespace separator as its
# own element of the output list instead of discarding it.
result = re.split(pattern=r'(\s)', string=text)

print(result)

['Hello', ' ', 'world.', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test']


In [15]:
# Split on commas and periods in addition to whitespace; the capturing group
# keeps each delimiter as a separate list element.
result = re.split(pattern=r'([,.]|\s)', string=text)
print(result)

['Hello', ' ', 'world', '.', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test']


In [16]:
# Drop the single-space and empty-string elements left over from the split.
result = list(filter(lambda piece: piece not in (' ', ''), result))
print(result)

['Hello', 'world', '.', 'This', 'is', 'a', 'test']


In [19]:
# Sample sentence; note that the split below runs on raw_text, not on this
# string.
text = "Hello, world. is this-- a test?"

# Tokenize the full corpus: split on the listed punctuation characters, on
# "--" and on whitespace, keeping the delimiters via the capturing group.
result = re.split(pattern=r'([,.:;?_!"()\']|--|\s)', string=raw_text)

# Strip each piece and keep only the ones that are non-empty afterwards.
preprocessed = [token for token in map(str.strip, result) if token]
print(len(preprocessed))

102246


### 2) Converting tokens into token IDs

In [24]:
# Unique tokens of the corpus, in sorted order.
all_words = list(set(preprocessed))
all_words.sort()

# Number of distinct tokens.
vocab_size = len(all_words)
print(vocab_size)

21563


In [27]:
# Map every token to its position in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))

In [28]:
class SimpleTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed token-to-ID vocabulary.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? _ ! " ( ) '``. Every non-empty piece is looked up in the
    vocabulary; a piece that is missing from it raises ``KeyError``.
    """

    def __init__(self, vocab):
        """Store ``vocab`` (token -> ID) and build the reverse ID -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, token) for token, idx in vocab.items())

    def encode(self, text):
        """Return the list of vocabulary IDs for the tokens found in ``text``."""
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if token:  # skip empty strings and pure-whitespace separators
                ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Turn ``ids`` back into a string.

        Tokens are joined with single spaces, then any whitespace that
        directly precedes one of the split delimiters is removed.
        """
        tokens = [self.int_to_str[i] for i in ids]
        spaced = " ".join(tokens)
        return re.sub(r'\s+([,.:;?_!"()\']|--|\s)', r'\1', spaced)

In [29]:
# Instantiate the V1 tokenizer with the current `vocab` mapping.
tokenizer = SimpleTokenizerV1(vocab)

In [41]:
# Sample sentence passed to the tokenizer in the following cells.
text = """To jest przykład, który ma na celu pokazać działanie."""

In [43]:
# Encode the sample sentence; the bare `ids` expression displays the ID list.
ids = tokenizer.encode(text)
ids

[2389, 6400, 13312, 3, 7272, 7727, 8503, 3911, 11677, 5164, 4]

In [44]:
# Map the IDs back to a string.
tokenizer.decode(ids)

'To jest przykład, który ma na celu pokazać działanie.'

In [45]:
# Encode/decode round trip in a single expression.
tokenizer.decode(tokenizer.encode(text))

'To jest przykład, który ma na celu pokazać działanie.'

### 3) Adding special tokens

In [46]:
# Sorted unique corpus tokens followed by the two special tokens, which
# therefore receive the two highest IDs.
all_tokens = [*sorted(set(preprocessed)), "<|endoftext|>", "<|unk|>"]

# Rebuild the token -> ID mapping from the extended token list.
vocab = dict(zip(all_tokens, range(len(all_tokens))))
print(len(vocab))

21565


In [48]:
# Print the last five (token, id) pairs of the vocabulary. The enumerate()
# index was never used, so the slice is iterated directly.
for item in list(vocab.items())[-5:]:
    print(item)

('–wykrzykiwał', 21560)
('–zawołałem', 21561)
('––', 21562)
('<|endoftext|>', 21563)
('<|unk|>', 21564)


In [51]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Splitting follows the same rule in ``encode`` and ``decode``: whitespace,
    ``--`` and the punctuation characters ``, . : ; ? _ ! " ( ) '``. The
    vocabulary must contain a ``"<|unk|>"`` entry for unknown tokens to be
    encodable; otherwise the first unknown token raises ``KeyError``.
    """

    def __init__(self, vocab):
        """Store ``vocab`` (token -> ID) and build the reverse ID -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, token) for token, idx in vocab.items())

    def encode(self, text):
        """Return the IDs for the tokens in ``text``, using ``<|unk|>`` for unknown ones."""
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:  # empty string or pure-whitespace separator
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Turn ``ids`` back into a string.

        Tokens are joined with single spaces, then any whitespace that
        directly precedes one of the split delimiters is removed.
        """
        tokens = [self.int_to_str[i] for i in ids]
        spaced = " ".join(tokens)
        return re.sub(r'\s+([,.:;?_!"()\']|--|\s)', r'\1', spaced)

In [52]:
# Rebind `tokenizer` to the V2 implementation, built from the current `vocab`.
tokenizer = SimpleTokenizerV2(vocab)

In [53]:
# New sample sentence, used to exercise the unknown-token handling.
text = """To jest przykładowy tekst, z użyciem słów, których nie było w vocab"""

In [54]:
# Display the IDs for the sample sentence.
tokenizer.encode(text)

[2389,
 6400,
 21564,
 21564,
 3,
 19196,
 21564,
 16163,
 3,
 7273,
 9022,
 3818,
 17396,
 21564]

In [55]:
# Round trip through encode/decode to see which words came back as <|unk|>.
tokenizer.decode(tokenizer.encode(text))

'To jest <|unk|> <|unk|>, z <|unk|> słów, których nie było w <|unk|>'

### 4) Byte Pair Encoding (BPE)

In [58]:
import tiktoken

In [59]:
# Display the installed tiktoken version.
tiktoken.__version__

'0.9.0'

In [67]:
# Rebind `tokenizer` to tiktoken's "o200k_base" encoding.
tokenizer = tiktoken.get_encoding("o200k_base")

In [68]:
# Sample text that contains the literal <|endoftext|> marker.
text = """To jest przykładowy tekst, z użyciem słów. <|endoftext|> których nie było w vocab"""

In [69]:
# allowed_special names the special-token strings that may appear in the input.
# NOTE(review): presumably tiktoken rejects text containing <|endoftext|>
# unless it is listed here — confirm against the tiktoken documentation.
tokenizer.encode(text, allowed_special={'<|endoftext|>'})

[1385,
 12637,
 142014,
 1318,
 9272,
 88,
 38692,
 11,
 579,
 97187,
 183790,
 55488,
 9205,
 13,
 220,
 199999,
 98765,
 4725,
 78129,
 286,
 72627]