## Chapter 2: Working with Text Data

### 1) Tokenization

In [1]:
import os 


# Read the corpus explicitly as UTF-8: without an encoding argument open()
# falls back to the platform default, which is not UTF-8 on every system and
# would garble or reject the Polish diacritics in this text.
with open("../data_brzechwa.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

In [2]:
import re

# Split on single whitespace characters; the capturing group keeps each
# separator in the resulting list.
text = "Hello world. This is a test"
whitespace = re.compile(r'(\s)')
result = whitespace.split(text)

print(result)

['Hello', ' ', 'world.', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test']


In [3]:
# Also split on commas and periods so punctuation becomes its own token.
# Adjacent separators produce empty strings in the output.
separators = re.compile(r'([,.]|\s)')
result = separators.split(text)
print(result)

['Hello', ' ', 'world', '.', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test']


In [4]:
# Discard the single-space and empty-string entries left over from the split.
result = list(filter(lambda piece: piece not in [' ', ''], result))
print(result)

['Hello', 'world', '.', 'This', 'is', 'a', 'test']


In [5]:
# NOTE(review): this sample sentence is not used in this cell; the split below
# runs on the whole corpus (`raw_text`).
text = "Hello, world. is this-- a test?"

# Split on punctuation, "--" and whitespace, then strip every piece and keep
# only the non-empty ones.
result = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = list(filter(None, map(str.strip, result)))
print(len(preprocessed))

102357


### 2) Converting tokens into token IDs

In [6]:
# Unique tokens in sorted order; the list length is the vocabulary size.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
print(vocab_size)

21627


In [7]:
# Map every token to its position in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))

In [8]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on whitespace, ``--`` and a fixed set of punctuation
    characters. A token that is missing from the vocabulary makes
    ``encode`` raise ``KeyError``.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and build the inverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, token) for token, idx in vocab.items())

    def encode(self, text):
        """Return the vocabulary ids of the tokens found in ``text``."""
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Strip each piece and drop the ones that end up empty.
        tokens = filter(None, map(str.strip, pieces))
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Turn ``ids`` back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        punctuation (and in front of ``--``) is removed.
        """
        words = (self.int_to_str[i] for i in ids)
        spaced = " ".join(words)
        return re.sub(r'\s+([,.:;?_!"()\']|--|\s)', r'\1', spaced)

In [9]:
# Build the V1 tokenizer from the corpus vocabulary (token -> id).
tokenizer = SimpleTokenizerV1(vocab)

In [10]:
# Sample sentence used for the encode/decode round trip below.
text = """To jest przykład, który ma na celu pokazać działanie."""

In [11]:
# Encode the sample; SimpleTokenizerV1.encode raises KeyError for any token
# that is not in the vocabulary.
ids = tokenizer.encode(text)
ids

[2444, 6460, 13372, 3, 7332, 7787, 8563, 3971, 11737, 5224, 4]

In [12]:
# Map the ids back to text.
tokenizer.decode(ids)

'To jest przykład, który ma na celu pokazać działanie.'

In [13]:
# Round trip in a single expression: encode the sample, then decode it again.
tokenizer.decode(tokenizer.encode(text))

'To jest przykład, który ma na celu pokazać działanie.'

### 3) Adding special tokens

In [14]:
# Sorted unique corpus tokens, followed by the two special tokens so that
# they receive the highest ids.
all_tokens = sorted(set(preprocessed))
all_tokens += ["<|endoftext|>", "<|unk|>"]

vocab = dict(zip(all_tokens, range(len(all_tokens))))
print(len(vocab))

21629


In [15]:
# Show the last five vocabulary entries; the two special tokens were appended
# last, so they appear at the end. (The enumerate index was never used.)
for item in list(vocab.items())[-5:]:
    print(item)

('––', 21624)
('–––––––––––––––––––––––––––––––––––––––––––', 21625)
('––––––––––––––––––––––––––––––––––––––––––––––––', 21626)
('<|endoftext|>', 21627)
('<|unk|>', 21628)


In [16]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps unknown tokens to ``"<|unk|>"``.

    Splitting and decoding follow the same regex rules as the V1 tokenizer;
    the only difference is that a token absent from the vocabulary is
    replaced by ``"<|unk|>"`` before the id lookup.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and build the inverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, token) for token, idx in vocab.items())

    def encode(self, text):
        """Return the vocabulary ids for ``text``, using ``"<|unk|>"`` for unknown tokens."""
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        ids = []
        for piece in pieces:
            token = piece.strip()
            if not token:
                # Whitespace-only and empty pieces carry no token.
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Turn ``ids`` back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        punctuation (and in front of ``--``) is removed.
        """
        words = (self.int_to_str[i] for i in ids)
        spaced = " ".join(words)
        return re.sub(r'\s+([,.:;?_!"()\']|--|\s)', r'\1', spaced)

In [17]:
# Rebind `tokenizer` to the V2 tokenizer, built from the vocabulary that
# includes "<|endoftext|>" and "<|unk|>".
tokenizer = SimpleTokenizerV2(vocab)

In [18]:
# Sample sentence for the V2 tokenizer's unknown-token handling.
text = """To jest przykładowy tekst, z użyciem słów, których nie było w vocab"""

In [19]:
# Tokens missing from the vocabulary are encoded as the id of "<|unk|>".
tokenizer.encode(text)

[2444,
 6460,
 21628,
 21628,
 3,
 19256,
 21628,
 16223,
 3,
 7333,
 9082,
 3878,
 17456,
 21628]

In [20]:
# The round trip is lossy: every unknown token comes back as the literal "<|unk|>".
tokenizer.decode(tokenizer.encode(text))

'To jest <|unk|> <|unk|>, z <|unk|> słów, których nie było w <|unk|>'

### 4) Byte Pair Encoding (BPE)

In [21]:
import tiktoken

In [22]:
# Show the installed tiktoken version.
tiktoken.__version__

'0.9.0'

In [23]:
# Rebind `tokenizer` to tiktoken's "o200k_base" encoding; it replaces the
# SimpleTokenizerV2 instance bound to this name earlier.
tokenizer = tiktoken.get_encoding("o200k_base")

In [24]:
# Sample text with an embedded "<|endoftext|>" marker.
text = """To jest przykładowy tekst, z użyciem słów. <|endoftext|> których nie było w vocab"""

In [25]:
# Encode with "<|endoftext|>" explicitly listed in allowed_special.
tokenizer.encode(text, allowed_special={'<|endoftext|>'})

[1385,
 12637,
 142014,
 1318,
 9272,
 88,
 38692,
 11,
 579,
 97187,
 183790,
 55488,
 9205,
 13,
 220,
 199999,
 98765,
 4725,
 78129,
 286,
 72627]

### 5) Data sampling with a sliding window

In [26]:
# Re-read the corpus as UTF-8 so decoding does not depend on the platform's
# default encoding, then tokenize it with the current `tokenizer`.
with open("../data_brzechwa.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

203693


In [27]:
# Work on the first 50 token ids only.
context_size = 4
enc_sample = enc_text[:50]

# x is a window of `context_size` ids; y is the same window moved one id to
# the right.
x, y = enc_sample[:context_size], enc_sample[1:context_size + 1]

print(f"x: {x}")
print(f"y:      {y}")

x: [198, 135923, 658, 1400]
y:      [135923, 658, 1400, 42212]


In [28]:
# For growing prefixes of the sample, print the decoded prefix and the decoded
# token that follows it.
for i in range(1, context_size + 1):
    context, target = enc_sample[:i], enc_sample[i:i + 1]
    seen = tokenizer.decode(context)
    upcoming = tokenizer.decode(target)
    print(seen, "------>", upcoming)


 ------> PAN

PAN ------>  K

PAN K ------> LE

PAN KLE ------> KS


In [29]:
import torch 

In [30]:
# Show the installed torch version.
torch.__version__

'2.6.0+cu124'

In [31]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    Args:
        txt: Text to tokenize.
        tokenizer: Object whose ``encode(txt, allowed_special=...)`` returns a
            sliceable sequence of token ids.
        max_length: Number of token ids in every input and target window.
        stride: Distance, in tokens, between the starts of consecutive windows.

    Each target window is the matching input window moved one token to the
    right. Window starts stop early enough that every target is full length.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Exclusive upper bound on window starts.
        start_limit = len(token_ids) - max_length
        for start in range(0, start_limit, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [33]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=8):
    """Build a DataLoader over sliding-window samples of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``.

    Args:
        txt: Text to tokenize.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length handed to ``GPTDatasetV1``.
        stride: Step between window starts handed to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)
    return DataLoader(windows, **loader_options)

In [34]:
# Load the corpus with an explicit UTF-8 encoding; rebinds `raw_text`.
with open('../data_brzechwa.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

In [36]:
# Length of the corpus string in characters.
len(raw_text)

576515

In [38]:
# batch_size=1, max_length=4, stride=1 and no shuffling: every batch holds one
# 4-token input window with its target, and consecutive windows start one
# token apart.
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[ 198,   47, 1565,  509]]), tensor([[  47, 1565,  509, 2538]])]


In [39]:
# Pull the next batch from the same iterator; with stride=1 its window starts
# one token after the first batch's window.
second_batch = next(data_iter)
print(second_batch)

[tensor([[  47, 1565,  509, 2538]]), tensor([[ 1565,   509,  2538, 27015]])]


In [41]:
# Same window settings, now with four windows per batch; the batch is unpacked
# into its input and target tensors.
dataloader = create_dataloader_v1(raw_text, batch_size=4, max_length=4, stride=1, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print(f"Inputs: {inputs}")
print(f"Targets: {targets}")

Inputs: tensor([[  198,    47,  1565,   509],
        [   47,  1565,   509,  2538],
        [ 1565,   509,  2538, 27015],
        [  509,  2538, 27015,   198]])
Targets: tensor([[   47,  1565,   509,  2538],
        [ 1565,   509,  2538, 27015],
        [  509,  2538, 27015,   198],
        [ 2538, 27015,   198, 26691]])
