<b><u>Exercise 2.1</u></b>: Byte pair encoding of unknown words

Try the BPE tokenizer from tiktoken on the unknown words "Akwirw ier" and print the individual token IDs. Then, call the decode function on each of the resulting integers in this list to reproduce the mapping. Lastly, call the decode method on the token IDs to check whether it can reconstruct the original input.

In [1]:
import tiktoken

# Load tiktoken's "gpt2" encoding; later cells use it to encode and decode text.
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
# Made-up words, used to see how the tokenizer splits text into token IDs.
phrase = "Akwirw ier"
# "<|endoftext|>" is permitted as a special token; the phrase itself does not contain it.
integers = tokenizer.encode(phrase, allowed_special={"<|endoftext|>"})
print(integers) 
# Output: [33901, 86, 343, 86, 220, 959]

[33901, 86, 343, 86, 220, 959]


In [6]:
# Decode every token ID on its own to show which piece of text it maps to.
for i in integers:
    mapping = tokenizer.decode([i])  # decode is given a one-element list holding just this ID
    print(i, ":", mapping)

33901 : Ak
86 : w
343 : ir
86 : w
220 :  
959 : ier


In [None]:
tokenizer.decode(integers)  # decoding all IDs together reproduces the original phrase exactly

'Akwirw ier'

<b><u>Exercise 2.2</u></b>: Dataloaders with different strides and context lengths

To develop more of an intuition for how the dataloader works, try to run it with different settings such as max_length = 2 and stride = 2, and max_length = 8 and stride = 2. 

In [8]:
# Read the whole text file into a single string, decoding it as UTF-8.
with open("../the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensor pairs.

    The text is tokenized once. A window of ``max_length`` token IDs is then
    moved across the result in steps of ``stride``. For each window position
    the input is the window itself and the target is the same window shifted
    one token to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object providing ``encode(text, allowed_special=...)`` that
            returns a sequence of token IDs.
        max_length: Number of token IDs in each input and target sequence.
        stride: Distance, in tokens, between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Tokenize the entire text up front.
        encoded = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # A window starting at s needs tokens s .. s + max_length (one extra
        # for the shifted target), so starts stop before len - max_length.
        starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[s:s + max_length]) for s in starts
        ]
        self.target_ids = [
            torch.tensor(encoded[s + 1:s + max_length + 1]) for s in starts
        ]

    def __len__(self):
        """Return the number of (input, target) rows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input, target) tensor pair stored at row ``idx``."""
        inputs = self.input_ids[idx]
        targets = self.target_ids[idx]
        return inputs, targets

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [10]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over sliding-window token sequences of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``, which is then handed to a ``DataLoader``.

    Args:
        txt: Text to tokenize and chunk.
        batch_size: Number of dataset rows per batch.
        max_length: Number of token IDs per input/target sequence.
        stride: Step, in tokens, between the starts of consecutive sequences.
        shuffle: Passed to ``DataLoader`` as ``shuffle``.
        drop_last: Passed to ``DataLoader`` as ``drop_last``; when True, a
            final batch smaller than ``batch_size`` is dropped.
        num_workers: Passed to ``DataLoader`` as ``num_workers``.

    Returns:
        The configured ``DataLoader``.
    """
    # Chunk the tokenized text into (input, target) windows.
    windows = GPTDatasetV1(
        txt, tiktoken.get_encoding("gpt2"), max_length, stride
    )

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [None]:
# Baseline settings, as done in class.
# max_length=4: every input/target sequence holds 4 token IDs.
# stride=1: consecutive sequences start 1 token apart, so they overlap in 3 tokens.
# batch_size=1 with shuffle=False: each batch is a single sequence, taken in order.
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
second_batch = next(data_iter)
print("First batch:", first_batch)
print("Second batch:", second_batch)

First batch: [tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
Second batch: [tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [None]:
# max_length=2: every input/target sequence holds 2 token IDs (one sequence per batch here).
# stride=2: consecutive sequences start 2 tokens apart, so the inputs do not overlap.
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=2, stride=2, shuffle=False
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
second_batch = next(data_iter)
print("First batch:", first_batch)
print("Second batch:", second_batch)

First batch: [tensor([[ 40, 367]]), tensor([[ 367, 2885]])]
Second batch: [tensor([[2885, 1464]]), tensor([[1464, 1807]])]


In [None]:
# max_length=8: every input/target sequence holds 8 token IDs (one sequence per batch here).
# stride=2: consecutive sequences start 2 tokens apart, so neighbouring inputs share 6 tokens.
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=8, stride=2, shuffle=False
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
second_batch = next(data_iter)
print("First batch:", first_batch)
print("Second batch:", second_batch)

First batch: [tensor([[  40,  367, 2885, 1464, 1807, 3619,  402,  271]]), tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899]])]
Second batch: [tensor([[ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138]]), tensor([[ 1464,  1807,  3619,   402,   271, 10899,  2138,   257]])]


In [None]:
# max_length=4: every input/target sequence holds 4 token IDs (one sequence per batch here).
# stride=4: consecutive sequences start 4 tokens apart, so the inputs do not overlap.
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=4, shuffle=False
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
second_batch = next(data_iter)
print("First batch:", first_batch)
print("Second batch:", second_batch)

First batch: [tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
Second batch: [tensor([[1807, 3619,  402,  271]]), tensor([[ 3619,   402,   271, 10899]])]
