In [1]:
import tiktoken

In [2]:
# Read the whole text file into memory as a single string.
with open("Datasets/the-verdict.txt", encoding = "utf-8") as book_file:
    book_text = book_file.read()

# Tokenize the full text with the o200k_base encoding (GPT-4o) and
# report how many token ids it produced.
tokenizer = tiktoken.get_encoding("o200k_base")
enc_text = tokenizer.encode(book_text)
print(len(enc_text))

4836


In [3]:
# Drop the first 50 token ids; the input/target examples in the following
# cells are built from the remaining tokens.
enc_sample = enc_text[50:]

In [4]:
# Number of tokens in one input window.
context_size = 4

# The target window is the input window shifted one token to the right,
# so y[k] is the token that follows x[k] in the text.
x, y = enc_sample[:context_size], enc_sample[1:context_size+1]

print(f"x : {x}")
print(f"y : {y}")

x : [11166, 306, 261, 38350]
y : [306, 261, 38350, 402]


In [5]:
# Input-target pairs: for each prefix length i, the context is the first i
# token ids and the target is the single token id that comes right after it.
# (The prefix is named `context` rather than `input` so the builtin input()
# is not shadowed for the rest of the kernel session.)

for i in range(1, context_size+1):
    context = enc_sample[:i]
    target = enc_sample[i]

    print(context, "---->", target)

[11166] ----> 306
[11166, 306] ----> 261
[11166, 306, 261] ----> 38350
[11166, 306, 261, 38350] ----> 402


In [6]:
# Decoded input-target pairs: the same prefixes and next tokens as token ids,
# turned back into text so the next-token relationship is readable.
# (The prefix is named `context` rather than `input` so the builtin input()
# is not shadowed for the rest of the kernel session.)

for i in range(1, context_size+1):
    context = enc_sample[:i]
    target = enc_sample[i]

    # decode() is given a list of ids, so the single target id is wrapped in a list.
    print(tokenizer.decode(context), "---->", tokenizer.decode([target]))

 himself ---->  in
 himself in ---->  a
 himself in a ---->  villa
 himself in a villa ---->  on


<h2>Implementing via Data Loader</h2>

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. A window of ``max_length`` tokens is then
    moved across the token ids in steps of ``stride``; each sample pairs the
    window with the same window shifted one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and precompute every input/target window.

        Args:
            txt: Text to tokenize.
            tokenizer: Object whose ``encode(txt, allowed_special=...)``
                returns a list of token ids.
            max_length: Number of tokens in each input and target window.
            stride: Number of tokens between the starts of consecutive windows.
        """
        token_ids = tokenizer.encode(txt, allowed_special = {"<|endoftext|>"})

        # Stop early enough that the shifted target window is always full length.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start : start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1 : start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of precomputed windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [9]:
def create_dataloader_v1(txt, batch_size = 4, max_length = 256,
                         stride = 128, shuffle = True, drop_last = True,
                         num_workers = 0, encoding_name = "o200k_base"):
    """Build a DataLoader of sliding-window (input, target) batches from raw text.

    Args:
        txt: Text to tokenize and split into windows.
        batch_size: Passed to ``DataLoader`` as ``batch_size``.
        max_length: Number of tokens per window, passed to ``GPTDatasetV1``.
        stride: Step between window starts, passed to ``GPTDatasetV1``.
        shuffle: Passed to ``DataLoader`` as ``shuffle``.
        drop_last: Passed to ``DataLoader`` as ``drop_last``.
        num_workers: Passed to ``DataLoader`` as ``num_workers``. Author's
            note: raising this above 0 errors while ``GPTDatasetV1`` is
            defined in the notebook; move the class to a .py file first.
        encoding_name: Name of the tiktoken encoding used to tokenize
            ``txt``. Defaults to "o200k_base", the value that was
            previously hard-coded.

    Returns:
        A ``DataLoader`` over a ``GPTDatasetV1`` built from ``txt``.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    return DataLoader(
        dataset,
        batch_size = batch_size,
        shuffle = shuffle,
        drop_last = drop_last,
        num_workers = num_workers,
    )

In [10]:
# Read the text file again so the data loader cells below start from the raw string.
with open("Datasets/the-verdict.txt", encoding = "utf-8") as book_file:
    book_text = book_file.read()

In [11]:
# One 4-token window per batch, advancing one token at a time and unshuffled,
# so consecutive batches can be compared directly.
dataloader = create_dataloader_v1(
    book_text,
    batch_size = 1,
    max_length = 4,
    stride = 1,
    shuffle = False,
)

# Step through the loader by hand to inspect individual batches.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[    40, 148954,   3324,   4525]]), tensor([[148954,   3324,   4525,  10874]])]


In [12]:
# Pull the next batch from the same iterator; the loader above was created
# with stride = 1, so this window starts one token after the first one.
second_batch = next(data_iter)
print(second_batch)

[tensor([[148954,   3324,   4525,  10874]]), tensor([[  3324,   4525,  10874, 165003]])]


In [13]:
# Eight windows per batch; stride equals max_length, so the input windows
# sit back to back without overlapping.
dataloader = create_dataloader_v1(
    book_text,
    batch_size = 8,
    max_length = 4,
    stride = 4,
    shuffle = False,
)

# Each batch unpacks into an inputs tensor and a targets tensor.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Inputs:\n", inputs)
print("Targets:\n", targets)

Inputs:
 tensor([[    40, 148954,   3324,   4525],
        [ 10874, 165003,  33750,   7542],
        [   261,  12424,  59245,    375],
        [  6460,    261,   1899,  19807],
        [  4951,    375,    786,    480],
        [   673,    860,   2212,  19005],
        [   316,    668,    316,   9598],
        [   484,     11,    306,    290]])
Targets:
 tensor([[148954,   3324,   4525,  10874],
        [165003,  33750,   7542,    261],
        [ 12424,  59245,    375,   6460],
        [   261,   1899,  19807,   4951],
        [   375,    786,    480,    673],
        [   860,   2212,  19005,    316],
        [   668,    316,   9598,    484],
        [    11,    306,    290,   4679]])
