In [1]:
import tiktoken 

test_file = "./The_Verdict.txt"
with open(test_file, "r", encoding="utf-8") as f:
    raw_text = f.read()

tokenizer = tiktoken.get_encoding("gpt2")
encoded_txt = tokenizer.encode(raw_text)
print(len(encoded_txt))

5145


In [2]:
enc_sample = encoded_txt[50:]
#enc_sample

In [3]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f'x:{x}')
print(f'y:     {y}')

x:[290, 4920, 2241, 287]
y:     [4920, 2241, 287, 257]


In [4]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "---->", desired)
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired])) 
    

[290] ----> 4920
 and ---->  established
[290, 4920] ----> 2241
 and established ---->  himself
[290, 4920, 2241] ----> 287
 and established himself ---->  in
[290, 4920, 2241, 287] ----> 257
 and established himself in ---->  a


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        #Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the original book(raw text) into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i+max_length]
            target_chunk = token_ids[i+1: i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem(self, index):
        return self.input_ids[index], self.target_ids[index]



In [6]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size = batch_size,
        shuffle=shuffle,
        drop_last = drop_last,
        num_workers=num_workers
    )

    return dataloader



In [11]:
dataloader = create_dataloader_v1( raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

NotImplementedError: Subclasses of Dataset should implement __getitem__.