In this section, we implement a data loader that fetches the input-target pairs using the sliding window approach.
First, we tokenize the text of the-verdict.txt using the GPT-2 BPE tokenizer.

In [2]:
import tiktoken

In [3]:
# Load tiktoken's 'gpt2' encoding; this tokenizer object is reused by later cells
# to encode the text and to decode token IDs back into strings.
tokenizer = tiktoken.get_encoding('gpt2')

In [4]:
# Read the whole text file into memory as a single string.
with open('the-verdict.txt','r',encoding = 'utf-8') as f:
    raw_text = f.read()

# Encode the text into token IDs and print how many tokens were produced.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))
    

5145


In [6]:
# Drop the first 50 token IDs and use the remainder for the examples that follow.
enc_sample = enc_text[50:]

In [7]:
# context_size determines how many tokens are included in the input.
context_size = 4

# x is the first context_size tokens; y is the same window shifted one token
# to the right, so y[k] is the token that comes right after x[k].
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

# The extra spaces after 'y:' push the second list to the right so the
# one-token shift is visible when the two lines are printed.
print(f'x: {x}')
print(f'y:      {y}')

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [8]:
# Pairing each input prefix with the token that follows it (i.e. the inputs
# shifted by one position) gives the next-word prediction tasks shown below:

for i in range(1, context_size+1):
    context = enc_sample[:i]  # the first i token IDs
    desired = enc_sample[i]  # the token ID that comes right after that prefix
    
    print(context,'---->',desired)


[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [14]:
# Build the same prefix / next-token pairs, but print them decoded back to text.
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    # context is already a list of IDs; the single target ID is wrapped in a
    # list so that it is passed to decode() in the same form.
    print(tokenizer.decode(context),'--->', tokenizer.decode([desired]))

 and --->  established
 and established --->  himself
 and established himself --->  in
 and established himself in --->  a


In [16]:
#Implementing a Data Loader

In [20]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once. A window of ``max_length`` token IDs is then
    moved across the token sequence in steps of ``stride``. Each window is
    stored as an input tensor, and the same window shifted one token to the
    right is stored as the matching target tensor.

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sliceable sequence of integer token IDs (for example a
            tiktoken encoding).
        max_length: Number of tokens in every input and target window.
            Must be >= 1.
        stride: Number of tokens between the starts of consecutive windows.
            Must be >= 1.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.

    Note:
        A text that tokenizes to ``max_length`` tokens or fewer yields an
        empty dataset, because every window needs one extra token for its
        shifted target.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject window settings that would otherwise fail silently:
        # max_length < 1 produces empty or misaligned slices, and a negative
        # stride produces no windows at all.
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special = {'<|endoftext|>'})

        # Slide a window of max_length tokens across the text. Stopping before
        # len(token_ids) - max_length guarantees that the target slice, which
        # reaches one token further, is always full length.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i+1 : i+max_length +1 ]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair stored at ``idx``."""
        return self.input_ids[idx],self.target_ids[idx]

In [24]:
#Create DataLoader

def create_dataloader_V1(txt, batch_size = 4, max_length = 256 ,stride = 128, shuffle = True, drop_last = True, num_workers = 0):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    The text is tokenized with tiktoken's 'gpt2' encoding and wrapped in a
    GPTDatasetV1, which is then handed to a torch DataLoader.

    Args:
        txt: Text to turn into training windows.
        batch_size: Number of windows per batch (forwarded to DataLoader).
        max_length: Number of tokens per window (forwarded to GPTDatasetV1).
        stride: Step between window starts (forwarded to GPTDatasetV1).
        shuffle: Forwarded to DataLoader.
        drop_last: Forwarded to DataLoader.
        num_workers: Forwarded to DataLoader.

    Returns:
        The DataLoader that serves the dataset's (input, target) pairs.
    """
    # Sliding-window dataset over the text, tokenized with the 'gpt2' encoding
    windowed_dataset = GPTDatasetV1(txt, tiktoken.get_encoding('gpt2'), max_length, stride)

    # Batching options are passed through to DataLoader unchanged
    loader_options = {
        'batch_size': batch_size,
        'shuffle': shuffle,
        'drop_last': drop_last,
        'num_workers': num_workers,
    }
    return DataLoader(windowed_dataset, **loader_options)

In [26]:
#Let's test the dataloader with a batch size of 1 and a context size of 4;
# this helps build an intuition for how the GPTDatasetV1 class and the create_dataloader_V1 function work together:

In [34]:
# Load the full contents of 'the-verdict.txt' into raw_text
# (the same file that was read earlier in the notebook).
with open('the-verdict.txt', encoding = 'utf-8') as f:
    raw_text = f.read() 

In [36]:
# Convert the dataloader into a Python iterator so that entries can be fetched
# one at a time with the built-in next() function.
# batch_size = 1 with max_length = 4 yields one 4-token window per batch;
# stride = 1 starts each window one token after the previous one, and
# shuffle = False turns off shuffling of the windows.

dataloader = create_dataloader_V1(raw_text, batch_size =1, max_length = 4, stride = 1, shuffle = False)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)


[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [38]:
# Fetch the next batch from the same iterator; because the loader was built
# with stride = 1, its window starts one token after the first batch's window.
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [42]:
# stride equal to max_length (4) means consecutive input windows do not overlap;
# batch_size = 8 stacks eight windows into each batch.
dataloader = create_dataloader_V1(raw_text, batch_size =8, max_length = 4, stride = 4, shuffle = False)

data_iter = iter(dataloader)
# Each batch unpacks into the input tensor and the target tensor.
inputs, targets = next(data_iter)

print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [46]:
# stride (8) larger than max_length (4): each window starts 8 tokens after the
# previous one, so the 4 tokens between consecutive input windows never appear
# as inputs. batch_size = 16 stacks sixteen windows into each batch.
dataloader = create_dataloader_V1(raw_text, batch_size =16, max_length = 4, stride = 8, shuffle = False)

data_iter = iter(dataloader)
# Each batch unpacks into the input tensor and the target tensor.
inputs, targets = next(data_iter)

print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [10899,  2138,   257,  7026],
        [  922,  5891,  1576,   438],
        [ 1049,  5975,   284,   502],
        [  287,   262,  6001,   286],
        [  550,  5710,   465, 12036],
        [27075,    11,   290,  4920],
        [   64,   319,   262, 34686],
        [  314,  2138,  1807,   340],
        [  393, 28537,  2014,   198],
        [  286,   465, 13476,     1],
        [  262,  1466,  1444,   340],
        [ 9074,    13, 46606,   536],
        [ 4842,  1650,   353,   438],
        [48422,   540,   450,    67],
        [ 1781,   340,   338,  1016]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 2138,   257,  7026, 15632],
        [ 5891,  1576,   438,   568],
        [ 5975,   284,   502,   284],
        [  262,  6001,   286,   465],
        [ 5710,   465, 12036,    11],
        [   11,   290,  4920,  2241],
        [  319,   262, 34686, 41976],
        [ 2138,  1807,   340,   561],
        [28537,  2014,   198,