In [8]:
from torch.utils.data import Dataset, DataLoader
import torch
import tiktoken

In [15]:
##Create input target tensors
class GPTDatasetV1(Dataset):
    """Map-style dataset of (input, target) token windows for next-token prediction.

    The whole text is tokenized once in ``__init__`` and immediately cut into
    windows of ``max_length`` tokens using a sliding window. Each target window
    is its input window shifted one token to the right, so position ``k`` of the
    target holds the token that follows position ``k`` of the input.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a list of integer token ids (e.g. a tiktoken encoding).
        max_length: Number of tokens in every input/target window.
        stride: Number of token positions the window start advances between
            consecutive samples (``stride < max_length`` gives overlapping windows).

    Attributes:
        input_ids: List of 1-D tensors, one input window per sample.
        target_ids: List of 1-D tensors, the matching shifted windows.
    """

    def __init__(self,txt,tokenizer, max_length,stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text. "<|endoftext|>" is the special token that must be
        # explicitly allowed so text containing it is encoded instead of rejected;
        # the previous value '|eos|' is not a token and allowed nothing.
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Slide a window over the token stream. The upper bound len - max_length
        # guarantees that the target slice (shifted by one) is always full length.
        for i in range(0,len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i+max_length]
            target_chunk = token_ids[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    # A DataLoader needs a map-style dataset to implement __len__ and __getitem__:
    # it asks for indices in [0, len) and receives the pre-built pair for each index.

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self,idx):
        """Return the ``(input_ids, target_ids)`` tensor pair stored at ``idx``."""
        return self.input_ids[idx],self.target_ids[idx]

step 1: Initialize tokenizer

step 2: Create dataset

step 3: Create the dataloader. drop_last=True drops the final batch if it is shorter than the specified batch_size, which prevents loss spikes during training.

step 4: num_workers sets the number of CPU worker processes used to load the data (0 means everything is loaded in the main process).

In [None]:
## Feed the data into a DataLoader
# batch_size  - number of samples (input/target pairs) in one batch, i.e. how many sequences the model
#               processes before it updates its parameters; smaller batches need less memory and give
#               more frequent but noisier updates.
# max_length  - number of tokens (not words) in each input sequence; the model sees max_length tokens
#               and predicts the next one - LLMs like GPT-2 work the same way.
# stride      - how many token positions the window moves forward before the next input/target pair starts.
# num_workers - number of worker processes that load data in parallel; 0 loads everything in the main process.
# Why a DataLoader - it handles batching and shuffling and can load several batches in parallel.

def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Tokenize ``txt`` with the GPT-2 encoding and wrap it in a batching DataLoader.

    Args:
        txt: Raw text to turn into training samples.
        batch_size: Number of (input, target) samples per batch.
        max_length: Number of tokens in each sample window.
        stride: Token offset between the starts of consecutive windows.
        shuffle: Whether the DataLoader reshuffles the samples.
        drop_last: Whether the DataLoader drops a final, incomplete batch.
        num_workers: Number of loader subprocesses passed to the DataLoader.

    Returns:
        A ``DataLoader`` over a ``GPTDatasetV1`` built from ``txt``.
    """
    # Step 1: GPT-2 byte-pair encoding from tiktoken.
    gpt2_encoding = tiktoken.get_encoding("gpt2")

    # Step 2: sliding-window dataset of (input, target) token tensors.
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    # Steps 3-4: batching, shuffling and worker configuration.
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [13]:
from pypdf import PdfReader
reader = PdfReader("war-and-peace.pdf")

# Extract every page exactly once and join the pieces in a single pass.
# The earlier version ran the (slow) extraction over the whole PDF twice to
# build two identical strings with repeated `+=` concatenation.
raw_text = "".join(page.extract_text() for page in reader.pages)

# `text` used to be produced by the identical second pass; keep the name
# available as an alias of the same string.
text = raw_text

In [None]:
# torch is already imported in the first cell of the notebook.
print("Pytorch version:", torch.__version__)

# Tiny windows (4 tokens, stride 1, no shuffling) make the sliding window easy to read.
dataloader = create_dataloader_v1(
    raw_text,batch_size=4,max_length=4,stride=1,shuffle=False
)

data_iter = iter(dataloader)
# NOTE: `input` shadows the Python builtin; the name is kept because a later cell reads it.
input,target = next(data_iter)
# Print the complete target batch so it can be compared row by row with the
# input batch (each target row is its input row shifted by one token).
print(f"Input: {input}\n\n Target: {target}")

Pytorch version: 2.7.1+cu118
Input: tensor([[  220,   198, 13195,   290],
        [  198, 13195,   290, 12689],
        [13195,   290, 12689,   220],
        [  290, 12689,   220,   198]])

 Target: tensor([[  198, 13195,   290, 12689],
        [13195,   290, 12689,   220],
        [  290, 12689,   220,   198],
        [12689,   220,   198,  3123]])


In [19]:
# Fetch the next batch from the same iterator. The recorded output shows a list of two
# tensors; the first one starts one token after the last input row of the first batch,
# consistent with stride=1 and shuffle=False used when the dataloader was created.
second_batch = next(data_iter)
print(second_batch)

[tensor([[12689,   220,   198,  3123],
        [  220,   198,  3123,    78],
        [  198,  3123,    78, 20054],
        [ 3123,    78, 20054,   301]]), tensor([[  220,   198,  3123,    78],
        [  198,  3123,    78, 20054],
        [ 3123,    78, 20054,   301],
        [   78, 20054,   301,   726]])]


In [None]:
# Show the input batch next to the first token of every target row (column 0 of the batch).
print(f"Input: {input}\n\n Target: {list(target[:, 0])}")




Input: tensor([[  220,   198, 13195,   290],
        [  198, 13195,   290, 12689],
        [13195,   290, 12689,   220],
        [  290, 12689,   220,   198]])

 Target: [tensor(198), tensor(13195), tensor(290), tensor(12689)]
