**For an efficient data loader implementation, we will use PyTorch's built-in Dataset and DataLoader classes**

Step1: Tokenize the entire text

Step2: Use a sliding window to chunk the book into overlapping sequences of max length

Step3: Return the total number of rows in the dataset (i.e. the number of input/target chunks)

Step4: Return single row from dataset

In [6]:


import torch
import re
import tiktoken
from torch.utils.data import Dataset, DataLoader


"""
The GPTDatasetV1 class (listing 2.5) is based on the PyTorch Dataset class.
It defines how individual rows are fetched from the dataset.
Each row consists of a number of token IDs (based on max length) assigned to an input_chunk tensor.
The target_chunk tensor contains the corresponding targets (the input shifted by one token).
"""

In [None]:

class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-ID chunks.

  The whole text is tokenized once. A window of ``context_window_size``
  tokens is then moved over the token IDs in steps of ``stride``. For each
  window position the input chunk is the window itself and the target chunk
  is the same window shifted one token to the right (next-token prediction).

  Args:
    tokenizer: Object with an ``encode(text, allowed_special=...)`` method
      returning a sequence of integer token IDs (e.g. a tiktoken encoding).
    text: Raw text to tokenize.
    context_window_size: Number of token IDs per input/target chunk (> 0).
    stride: Step between the starts of consecutive windows (> 0). A stride
      smaller than ``context_window_size`` produces overlapping inputs.

  Raises:
    ValueError: If ``context_window_size`` or ``stride`` is not positive.

  Note:
    If the text has no more than ``context_window_size`` tokens, the dataset
    is empty, because no window has a full-length target.
  """

  def __init__(self, tokenizer, text, context_window_size, stride):
    super().__init__()

    # Fail early with a clear message instead of silently building an empty
    # dataset (negative stride) or zero-length chunks (non-positive window).
    if context_window_size <= 0:
      raise ValueError(
          f"context_window_size must be positive, got {context_window_size}")
    if stride <= 0:
      raise ValueError(f"stride must be positive, got {stride}")

    self.input_ids = []
    self.target_ids = []

    # Tokenize the entire text.
    token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    print(f"Number of token ids : {len(token_ids)}")

    # Next-token prediction on "Every journey start with one step":
    #   Every               -> journey
    #   Every journey       -> start
    #   Every journey start -> with

    # Use a sliding window to chunk the text into sequences of
    # context_window_size tokens. Stopping at len - context_window_size
    # guarantees that every target chunk (shifted by one) is full length.
    for start in range(0, len(token_ids) - context_window_size, stride):
      stop = start + context_window_size
      input_chunk = token_ids[start:stop]
      target_chunk = token_ids[start + 1:stop + 1]
      self.input_ids.append(torch.tensor(input_chunk, dtype=torch.long))
      self.target_ids.append(torch.tensor(target_chunk, dtype=torch.long))

  def __len__(self):
    """Return the number of (input, target) chunk pairs."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the ``(input_chunk, target_chunk)`` tensors at position ``idx``."""
    return self.input_ids[idx], self.target_ids[idx]


In [None]:
# Read the whole corpus into memory as a single string.
with open("./fiction_stories.txt", encoding="utf8") as f:
    text = f.read()

# Earlier regex-based word splitter, kept for reference:
# preprocessed = re.split(r"[.,:;!?()\"']|--|\s", text)
# preprocessed = [item.strip() for item in preprocessed if item.strip()]

tokenizer = tiktoken.get_encoding("gpt2")

# NOTE: despite its name, `dataloader` here is a GPTDatasetV1 (a Dataset),
# not a torch DataLoader.
dataloader = GPTDatasetV1(tokenizer, text, context_window_size=4, stride=1)

# Both prints below show the same value: the number of (input, target) rows.
# With stride 1 that is len(token_ids) - context_window_size; for a general
# stride it is that difference divided by the stride, rounded up.
sample_count = len(dataloader)
print(f"shape of each inputs id: {sample_count}")
print(f"length of dataloader's input id: {sample_count}")

Number of token ids : 394637
shape of each inputs id: 394633
length of dataloader's input id: 394633



The following code will use GPTDatasetV1 to load inputs in batches via the PyTorch DataLoader.

Step1: Initialize the tokenizer

Step2: Create dataset

Step3: Set drop_last = True to drop the last batch if it is shorter than the specified batch size, which prevents loss spikes during training

Step4: Set the number of CPU processes to use for preprocessing

In [9]:
# """
# params :
#      # num_worker = 0, number of subprocesses to use for data loading
#      # pin_memory = False, if True, the data loader will copy tensors into CUDA pinned memory before returning them
#      # prefetch_factor = 2, number of batches to prefetch
#      # drop_last = False, if True, drop the last incomplete batch
#      # batch size = 4, number of samples processed at once before adjusting the weights of the LLM

# """

In [10]:

def create_dataloader_v1(text, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_worker = 0):
  """Build a DataLoader over sliding-window token chunks of ``text``.

  Args:
    text: Raw text to tokenize with the GPT-2 tiktoken encoding.
    batch_size: Number of (input, target) rows per batch.
    max_length: Number of token IDs per row (the context window size).
    stride: Step between the starts of consecutive windows.
    shuffle: Passed through to ``DataLoader(shuffle=...)``.
    drop_last: Passed through to ``DataLoader(drop_last=...)``.
    num_worker: Passed through to ``DataLoader(num_workers=...)``.

  Returns:
    A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``text``.
  """
  gpt2_encoding = tiktoken.get_encoding("gpt2")
  windowed_dataset = GPTDatasetV1(gpt2_encoding, text, max_length, stride)

  loader_options = {
      "batch_size": batch_size,
      "shuffle": shuffle,
      "drop_last": drop_last,
      "num_workers": num_worker,
  }
  return DataLoader(windowed_dataset, **loader_options)

In [11]:

# Let's test the dataloader with a batch size of 1 for an LLM with a context size of 4.
# This will develop an intuition of how the GPTDatasetV1 class and the create_dataloader_v1 function work together.

# with open("/content/drive/MyDrive/Colab Notebooks/fiction_stories.txt", "r", encoding= "utf8") as f:
#   text = f.read()

dataloader = create_dataloader_v1(text, batch_size=1, max_length = 4, stride=1, shuffle=False)

data_iter = iter(dataloader)

# Each batch contains two tensors: the first stores the input token IDs, the second stores the target token IDs.
# Since max_length is set to 4, each of the two tensors contains 4 token IDs.
# With stride=1 and shuffle=False, the second batch is the first one shifted by a single token.

first_batch = next(data_iter)
print(first_batch)

second_batch = next(data_iter)
print(second_batch)


# Same text, now with batch_size=8 and stride=4. The stride equals max_length, so consecutive input rows do not overlap.
dataloader = create_dataloader_v1(text, batch_size=8, max_length = 4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, target = next(data_iter)


print("inputs: \n", inputs)
print("targets: \n", target)


Number of token ids : 394637
[tensor([[14126,   314,   198,   198]]), tensor([[ 314,  198,  198, 1268]])]
[tensor([[ 314,  198,  198, 1268]]), tensor([[ 198,  198, 1268, 7655]])]
Number of token ids : 394637
inputs: 
 tensor([[14126,   314,   198,   198],
        [ 1268,  7655, 20739,  9370],
        [41119,  1921,   376,  7730],
        [   38,  5357,   350,  1921],
        [ 5188, 30709, 12425, 15859],
        [ 8905,    51,   412, 16219],
        [25401,    11,   198, 10970],
        [16329,  7054, 32337,  5781]])
targets: 
 tensor([[  314,   198,   198,  1268],
        [ 7655, 20739,  9370, 41119],
        [ 1921,   376,  7730,    38],
        [ 5357,   350,  1921,  5188],
        [30709, 12425, 15859,  8905],
        [   51,   412, 16219, 25401],
        [   11,   198, 10970, 16329],
        [ 7054, 32337,  5781,    11]])



"""
    Note that we increased the stride to 4, which equals the context size. This utilizes the data set fully
    (we don't skip a single word) while avoiding any overlap between rows, since more overlap could lead to increased overfitting.
"""