In [1]:
# Store the book in a variable raw_text
with open("books_text/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
with open("books_text/sorcerers-stone.txt", "r", encoding="utf-8") as f:
    hp1_sorcerers_stone_text = f.read()
print(raw_text[:50])
print(hp1_sorcerers_stone_text[:50])

I HAD always thought Jack Gisburn rather a cheap g
THE BOY WHO LIVED

Mr. and Mrs. Dursley, of number


In [2]:
import re

# Use regex to get split by punctuation
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
# Get rid of the white spaces. White spaces are not needed for this use case
preprocessed = [item.strip() for item in preprocessed if item.strip()]

hp1_preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', hp1_sorcerers_stone_text)
hp1_preprocessed = [item.strip() for item in hp1_preprocessed if item.strip()]

print(preprocessed[:50])
print(hp1_preprocessed[:50])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself']
['THE', 'BOY', 'WHO', 'LIVED', 'Mr', '.', 'and', 'Mrs', '.', 'Dursley', ',', 'of', 'number', 'four', ',', 'Privet', 'Drive', ',', 'were', 'proud', 'to', 'say', 'that', 'they', 'were', 'perfectly', 'normal', ',', 'thank', 'you', 'very', 'much', '.', 'They', 'were', 'the', 'last', 'people', 'you’d', 'expect', 'to', 'be', 'involved', 'in', 'anything', 'strange', 'or', 'mysterious', ',', 'because']


In [3]:
# Create the Token IDs
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

1130


In [4]:
vocab = {token: integer for integer,token in enumerate(all_words)}
for token,i in vocab.items():
    print(f'ID is {i} and item is {token}')
    if i == 10:
        break


ID is 0 and item is !
ID is 1 and item is "
ID is 2 and item is '
ID is 3 and item is (
ID is 4 and item is )
ID is 5 and item is ,
ID is 6 and item is --
ID is 7 and item is .
ID is 8 and item is :
ID is 9 and item is ;
ID is 10 and item is ?


In [5]:
# Create Tokenizer class
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i: item for item,i in vocab.items()}
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
                                
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

# Test out the Tokenizer class, and the encode method
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)


[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [6]:
# Test out the decode method
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [7]:
# If tokens are not properly mapped, an error will be thrown since in this case, 
# Hello was not in the pretrained data
text = "Hello, do you like tea?"
print(tokenizer.encode(text))

KeyError: 'Hello'

In [8]:
# <|unk|> and <|endoftext|> can be used when,
# Use <|unk|> when tokenizer encounters an unknown word (isn't in vocab)
# Use <|endoftext|> to signifiy the end of a text 
# Ex. When we finish reading article 1, we put <|endoftext|>
# At the start of article 2, and so on
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

all_tokens_hp1 = sorted(list(set(hp1_preprocessed)))
all_tokens_hp1.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token: i for i,token in enumerate(all_tokens)}
vocab_hp1 = {token: i for i,token in enumerate(all_tokens_hp1)}

print(len(vocab))
print(len(vocab_hp1))

1132
7388


In [9]:
# As you can see, the two special tokens are present at the bottom
for i,item in enumerate(list(vocab.items())[-5:]):
    print(f'({item}, {i})')
print("--------")
for i,item in enumerate(list(vocab_hp1.items())[-5:]):
    print(f'({item}, {i})')

(('younger', 1127), 0)
(('your', 1128), 1)
(('yourself', 1129), 2)
(('<|endoftext|>', 1130), 3)
(('<|unk|>', 1131), 4)
--------
(('”', 7383), 0)
(('•k', 7384), 1)
(('■”', 7385), 2)
(('<|endoftext|>', 7386), 3)
(('<|unk|>', 7387), 4)


In [10]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i: token for token,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)           
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        # If token is not present in our current vocab, replace it by an unknown token
        preprocessed = [
            token if token in self.str_to_int 
            else "<|unk|>" for token in preprocessed
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

tokenizer = SimpleTokenizerV2(vocab)
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace"
# Append the <|endoftext|> at the end of text 1 and beginning
# of text 2
text = " <|endoftext|> ".join((text1,text2))
print(text)
        

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace


In [11]:
hp1_tokenizer = SimpleTokenizerV2(vocab_hp1)
text1 = "Percy Jackson and the Lightning Thief"
text2 = "Hello. It's me. I was wondering if after all..."

text_hp = " <|endoftext|> ".join((text1,text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace


In [12]:
# 1131 is the token ID of "<|unk|>" and 1130 is the 
# token ID of <|endoftext|>
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131]

In [13]:
hp1_tokenizer.encode(text_hp)

[849,
 7387,
 1380,
 6013,
 7387,
 1105,
 7386,
 7387,
 7,
 587,
 2,
 5058,
 4058,
 7,
 568,
 6452,
 6641,
 3604,
 1328,
 1354,
 7,
 7,
 7]

In [14]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>'

In [15]:
hp1_tokenizer.decode(hp1_tokenizer.encode(text_hp))

"Percy <|unk|> and the <|unk|> Thief <|endoftext|> <|unk|>. It' s me. I was wondering if after all..."

In [16]:
# From comparing the decoded tokenized text above with 
# The original input text, we know that the training set 
# did not contain the words "Hello" and "palace"

# Could use other tokens
# [BOS] (beginning of sequence) Marks the start of a text/sequence
# [EOS] (end of sequence): Positioned at the end of a text
# and is useful for concatenating multiple unrelated texts.
# [PAD] (padding): When training LLMs with a batch size larger
# than one, the batch might contain texts of varying lengths. To ensure that 
# all the texts have the smae length the shorter texts
# are padded using [PAD] token, up to length of the longest text in batch

# Tokenizer used for GPT models does not need any of these 
# tokens mentioned but only uses an <|endoftext|> for simplicity

# byte pair enconding tokenizer breaks down words 
# into subword units for tokenizer with GPT models

#### BYTE PAIR ENCODING

In [17]:
# Use tiktoken github
! pip3 install tiktoken



In [18]:
import importlib
import tiktoken

print(f'tiktoken version: {importlib.metadata.version("tiktoken")}')

tiktoken version: 0.11.0


In [19]:
# Instantiate BPE tokenizer from tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
hp1_tokenizer = tiktoken.get_encoding("gpt2")

In [20]:
# Usage of tokenizer is similar to SimpleTokenzizerV2
# We implemented previously via an encode method
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace"
)
encoded_mapping = tokenizer.encode(text, allowed_special = {"<|endoftext|>"})
print(encoded_mapping)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271]


In [21]:
# Can then convert token IDs back into text using the decode
# method. Similar to the SimpleTokenizerV2
# Tokenizer is able to encode words that look wrong, like 
# someunknownPlace

tokens = tokenizer.decode(encoded_mapping)
print(tokens)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace


In [22]:
# Two advantages so far.
# 1. Reduces the amount of tokens we have
# 2. Can deal with unknown words


### Can make two noteworthy observations based on the token IDs and decoded text above:

#### 1. <|endoftext|> token is assigned to relatively large token ID (50256)
#### 2. BPE tokenizer above encodes and decodes unknown words, can handle any unknown word.

#### Algorithm underlying BPE breaks down words that aren't in its predefined vocab into smaller subwords. 
#### This enables it to handle out-of-vocabulary words.
#### Because of BPE algorithm, if the tokenizer encounters an unfamilar word during tokenization, it can represent it as a sequence of subwords or characters

In [23]:
# Proof that it works, even with gibberish
encoded_mapping = tokenizer.encode("Akwirq ier")
print(encoded_mapping)

tokens = tokenizer.decode(encoded_mapping)
print(tokens)

[33901, 86, 343, 80, 220, 959]
Akwirq ier


In [24]:
# One more example with words that include apostrophes
text = "tames, breaks, lames, fames, dont's, can't, shouldn't, can't, won't, against, lust, must, rust, fuss, shan't"
encoded_mapping = tokenizer.encode(text)
print(encoded_mapping)

tokens = tokenizer.decode(encoded_mapping)
print(tokens)

[83, 1047, 11, 9457, 11, 300, 1047, 11, 277, 1047, 11, 17666, 338, 11, 460, 470, 11, 6584, 470, 11, 460, 470, 11, 1839, 470, 11, 1028, 11, 22279, 11, 1276, 11, 17000, 11, 34297, 11, 427, 272, 470]
tames, breaks, lames, fames, dont's, can't, shouldn't, can't, won't, against, lust, must, rust, fuss, shan't


### Creating Input-Target Pairs


#### Implement a data loader that fetches input-target pairs using sliding window approach
#### Tokenize the two stories first.

In [25]:
# Store the book in a variable raw_text
with open("books_text/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
with open("books_text/sorcerers-stone.txt", "r", encoding="utf-8") as f:
    hp1_sorcerers_stone_text = f.read()
enc_text_ver = tokenizer.encode(raw_text)
enc_text_hp1 = tokenizer.encode(hp1_sorcerers_stone_text)
print(f'The Verdict encoder text length (tokens) is {len(enc_text_ver)}')
print(f'Harry Potter Book 1 encoder text length (tokens) is {len(enc_text_hp1)}')

The Verdict encoder text length (tokens) is 5145
Harry Potter Book 1 encoder text length (tokens) is 133860


In [26]:
# x: [1,2,3,4], y: [2,3,4,5]
# x:input, y:output
# if [1] is the input, [2] should be output
# if [1,2] is input, [3] should be output
# if [1,2,3] is input, [4] should be output
# if [1,2,3,4] is input. [5] should be output
# Context size is how may words do we want to give as input
# for model to predict next word

context_size = 4 # length of input
# Model is trained to look at sequence of 4 words (or tokens)
# to predict next word in sequence.

# For The Verdict sample
x = enc_text_ver[:context_size]
y = enc_text_ver[1:context_size+1]

print(f'x: {x}')
print(f'y:     {y}')


x: [40, 367, 2885, 1464]
y:     [367, 2885, 1464, 1807]


In [27]:
context_size_hp1 = 20
x_hp = enc_text_hp1[:context_size_hp1]
y_hp = enc_text_hp1[1:context_size_hp1+1]
print(f'x_hp: {x_hp}')
print(f'y_hp:      {y_hp}')

x_hp: [10970, 16494, 56, 19494, 406, 3824, 1961, 198, 198, 5246, 13, 290, 9074, 13, 360, 1834, 1636, 11, 286, 1271]
y_hp:      [16494, 56, 19494, 406, 3824, 1961, 198, 198, 5246, 13, 290, 9074, 13, 360, 1834, 1636, 11, 286, 1271, 1440]


In [28]:
# Processing inputs along with targets, we can create 
# next word prediction task as follows (The Verdict):
for i in range(1, context_size+1):
    context = enc_text_ver[:i]
    desired = enc_text_ver[i]

    print(context, "---->", desired)


[40] ----> 367
[40, 367] ----> 2885
[40, 367, 2885] ----> 1464
[40, 367, 2885, 1464] ----> 1807


In [29]:
# Processing inputs along with targets, we can create 
# next word prediction task as follows (Harry Potter):
for i in range(1, context_size_hp1+1):
    context = enc_text_hp1[:i]
    desired = enc_text_hp1[i]

    print(context, "---->", desired)


[10970] ----> 16494
[10970, 16494] ----> 56
[10970, 16494, 56] ----> 19494
[10970, 16494, 56, 19494] ----> 406
[10970, 16494, 56, 19494, 406] ----> 3824
[10970, 16494, 56, 19494, 406, 3824] ----> 1961
[10970, 16494, 56, 19494, 406, 3824, 1961] ----> 198
[10970, 16494, 56, 19494, 406, 3824, 1961, 198] ----> 198
[10970, 16494, 56, 19494, 406, 3824, 1961, 198, 198] ----> 5246
[10970, 16494, 56, 19494, 406, 3824, 1961, 198, 198, 5246] ----> 13
[10970, 16494, 56, 19494, 406, 3824, 1961, 198, 198, 5246, 13] ----> 290
[10970, 16494, 56, 19494, 406, 3824, 1961, 198, 198, 5246, 13, 290] ----> 9074
[10970, 16494, 56, 19494, 406, 3824, 1961, 198, 198, 5246, 13, 290, 9074] ----> 13
[10970, 16494, 56, 19494, 406, 3824, 1961, 198, 198, 5246, 13, 290, 9074, 13] ----> 360
[10970, 16494, 56, 19494, 406, 3824, 1961, 198, 198, 5246, 13, 290, 9074, 13, 360] ----> 1834
[10970, 16494, 56, 19494, 406, 3824, 1961, 198, 198, 5246, 13, 290, 9074, 13, 360, 1834] ----> 1636
[10970, 16494, 56, 19494, 406, 3824, 19

In [30]:
# Repeat process but converting the IDs to tokens for Visualization 
for i in range(1, context_size+1):
    context = enc_text_ver[:i]
    desired = enc_text_ver[i]

    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))


I ---->  H
I H ----> AD
I HAD ---->  always
I HAD always ---->  thought


In [31]:
# For Harry Potter book 1:
for i in range(1, context_size_hp1+1):
    context = enc_text_hp1[:i]
    desired = enc_text_hp1[i]

    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))


THE ---->  BO
THE BO ----> Y
THE BOY ---->  WHO
THE BOY WHO ---->  L
THE BOY WHO L ----> IV
THE BOY WHO LIV ----> ED
THE BOY WHO LIVED ----> 

THE BOY WHO LIVED
 ----> 

THE BOY WHO LIVED

 ----> Mr
THE BOY WHO LIVED

Mr ----> .
THE BOY WHO LIVED

Mr. ---->  and
THE BOY WHO LIVED

Mr. and ---->  Mrs
THE BOY WHO LIVED

Mr. and Mrs ----> .
THE BOY WHO LIVED

Mr. and Mrs. ---->  D
THE BOY WHO LIVED

Mr. and Mrs. D ----> urs
THE BOY WHO LIVED

Mr. and Mrs. Durs ----> ley
THE BOY WHO LIVED

Mr. and Mrs. Dursley ----> ,
THE BOY WHO LIVED

Mr. and Mrs. Dursley, ---->  of
THE BOY WHO LIVED

Mr. and Mrs. Dursley, of ---->  number
THE BOY WHO LIVED

Mr. and Mrs. Dursley, of number ---->  four


#### Created input-target pairs that we can turn into use for LLM training


#### Implement an efficient data loader that iterates over input dataset and returns inputs and targets as PyTorch tensors (multidimensional arrays)

#### We are interested in returning two tensors: An input tensor containing the text that LLM sees and a target tensor that includes the LLM to predict

## Implementing a Data Loader
#### Use PyTorch in built Datasets and Dataloader

In [None]:
# Install PyTorch
!pip3 install torch

Collecting torch
  Downloading torch-2.8.0-cp312-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.8.0-cp312-none-macosx_11_0_arm64.whl (73.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: sympy, torch
  Attempting uninstall: sympy
    Found existing installation: sympy 1.12
    Uninstalling sympy-1.12:
      Successfully uninstalled sympy-1.12
Successfully installed sympy-1.14.0 torch-2.8.0


In [None]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize entire text
        token_ids = tokenizer.encode(txt, allowed_special = {"<|endoftext|>"})

        # Use sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids)-max_length, stride):
            input_chunk = token_ids[i:i+max_length]
            target_chunk = token_ids[i+1:i+1+max_length]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
    
    def __len__(self):
        return len(self.input_ids)

    # What DataLoader will be using
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

#### GPTDatasetV1 Describes how indivdiual rows are fetched from dataset.
#### Each row consists of a number of token IDs (based on max_length) assigned to an input_chunk tensor
#### target_chunk tensor contains corresponding targets


In [None]:
# Step 1: Initialize tokenizer
# Step 2: Create Dataset
# Step 3: drop_last = True drops the last batch if it is shorder
#         Than the specified batch_size to prevent 
#         loss spikes during training
# Step 4: Number of CPU processes to use for preprocessing

# Will help us doing parallel processing and can analyze 
# Multiple batch_sizes at a time
# Batch_size = number of batches model processes at once before
#              Updating parameters
# num_workers = for parallel processing on multiple threads
#               on CPU
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )
    return dataloader

#### Let's test dataloader with a batch size of 1 for an LLM with a context size of 4
#### Purpose: Develop intuition of how GPTDatasetV1 class and the create_dataloader_v1 function work together

In [34]:
# Store the book in a variable raw_text
with open("books_text/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
with open("books_text/sorcerers-stone.txt", "r", encoding="utf-8") as f:
    hp1_sorcerers_stone_text = f.read()

#### Convert dataloader into a Python iterator to fetch the next entry via Python's built-in next() function

In [35]:
import torch
print("PyTorch version:", torch.__version__)
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1,shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

PyTorch version: 2.8.0
[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [39]:
dataloader_hp1 = create_dataloader_v1(
    hp1_sorcerers_stone_text, batch_size=1, max_length=4, stride=1,shuffle=False
)

data_iter_hp1 = iter(dataloader_hp1)
first_batch_hp1 = next(data_iter_hp1)
print(first_batch_hp1)

[tensor([[10970, 16494,    56, 19494]]), tensor([[16494,    56, 19494,   406]])]


#### first_batch variable contains two tensors: the first tensor stores the input token IDs and the second tensor stores the output token IDs
#### Since max_length is set to 4, each of two tensors contain 4 token IDs

In [37]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [40]:
second_batch_hp1 = next(data_iter_hp1)
print(second_batch_hp1)

[tensor([[16494,    56, 19494,   406]]), tensor([[   56, 19494,   406,  3824]])]


#### NOTE: Batch_size is a trade_off and a hyperparameter to experiment with when training LLMs

In [None]:
# What happens when batch_size is more than 1?
dataloader = create_dataloader_v1(
    raw_text, 
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("Targets:\n", targets)
# Model will process this batch before making parameter updates

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [None]:
# What happens when batch_size is more than 1 for HP?
# When we try using parallel processers (# workers), Jupyter Notebooks throws an error, this is because,
# On macOS, multiprocessing with fork can cause errors when num_workers > 0, especially in Jupyter or VSCode notebooks. 
# PyTorch tries to spawn subprocesses to load data, but the way Python handles multiprocessing on macOS is different 
# (it defaults to spawn instead of fork), which can break if the dataset or tokenizer isn’t picklable.
dataloader_hp1 = create_dataloader_v1(
    hp1_sorcerers_stone_text, 
    batch_size=16,
    max_length=256,
    stride=256,
    shuffle=False)
data_iter_hp1 = iter(dataloader_hp1)
inputs_hp1, targets_hp1 = next(data_iter_hp1)
print("Inputs:\n", inputs_hp1)
print("Targets:\n", targets_hp1)

Inputs:
 tensor([[10970, 16494,    56,  ..., 27034,    13, 14179],
        [  373,  9074,    13,  ...,   198,   198, 14202],
        [  286,   606,  6810,  ...,    13,   360,  1834],
        ...,
        [  355,   627,  1428,  ..., 14412,   373, 47207],
        [   13,   679,   373,  ...,   290,   900,   572],
        [  866,   262,   198,  ..., 25031, 11130,   261]])
Targets:
 tensor([[16494,    56, 19494,  ...,    13, 14179,   373],
        [ 9074,    13,   360,  ...,   198, 14202,   286],
        [  606,  6810,   257,  ...,   360,  1834,  1636],
        ...,
        [  627,  1428,   618,  ...,   373, 47207,    13],
        [  679,   373,  8179,  ...,   900,   572,   866],
        [  262,   198, 25662,  ..., 11130,   261, 44906]])


#### Increase stride to 4 so we can use dataset fully (we dont skip a word), but also avoid any overlap between the batches because increased overlap can lead to increased overfitting (for The Verdict)

In [None]:
# Whole Input text:
# "In the heart of the city stood the old library, a relic from
#  a bygone era. Its stone walls bore the mars of time, and ivy
#  clung tightly to its facade..."

# People keep the stride length = to context length, so
# We dont miss any words

# Stride = 1: 
# Input of batch 1: "In the heart of"
# Input of batch 2: "the heart of the"

# Stride = 4:
# Input of batch 1: "In the heart of"
# Input of batch 2: "the city stood the"
