In [2]:
import importlib
import tiktoken
# Shared tokenizer for the "gpt2" encoding; the cells below use it to encode text
# into token IDs and to decode token IDs back into text.
tokenizer = tiktoken.get_encoding("gpt2")

In [3]:
# Load the whole book into memory as a single string (file is read as UTF-8).
with open(r"Game_of_thrones_books/001ssb.txt", encoding="utf-8") as book_file:
    raw_text = book_file.read()

In [4]:
# Tokenize the full text into a sequence of token IDs and report how many there are.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

433157


In [6]:
# Keep only the first 50 token IDs as a small sample for the walkthrough cells below.
enc_sample=enc_text[:50]


In [7]:
# Number of tokens in one input window.
context_size = 4
# With context_size = 4 the model sees at most 4 tokens and predicts the token that follows.
# The target window is the input window shifted one position to the right:
# for input token positions [1, 2, 3, 4] the targets are positions [2, 3, 4, 5].

x, y = enc_sample[:context_size], enc_sample[1:context_size + 1]

print(f"x: {x}")
print(f"y:      {y}")

x: [32, 3776, 3226, 20902]
y:      [3776, 3226, 20902, 220]


In [11]:
# Expand one window into its individual next-token prediction tasks:
# every prefix of length i is a context, and the token at position i is its target.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(context, "---->", desired)

[32] ----> 3776
[32, 3776] ----> 3226
[32, 3776, 3226] ----> 20902
[32, 3776, 3226, 20902] ----> 220


In [12]:
# Same prefix -> next-token pairs as above, decoded back to text for readability.
# `desired` is a single ID, so it is wrapped in a list before decoding.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    context_text = tokenizer.decode(context)
    desired_text = tokenizer.decode([desired])
    print(context_text, "---->", desired_text)

A ---->  Game
A Game ---->  Of
A Game Of ---->  Thrones
A Game Of Thrones ---->  


In [15]:
## Implementing the data loader

In [21]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs for next-token prediction.

    The text is tokenized once, then cut into windows of ``max_length`` tokens.
    Each target window is its input window shifted one token to the right, so
    ``target[k]`` is the token that follows ``input[k]`` in the text.

    Args:
        txt: The raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token IDs.
        max_length: Number of tokens in every input/target window (context
            length). Must be at least 1.
        stride: Number of tokens the window start advances between consecutive
            samples. Must be at least 1; a stride smaller than ``max_length``
            produces overlapping windows.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.

    Note:
        If the text has ``max_length`` tokens or fewer, no full window plus its
        shifted target fits and the dataset is empty.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject values that would otherwise silently build an empty or
        # degenerate dataset (negative stride -> no samples, max_length 0 ->
        # empty inputs paired with one-token targets).
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length!r}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride!r}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text once.
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Slide a window over the token stream. The range stops at
        # len(token_ids) - max_length so the shifted target window
        # (start + 1 .. start + max_length) always stays inside the data.
        for start in range(0, len(token_ids) - max_length, stride):
            stop = start + max_length
            input_chunk = token_ids[start:stop]
            target_chunk = token_ids[start + 1:stop + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair stored at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [22]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, 
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from raw text.

    The text is tokenized with the "gpt2" tiktoken encoding, wrapped in a
    ``GPTDatasetV1``, and served through a ``torch.utils.data.DataLoader``.

    Args:
        txt: Raw text to tokenize and window.
        batch_size: Number of windows stacked into one batch.
        max_length: Number of tokens per window (context length).
        stride: Number of tokens the window advances between samples.
        shuffle: Passed to the DataLoader; whether sample order is reshuffled.
        drop_last: Passed to the DataLoader; whether a final batch smaller than
            ``batch_size`` is discarded.
        num_workers: Passed to the DataLoader; 0 means batches are loaded in
            the main process.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [45]:
import torch
print("PyTorch version:", torch.__version__)

# One window per batch, sliding one token at a time, in document order (no shuffling).
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)

# Start a fresh pass over the loader and pull its first (inputs, targets) batch.
data_iter = iter(dataloader)
first_batch = next(data_iter)

PyTorch version: 2.9.1+cpu


In [46]:
# Unpack the batch and decode row 0 of the inputs and of the targets back to text.
x, y = first_batch
xin, yin = (tokenizer.decode(ids[0].tolist()) for ids in (x, y))
print(xin, '----->>> ', yin)

A Game Of Thrones ----->>>   Game Of Thrones 


In [53]:
# Advance the iterator by one batch and decode it.
# NOTE: every run of this cell consumes another batch from `data_iter`, so the
# batch shown depends on how many times the cell has been executed; despite its
# name, `forth_batch` is simply "the next batch".
forth_batch = next(data_iter)
x, y = forth_batch
xin, yin = (tokenizer.decode(ids[0].tolist()) for ids in (x, y))
print(xin, '----->>> ', yin)

 One of A Song ----->>>   of A Song of


In [54]:
# Rebuild the loader with 10 windows per batch and stride == max_length,
# so consecutive windows do not overlap.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=10,
    max_length=4,
    stride=4,
    shuffle=False,
)


In [55]:
# Start a fresh pass over the current `dataloader` and take its first batch.
data_iter = iter(dataloader)
first_batch = next(data_iter)

# Rebind x and y to this batch. Without this, the cells that display `x` and `y`
# would still show tensors left over from an earlier cell when the notebook is
# run top to bottom on a fresh kernel.
x, y = first_batch

# Decode row 0 of the inputs and targets as a quick sanity check.
xin = tokenizer.decode(x[0].tolist())
yin = tokenizer.decode(y[0].tolist())
print(xin, '----->>> ', yin)


A Game Of Thrones ----->>>   Game Of Thrones 


In [56]:
# Display the input token-ID tensor currently bound to `x` (one row per sequence in the batch).
x

tensor([[   32,  3776,  3226, 20902],
        [  220,   198, 10482,  1881],
        [  286,   317, 10940,   286],
        [ 6663,   290,  3764,   220],
        [  198,  3886,  4502,   371],
        [   13,   371,    13,  5780],
        [  220,   198,  4805, 33462],
        [ 8924,   220,   198,     1],
        [ 1135,   815,   923,   736],
        [  553,   402,  1144, 11643]])

In [57]:
# Display the target token-ID tensor currently bound to `y`; each row is the matching
# input row shifted one token to the right.
y

tensor([[ 3776,  3226, 20902,   220],
        [  198, 10482,  1881,   286],
        [  317, 10940,   286,  6663],
        [  290,  3764,   220,   198],
        [ 3886,  4502,   371,    13],
        [  371,    13,  5780,   220],
        [  198,  4805, 33462,  8924],
        [  220,   198,     1,  1135],
        [  815,   923,   736,   553],
        [  402,  1144, 11643,   355]])