In [2]:
import os
import urllib.request

# Raw GitHub URL of the sample text used by the rest of this notebook.
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")
file_path = "the-verdict.txt"

# Download only when the file is not already on disk, so re-running this cell
# (or the whole notebook) neither repeats the request nor needs network access.
if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x7f35f44dbd90>)

In [7]:
import tiktoken
# Load tiktoken's "gpt2" encoding; later cells reuse this `tokenizer` object.
tokenizer = tiktoken.get_encoding("gpt2")

# Read with an explicit encoding so the decoded text (and therefore the token
# ids) does not depend on the platform's default locale encoding.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Encode the whole text once and show a small sample as a sanity check.
tokens = tokenizer.encode(text)
print(f"Number of tokens: {len(tokens)}")
print(f"First 10 tokens: {tokens[:10]}")
print(f"Decoded first 10 tokens: {tokenizer.decode(tokens[:10])}")

Number of tokens: 5145
First 10 tokens: [40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138]
Decoded first 10 tokens: I HAD always thought Jack Gisburn rather


In [11]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDataSet(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once with ``tokenizer.encode``. Windows of
    ``max_length`` token ids are then taken every ``stride`` tokens, and each
    target window is its input window shifted one token to the right.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object whose ``encode(text)`` returns a sequence of token ids.
        max_length: Number of tokens per window; must be >= 1.
        stride: Offset between the starts of consecutive windows; must be >= 1.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.

    Note:
        A text with ``max_length`` tokens or fewer yields an empty dataset,
        because every input window needs one extra token for its target.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        # Reject sizes that would otherwise silently produce empty or
        # malformed windows (e.g. negative slice bounds, empty range).
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(text)
        # The range stops at len - max_length (exclusive) so that the shifted
        # target window [start + 1, start + max_length + 1) is always complete.
        for start in range(0, len(token_ids) - max_length, stride):
            end = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:end]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:end + 1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]
    

def create_dataloader(text, tokenizer=None, max_length=128, stride=64, batch_size=32, drop_last=True, shuffle=True, num_workers=0):
    """Build a ``DataLoader`` over sliding-window samples of ``text``.

    Args:
        text: Raw text to tokenize and window.
        tokenizer: Object with an ``encode(text)`` method. When ``None``
            (the default), tiktoken's "gpt2" encoding is used.
        max_length: Number of tokens per sample window.
        stride: Offset between the starts of consecutive windows.
        batch_size: Number of samples per batch.
        drop_last: Passed through to ``DataLoader``.
        shuffle: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDataSet`` built from ``text``.
    """
    # Resolve the default at call time rather than in the signature: a default
    # expression in the signature runs once, when the function is defined, so
    # merely defining this function would load the encoding (and fail if that
    # load fails).
    if tokenizer is None:
        tokenizer = tiktoken.get_encoding("gpt2")

    dataset = GPTDataSet(text, tokenizer, max_length, stride)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

# Unshuffled loader with the default window settings, 8 samples per batch.
dataloader = create_dataloader(text, batch_size=8, shuffle=False, num_workers=2)

# Preview only batches 0 and 1: pairing the loader with range(2) ends the loop
# after the second batch without requesting a third one.
for batch_idx, (input_ids, target_ids) in zip(range(2), dataloader):
    print(f"Batch {batch_idx}:")
    print(f"Input IDs shape: {input_ids.shape}")
    print(f"Target IDs shape: {target_ids.shape}")
    display(input_ids)
    display(target_ids)
    

Batch 0:
Input IDs shape: torch.Size([8, 128])
Target IDs shape: torch.Size([8, 128])


tensor([[   40,   367,  2885,  ...,  3758,   262,  1988],
        [  314,  2138,  1807,  ...,   340,   373,   407],
        [  286,   616,  4286,  ...,   198,  5779, 28112],
        ...,
        [ 8673,    13,  1002,  ...,   262,  1781,   286],
        [ 4150,     8,  3688,  ...,  9074,    13,   402],
        [  257,  1178,  2745,  ...,   561,   423,   587]])

tensor([[  367,  2885,  1464,  ...,   262,  1988,   286],
        [ 2138,  1807,   340,  ...,   373,   407,   691],
        [  616,  4286,   705,  ...,  5779, 28112, 10197],
        ...,
        [   13,  1002,   340,  ...,  1781,   286,   257],
        [    8,  3688,   284,  ...,    13,   402,   271],
        [ 1178,  2745,     6,  ...,   423,   587,  1327]])

Batch 1:
Input IDs shape: torch.Size([8, 128])
Target IDs shape: torch.Size([8, 128])


tensor([[  271, 10899,   550,  ..., 14093,   656,   465],
        [ 1327,   284,  5879,  ...,   616,   835,   284],
        [ 1021,   757,   438,  ..., 29543,  2745,    11],
        ...,
        [  550,   587, 11191,  ...,  9343,   683,    11],
        [  423,  4750,   326,  ...,   691, 12226,   318],
        [  351,   281,  5585,  ...,  2087,   329,   616]])

tensor([[10899,   550,   366,  ...,   656,   465,  1021],
        [  284,  5879,   326,  ...,   835,   284, 22489],
        [  757,   438, 10919,  ...,  2745,    11,   314],
        ...,
        [  587, 11191,   416,  ...,   683,    11,   351],
        [ 4750,   326,  9074,  ..., 12226,   318,   284],
        [  281,  5585,   286,  ...,   329,   616, 35957]])

In [13]:
# Non-overlapping 4-token windows (stride == max_length), in original order.
dataloader = create_dataloader(text, batch_size=8, shuffle=False, stride=4, max_length=4)
data_iter = iter(dataloader)
input_ids, target_ids = next(data_iter)  # first batch only

print("Input IDs:")
display(input_ids)
print("Target IDs:")
display(target_ids)

# Decode every sample in the batch back to text. Iterating over the rows of
# the batch keeps this loop in step with whatever batch_size is chosen above,
# instead of repeating the number here.
for i, (input_row, target_row) in enumerate(zip(input_ids, target_ids)):
    print(f"Sample {i}:")
    print("Input: ", tokenizer.decode(input_row.tolist()))
    print("Target:", tokenizer.decode(target_row.tolist()))

Input IDs:


tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Target IDs:


tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])

Sample 0:
Input:  I HAD always
Target:  HAD always thought
Sample 1:
Input:   thought Jack Gis
Target:  Jack Gisburn
Sample 2:
Input:  burn rather a cheap
Target:  rather a cheap genius
Sample 3:
Input:   genius--though a
Target: --though a good
Sample 4:
Input:   good fellow enough--
Target:  fellow enough--so
Sample 5:
Input:  so it was no
Target:  it was no great
Sample 6:
Input:   great surprise to me
Target:  surprise to me to
Sample 7:
Input:   to hear that,
Target:  hear that, in
