# Loading Data

In [2]:
import os
import urllib.request

In [3]:
if not os.path.exists("the-verdict.txt"):
    url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt"
    file_path = "the-verdict.txt"
    urllib.request.urlretrieve(url, file_path) 

In [4]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [5]:
raw_text

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [6]:
len(raw_text)

20479

In [7]:
import re

result = re.split(r'([,.;:?_!"()\']|--|\s)', raw_text)
result = [item for item in result if item.strip()]
preprocessed = result

In [11]:
len(preprocessed)

4690

In [40]:
unique_words = sorted(set(preprocessed))
unique_words.extend(["<|endoftext|>", "<|unk|>"])
vocab_size = len(unique_words)
print(vocab_size)

1132


In [41]:
vocab = {token:integer for integer, token in enumerate(unique_words)}
# vocab

In [42]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
        
    def encode(self,text):
        pre_processed = re.split(r'([,.;:?_!"()\']|--|\s)', text)
        pre_processed = [item for item in pre_processed if item.strip()]

        pre_processed = [item if item in self.str_to_int else "<|unk|>" for item in pre_processed]

        ids = [self.str_to_int[s] for s in pre_processed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.;:?_!"()\'])', r'\1', text)
        return text
        

In [43]:
tokenizer = SimpleTokenizerV1(vocab)
ids = tokenizer.encode("I am Heena")
# ids = tokenizer.encode("I HAD always thought Jack Gisburn rather a cheap genius")
ids

[53, 150, 1131]

In [44]:
text = tokenizer.decode(ids)
text

'I am <|unk|>'

In [45]:
import tiktoken

In [51]:
tokenizer = tiktoken.get_encoding("gpt2")
ids = tokenizer.encode("I am Heena")
ids

# text = tokenizer.decode(ids)
# text

[40, 716, 679, 8107]

In [54]:
ids = tokenizer.encode("I am Heena <|endoftext|> sdadvghsghd", allowed_special = {"<|endoftext|>"})
ids

[40, 716, 679, 8107, 220, 50256, 45647, 32225, 456, 82, 456, 67]

In [56]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

encoded_text = tokenizer.encode(raw_text)
len(encoded_text)

5145

In [59]:
encoded_sample = encoded_text[50:]
len(encoded_sample)

5095

In [66]:
context_size = 4
x = encoded_sample[:context_size]
y = encoded_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:       {y}")

x: [290, 4920, 2241, 287]
y:       [4920, 2241, 287, 257]


In [78]:
for i in range(1, context_size+1):
    context = encoded_sample[:i]
    desired = encoded_sample[i]
    print(context, "------>", desired)
    print(tokenizer.decode(context), "------->", tokenizer.decode([desired]))
    print("-------------------------------------\n")

[290] ------> 4920
 and ------->  established
-------------------------------------

[290, 4920] ------> 2241
 and established ------->  himself
-------------------------------------

[290, 4920, 2241] ------> 287
 and established himself ------->  in
-------------------------------------

[290, 4920, 2241, 287] ------> 257
 and established himself in ------->  a
-------------------------------------



In [79]:
import torch

In [80]:
torch.__version__

'2.8.0'

In [82]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special = {"<|endoftext|>"})

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
        

In [83]:
def create_dataloader_V1(txt, batch_size=2, max_length=4, stride=4,
                         shuffle=True, drop_last=True, num_workers=0):
    
    tokenizer = tiktoken.get_encoding("gpt2")

    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    dataloader=DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader
        

In [84]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [93]:
dataloader = create_dataloader_V1(raw_text, batch_size=8,
                                  max_length=4, stride=4,shuffle=False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n" , inputs)
print("Targets:\n", targets)
# second_batch = next(data_iter)
# print(second_batch)



Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [94]:
input_ids = torch.tensor([2, 3, 5, 1])

In [118]:
vocab_size = 6
embed_dim = 4

torch.manual_seed(45)
embedding_layer = torch.nn.Embedding(tokenizer.n_vocab, embed_dim)
embedding_layer.weight

Parameter containing:
tensor([[ 0.1371,  1.5252,  1.4718,  1.7016],
        [-0.2837,  0.1012,  0.6574, -0.3337],
        [-0.6286,  2.0443,  0.3665,  1.2984],
        ...,
        [ 0.8496,  1.2921, -0.2246, -1.3317],
        [-1.0666, -0.6355, -0.5139,  2.3446],
        [ 0.6524,  0.1242, -0.2163,  0.2439]], requires_grad=True)

In [121]:
embedding_layer(input_ids[3])

tensor([-0.2837,  0.1012,  0.6574, -0.3337], grad_fn=<EmbeddingBackward0>)

In [122]:
embedding_layer(input_ids)

tensor([[-0.6286,  2.0443,  0.3665,  1.2984],
        [ 0.1522,  1.2693, -1.6287, -0.0724],
        [-0.1322,  0.0068, -0.6861, -1.8957],
        [-0.2837,  0.1012,  0.6574, -0.3337]], grad_fn=<EmbeddingBackward0>)

In [123]:
vocab_size = 50257
embed_dim = 256

embedding_layer = torch.nn.Embedding(vocab_size, embed_dim)

In [128]:
max_length=4
dataloader = create_dataloader_V1(raw_text, batch_size=8,
                                  max_length=max_length, stride=4,shuffle=False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n" , inputs)
print("Targets:\n", targets)
print(inputs.shape)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
torch.Size([8, 4])


In [129]:
input_embeddings = embedding_layer(inputs)
print(input_embeddings.shape)
input_embeddings

torch.Size([8, 4, 256])


tensor([[[-1.0974, -0.1790,  0.4086,  ..., -0.4036,  0.8255, -0.1127],
         [-0.0074, -1.3646, -0.2895,  ...,  0.3538,  2.3053,  1.1255],
         [ 0.3359, -1.6044, -1.1187,  ..., -1.7974, -1.0150,  0.1089],
         [ 1.1360, -0.4833,  1.2469,  ...,  0.4166, -1.1166,  0.4299]],

        [[ 0.9458, -1.4868,  0.0047,  ...,  1.1199, -1.7761, -0.7141],
         [ 0.4991,  0.7764, -0.5569,  ...,  0.1477, -0.3736,  0.3258],
         [-1.3073, -0.7453,  0.1764,  ..., -2.0238,  1.8035,  0.1329],
         [ 0.6771,  1.4481,  0.1107,  ..., -0.0698, -0.6614, -0.8492]],

        [[-1.5925, -0.0193, -0.9250,  ...,  0.2599,  1.4961, -0.2522],
         [-0.5835, -1.0900,  0.2053,  ...,  0.3877, -0.6090,  0.2715],
         [ 0.0589, -1.1527,  1.5295,  ..., -2.1484, -0.7515, -0.1383],
         [ 1.1173,  0.9033,  2.1241,  ...,  1.7240,  0.2325, -0.2705]],

        ...,

        [[-0.1882,  0.1692,  1.6405,  ...,  0.6591, -0.6606,  0.5364],
         [ 0.2712, -0.4886,  0.5323,  ..., -0.1536, -0.48

In [135]:
context_length = max_length
positional_embedding_layer = torch.nn.Embedding(context_length, embed_dim)

In [133]:
torch.arange(context_length)

tensor([0, 1, 2, 3])

In [137]:
positional_embedding_layer.weight

Parameter containing:
tensor([[-1.9337, -0.6522,  0.8583,  ...,  2.2515, -1.5347, -1.3875],
        [ 0.1568, -0.1282,  1.1525,  ..., -0.5930, -0.6430,  0.0718],
        [-0.4449,  0.8573, -1.9526,  ..., -0.3890,  0.1812, -0.2504],
        [ 0.5548,  1.1977, -0.5784,  ...,  0.7824,  0.4554, -0.1360]],
       requires_grad=True)

In [139]:
pos_embeddings = positional_embedding_layer(torch.arange(context_length))
pos_embeddings.shape

torch.Size([4, 256])

In [140]:
input_embeddings.shape

torch.Size([8, 4, 256])

In [142]:
input_embeddings[0] + pos_embeddings

tensor([[-3.0311, -0.8311,  1.2668,  ...,  1.8479, -0.7092, -1.5002],
        [ 0.1493, -1.4928,  0.8630,  ..., -0.2392,  1.6623,  1.1973],
        [-0.1090, -0.7471, -3.0712,  ..., -2.1865, -0.8338, -0.1415],
        [ 1.6908,  0.7144,  0.6685,  ...,  1.1990, -0.6612,  0.2938]],
       grad_fn=<AddBackward0>)

In [146]:
input_embeddings = input_embeddings + pos_embeddings
input_embeddings

tensor([[[-6.8985, -2.1354,  2.9833,  ...,  6.3509, -3.7787, -4.2751],
         [ 0.4629, -1.7493,  3.1680,  ..., -1.4252,  0.3762,  1.3410],
         [-0.9989,  0.9675, -6.9763,  ..., -2.9645, -0.4713, -0.6424],
         [ 2.8003,  3.1098, -0.4882,  ...,  2.7637,  0.2495,  0.0218]],

        [[-4.8553, -3.4432,  2.5795,  ...,  7.8744, -6.3803, -4.8765],
         [ 0.9695,  0.3917,  2.9006,  ..., -1.6312, -2.3027,  0.5413],
         [-2.6421,  1.8266, -5.6813,  ..., -3.1909,  2.3472, -0.6184],
         [ 2.3414,  5.0412, -1.6245,  ...,  2.2774,  0.7048, -1.2573]],

        [[-7.3936, -1.9757,  1.6498,  ...,  7.0143, -3.1081, -4.4146],
         [-0.1131, -1.4747,  3.6628,  ..., -1.3912, -2.5381,  0.4870],
         [-1.2760,  1.4191, -4.3282,  ..., -3.3155, -0.2078, -0.8897],
         [ 2.7816,  4.4964,  0.3890,  ...,  4.0711,  1.5986, -0.6786]],

        ...,

        [[-5.9893, -1.7872,  4.2153,  ...,  7.4135, -5.2648, -3.6260],
         [ 0.7416, -0.8733,  3.9898,  ..., -1.9325, -2.41

In [147]:
input_embeddings.shape

torch.Size([8, 4, 256])