# Chapter 2: Working With Text Data

# 2.2 Tokenizing text

In [1]:
import os
import urllib.request

# Download the short story used throughout this chapter unless a local copy
# already exists. The URL and the path are bound unconditionally so both names
# exist whether or not the download actually runs.
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/"
    "refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt"
)
file_path = "the-verdict.txt"

if not os.path.exists(file_path):
  urllib.request.urlretrieve(url, file_path)

In [3]:
with open("the-verdict.txt","r",encoding="utf-8") as f:
  raw_text = f.read()

In [4]:
raw_text

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [5]:
len(raw_text)

20479

In [9]:
import re

text = "Hellow, world. This. is a test."
result = re.split(r'\s',text)

print(result)

['Hellow,', 'world.', 'This.', 'is', 'a', 'test.']


In [10]:
result = re.split(r'([,.]|\s)',text)

print(result)

['Hellow', ',', '', ' ', 'world', '.', '', ' ', 'This', '.', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [13]:
result = [item for item in result if item.strip()]
print(result)

['Hellow', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [17]:
# Tokenize the whole story. The capture group makes re.split keep the
# delimiters (punctuation, "--", whitespace) as separate items; the filter then
# drops the empty and whitespace-only pieces.
result = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
result = [item for item in result if item.strip()]
preprocessed = result


In [18]:
len(preprocessed)

4690

In [19]:
preprocessed[:10]

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius']

# 2.3 Converting tokens into token IDs

In [24]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)


1130


`set(preprocessed)`: converts the `preprocessed` list into a set, an unordered collection of unique elements, which removes duplicate tokens.
`sorted(...)`: returns the unique tokens as a list in ascending string order (by Unicode code point), so punctuation comes first, then capitalized words, then lowercase words.

In [28]:
# Map every unique token to an integer ID. enumerate yields (index, token),
# so the IDs follow the sorted order of all_words.
vocab = {token: token_id for token_id, token in enumerate(all_words)}
vocab

{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '--': 6,
 '.': 7,
 ':': 8,
 ';': 9,
 '?': 10,
 'A': 11,
 'Ah': 12,
 'Among': 13,
 'And': 14,
 'Are': 15,
 'Arrt': 16,
 'As': 17,
 'At': 18,
 'Be': 19,
 'Begin': 20,
 'Burlington': 21,
 'But': 22,
 'By': 23,
 'Carlo': 24,
 'Chicago': 25,
 'Claude': 26,
 'Come': 27,
 'Croft': 28,
 'Destroyed': 29,
 'Devonshire': 30,
 'Don': 31,
 'Dubarry': 32,
 'Emperors': 33,
 'Florence': 34,
 'For': 35,
 'Gallery': 36,
 'Gideon': 37,
 'Gisburn': 38,
 'Gisburns': 39,
 'Grafton': 40,
 'Greek': 41,
 'Grindle': 42,
 'Grindles': 43,
 'HAD': 44,
 'Had': 45,
 'Hang': 46,
 'Has': 47,
 'He': 48,
 'Her': 49,
 'Hermia': 50,
 'His': 51,
 'How': 52,
 'I': 53,
 'If': 54,
 'In': 55,
 'It': 56,
 'Jack': 57,
 'Jove': 58,
 'Just': 59,
 'Lord': 60,
 'Made': 61,
 'Miss': 62,
 'Money': 63,
 'Monte': 64,
 'Moon-dancers': 65,
 'Mr': 66,
 'Mrs': 67,
 'My': 68,
 'Never': 69,
 'No': 70,
 'Now': 71,
 'Nutley': 72,
 'Of': 73,
 'Oh': 74,
 'On': 75,
 'Once': 76,
 'Only': 77,
 '

In [41]:
class SimpleTokenizerV1:
    """Regex-based word/punctuation tokenizer backed by a fixed vocabulary.

    Text is split on punctuation, double dashes and whitespace, and every
    resulting piece is looked up in the vocabulary. Pieces that are not in the
    vocabulary are encoded as the ``'<unk>'`` token.
    """

    def __init__(self, vocab):
        """Build the string-to-ID and ID-to-string lookup tables.

        Args:
            vocab: Mapping from token string to integer ID. The mapping is
                copied, so the caller's object is never modified.
        """
        # Work on a copy: adding '<unk>' below must not leak into the caller's dict.
        self.str_to_int = dict(vocab)
        self.unknown_token = '<unk>'
        if self.unknown_token not in self.str_to_int:
            # Next free ID. Equals len(vocab) for the usual 0..n-1 numbering,
            # but cannot collide with an existing ID when numbering differs.
            self.str_to_int[self.unknown_token] = (
                max(self.str_to_int.values(), default=-1) + 1
            )
        self.int_to_str = {i: s for s, i in self.str_to_int.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Args:
            text: The string to tokenize.

        Returns:
            One ID per non-empty piece; pieces missing from the vocabulary get
            the ID of ``'<unk>'``.
        """
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        unknown_id = self.str_to_int[self.unknown_token]
        return [self.str_to_int.get(token, unknown_id) for token in tokens]

    def decode(self, ids):
        """Convert token IDs back into a string.

        Tokens are joined with single spaces, then whitespace directly in
        front of a punctuation character is removed. The original spacing of
        the encoded text is therefore not reproduced exactly.

        Args:
            ids: Iterable of token IDs.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: If an ID is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # Remove the space that the join put in front of punctuation.
        return re.sub(r'\s+([,.:;?_!"()\'])', r'\1', text)

In [42]:
tokenizer = SimpleTokenizerV1(vocab)

In [43]:
text = """"It's the last he painted,you know,"
          Mrs. Gisburn said with pardonable pride."""

In [44]:
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [36]:
text = tokenizer.decode(ids)
print(text)

" It ' s the last he painted , you know , " Mrs . Gisburn said with pardonable pride .


# 2.4 Adding Special context tokens

In [37]:
text = "Hello,do you like tea. is this-- a test?"

tokenizer.encode(text)

KeyError: 'Hello'

The error KeyError: 'Hello' occurs because the word "Hello" is not present in the vocab dictionary that was created from "the-verdict.txt". This means the tokenizer doesn't know how to convert "Hello" into an integer ID.

To fix this, we need to modify the encode method in the SimpleTokenizerV1 class to handle words that are not in the vocabulary. A common way to do this is to add an "unknown" token to the vocabulary and use its ID for any word not found during encoding.

In [38]:
# Rebuild the vocabulary with two special tokens appended after the sorted
# unique tokens, so they receive the two highest IDs.
all_tokens = [*sorted(set(preprocessed)), "<|endoftext|>", "<|unknown|>"]

vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [39]:
len(vocab)

1132

In [40]:
# Show the last five (token, ID) pairs of the vocabulary.
for item in list(vocab.items())[-5:]:
  print(item)


('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unknown|>', 1131)


In [49]:
class SimpleTokenizerV2:
    """Regex-based tokenizer that maps out-of-vocabulary pieces to one unknown token.

    If the vocabulary already contains ``"<|unknown|>"`` that entry is used as
    the unknown token. Otherwise a ``'<unk>'`` entry is added to the
    tokenizer's own copy of the vocabulary and used instead.
    """

    def __init__(self, vocab):
        """Build the string-to-ID and ID-to-string lookup tables.

        Args:
            vocab: Mapping from token string to integer ID. The mapping is
                copied, so the caller's object is never modified.
        """
        # Work on a copy so that adding a fallback token never changes the caller's dict.
        self.str_to_int = dict(vocab)
        # Use a single unknown token: the vocabulary's own one when it has it.
        if "<|unknown|>" in self.str_to_int:
            self.unknown_token = "<|unknown|>"
        else:
            self.unknown_token = '<unk>'
        if self.unknown_token not in self.str_to_int:
            # Next free ID. Equals len(vocab) for the usual 0..n-1 numbering,
            # but cannot collide with an existing ID when numbering differs.
            self.str_to_int[self.unknown_token] = (
                max(self.str_to_int.values(), default=-1) + 1
            )
        self.int_to_str = {i: s for s, i in self.str_to_int.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Args:
            text: The string to tokenize.

        Returns:
            One ID per non-empty piece; pieces missing from the vocabulary get
            the ID of the unknown token.
        """
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        unknown_id = self.str_to_int[self.unknown_token]
        return [self.str_to_int.get(token, unknown_id) for token in tokens]

    def decode(self, ids):
        """Convert token IDs back into a string.

        Tokens are joined with single spaces, then whitespace directly in
        front of a punctuation character is removed. The original spacing of
        the encoded text is therefore not reproduced exactly.

        Args:
            ids: Iterable of token IDs.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: If an ID is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # Remove the space that the join put in front of punctuation.
        return re.sub(r'\s+([,.:;?_!"()\'])', r'\1', text)

In [46]:
tokenizer = SimpleTokenizerV2(vocab)

In [47]:
text = "Hello,do you like tea. is this-- a test?"

tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 7, 584, 999, 6, 115, 1131, 10]

In [48]:
tokenizer.decode(tokenizer.encode(text))

'<|unknown|>, do you like tea. is this -- a <|unknown|>?'

# 2.5 Byte pair encoding

In [50]:
import tiktoken

In [51]:
tiktoken.__version__

'0.9.0'

In [52]:
tokenizer = tiktoken.get_encoding("gpt2")

In [53]:
tokenizer.encode("hello world")

[31373, 995]

In [55]:
tokenizer.decode(tokenizer.encode("hello world"))

'hello world'

In [57]:
text =(
    "hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace."
)
tokenizer.encode(text)

ValueError: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


In [58]:
tokenizer.encode(text, allowed_special={"<|endoftext|>"})

[31373,
 11,
 466,
 345,
 588,
 8887,
 30,
 220,
 50256,
 554,
 262,
 4252,
 18250,
 8812,
 2114,
 1659,
 617,
 34680,
 27271,
 13]

In [59]:
tokenizer.decode(tokenizer.encode(text, allowed_special={"<|endoftext|>"}))

'hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.'

# 2.6 Data Sampling with a sliding window

In [60]:
with open("the-verdict.txt","r",encoding="utf-8") as f:
  raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [68]:
enc_sample = enc_text[50:]
len(enc_sample)

5095

In [67]:
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]


print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [77]:
# Print each growing context next to the single token that follows it.
for split_at in range(1, context_size + 1):
  prefix_ids, next_id = enc_sample[:split_at], enc_sample[split_at]
  print(tokenizer.decode(prefix_ids), "--->", tokenizer.decode([next_id]))


 and --->  established
 and established --->  himself
 and established himself --->  in
 and established himself in --->  a


In [78]:
import torch

In [79]:
torch.__version__

'2.6.0+cu124'

In [115]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-ID tensors.

  Every target window is its input window shifted one token to the right.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    """Encode `txt` and cut it into windows of `max_length` tokens.

    Args:
      txt: Text to encode.
      tokenizer: Object whose `encode(txt, allowed_special=...)` returns a
        sequence of token IDs.
      max_length: Number of tokens per window.
      stride: Distance in tokens between the starts of consecutive windows.
    """
    # Encode the full text once, allowing "<|endoftext|>" as a special token.
    token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

    # Window start offsets; the upper bound leaves room for the target
    # window, which reaches one token further than the input window.
    starts = range(0, len(token_ids) - max_length, stride)

    self.input_ids = [
      torch.tensor(token_ids[start:start + max_length]) for start in starts
    ]
    self.target_ids = [
      torch.tensor(token_ids[start + 1:start + max_length + 1]) for start in starts
    ]

  def __len__(self):
    """Return the number of windows."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the (input, target) tensor pair at position `idx`."""
    return self.input_ids[idx], self.target_ids[idx]

In [118]:
def create_dataloader_v1(txt, batch_size=2, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
  """Build a DataLoader over sliding-window samples of `txt`.

  The text is encoded with tiktoken's "gpt2" encoding and wrapped in a
  GPTDatasetV1 built with `max_length` and `stride`. The remaining arguments
  (`batch_size`, `shuffle`, `drop_last`, `num_workers`) are passed through to
  the DataLoader.

  Returns:
    The DataLoader wrapping the dataset.
  """
  gpt2_encoding = tiktoken.get_encoding("gpt2")
  windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

  loader_options = dict(
    batch_size=batch_size,
    shuffle=shuffle,
    drop_last=drop_last,
    num_workers=num_workers,
  )
  return DataLoader(windows, **loader_options)


In [119]:
with open("the-verdict.txt","r",encoding="utf-8") as f:
  raw_text = f.read()

In [94]:
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=5,
    stride=4,
    shuffle=True,
    drop_last=True,

)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  572,   257, 40426, 10956,  9546],
        [22489, 40089,    11,  4978,   257],
        [  338,   281, 12659,  2829,  1122],
        [  743,   307, 41746, 12004,   262],
        [ 1936,  2431,   438,   392,   340],
        [  290,  1297,  9074,    13,   520],
        [  691,  8208,    12, 14337,  2250],
        [  460,  1560,   345,   287,  1936]]), tensor([[  257, 40426, 10956,  9546,   546],
        [40089,    11,  4978,   257, 19350],
        [  281, 12659,  2829,  1122,    11],
        [  307, 41746, 12004,   262,  6473],
        [ 2431,   438,   392,   340,  1422],
        [ 1297,  9074,    13,   520,  5493],
        [ 8208,    12, 14337,  2250,    11],
        [ 1560,   345,   287,  1936,  2431]])]


In [92]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[  198,  1870,   465,  8216],
        [ 3886,   449,   659,   438],
        [  290,  8465,   286,   281],
        [  714,  1464,   651,  1474],
        [12450,    11,   991, 16755],
        [  640,    11,   339,   581],
        [ 4808,   321,    62,   991],
        [  314,   550,  3750,   351]]), tensor([[ 1870,   465,  8216,  1297],
        [  449,   659,   438,    64],
        [ 8465,   286,   281, 33954],
        [ 1464,   651,  1474,  1576],
        [   11,   991, 16755,    13],
        [   11,   339,   581,  4714],
        [  321,    62,   991, 12036],
        [  550,  3750,   351,   262]])]


# 2.7 Creating token Embeddings

In [95]:
input_ids = torch.tensor([2, 3, 5, 1])

In [105]:
vocab_size = 6
output_dim = 3

# Seed the RNG so the randomly initialised embedding weights are identical on
# every run of this cell.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [99]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [101]:
embedding_layer(input_ids)

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)

In [106]:
# Seed the RNG so the randomly initialised embedding weights are identical on
# every run of this cell.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(tokenizer.n_vocab, output_dim)

In [109]:
print(embedding_layer.weight)

Parameter containing:
tensor([[-1.7859, -1.6374,  0.8049],
        [ 0.5145, -0.2813, -0.5962],
        [-2.6897,  0.2557,  0.0639],
        ...,
        [ 2.8932, -0.3238,  1.5379],
        [-0.3669, -0.6236, -0.1213],
        [ 0.3120, -0.4057, -0.0941]], requires_grad=True)


# 2.8 Encoding word position

In [126]:
vocab_size = 50257
output_dim = 256


token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [112]:
print(token_embedding_layer.weight)

Parameter containing:
tensor([[ 1.2063, -1.3265,  0.1377,  ..., -2.2146,  0.1584, -0.8616],
        [-0.4869, -1.7988,  1.6947,  ..., -0.5246,  1.1986,  1.4098],
        [ 1.3912, -0.5450,  0.3555,  ..., -0.5620,  0.7484,  0.1939],
        ...,
        [ 1.8663,  0.3249,  0.1583,  ...,  0.6272,  0.0072, -0.3193],
        [-0.0065, -1.8300, -0.5468,  ...,  0.9688, -0.1714, -0.0054],
        [-0.4938,  0.1243,  0.1832,  ...,  0.8621,  0.6091,  1.4655]],
       requires_grad=True)


In [120]:
max_length = 4

dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=True,
    drop_last=True,

)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [122]:
print(inputs)
print(inputs.shape)

tensor([[ 1738,   526,   198,   198],
        [  326,   612,   550,   587],
        [ 5986,    30,  2011, 20136],
        [  706,   683,    11,  7425],
        [   11,  9074,    13,   520],
        [  829,   526,   198,   198],
        [  438,  1659, 10833,   340],
        [  683,    13,  9754,   465]])
torch.Size([8, 4])


In [127]:
token_embeddings = token_embedding_layer(inputs)
token_embeddings.shape

torch.Size([8, 4, 256])

In [128]:
token_embeddings[0, 0]

tensor([-2.5619, -0.3378,  1.1003,  0.0953, -0.3632,  0.3551,  0.1283,  0.9432,
        -0.2163, -0.0567,  0.1259,  1.0136, -1.9421, -0.4041,  0.4834,  0.2921,
        -0.3763,  0.4229,  0.7766, -0.7245, -1.6580, -0.6419, -1.8448, -0.6177,
        -0.5738, -0.2406,  1.8161, -0.1848,  0.0518, -0.0894, -0.0409, -0.0070,
        -0.6494,  0.1360,  0.8586,  0.3045,  0.8480, -1.3007,  0.7818, -0.5352,
         0.0104,  1.1115, -0.6078,  0.9405,  0.0872, -0.2616,  0.0513,  1.1761,
        -0.0746, -1.4136, -0.2675, -0.6707,  0.5151, -0.4906, -1.1773, -0.0766,
        -0.3080,  2.0301, -0.7031,  0.3711,  0.1509, -0.0717,  0.9229, -1.3743,
        -0.6593,  2.2644,  0.4463,  1.5728,  0.1775, -0.3774, -0.8245,  0.1445,
         0.2215, -1.1865, -0.1905,  0.1849, -0.9273, -0.7332, -0.8635, -0.4601,
        -0.0733, -0.4657,  0.8515,  1.4805, -0.1650,  0.9564,  0.0475, -1.5697,
         0.8140,  1.7358,  0.4784,  0.6327,  0.7846, -0.5852, -1.2699, -0.0917,
        -0.7527,  0.6468,  0.4799,  0.72

In [129]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [130]:
torch.arange(max_length)

tensor([0, 1, 2, 3])

In [131]:
pos_embedding_layer.weight

Parameter containing:
tensor([[-1.3612,  1.9678, -0.1865,  ...,  0.5606, -1.5007,  1.6241],
        [ 0.9420,  0.0695, -0.2236,  ..., -0.0553,  0.0200, -0.0693],
        [ 1.0954,  0.6826,  1.3983,  ...,  0.6344,  1.5968, -0.1572],
        [ 0.8778,  0.1266, -0.6630,  ...,  0.3224, -0.3242, -1.2349]],
       requires_grad=True)

In [133]:
pos_embedding_layer(torch.arange(max_length))

tensor([[-1.3612,  1.9678, -0.1865,  ...,  0.5606, -1.5007,  1.6241],
        [ 0.9420,  0.0695, -0.2236,  ..., -0.0553,  0.0200, -0.0693],
        [ 1.0954,  0.6826,  1.3983,  ...,  0.6344,  1.5968, -0.1572],
        [ 0.8778,  0.1266, -0.6630,  ...,  0.3224, -0.3242, -1.2349]],
       grad_fn=<EmbeddingBackward0>)

In [132]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
pos_embeddings.shape

torch.Size([4, 256])

In [134]:
token_embeddings.shape

torch.Size([8, 4, 256])

In [135]:
pos_embeddings.shape

torch.Size([4, 256])

In [136]:
input_embeddings =  token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
