# Processing Text

## Tokenization

In [1]:
# Read the whole of "the-verdict.txt" into one string; raw_text is the corpus
# that the tokenization cells below operate on.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
# Sanity check: report the character count and preview the first 99 characters.
print(f"Length of text (total number of characters) = {len(raw_text)}")
print(raw_text[:99])

Length of text (total number of characters) = 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


Splitting the text into its constituent words, punctuation marks, and whitespace characters:

In [2]:
import regex as re
# Split on the punctuation characters , . ? _ ! " ( ) ' on the double hyphen "--"
# and on single whitespace characters. Because the pattern is a capturing group,
# re.split keeps the separators in the result; that is why ' ' entries (and ''
# entries between two adjacent separators) appear in the printed list.
# NOTE(review): ':' and ';' are not in the character class, so they stay attached
# to the preceding word (the vocabulary printed later contains e.g. 'Carlo;').
preprocessed = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
print(preprocessed[:99])

['I', ' ', 'HAD', ' ', 'always', ' ', 'thought', ' ', 'Jack', ' ', 'Gisburn', ' ', 'rather', ' ', 'a', ' ', 'cheap', ' ', 'genius', '--', 'though', ' ', 'a', ' ', 'good', ' ', 'fellow', ' ', 'enough', '--', 'so', ' ', 'it', ' ', 'was', ' ', 'no', ' ', 'great', ' ', 'surprise', ' ', 'to', ' ', 'me', ' ', 'to', ' ', 'hear', ' ', 'that', ',', '', ' ', 'in', ' ', 'the', ' ', 'height', ' ', 'of', ' ', 'his', ' ', 'glory', ',', '', ' ', 'he', ' ', 'had', ' ', 'dropped', ' ', 'his', ' ', 'painting', ',', '', ' ', 'married', ' ', 'a', ' ', 'rich', ' ', 'widow', ',', '', ' ', 'and', ' ', 'established', ' ', 'himself', ' ', 'in', ' ', 'a']


Removing whitespace from the list of words and punctuation:

In [3]:
# Keep only the entries that still contain something after stripping, i.e. drop
# the empty strings and the whitespace separators left over from the split.
preprocessed = [token for token in preprocessed if token.strip()]
print(preprocessed[:10])
print(len(preprocessed))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius']
4649


Now, we create a vocabulary from the preprocessed textual data.

In [4]:
# Unique tokens in sorted order.
vocabulary = sorted(set(preprocessed))
vocab_size = len(vocabulary)
print(vocabulary[:99])
print(f"Vocabulary size = {vocab_size}")

['!', '"', "'", '(', ')', ',', '--', '.', ':', ';', '?', 'A', 'Ah', 'Among', 'And', 'Are', 'Arrt', 'As', 'At', 'Be', 'Begin', 'Burlington', 'But', 'By', 'Carlo', 'Carlo;', 'Chicago', 'Claude', 'Come', 'Croft', 'Destroyed', 'Devonshire', 'Don', 'Dubarry', 'Emperors', 'Florence', 'For', 'Gallery', 'Gideon', 'Gisburn', 'Gisburns', 'Grafton', 'Greek', 'Grindle', 'Grindle:', 'Grindles', 'HAD', 'Had', 'Hang', 'Has', 'He', 'Her', 'Hermia', 'His', 'How', 'I', 'If', 'In', 'It', 'Jack', 'Jove', 'Just', 'Lord', 'Made', 'Miss', 'Money', 'Monte', 'Moon-dancers', 'Mr', 'Mrs', 'My', 'Never', 'No', 'Now', 'Nutley', 'Of', 'Oh', 'On', 'Once', 'Only', 'Or', 'Perhaps', 'Poor', 'Professional', 'Renaissance', 'Rickham', 'Rickham;', 'Riviera', 'Rome', 'Russian', 'Sevres', 'She', 'Stroud', 'Strouds', 'Suddenly', 'That', 'The', 'Then', 'There']
Vocabulary size = 1159


In [5]:
# Map every token string to its integer id: its index in the sorted vocabulary.
vocab = dict(zip(vocabulary, range(len(vocabulary))))
# Preview the first 52 (token, id) pairs.
for i, item in enumerate(list(vocab.items())[:52]):
    print(item)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Carlo;', 25)
('Chicago', 26)
('Claude', 27)
('Come', 28)
('Croft', 29)
('Destroyed', 30)
('Devonshire', 31)
('Don', 32)
('Dubarry', 33)
('Emperors', 34)
('Florence', 35)
('For', 36)
('Gallery', 37)
('Gideon', 38)
('Gisburn', 39)
('Gisburns', 40)
('Grafton', 41)
('Greek', 42)
('Grindle', 43)
('Grindle:', 44)
('Grindles', 45)
('HAD', 46)
('Had', 47)
('Hang', 48)
('Has', 49)
('He', 50)
('Her', 51)


### Creating a tokenizer:

In [6]:
class SimpleTokenizer:
  """Word-level tokenizer backed by a fixed token -> id vocabulary.

  Tokens that are missing from the vocabulary are not handled: encode() lets
  the resulting KeyError propagate.
  """

  def __init__(self, vocab):
    """Store the vocabulary and derive the reverse (id -> token) lookup.

    vocab: dict mapping token strings to integer ids.
    """
    self.text_to_int = vocab
    self.int_to_text = dict((idx, token) for token, idx in vocab.items())

  def encode(self, text):
    """Split text into word and punctuation tokens and return their ids.

    Raises KeyError if a token is not a key of the vocabulary.
    """
    pieces = re.split(r'([,.?_!"()\']|--|\s)', text)
    # Discard the empty and whitespace-only pieces produced by the split.
    tokens = [piece for piece in pieces if piece.strip()]
    return [self.text_to_int[token] for token in tokens]

  def decode(self, ids):
    """Turn ids back into text.

    The tokens are joined with single spaces, then the whitespace in front of
    the characters , . ? ! " ( ) ' is removed.
    """
    spaced = " ".join(self.int_to_text[token_id] for token_id in ids)
    return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

Testing the tokenizer:

In [7]:
tokenizer = SimpleTokenizer(vocab)

# Round-trip one sentence through encode() and decode(). The string itself
# begins with a double-quote character (the one right after the opening triple quote).
text = """"It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)
# decode() only removes whitespace that precedes punctuation, so the round trip is
# not exact: the result keeps a space after the opening quote and after the apostrophe.
text = tokenizer.decode(ids)
print(text)

[1, 58, 2, 872, 1013, 615, 541, 763, 5, 1155, 608, 5, 1, 69, 7, 39, 873, 1136, 773, 812, 7]
" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [8]:
# Demonstrate the limitation of SimpleTokenizer: "Hello" is not in the
# vocabulary, so encode() raises a KeyError. The error is caught and printed so
# the notebook still runs from top to bottom.
try:
    tokenizer.encode("Hello, how are you doing ?")
except KeyError as err:
    print(f"KeyError: token {err} is not in the vocabulary")

The code raises an error when it encounters a word that is not in its vocabulary. To fix this, we add a special token, `<|unk|>`, to the vocabulary; it stands in for every unknown word. We also add `<|endoftext|>`, which marks the boundary between two separate texts.

In [9]:
#Updating the vocabulary:
# "<|unk|>" stands in for words that are not in the vocabulary and
# "<|endoftext|>" is placed between separate texts. Each one is appended only if
# it is missing, so re-running this cell does not add duplicate entries (which
# would leave gaps in the id -> token mapping built by the tokenizer).
for special_token in ("<|unk|>", "<|endoftext|>"):
    if special_token not in vocabulary:
        vocabulary.append(special_token)
vocab = {text: integer for integer, text in enumerate(vocabulary)}

#Creating a new and improved tokenizer:
class SimpleTokenizerV2:
  """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>"."""

  def __init__(self, vocab):
    """Build the id -> token lookup and keep the token -> id vocabulary.

    vocab: dict mapping token strings to integer ids. encode() looks up the
    "<|unk|>" key whenever it meets a token that is not in this dict.
    """
    self.int_to_text = dict((idx, token) for token, idx in vocab.items())
    self.text_to_int = vocab

  def encode(self, text):
    """Tokenize text and return the list of ids.

    Every token that is not a key of the vocabulary is replaced by the id of
    "<|unk|>".
    """
    pieces = re.split(r'([,.?_!"()\']|--|\s)', text)
    stripped = (piece.strip() for piece in pieces)
    # Empty strings are what remains of whitespace-only pieces; drop them.
    tokens = [token for token in stripped if token]
    known = self.text_to_int
    return [known[token if token in known else "<|unk|>"] for token in tokens]

  def decode(self, ids):
    """Turn ids back into text.

    The tokens are joined with single spaces, then the whitespace in front of
    the characters , . ? ! " ( ) ' is removed.
    """
    spaced = " ".join(self.int_to_text[token_id] for token_id in ids)
    return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

Testing this new tokenizer

In [10]:
tokenizer = SimpleTokenizerV2(vocab)
text1 = "Hello!, how are you ?"
text2 = "I am fine, what about you ?"
# Join the two texts with the "<|endoftext|>" special token between them.
text = " <|endoftext|> ".join([text1, text2])
ids = tokenizer.encode(text)
print(ids)
# Words that are missing from the vocabulary ("Hello", "fine") come back as "<|unk|>".
decoded = tokenizer.decode(ids)
print(decoded)

[1159, 0, 5, 571, 174, 1155, 10, 1160, 55, 155, 1159, 5, 1116, 122, 1155, 10]
<|unk|>!, how are you? <|endoftext|> I am <|unk|>, what about you?


## Creating a Dataset class

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-id sequences.

  Each sample pairs a window of `max_length` token ids with the same window
  shifted one token to the right.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    """Tokenize `txt` and cut it into input/target windows.

    txt: raw text data (string)
    tokenizer: object with encode() method to turn text into token IDs
    max_length: length of each training sequence (positive integer)
    stride: step between the starts of consecutive windows (positive integer);
      windows overlap when stride < max_length

    Raises ValueError if max_length or stride is not positive.

    If the text has max_length tokens or fewer, no complete window with a
    shifted target fits and the dataset is empty.
    """
    if max_length <= 0:
      raise ValueError(f"max_length must be a positive integer, got {max_length}")
    if stride <= 0:
      raise ValueError(f"stride must be a positive integer, got {stride}")

    self.tokenizer = tokenizer
    self.input_ids = []
    self.target_ids = []
    token_ids = tokenizer.encode(txt)
    # Slide a window over the tokenized text, starting at 0 and stepping by
    # stride. Stopping before len(token_ids) - max_length guarantees that the
    # target window, which ends one token later, stays inside token_ids.
    for i in range(0, len(token_ids) - max_length, stride):
      input_chunk = token_ids[i:i + max_length]
      target_chunk = token_ids[i + 1: i + max_length + 1]
      self.input_ids.append(torch.tensor(input_chunk))
      self.target_ids.append(torch.tensor(target_chunk))

  def __len__(self):
    """Return the number of (input, target) windows."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the (input, target) tensor pair at position idx."""
    return self.input_ids[idx], self.target_ids[idx]

## Creating a Dataset Loader for our LLM

In [12]:
#!pip install tiktoken

In [13]:
import tiktoken
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
  """Build a DataLoader of sliding-window (input, target) batches from raw text.

  txt: raw text (string); it is tokenized with tiktoken's "gpt2" encoding
  batch_size: number of windows per batch
  max_length: number of tokens in each window
  stride: step between the starts of consecutive windows
  shuffle: passed to DataLoader; whether the windows are shuffled
  drop_last: passed to DataLoader; whether an incomplete final batch is dropped
  num_workers: passed to DataLoader; number of worker subprocesses, where 0
    (the default) loads the data in the main process

  Returns the torch.utils.data.DataLoader that wraps a GPTDatasetV1.
  """
  tokenizer = tiktoken.get_encoding("gpt2")
  dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
  dataloader = DataLoader(
      dataset,
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers,
  )
  return dataloader


In [14]:
# Read the corpus again into raw_text.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# batch_size=1 and stride=1: each batch holds one window of 4 token ids, and the
# next batch is the same window moved forward by one token.
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)
# Each batch is an (inputs, targets) pair; the targets are the inputs shifted by one token.
first_batch = next(data_iter)
print(first_batch)
second_batch = next(data_iter)
print(second_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


## Creating Token Embeddings

In [15]:
# Toy sizes so that the whole weight matrix can be printed.
# NOTE(review): this rebinds vocab_size, which earlier held the size of the
# tokenizer vocabulary.
vocab_size = 6  # number of distinct token ids = rows of the embedding matrix
output_dim = 3 # size of the embedding vector of each token = columns of the matrix

# Seed the random number generator before the randomly initialised layer is created.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


We can see that the weight matrix of the embedding layer contains small, random values. These values are optimized during LLM training as part of the LLM optimization itself, as we will see in upcoming chapters. Moreover, we can see that the weight matrix has six rows and three columns. There is one row for each of the six possible tokens in the vocabulary. And there is one column for each of the three embedding dimensions.

In [17]:
# Look up the embedding vectors of token ids 1, 2 and 3; the result consists of
# the rows at index 1, 2 and 3 of the weight matrix.
print(embedding_layer(torch.tensor([1,2,3])))

tensor([[ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


The embedding vector for token ID 1 is [ 0.9178,  1.5810,  1.3010], and likewise for IDs 2 and 3.

This embedding is equal to the row at index 1 (the second row) of the embedding_layer.weight output.

## Embedding word positions

Since self-attention mechanisms have no sense of the order of words in a sequence, we have to encode position information too.

In the previous embedding system, the number 1 is embedded the same way whether it is at the first position or the second.

In [19]:
# The same id maps to the same vector wherever it appears: id 1 comes first in
# one input and second in the other, yet its embedding is identical in both outputs.
print(f"{embedding_layer(torch.tensor([1,2,3]))}\n--------------\n{embedding_layer(torch.tensor([2,1,4]))}")


tensor([[ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)
--------------
tensor([[ 1.2753, -0.2010, -0.1606],
        [ 0.9178,  1.5810,  1.3010],
        [-1.1589,  0.3255, -0.6315]], grad_fn=<EmbeddingBackward0>)


So there is no sense of positioning in the embedded data

In [20]:
# NOTE(review): 50257 is presumably the vocabulary size of the tiktoken "gpt2"
# encoding used by create_dataloader_v1 — confirm.
vocab_size = 50257
output_dim = 256  # size of each token embedding vector
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(token_embedding_layer.weight)

Parameter containing:
tensor([[-2.1338,  1.0524, -0.3885,  ...,  0.2461,  1.2119,  0.3171],
        [ 1.2277, -0.4297, -2.2121,  ..., -0.1640, -0.3348, -0.0221],
        [ 1.3382,  0.2706,  0.5071,  ...,  0.0175, -2.1517,  0.3924],
        ...,
        [-1.4889, -1.2456,  1.8034,  ..., -0.6392, -1.4939,  0.3614],
        [-1.0703,  0.2795, -0.2637,  ..., -0.2810, -1.4755, -0.1183],
        [-0.0071,  0.4982, -0.3319,  ...,  0.4970,  0.9365, -0.2091]],
       requires_grad=True)


In [21]:
max_length = 4
# stride equals max_length, so consecutive windows do not overlap.
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)
data_iter = iter(dataloader)
# NOTE(review): `input` shadows the Python builtin of the same name. A later cell
# reads this variable, so a rename has to be made in both places at once.
input, targets = next(data_iter)
print("Token ID:\n", input)
print("Input shape:\n", input.shape)

Token ID:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Input shape:
 torch.Size([8, 4])


In [23]:
# Embed every token id of the batch: the (8, 4) tensor of ids becomes an
# (8, 4, 256) tensor with one 256-dimensional vector per token.
token_embeddings = token_embedding_layer(input)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [24]:
# How many tokens the model can see at once.
context_length = max_length
# One embedding for each position in the sequence; each embedding is a vector of
# size output_dim.
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
# Pass the position ids [0, 1, ..., context_length - 1] through the embedding
# layer to get the corresponding position vectors.
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])
