In [1]:
# Read the whole text file into memory. The encoding is passed explicitly so
# decoding does not fall back to the platform's default locale encoding.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()

In [2]:
# Report the length of the loaded text in characters.
print(f"Total number of characters: {len(raw_text)}")

Total number of characters: 20479


In [3]:
# Preview the first 99 characters of the loaded text.
raw_text[:99]

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no '

In [4]:
import re
text = "I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no "
re = re.split(r'(\s)', text)
print(re)

['I', ' ', 'HAD', ' ', 'always', ' ', 'thought', ' ', 'Jack', ' ', 'Gisburn', ' ', 'rather', ' ', 'a', ' ', 'cheap', ' ', 'genius--though', ' ', 'a', ' ', 'good', ' ', 'fellow', ' ', 'enough--so', ' ', 'it', ' ', 'was', ' ', 'no', ' ', '']


In [5]:
result = [item for item in re if item.strip()]
print(result)

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius--though', 'a', 'good', 'fellow', 'enough--so', 'it', 'was', 'no']


In [6]:
import re

text = "I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no "

# Tokenize on punctuation, double dashes and whitespace. The capturing group
# keeps the separators as tokens; pieces that are blank after stripping are dropped.
result = [
    piece
    for piece in map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', text))
    if piece
]
print(result)

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no']


In [7]:
# Apply the same punctuation/whitespace tokenization to the full text and
# discard pieces that are blank after stripping.
processed = [
    piece
    for piece in map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', raw_text))
    if piece
]
print(processed[:10])
print("Length: ", len(processed))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius']
Length:  4690


In [8]:
# The distinct tokens, in sorted order, make up the vocabulary.
all_words = list(set(processed))
all_words.sort()
vocab_size = len(all_words)
print(f"Total number of words: {vocab_size}")
print(f"Vocab size: {vocab_size}")

Total number of words: 1130
Vocab size: 1130


In [9]:
# Map every token to its position in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))

In [10]:
# Show the first 21 (token, id) pairs of the vocabulary.
for i, item in zip(range(21), vocab.items()):
  print(item)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)


In [11]:
class SimpleTokenizeV1:
  """Word-level tokenizer backed by a fixed vocabulary.

  Args:
    vocab: mapping from token string to integer id.
  """

  def __init__(self, vocab):
    self.str_to_int = vocab
    # Reverse lookup (id -> token) used by decode().
    self.int_to_str = dict((idx, token) for token, idx in vocab.items())

  def encode(self, text):
    """Split `text` into tokens and return the list of their ids.

    The text is split on punctuation, double dashes and whitespace; pieces
    that are empty after stripping are skipped.

    Raises:
      KeyError: if a token is not present in the vocabulary.
    """
    token_ids = []
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
      token = piece.strip()
      if token:
        token_ids.append(self.str_to_int[token])
    return token_ids

  def decode(self, ids):
    """Turn a sequence of ids back into a single string.

    Tokens are joined with single spaces, then any whitespace directly in
    front of a punctuation character is removed.

    Raises:
      KeyError: if an id has no entry in the reverse lookup.
    """
    joined = " ".join(self.int_to_str[token_id] for token_id in ids)
    return re.sub(r'\s+([,.:;?_!"()\'])', r'\1', joined)

In [12]:
# Build a tokenizer on top of the vocabulary created above.
tokenizer = SimpleTokenizeV1(vocab)

In [13]:
text = "I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no"
# Convert the sample sentence into vocabulary ids.
encoded = tokenizer.encode(text)
print(encoded)

[53, 44, 149, 1003, 57, 38, 818, 115, 256, 486, 6, 1002, 115, 500, 435, 392, 6, 908, 585, 1077, 709]


In [14]:
# Round-trip the sample sentence: decode the ids produced by encode() instead
# of rebinding `text` to a hard-coded list of ids.
print(tokenizer.decode(tokenizer.encode(text)))

I HAD always thought Jack Gisburn rather a cheap genius -- though a good fellow enough -- so it was no


In [15]:
text = "Hello, World"
# "Hello" is not in the vocabulary, so encode() raises KeyError. The error is
# caught and displayed so the notebook can still run from top to bottom.
try:
  print(tokenizer.encode(text))
except KeyError as err:
  print(f"KeyError: {err}")

KeyError: 'Hello'

## **Handling Unknown Words**

In [16]:
# Rebuild the vocabulary with two special tokens placed after the sorted tokens.
all_tokens = sorted(set(processed)) + ['<|endoftext|>', '<|unk|>']

vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [17]:
print("Length of new vocab: ", len(vocab))
# Print the last two (token, id) entries of the vocabulary.
for token, token_id in list(vocab.items())[-2:]:
  print(token, token_id)

Length of new vocab:  1132
<|endoftext|> 1130
<|unk|> 1131


In [18]:
class SimpleTokenizeV2:
  """Word-level tokenizer that replaces out-of-vocabulary tokens with '<|unk|>'.

  Args:
    vocab: mapping from token string to integer id.
  """

  def __init__(self, vocab):
    self.str_to_int = vocab
    # Reverse lookup (id -> token) used by decode().
    self.int_to_str = dict((idx, token) for token, idx in vocab.items())

  def encode(self, text):
    """Split `text` into tokens and return the list of their ids.

    The text is split on punctuation, double dashes and whitespace; pieces
    that are empty after stripping are skipped. A token that is not in the
    vocabulary is looked up as '<|unk|>' instead.

    Raises:
      KeyError: if an unknown token is met and '<|unk|>' is not in the
        vocabulary.
    """
    token_ids = []
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
      token = piece.strip()
      if not token:
        continue
      if token not in self.str_to_int:
        token = '<|unk|>'
      token_ids.append(self.str_to_int[token])
    return token_ids

  def decode(self, ids):
    """Turn a sequence of ids back into a single string.

    Tokens are joined with single spaces, then any whitespace directly in
    front of a punctuation character is removed.

    Raises:
      KeyError: if an id has no entry in the reverse lookup.
    """
    joined = " ".join(self.int_to_str[token_id] for token_id in ids)
    return re.sub(r'\s+([,.:;?_!"()\'])', r'\1', joined)

In [19]:
# Switch to the tokenizer that handles unknown words, using the extended vocabulary.
tokenizer = SimpleTokenizeV2(vocab)

In [20]:
text1 = "Hello, do you like tea"
text2 = "Welcome to programming"

# Put the end-of-text marker between the two independent samples.
text = text1 + " <|endoftext|> " + text2
print(text)

Hello, do you like tea <|endoftext|> Welcome to programming


In [21]:
# Encode the joined sample and display the resulting ids.
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 1130, 1131, 1016, 1131]

In [22]:
# Decode the ids of `text` directly rather than a pasted copy of the previous
# cell's output, so this cell stays correct if the vocabulary changes.
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea <|endoftext|> <|unk|> to <|unk|>'