## Step 1: Creating Tokens

In [15]:
# Read the complete short story into memory as a single string.
with open("the-verdict.txt", encoding="utf-8") as verdict_file:
    raw_text = verdict_file.read()

# Report the character count and preview the first 99 characters.
print("total Number of character: ", len(raw_text))
print("raw_text: " + raw_text[:99])

total Number of character:  20479
raw_text: I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [16]:
import re

text = "Hello, This is me, I would like to test you out!"

# The capturing group makes the split keep the separators (whitespace, commas,
# periods) as their own list entries, which also yields empty strings between
# two adjacent separators.
results = re.compile(r'(\s|[,.])').split(text)

print(results)

['Hello', ',', '', ' ', 'This', ' ', 'is', ' ', 'me', ',', '', ' ', 'I', ' ', 'would', ' ', 'like', ' ', 'to', ' ', 'test', ' ', 'you', ' ', 'out!']


In [17]:
# Drop the entries that are empty or whitespace-only; str.strip returns an
# empty (falsy) string for exactly those entries.
results = list(filter(str.strip, results))
print(results, len(results))

['Hello', ',', 'This', 'is', 'me', ',', 'I', 'would', 'like', 'to', 'test', 'you', 'out!'] 13


In [18]:
import re

text = "Hello, This is me, I would like -- to test you out!, should i help you with it? and Jack said: He is not gonna be available _ on __ (silently)"

# Split on whitespace and a wider set of punctuation, discarding the empty and
# whitespace-only pieces in the same pass.
results = [
    piece
    for piece in re.split(r'(\s|[,.:;?_!"()\'])', text)
    if piece.strip()
]

print(results)

['Hello', ',', 'This', 'is', 'me', ',', 'I', 'would', 'like', '--', 'to', 'test', 'you', 'out', '!', ',', 'should', 'i', 'help', 'you', 'with', 'it', '?', 'and', 'Jack', 'said', ':', 'He', 'is', 'not', 'gonna', 'be', 'available', '_', 'on', '_', '_', '(', 'silently', ')']


In [19]:
# Tokenise the whole story: split on whitespace, punctuation and double dashes,
# keeping the separators as tokens.
preprocessed = re.split(r'(\s|[,.:;?_!"()\']|--)', raw_text)
print(f"preprocessed with whitespaces: {len(preprocessed)}")

# Remove the empty and whitespace-only pieces left over from the split.
preprocessed = list(filter(str.strip, preprocessed))
print(f"Preprocssed without whitespace: {len(preprocessed)}")


preprocessed with whitespaces: 9235
Preprocssed without whitespace: 4690


## Step 2: Creating Token IDs

In [20]:
# Unique tokens in sorted order; the position in this list becomes the token id.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)

print(f"Vocab size: {vocab_size}")

Vocab size: 1130


In [21]:
# Build a simple encoder mapping: each vocabulary token is assigned an integer
# id equal to its position in the sorted word list.
vocab = dict(zip(all_words, range(len(all_words))))

In [22]:
# Preview the start of the vocabulary: entries with index 0 through 50.
for i, item in enumerate(vocab.items()):
    token, token_id = item
    print(f"{token} : {token_id}")
    if i == 50:
        break

! : 0
" : 1
' : 2
( : 3
) : 4
, : 5
-- : 6
. : 7
: : 8
; : 9
? : 10
A : 11
Ah : 12
Among : 13
And : 14
Are : 15
Arrt : 16
As : 17
At : 18
Be : 19
Begin : 20
Burlington : 21
But : 22
By : 23
Carlo : 24
Chicago : 25
Claude : 26
Come : 27
Croft : 28
Destroyed : 29
Devonshire : 30
Don : 31
Dubarry : 32
Emperors : 33
Florence : 34
For : 35
Gallery : 36
Gideon : 37
Gisburn : 38
Gisburns : 39
Grafton : 40
Greek : 41
Grindle : 42
Grindles : 43
HAD : 44
Had : 45
Hang : 46
Has : 47
He : 48
Her : 49
Hermia : 50


In [23]:
# Tokenizer Class
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token-to-id vocabulary.

    Text is split on whitespace, the punctuation characters ``,.:;?_!"()'``
    and double dashes; empty and whitespace-only pieces are discarded. Every
    remaining piece must be a key of the vocabulary.
    """

    def __init__(self, vocab):
        """Store ``vocab`` (token -> id) and build the inverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encoder(self, text):
        """Return the list of token ids for ``text``.

        Raises:
            KeyError: if ``text`` contains a token that is not in the vocabulary.
        """
        preprocessed = re.split(r'(\s|[,.:;?_!"()\']|--)', text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Return the text for ``ids``.

        Tokens are joined with single spaces, then the whitespace in front of
        any of ``,.?!"()'`` is removed.

        Raises:
            KeyError: if an id has no entry in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuation.
        # The pattern is a raw string: it has no placeholders, so an f-string
        # is unnecessary, and "\s" in a non-raw literal is an invalid escape
        # sequence that triggers a warning on current Python versions.
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text


In [24]:
# Sample sentence whose words all occur in the training text.
text = "It's the last he painted, you know, Mrs. Gisburn said with pardonable pride."

# Encode it with a tokenizer built on the current vocabulary.
tokenizer = SimpleTokenizerV1(vocab)
encoder = tokenizer.encoder(text)

print(encoder)

[56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 67, 7, 38, 851, 1108, 754, 793, 7]


In [25]:
# Map the ids back to text; the bare expression is shown as the cell's result.
tokenizer.decode(encoder)

"It' s the last he painted, you know, Mrs. Gisburn said with pardonable pride."

### Adding special context tokens

In [26]:
# Try a text containing a word that is not in the vocabulary.
# SimpleTokenizerV1 has no fallback for unknown tokens, so encoding raises a
# KeyError. The error is caught and printed so that the demonstration stays
# visible without stopping a "Restart & Run All" at this cell.

text = "Hello, How we are driving the car"
try:
    print(tokenizer.encoder(text))
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'Hello'

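Add two special tokens to the vocabulary to handle these edge cases: `<|unk|>` stands in for any word that is not in the vocabulary, and `<|endoftext|>` marks the boundary between two unrelated texts.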

In [27]:
# Sorted unique tokens, with the two special tokens appended after them so
# they receive the two highest ids.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]

vocab = dict(zip(all_tokens, range(len(all_tokens))))

print(f"vocab size : {len(vocab)}")

vocab size : 1132


In [37]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Text is split on whitespace, the punctuation characters ``,.:;?_!"()'``
    and double dashes; empty and whitespace-only pieces are discarded. The
    vocabulary is expected to contain the ``<|unk|>`` token.
    """

    def __init__(self, vocab):
        """Store ``vocab`` (token -> id) and build the inverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encoder(self, text):
        """Return the list of token ids for ``text``.

        Tokens missing from the vocabulary are encoded as the id of
        ``<|unk|>``.

        Raises:
            KeyError: if an unknown token is met and ``<|unk|>`` itself is not
                in the vocabulary.
        """
        ids = []
        # Single pass: strip each piece once, skip blanks, substitute unknowns.
        for piece in re.split(r'(\s|[,.:;?_!"()\']|--)', text):
            token = piece.strip()
            if not token:
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decoder(self, ids, clean_punctuation=False):
        """Return the text for ``ids``, with tokens joined by single spaces.

        Args:
            ids: iterable of token ids.
            clean_punctuation: when True, remove the whitespace in front of
                any of ``,.?!"()'`` (the same rule ``SimpleTokenizerV1.decode``
                applies). Defaults to False, which leaves every token
                separated by a space.

        Raises:
            KeyError: if an id has no entry in the vocabulary.
        """
        text = " ".join([self.int_to_str[s] for s in ids])
        if clean_punctuation:
            text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [38]:
tokenizer = SimpleTokenizerV2(vocab)

# Show the last five (id, token) pairs of the inverse vocabulary.
for item in list(tokenizer.int_to_str.items())[-5:]:
    print(item)

(1127, 'younger')
(1128, 'your')
(1129, 'yourself')
(1130, '<|endoftext|>')
(1131, '<|unk|>')


In [39]:
# Two unrelated sample texts, joined with the end-of-text marker between them.
text1 = "Hello, would like to drive my car on the nation highway"
text2 = "Le mans raced was ended on july with monsoon weather"

text = text1 + " <|endoftext|> " + text2
print(text)

Hello, would like to drive my car on the nation highway <|endoftext|> Le mans raced was ended on july with monsoon weather


In [41]:
# Encode the joined sample text with the current tokenizer and show the ids.
encode_ids = tokenizer.encoder(text)
print(encode_ids)

[1131, 5, 1120, 628, 1016, 1131, 697, 1131, 727, 988, 1131, 1131, 1130, 1131, 1131, 1131, 1077, 1131, 727, 1131, 1108, 1131, 1131]


In [42]:
# Turn the ids back into text to see which tokens survived the round trip.
decode_text = tokenizer.decoder(encode_ids)
print(decode_text)


<|unk|> , would like to <|unk|> my <|unk|> on the <|unk|> <|unk|> <|endoftext|> <|unk|> <|unk|> <|unk|> was <|unk|> on <|unk|> with <|unk|> <|unk|>
