In [1]:
# Core Python
import os
import sys
import math
import random
import time
from datetime import datetime

# Data manipulation
import numpy as np
import pandas as pd

#Text processing
import re

In [2]:
# Load the full training text. The encoding is stated explicitly so the result
# does not depend on the platform's default locale encoding.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
print("Total number of chars: ", len(raw_text))
print(raw_text[:99])

Total number of chars:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [3]:
# Split on punctuation, "--" and whitespace (the capture group keeps the
# separators as tokens), then drop the pieces that are empty after stripping.
process_raw = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]
print(process_raw[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [4]:
# Deduplicate the tokens and put them in sorted order; the number of
# distinct tokens is the vocabulary size.
all_words = list(set(process_raw))
all_words.sort()
vocab_size = len(all_words)
print(vocab_size)

1130


In [5]:
# Map every token to its position in the sorted token list (token -> id).
vocab = dict(zip(all_words, range(len(all_words))))

In [6]:
# Preview the vocabulary: print [token, id] pairs and stop once id 30 is shown.
for token, token_id in vocab.items():
    print([token, token_id])
    if token_id == 30:
        break

['!', 0]
['"', 1]
["'", 2]
['(', 3]
[')', 4]
[',', 5]
['--', 6]
['.', 7]
[':', 8]
[';', 9]
['?', 10]
['A', 11]
['Ah', 12]
['Among', 13]
['And', 14]
['Are', 15]
['Arrt', 16]
['As', 17]
['At', 18]
['Be', 19]
['Begin', 20]
['Burlington', 21]
['But', 22]
['By', 23]
['Carlo', 24]
['Chicago', 25]
['Claude', 26]
['Come', 27]
['Croft', 28]
['Destroyed', 29]
['Devonshire', 30]


In [7]:
class basicTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? _ ! " ( ) '``. Every resulting token must be a key of the
    vocabulary; there is no fallback for unknown tokens.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and build the reverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = {i: token for token, i in vocab.items()}

    def encoder(self, text):
        """Convert ``text`` into a list of token ids.

        Raises:
            KeyError: if any token of ``text`` is not in the vocabulary.
        """
        # The capture group makes re.split keep the separators as tokens.
        process_raw = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Whitespace separators strip down to "" and are dropped here.
        process_raw = [item.strip() for item in process_raw if item.strip()]
        ids = [self.str_to_int[s] for s in process_raw]
        return ids

    def decoder(self, ids):
        """Convert a sequence of token ids back into a single string.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # ':' and ';' are split into their own tokens by encoder(), so they
        # have to be re-attached here as well, otherwise "a: b" decodes
        # as "a : b".
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text


In [8]:
# Round-trip check, part 1: encode a sample sentence with the V1 tokenizer.
tokenizer = basicTokenizerV1(vocab)

# The literal opens with four quote characters: the first three start the
# triple-quoted string, the fourth is the opening quote of the sentence itself.
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""

# Encode the sample and display only the first ten token ids.
test_encoder = tokenizer.encoder(text)
test_encoder[:10]

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126]

In [9]:
# Round-trip check, part 2: decode the ids back to text. The result is not
# identical to the input: the line break and indentation collapse to single
# spaces, and a space remains after the opening quote and the apostrophe.
test_decoder = tokenizer.decoder(test_encoder)
test_decoder

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [10]:
# Demonstration: the V1 tokenizer cannot encode words that are missing from
# the vocabulary. The KeyError is the point of this cell, so it is caught and
# displayed instead of aborting a "Restart & Run All".
tokenizer = basicTokenizerV1(vocab)

text_not_matching_vocab = """"Hello Worlds test."""

try:
    test_encoder = tokenizer.encoder(text_not_matching_vocab)
except KeyError as err:
    # str() of a KeyError is the repr of the missing key, e.g. 'Hello'.
    print("KeyError:", err)
else:
    print(test_encoder[:10])

KeyError: 'Hello'

In [None]:
### We need to add a token for handling tokens that are missing from our vocab.
### We also need a token that signals the end of a sequence (a boundary between unrelated texts).

In [11]:
# Rebuild the vocabulary with two special tokens placed after the sorted
# corpus tokens, so they receive the two highest ids.
all_tokens = sorted(set(process_raw)) + ["<|endoftext|>", "<|unk|>"]
vocab = dict(zip(all_tokens, range(len(all_tokens))))
len(vocab)

1132

In [12]:
# Show the last five [token, id] pairs; the special tokens come last.
for token, token_id in list(vocab.items())[-5:]:
    print([token, token_id])

['younger', 1127]
['your', 1128]
['yourself', 1129]
['<|endoftext|>', 1130]
['<|unk|>', 1131]


In [13]:
class basicTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Splitting rules: whitespace, ``--`` and the punctuation characters
    ``, . : ; ? _ ! " ( ) '`` each delimit (and, except whitespace, become)
    tokens.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and derive the id -> token lookup."""
        self.str_to_int = vocab
        self.int_to_str = {}
        for token, token_id in vocab.items():
            self.int_to_str[token_id] = token

    def encoder(self, text):
        """Return the list of token ids for ``text``.

        Tokens that are not in the vocabulary are encoded with the id of
        ``"<|unk|>"``.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Whitespace separators and empty split results carry no token.
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decoder(self, ids):
        """Return the text for ``ids``: tokens joined by spaces, with the
        space in front of punctuation removed."""
        joined = " ".join([self.int_to_str[i] for i in ids])
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

In [14]:
# Switch to the tokenizer that understands the special tokens.
tokenizer = basicTokenizerV2(vocab)

# Two unrelated snippets, glued together with the end-of-text marker.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = text1 + " <|endoftext|> " + text2

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [15]:
# Encode the joined text. Words missing from the vocabulary are encoded with
# the id of "<|unk|>" and the separator with the id of "<|endoftext|>".
test_encoder2 = tokenizer.encoder(text)
print(test_encoder2)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]


In [16]:
# Decode the ids again. The round trip is lossy: every out-of-vocabulary word
# comes back as the literal "<|unk|>" instead of the original word.
test_decoder2 = tokenizer.decoder(test_encoder2)
print(test_decoder2)

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.
