In [2]:
import re

In [None]:
# Read the entire source text into memory as a single string
with open("data/the-verdict.txt", mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

# Number of characters in the loaded text
len(raw_text)

20479

In [4]:
def re_tokenization(text):
    """Split ``text`` into a list of word and punctuation tokens.

    The text is split on single punctuation characters, on ``--`` and on
    whitespace. Because the pattern is a capture group, the separators
    themselves are kept as tokens; pieces that are empty or whitespace-only
    are then discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty, whitespace-stripped token strings in the order
        they appear in ``text``.
    """
    pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)

    tokens = []
    for piece in pieces:
        cleaned = piece.strip()
        if cleaned:  # skip empty strings and whitespace separators
            tokens.append(cleaned)

    return tokens

In [5]:
# Tokenizer that splits text with a regular expression and maps tokens to integer IDs using a given vocab dictionary
class SimpleTokenizer:
    """Word-level tokenizer backed by a fixed token -> integer vocabulary.

    ``encode`` splits text on punctuation, ``--`` and whitespace and maps each
    token to its integer ID; tokens missing from the vocabulary are mapped to
    the ID of ``"<|unk|>"``. ``decode`` maps IDs back to tokens, joins them
    with single spaces and removes the space in front of punctuation.
    """

    # Separators are captured so punctuation survives the split as tokens.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Includes ':' and ';' so every closing punctuation mark that encode()
    # splits off is re-attached to the preceding word on decode.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')
    _UNKNOWN_TOKEN = "<|unk|>"

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (ID -> token) mapping.

        Args:
            vocab: Mapping from token string to integer ID.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of integer token IDs.

        Args:
            text: The string to tokenize and encode.

        Returns:
            A list with one integer ID per token, in text order.

        Raises:
            KeyError: If a token is not in the vocabulary and the vocabulary
                has no ``"<|unk|>"`` entry to fall back on.
        """
        pieces = self._SPLIT_PATTERN.split(text)

        # Drop empty strings and whitespace-only separators.
        tokens = [piece.strip() for piece in pieces if piece.strip()]

        # Fallback ID for out-of-vocabulary tokens (None if not available).
        unk_id = self.str_to_int.get(self._UNKNOWN_TOKEN)

        ids = []
        for token in tokens:
            token_id = self.str_to_int.get(token, unk_id)
            if token_id is None:
                raise KeyError(
                    f"token {token!r} is not in the vocabulary and the "
                    f"vocabulary has no {self._UNKNOWN_TOKEN!r} entry"
                )
            ids.append(token_id)

        return ids

    def decode(self, ids):
        """Convert a sequence of integer token IDs back into a string.

        Args:
            ids: Iterable of integer IDs present in the vocabulary.

        Returns:
            The tokens joined by single spaces, with whitespace removed in
            front of punctuation characters.

        Raises:
            KeyError: If an ID is not present in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])

        # Remove the space that the join inserted before punctuation.
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)


In [6]:
# Split the raw text into tokens
re_tokens = re_tokenization(raw_text)

# Build the vocabulary:
# de-duplicate the tokens with a set, sort them,
# then append the special tokens at the end
all_tokens = sorted(set(re_tokens))
all_tokens += ["<|unk|>", "<|endoftext|>"]
vocab_size = len(all_tokens)
print(vocab_size)

# Map every token to its position in the sorted list
vocab = dict(zip(all_tokens, range(vocab_size)))

# Instantiate the tokenizer with this vocabulary
tokenizer = SimpleTokenizer(vocab)

print(len(tokenizer.str_to_int))

1132
1132


In [7]:
# Encode the full text into integer token IDs and print the resulting list

print(tokenizer.encode(raw_text))

[53, 44, 149, 1003, 57, 38, 818, 115, 256, 486, 6, 1002, 115, 500, 435, 392, 6, 908, 585, 1077, 709, 508, 961, 1016, 663, 1016, 535, 987, 5, 568, 988, 538, 722, 549, 496, 5, 533, 514, 370, 549, 748, 5, 661, 115, 841, 1102, 5, 157, 397, 547, 568, 115, 1066, 727, 988, 84, 7, 3, 99, 53, 818, 1003, 585, 1120, 530, 208, 85, 734, 34, 7, 4, 1, 93, 538, 722, 549, 496, 1, 6, 987, 1077, 1089, 988, 1112, 242, 585, 7, 53, 244, 535, 67, 7, 37, 100, 6, 549, 602, 25, 897, 6, 326, 549, 1042, 116, 7, 1, 73, 297, 585, 2, 850, 498, 1016, 866, 988, 1059, 722, 697, 769, 2, 1083, 1051, 9, 239, 53, 359, 2, 970, 998, 722, 987, 5, 66, 7, 83, 6, 988, 646, 1016, 16, 584, 145, 53, 998, 722, 7, 1, 93, 1116, 5, 727, 67, 7, 100, 2, 850, 633, 5, 693, 586, 114, 847, 114, 177, 1002, 994, 1088, 827, 568, 156, 389, 1069, 722, 677, 7, 14, 585, 1077, 711, 731, 988, 67, 7, 101, 1097, 688, 7, 45, 711, 988, 410, 50, 28, 5, 180, 988, 602, 40, 36, 882, 5, 929, 663, 209, 38, 2, 850, 1, 65, 1, 1016, 856, 5, 1108, 976, 568, 539, 4