In [53]:
import numpy as np 
import re

In [54]:

text = "hello world, These is a text IN WHICH PHILEAS FOGG and PASSEPARTOUT ACCEPT EACH OTHER, THE ONE AS MASTER, THE OTHER A"
result = re.split(r'([.;,?_!]|--|\s)', text)

print(result)

result = [item.strip() for item in result if item.strip()]

print(result)

['hello', ' ', 'world', ',', '', ' ', 'These', ' ', 'is', ' ', 'a', ' ', 'text', ' ', 'IN', ' ', 'WHICH', ' ', 'PHILEAS', ' ', 'FOGG', ' ', 'and', ' ', 'PASSEPARTOUT', ' ', 'ACCEPT', ' ', 'EACH', ' ', 'OTHER', ',', '', ' ', 'THE', ' ', 'ONE', ' ', 'AS', ' ', 'MASTER', ',', '', ' ', 'THE', ' ', 'OTHER', ' ', 'A']
['hello', 'world', ',', 'These', 'is', 'a', 'text', 'IN', 'WHICH', 'PHILEAS', 'FOGG', 'and', 'PASSEPARTOUT', 'ACCEPT', 'EACH', 'OTHER', ',', 'THE', 'ONE', 'AS', 'MASTER', ',', 'THE', 'OTHER', 'A']


In [55]:
with open("./fiction_stories.txt", "r", encoding= "utf-8") as f:
    raw_text = f.read()


preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['Chapter', 'I', 'IN', 'WHICH', 'PHILEAS', 'FOGG', 'AND', 'PASSEPARTOUT', 'ACCEPT', 'EACH', 'OTHER', ',', 'THE', 'ONE', 'AS', 'MASTER', ',', 'THE', 'OTHER', 'AS', 'MAN', 'Mr', '.', 'Phileas', 'Fogg', 'lived', ',', 'in', '1872', ',']


In [56]:
len(preprocessed)

322243

In [57]:
""" 
REMOVING WHITE SPACE OR NOT
When developing a simple tokenizer, weather we should encode whitespaces as separate or just remove depends on our application abd its requirement. Removing whitespaces reduces the memory and computing requirement However, keeping whitespaces can be useful if we train the model that are sensitive to indentation and spacing 
"""

' \nREMOVING WHITE SPACE OR NOT\nWhen developing a simple tokenizer, weather we should encode whitespaces as separate or just remove depends on our application abd its requirement. Removing whitespaces reduces the memory and computing requirement However, keeping whitespaces can be useful if we train the model that are sensitive to indentation and spacing \n'

In [58]:
# vocab contain unique token and arranged alphabetically
# Note: unk and endoftext is special contain context token
# we have modified the token to deal with unknown words 

In [59]:
preprocessed = sorted(set(preprocessed))

#print(preprocessed)


preprocessed.append("|unk|")
preprocessed.append("|endoftext|")
print(preprocessed[-20:])

# print(f"size of vocab {len(preprocessed)}")

['“‘Excellent', '“‘Frankenstein', '“‘Great', '“‘Heaven', '“‘Hideous', '“‘How', '“‘I', '“‘It', '“‘May', '“‘Near', '“‘No', '“‘That', '“‘They', '“‘Where', '”', '”]', '”—and', '…', '|unk|', '|endoftext|']


In [60]:

# converting token into token id
vocab = {token:id for id, token in enumerate(preprocessed)}

#vocab = {token:integer for integer,token in enumerate(all_words)}

print(vocab)



In [65]:
for i, item in enumerate(vocab.items()):
    if i > 91:
        print(item)

('6th', 92)
('7', 93)
('7th', 94)
('8', 95)
('80', 96)
('80th', 97)
('8th', 98)
('9', 99)
('9th', 100)
(':', 101)
(';', 102)
('?', 103)
('A', 104)
('ABOUT', 105)
('ACCEPT', 106)
('ACROBATIC', 107)
('ACROSS', 108)
('AGE', 109)
('AIDS', 110)
('AMERICAN', 111)
('AN', 112)
('AND', 113)
('ANTIPODES', 114)
('ANYBODY', 115)
('APPEARS', 116)
('ARE', 117)
('AROUND', 118)
('AS', 119)
('ASTOUNDS', 120)
('AT', 121)
('ATTRACTION', 122)
('Abandoning', 123)
('Abbey', 124)
('Abbey”', 125)
('Able-bodied', 126)
('About', 127)
('Above', 128)
('Abraham', 129)
('Abruptly', 130)
('Absence', 131)
('Absolute', 132)
('Absolutely', 133)
('Academy', 134)
('According', 135)
('Accordingly', 136)
('Across', 137)
('Adam', 138)
('Adam’s', 139)
('Addlestone', 140)
('Aden', 141)
('Adieu', 142)
('Admitted', 143)
('Admitting', 144)
('Advancement', 145)
('Affects', 146)
('Afraid', 147)
('Africa', 148)
('African', 149)
('After', 150)
('Afterwards', 151)
('Again', 152)
('Against', 153)
('Agatha', 154)
('Age', 155)
('Ages', 

In [62]:

class simpleTokenizer:
    def __init__(self, vocab):
        self.str_to_int = vocab  #subwords to unique tokens
        self.int_to_str = {s:i for i, s in vocab.items()}

    def encode(self,text):
        print(text)
        preprocessed = re.split(r'[.,:;?!"()]|--|\s', text)
        print(preprocessed)

        preprocessed = [i.strip() for i in preprocessed if i.strip()]
        # ids = [self.str_to_int[s] for s in preprocessed]
        ids = []
        id = 0

        for s in preprocessed :
            if s not in self.str_to_int :
                id = self.str_to_int["|unk|"]
            else :
                id = self.str_to_int[s]

        # input text-> subword unknown -> id otherwise dict[s] -> id ids.append(id)
            ids.append(id)

        ids.append(self.str_to_int["|endoftext|"])

        # preprocessed = [self.int_to_str[id] for id in ids]

        return ids
    
    def decode(self, ids): #dict[int] ->str
        # text = " ".join

        text = " ".join(self.int_to_str[id] for id in ids)

        # replace spaces before specified punctuation
        text = re.sub(r'\s+([,.?!()\'])', r'\1', text)

        return text
    


In [63]:

tokenizer = simpleTokenizer(vocab)
ids = tokenizer.encode(text)
print(ids)
print(tokenizer.decode(ids))


hello world, These is a text IN WHICH PHILEAS FOGG and PASSEPARTOUT ACCEPT EACH OTHER, THE ONE AS MASTER, THE OTHER A
['hello', 'world', '', 'These', 'is', 'a', 'text', 'IN', 'WHICH', 'PHILEAS', 'FOGG', 'and', 'PASSEPARTOUT', 'ACCEPT', 'EACH', 'OTHER', '', 'THE', 'ONE', 'AS', 'MASTER', '', 'THE', 'OTHER', 'A']
[18720, 18150, 2343, 10163, 2683, 16422, 1211, 2518, 1780, 866, 3149, 1776, 106, 741, 1711, 2291, 1707, 119, 1464, 2291, 1711, 104, 18721]
|unk| world These is a text IN WHICH PHILEAS FOGG and PASSEPARTOUT ACCEPT EACH OTHER THE ONE AS MASTER THE OTHER A |endoftext|
