In [4]:
# Load the raw corpus; the explicit encoding keeps the read independent of the
# platform's default codec.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Slice with [:100] so the preview really contains the 100 characters the label
# promises (the previous [:99] printed one character too few).
print("The first 100 characters are : ", text[:100])

print("The total number of characters in this text is : ", len(text))

The first 100 characters are :  I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 
The total number of characters in this text is :  20479


In [5]:
import re

samp_text = "Hello, World. This, is a test."
# The pattern is a raw string so that `\s` reaches the regex engine untouched
# instead of being an invalid Python string escape (a warning on modern Python).
# The capturing group makes re.split keep the whitespace separators in the output.
result = re.split(r'(\s)', samp_text)
print(result)

['Hello,', ' ', 'World.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


##### Now let's say we also want to split on commas and periods

In [10]:
# Split on commas, periods and whitespace. The pattern is a raw string so that
# `\s` is passed to the regex engine as-is rather than parsed as a string escape.
result = re.split(r'([.,]|\s)', samp_text)
print(result)

# re.split leaves empty strings and whitespace-only separators in the list;
# keep only the items that still have content after stripping.
new_result = [item for item in result if item.strip()]
print(new_result)


['Hello', ',', '', ' ', 'World', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']
['Hello', ',', 'World', '.', 'This', ',', 'is', 'a', 'test', '.']


In [12]:
samp_text2 = "Hello, World! I am new to this-- do you know how to code?"
# Extended separator set: single punctuation characters, the double dash, or
# whitespace. Written as a raw string so `\s` and `\'` are handled by the regex
# engine instead of Python's string-escape processing.
result = re.split(r'([,.?/!;:"()\']|--|\s)', samp_text2)
print(result)

['Hello', ',', '', ' ', 'World', '!', '', ' ', 'I', ' ', 'am', ' ', 'new', ' ', 'to', ' ', 'this', '--', '', ' ', 'do', ' ', 'you', ' ', 'know', ' ', 'how', ' ', 'to', ' ', 'code', '?', '']


### Now, let's proceed to tokenizing the-verdict.txt

In [28]:
# Tokenize the whole corpus: split on punctuation, the double dash and whitespace
# (raw string so `\s` is a regex escape, not an invalid string escape).
preprocessed = re.split(r'([,.?/!;:"()_\']|--|\s)', text)
# Strip each piece once, then drop the pieces that end up empty.
preprocessed = [piece for piece in (item.strip() for item in preprocessed) if piece]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [29]:
# Total number of tokens in the corpus, duplicates included.
print(len(preprocessed))

4690


### Next, we will assign token IDs to these tokens

In [30]:
# De-duplicate the corpus tokens, then order them so ID assignment is deterministic.
tokens = list(set(preprocessed))
tokens.sort()
print(len(tokens))

1130


In [31]:
# Map every token to its position in the sorted token list.
vocab = {token: idx for idx, token in enumerate(tokens)}
 

In [52]:
class TokenizerV1:
    """Word-level tokenizer backed by a fixed token -> ID vocabulary.

    Text is split on punctuation, the double dash and whitespace. Every
    resulting token must be present in the vocabulary; unknown tokens raise
    ``KeyError``.
    """

    # Raw strings so the regex escapes reach the regex engine unchanged instead
    # of being treated as (invalid) Python string escapes.
    _SPLIT_PATTERN = r'([,.?/!;:"()_\']|--|\s)'
    _SPACE_BEFORE_PUNCT_PATTERN = r'\s+([,.?/!;:"()_\']|--)'

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (ID -> token) lookup.

        Args:
            vocab: dict mapping token strings to integer IDs.
        """
        self.str_to_int = vocab
        self.int_to_str = self.create_inverse_vocab(vocab)

    def create_inverse_vocab(self, vocab):
        """Return a new dict mapping each ID in ``vocab`` back to its token."""
        return {token_id: token for token, token_id in vocab.items()}

    def encode(self, sentence):
        """Convert ``sentence`` into a list of token IDs.

        Args:
            sentence: text to tokenize.

        Returns:
            List of integer IDs, one per non-empty token.

        Raises:
            KeyError: if a token is not in the vocabulary.
        """
        pieces = re.split(self._SPLIT_PATTERN, sentence)
        # Strip once per piece and discard pieces that are empty afterwards.
        words = [word for word in (piece.strip() for piece in pieces) if word]
        return [self.str_to_int[word] for word in words]

    def decode(self, token_id_sentence):
        """Convert a sequence of token IDs back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        punctuation and the double dash is removed.

        Raises:
            KeyError: if an ID is not in the vocabulary.
        """
        sentence = " ".join(self.int_to_str[token_id] for token_id in token_id_sentence)
        return re.sub(self._SPACE_BEFORE_PUNCT_PATTERN, r'\1', sentence)

In [57]:
# Build a V1 tokenizer from the vocabulary and encode a sample sentence into token IDs.
tokenizer = TokenizerV1(vocab)
sentence = "the, brown man is good."
tokenized_sent = tokenizer.encode(sentence)
print(tokenized_sent)

[988, 5, 235, 656, 584, 500, 7]


In [58]:
# Round-trip check: decoding the IDs should give back the sample sentence.
tokenizer.decode(tokenized_sent)

'the, brown man is good.'

### But there is one limitation: out-of-vocabulary (OOV) words need to be handled. Also, if multiple sources are used to build the corpus, we might want to insert an end-of-sentence token (`<EOS>`) between them. So, we'll modify the previous section.

In [69]:
# Sorted unique corpus tokens, with the two special tokens placed after them so
# they receive the highest IDs.
tokens = sorted(set(preprocessed)) + ['<UNK>', '<EOS>']
print(len(tokens))

1132


In [70]:
# Rebuild the token -> ID mapping from the extended token list.
vocab = {token: idx for idx, token in enumerate(tokens)}


In [71]:
# Show the last five vocabulary entries; the special tokens should appear at the end.
list(vocab.items())[-5:]

[('younger', 1127),
 ('your', 1128),
 ('yourself', 1129),
 ('<UNK>', 1130),
 ('<EOS>', 1131)]

In [72]:
class TokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<UNK>``.

    Text is split on punctuation, the double dash and whitespace. Tokens that
    are missing from the vocabulary are encoded with the ID of ``"<UNK>"``.
    """

    # Raw strings so the regex escapes reach the regex engine unchanged instead
    # of being treated as (invalid) Python string escapes.
    _SPLIT_PATTERN = r'([,.?/!;:"()_\']|--|\s)'
    _SPACE_BEFORE_PUNCT_PATTERN = r'\s+([,.?/!;:"()_\']|--)'

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (ID -> token) lookup.

        Args:
            vocab: dict mapping token strings to integer IDs. It needs a
                ``"<UNK>"`` entry for unknown tokens to be encodable.
        """
        self.str_to_int = vocab
        self.int_to_str = self.create_inverse_vocab(vocab)

    def create_inverse_vocab(self, vocab):
        """Return a new dict mapping each ID in ``vocab`` back to its token."""
        return {token_id: token for token, token_id in vocab.items()}

    def encode(self, sentence):
        """Convert ``sentence`` into a list of token IDs.

        Args:
            sentence: text to tokenize.

        Returns:
            List of integer IDs, one per non-empty token; unknown tokens are
            replaced by the ID of ``"<UNK>"``.

        Raises:
            KeyError: if an unknown token is met and the vocabulary has no
                ``"<UNK>"`` entry.
        """
        pieces = re.split(self._SPLIT_PATTERN, sentence)
        # Strip once per piece and discard pieces that are empty afterwards.
        words = [word for word in (piece.strip() for piece in pieces) if word]
        # Substitute the unknown marker and look up the ID in a single pass.
        return [
            self.str_to_int[word if word in self.str_to_int else "<UNK>"]
            for word in words
        ]

    def decode(self, token_id_sentence):
        """Convert a sequence of token IDs back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        punctuation and the double dash is removed.

        Raises:
            KeyError: if an ID is not in the vocabulary.
        """
        sentence = " ".join(self.int_to_str[token_id] for token_id in token_id_sentence)
        return re.sub(self._SPACE_BEFORE_PUNCT_PATTERN, r'\1', sentence)

In [73]:
# Switch to the V2 tokenizer and encode a sample sentence; TokenizerV2.encode
# substitutes "<UNK>" for any word that is not in the vocabulary.
tokenizer = TokenizerV2(vocab)
sentence = "the, brown fox is good."
tokenized_sent = tokenizer.encode(sentence)
print(tokenized_sent)

[988, 5, 235, 1130, 584, 500, 7]


In [74]:
# Decode the IDs back to text; words encoded as unknown come back as the literal "<UNK>".
tokenizer.decode(tokenized_sent)

'the, brown <UNK> is good.'

In [82]:
# Two independent text snippets that will be joined into one input.
text1 = "Hello, do you like tea?"
text2 = "In the sunlight terraces of the palace"

# Separate the snippets with the <EOS> marker.
# NOTE(review): this rebinds `text`, which earlier held the full corpus read
# from the-verdict.txt; re-running the corpus tokenization cell after this one
# would operate on this short string instead. Consider a distinct name.
text = " <EOS> ".join([text1, text2])
print(text)

Hello, do you like tea? <EOS> In the sunlight terraces of the palace


In [83]:
# Encode the joined text. "<EOS>" contains none of the split characters, so it
# stays a single token and is looked up in the vocabulary like any other word.
tokenized_sent = tokenizer.encode(text)
print(tokenized_sent)

[1130, 5, 355, 1126, 628, 975, 10, 1131, 55, 988, 1130, 984, 722, 988, 1130]


In [84]:
# Decode the IDs of the joined text back into a string.
tokenizer.decode(tokenized_sent)

'<UNK>, do you like tea? <EOS> In the <UNK> terraces of the <UNK>'