### WHITE SPACE TOKENIZERS

In [None]:
# Load the complete text of the book into memory as one string.
book_path = r"Game_of_thrones_books/001ssb.txt"
with open(book_path, encoding="utf-8") as f:
    raw_text = f.read()

# Quick sanity check: overall size plus the first 120 characters.
print("total numbers of charecters - ", len(raw_text))
print('\n', raw_text[:120])

total numbers of charecters -  1607894

 A Game Of Thrones 
Book One of A Song of Ice and Fire 
By George R. R. Martin 
PROLOGUE 
"We should start back," Gared u


In [None]:
#Checking the whitespace tokeniser
import re
text = "Hi, i am ismail khan, i love NLP!!"
# Split on punctuation, double dashes and whitespace; the capture group keeps
# the delimiters themselves in the result list.
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
# Drop empty strings and whitespace-only pieces.
result = list(filter(str.strip, result))
print(result)

['Hi', ',', 'i', 'am', 'ismail', 'khan', ',', 'i', 'love', 'NLP', '!', '!']


In [31]:
# Tokenize the whole book with the same splitting rule as the demo above:
# split on punctuation / "--" / whitespace, strip each piece, drop empties.
preprocessed = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]
print(preprocessed[:30])
print('\n', len(preprocessed))

['A', 'Game', 'Of', 'Thrones', 'Book', 'One', 'of', 'A', 'Song', 'of', 'Ice', 'and', 'Fire', 'By', 'George', 'R', '.', 'R', '.', 'Martin', 'PROLOGUE', '"', 'We', 'should', 'start', 'back', ',', '"', 'Gared', 'urged']

 367561


In [48]:
# Build the vocabulary: every distinct token gets an integer id.
# Tokens are sorted so the id assignment is deterministic; the two special
# tokens are appended last and therefore receive the highest ids.
all_tokens = sorted(set(preprocessed))
all_tokens += ["<|endoftext|>", "<|unk|>"]

# token -> id lookup table
vocab = dict(zip(all_tokens, range(len(all_tokens))))
vocab_size = len(all_tokens)



In [49]:
class SimpleTokenizerV1:
    """Word/punctuation-level tokenizer backed by a fixed vocabulary.

    Text is split on punctuation, double dashes and whitespace. Pieces that
    are not present in the vocabulary are replaced by ``"<|unk|>"``.
    """

    def __init__(self, vocab):
        """Store the token -> id mapping and derive the inverse mapping.

        Args:
            vocab: dict mapping each token string to its integer id.
        """
        self.str_to_int = vocab
        # Inverse lookup (id -> token) used by decode().
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: the string to tokenize.

        Returns:
            List of integer ids, one per non-empty token.

        Raises:
            KeyError: if a token is unknown and ``"<|unk|>"`` itself is not
                in the vocabulary.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Whitespace delimiters and empty split artifacts carry no token.
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then whitespace directly in
        front of the listed punctuation characters is removed. Because the
        pattern also contains ``"`` and ``(``, an opening quote or parenthesis
        is attached to the preceding token rather than the following one.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

In [50]:
tokenizer = SimpleTokenizerV1(vocab)

# Note: the sample text starts with a literal double-quote character,
# which is encoded as its own token.
text = '"Book One of A Song of Ice and Fire'
ids = tokenizer.encode(text)
print(ids)

[1, 801, 1900, 8961, 609, 2262, 8961, 1471, 2922, 1195]


In [51]:
# Map the ids back to text. decode() joins tokens with single spaces and only
# removes the space *before* punctuation, so the leading quote stays detached
# from the word that follows it.
tokenizer.decode(ids)

'" Book One of A Song of Ice and Fire'

In [58]:
# encode() replaces any token missing from the vocabulary with "<|unk|>";
# per the recorded output, this word falls into that case.
tokenizer.encode("ismail")

[13724]

So far, we have discussed tokenization as an essential step in processing text as input to
LLMs. Depending on the LLM, some researchers also consider additional special tokens such
as the following:

[BOS] (beginning of sequence): This token marks the start of a text. It
signifies to the LLM where a piece of content begins.

[EOS] (end of sequence): This token is positioned at the end of a text,
and is especially useful when concatenating multiple unrelated texts,
similar to <|endoftext|>. For instance, when combining two different
Wikipedia articles or books, the [EOS] token indicates where one article
ends and the next one begins.

[PAD] (padding): When training LLMs with batch sizes larger than one,
the batch might contain texts of varying lengths. To ensure all texts have
the same length, the shorter texts are extended or "padded" using the
[PAD] token, up to the length of the longest text in the batch.

In [55]:
tokenizer = SimpleTokenizerV1(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

# Concatenate two unrelated texts with an explicit <|endoftext|> separator.
text = f"{text1} <|endoftext|> {text2}"

print(text)
tokenizer.encode(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


[1401,
 7,
 5297,
 13706,
 8063,
 12250,
 608,
 13723,
 1485,
 12325,
 12051,
 13724,
 8961,
 12325,
 9158,
 24]

In [59]:
# Round trip: tokens that were missing from the vocabulary come back as
# "<|unk|>", so the original words cannot be recovered.
tokenizer.decode(tokenizer.encode(text))

'Hello, do you like tea? <|endoftext|> In the sunlit <|unk|> of the palace.'

### BYTE PAIR TOKENIZERS

In [61]:
import importlib
import tiktoken

In [62]:
# Load the "gpt2" encoding from tiktoken.
# NOTE: this rebinds `tokenizer`, which previously held the
# SimpleTokenizerV1 instance; cells above must not be re-run after this one
# without recreating it.
tokenizer = tiktoken.get_encoding("gpt2")

In [67]:
text = "ABRAKADABRA"

# "<|endoftext|>" is explicitly allowed as a special token (the sample text
# here does not contain it).
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[6242, 3861, 42, 2885, 6242, 3861]


In [73]:
# Decode every id on its own to show which sub-word piece it stands for.
for x in integers:
    print(tokenizer.decode([x]))

AB
RA
K
AD
AB
RA


In [74]:
# Decoding the whole id sequence at once reproduces the input string.
strings = tokenizer.decode(integers)

print(strings)

ABRAKADABRA
