#### Custom TokenizerV1 without handling unknown vocabulary tokens and <|endoftext|>

In [None]:
# creating the CustomTokenizer class 

import re

class CustomTokenizerV1:

    """
    Word-level tokenizer whose vocabulary is built from one text.

    The text is split on punctuation, ``--`` and whitespace. Each surviving
    token is numbered by its position in the token list, so a token that
    occurs several times is encoded with the position of its last occurrence,
    while every position still decodes back to its token.

    Tokens outside the vocabulary are not handled: ``encode`` raises
    ``KeyError`` for them.
    """
    def __init__(self, text: str):
        # Capturing group so that re.split keeps the separators as tokens.
        self.special_chars = r'([,.:;?_!"()\']|--|\s)'

        processed_text = re.split(self.special_chars, text)
        print("from init", processed_text)
        # Drop pure-whitespace and empty pieces produced by the split.
        self.tokens = [piece.strip() for piece in processed_text if piece.strip()]

        self.tokens_to_ids = {token: idx for idx, token in enumerate(self.tokens)}
        self.ids_to_tokens = {idx: token for idx, token in enumerate(self.tokens)}

    def _split_tokens(self, text: str) -> list:
        """Split text on the separator pattern and drop empty/whitespace pieces."""
        pieces = re.split(self.special_chars, text)
        return [piece.strip() for piece in pieces if piece.strip()]

    def encode(self, text: str) -> list:
        """
        Convert text into a list of token ids using the existing vocabulary.

        Args:
            text(str): string to tokenize

        Returns:
            token_ids(list): list of token_ids

        Raises:
            KeyError: if a token of ``text`` is not in the vocabulary
        """
        return [self.tokens_to_ids[token] for token in self._split_tokens(text)]

    def decode(self, token_ids) -> str:
        """
        Convert a list of token ids back into a string.

        Separator tokens (punctuation and ``--``) are appended directly to the
        preceding word; all other tokens are joined with single spaces.

        Args:
            tokens_ids(list): list of token_ids

        Returns:
            decoded text(str)

        Raises:
            KeyError: if an id is not in the vocabulary
        """
        tokens = [self.ids_to_tokens[token_id] for token_id in token_ids]
        decoded_text = []
        for token in tokens:
            # Match the token against the pattern itself. A plain substring test
            # on the pattern's source text would also catch ordinary tokens such
            # as "s", "-" or "|" that merely appear inside the regex.
            is_separator = re.fullmatch(self.special_chars, token) is not None
            if is_separator and decoded_text:
                decoded_text[-1] += token
            else:
                decoded_text.append(token)

        return " ".join(decoded_text)

In [None]:
# Build the vocabulary from one sentence, then round-trip that same sentence.
text = "The quick brown fox jumps over the lazy dog, this"
tokenizer = CustomTokenizerV1(text=text)
print(f"original Text: {text}")

token_ids = tokenizer.encode(text)
print(f"token_ids: {token_ids}")

decoded_text = tokenizer.decode(token_ids)
print(f"decoded text: {decoded_text}")

# Show the vocabulary: first only the tokens, then the full token -> id mapping.
print(f"tokens: {list(tokenizer.tokens_to_ids)}")
print(tokenizer.tokens_to_ids)

# The round trip is expected to reproduce the input sentence exactly.
assert decoded_text == text, "both text are not same"

In [None]:
# Decode a hand-picked id list and encode a short phrase.
# NOTE(review): if `tokenizer` is still the CustomTokenizerV1 built from the sample
# sentence in the previous cell, ids 3, 4, 7, 8, 9 and the words "this", "brown",
# "fox" are all in its vocabulary, so neither call hits the unknown-token path.

print(tokenizer.decode([3,4,7,8,9]))
print(tokenizer.encode(text="this brown fox"))

#### Custom TokenizerV2 handling unknown vocabulary (<|unk|>) and <|endoftext|>

In [None]:
# creating the CustomTokenizer class 

import re

class CustomTokenizerV2:

    """
    Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    The vocabulary is built from one text, split on punctuation, ``--`` and
    whitespace. Each token is numbered by its position in the token list, so a
    repeated token is encoded with the position of its last occurrence.
    ``<|unk|>`` is registered with the reserved id ``-100``.

    No id is reserved for ``<|endoftext|>``; it is only part of the vocabulary
    when it appears as a whitespace-separated token in the constructor text.
    """
    def __init__(self, text: str):
        # Capturing group so that re.split keeps the separators as tokens.
        self.special_chars = r'([,.:;?_!"()\']|--|\s)'

        processed_text = re.split(self.special_chars, text)
        print("processed text: ", processed_text)
        # Drop pure-whitespace and empty pieces produced by the split.
        self.tokens = [token.strip() for token in processed_text if token.strip()]

        print("from init", self.tokens)

        self.tokens_to_ids = {token: idx for idx, token in enumerate(self.tokens)}
        self.ids_to_tokens = {idx: token for idx, token in enumerate(self.tokens)}

        # Negative id so the unknown marker can never collide with a position.
        self.tokens_to_ids['<|unk|>'] = -100
        self.ids_to_tokens[-100] = '<|unk|>'

    def _split_tokens(self, text: str) -> list:
        """Split text on the separator pattern and drop empty/whitespace pieces."""
        pieces = re.split(self.special_chars, text)
        return [piece.strip() for piece in pieces if piece.strip()]

    def encode(self, text: str) -> list:
        """
        Convert text into a list of token ids using the existing vocabulary.

        Tokens that are not in the vocabulary are encoded as the ``<|unk|>`` id.

        Args:
            text(str): string to tokenize

        Returns:
            token_ids(list): list of token_ids
        """
        unk_id = self.tokens_to_ids['<|unk|>']
        # Dict lookup with a default replaces a linear scan of self.tokens per token.
        return [self.tokens_to_ids.get(token, unk_id) for token in self._split_tokens(text)]

    def decode(self, token_ids) -> str:
        """
        Convert a list of token ids back into a string.

        Separator tokens (punctuation and ``--``) are appended directly to the
        preceding word; all other tokens are joined with single spaces.

        Args:
            tokens_ids(list): list of token_ids

        Returns:
            decoded text(str)

        Raises:
            KeyError: if an id is not in the vocabulary
        """
        tokens = [self.ids_to_tokens[token_id] for token_id in token_ids]
        decoded_text = []
        for token in tokens:
            # Match the token against the pattern itself. A plain substring test
            # on the pattern's source text would also catch ordinary tokens such
            # as "s", "-" or "|" that merely appear inside the regex.
            is_separator = re.fullmatch(self.special_chars, token) is not None
            if is_separator and decoded_text:
                decoded_text[-1] += token
            else:
                decoded_text.append(token)

        return " ".join(decoded_text)

In [None]:
import os 
# Load the raw corpus. The encoding is given explicitly so that decoding does
# not depend on the platform's default locale.
with open("/home/maheshbabu/LLMArchitecture/wikipedia_ai.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Replace every newline with an <|endoftext|> / <|beginoftext|> marker pair,
# padded with spaces so the markers are separated from neighbouring words.
text = text.replace("\n", " <|endoftext|> <|beginoftext|> ")

In [None]:

# Build the V2 tokenizer from the loaded corpus and inspect its vocabulary.
tokenizer = CustomTokenizerV2(text=text)

print(tokenizer.tokens)
print(tokenizer.tokens_to_ids)

# Encode a phrase, then decode a hand-picked id list ending in the <|unk|> id (-100).
print(tokenizer.encode("this is another day called <|endoftext|>"))
print(tokenizer.decode([2, 5, 6, -100]))




In [105]:
# Round-trip one sentence through whichever object `tokenizer` is currently bound to.
# NOTE(review): the recorded output below contains the same ids for "maheshbabu" as the
# tiktoken cell, so this cell was presumably last run with the tiktoken encoding rather
# than CustomTokenizerV2 — confirm the execution order.
sample_text = "The tiger is the beautiful animal, but it is very dangerous in the forest. Who is maheshbabu, This is demo lecture from the vizuara"

print(f"original text: {sample_text}")
print(f"token ids: {tokenizer.encode(sample_text)}")
print(f"decoded text: {tokenizer.decode(tokenizer.encode(sample_text))}")

original text: The tiger is the beautiful animal, but it is very dangerous in the forest. Who is maheshbabu, This is demo lecture from the vizuara
token ids: [464, 26241, 318, 262, 4950, 5044, 11, 475, 340, 318, 845, 4923, 287, 262, 8222, 13, 5338, 318, 17266, 956, 71, 65, 397, 84, 11, 770, 318, 13605, 19143, 422, 262, 410, 47775, 3301]
decoded text: The tiger is the beautiful animal, but it is very dangerous in the forest. Who is maheshbabu, This is demo lecture from the vizuara


#### Creating input-output pairs with tiktoken, Dataset, and DataLoader

In [106]:
# using tiktoken

import tiktoken

# Rebind `tokenizer` to tiktoken's "gpt2" encoding; the cells below use this
# object instead of the custom tokenizer classes.
tokenizer = tiktoken.get_encoding("gpt2")

In [107]:
# Encode a short string, then decode a separate, hand-written id list.
# NOTE(review): "<|beginoftext|>" is presumably not a special token of the "gpt2"
# encoding, so listing it in allowed_special likely has no effect — confirm.
print(
    tokenizer.encode(
        text="who is maheshbabu",
        allowed_special={"<|endoftext|>", "<|beginoftext|>"},
    )
)
print(tokenizer.decode([10919, 318, 616, 1438]))

[8727, 318, 17266, 956, 71, 65, 397, 84]
what is my name


In [112]:
from torch.utils.data import Dataset, DataLoader
import torch


class CustomDatasetV1(Dataset):
    """
    Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with ``tokenizer``. A window of ``context_size``
    ids is then taken every ``stride`` positions; the target for each window
    is the same window shifted one position to the right.

    Args:
        txt: text passed to ``tokenizer.encode`` as ``text``
        tokenizer: object whose ``encode(text=..., allowed_special=...)``
            returns a sequence of token ids
        context_size: number of ids in every input and target window
        stride: distance between the start positions of consecutive windows
    """

    def __init__(self, txt, tokenizer, context_size=5, stride=4):
        token_ids = tokenizer.encode(text=txt, allowed_special={"<|endoftext|>"})
        print("Token ids:", token_ids)

        # Stop early enough that the shifted target window is always full length.
        window_starts = range(0, len(token_ids) - context_size, stride)
        self.input_ids = [
            torch.tensor(token_ids[start:start + context_size])
            for start in window_starts
        ]
        self.target = [
            torch.tensor(token_ids[start + 1:start + context_size + 1])
            for start in window_starts
        ]

        print("input ids: ", self.input_ids)
        print("target ids: ", self.target)

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the (input, target) tensor pair stored at ``index``."""
        return self.input_ids[index], self.target[index]
    

# Multi-line sample passage used to build the sliding-window dataset. The literal
# is kept exactly as written because its whitespace becomes part of the token stream.
sample_text = """ 
Trump receives standing ovation as he enters UFC event in Miami.
President Donald Trump entered to a standing ovation and cheers from a crowd of thousands attending a UFC event on Saturday night, shaking hands with supporters against a backdrop of fans waving his trademark MAGA hats.
Just as Trump entered, he greeted podcast host Joe Rogan, who sat to the right of the president.
On the other side of Trump sat Elon Musk, billionaire and chief of the Department of Government Efficiency.
Trump, who accented his dark suit with a bright yellow tie, pumped his fist in the air, prompting cheers to strains of “Taking Care of Business”.
"""
# Uses the class defaults: context_size=5, stride=4.
dataset = CustomDatasetV1(txt=sample_text, tokenizer=tokenizer)

Token ids: [220, 198, 6170, 11583, 5055, 267, 10473, 355, 339, 14170, 11448, 1785, 287, 8437, 13, 198, 10364, 3759, 1301, 5982, 284, 257, 5055, 267, 10473, 290, 34550, 422, 257, 4315, 286, 4138, 11969, 257, 11448, 1785, 319, 3909, 1755, 11, 17275, 2832, 351, 5941, 1028, 257, 26373, 286, 3296, 25849, 465, 16028, 28263, 32, 23910, 13, 198, 5703, 355, 1301, 5982, 11, 339, 21272, 9905, 2583, 5689, 8041, 272, 11, 508, 3332, 284, 262, 826, 286, 262, 1893, 13, 198, 2202, 262, 584, 1735, 286, 1301, 3332, 32451, 20119, 11, 18828, 290, 4039, 286, 262, 2732, 286, 5070, 45728, 13, 198, 6170, 11, 508, 697, 4714, 465, 3223, 6050, 351, 257, 6016, 7872, 9839, 11, 29104, 465, 18606, 287, 262, 1633, 11, 21550, 34550, 284, 21245, 286, 564, 250, 26556, 7276, 286, 7320, 447, 251, 13, 198]
input ids:  [tensor([  220,   198,  6170, 11583,  5055]), tensor([ 5055,   267, 10473,   355,   339]), tensor([  339, 14170, 11448,  1785,   287]), tensor([  287,  8437,    13,   198, 10364]), tensor([10364,  3759,  1301,

In [113]:
# Look at the second (input, target) window.
dataset[1]

(tensor([ 5055,   267, 10473,   355,   339]),
 tensor([  267, 10473,   355,   339, 14170]))

In [114]:

# Batch the windows four at a time, in their original order, dropping a final
# incomplete batch.
dataloader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=False,
    drop_last=True,
    num_workers=2,
)

# Fetch the first [inputs, targets] batch.
data = iter(dataloader)
next(data)

[tensor([[  220,   198,  6170, 11583,  5055],
         [ 5055,   267, 10473,   355,   339],
         [  339, 14170, 11448,  1785,   287],
         [  287,  8437,    13,   198, 10364]]),
 tensor([[  198,  6170, 11583,  5055,   267],
         [  267, 10473,   355,   339, 14170],
         [14170, 11448,  1785,   287,  8437],
         [ 8437,    13,   198, 10364,  3759]])]