## Creating the tokens

## <h4>A sample text file is used to illustrate tokenization</h4>

In [148]:
# Read the entire sample story into a single string.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    content = f.read()
print("total characters:", len(content))
print(content[:100])  # Print the first 100 characters for a quick check

total characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


<h5>Splitting text into tokens, keeping punctuation marks as separate tokens</h5>

In [149]:
import re
test = "Hello, This is Rizwan."
# The capturing group makes re.split return the delimiters too, so the
# punctuation marks survive as tokens; whitespace-only pieces are filtered out.
tokens = [
    piece
    for piece in re.split(r'([,.:;<>]|--|\s)', test)
    if piece.strip()
]
print(tokens)

['Hello', ',', 'This', 'is', 'Rizwan', '.']


<h5>Tokenizing the whole text file (here the punctuation is discarded, not kept)</h5>

In [150]:
# No capturing group here, so the delimiters (punctuation, whitespace, "--")
# are dropped rather than returned; empty and whitespace-only pieces are
# filtered out as well.
preprocessed_tokens = [
    piece
    for piece in re.split(r'[,<>.:;"\_()*\'?]|\s|--', content)
    if piece.strip()
]
print(preprocessed_tokens[:30])  # preview the first 30 tokens
print("total tokens:", len(preprocessed_tokens))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', 'though', 'a', 'good', 'fellow', 'enough', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', 'in', 'the', 'height', 'of']
total tokens: 3788


<h5>Unique tokens, sorted in ascending order</h5>

In [151]:
# Deduplicate through a set, then sort so the vocabulary order is stable.
words = list(set(preprocessed_tokens))
words.sort()
print("unique tokens:", len(words))

unique tokens: 1137


<h5>Printing the first 30 unique tokens</h5>

In [152]:
# Note: entries such as 'Gisburn!' appear because '!' is not one of the
# delimiters in the split pattern, so it stays attached to the word.
print(words[:30])

['A', 'Ah', 'Among', 'And', 'Are', 'Arrt', 'As', 'At', 'Be', 'Begin', 'Burlington', 'But', 'By', 'Carlo', 'Chicago', 'Claude', 'Come', 'Croft', 'Destroyed', 'Devonshire', 'Don', 'Dubarry', 'Emperors', 'Florence', 'For', 'Gallery', 'Gideon', 'Gisburn', 'Gisburn!', 'Gisburns']


<h5>Mapping each word to an integer ID</h5>

In [153]:
# Each entry's ID is simply its position in `words`.
word_to_id = dict(zip(words, range(len(words))))

<h5>Sample: the first 10 words and their IDs</h5>

In [155]:
# IDs follow the order of `words`, so the first ten entries map to 0 through 9.
for w in words[:10]:
    print(f"'{w}': {word_to_id[w]}")

'A': 0
'Ah': 1
'Among': 2
'And': 3
'Are': 4
'Arrt': 5
'As': 6
'At': 7
'Be': 8
'Begin': 9


## <h4>Version 1: a tokenizer that cannot handle unknown words</h4>

In [19]:
class TokenizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on a fixed set of punctuation characters, whitespace and
    ``--``; the delimiters themselves are discarded. A token that is not in
    the vocabulary makes ``encode`` raise ``KeyError``.
    """

    def __init__(self, words):
        """Build both lookup tables; an entry's position in ``words`` is its ID."""
        self.word_to_id = {}
        for position, entry in enumerate(words):
            self.word_to_id[entry] = position
        self.id_to_word = dict(enumerate(words))

    def encode(self, text):
        """Split ``text`` into words and return their IDs as a list."""
        ids = []
        for piece in re.split(r'[,<>.:;"\_()*\'?]|\s|--', text):
            # Adjacent delimiters produce empty pieces; skip them.
            if piece.strip() == '':
                continue
            ids.append(self.word_to_id[piece])
        return ids

    def decode(self, token_ids):
        """Return the words for ``token_ids`` joined by single spaces."""
        return ' '.join(map(self.id_to_word.__getitem__, token_ids))

In [20]:
# Encode a sentence with the V1 tokenizer and show the resulting IDs.
sample_text = """I looked at the donkey again. "Well, what do you think of that?" I asked."""
tokenizer = TokenizerV1(words)
encoded = tokenizer.encode(sample_text)
print(encoded)

[43, 639, 174, 990, 355, 134, 102, 1092, 349, 1133, 1000, 720, 989, 43, 173]


In [21]:
# Map the IDs back to words; punctuation dropped while encoding is not restored.
tokenizer.decode(token_ids=encoded)

'I looked at the donkey again Well what do you think of that I asked'

## <h4>So far we have implemented tokenization and detokenization</h4>

## <h4>However, Version 1 (V1) raises an error when it meets an unknown word</h4>

In [22]:
# Append the two special tokens after the sorted vocabulary so they receive
# the highest IDs.
all_tokens = sorted(set(preprocessed_tokens)) + ["<|endoftext|>", "<|unk|>"]

# NOTE: from here on `words` is a word -> id dict, no longer the sorted list.
words = dict(zip(all_tokens, range(len(all_tokens))))

In [113]:
# Vocabulary size after appending the two special tokens.
len(words)

1139

In [112]:
# Show the last three vocabulary entries to confirm that the special tokens
# sit at the end. The original wrapped this in enumerate() but never used the
# index, so the loop iterates over the items directly.
for item in list(words.items())[-3:]:
    print(item)

('yourself', 1136)
('<|endoftext|>', 1137)
('<|unk|>', 1138)


## <h3>Creating version 2 of the tokenizer with handling for unknown tokens</h3>

In [121]:
class TokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Special tokens of the form ``<|...|>`` are kept intact. All other text is
    split on a fixed set of punctuation characters, whitespace and ``--``, and
    those delimiters are discarded.
    """

    # One special token such as <|endoftext|> or <|unk|>. The capturing group
    # makes split() return the special tokens together with the text between.
    _SPECIAL_TOKEN = re.compile(r'(<\|[^|]*\|>)')
    # Delimiters separating ordinary words; they are dropped from the output.
    _DELIMITERS = re.compile(r'[,<>.:;"\_()*\'?]|\s|--')
    # Whitespace directly before a punctuation mark, removed when decoding.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([()!:;"\',.?])')
    _UNKNOWN = "<|unk|>"

    def __init__(self, words):
        """Store the vocabulary.

        Args:
            words: mapping from token string to integer ID. It must contain
                ``<|unk|>`` if texts with unknown words are to be encoded.
        """
        # Copy the mapping so that later changes to the caller's dict cannot
        # leave word_to_id and id_to_word out of sync.
        self.word_to_id = dict(words)
        self.id_to_word = {idx: word for word, idx in self.word_to_id.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Tokens missing from the vocabulary are replaced by ``<|unk|>``.

        Raises:
            KeyError: if an unknown token is met and ``<|unk|>`` itself is
                not in the vocabulary.
        """
        id_list = []
        for chunk in self._SPECIAL_TOKEN.split(text):
            if self._SPECIAL_TOKEN.fullmatch(chunk):
                # A special token is kept as a single unit.
                pieces = [chunk]
            else:
                # Ordinary text: split on delimiters and drop empty pieces.
                pieces = [p for p in self._DELIMITERS.split(chunk) if p.strip()]
            for piece in pieces:
                if piece not in self.word_to_id:
                    piece = self._UNKNOWN
                id_list.append(self.word_to_id[piece])
        return id_list

    def decode(self, token_ids):
        """Convert token IDs back into a space-separated string.

        Whitespace directly before a punctuation token is removed.

        Raises:
            KeyError: if an ID is not in the vocabulary.
        """
        text = ' '.join(self.id_to_word[idx] for idx in token_ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [122]:
# Words that are not in the vocabulary are encoded as the <|unk|> ID.
test = "this is Rizwan and this is a test with unknown word"
tokenizer = TokenizerV2(words)
encoded2 = tokenizer.encode(test)
print(encoded2)

[1001, 580, 1138, 151, 1001, 580, 109, 1138, 1112, 1138, 1122]


In [132]:

text1 = "Rizwan it is"
text2 = "this is Abhishek"
tokenizer = TokenizerV2(words)

# Join the two texts with the <|endoftext|> special token between them.
text = " <|endoftext|> ".join([text1, text2])
print(text)

Rizwan it is <|endoftext|> this is Abhishek


In [133]:
# Out-of-vocabulary names become the <|unk|> ID; <|endoftext|> keeps its own ID.
tokenizer.encode(text)

[1138, 581, 580, 1137, 1001, 580, 1138]

In [134]:
# Round trip: unknown words come back as the literal "<|unk|>" placeholder.
tokenizer.decode(tokenizer.encode(text))

'<|unk|> it is <|endoftext|> this is <|unk|>'