In [55]:
# LIBRARIES REQUIRED
import numpy as np 
import json
import pickle
import os 
import ast

In [2]:
# SAMPLE TEXT : VIRAT KOHLI - WIKI
text = "Virat Kohli (born 5 November 1988)[a] is an Indian international cricketer who plays ODI cricket for the national team and is a former captain in all formats.[3] He is a right-handed batsman and occasional right-arm medium pace bowler. Considered one of the greatest all-format batsmen in the history of cricket, he is called the King, the Chase Master, and the Run Machine for his skills, records and ability to lead his team to victory.[4] Kohli is the highest run-scorer in the Indian Premier League, third in T20I, third in ODI, and third in international cricket.[5] He has the most ODI centuries and second-most centuries in international cricket, with a total of 82 centuries across all international formats of the game.[6] Kohli is also the most successful Test captain of India with back-to-back Test mace wins and most victories in his tenure.[7] He is the only batter to earn 900 rating points in all three formats.[8] Kohli was the captain of the 2008 U19 World Cup winning team and was a crucial member of the teams that won 2011 ODI World Cup, 2013 Champions Trophy, 2024 T20 World Cup, and 2025 Champions Trophy. He plays for Royal Challengers Bengaluru in the Indian Premier League and for Delhi in domestic cricket. In 2013, Kohli was ranked number one in the ODI batting rankings. In 2015, he achieved the same in T20I.[9] In 2018, he was ranked number one in Test, making him the only Indian to hold the number one spot in all three formats. He is the first player to score 20,000 runs in a decade. He was the Cricketer of the Decade for 2011 to 2020.[10] Kohli has won ten ICC Awards, making him the most awarded player in international cricket history. He won the ODI Player of the Year award four times in 2012, 2017, 2018, and 2023. He won the Cricketer of the Year award, on two occasions, in 2017 and 2018. In 2018, he became the first player to win all three major awards including Cricketer of the Year, ODI Player of the Year and Test Player of the Year in the same year. He was honored with the Spirit of Cricket Award in 2019 and given the Cricketer of the Decade and ODI Cricketer of the Decade in 2020. Kohli was named the Wisden Leading Cricketer in the World for three consecutive years. Kohli has the most Player of the Series and second most Player of the Match awards to his name in all three formats combined. He was honoured with the Arjuna Award in 2013, the Padma Shri in 2017, and India's highest sporting honour, the Khel Ratna Award, in 2018. Time included him on its 100 most influential people in the world list in 2018.After winning the 2024 T20 World Cup and winning the Player of the Match award in the final, Kohli announced his retirement from T20Is.[11] On 12 May 2025, aged 36, he announced his retirement from the Test format.[12] He is married to actress Anushka Sharma, and they have two children.[13]"

In [3]:
# Total number of characters
len(text)

2858

In [4]:
# Convert each character in the input text to its corresponding byte (integer) using UTF-8 encoding
# Example: "hello" -> [104, 101, 108, 108, 111]
tokens = list(text.encode("utf-8"))


In [5]:
# Print the list of UTF-8 encoded byte values (integers)
# Helps visualize how the original text is represented at the byte level
print(tokens)


[86, 105, 114, 97, 116, 32, 75, 111, 104, 108, 105, 32, 40, 98, 111, 114, 110, 32, 53, 32, 78, 111, 118, 101, 109, 98, 101, 114, 32, 49, 57, 56, 56, 41, 91, 97, 93, 32, 105, 115, 32, 97, 110, 32, 73, 110, 100, 105, 97, 110, 32, 105, 110, 116, 101, 114, 110, 97, 116, 105, 111, 110, 97, 108, 32, 99, 114, 105, 99, 107, 101, 116, 101, 114, 32, 119, 104, 111, 32, 112, 108, 97, 121, 115, 32, 79, 68, 73, 32, 99, 114, 105, 99, 107, 101, 116, 32, 102, 111, 114, 32, 116, 104, 101, 32, 110, 97, 116, 105, 111, 110, 97, 108, 32, 116, 101, 97, 109, 32, 97, 110, 100, 32, 105, 115, 32, 97, 32, 102, 111, 114, 109, 101, 114, 32, 99, 97, 112, 116, 97, 105, 110, 32, 105, 110, 32, 97, 108, 108, 32, 102, 111, 114, 109, 97, 116, 115, 46, 91, 51, 93, 32, 72, 101, 32, 105, 115, 32, 97, 32, 114, 105, 103, 104, 116, 45, 104, 97, 110, 100, 101, 100, 32, 98, 97, 116, 115, 109, 97, 110, 32, 97, 110, 100, 32, 111, 99, 99, 97, 115, 105, 111, 110, 97, 108, 32, 114, 105, 103, 104, 116, 45, 97, 114, 109, 32, 109, 101, 1

In [6]:
# Calculate frequency of all adjacent token pairs in the input list.
# This is the core step in BPE to identify which pair of tokens to merge.

def get_stats(tokens): 
    freq = {}
    for i in range(len(tokens) - 1): 
        pair = tokens[i], tokens[i + 1]  # Create a pair from each adjacent token
        freq[pair] = freq.get(pair, 0) + 1  # Count how often each pair appears
    return freq  # Return a dictionary: pair -> frequency


In [7]:
# Compute the frequency of all adjacent byte pairs in the initial token list
# This will be used to determine which pair should be merged first in the BPE process
vocab = get_stats(tokens)

In [9]:
# Get the total number of unique adjacent token pairs in the current vocabulary
# This gives an idea of how many distinct mergable token pairs exist
len(vocab)


417

In [10]:
print(vocab)

{(86, 105): 1, (105, 114): 9, (114, 97): 5, (97, 116): 24, (116, 32): 28, (32, 75): 11, (75, 111): 9, (111, 104): 9, (104, 108): 9, (108, 105): 11, (105, 32): 11, (32, 40): 1, (40, 98): 1, (98, 111): 2, (111, 114): 31, (114, 110): 7, (110, 32): 58, (32, 53): 1, (53, 32): 2, (32, 78): 1, (78, 111): 1, (111, 118): 1, (118, 101): 5, (101, 109): 6, (109, 98): 6, (98, 101): 6, (101, 114): 37, (114, 32): 39, (32, 49): 3, (49, 57): 3, (57, 56): 1, (56, 56): 1, (56, 41): 1, (41, 91): 1, (91, 97): 1, (97, 93): 1, (93, 32): 11, (32, 105): 48, (105, 115): 19, (115, 32): 50, (32, 97): 46, (97, 110): 31, (32, 73): 11, (73, 110): 10, (110, 100): 28, (100, 105): 9, (105, 97): 8, (105, 110): 62, (110, 116): 12, (116, 101): 22, (110, 97): 18, (116, 105): 15, (105, 111): 10, (111, 110): 29, (97, 108): 23, (108, 32): 19, (32, 99): 18, (99, 114): 9, (114, 105): 24, (105, 99): 17, (99, 107): 16, (107, 101): 16, (101, 116): 16, (32, 119): 23, (119, 104): 1, (104, 111): 5, (111, 32): 13, (32, 112): 8, (112, 

In [11]:
# Find the most frequent token pair to merge next:
# 1. Convert the vocab dict to a list of tuples (frequency, pair)
# 2. Sort the list in descending order of frequency
# 3. Take the first element (most frequent), and extract the pair part

freq_pair = max(sorted([(v, k) for k, v in vocab.items()], reverse=True))[1]
freq_pair  # Display the most frequent pair to be merged


(101, 32)

In [12]:
# Merge all occurrences of the specified pair in the token list with a new token ID.

def merge(tokens, pair, idx): 
    new_tokens = []  # Holds the updated tokens after merging
    i = 0
    while i < len(tokens): 
        # If the current and next token match the target pair
        if i < len(tokens) - 1 and (tokens[i], tokens[i + 1]) == pair:
            new_tokens.append(idx)  # Replace the pair with the new token ID
            i += 2  # Skip the next token since it's part of the pair
        else:
            new_tokens.append(tokens[i])  # Keep the current token
            i += 1
    return new_tokens  # Return the updated token list after the merge


In [13]:
# Test the merge function with a sample input:
# Attempting to merge the pair (77, 9) by replacing it with token ID 89
# Since the pair (77, 9) does NOT exist in the list, the output will be the same as the input

merge([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], (77, 9), 89)
# Output: [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]


[1, 2, 3, 4, 5, 6, 7, 8, 9, 0]

In [14]:
# This time, the pair (7, 8) exists in the list at position 6 and 7.
# It will be replaced by the token ID 89.
# So, [7, 8] will be replaced with [89], and the rest of the list remains unchanged.

merge([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], (7, 8), 89)
# Output: [1, 2, 3, 4, 5, 6, 89, 9, 0]


[1, 2, 3, 4, 5, 6, 89, 9, 0]

In [15]:
# Perform Byte Pair Encoding (BPE) on the input token list to learn merge rules.

def bpe(vocab_size, original_tokens): 
    tokens = list(original_tokens)  # Copy the input tokens to avoid modifying original

    # BPE requires vocab_size > 256 (first 256 are reserved for byte-level encoding)
    # Also skip if not enough data for meaningful merging
    if vocab_size <= 256 or vocab_size > len(tokens):
        return tokens, None

    idx = 256  # Start assigning new token IDs from 256
    merge_dict = {}  # To store merge rules: (pair) → new token ID
    n_merges = vocab_size - 256  # Number of merges required to reach target vocab size

    for _ in range(n_merges):
        vocab = get_stats(tokens)  # Get frequency of token pairs
        # Select most frequent token pair to merge
        freq_pair = max(sorted([(v, k) for k, v in vocab.items()], reverse=True))[1]
        merge_dict[freq_pair] = idx  # Store merge rule
        tokens = merge(tokens, freq_pair, idx)  # Apply the merge
        idx += 1  # Increment token ID for next merge

    # Return the final merge dictionary, the last pair frequencies, and final token sequence
    return merge_dict, vocab, tokens


In [16]:
# Set the desired vocabulary size (must be > 256 to allow merges)
# Why > 256? → The first 256 token IDs are reserved for all possible single-byte UTF-8 values (0–255).
# Any BPE merges start assigning new token IDs from 256 onward.
desired_vocab_size = 264

# Run the BPE algorithm to get the merge rules, final vocab stats, and compressed tokens
merge_dict, vocab_dict, bpe_tokens = bpe(desired_vocab_size, tokens)

# Calculate compression ratio: original token count / BPE token count
compression = len(tokens) / len(bpe_tokens)

# Print how much compression was achieved after BPE tokenization
print(f"Compression achieved with {desired_vocab_size} vocab size: X{np.round(compression, 2)}")


Compression achieved with 264 vocab size: X1.19


In [17]:
# Increase the desired vocabulary size to allow more merges
# With vocab_size = 512, the BPE algorithm can perform up to 512 - 256 = 256 merges
# More merges → fewer tokens → higher compression (up to a limit)

desired_vocab_size = 512

# Run BPE with the new vocab size
merge_dict, vocab_dict, bpe_tokens = bpe(desired_vocab_size, tokens)

# Calculate how much the token count was reduced due to merging
compression = len(tokens) / len(bpe_tokens)

# Report the achieved compression ratio
print(f"Compression achieved with {desired_vocab_size} vocab size: X{np.round(compression, 2)}")


Compression achieved with 512 vocab size: X3.17


In [18]:
# Display the learned merge rules from the BPE process
# Format: {(token1, token2): new_token_id}
# This shows which byte pairs were merged and what new token ID was assigned to each

merge_dict


{(101, 32): 256,
 (32, 116): 257,
 (105, 110): 258,
 (257, 104): 259,
 (259, 256): 260,
 (100, 32): 261,
 (32, 97): 262,
 (101, 114): 263,
 (50, 48): 264,
 (111, 114): 265,
 (111, 110): 266,
 (115, 32): 267,
 (116, 32): 268,
 (263, 32): 269,
 (114, 105): 270,
 (97, 116): 271,
 (97, 114): 272,
 (258, 32): 273,
 (262, 110): 274,
 (111, 102): 275,
 (44, 32): 276,
 (264, 49): 277,
 (114, 101): 278,
 (115, 268): 279,
 (107, 101): 280,
 (104, 105): 281,
 (275, 260): 282,
 (97, 108): 283,
 (274, 261): 284,
 (270, 99): 285,
 (285, 280): 286,
 (110, 32): 287,
 (102, 265): 288,
 (46, 32): 289,
 (269, 282): 290,
 (258, 103): 291,
 (97, 121): 292,
 (97, 109): 293,
 (286, 116): 294,
 (111, 32): 295,
 (108, 292): 296,
 (108, 105): 297,
 (93, 32): 298,
 (72, 256): 299,
 (46, 91): 300,
 (283, 32): 301,
 (119, 272): 302,
 (105, 266): 303,
 (101, 110): 304,
 (111, 104): 305,
 (305, 297): 306,
 (99, 97): 307,
 (97, 110): 308,
 (75, 306): 309,
 (309, 32): 310,
 (288, 109): 311,
 (262, 108): 312,
 (258, 26

In [19]:
# Encode a given text into BPE token IDs using the learned merge rules.

def encoder(text, merge_dict): 
    tokens = list(text.encode("utf-8"))  # Convert input text to UTF-8 byte tokens
    i = 0
    while i < len(tokens) - 1:
        pair = tokens[i], tokens[i + 1]  # Form adjacent pair
        if pair in merge_dict:
            # If the pair exists in merge rules, merge them into a new token
            tokens = merge(tokens, pair, merge_dict[pair])
            # Note: Restarting from i=0 after merge would be more robust in some BPE variants
        i += 1
    return tokens  # Return the encoded token sequence


In [20]:
# Encode the input text using the previously learned BPE merge rules
encoded_token = encoder(text, merge_dict)

# Measure the length of the encoded token sequence
# This helps compare with the original token count to evaluate compression
len(encoded_token)


1176

In [22]:
# Decode a BPE token sequence back to its original text form using reverse merge rules.

def decoder(tokens, merge_dict):
    # Create a reverse mapping: {new_token_id: (original_pair)}
    map_merge = {v: k for k, v in merge_dict.items()}

    decoded_tokens = []

    # Repeat until all merged token IDs are broken down into basic byte tokens (<= 255)
    while max(tokens) > 256:
        decoded_tokens = []
        for token in tokens:
            if token in merge_dict.values():
                # If the token is a merged token, replace it with its original pair
                decoded_tokens.extend(list(map_merge.get(token)))
            else:
                # Keep unmerged byte-level tokens as is
                decoded_tokens.append(token)
        tokens = decoded_tokens  # Update tokens after one round of decoding

    # Convert final byte-level tokens back to a UTF-8 string
    return bytes(decoded_tokens).decode('utf-8')


In [23]:
# Decode the previously encoded BPE token sequence back to the original text
# This should return the exact input string (e.g., "Virat Kohli") if encoding and decoding worked correctly

decoder(encoded_token, merge_dict)


"Virat Kohli (born 5 November 1988)[a] is an Indian international cricketer who plays ODI cricket for the national team and is a former captain in all formats.[3] He is a right-handed batsman and occasional right-arm medium pace bowler. Considered one of the greatest all-format batsmen in the history of cricket, he is called the King, the Chase Master, and the Run Machine for his skills, records and ability to lead his team to victory.[4] Kohli is the highest run-scorer in the Indian Premier League, third in T20I, third in ODI, and third in international cricket.[5] He has the most ODI centuries and second-most centuries in international cricket, with a total of 82 centuries across all international formats of the game.[6] Kohli is also the most successful Test captain of India with back-to-back Test mace wins and most victories in his tenure.[7] He is the only batter to earn 900 rating points in all three formats.[8] Kohli was the captain of the 2008 U19 World Cup winning team and was

In [24]:
# Encode the input text (including emojis and punctuation) using BPE
# Then immediately decode it back to verify correctness

decoder(
    encoder("Virat Kohli (born 5 November 1988) - ADDED THESE EMOJI 😂😂😂😂😂", merge_dict),
    merge_dict
)

# This should return the original string exactly if all byte merges and reversals worked correctly
# UTF-8 ensures emojis and special characters are preserved during encoding and decoding


'Virat Kohli (born 5 November 1988) - ADDED THESE EMOJI 😂😂😂😂😂'

In [133]:
class BPE:
    def __init__(self): 
        super().__init__()
        self.merge_dict = {}        # Stores merge rules: {(token1, token2): new_token_id}
        self.tokens = None          # Final token sequence after training
        self.text = None            # Optional reference to original text
        self.vocab_size = None      # Total size of vocabulary after training
        self.encoded_tokens = None  # Tokens generated after encoding input text
        self.decoded_tokens = None  # Decoded text string from token sequence
        self.special_tokens = None  # Any manually added special tokens (e.g., <PAD>, <BOS>)
        self.vocab = {}             # Maps token_id → byte sequence

    def add_special_tokens(self, special_tokens):
        if self.special_tokens is None:
            self.special_tokens = {}

        reverse_vocab = {v: k for k, v in self.vocab.items()}
        current_max_id = max(self.vocab.keys(), default=255)

        for token in special_tokens:
            token_bytes = token.encode("utf-8")

            # If the byte version already exists in vocab, re-use the same ID
            if token_bytes in reverse_vocab:
                token_id = reverse_vocab[token_bytes]
            else:
                # Else assign a new ID
                current_max_id += 1
                token_id = current_max_id
                self.vocab[token_id] = token_bytes

            # Track in special_tokens (ensures idempotency)
            self.special_tokens[token] = token_id


    
    def get_token_id(self, token_str):
        if self.special_tokens and token_str in self.special_tokens:
            return self.special_tokens[token_str]
        return None

    def get_vocab(self):
        # Build the full vocabulary from merge_dict and initial byte tokens (0-255)
        vocab = {i: bytes([i]) for i in range(256)}
        for (p0, p1), x in self.merge_dict.items():
            vocab[x] = vocab[p0] + vocab[p1]
        return vocab

    def get_stats(self, tokens=None): 
        # Count frequency of all adjacent token pairs in the sequence
        freq = {}
        for i in range(len(tokens) - 1): 
            pair = tokens[i], tokens[i + 1]
            freq[pair] = freq.get(pair, 0) + 1
        return freq

    def merge(self, tokens=None, pair=None, idx=None): 
        # Merge all instances of the target pair into a single token ID (idx)
        new_tokens = []
        i = 0
        while i < len(tokens): 
            if i < len(tokens) - 1 and (tokens[i], tokens[i + 1]) == pair:
                new_tokens.append(idx)
                i += 2
            else:
                new_tokens.append(tokens[i])
                i += 1
        return new_tokens

    def fit(self, text=None, vocab_size=384):
        # Train the BPE tokenizer: learn merge rules and generate token sequence
        if vocab_size > 256:
            self.vocab_size = vocab_size
        else:
            self.vocab_size = None  # invalid size

        original_tokens = list(text.encode("utf-8"))  # convert text to byte-level tokens
        tokens = list(original_tokens)

        if vocab_size <= 256 or vocab_size > len(tokens):
            return None  # skip training if invalid

        idx = 256  # start assigning new token IDs after byte range
        n_merges = vocab_size - 256

        for _ in range(n_merges):
            vocab = self.get_stats(tokens)  # get pair frequencies
            freq_pair = max(sorted([(v, k) for k, v in vocab.items()], reverse=True))[1]
            self.merge_dict[freq_pair] = idx
            tokens = self.merge(tokens, freq_pair, idx)
            idx += 1

        self.vocab = self.get_vocab()  # build vocab from merges
        self.tokens = tokens  # store final token sequence

    def encode(self, text=None):
        # Convert text to token sequence using the trained merge_dict
        if text:
            tokens = list(text.encode("utf-8"))
        else:
            tokens = self.tokens

        try:
            merge_dict = self.merge_dict
            i = 0
            while i < len(tokens) - 1:
                pair = tokens[i], tokens[i + 1]
                if pair in merge_dict:
                    tokens = self.merge(tokens, pair, merge_dict[pair])
                i += 1
            self.encoded_tokens = tokens
            return self.encoded_tokens
        except: 
            return []

    def decode_old(self, tokens=None):
        # Decode by recursively splitting merged tokens using reverse merge_dict
        if not tokens:
            tokens = self.encoded_tokens

        merge_dict = self.merge_dict
        map_merge = {v: k for k, v in merge_dict.items()}
        decoded_tokens = []

        while max(tokens) > 255:
            decoded_tokens = []
            for token in tokens:
                if token in merge_dict.values():
                    decoded_tokens.extend(list(map_merge.get(token)))
                else:
                    decoded_tokens.append(token)
            tokens = decoded_tokens

        self.decoded_tokens = bytes(decoded_tokens).decode('utf-8')
        return self.decoded_tokens

    def decode(self, tokens=None):
        # Decode using precomputed vocab (faster than decode_old)
        if not tokens:
            tokens = self.encoded_tokens
        try:
            self.decoded_tokens = b"".join(self.vocab.get(x) for x in tokens).decode("utf-8", errors='replace')
        except: 
            self.decoded_tokens = "".join(self.vocab.get(str(x)) for x in tokens)

        return self.decoded_tokens

    def save_json(self, path="bpe_tokenizer.json"):
        """Save tokenizer merge rules and vocab size as a JSON file"""
        with open(path, "w") as f:
            json.dump({
                "merge_dict": {str(k): v for k, v in self.merge_dict.items()},
                "vocab": {str(k): v.decode("utf-8", errors="replace") for k, v in self.get_vocab().items()},
                "vocab_size": len(self.vocab)
            }, f)

    def save_pickle(self, path="bpe_tokenizer.pkl"):
        # Save merge rules and vocab in binary form for reloading
        with open(path, "wb") as f:
            pickle.dump({
                "merge_dict": self.merge_dict,
                "vocab": self.vocab,
                "vocab_size": self.vocab_size
            }, f)

    def load_json(self, path="bpe_tokenizer.json"):
        """Load tokenizer merge rules and vocab size from a JSON file"""
        with open(path, "r") as f:
            data = json.load(f)
            self.vocab_size = data["vocab_size"]
            self.merge_dict = data["merge_dict"]
            self.vocab = data["vocab"]

    def load_pickle(self, path="bpe_tokenizer.pkl"):
        # Load BPE model from pickle file (faster than JSON)
        with open(path, "rb") as f:
            data = pickle.load(f)
            self.merge_dict = data["merge_dict"]
            self.vocab = data["vocab"]
            self.vocab_size = data["vocab_size"]

    def get_vocab_size(self):
        # Return the current size of the vocabulary
        return len(self.vocab)


In [134]:
# GETTING THAT TEXT AGAIN FOR REFERENCE
text = "Virat Kohli 😅😂🐐⚡ (born 5 November 1988)[a] is an Indian international cricketer  who plays ODI cricket for the national team and is a former captain in all formats.[3] He is a right-handed batsman and occasional right-arm medium pace bowler. Considered one of the greatest all-format batsmen in the history of cricket, he is called the King, the Chase Master, and the Run Machine for his skills, records and ability to lead his team to victory.[4] Kohli is the highest run-scorer in the Indian Premier League, third in T20I, third in ODI, and third in international cricket.[5] He has the most ODI centuries and second-most centuries in international cricket, with a total of 82 centuries across all international formats of the game.[6] Kohli is also the most successful Test captain of India with back-to-back Test mace wins and most victories in his tenure.[7] He is the only batter to earn 900 rating points in all three formats.[8] Kohli was the captain of the 2008 U19 World Cup winning team and was a crucial member of the teams that won 2011 ODI World Cup, 2013 Champions Trophy, 2024 T20 World Cup, and 2025 Champions Trophy. He plays for Royal Challengers Bengaluru in the Indian Premier League and for Delhi in domestic cricket. In 2013, Kohli was ranked number one in the ODI batting rankings. In 2015, he achieved the same in T20I.[9] In 2018, he was ranked number one in Test, making him the only Indian to hold the number one spot in all three formats. He is the first player to score 20,000 runs in a decade. He was the Cricketer of the Decade for 2011 to 2020.[10] Kohli has won ten ICC Awards, making him the most awarded player in international cricket history. He won the ODI Player of the Year award four times in 2012, 2017, 2018, and 2023. He won the Cricketer of the Year award, on two occasions, in 2017 and 2018. In 2018, he became the first player to win all three major awards including Cricketer of the Year, ODI Player of the Year and Test Player of the Year in the same year. He was honored with the Spirit of Cricket Award in 2019 and given the Cricketer of the Decade and ODI Cricketer of the Decade in 2020. Kohli was named the Wisden Leading Cricketer in the World for three consecutive years. Kohli has the most Player of the Series and second most Player of the Match awards to his name in all three formats combined. He was honoured with the Arjuna Award in 2013, the Padma Shri in 2017, and India's highest sporting honour, the Khel Ratna Award, in 2018. Time included him on its 100 most influential people in the world list in 2018.After winning the 2024 T20 World Cup and winning the Player of the Match award in the final, Kohli announced his retirement from T20Is.[11] On 12 May 2025, aged 36, he announced his retirement from the Test format.[12] He is married to actress Anushka Sharma, and they have two children.[13]"

In [135]:
# Create an instance of the BPE class
bpe = BPE()

# Train the BPE tokenizer on the given input `text` with a target vocabulary size of 260
# This will learn 4 merge rules (260 - 256) and update `bpe.merge_dict`, `bpe.vocab`, and `bpe.tokens`
bpe.fit(text=text, vocab_size=260)


In [136]:
# Encode the first 50 characters of the input text using the trained BPE model
# This will return a list of token IDs based on learned merge rules
encoded = bpe.encode(text[:50])

# Decode the encoded tokens back to a UTF-8 string
# This should reconstruct the original 50 characters accurately
bpe.decode(encoded)


'Virat Kohli 😅😂🐐⚡ (born 5 November 1988)[a] is an I'

In [137]:
# Save the current BPE model (merge_dict, vocab, vocab_size) to a file named 'bpe_tokenizer.pkl'
# This allows the model to be reused later without retraining
bpe.save_pickle()

# Load the saved BPE model back into the current instance
# Useful for restoring the tokenizer in a new session or script
bpe.load_pickle()

# Decode the previously encoded token sequence using the reloaded model
# This should return the original text used to produce `encoded`
bpe.decode(encoded)

# Encode a new string containing emojis and uppercase letters using the reloaded model
# Then decode it back to verify that the model works as expected after loading from pickle
bpe.decode(bpe.encode("THESE LINES ARE DECODED WITH PICKLE: 😅😂🐐⚡"))


'THESE LINES ARE DECODED WITH PICKLE: 😅😂🐐⚡'

In [138]:
# Save the BPE tokenizer to a human-readable JSON file (default: 'bpe_tokenizer.json')
# This stores the merge_dict and vocab in a structured, text-based format
bpe.save_json()

# Load the tokenizer from the saved JSON file
# WARNING: The merge_dict keys were stored as strings, so without parsing them back into tuples,
#          the decoder may not correctly match and decode merged tokens
bpe.load_json()

# Decode the previously encoded tokens
# This may succeed only if the stringified merge_dict was correctly parsed back into tuple format
bpe.decode(encoded)

# FAIL EXAMPLE:
# Attempt to encode and decode a string that was present during training
# It may fail or return incorrect results because the merge_dict keys are not tuples after JSON load
bpe.decode(bpe.encode("THESE LINES MAY FAIL TO DECODE EVEN THOUGH THEY ARE IN TRAINING : 😅😂🐐⚡"))

'THESE LINES MAY FAIL TO DECODE EVEN THOUGH THEY ARE IN TRAINING : ���������������'

In [139]:
# Load the trained BPE model from the pickle file
# This restores merge_dict, vocab, and vocab_size
bpe.load_pickle()

# Determine the next available token ID after the current vocab
# Special tokens should not conflict with existing token IDs
start_id = bpe.get_vocab_size()

# Define special tokens with unique token IDs beyond the current vocab
# These can be used during training or inference in downstream NLP models
specials = {
    "<PAD>": start_id,       # Padding token
    "<UNK>": start_id + 1,   # Unknown token
    "<BOS>": start_id + 2,   # Beginning of sequence
    "<EOS>": start_id + 3,   # End of sequence
}


In [140]:
# Add the defined special tokens to the BPE vocabulary
# Each token is encoded into its UTF-8 byte representation and stored in the vocab dictionary
# This allows the encoder/decoder to recognize and utilize these tokens if explicitly added

bpe.add_special_tokens(special_tokens=specials)


In [141]:
# Get the updated vocabulary size after adding special tokens
# This should now include the original BPE tokens + the number of special tokens added

bpe.get_vocab_size()


264

In [142]:
bpe.get_token_id("<PAD>") 

260

In [143]:
start_id = bpe.get_vocab_size()
specials = {
    "<PAD>": start_id,
    "<UNK>": start_id + 1,
    "<BOS>": start_id + 2,
    "<EOS>": start_id + 3,
}

bpe.add_special_tokens(special_tokens=specials)

In [144]:
bpe.get_vocab_size()

264

In [145]:
bpe.get_token_id("<PAD>") 

260

In [146]:
vocab = bpe.vocab

In [147]:
vocab

{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'