## BytePairEncoding (BPE)

In [13]:
#### BytePairEncoding (BPE)

# Example text for encoding
text = "This is a simple example to demonstrate BytePairEncoding. BytePairEncoding is effective."

# Mock implementation of BytePairEncoding
def byte_pair_encoding(text, num_merges=10):
    from collections import Counter, defaultdict

    # Tokenize the text into symbols
    tokens = list(text.replace(" ", "_ ")) # Use '_' to denote space (word boundaries)
    print("Initial tokens:", tokens)

    # Count frequency of pairs
    def get_stats(tokens):
        pairs = Counter(zip(tokens[:-1], tokens[1:]))
        return pairs

    # Merge function
    def merge_vocab(pair, v_in):
        v_out = []
        bigram = ' '.join(pair)
        replacement = ''.join(pair)
        for word in v_in:
            if bigram in word:
                word = word.replace(bigram, replacement)
            v_out.append(word)
        return v_out

    # Perform num_merges iterations to merge frequent pairs
    for i in range(num_merges):
        pairs = get_stats(tokens)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        tokens = merge_vocab(best, tokens)
        print(f"Merge #{i+1}: {best} -> {''.join(best)}")

    # Final tokens after merges
    print("Tokens after BPE:", tokens)

# Run the BPE example
byte_pair_encoding(text, num_merges=5)

# Note: This is a simplified example. In practice, BPE computes frequencies over a large corpus and merges iteratively.


Initial tokens: ['T', 'h', 'i', 's', '_', ' ', 'i', 's', '_', ' ', 'a', '_', ' ', 's', 'i', 'm', 'p', 'l', 'e', '_', ' ', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '_', ' ', 't', 'o', '_', ' ', 'd', 'e', 'm', 'o', 'n', 's', 't', 'r', 'a', 't', 'e', '_', ' ', 'B', 'y', 't', 'e', 'P', 'a', 'i', 'r', 'E', 'n', 'c', 'o', 'd', 'i', 'n', 'g', '.', '_', ' ', 'B', 'y', 't', 'e', 'P', 'a', 'i', 'r', 'E', 'n', 'c', 'o', 'd', 'i', 'n', 'g', '_', ' ', 'i', 's', '_', ' ', 'e', 'f', 'f', 'e', 'c', 't', 'i', 'v', 'e', '.']
Merge #1: ('_', ' ') -> _ 
Merge #2: ('_', ' ') -> _ 
Merge #3: ('_', ' ') -> _ 
Merge #4: ('_', ' ') -> _ 
Merge #5: ('_', ' ') -> _ 
Tokens after BPE: ['T', 'h', 'i', 's', '_', ' ', 'i', 's', '_', ' ', 'a', '_', ' ', 's', 'i', 'm', 'p', 'l', 'e', '_', ' ', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '_', ' ', 't', 'o', '_', ' ', 'd', 'e', 'm', 'o', 'n', 's', 't', 'r', 'a', 't', 'e', '_', ' ', 'B', 'y', 't', 'e', 'P', 'a', 'i', 'r', 'E', 'n', 'c', 'o', 'd', 'i', 'n', 'g', '.', '_', ' ', 'B', 'y',

## WordPieceEncoding

### This very simple example, in fact it requires huge compute power. 

In [14]:
from collections import defaultdict, Counter
import re

def tokenize(text):
    """Tokenize the text into a sequence of characters, including spaces."""
    return ["_"] + list(text) + ["_"]

def build_vocab(texts):
    """Build initial vocabulary from input texts."""
    vocab = Counter()
    for text in texts:
        tokens = tokenize(text)
        vocab.update(tokens)
    return vocab

def find_best_pair(vocab):
    """Find the most frequent pair of tokens in the vocabulary."""
    pairs = defaultdict(int)
    for token, freq in vocab.items():
        symbols = token.split()
        for i in range(len(symbols)-1):
            pairs[(symbols[i], symbols[i+1])] += freq
    return max(pairs, key=pairs.get) if pairs else None

def merge_vocab(pair, vocab):
    """Merge the most frequent pair in the vocabulary."""
    new_vocab = {}
    bigram = ' '.join(pair)
    replacement = ''.join(pair)
    pattern = re.escape(bigram)
    replacement = replacement.replace(' ', '')
    for word in vocab:
        new_word = re.sub(pattern, replacement, word)
        new_vocab[new_word] = vocab[word]
    return new_vocab

def wordpiece_tokenize(vocab, text, num_merges=100):
    """Tokenize input text based on the WordPiece vocabulary."""
    for _ in range(num_merges):
        pair = find_best_pair(vocab)
        if not pair:
            break
        vocab = merge_vocab(pair, vocab)
    # Tokenize the text based on the final vocabulary
    tokens = tokenize(text)
    wordpiece_tokens = []
    for token in tokens:
        if token in vocab:
            wordpiece_tokens.append(token)
        else:
            wordpiece_tokens.append("[UNK]")
    return wordpiece_tokens

# Example usage
texts = ["This is a simple example to demonstrate WordPieceEncoding.",
         "WordPieceEncoding maximizes token frequency."]
vocab = build_vocab(texts)
print("Initial Vocabulary:", vocab)

# Tokenize a new text using the WordPiece vocabulary
new_text = "WordPiece tokenization example."
wordpiece_tokens = wordpiece_tokenize(vocab, new_text, num_merges=10)
print("WordPiece Tokens:", wordpiece_tokens)


Initial Vocabulary: Counter({'e': 13, ' ': 10, 'i': 9, 'o': 7, 'n': 7, 's': 5, 'm': 5, 'd': 5, 'c': 5, '_': 4, 'a': 4, 't': 4, 'r': 4, 'p': 2, 'l': 2, 'x': 2, 'W': 2, 'P': 2, 'E': 2, 'g': 2, '.': 2, 'T': 1, 'h': 1, 'z': 1, 'k': 1, 'f': 1, 'q': 1, 'u': 1, 'y': 1})
WordPiece Tokens: ['_', 'W', 'o', 'r', 'd', 'P', 'i', 'e', 'c', 'e', ' ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', ' ', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', '_']


In [15]:
!pip install sentencepiece




## SentencePieceEncoding

### Again, this is very simple example to show SentencePieceEncoding.

In [16]:
import sentencepiece as spm
import os

# Prepare a sample text
sample_text = "This is a simple example to demonstrate how SentencePiece encoding works. SentencePiece can encode and decode text."
text_file = "sample_text.txt"

with open(text_file, "w") as f:
    f.write(sample_text)

# Train the SentencePiece model
spm.SentencePieceTrainer.Train(f'--input={text_file} --model_prefix=sentencepiece_model --vocab_size=32 --model_type=bpe')

# Load the trained SentencePiece model
sp = spm.SentencePieceProcessor()
sp.Load("sentencepiece_model.model")

# Encode a text string
text_to_encode = "SentencePiece encoding is fascinating."
encoded_pieces = sp.EncodeAsPieces(text_to_encode)
encoded_ids = sp.EncodeAsIds(text_to_encode)

print("Encoded Pieces:", encoded_pieces)
print("Encoded IDs:", encoded_ids)

# Cleanup the generated files
os.remove(text_file)
os.remove("sentencepiece_model.model")
os.remove("sentencepiece_model.vocab")


Encoded Pieces: ['▁', 'S', 'en', 't', 'en', 'ce', 'Pi', 'e', 'ce', '▁', 'en', 'co', 'd', 'i', 'n', 'g', '▁', 'i', 's', '▁', 'f', 'a', 's', 'c', 'i', 'n', 'a', 't', 'i', 'n', 'g', '.']
Encoded IDs: [10, 22, 3, 14, 3, 4, 7, 9, 4, 10, 3, 6, 15, 16, 11, 30, 10, 16, 18, 10, 0, 17, 18, 12, 16, 11, 17, 14, 16, 11, 30, 20]
