# Tokenization Experiments

## 1. Setting Up The Notebook

In [1]:
# Import necessary libraries
import transformers
from transformers import AutoTokenizer, AutoModel

# Load a pre-trained model and its tokenizer
model_name = "bert-base-uncased"  # You can replace this with any model you're testing
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


## 2. Defining Custom Tokenization Function(s)

In [2]:
def custom_tokenizer(text, vocab):
    # Example function that tokenizes based on spaces and some simple rules
    tokens = text.split()
    return [token if token in vocab else 'UNK' for token in tokens]

# Define a sample vocabulary (in practice, this should be more comprehensive)
sample_vocab = {'hello', 'world', 'UNK'}

## 3. Using the Custom Tokenizer

In [3]:
# Test your tokenizer
sample_text = "hello world from Jupyter"
tokens = custom_tokenizer(sample_text, sample_vocab)
print("Tokens:", tokens)

Tokens: ['hello', 'world', 'UNK', 'UNK']


## 4. Comparing with Pre-trained Tokenizer

In [4]:
# Use the pre-trained tokenizer
pretrained_tokens = tokenizer.tokenize(sample_text)
print("Pre-trained Tokens:", pretrained_tokens)


Pre-trained Tokens: ['hello', 'world', 'from', 'ju', '##py', '##ter']


## 5. Function to Compare Tokenizers

In [5]:
def compare_tokenizers(text, custom_vocab):
    # Tokenize using the custom tokenizer
    custom_tokens = custom_tokenizer(text, custom_vocab)
    
    # Tokenize using the pre-trained tokenizer
    pretrained_tokens = tokenizer.tokenize(text)
    
    # Print both token lists for comparison
    print("Custom Tokens:", custom_tokens)
    print("Pre-trained Tokens:", pretrained_tokens)
    
    # Analyze the differences
    if custom_tokens == pretrained_tokens:
        print("Result: The tokenization is identical.")
    else:
        print("Result: There are differences in tokenization.")
        print("Custom vs. Pre-trained:")
        for ct, pt in zip(custom_tokens, pretrained_tokens):
            print(f"{ct} -> {pt}")

    # Optionally, add more detailed analysis or statistics here
    # e.g., token match rate, number of 'UNK' tokens, etc.

# Example usage of the comparison function
compare_tokenizers("hello world from Jupyter", sample_vocab)


Custom Tokens: ['hello', 'world', 'UNK', 'UNK']
Pre-trained Tokens: ['hello', 'world', 'from', 'ju', '##py', '##ter']
Result: There are differences in tokenization.
Custom vs. Pre-trained:
hello -> hello
world -> world
UNK -> from
UNK -> ju


In [6]:
from sklearn.metrics import f1_score


def token_accuracy(custom_tokens, pretrained_tokens):
    correct = sum(ct == pt for ct, pt in zip(custom_tokens, pretrained_tokens))
    total = len(pretrained_tokens)
    return correct / total if total > 0 else 0


def vocabulary_coverage(custom_tokens, pretrained_vocab):
    covered = sum(token in pretrained_vocab for token in custom_tokens)
    total = len(custom_tokens)
    return covered / total if total > 0 else 0


def oov_rate(custom_tokens):
    unk_tokens = custom_tokens.count('[UNK]')
    total = len(custom_tokens)
    return unk_tokens / total if total > 0 else 0


def calculate_f1_score(true_labels, predicted_labels):
    return f1_score(true_labels, predicted_labels, average='weighted')


def compare_tokenizers(text, custom_vocab, pretrained_vocab):
    custom_tokens = custom_tokenizer(text, custom_vocab)
    pretrained_tokens = tokenizer.tokenize(text)
    
    print("Custom Tokens:", custom_tokens)
    print("Pre-trained Tokens:", pretrained_tokens)
    
    # Calculate metrics
    accuracy = token_accuracy(custom_tokens, pretrained_tokens)
    coverage = vocabulary_coverage(custom_tokens, pretrained_vocab)
    oov = oov_rate(custom_tokens)
    
    print(f"Token Accuracy: {accuracy:.2f}")
    print(f"Vocabulary Coverage: {coverage:.2f}")
    print(f"OOV Rate: {oov:.2f}")

    # More detailed comparison or additional metrics could be added here

# Example usage
pretrained_vocab = set(tokenizer.vocab.keys())  # Assuming Hugging Face Transformers
compare_tokenizers("hello world from Jupyter", sample_vocab, pretrained_vocab)

Custom Tokens: ['hello', 'world', 'UNK', 'UNK']
Pre-trained Tokens: ['hello', 'world', 'from', 'ju', '##py', '##ter']
Token Accuracy: 0.33
Vocabulary Coverage: 0.50
OOV Rate: 0.00
