# Tokenization Experiments

## 1. Setting Up The Notebook

In [2]:
# Import necessary libraries
import transformers
from transformers import AutoTokenizer, AutoModel

# Load a pre-trained model and its tokenizer
model_name = "bert-base-uncased"  # You can replace this with any model you're testing
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

## 2. Defining Custom Tokenization Function(s)

In [3]:
def custom_tokenizer(text, vocab):
    # Example function that tokenizes based on spaces and some simple rules
    tokens = text.split()
    return [token if token in vocab else 'UNK' for token in tokens]

# Define a sample vocabulary (in practice, this should be more comprehensive)
sample_vocab = {'hello', 'world', 'UNK'}

## 3. Using the Custom Tokenizer

In [4]:
# Test your tokenizer
sample_text = "hello world from Jupyter"
tokens = custom_tokenizer(sample_text, sample_vocab)
print("Tokens:", tokens)

Tokens: ['hello', 'world', 'UNK', 'UNK']


## 4. Comparing with Pre-trained Tokenizer

In [5]:
# Use the pre-trained tokenizer
pretrained_tokens = tokenizer.tokenize(sample_text)
print("Pre-trained Tokens:", pretrained_tokens)


Pre-trained Tokens: ['hello', 'world', 'from', 'ju', '##py', '##ter']


## 5. Function to Compare Tokenizers

In [6]:
def compare_tokenizers(text, custom_vocab):
    # Tokenize using the custom tokenizer
    custom_tokens = custom_tokenizer(text, custom_vocab)
    
    # Tokenize using the pre-trained tokenizer
    pretrained_tokens = tokenizer.tokenize(text)
    
    # Print both token lists for comparison
    print("Custom Tokens:", custom_tokens)
    print("Pre-trained Tokens:", pretrained_tokens)
    
    # Analyze the differences
    if custom_tokens == pretrained_tokens:
        print("Result: The tokenization is identical.")
    else:
        print("Result: There are differences in tokenization.")
        print("Custom vs. Pre-trained:")
        for ct, pt in zip(custom_tokens, pretrained_tokens):
            print(f"{ct} -> {pt}")

    # Optionally, add more detailed analysis or statistics here
    # e.g., token match rate, number of 'UNK' tokens, etc.

# Example usage of the comparison function
compare_tokenizers("hello world from Jupyter", sample_vocab)


Custom Tokens: ['hello', 'world', 'UNK', 'UNK']
Pre-trained Tokens: ['hello', 'world', 'from', 'ju', '##py', '##ter']
Result: There are differences in tokenization.
Custom vs. Pre-trained:
hello -> hello
world -> world
UNK -> from
UNK -> ju


In [7]:
from sklearn.metrics import f1_score


def token_accuracy(custom_tokens, pretrained_tokens):
    correct = sum(ct == pt for ct, pt in zip(custom_tokens, pretrained_tokens))
    total = len(pretrained_tokens)
    return correct / total if total > 0 else 0


def vocabulary_coverage(custom_tokens, pretrained_vocab):
    covered = sum(token in pretrained_vocab for token in custom_tokens)
    total = len(custom_tokens)
    return covered / total if total > 0 else 0


def oov_rate(custom_tokens):
    unk_tokens = custom_tokens.count('[UNK]')
    total = len(custom_tokens)
    return unk_tokens / total if total > 0 else 0


def calculate_f1_score(true_labels, predicted_labels):
    return f1_score(true_labels, predicted_labels, average='weighted')


def compare_tokenizers(text, custom_vocab, pretrained_vocab):
    custom_tokens = custom_tokenizer(text, custom_vocab)
    pretrained_tokens = tokenizer.tokenize(text)
    
    print("Custom Tokens:", custom_tokens)
    print("Pre-trained Tokens:", pretrained_tokens)
    
    # Calculate metrics
    accuracy = token_accuracy(custom_tokens, pretrained_tokens)
    coverage = vocabulary_coverage(custom_tokens, pretrained_vocab)
    oov = oov_rate(custom_tokens)
    
    print(f"Token Accuracy: {accuracy:.2f}")
    print(f"Vocabulary Coverage: {coverage:.2f}")
    print(f"OOV Rate: {oov:.2f}")

    # More detailed comparison or additional metrics could be added here

# Example usage
pretrained_vocab = set(tokenizer.vocab.keys())  # Assuming Hugging Face Transformers
compare_tokenizers("hello world from Jupyter", sample_vocab, pretrained_vocab)

Custom Tokens: ['hello', 'world', 'UNK', 'UNK']
Pre-trained Tokens: ['hello', 'world', 'from', 'ju', '##py', '##ter']
Token Accuracy: 0.33
Vocabulary Coverage: 0.50
OOV Rate: 0.00


# Extra tests

In [8]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the SNLI dataset
dataset = load_dataset("snli")

# Load BERT's tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize data using BERT's tokenizer
def bert_tokenize_function(examples):
    return tokenizer(examples['premise'], examples['hypothesis'], truncation=True, padding='max_length', max_length=128)

# Tokenize the data with BERT's tokenizer
encoded_dataset = dataset.map(bert_tokenize_function, batched=True)

# Define a simple whitespace tokenizer function
def whitespace_tokenize_function(examples):
    # Use simple whitespace tokenization and manually map to BERT's vocabulary indices
    premise_tokens = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(' '.join(word.split()))) for word in examples['premise']]
    hypothesis_tokens = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(' '.join(word.split()))) for word in examples['hypothesis']]
    return {'input_ids': premise_tokens, 'attention_mask': [[1] * len(tokens) for tokens in premise_tokens]}

# Tokenize the data using the simple whitespace tokenizer
encoded_dataset_whitespace = dataset.map(whitespace_tokenize_function, batched=True)

# Initialize the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01
)

# Function to compute metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return {'accuracy': accuracy_score(labels, preds), 'f1': np.mean(precision_recall_fscore_support(labels, preds, average='weighted'))}

# Initialize the trainer for BERT tokenizer
trainer_bert = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    compute_metrics=compute_metrics,
)

# Initialize the trainer for whitespace tokenizer
trainer_whitespace = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset_whitespace['train'],
    eval_dataset=encoded_dataset_whitespace['validation'],
    compute_metrics=compute_metrics,
)

# Train and evaluate with BERT tokenizer
print("Training with BERT tokenizer...")
trainer_bert.train()
results_bert = trainer_bert.evaluate()
print("Results with BERT tokenizer:", results_bert)

# Train and evaluate with whitespace tokenizer
print("Training with whitespace tokenizer...")
trainer_whitespace.train()
results_whitespace = trainer_whitespace.evaluate()
print("Results with whitespace tokenizer:", results_whitespace)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

In [None]:
from datasets import load_dataset

# Load the SNLI dataset
snli_dataset = load_dataset('snli')

# Access the train, validation, and test splits
train_data = snli_dataset['train']
validation_data = snli_dataset['validation']
test_data = snli_dataset['test']

# Example: Print the first example from the training set
print(test_data[0])

label_mapping = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}

# Example: Print the first example from the training set with label meaning
example = train_data[0]
example['label'] = label_mapping[example['label']]
print(example)