In [4]:
from typing import List, Tuple, Union
import re

def custom_tokenization(premise_hypothesis: Union[Tuple[str, str], List[str]], separator_marker: str="", **tokenization_args) -> Tuple[List[str], List[str]]:
    """
    Word-level n-gram tokenization of a premise/hypothesis pair.

    Each text is split on whitespace and, for every requested n-gram size,
    all contiguous runs of that many words are emitted, joined by a single
    space. Tokens are grouped by size, in the order the sizes are given.
    No cleaning is performed: case is kept and punctuation stays attached to
    its word (e.g. ``"mat."``).

    Args:
        premise_hypothesis: Tuple or list containing (premise, hypothesis)
        separator_marker: Not used by this word-level tokenizer; kept so the
            signature matches ``custom_char_tokenization``.
        tokenization_args: Additional tokenization arguments. Recognised key:
            ``lengths`` - iterable of n-gram sizes. Defaults to ``[1, 2, 3]``
            when missing or ``None``. Sizes <= 0 are skipped.

    Returns:
        Tuple containing (premise_tokens, hypothesis_tokens)
    """
    def _word_ngrams(text: str, sizes: List[int]) -> List[str]:
        # str.split() with no argument splits on any whitespace run and
        # never yields empty strings.
        words = text.split()
        return [
            ' '.join(words[start:start + size])
            for size in sizes
            if size > 0
            # An empty range (text shorter than `size`) contributes nothing.
            for start in range(len(words) - size + 1)
        ]

    # Resolve the n-gram sizes. An explicit `lengths=None` means "use the
    # default" instead of crashing when iterated.
    lengths = tokenization_args.get('lengths')
    if lengths is None:
        lengths = [1, 2, 3]
    else:
        # Materialise once: a generator/iterator would otherwise be exhausted
        # by the premise and leave the hypothesis with no tokens.
        lengths = list(lengths)

    premise_tokens = _word_ngrams(premise_hypothesis[0], lengths)
    hypothesis_tokens = _word_ngrams(premise_hypothesis[1], lengths)

    return premise_tokens, hypothesis_tokens

def custom_char_tokenization(premise_hypothesis: Union[Tuple[str, str], List[str]], separator_marker: str="", **tokenization_args) -> Tuple[List[str], List[str]]:
    """
    Character-level n-gram tokenization of a premise/hypothesis pair.

    Space characters are removed from each text, then for every requested
    size all contiguous character n-grams are emitted. Tokens are grouped by
    size, in the order the sizes are given.

    Args:
        premise_hypothesis: Tuple or list containing (premise, hypothesis)
        separator_marker: Special character(s) used by tokenizers when splitting
            words. When non-empty it is prepended to every n-gram that does not
            start at offset 0 of the space-stripped text; the first n-gram of
            each size is left unmarked.
        tokenization_args: Additional tokenization arguments. Recognised key:
            ``lengths`` - iterable of n-gram sizes. Defaults to ``[2, 3]``
            when missing or ``None``. Sizes <= 0 are skipped.

    Returns:
        Tuple containing (premise_char_tokens, hypothesis_char_tokens)
    """
    def _char_ngrams(text: str, sizes: List[int]) -> List[str]:
        # Only the space character is stripped; other whitespace (tabs,
        # newlines) is kept and can appear inside tokens.
        compact = text.replace(" ", "")
        grams = []

        for size in sizes:
            if size <= 0:
                continue
            for start in range(len(compact) - size + 1):
                gram = compact[start:start + size]
                # Mark continuation pieces, i.e. everything but the n-gram
                # that begins at the very start of the text.
                if separator_marker and start > 0:
                    gram = f"{separator_marker}{gram}"
                grams.append(gram)

        return grams

    # Resolve the n-gram sizes. An explicit `lengths=None` means "use the
    # default" instead of crashing when iterated.
    lengths = tokenization_args.get('lengths')
    if lengths is None:
        lengths = [2, 3]
    else:
        # Materialise once: a generator/iterator would otherwise be exhausted
        # by the premise and leave the hypothesis with no tokens.
        lengths = list(lengths)

    premise_tokens = _char_ngrams(premise_hypothesis[0], lengths)
    hypothesis_tokens = _char_ngrams(premise_hypothesis[1], lengths)

    return premise_tokens, hypothesis_tokens


In [10]:
def create_test_corpus():
    """
    Build the small, fixed demo corpus of premise-hypothesis pairs.

    A fresh list is constructed on every call.

    Returns:
        List[Tuple[str, str]]: Five (premise, hypothesis) pairs
    """
    premises = (
        "The cat is sleeping on the mat.",
        "Students attended the AI lecture.",
        "The restaurant serves Italian food.",
        "It's raining heavily outside.",
        "The computer is running slowly.",
    )
    hypotheses = (
        "There is a cat resting.",
        "People were learning about artificial intelligence.",
        "You can eat pasta at this place.",
        "The weather is wet.",
        "The system performance is poor.",
    )
    # Pair the sentences up positionally; zip yields 2-tuples.
    return list(zip(premises, hypotheses))

# Demo: run both custom tokenizers over every pair of the test corpus and
# print a short preview of the result.
corpus = create_test_corpus()

print("\nProcessing corpus:")
print("-" * 50)

# Pairs are numbered from 1 in the printed output.
for i, (premise, hypothesis) in enumerate(corpus, 1):
    print(f"\nPair {i}:")
    print(f"Premise: {premise}")
    print(f"Hypothesis: {hypothesis}")

    # Word-level tokenization. No `lengths` is passed, so the default
    # [1, 2, 3] applies; single words come first in the token list.
    word_tokens_premise, word_tokens_hypothesis = custom_tokenization((premise, hypothesis))
    print(f"\nWord-level tokens (first 5):")
    # Only the first five tokens are shown to keep the output short.
    print(f"Premise: {word_tokens_premise[:5]}")
    print(f"Hypothesis: {word_tokens_hypothesis[:5]}")

    # Character-level tokenization. No `lengths` is passed, so the default
    # [2, 3] applies; 2-character n-grams come first in the token list.
    char_tokens_premise, char_tokens_hypothesis = custom_char_tokenization((premise, hypothesis))
    print(f"\nCharacter-level tokens (first 5):")
    print(f"Premise: {char_tokens_premise[:5]}")
    print(f"Hypothesis: {char_tokens_hypothesis[:5]}")


Processing corpus:
--------------------------------------------------

Pair 1:
Premise: The cat is sleeping on the mat.
Hypothesis: There is a cat resting.

Word-level tokens (first 5):
Premise: ['The', 'cat', 'is', 'sleeping', 'on']
Hypothesis: ['There', 'is', 'a', 'cat', 'resting.']

Character-level tokens (first 5):
Premise: ['Th', 'he', 'ec', 'ca', 'at']
Hypothesis: ['Th', 'he', 'er', 're', 'ei']

Pair 2:
Premise: Students attended the AI lecture.
Hypothesis: People were learning about artificial intelligence.

Word-level tokens (first 5):
Premise: ['Students', 'attended', 'the', 'AI', 'lecture.']
Hypothesis: ['People', 'were', 'learning', 'about', 'artificial']

Character-level tokens (first 5):
Premise: ['St', 'tu', 'ud', 'de', 'en']
Hypothesis: ['Pe', 'eo', 'op', 'pl', 'le']

Pair 3:
Premise: The restaurant serves Italian food.
Hypothesis: You can eat pasta at this place.

Word-level tokens (first 5):
Premise: ['The', 'restaurant', 'serves', 'Italian', 'food.']
Hypothesis: ['Yo

Now use the tokenizer together with the RoBERTa and BERT models.



In [14]:
#!pip install transformers
#!pip install torch
#!pip install sentencepiece

In [15]:
# Add these imports at the top
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertForSequenceClassification, RobertaModel, BertModel
import torch

# Set device: use CUDA when torch reports it as available, otherwise the CPU.
# `device` is a notebook-level global that the cells below read as well.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint names passed to `from_pretrained` below
roberta_model_name = "roberta-base"
bert_model_name = "bert-base-uncased"

# Initialize transformer tokenizers (one per checkpoint)
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_name)
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

# Initialize models and move them to the selected device
roberta_model = RobertaModel.from_pretrained(roberta_model_name).to(device)
bert_model = BertModel.from_pretrained(bert_model_name).to(device)

class CustomTokenizerWithTransformers(CustomTokenizer):
    """
    CustomTokenizer extended with access to a transformer model's hidden states.

    The n-gram behaviour comes from the ``CustomTokenizer`` base class; this
    subclass only stores a transformer tokenizer/model pair and exposes
    ``get_transformer_embeddings``.
    """

    # NOTE(review): the `[1]` default is a single list object shared by every
    # call that omits `lengths`. That is only safe if CustomTokenizer never
    # mutates it - confirm against the base class.
    def __init__(self, lengths: List[int] = [1], transformer_tokenizer=None, transformer_model=None):
        """
        Args:
            lengths: N-gram sizes, forwarded unchanged to ``CustomTokenizer.__init__``
            transformer_tokenizer: Callable tokenizer producing model inputs (optional)
            transformer_model: Model whose output has ``last_hidden_state`` (optional)
        """
        super().__init__(lengths)
        self.transformer_tokenizer = transformer_tokenizer
        self.transformer_model = transformer_model

    def get_transformer_embeddings(self, text: str):
        """
        Get embeddings from transformer model for the input text.

        The text is tokenized with padding and truncation to at most 512
        tokens, and the model is run without gradient tracking.

        Args:
            text (str): Input text

        Returns:
            torch.Tensor: Last hidden state embeddings

        Raises:
            ValueError: If the transformer tokenizer or model was not provided
        """
        if self.transformer_tokenizer is None or self.transformer_model is None:
            raise ValueError("Transformer tokenizer and model must be set")

        # Send the inputs to the device the model actually lives on, so a model
        # that was not moved to the notebook-level `device` still works.
        # Objects without a `.device` attribute fall back to that global.
        target_device = getattr(self.transformer_model, "device", None)
        if target_device is None:
            target_device = device

        # Tokenize text
        inputs = self.transformer_tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(target_device)

        # Inference only: no autograd graph is needed for embeddings.
        with torch.no_grad():
            outputs = self.transformer_model(**inputs)

        return outputs.last_hidden_state

# Example usage with both custom tokenization and transformer embeddings
if __name__ == "__main__":
    # Create tokenizers with transformers: one wrapper per model, both using
    # n-gram sizes 1-3 for the custom tokenization.
    roberta_custom_tokenizer = CustomTokenizerWithTransformers(
        lengths=[1, 2, 3],
        transformer_tokenizer=roberta_tokenizer,
        transformer_model=roberta_model
    )

    bert_custom_tokenizer = CustomTokenizerWithTransformers(
        lengths=[1, 2, 3],
        transformer_tokenizer=bert_tokenizer,
        transformer_model=bert_model
    )

    sample_text = "Natural Language Inference is a task of NLI."

    # Get custom tokenization
    print("\nCustom Word-Level Tokens:", roberta_custom_tokenizer.tokenize(sample_text))
    print("\nCustom Character-Level Tokens:", roberta_custom_tokenizer.tokenize_characters(sample_text))

    # Get transformer embeddings
    print("\nGetting RoBERTa embeddings...")
    roberta_embeddings = roberta_custom_tokenizer.get_transformer_embeddings(sample_text)
    print(f"RoBERTa embedding shape: {roberta_embeddings.shape}")

    print("\nGetting BERT embeddings...")
    bert_embeddings = bert_custom_tokenizer.get_transformer_embeddings(sample_text)
    print(f"BERT embedding shape: {bert_embeddings.shape}")

    # Process test corpus with both tokenization methods
    corpus = create_test_corpus()
    print("\nProcessing corpus with both custom tokens and transformer embeddings:")
    print("-" * 50)

    # Each corpus entry is a (premise, hypothesis) tuple, whereas
    # get_transformer_embeddings documents a single `str` and the tokenizer
    # methods are called with plain strings above. Unpack the pair and
    # process each side on its own instead of passing the tuple through.
    for i, (premise, hypothesis) in enumerate(corpus, 1):
        print(f"\nPair {i}:")

        for label, text in (("Premise", premise), ("Hypothesis", hypothesis)):
            print(f"{label}: {text}")

            # Custom tokenization
            word_tokens = roberta_custom_tokenizer.tokenize(text)
            char_tokens = roberta_custom_tokenizer.tokenize_characters(text)

            # Transformer embeddings
            roberta_emb = roberta_custom_tokenizer.get_transformer_embeddings(text)
            bert_emb = bert_custom_tokenizer.get_transformer_embeddings(text)

            print(f"Word tokens count: {len(word_tokens)}")
            print(f"Char tokens count: {len(char_tokens)}")
            print(f"RoBERTa embedding shape: {roberta_emb.shape}")
            print(f"BERT embedding shape: {bert_emb.shape}")

ModuleNotFoundError: No module named 'transformers'