In [25]:
from typing import List, Tuple, Union
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertForSequenceClassification, RobertaModel, BertModel
import torch

def custom_tokenization(premise_hypothesis: Union[Tuple[str, str], List[str]], separator_marker: str="", **tokenization_args) -> Tuple[List[str], List[str]]:
    """
    Word n-gram tokenizer for a (premise, hypothesis) pair.

    Each text is split on whitespace and every contiguous run of
    ``n`` words (joined by a single space) is emitted, for each ``n`` in the
    requested lengths. Tokens are grouped by length, in the order the lengths
    are given; punctuation is left attached to the words.

    Args:
        premise_hypothesis: Tuple or list holding (premise, hypothesis).
        separator_marker: Accepted for signature parity with the other
            tokenizers; it is not used by this function.
        tokenization_args: Optional settings. ``lengths`` is an iterable of
            n-gram sizes (default ``[1, 2, 3]``); sizes <= 0 are skipped.

    Returns:
        Tuple containing (premise_tokens, hypothesis_tokens)
    """
    ngram_sizes = tokenization_args.get('lengths', [1, 2, 3])

    def _word_ngrams(sentence: str) -> List[str]:
        # Whitespace split only; no further cleaning is applied.
        pieces = sentence.split()
        return [
            ' '.join(pieces[start:start + size])
            for size in ngram_sizes
            if size > 0
            for start in range(len(pieces) - size + 1)
        ]

    premise_text = premise_hypothesis[0]
    hypothesis_text = premise_hypothesis[1]
    return _word_ngrams(premise_text), _word_ngrams(hypothesis_text)

def custom_char_tokenization(premise_hypothesis: Union[Tuple[str, str], List[str]], separator_marker: str="", **tokenization_args) -> Tuple[List[str], List[str]]:
    """
    Character n-gram tokenizer for a (premise, hypothesis) pair.

    Space characters are removed from each text, then every contiguous
    window of ``n`` characters is emitted for each ``n`` in the requested
    lengths. Tokens are grouped by length, in the order the lengths are given.

    Args:
        premise_hypothesis: Tuple or list holding (premise, hypothesis).
        separator_marker: If non-empty, prepended to every token that does
            not start at position 0 of the space-stripped text.
        tokenization_args: Optional settings. ``lengths`` is an iterable of
            window sizes (default ``[2, 3]``); sizes <= 0 are skipped.

    Returns:
        Tuple containing (premise_char_tokens, hypothesis_char_tokens)
    """
    window_sizes = tokenization_args.get('lengths', [2, 3])

    def _char_ngrams(raw: str) -> List[str]:
        # Only the literal space character is stripped (not tabs/newlines).
        compact = raw.replace(" ", "")
        return [
            f"{separator_marker}{compact[start:start + size]}"
            if separator_marker and start > 0
            else compact[start:start + size]
            for size in window_sizes
            if size > 0
            for start in range(len(compact) - size + 1)
        ]

    premise_text = premise_hypothesis[0]
    hypothesis_text = premise_hypothesis[1]
    return _char_ngrams(premise_text), _char_ngrams(hypothesis_text)


ModuleNotFoundError: No module named 'transformers'

In [21]:
def create_test_corpus():
    """
    Build the small, fixed demo corpus of premise-hypothesis pairs.

    Returns:
        List[Tuple[str, str]]: Five (premise, hypothesis) pairs, in a fixed order.
    """
    premises = [
        "The cat is sleeping on the mat.",
        "Students attended the AI lecture.",
        "The restaurant serves Italian food.",
        "It's raining heavily outside.",
        "The computer is running slowly.",
    ]
    hypotheses = [
        "There is a cat resting.",
        "People were learning about artificial intelligence.",
        "You can eat pasta at this place.",
        "The weather is wet.",
        "The system performance is poor.",
    ]
    # Pair each premise with the hypothesis at the same position.
    return list(zip(premises, hypotheses))

# Demo: run each custom tokenizer over the sample corpus and show the first tokens.
# NOTE: golden_chunk_tokenization is defined in a later cell of this notebook,
# so that cell must already have been executed in the kernel before this one.
corpus = create_test_corpus()

# Settings passed to the golden-chunk tokenizer for every pair.
golden_settings = {
    'initial_length': 2,
    'ratio': 1.618,
    'max_chunk_length': 8,
    'rounding_mode': 'floor',
    'reset_at_max': True,
}

for i, (premise, hypothesis) in enumerate(corpus, 1):
    print(f"\nPair {i}:")
    print(f"Premise: {premise}")
    print(f"Hypothesis: {hypothesis}")

    # Word n-grams
    word_tokens_premise, word_tokens_hypothesis = custom_tokenization((premise, hypothesis))
    print("\nWord-level tokens (first 5):")
    print(f"Premise: {word_tokens_premise[:5]}")
    print(f"Hypothesis: {word_tokens_hypothesis[:5]}")

    # Character n-grams
    char_tokens_premise, char_tokens_hypothesis = custom_char_tokenization((premise, hypothesis))
    print("\nCharacter-level tokens (first 5):")
    print(f"Premise: {char_tokens_premise[:5]}")
    print(f"Hypothesis: {char_tokens_hypothesis[:5]}")

    # Golden-ratio chunks
    golden_tokens_premise, golden_tokens_hypothesis = golden_chunk_tokenization(
        (premise, hypothesis),
        **golden_settings
    )
    print("\nGolden-chunk tokens (first 5):")
    print(f"Premise: {golden_tokens_premise[:5]}")
    print(f"Hypothesis: {golden_tokens_hypothesis[:5]}")


Pair 1:
Premise: The cat is sleeping on the mat.
Hypothesis: There is a cat resting.

Word-level tokens (first 5):
Premise: ['The', 'cat', 'is', 'sleeping', 'on']
Hypothesis: ['There', 'is', 'a', 'cat', 'resting.']

Character-level tokens (first 5):
Premise: ['Th', 'he', 'ec', 'ca', 'at']
Hypothesis: ['Th', 'he', 'er', 're', 'ei']

Golden-chunk tokens (first 5):
Premise: ['Th', 'eca', 'tiss', 'leepin', 'go']
Hypothesis: ['Th', 'ere', 'isac', 'atrest', 'in']

Pair 2:
Premise: Students attended the AI lecture.
Hypothesis: People were learning about artificial intelligence.

Word-level tokens (first 5):
Premise: ['Students', 'attended', 'the', 'AI', 'lecture.']
Hypothesis: ['People', 'were', 'learning', 'about', 'artificial']

Character-level tokens (first 5):
Premise: ['St', 'tu', 'ud', 'de', 'en']
Hypothesis: ['Pe', 'eo', 'op', 'pl', 'le']

Golden-chunk tokens (first 5):
Premise: ['St', 'ude', 'ntsa', 'ttende', 'dt']
Hypothesis: ['Pe', 'opl', 'ewer', 'elearn', 'in']

Pair 3:
Premise: T

Next, use the custom tokenizers together with the RoBERTa and BERT models.



In [26]:
!pip install transformers
#!pip install torch
#!pip install sentencepiece



In [28]:
def golden_chunk_tokenization(premise_hypothesis: Union[Tuple[str, str], List[str]], **tokenization_args) -> Tuple[List[str], List[str]]:
    """
    Golden-Chunk tokenization method that uses increasing chunk sizes based on the golden ratio.

    Spaces are removed from each text, which is then cut into consecutive,
    non-overlapping chunks. The chunk size starts at ``initial_length`` and is
    multiplied by ``ratio`` (then rounded) after every chunk. When the next
    size would exceed ``max_chunk_length`` it either resets to
    ``initial_length`` or is capped at ``max_chunk_length``.

    Args:
        premise_hypothesis: Tuple or list containing (premise, hypothesis)
        tokenization_args: Additional arguments including:
            - initial_length: Starting chunk size (default: 2)
            - ratio: Growth ratio (default: 1.618)
            - max_chunk_length: Maximum chunk size (default: 8)
            - rounding_mode: How to round chunk sizes ('floor', 'ceil', 'round') (default: 'floor').
              'floor' truncates with int(), 'ceil' uses math.ceil(); any
              other value falls back to the built-in round().
            - reset_at_max: Whether to reset chunk size after hitting max (default: True)

    Returns:
        Tuple containing (premise_tokens, hypothesis_tokens)
    """
    # Imported at function scope so the 'ceil' rounding mode does not depend on
    # a top-level `import math` having been run elsewhere in the notebook
    # (previously rounding_mode='ceil' raised NameError).
    import math

    def _golden_chunk_text(text: str, **args) -> List[str]:
        # Resolve parameters, falling back to the documented defaults.
        initial_length = args.get('initial_length', 2)
        ratio = args.get('ratio', 1.618)
        max_chunk_length = args.get('max_chunk_length', 8)
        rounding_mode = args.get('rounding_mode', 'floor')
        reset_at_max = args.get('reset_at_max', True)

        tokens = []
        i = 0
        current_length = initial_length
        text = text.replace(" ", "")  # Only literal spaces are removed

        while i < len(text):
            # Extract chunk; the final chunk may be shorter than current_length
            chunk = text[i:i + current_length]
            if chunk:  # Only add non-empty chunks
                tokens.append(chunk)

            # Move pointer past the chunk just taken
            i += current_length

            # Grow the chunk size by the ratio and round per rounding_mode
            next_length = current_length * ratio
            if rounding_mode == 'floor':
                next_length = int(next_length)
            elif rounding_mode == 'ceil':
                next_length = math.ceil(next_length)
            else:  # 'round' (and any unrecognised mode)
                next_length = round(next_length)

            # Apply maximum length constraint: restart the growth cycle or cap
            if next_length > max_chunk_length:
                current_length = initial_length if reset_at_max else max_chunk_length
            else:
                current_length = max(1, next_length)  # Ensure at least length 1

        return tokens

    # Apply tokenization to both premise and hypothesis
    premise_tokens = _golden_chunk_text(premise_hypothesis[0], **tokenization_args)
    hypothesis_tokens = _golden_chunk_text(premise_hypothesis[1], **tokenization_args)

    return premise_tokens, hypothesis_tokens

In [29]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertForSequenceClassification, RobertaModel, BertModel
import torch

def initialize_transformers():
    """
    Load the RoBERTa and BERT tokenizers and encoders used by this notebook.

    The device is "cuda" when torch reports CUDA as available, otherwise
    "cpu". Both models are moved to that device and switched to evaluation
    mode before being returned.

    Returns:
        tuple: (roberta_tokenizer, bert_tokenizer, roberta_model, bert_model, device)
    """
    # Pick the compute device
    use_gpu = torch.cuda.is_available()
    device = torch.device("cuda" if use_gpu else "cpu")
    print(f"Using device: {device}")

    # Pretrained checkpoint identifiers for each model family
    checkpoints = {"roberta": "roberta-base", "bert": "bert-base-uncased"}

    print("\nLoading tokenizers...")
    roberta_tokenizer = AutoTokenizer.from_pretrained(checkpoints["roberta"])
    bert_tokenizer = AutoTokenizer.from_pretrained(checkpoints["bert"])

    print("Loading models...")
    roberta_model = RobertaModel.from_pretrained(checkpoints["roberta"])
    roberta_model = roberta_model.to(device)
    bert_model = BertModel.from_pretrained(checkpoints["bert"])
    bert_model = bert_model.to(device)

    # Put both encoders into evaluation mode
    for encoder in (roberta_model, bert_model):
        encoder.eval()

    return roberta_tokenizer, bert_tokenizer, roberta_model, bert_model, device

def process_text_with_transformers(text: str, tokenizers, models, device):
    """
    Run one text through the three custom tokenizers and both transformer encoders.

    Args:
        text (str): Input text
        tokenizers (tuple): (roberta_tokenizer, bert_tokenizer)
        models (tuple): (roberta_model, bert_model)
        device: torch device the encoded inputs are moved to

    Returns:
        dict: Keys 'word_tokens', 'char_tokens' and 'golden_tokens' hold the
        first five tokens from each custom tokenizer; 'roberta_embeddings'
        and 'bert_embeddings' hold each model's ``last_hidden_state``.
    """
    roberta_tokenizer, bert_tokenizer = tokenizers
    roberta_model, bert_model = models

    def _last_hidden_state(tokenizer, model):
        # Encode the text, move it to the target device and run a forward pass.
        encoded = tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(device)
        return model(**encoded).last_hidden_state

    results = {}

    # The custom tokenizers expect a (premise, hypothesis) pair, so the text
    # is paired with an empty hypothesis and that half of the result is dropped.
    word_tokens, _ = custom_tokenization((text, ""))
    results['word_tokens'] = word_tokens[:5]

    char_tokens, _ = custom_char_tokenization((text, ""))
    results['char_tokens'] = char_tokens[:5]

    golden_tokens, _ = golden_chunk_tokenization(
        (text, ""),
        initial_length=2,
        ratio=1.618,
        max_chunk_length=8,
        rounding_mode='floor',
        reset_at_max=True
    )
    results['golden_tokens'] = golden_tokens[:5]

    # Inference only: no gradients are tracked for the forward passes.
    with torch.no_grad():
        results['roberta_embeddings'] = _last_hidden_state(roberta_tokenizer, roberta_model)
        results['bert_embeddings'] = _last_hidden_state(bert_tokenizer, bert_model)

    return results

# Demo entry point: load the models once, then push every corpus pair through
# the custom tokenizers and both transformer encoders.
if __name__ == "__main__":
    print("Initializing transformer models...")
    roberta_tokenizer, bert_tokenizer, roberta_model, bert_model, device = initialize_transformers()

    # Sample premise/hypothesis pairs
    corpus = create_test_corpus()

    print("\nProcessing corpus with all tokenization methods and transformers:")
    print("-" * 50)

    for i, (premise, hypothesis) in enumerate(corpus, 1):
        print(f"\nPair {i}:")
        print(f"Premise: {premise}")
        print(f"Hypothesis: {hypothesis}")

        # Each side of the pair is processed independently.
        premise_results = process_text_with_transformers(
            premise,
            (roberta_tokenizer, bert_tokenizer),
            (roberta_model, bert_model),
            device
        )
        hypothesis_results = process_text_with_transformers(
            hypothesis,
            (roberta_tokenizer, bert_tokenizer),
            (roberta_model, bert_model),
            device
        )

        # Same report layout for both sides of the pair.
        for heading, outcome in (
            ("\nPremise processing:", premise_results),
            ("\nHypothesis processing:", hypothesis_results),
        ):
            print(heading)
            print(f"Word tokens: {outcome['word_tokens']}")
            print(f"Char tokens: {outcome['char_tokens']}")
            print(f"Golden tokens: {outcome['golden_tokens']}")
            print(f"RoBERTa embedding shape: {outcome['roberta_embeddings'].shape}")
            print(f"BERT embedding shape: {outcome['bert_embeddings'].shape}")

ModuleNotFoundError: No module named 'transformers'