In [12]:
from typing import List

class CustomTokenizer:
    """
    N-gram tokenizer working on whitespace-separated words or on characters.

    The n-gram sizes are configured once at construction time and are used by
    both `tokenize` (word n-grams) and `tokenize_characters` (character n-grams).
    """

    def __init__(self, lengths: List[int] = [1]):
        """
        Initialize the tokenizer with desired token lengths.

        Args:
            lengths (List[int]): List of n-gram lengths to generate.
                                 For example, [1, 2, 3] generates unigrams, bigrams, and trigrams.
                                 Non-positive lengths are ignored when tokenizing.
        """
        # Store a private copy: the default list object is created once and
        # shared by every call, so keeping a reference to it (or to the
        # caller's list) would let a mutation on one tokenizer leak into others.
        self.lengths = list(lengths)

    def _ngrams(self, units, separator: str) -> List[str]:
        """
        Build n-grams over `units` for every configured length.

        Args:
            units: Sliceable sequence of string pieces (a list of words or a string).
            separator (str): Text placed between the pieces of one n-gram.

        Returns:
            List[str]: All n-grams, grouped by length in the order of `self.lengths`,
                       and left-to-right within each length.
        """
        tokens = []
        for length in self.lengths:
            if length <= 0:
                continue
            # range() is empty when the input is shorter than `length`,
            # so over-long n-gram sizes simply contribute nothing.
            tokens.extend(
                separator.join(units[start:start + length])
                for start in range(len(units) - length + 1)
            )
        return tokens

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenizes the input text into word n-grams of the configured lengths.

        Words are obtained with `str.split()` (any run of whitespace separates
        words); the words of an n-gram are joined by a single space.

        Args:
            text (str): Input text to tokenize.

        Returns:
            List[str]: List of tokens of specified lengths.
        """
        return self._ngrams(text.split(), ' ')

    def tokenize_characters(self, text: str) -> List[str]:
        """
        Tokenizes the input text into character n-grams of the configured lengths.

        Only the space character is stripped before windowing; other
        whitespace such as tabs or newlines is kept as ordinary characters.

        Args:
            text (str): Input text to tokenize.

        Returns:
            List[str]: List of character n-grams.
        """
        return self._ngrams(text.replace(" ", ""), "")


# Demo: run the tokenizer on one sentence, first by words and then by characters.
if __name__ == "__main__":
    sample_text = "Natural Language Inference is a task of NLI."

    # Word n-grams: unigrams, bigrams and trigrams
    tokenizer = CustomTokenizer(lengths=[1, 2, 3])
    word_level = tokenizer.tokenize(sample_text)
    print("Word-Level Tokens:", word_level)

    # Character n-grams: bigrams and trigrams (spaces are removed first)
    char_tokenizer = CustomTokenizer(lengths=[2, 3])
    char_level = char_tokenizer.tokenize_characters(sample_text)
    print("Character-Level Tokens:", char_level)


Word-Level Tokens: ['Natural', 'Language', 'Inference', 'is', 'a', 'task', 'of', 'NLI.', 'Natural Language', 'Language Inference', 'Inference is', 'is a', 'a task', 'task of', 'of NLI.', 'Natural Language Inference', 'Language Inference is', 'Inference is a', 'is a task', 'a task of', 'task of NLI.']
Character-Level Tokens: ['Na', 'at', 'tu', 'ur', 'ra', 'al', 'lL', 'La', 'an', 'ng', 'gu', 'ua', 'ag', 'ge', 'eI', 'In', 'nf', 'fe', 'er', 're', 'en', 'nc', 'ce', 'ei', 'is', 'sa', 'at', 'ta', 'as', 'sk', 'ko', 'of', 'fN', 'NL', 'LI', 'I.', 'Nat', 'atu', 'tur', 'ura', 'ral', 'alL', 'lLa', 'Lan', 'ang', 'ngu', 'gua', 'uag', 'age', 'geI', 'eIn', 'Inf', 'nfe', 'fer', 'ere', 'ren', 'enc', 'nce', 'cei', 'eis', 'isa', 'sat', 'ata', 'tas', 'ask', 'sko', 'kof', 'ofN', 'fNL', 'NLI', 'LI.']


In [13]:
# Build a small test corpus and run both tokenizers over every sentence in it

def create_test_corpus():
    """
    Build the fixed demo corpus used to exercise the tokenizers.

    A new list is created on every call, so callers may modify the result.

    Returns:
        List[str]: Five short English sentences, always in the same order.
    """
    sentences = (
        "Natural language processing is very cool.",
        "I do need to study a lot more.",
        "My neural network is not working.",
        "Garbage in, garbage out.",
        "Please help me before the deadline.",
    )
    return list(sentences)

# Load the demo sentences
corpus = create_test_corpus()

# Word tokenizer emits unigrams, bigrams and trigrams;
# character tokenizer emits character bigrams and trigrams.
word_tokenizer = CustomTokenizer(lengths=[1, 2, 3])
char_tokenizer = CustomTokenizer(lengths=[2, 3])

# Report a short preview and the total token count for every sentence
print("\nProcessing corpus:")
print("-" * 50)

for i, text in enumerate(corpus, 1):
    print(f"\nText {i}: {text}")

    # Word-level tokenization
    # (labels below are plain strings: an f-prefix without placeholders does nothing)
    word_tokens = word_tokenizer.tokenize(text)
    print("\nWord-level tokens (first 5):", word_tokens[:5])
    print("Total word-level tokens:", len(word_tokens))

    # Character-level tokenization
    char_tokens = char_tokenizer.tokenize_characters(text)
    print("Character-level tokens (first 5):", char_tokens[:5])
    print("Total character-level tokens:", len(char_tokens))


Processing corpus:
--------------------------------------------------

Text 1: Natural language processing is very cool.

Word-level tokens (first 5): ['Natural', 'language', 'processing', 'is', 'very']
Total word-level tokens: 15
Character-level tokens (first 5): ['Na', 'at', 'tu', 'ur', 'ra']
Total character-level tokens: 69

Text 2: I do need to study a lot more.

Word-level tokens (first 5): ['I', 'do', 'need', 'to', 'study']
Total word-level tokens: 21
Character-level tokens (first 5): ['Id', 'do', 'on', 'ne', 'ee']
Total character-level tokens: 43

Text 3: My neural network is not working.

Word-level tokens (first 5): ['My', 'neural', 'network', 'is', 'not']
Total word-level tokens: 15
Character-level tokens (first 5): ['My', 'yn', 'ne', 'eu', 'ur']
Total character-level tokens: 53

Text 4: Garbage in, garbage out.

Word-level tokens (first 5): ['Garbage', 'in,', 'garbage', 'out.', 'Garbage in,']
Total word-level tokens: 9
Character-level tokens (first 5): ['Ga', 'ar', 'rb', 'b

Next, combine the custom tokenizer with the pretrained RoBERTa and BERT models.



In [14]:
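# Uncomment to install the dependencies into the running kernel's environment.
# The following cell raises ModuleNotFoundError if `transformers` is not installed.
# %pip install transformers
# %pip install torch
# %pip install sentencepiece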

In [15]:
# Hugging Face Transformers and PyTorch imports (these belong in the notebook's first import cell)
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertForSequenceClassification, RobertaModel, BertModel
import torch

# Run on the GPU when PyTorch can see one, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Names of the pretrained checkpoints to load
roberta_model_name = "roberta-base"
bert_model_name = "bert-base-uncased"

# Tokenizers matching each checkpoint
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_name)
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

# Load each model and move it to the selected device
roberta_model = RobertaModel.from_pretrained(roberta_model_name).to(device)
bert_model = BertModel.from_pretrained(bert_model_name).to(device)

class CustomTokenizerWithTransformers(CustomTokenizer):
    """
    CustomTokenizer that additionally holds a transformer tokenizer/model pair
    and can return the model's last hidden state for a piece of text.
    """

    def __init__(self, lengths: List[int] = [1], transformer_tokenizer=None, transformer_model=None):
        """
        Args:
            lengths (List[int]): N-gram lengths forwarded to CustomTokenizer.
            transformer_tokenizer: Callable tokenizer used to encode text for the model
                                   (optional; required only for embeddings).
            transformer_model: Model called with the encoded inputs
                               (optional; required only for embeddings).
        """
        # Forward a copy so this instance never aliases the shared default
        # list object or the list owned by the caller.
        super().__init__(list(lengths))
        self.transformer_tokenizer = transformer_tokenizer
        self.transformer_model = transformer_model

    def get_transformer_embeddings(self, text: str):
        """
        Get embeddings from transformer model for the input text.

        Args:
            text (str): Input text

        Returns:
            torch.Tensor: The `last_hidden_state` of the model output.

        Raises:
            ValueError: If the transformer tokenizer or model was not provided.
        """
        if self.transformer_tokenizer is None or self.transformer_model is None:
            raise ValueError("Transformer tokenizer and model must be set")

        # Tokenize text (truncated to at most 512 tokens)
        inputs = self.transformer_tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )

        # Place the inputs on the device that holds the model's weights rather
        # than on a notebook-global `device`, so the method works for whatever
        # model instance was injected. A model without parameters leaves the
        # inputs where the tokenizer created them.
        first_param = next(self.transformer_model.parameters(), None)
        if first_param is not None:
            inputs = inputs.to(first_param.device)

        # Inference only: no gradients are needed
        with torch.no_grad():
            outputs = self.transformer_model(**inputs)

        return outputs.last_hidden_state

# Demo: custom n-gram tokenization side by side with RoBERTa and BERT embeddings
if __name__ == "__main__":
    sample_text = "Natural Language Inference is a task of NLI."

    # One wrapper per transformer, each producing word/character 1- to 3-grams
    roberta_custom_tokenizer = CustomTokenizerWithTransformers(
        lengths=[1, 2, 3],
        transformer_tokenizer=roberta_tokenizer,
        transformer_model=roberta_model,
    )
    bert_custom_tokenizer = CustomTokenizerWithTransformers(
        lengths=[1, 2, 3],
        transformer_tokenizer=bert_tokenizer,
        transformer_model=bert_model,
    )

    # N-gram tokens for the sample sentence (these do not depend on the transformer)
    print("\nCustom Word-Level Tokens:", roberta_custom_tokenizer.tokenize(sample_text))
    print("\nCustom Character-Level Tokens:", roberta_custom_tokenizer.tokenize_characters(sample_text))

    # Embeddings for the sample sentence from each model
    print("\nGetting RoBERTa embeddings...")
    roberta_embeddings = roberta_custom_tokenizer.get_transformer_embeddings(sample_text)
    print(f"RoBERTa embedding shape: {roberta_embeddings.shape}")

    print("\nGetting BERT embeddings...")
    bert_embeddings = bert_custom_tokenizer.get_transformer_embeddings(sample_text)
    print(f"BERT embedding shape: {bert_embeddings.shape}")

    # Repeat for every sentence of the demo corpus
    corpus = create_test_corpus()
    print("\nProcessing corpus with both custom tokens and transformer embeddings:")
    print("-" * 50)

    for i, text in enumerate(corpus, start=1):
        print(f"\nText {i}: {text}")

        # Compute everything for this sentence before reporting on it
        word_tokens = roberta_custom_tokenizer.tokenize(text)
        char_tokens = roberta_custom_tokenizer.tokenize_characters(text)
        roberta_emb = roberta_custom_tokenizer.get_transformer_embeddings(text)
        bert_emb = bert_custom_tokenizer.get_transformer_embeddings(text)

        print(f"Word tokens count: {len(word_tokens)}")
        print(f"Char tokens count: {len(char_tokens)}")
        print(f"RoBERTa embedding shape: {roberta_emb.shape}")
        print(f"BERT embedding shape: {bert_emb.shape}")

ModuleNotFoundError: No module named 'transformers'