Basic Tokenizer Project

A Tokenizer is a function that breaks up a string of text into smaller components, such as words or subwords. Here's a basic Python implementation of a tokenizer that splits a sentence based on spaces, punctuation, and special characters.

In [27]:
import re
import numpy as np
from keras.preprocessing.text import Tokenizer
def basic_tokenizer(text):
    # Define a regular expression pattern for splitting on non-word characters (punctuation, spaces, etc.)
    tokens = re.findall(r'\b\w+\b', text.lower())
    return tokens

# Test with some data
data = [
    "Hello, world!",
    "This is a basic tokenizer example.",
    "Tokenize, split, and analyze data easily!",
    "It supports numbers like 123 and symbols like @ and #."
    "Hello, world! This is a test of the basic tokenizer"
    "Tokenizing can be quite useful in natural language processing (NLP)"
    "Let's see how this works on different sentences"
    "How about some complex sentences, such as this one, with commas, periods, and hyphens?"
    "12345 are numbers,Don’t forget about special characters: @#$%&*"

]

# Apply tokenizer to each sentence
for sentence in data:
    tokens = basic_tokenizer(sentence)
    print(f"Original: {sentence}")
    print(f"Tokens: {tokens}\n")


Original: Hello, world!
Tokens: ['hello', 'world']

Original: This is a basic tokenizer example.
Tokens: ['this', 'is', 'a', 'basic', 'tokenizer', 'example']

Original: Tokenize, split, and analyze data easily!
Tokens: ['tokenize', 'split', 'and', 'analyze', 'data', 'easily']

Original: It supports numbers like 123 and symbols like @ and #.Hello, world! This is a test of the basic tokenizerTokenizing can be quite useful in natural language processing (NLP)Let's see how this works on different sentencesHow about some complex sentences, such as this one, with commas, periods, and hyphens?12345 are numbers,Don’t forget about special characters: @#$%&*
Tokens: ['it', 'supports', 'numbers', 'like', '123', 'and', 'symbols', 'like', 'and', 'hello', 'world', 'this', 'is', 'a', 'test', 'of', 'the', 'basic', 'tokenizertokenizing', 'can', 'be', 'quite', 'useful', 'in', 'natural', 'language', 'processing', 'nlp', 'let', 's', 'see', 'how', 'this', 'works', 'on', 'different', 'sentenceshow', 'about'

Verify Access to the GPU

In [31]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


Num GPUs Available:  0


Load Data

In [32]:
import re

# Tokenizer function
def basic_tokenizer(text):
    """Tokenizes input text by splitting on words and non-whitespace characters."""
    tokens = re.findall(r'\w+|\S', text)
    return tokens

# Function to load data from a file
def load_data(file_path):
    """
    Reads text data from the specified file.
    
    :param file_path: Path to the text file to read.
    :return: The content of the file as a string.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        print(f"FileNotFoundError: The file '{file_path}' was not found.")
    except IOError:
        print(f"IOError: An error occurred while trying to read the file '{file_path}'.")

# Main execution
if __name__ == "__main__":
    # Specify the path to your dataset
    file_path = 'data/sample_text.txt'  # Path to your sample text file

    # Load the data
    text_data = load_data(file_path)
    
    # Check if data was loaded successfully
    if text_data:
        # Tokenize the loaded data
        tokens = basic_tokenizer(text_data)
        # Print the tokens
        print(tokens)


FileNotFoundError: The file 'data/sample_text.txt' was not found.
