In [61]:
from nltk.tokenize import TreebankWordTokenizer, TweetTokenizer, word_tokenize
import nltk
nltk.download('punkt_tab')
import spacy
from transformers import BertTokenizer
import timeit

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\FPTSHOP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


ModuleNotFoundError: No module named 'pandas'

In [23]:
# Load spaCy's small English pipeline. Note that calling nlp(text) runs every
# component of the loaded pipeline, not only its tokenizer.
nlp = spacy.load('en_core_web_sm')

In [24]:
# Tokenizer for the pretrained "bert-base-uncased" checkpoint.
bert = BertTokenizer.from_pretrained("bert-base-uncased")

In [25]:
# NLTK Treebank word tokenizer, created once and reused by comparison().
treebank = TreebankWordTokenizer()

In [26]:
# NLTK TweetTokenizer with its default settings, reused by comparison().
tweet = TweetTokenizer()

In [57]:
def comparison(text, number=1000):
    """Tokenize ``text`` with five tokenizers and print tokens and timings.

    For each tokenizer the function prints one line with the tokens it
    produced and the total time, in seconds, that ``number`` repeated
    tokenizations took (as measured by ``timeit.timeit``). A final line
    reports the smallest of those times and which tokenizer achieved it.

    Args:
        text: The string to tokenize.
        number: How many times each tokenizer is run for the timing
            measurement. Defaults to 1000.

    Returns:
        None. The results are printed.
    """
    # (label, separator, zero-argument tokenizer call). The separator is kept
    # per row because the report's spacing differs between tokenizers.
    candidates = [
        ("Word_Tokenize", " | time: ", lambda: word_tokenize(text)),
        ("TweetTokenizer", "  | time: ", lambda: tweet.tokenize(text)),
        ("TreebankWordTokenizer", "  | time: ", lambda: treebank.tokenize(text)),
        ("BertTokenizer", "  | time: ", lambda: bert.tokenize(text)),
        # nlp(text) runs the whole loaded spaCy pipeline, so this row times
        # more than tokenization alone.
        ("Spacy (en_core_web_sm)", " | time: ", lambda: [i.text for i in nlp(text)]),
    ]

    # Tokenize once for display, then measure; nothing is printed until every
    # tokenizer has been run and timed.
    token_lists = [run() for _, _, run in candidates]
    # Measure tokenization time
    timings = [timeit.timeit(run, number=number) for _, _, run in candidates]

    for (label, separator, _), tokens, seconds in zip(candidates, token_lists, timings):
        print(f"{label}: {tokens}{separator}{seconds}")

    # Pick the winner by index so the label can be reported with its time.
    fastest = min(range(len(timings)), key=timings.__getitem__)
    print(f"Fastest Time: {timings[fastest]} | tokenizer: {candidates[fastest][0]}")
    
    
    

In [58]:
# A single fused word with no whitespace or punctuation.
comparison("GoodMorning")

Word_Tokenize: ['GoodMorning'] | time: 0.03265140001894906
TweetTokenizer: ['GoodMorning']  | time: 0.020079600013559684
TreebankWordTokenizer: ['GoodMorning']  | time: 0.022092399973189458
BertTokenizer: ['good', '##mo', '##rn', '##ing']  | time: 0.10094299999764189
Spacy (en_core_web_sm): ['GoodMorning'] | time: 7.877144199999748
Fastest Time: 0.020079600013559684


In [59]:
# A short sentence containing a contraction and trailing punctuation.
comparison("I've never seen this before!")

Word_Tokenize: ['I', "'ve", 'never', 'seen', 'this', 'before', '!'] | time: 0.05664039999828674
TweetTokenizer: ["I've", 'never', 'seen', 'this', 'before', '!']  | time: 0.042168799991486594
TreebankWordTokenizer: ['I', "'ve", 'never', 'seen', 'this', 'before', '!']  | time: 0.028735400002915412
BertTokenizer: ['i', "'", 've', 'never', 'seen', 'this', 'before', '!']  | time: 0.1752991999965161
Spacy (en_core_web_sm): ['I', "'ve", 'never', 'seen', 'this', 'before', '!'] | time: 10.270850200002315
Fastest Time: 0.028735400002915412


In [60]:
# A longer passage with abbreviations, numbers, quotes, an em dash and a hyphenated term.
comparison("Dr. Smith bought 1,000 shares of Acme Corp. on Jan. 5th, 2023! He said: 'I'm optimistic about the company's growth—especially in AI and ML.' Meanwhile, competitors are struggling with COVID-19 related delays.")


Word_Tokenize: ['Dr.', 'Smith', 'bought', '1,000', 'shares', 'of', 'Acme', 'Corp.', 'on', 'Jan.', '5th', ',', '2023', '!', 'He', 'said', ':', "'", 'I', "'m", 'optimistic', 'about', 'the', 'company', "'s", 'growth—especially', 'in', 'AI', 'and', 'ML', '.', "'", 'Meanwhile', ',', 'competitors', 'are', 'struggling', 'with', 'COVID-19', 'related', 'delays', '.'] | time: 0.41654840001137927
TweetTokenizer: ['Dr', '.', 'Smith', 'bought', '1,000', 'shares', 'of', 'Acme', 'Corp', '.', 'on', 'Jan', '.', '5th', ',', '2023', '!', 'He', 'said', ':', "'", "I'm", 'optimistic', 'about', 'the', "company's", 'growth', '—', 'especially', 'in', 'AI', 'and', 'ML', '.', "'", 'Meanwhile', ',', 'competitors', 'are', 'struggling', 'with', 'COVID', '-', '19', 'related', 'delays', '.']  | time: 0.3306933999992907
TreebankWordTokenizer: ['Dr.', 'Smith', 'bought', '1,000', 'shares', 'of', 'Acme', 'Corp.', 'on', 'Jan.', '5th', ',', '2023', '!', 'He', 'said', ':', "'I", "'m", 'optimistic', 'about', 'the', 'company'

TreebankWordTokenizer is generally the fastest tokenizer (TweetTokenizer narrowly beats it on the one-word input), and spaCy is the slowest — much slower than the other tokenizers.

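spaCy is slow here because it does a lot more than just tokenize: calling `nlp(text)` runs the full pipeline (POS tagging, ...), so its timing is not a tokenizer-only measurement.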

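BERT's tokenizer lowercases the text and breaks words into subwords (e.g. `GoodMorning` -> `good`, `##mo`, `##rn`, `##ing`), which is important for deep learning (DL) models.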

TweetTokenizer works well for social media text; for example, it keeps contractions such as `I've` as a single token.