In [3]:
import nltk
import spacy

# Define test text

In [5]:
# Single-sentence sample used by every tokenizer below. It contains dotted
# abbreviations ("U.S.", "T."), a contraction ("doesn't"), a hyphenated word
# and commas. The triple-quoted literal begins and ends with a newline.
text = '''
The U.S. is one of the few industrialized nations that doesn't have a higher standard of regulation for the smooth, needle-like fibers such as crocidolite that are classified as amphobiles, according to Brooke T. Mossman, a professor of pathlogy at the University of Vermont College of Medicine.
'''

# Tokenizers in NLTK

## TreebankWordTokenizer
Uses regular expressions to tokenize text as in Penn Treebank.

In [8]:
from nltk.tokenize import TreebankWordTokenizer 
  
# Tokenize the sample with the Treebank tokenizer and show the tokens
# separated by '|' so the token boundaries are visible.
tokenizer = TreebankWordTokenizer()
print(*tokenizer.tokenize(text), sep='|')

The|U.S.|is|one|of|the|few|industrialized|nations|that|does|n't|have|a|higher|standard|of|regulation|for|the|smooth|,|needle-like|fibers|such|as|crocidolite|that|are|classified|as|amphobiles|,|according|to|Brooke|T.|Mossman|,|a|professor|of|pathlogy|at|the|University|of|Vermont|College|of|Medicine|.


## WordPunctTokenizer
Tokenizes a text into a sequence of alphabetic and non-alphabetic character runs, using the regexp `\w+|[^\w\s]+`.

In [10]:
from nltk.tokenize import WordPunctTokenizer 

# Tokenize the sample with WordPunctTokenizer and show the tokens
# separated by '|' so the token boundaries are visible.
tokenizer = WordPunctTokenizer()
print(*tokenizer.tokenize(text), sep='|')

The|U|.|S|.|is|one|of|the|few|industrialized|nations|that|doesn|'|t|have|a|higher|standard|of|regulation|for|the|smooth|,|needle|-|like|fibers|such|as|crocidolite|that|are|classified|as|amphobiles|,|according|to|Brooke|T|.|Mossman|,|a|professor|of|pathlogy|at|the|University|of|Vermont|College|of|Medicine|.


## word_tokenize
NLTK's recommended word tokenizer: currently an improved TreebankWordTokenizer applied together with PunktSentenceTokenizer for the specified language.

In [43]:
from nltk.tokenize import word_tokenize 

# Show the word_tokenize result with '|' between tokens.
print(*word_tokenize(text), sep='|')

The|U.S.|is|one|of|the|few|industrialized|nations|that|does|n't|have|a|higher|standard|of|regulation|for|the|smooth|,|needle-like|fibers|such|as|crocidolite|that|are|classified|as|amphobiles|,|according|to|Brooke|T.|Mossman|,|a|professor|of|pathlogy|at|the|University|of|Vermont|College|of|Medicine|.


# spaCy

In [29]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
# NOTE(review): Tokenizer(nlp.vocab) is constructed from the vocabulary alone,
# with no prefix/suffix/infix rules passed in. The recorded output shows
# punctuation left attached ("smooth,", "Medicine.") and "doesn't" unsplit, so
# this looks like whitespace-only splitting -- confirm whether `nlp.tokenizer`
# (the English pipeline's own tokenizer) was the intended comparison.
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

print(*(token.text for token in tokenizer(text)), sep='|')


|The|U.S.|is|one|of|the|few|industrialized|nations|that|doesn't|have|a|higher|standard|of|regulation|for|the|smooth,|needle-like|fibers|such|as|crocidolite|that|are|classified|as|amphobiles,|according|to|Brooke|T.|Mossman,|a|professor|of|pathlogy|at|the|University|of|Vermont|College|of|Medicine.|



# Designing my own regex tokenizer

In [40]:
import re

# Known abbreviations, stored WITHOUT their trailing period: check_abbr compares
# the text captured before a token's final "." against these entries.
# NOTE(review): the period pattern in check_abbr captures a stem that cannot
# contain a dot, so the 'U.S' entry does not appear to be reachable ("U.S." is
# kept whole only because it fails to match that pattern) -- confirm.
abbr_list = ['U.S', 'T']

def preprocess(s):
    """Prepare raw text for whitespace splitting.

    Leading/trailing whitespace is stripped, then a fixed sequence of regex
    substitutions is applied in order: contractions and possessives get a
    space inserted before the clitic, "cannot" is split in two, and double
    quotes around words are rewritten to the Treebank-style `` and '' marks.

    Parameters
    ----------
    s : str
        Raw input text.

    Returns
    -------
    str
        The rewritten text; it may contain doubled or trailing spaces, which a
        later ``str.split()`` absorbs.
    """
    # (pattern, replacement) pairs; the order is significant because each
    # substitution runs on the output of the previous one.
    rewrites = (
        (r"n't", " n't"),                # e.g. won't -> wo n't
        (r"'s", " 's"),                  # e.g. Bob's -> Bob 's
        (r"'re", " 're"),                # e.g. we're -> we 're
        (r"cannot", "can not"),
        (r'\s"(\S+)\s', r' `` \1 '),     # opening quote after whitespace
        (r'^"(\S+)\s', r'`` \1 '),       # opening quote at start of text
        (r'\s(\S+)"', r" \1 '' "),       # closing quote after a word
    )
    normalized = s.strip()
    for pattern, replacement in rewrites:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized

def check_abbr(s, abbreviations=None):
    """Split trailing punctuation off a single whitespace-delimited token.

    Parameters
    ----------
    s : str
        One token, as produced by ``str.split()``.
    abbreviations : collection of str, optional
        Known abbreviations written without their final period (e.g. ``'T'``
        for ``'T.'``). Defaults to the module-level ``abbr_list``.

    Returns
    -------
    list of str
        ``[word, punct]`` when the token ends in exactly one of
        ``, ? ! : ; '``; ``[word, '.']`` when it is a plain or hyphenated word
        followed by a period and the word is not a known abbreviation;
        otherwise ``[s]`` unchanged.
    """
    # Token ends in one punctuation mark other than ".". The stem may contain
    # word characters, "+", "." and "-" so that hyphenated words such as
    # "needle-like," are split as well.
    trailing = re.match(r'''^([\w+.\-]+)([,?!:;'])$''', s)
    if trailing:
        return list(trailing.groups())
    # Token is a word followed by one final period. The period is matched
    # literally: the previous pattern had an unescaped ".?" before it, which
    # consumed an arbitrary character and dropped it from the result
    # (e.g. "etc)." came back as ['etc', '.']). Tokens with internal dots such
    # as "U.S." do not match here and are returned whole below.
    final_period = re.match(r'^([\w\-]+)\.$', s)
    if final_period:
        stem = final_period.group(1)
        known = abbr_list if abbreviations is None else abbreviations
        if stem in known:
            # Known abbreviation: the period belongs to the token.
            return [s]
        return [stem, '.']
    return [s]

def tokenizer(s):
    """Tokenize text with the regex rules defined in this notebook.

    The text is passed through ``preprocess``, split on whitespace, and each
    resulting chunk is handed to ``check_abbr``, which may return one or two
    tokens for it.

    Parameters
    ----------
    s : str
        Raw input text.

    Returns
    -------
    list of str
        All non-empty tokens, in order of appearance.
    """
    pieces = []
    for chunk in preprocess(s).split():
        pieces.extend(check_abbr(chunk))
    # Discard any empty strings.
    return list(filter(None, pieces))

In [41]:
# Show the custom tokenizer's result with '|' between tokens.
print(*tokenizer(text), sep='|')

The|U.S.|is|one|of|the|few|industrialized|nations|that|does|n't|have|a|higher|standard|of|regulation|for|the|smooth|,|needle-like|fibers|such|as|crocidolite|that|are|classified|as|amphobiles|,|according|to|Brooke|T.|Mossman|,|a|professor|of|pathlogy|at|the|University|of|Vermont|College|of|Medicine|.
