In [1]:
import re

In [2]:
# Compiled once at import time so stem() does not go through the re module's
# pattern-cache lookup on every call (stem runs once per token).
_SUFFIX_RE = re.compile(r'(s|ing)$')


def stem(word):
    """Return a crude stem of ``word`` by stripping a trailing 's' or 'ing'.

    Only one suffix is removed. If stripping would leave nothing (the word
    is itself just a suffix), the word is returned unchanged so callers
    never receive an empty stem for a non-empty word.

    >>> stem('working')
    'work'
    >>> stem('works')
    'work'
    >>> stem('s')
    's'
    """
    stripped = _SUFFIX_RE.sub('', word)
    # Fall back to the original word rather than returning ''.
    return stripped or word

In [3]:
def tokenize(text):
    """Split text into lowercase, stemmed words, ignoring stop words.

    Words are maximal runs of ASCII letters. A word is dropped when either
    its lowercase form or its stem is in ``stop_words``. Checking the
    lowercase form first matters because stemming mangles stop words that
    end in 's' (e.g. 'this' -> 'thi'), which would otherwise slip past the
    filter.

    Returns a list of stems in the order the words appear in ``text``.
    """
    tokens = []
    for word in re.findall('[a-zA-Z]+', text):
        word = word.lower()
        # Filter on the surface form before stemming changes it.
        if word in stop_words:
            continue
        root = stem(word)
        # Skip empty stems and stems that are themselves stop words.
        if root and root not in stop_words:
            tokens.append(root)
    return tokens

In [4]:
# Words that tokenize() leaves out of its result; one row per initial letter.
stop_words = {
    'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am',
    'among', 'an', 'and', 'any', 'are', 'as', 'at',
    'be', 'because', 'been', 'but', 'by',
    'can', 'cannot', 'could',
    'dear', 'did', 'do', 'does',
    'either', 'else', 'ever', 'every',
    'for', 'from',
    'get', 'got',
    'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however',
    'i', 'if', 'in', 'into', 'is', 'it', 'its',
    'just',
    'least', 'let', 'like', 'likely',
    'may', 'me', 'might', 'most', 'must', 'my',
    'neither', 'no', 'nor', 'not',
    'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own',
    'rather',
    'said', 'say', 'says', 'she', 'should', 'since', 'so', 'some',
    'than', 'that', 'the', 'their', 'them', 'then', 'there', 'these', 'they',
    'this', 'tis', 'to', 'too', 'twas',
    'us',
    'wants', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while',
    'who', 'whom', 'why', 'will', 'with', 'would',
    'yet', 'you', 'your',
}


In [5]:
# Execute nlp.py and load its globals into the notebook namespace. The profiler
# output further down attributes stem/tokenize to nlp.py, so the definitions it
# loads are the ones being measured.
%run nlp.py

In [6]:
# Sample sentence used as the tokenizer/profiler input below.
# NOTE(review): "gerat" looks like a typo for "great"; it is echoed in the
# recorded tokenize(s) output, so correcting it requires re-running the cells.
s = 'We will encourage you to develop the three gerat virtues of a programmer: laziness, impatience, and hubris'

In [7]:
# Show the tokens produced for the sample sentence before profiling the call.
tokenize(s)

['encourage',
 'develop',
 'three',
 'gerat',
 'virtue',
 'programmer',
 'lazines',
 'impatience',
 'hubri']

In [8]:
# Profile one call. At this scale every timing in the report rounds to 0.000 s,
# so only the call counts are informative.
%prun tokenize(s)

 

         119 function calls in 0.000 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 {built-in method builtins.exec}
       17    0.000    0.000    0.000    0.000 {method 'sub' of 're.Pattern' objects}
        1    0.000    0.000    0.000    0.000 nlp.py:15(tokenize)
       18    0.000    0.000    0.000    0.000 __init__.py:272(_compile)
       17    0.000    0.000    0.000    0.000 __init__.py:178(sub)
        1    0.000    0.000    0.000    0.000 __init__.py:208(findall)
       17    0.000    0.000    0.000    0.000 nlp.py:4(stem)
        1    0.000    0.000    0.000    0.000 {method 'findall' of 're.Pattern' objects}
       18    0.000    0.000    0.000    0.000 {built-in method builtins.isinstance}
       17    0.000    0.000    0.000    0.000 {method 'lower' of 'str' objects}
        1    0.000    0.000    0.000    0.000 <string>:1(<module>)
        9    0.000    0.000    0

In [9]:
%%prun
# Repeat the call 10,000 times so per-function costs become large enough to
# show up in the profile.
for _ in range(10_000):
    tokenize(s)

 

         1160003 function calls in 0.541 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    10000    0.130    0.000    0.535    0.000 nlp.py:15(tokenize)
   170000    0.089    0.000    0.259    0.000 __init__.py:178(sub)
   180000    0.081    0.000    0.110    0.000 __init__.py:272(_compile)
   170000    0.066    0.000    0.066    0.000 {method 'sub' of 're.Pattern' objects}
   170000    0.063    0.000    0.322    0.000 nlp.py:4(stem)
    10000    0.044    0.000    0.044    0.000 {method 'findall' of 're.Pattern' objects}
   180000    0.029    0.000    0.029    0.000 {built-in method builtins.isinstance}
   170000    0.017    0.000    0.017    0.000 {method 'lower' of 'str' objects}
    90000    0.010    0.000    0.010    0.000 {method 'append' of 'list' objects}
        1    0.006    0.006    0.541    0.541 <string>:1(<module>)
    10000    0.006    0.000    0.056    0.000 __init__.py:208(findall)
        1    0.000    0.