In [70]:
import spacy

In [71]:
# Load the spaCy 'en' model once at module level; load() uses this shared
# `nlp` object to parse the text it reads.
nlp = spacy.load('en')

In [72]:
def load(filename, encoding='utf-8'):
    """
    Reads a whole text file and parses it with the module-level `nlp` object.

    filename: path of the text file to read.
    encoding: text encoding used to decode the file. Defaults to UTF-8 so
        the result does not depend on the platform's locale encoding.

    Returns whatever `nlp` returns for the file's complete contents.
    """
    with open(filename, encoding=encoding) as f:
        raw = f.read()
    return nlp(raw)

In [73]:
def reduceAndLemmatize(doc, cutoff=-8.0):
    """
    Gets highly improbable words from the document (those whose `prob` is
    strictly less than the cutoff), and returns a list of those words'
    `lemma` values, in document order.

    doc: iterable of tokens exposing `prob` and `lemma` attributes.
    cutoff: threshold; only tokens with prob < cutoff are kept.
    """
    # One pass over doc, so single-pass iterables work as well as lists.
    return [w.lemma for w in doc if w.prob < cutoff]

In [74]:
def write(data, filename):
    """
    Saves `data` to `filename` as one line of text: each item is converted
    with str() and the items are separated by single spaces. An existing
    file is overwritten.
    """
    text = ' '.join(map(str, data))
    with open(filename, 'w') as out:
        out.write(text)

In [75]:
def renumber(lemmas):
    """
    Turns a list of numeric representations of lemmas like:
    1030500, 10300082, etc. into small consecutive integers (0, 1, 2, ...).

    IDs are handed out in order of first appearance, so the same input
    always produces the same output. Equal lemmas always map to the same ID.

    lemmas: iterable of hashable lemma values.

    Returns a list of ints, one per input item, in input order.
    """
    lookupTable = {}
    lemmasRenumbered = []
    for lemma in lemmas:
        # setdefault evaluates len(lookupTable) before inserting, so a new
        # lemma receives the next unused ID and a known one keeps its ID.
        lemmasRenumbered.append(lookupTable.setdefault(lemma, len(lookupTable)))
    return lemmasRenumbered

In [76]:
# Read the source text and parse it with load().
doc = load('garden-party.txt')

In [77]:
# Keep only the lemmas of tokens whose prob is below the default cutoff (-8.0).
lemmas = reduceAndLemmatize(doc)

In [79]:
# Map the lemma values onto small consecutive integers.
lemmasRenumbered = renumber(lemmas)

In [80]:
# Save the renumbered lemmas as a single space-separated line of text.
write(lemmasRenumbered, 'garden-lemmas-renumbered.txt')