In [107]:
import spacy
from collections import Counter
import textacy

In [71]:
# Load the spaCy pipeline once at module level; load() calls this `nlp` object.
# NOTE(review): the bare 'en' shortcut is presumably only accepted by older
# spaCy releases (newer ones expect a full package name such as
# 'en_core_web_sm') -- confirm against the installed spaCy version.
nlp = spacy.load('en')

In [72]:
def load(filename, encoding='utf-8'): 
    """ 
    Reads a whole text file and runs it through the module-level `nlp` 
    pipeline. 

    filename: path of the text file to read. 
    encoding: text encoding used to decode the file. Defaults to UTF-8 so 
        that the result does not depend on the platform's locale encoding 
        (the source text contains non-ASCII punctuation). 

    Returns whatever `nlp` returns for the file's raw text. 
    """
    with open(filename, encoding=encoding) as f:
        raw = f.read()
    return nlp(raw)

In [129]:
def reduceAndLemmatize(doc, cutoff=-8.0, filterImprobable=False): 
    """ 
    Returns a list with the `lemma` attribute (a numeric representation of 
    the lemma) of the words in the document, in document order. 

    By default every word is kept and `cutoff` is ignored; this is what the 
    function did while its probability filter was commented out, so existing 
    calls keep their results. 

    Pass filterImprobable=True to keep only highly improbable words, i.e. 
    words whose `prob` attribute is less than `cutoff`. 
    """
    if filterImprobable:
        # Strictly less than: a word whose prob equals the cutoff is dropped.
        words = [w for w in doc if w.prob < cutoff]
    else:
        words = doc
    return [w.lemma for w in words]

In [74]:
def write(data, filename): 
    """ 
    Saves the items of `data` to `filename` as a single run of text: each 
    item is converted with str() and items are separated by one space. 
    Any existing file at that path is overwritten. 
    """
    # Build the full string before opening, so the file is only touched
    # once every item has been converted successfully.
    pieces = map(str, data)
    text = ' '.join(pieces)
    with open(filename, 'w') as outFile:
        outFile.write(text)

In [75]:
def renumber(lemmas): 
    """ 
    Turns a list of numeric representations of lemmas like: 
    1030500, 10300082, etc. into manageable numbers like 0, 1, 2. 

    Each distinct lemma is given the next unused integer, starting at 0, 
    the first time it appears; later occurrences reuse that number. The 
    numbering therefore depends only on the input order (not on set 
    iteration order), so repeated runs produce the same output. 

    Returns a new list with one number per input item. `lemmas` is walked 
    only once, so one-shot iterators are handled as well as lists. 
    """
    lookupTable = {}
    lemmasRenumbered = []
    for lemma in lemmas:
        # len() is evaluated before the insert, so unseen lemmas get
        # 0, 1, 2, ...; setdefault returns the stored number either way.
        lemmasRenumbered.append(lookupTable.setdefault(lemma, len(lookupTable)))
    return lemmasRenumbered

In [76]:
# Parse the source text with load(); the path is relative to the working directory.
doc = load('garden-party.txt')

In [130]:
# Collect the lemma values for the parsed document (function defaults used).
lemmas = reduceAndLemmatize(doc)

In [131]:
# Replace the large lemma values with small integers, one per distinct lemma.
lemmasRenumbered = renumber(lemmas)

In [80]:
# Save the renumbered lemmas as space-separated text.
# NOTE(review): this cell's execution count (80) is lower than the counts of
# the cells that build lemmas/lemmasRenumbered (130-131), so the file on disk
# may not reflect the current values -- re-run this cell to refresh it.
write(lemmasRenumbered, 'garden-lemmas-renumbered.txt')

In [132]:
# Wrap the already-parsed document in a textacy Doc; its to_bag_of_terms()
# method is used for the bigram counts.
# NOTE(review): textacy.doc.Doc is presumably specific to older textacy
# releases -- confirm against the installed version.
tdoc = textacy.doc.Doc(doc)

In [142]:
# Bag of two-word terms mapped to their counts: keys are lowercased strings,
# not lemmatized, and named entities are not added as extra terms.
bigrams = tdoc.to_bag_of_terms(
    ngrams=2,
    lemmatize=False,
    as_strings=True,
    lowercase=True,
    named_entities=False,
)

In [143]:
# List every bigram with its count, most frequent first; the sort is stable,
# so terms with equal counts stay in the dictionary's own iteration order.
for term, count in sorted(bigrams.items(), key=lambda pair: pair[1], reverse=True):
    print(term, count)

mrs. sheridan 13
said laura 11
can't 7
godber's 7
's man 6
mother's 5
won't 5
tall fellow 5
let's 5
said laurie 4
little cottages 4
's voice 3
laura's 3
said mrs. 3
wee -ary 3
cream puffs 3
poor woman 3
turned round 3
n't understand 3
dear child 2
n't possibly 2
good morning 2
nice workmen 2
laura took 2
's left 2
n't hear 2
sweet hat 2
green baize 2
baize door 2
man's 2
'm sure 2
n't know 2
laura came 2
miss laura 2
said faintly 2
tell mother 2
miss jose 2
jose's 2
's face 2
tear–a sigh 2
chan -ges 2
said practical 2
practical jose 2
said cook 2
man killed 2
broad road 2
laura turned 2
big hat 2
velvet ribbon 2
laura said 2
mr. sheridan 2
n't want 2
arum lilies 2
mrs. scott 2
door opened 2
little woman 2
young lady 2
'll excuse 2
em's 2
's sister 2
perfect day 1
light gold 1
early summer 1
dark flat 1
flat rosettes 1
daisy plants 1
help feeling 1
impress people 1
literally hundreds 1
single night 1
green bushes 1
bushes bowed 1
men came 1
use asking 1
'm determined 1
honoured guest 1
