In [1]:
import nltk

In [44]:
from nltk.corpus import gutenberg

In [47]:
# Load the word tokens of 'austen-emma.txt' from NLTK's Gutenberg corpus.
# NOTE(review): assumes the corpus data is already installed locally
# (e.g. via nltk.download('gutenberg')) -- no download cell is visible here.
emmaTokens = gutenberg.words('austen-emma.txt')

In [164]:
# Restrict to purely alphabetic tokens: punctuation and tokens containing
# digits are dropped. This rebinds emmaTokens to a plain list, so the
# unfiltered token sequence is no longer reachable under this name.
emmaTokens = [w for w in emmaTokens if w.isalpha()] # Restrict to alphabetic words

In [165]:
# Part-of-speech tag every token, giving a list of (word, tag) pairs.
# NOTE(review): the tagger runs on the punctuation-stripped token stream, so
# sentence boundaries are gone and consecutive tokens may come from different
# sentences -- confirm this is intended.
emmaTags = nltk.pos_tag(emmaTokens)

In [190]:
def findPattern(needles, haystack):
    """
    Find every run of consecutive tagged tokens that matches a pattern.

    Input:
     - POS-tagged tokens "haystack", [("a", "DT"), ("list", "NN") ... ]
     - List of words/POSes, "needles", e.g. ["a", "very", "JJ", "man"].
       A needle matches a token when it equals either the token's word or
       the token's POS tag (exact, case-sensitive comparison).

    Output:
     - a list of matches, each match being a list of (word, tag) tuples, e.g.
       [[('a', 'DT'), ('comfortable', 'JJ'), ('home', 'NN')], ...]
       An empty pattern, or a pattern longer than the haystack, yields [].
    """
    width = len(needles)
    if width == 0:
        # An empty pattern matches nothing rather than every position.
        return []

    matches = []
    # There are len(haystack) - width + 1 windows of this width; the "+ 1"
    # makes sure the window ending on the very last token is examined too.
    for start in range(len(haystack) - width + 1):
        # The n-gram of tagged tokens beginning at this position.
        window = list(haystack[start:start + width])
        # Compare needles and tokens pairwise; every position must match.
        if all(needle == word or needle == pos
               for needle, (word, pos) in zip(needles, window)):
            matches.append(window)
    return matches

def getPatternStats(patterns):
    """
    Count how often each match occurs.

    Input: the list of matches returned by findPattern(); each match is
    turned into its str() form so it can be used as a counting key.

    Output: a list of (match-as-string, count) pairs, most frequent first.
    """
    frequencies = nltk.FreqDist(str(match) for match in patterns)
    return frequencies.most_common()
        

In [191]:
# Collect every trigram made of the literal word 'the', then a token tagged
# JJS (superlative adjective), then a token tagged NN (singular noun).
# The word comparison is case-sensitive, so a capitalised 'The' is not matched.
pats = findPattern(['the', 'JJS', 'NN'], emmaTags)
pats

[[('the', 'DT'), ('greatest', 'JJS'), ('amusement', 'NN')],
 [('the', 'DT'), ('pleasantest', 'JJS'), ('proof', 'NN')],
 [('the', 'DT'), ('greatest', 'JJS'), ('pleasure', 'NN')],
 [('the', 'DT'), ('fairest', 'JJS'), ('way', 'NN')],
 [('the', 'DT'), ('earnest', 'JJS'), ('pressing', 'NN')],
 [('the', 'DT'), ('best', 'JJS'), ('size', 'NN')],
 [('the', 'DT'), ('smallest', 'JJS'), ('intermission', 'NN')],
 [('the', 'DT'), ('least', 'JJS'), ('suspecting', 'NN')],
 [('the', 'DT'), ('best', 'JJS'), ('judge', 'NN')],
 [('the', 'DT'), ('best', 'JJS'), ('friend', 'NN')],
 [('the', 'DT'), ('smallest', 'JJS'), ('doubt', 'NN')],
 [('the', 'DT'), ('luckiest', 'JJS'), ('woman', 'NN')],
 [('the', 'DT'), ('best', 'JJS'), ('antidote', 'NN')],
 [('the', 'DT'), ('completest', 'JJS'), ('proof', 'NN')],
 [('the', 'DT'), ('best', 'JJS'), ('charade', 'NN')],
 [('the', 'DT'), ('tenderest', 'JJS'), ('spirit', 'NN')],
 [('the', 'DT'), ('handsomest', 'JJS'), ('Henry', 'NN')],
 [('the', 'DT'), ('smallest', 'JJS'), (

In [192]:
# Tally the matched trigrams and list them from most to least frequent.
getPatternStats(pats)

[("[('the', 'DT'), ('smallest', 'JJS'), ('doubt', 'NN')]", 4),
 ("[('the', 'DT'), ('least', 'JJS'), ('idea', 'NN')]", 4),
 ("[('the', 'DT'), ('greatest', 'JJS'), ('pleasure', 'NN')]", 3),
 ("[('the', 'DT'), ('best', 'JJS'), ('judge', 'NN')]", 2),
 ("[('the', 'DT'), ('best', 'JJS'), ('friend', 'NN')]", 2),
 ("[('the', 'DT'), ('greatest', 'JJS'), ('service', 'NN')]", 2),
 ("[('the', 'DT'), ('smallest', 'JJS'), ('difficulty', 'NN')]", 2),
 ("[('the', 'DT'), ('best', 'JJS'), ('society', 'NN')]", 2),
 ("[('the', 'DT'), ('highest', 'JJS'), ('promise', 'NN')]", 2),
 ("[('the', 'DT'), ('greatest', 'JJS'), ('satisfaction', 'NN')]", 2),
 ("[('the', 'DT'), ('warmest', 'JJS'), ('friendship', 'NN')]", 2),
 ("[('the', 'DT'), ('greatest', 'JJS'), ('amusement', 'NN')]", 1),
 ("[('the', 'DT'), ('pleasantest', 'JJS'), ('proof', 'NN')]", 1),
 ("[('the', 'DT'), ('fairest', 'JJS'), ('way', 'NN')]", 1),
 ("[('the', 'DT'), ('earnest', 'JJS'), ('pressing', 'NN')]", 1),
 ("[('the', 'DT'), ('best', 'JJS'), ('si