In [15]:
%%bash
if [[ ! -d "stanford-corenlp-full-2015-12-09" ]]; then
    wget http://nlp.stanford.edu/software/stanford-corenlp-full-2015-12-09.zip
    unzip stanford-corenlp-full-2015-12-09.zip
fi

if [[ ! -d "stanford_corenlp_pywrapper" ]]; then
    git clone https://github.com/brendano/stanford_corenlp_pywrapper
    cd stanford_corenlp_pywrapper
    pip install .
    cd ..
fi

pip install zss nltk



In [16]:
import os
from nltk.corpus import stopwords, wordnet as wn
from zss import simple_distance, Node
# Java version >= "1.8"
from stanford_corenlp_pywrapper import CoreNLP

# Setup our jvm parser
proc = CoreNLP('parse', corenlp_jars=['stanford-corenlp-full-2015-12-09/*'])

In [28]:
proc.parse_doc("How do these parsers work anyway?")['sentences']

[{u'char_offsets': [[0, 3],
   [4, 6],
   [7, 12],
   [13, 20],
   [21, 25],
   [26, 32],
   [32, 33]],
  u'deps_basic': [[u'root', -1, 4],
   [u'det', 3, 2],
   [u'advmod', 4, 0],
   [u'aux', 4, 1],
   [u'nsubj', 4, 3],
   [u'advmod', 4, 5],
   [u'punct', 4, 6]],
  u'deps_cc': [[u'root', -1, 4],
   [u'det', 3, 2],
   [u'advmod', 4, 0],
   [u'aux', 4, 1],
   [u'nsubj', 4, 3],
   [u'advmod', 4, 5],
   [u'punct', 4, 6]],
  u'lemmas': [u'how', u'do', u'these', u'parser', u'work', u'anyway', u'?'],
  u'parse': u'(ROOT (SBARQ (WHADVP (WRB How)) (SQ (VBP do) (NP (DT these) (NNS parsers)) (VP (VBP work) (ADVP (RB anyway)))) (. ?)))',
  u'pos': [u'WRB', u'VBP', u'DT', u'NNS', u'VBP', u'RB', u'.'],
  u'tokens': [u'How', u'do', u'these', u'parsers', u'work', u'anyway', u'?']}]

In [17]:
def convert_tag(tag):
    '''
    Simplify many tags to four basic types: noun, verb, adjective, adverb
    '''
    if tag in ['NN', 'NNP', 'NNS', 'NNPS']:
        return 'NN'
    if tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        return 'VB'
    if tag in ['RB', 'RBR', 'RBS']:
        return 'RB'
    if tag in ['JJ', 'JJR', 'JJS']:
        return 'JJ'
    return tag

def pos_dep_tree(parsed):
    '''
    Convert a parse from Stanford dependency parser into a directional pos/relation zss tree
    '''
    root = Node('root')
    pos_nodes = { -1: root }
    for i, pos in enumerate(parsed['pos']):
        pos_nodes[i] = Node(convert_tag(parsed['pos'][i]))
    
    for dependency_edge in parsed['deps_cc']:
        relation, src_index, dest_index = dependency_edge
        relation_node = Node(relation)
        # NNS -> advmod -> VBP
        pos_nodes[src_index].addkid(relation_node)
        relation_node.addkid(pos_nodes[dest_index])
    return root

def sent_dep_tree(sent):
    return pos_dep_tree(proc.parse_doc(sent)['sentences'][0])

def dep_tree_similarity(dep1, dep2, smoothing=4.0):
    return smoothing / (smoothing + simple_distance(dep1, dep2))

def sentence_similarity(sent1, sent2, smoothing=4.0):
    return dep_tree_similarity(sent_dep_tree(sent1), sent_dep_tree(sent2), smoothing)

In [19]:
p1 = proc.parse_doc("How do these parsers work anyway?")['sentences'][0]
p2 = proc.parse_doc("How do these parsers work?")['sentences'][0]
p3 = proc.parse_doc("Some wild changed thing")['sentences'][0]

(simple_distance(pos_dep_tree(p1), pos_dep_tree(p2)),
 simple_distance(pos_dep_tree(p1), pos_dep_tree(p3)),
 simple_distance(pos_dep_tree(p2), pos_dep_tree(p3)))

(2, 13, 11)

In [20]:
(sentence_similarity("How do these parsers work anyway?", "How do these parsers work anyway?"),
 sentence_similarity("How do these parsers work anyway?", "How do these parsers work?"),
 sentence_similarity("How do these parsers work anyway?", "Some wild changed thing"))

(1.0, 0.6666666666666666, 0.23529411764705882)

In [21]:
def sentence_match_scores(sent, potential_sents, smoothing=4.0):
    tree = sent_dep_tree(sent)
    sent_trees = zip(potential_sents, map(sent_dep_tree, potential_sents))
    for check_sent, check_tree in sent_trees:
        yield check_sent, dep_tree_similarity(tree, check_tree, smoothing)

In [22]:
list(sentence_match_scores("How do these parsers work anyway?",
    ["How do these parsers work anyway?", "How do these parsers work?", "Some wild changed thing"]))

[('How do these parsers work anyway?', 1.0),
 ('How do these parsers work?', 0.6666666666666666),
 ('Some wild changed thing', 0.23529411764705882)]

In [23]:
from operator import itemgetter

STOP_WORDS = set(stopwords.words('english'))

def sentence_synset_expansion(sent):
    for word in sent.split():
        if word not in STOP_WORDS:
            # Take first example as a simplification
            yield ((syn.name(), syn.examples()[0])
                    for syn in wn.synsets(word) if syn.examples())
                
def sentence_synset_matches(sent):
    for syn_named_examples in map(tuple, sentence_synset_expansion(sent)):
        if not syn_named_examples:
            continue
        names, examples = zip(*syn_named_examples)
        # Very slow, we're repeating a lot of work on each synset example sentence
        scores = map(itemgetter(1), sentence_match_scores(sent, examples))
        total = sum(scores)
        for name, score in zip(names, scores):
            yield name, score / total

In [24]:
list(sentence_synset_matches("How do these parsers work anyway?"))

[(u'parser.n.01', 1.0),
 (u'work.n.01', 0.02404719589263042),
 (u'work.n.02', 0.01827586887839912),
 (u'employment.n.02', 0.035145901689229084),
 (u'study.n.02', 0.02404719589263042),
 (u'work.n.05', 0.028556045122498628),
 (u'workplace.n.01', 0.030459781463998536),
 (u'oeuvre.n.01', 0.028556045122498628),
 (u'work.v.01', 0.02404719589263042),
 (u'work.v.02', 0.057112090244997256),
 (u'work.v.03', 0.01986507486782513),
 (u'function.v.01', 0.01986507486782513),
 (u'work.v.05', 0.028556045122498628),
 (u'exercise.v.03', 0.03263548013999843),
 (u'make.v.36', 0.022844836097998904),
 (u'work.v.08', 0.030459781463998536),
 (u'work.v.09', 0.022844836097998904),
 (u'work.v.10', 0.028556045122498628),
 (u'bring.v.03', 0.03263548013999843),
 (u'work.v.12', 0.04153606563272528),
 (u'cultivate.v.02', 0.035145901689229084),
 (u'work.v.14', 0.035145901689229084),
 (u'influence.v.01', 0.02538315121999878),
 (u'work.v.16', 0.035145901689229084),
 (u'work.v.17', 0.03263548013999843),
 (u'work.v.18', 0.

In [25]:
from collections import defaultdict

def synset_counts(sentences):
    counts = defaultdict(float)
    for sent in sentences:
        for name, score in sentence_synset_matches(sent):
            counts[name] += score
    return counts

def top_k(counter, k):
    return sorted(counter.items(), key=itemgetter(1), reverse=True)[:k]

In [26]:
top_k(synset_counts([
    "How do these parsers work anyway?",
    "How do these parsers work anyway?",
    "How do these parsers work?",
    "Some wild changed thing"
]), 20)

[(u'parser.n.01', 3.0),
 (u'some.a.01', 0.24174121293230966),
 (u'some.s.04', 0.2197647390293724),
 (u'approximately.r.01', 0.2014510107769247),
 (u'some.s.02', 0.18595477917869974),
 (u'changed.s.03', 0.15857623202002766),
 (u'some.s.03', 0.15108825808269352),
 (u'thing.n.03', 0.13052873953703442),
 (u'changed.a.01', 0.11532816874183831),
 (u'work.v.02', 0.11422418048999451),
 (u'thing.n.11', 0.10877394961419533),
 (u'matter.n.01', 0.10877394961419533),
 (u'thing.n.01', 0.1004067227207957),
 (u'wild.s.04', 0.09847637083584403),
 (u'thing.n.04', 0.09323481395502457),
 (u'switch.v.03', 0.0906149897257301),
 (u'thing.n.07', 0.08701915969135626),
 (u'thing.n.10', 0.08701915969135626),
 (u'work.v.12', 0.08307213126545056),
 (u'solve.v.01', 0.08307213126545056)]

In [27]:
from nltk import Text, corpus

'''
Each of these sentence parse calls are slow
We can cache and optimize these queries to get solid performance improvements
And we can split these sentences into separate parser instances and paralellize the work
'''
emma = map(lambda s: ' '.join(s), Text(corpus.gutenberg.sents('austen-emma.txt')))[4:6]
top_k(synset_counts(emma), 20)

[(u'two.s.01', 1.0),
 (u'affectionate.s.01', 1.0),
 (u'affection.n.01', 1.0),
 (u'excellent.s.01', 1.0),
 (u'daughter.n.01', 1.0),
 (u'indistinct.a.01', 1.0),
 (u'beget.v.01', 0.6465923944749823),
 (u'mother.n.05', 0.5135599429266327),
 (u'consequence.n.03', 0.5034965034965034),
 (u'mother.v.01', 0.5013323252379033),
 (u'caress.v.01', 0.5),
 (u'caress.n.01', 0.5),
 (u'ago.s.01', 0.5),
 (u'ago.r.01', 0.5),
 (u'consequence.n.01', 0.49650349650349646),
 (u'mother.n.01', 0.49543429788216325),
 (u'indulgent.s.02', 0.33901988482016737),
 (u'sister.n.01', 0.33617021276595743),
 (u'sister.n.03', 0.33617021276595743),
 (u'indulgent.s.03', 0.33467347604042164)]

In [None]:
# If we load sc into our notebook we can now parallelize this without changing our code:

def load_sentences(pdf_name):
    # Assume this file has been downloaded -- we could pull from s3 here first or load from HDFS
    txt_name = pdf_file.replace('.pdf', '.txt')
    pdftotext = Popen(['pdftotext', pdf_name, txt_name], shell=False, stdout=PIPE)
    pdftotext.wait()
    
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with open(txt_name) as txt_file:
        return tokenizer.tokenize(txt_file.read())
    
# Collect some small identifier and pass into the spark distributed store
pdf_names = [f for f in os.listdir('./budgets/') if '.pdf' in f.lower()]
pdf_names_rdd = sc.parallelize(pdf_names)

# Do the transformation from file name into rows of sentences
pdf_sent_rdd = pdf_names_rdd.flatMap(load_sentences)
pdf_all_sense_rdd = pdf_sent_rdd.flatMap(lambda s: synset_counts(s).items())

# Aggregate each distributed count of synsets
adder = lambda a, b: a + b
pdf_sense_rdd = pdf_all_sense_rdd.aggregateByKey(0, adder, adder)
pdf_sense_rdd.take(5)
'''
(sense_1, global_count),
(sense_2, global_count),
(sense_3, global_count),
(sense_4, global_count),
(sense_5, global_count)
'''