In [1]:
from nltk.corpus import gutenberg, reuters, brown
from nltk.tree import Tree
import nltk
import html
import spacy
import bllipparser
import pandas as pd
import graphviz
import os
import time
from dask import delayed

In [2]:
# Load the BLLIP reranking parser using the 'WSJ+Gigaword-v2' model; verbose=True
# makes it report the model directory (see the recorded output below this cell).
rrp = bllipparser.RerankingParser.fetch_and_load('WSJ+Gigaword-v2', verbose=True)
# NOTE(review): `nlp` is not referenced by any cell visible here, and
# `spacy.en.English()` looks like a legacy spaCy entry point -- confirm the
# pinned spaCy version before relying on it.
nlp = spacy.en.English()

Model directory: /Users/kesslej/.local/share/bllipparser/WSJ+Gigaword-v2
Model directory already exists, not reinstalling


In [3]:
# Make sure the three NLTK corpora imported in the first cell are available locally.
# The value of the last call is what the cell displays as its result.
nltk.download('gutenberg')
nltk.download('reuters')
nltk.download('brown')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/kesslej/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package reuters to /Users/kesslej/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package brown to /Users/kesslej/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [4]:
# Sanity check: how many file ids the Brown corpus reader exposes.
len(brown.fileids())

500

In [5]:
def get_coordinated_nps(tree):
    """Walk ``tree`` top-down and yield every conjunct list found in an NP.

    Anything that is not a ``Tree`` (e.g. a leaf token) yields nothing.
    For a node labelled exactly 'NP', whatever ``get_coordinates`` yields
    for that node comes first; the children are then visited in order and
    searched recursively in the same way.
    """
    if not isinstance(tree, Tree):
        return
    if tree.label() == 'NP':
        yield from get_coordinates(tree)
    for child in tree:
        yield from get_coordinated_nps(child)
   
def adjust_for_sticky_final_nps(coordinates):
    """Split a final conjunct that is itself a two-way coordination.

    When the last element of ``coordinates`` is an 'NP' whose children
    match ``NP|NN* (,) CC NP|NN*``, it is replaced by its first and last
    children, so ((John), (Bill and Scott)) becomes (John), (Bill), (Scott).
    In every other case the input list is returned unchanged.
    """
    if len(coordinates) <= 1:
        return coordinates

    final = coordinates[-1]
    if final.label() != 'NP' or len(final) <= 2:
        return coordinates

    nominal_prefixes = ['NP', 'NN']
    # Both edges of the final NP must be nominal ...
    if final[0].label()[:2] not in nominal_prefixes:
        return coordinates
    if final[-1].label()[:2] not in nominal_prefixes:
        return coordinates
    # ... with a conjunction directly before the last child ...
    if final[-2].label()[:2] != 'CC':
        return coordinates
    # ... and nothing else inside except an optional comma in second position.
    size = len(final)
    if size != 3 and not (size == 4 and final[1].label()[:2] == ','):
        return coordinates

    return coordinates[:-1] + [final[0], final[-1]]

def get_coordinates(tree):
    """Yield the conjunct list of ``tree`` if it coordinates three or more nominals.

    The children are scanned left to right. A nominal child (label starting
    with 'NP' or 'NN') is accepted only at the start or right after a
    separator ('CC' or ','). The scan stops at the first child that fits
    neither role. After the sticky-final-NP adjustment, the list is yielded
    once if it holds more than two conjuncts; otherwise nothing is yielded.
    """
    conjuncts = []
    expecting_conjunct = True
    for child in tree:
        prefix = child.label()[:2]
        if expecting_conjunct and prefix in ('NP', 'NN'):
            conjuncts.append(child)
            expecting_conjunct = False
            continue
        if prefix not in ('CC', ','):
            break
        expecting_conjunct = True
    # Common parser errors: ((John), (Bill and Scott)) or (John, (Bill, and Scott))
    # should both read as ((John), (Bill) and (Scott)).
    conjuncts = adjust_for_sticky_final_nps(conjuncts)
    if len(conjuncts) > 2:
        yield conjuncts
        
def get_number_from_np(tree):
    """Classify a nominal subtree as plural ('P') or singular ('S').

    This is the single effective definition. An earlier duplicate in this
    cell (which only looked at whether the last POS tag ended in 'S') was
    silently shadowed by this one and has been removed.

    Rules, applied in order:
      1. A preterminal (its first child is the token string) is plural
         only when its own label is 'NNS' or 'NNPS'.
      2. A phrase containing a 'CC' tag anywhere below it is plural.
      3. Otherwise the children are scanned right to left. The first 'NP'
         child is classified recursively, and the first noun tag decides
         directly ('NNS'/'NNPS' -> 'P', 'NN'/'NNP' -> 'S'). Children with
         any other label are skipped.
      4. If nothing decides, the result is 'S'.

    Args:
        tree: a tree node exposing ``label()``, ``pos()``, indexing and
            reverse iteration (an ``nltk.tree.Tree`` in this notebook).

    Returns:
        'P' or 'S'.
    """
    # Preterminal such as (NNS voters): the tag itself carries the number.
    if isinstance(tree[0], str):
        return 'P' if tree.label() in ('NNS', 'NNPS') else 'S'

    # Any coordination inside the phrase makes it plural.
    if any(pos == 'CC' for _, pos in tree.pos()):
        return 'P'

    # Right-to-left scan so trailing modifiers (e.g. a PP) are skipped.
    for child in reversed(tree):
        label = child.label()
        if label == 'NP':
            return get_number_from_np(child)
        if label in ('NNS', 'NNPS'):
            return 'P'
        if label in ('NN', 'NNP'):
            return 'S'
    return 'S'

def which_is_more_ambiguous(numbers):
    """Map the number pattern of the last three conjuncts to a verdict.

    ``numbers`` is a list of 'S'/'P' flags, one per conjunct. Only the
    final three entries are inspected. The result is 'Oxford',
    'Lack of Oxford' or 'Neither'; it is None when the tail matches no
    pattern (for instance when fewer than three flags are supplied).
    """
    verdict_by_pattern = (
        (['S', 'S', 'S'], 'Oxford'),          # my mother, Jill, and Sam
        (['S', 'S', 'P'], 'Oxford'),          # my mother, Jill, and the Smiths
        (['S', 'P', 'S'], 'Neither'),         # my mother, the Smiths, and Sam
        (['S', 'P', 'P'], 'Neither'),         # my mother, the Smiths, and the Joneses
        (['P', 'S', 'S'], 'Lack of Oxford'),  # my parents, Jill and Sam
        (['P', 'P', 'S'], 'Lack of Oxford'),  # my family, the Smiths and Sam
        (['P', 'S', 'P'], 'Lack of Oxford'),  # my family, Sam and the Smiths
        (['P', 'P', 'P'], 'Neither'),         # my family, the Smiths and the Joneses
    )
    tail = numbers[-3:]
    for pattern, verdict in verdict_by_pattern:
        if tail == pattern:
            return verdict
    return None

In [None]:

# patch to get the bllipparser to work with dask
'''
import dask.bag as db
import sys
sys.modules['JohnsonReranker'] = bllipparser.JohnsonReranker
'''
def toks2text(toks):
    """Join tokens with single spaces, then tighten spacing around punctuation.

    The replacements run in the listed order on the joined string:
    possessive 's, commas, periods, hyphens, and both parentheses.
    """
    spacing_fixes = (
        (" 's", "'s"),
        (' , ', ', '),
        (" .", '.'),
        (' - ', '-'),
        ('( ', '('),
        (' )', ')'),
    )
    text = ' '.join(toks)
    for spaced, tight in spacing_fixes:
        text = text.replace(spaced, tight)
    return text


# Parse Brown sentences and record every coordinated noun phrase found in them.
# `data` ends up holding two kinds of rows:
#   * one bare row per sentence ('fileid', 'sent'), and
#   * one row per detected coordination, with 'nps', 'numbers' and 'ambiguity' added.
data = []
for fileid in brown.fileids()[:2]:  # only the first two files are processed
    print(fileid)
    for sent in brown.sents(fileid):
        reformated_sentence = html.unescape(toks2text(sent))
        data.append({'fileid': fileid, 'sent': reformated_sentence})
        try:
            tree = rrp.parse(reformated_sentence).fuse().as_nltk_tree()
        except Exception:
            # Best effort: report the sentence that could not be parsed and move on.
            # `Exception` (rather than a bare except) keeps Ctrl-C / kernel
            # interrupt working during this long loop.
            print('bad sentence')
            print(reformated_sentence)
            continue
        for coord in get_coordinated_nps(tree):
            # Conjunct texts are joined with '~~~' so they fit in one CSV field.
            nps = '~~~'.join([' '.join(conjunct.leaves()) for conjunct in coord])
            try:
                numbers = [get_number_from_np(conjunct) for conjunct in coord]
            except Exception as err:
                # Previously this dropped into pdb, which hangs a Run All.
                # Report the failure and skip this coordination instead.
                print('could not determine number for: {} ({!r})'.format(nps, err))
                continue
            ambiguity = which_is_more_ambiguous(numbers)
            entry = {'fileid': fileid,
                     'sent': reformated_sentence,
                     'nps': nps,
                     'numbers': numbers,
                     'ambiguity': ambiguity}
            print(entry)
            data.append(entry)


ca01
{'nps': 'the widespread interest in the election~~~the number of voters~~~the size of this city', 'sent': "`` Only a relative handful of such reports was received '', the jury said, `` considering the widespread interest in the election, the number of voters and the size of this city ''.", 'fileid': 'ca01', 'numbers': ['S', 'S', 'S'], 'ambiguity': 'Oxford'}


In [9]:
# Turn the collected records into a table. `data` mixes sentence-only rows with
# per-coordination rows, so the coordination columns are missing on the former.
df = pd.DataFrame(data)
# Persist the table to the working directory; index=False omits the row index.
df.to_csv('brown_ambiguity.csv', index=False)

In [16]:
# Tally the verdicts per detected coordination. Sentence-only rows have no
# 'ambiguity' value; value_counts leaves missing values out by default.
df['ambiguity'].value_counts()

Oxford            9
Lack of Oxford    4
Neither           3
Name: ambiguity, dtype: int64