In [1]:
from collections import namedtuple

from pycorenlp import StanfordCoreNLP

In [2]:
# Client handle for the CoreNLP server expected at localhost:9001; it is the
# default `nlp` argument of get_resolved_subjects, so it must exist before that
# function is defined.
nlp = StanfordCoreNLP('http://localhost:9001')

In [3]:
# One labelled node of a constituency tree: `tag` is the node label,
# `startIndex`/`endIndex` delimit the tokens it covers (end exclusive), and
# `depth` is its nesting level (the outermost node is read at depth 0).
Constituency = namedtuple("Constituency", "tag startIndex endIndex depth")


def skip_blank_space(parse, pointer):
    """Return the index of the first non-whitespace character at or after ``pointer``.

    If only whitespace remains, the result is ``len(parse)``; if ``pointer`` is
    already at or past the end of ``parse`` it is returned unchanged.
    """
    end = len(parse)
    while pointer < end:
        if not parse[pointer].isspace():
            break
        pointer += 1
    return pointer


def read_tag(parse, pointer):
    """Read the run of non-whitespace characters that starts at ``pointer``.

    Returns a ``(tag, next_pointer)`` pair, where ``next_pointer`` is the
    position just past the tag with any following whitespace skipped.
    """
    limit = len(parse)
    tag_end = pointer
    while tag_end < limit and not parse[tag_end].isspace():
        tag_end += 1
    tag = parse[pointer:tag_end]
    return tag, skip_blank_space(parse, tag_end)


def read_token(parse, pointer, tokens):
    """Consume text up to the next ")" and record it as a token.

    The characters from ``pointer`` up to (but excluding) the next ")" -- or to
    the end of ``parse`` when there is none -- are appended to ``tokens``.
    Returns the index at which scanning stopped.
    """
    begin = pointer
    end = len(parse)
    while True:
        if pointer >= end or parse[pointer] == ")":
            break
        pointer += 1
    tokens.append(parse[begin:pointer])
    return pointer
    
    
def read_body(parse, pointer, constituencies, tokens, depth):
    """Read whatever starts at ``pointer``: a nested constituency or a plain token.

    Returns ``(constituency, next_pointer)`` when the body opens with "(", and
    ``(None, next_pointer)`` when it is token text (which is appended to
    ``tokens``).
    """
    if parse[pointer] != "(":
        # Leaf text: there is no child constituency to report.
        return None, read_token(parse, pointer, tokens)
    return read_constituency(parse, pointer, constituencies, tokens, depth)
        
        
def read_constituency(parse, pointer, constituencies, tokens, depth):
    """Recursively read one "(TAG body...)" node that starts at ``pointer``.

    The node and all of its descendants are appended to ``constituencies``
    (children before their parent); any token text encountered is appended to
    ``tokens``.

    Returns ``(constituency, next_pointer)`` where ``next_pointer`` is just
    past the closing ")" with trailing whitespace skipped.
    """
    assert parse[pointer] == "("
    tag, pointer = read_tag(parse, pointer + 1)
    head, pointer = read_body(parse, pointer, constituencies, tokens, depth + 1)

    if head is not None:
        # Inner node: its span runs from the first child to the last child.
        last = head
        while parse[pointer] != ")":
            last, pointer = read_body(parse, pointer, constituencies, tokens, depth + 1)
        span_start, span_end = head.startIndex, last.endIndex
    else:
        # Leaf node: the token was just appended, so len(tokens) is its
        # 1-based position and the exclusive end is one past it.
        span_start = len(tokens)
        span_end = span_start + 1
        assert parse[pointer] == ")"

    node = Constituency(tag=tag, startIndex=span_start, endIndex=span_end, depth=depth)
    constituencies.append(node)
    # Step over the closing ")" before skipping whitespace.
    return node, skip_blank_space(parse, pointer + 1)


def read_constituencies(parse):
    """Flatten a bracketed constituency parse into a list of ``Constituency`` spans.

    Args:
        parse: the parse string, e.g. ``"(ROOT (S (NP (PRP I)) ...))"``.
            Leading whitespace is ignored.

    Returns:
        A list of every constituency read from the first top-level node, with
        children listed before their parents. The list is empty when ``parse``
        is empty or contains only whitespace.

    Raises:
        AssertionError: if the first non-whitespace character is not "(".
    """
    # Find the first non-whitespace character so that blank input yields an
    # empty result instead of an IndexError, and leading whitespace does not
    # trip the "(" assertion in read_constituency.
    start = len(parse) - len(parse.lstrip())
    if start == len(parse):
        return []

    constituencies = []
    tokens = []
    read_constituency(parse, start, constituencies, tokens, 0)
    return constituencies

In [4]:
def get_subject(parse):
    """Locate the shallowest NP constituency in a parse string.

    Returns the ``(startIndex, endIndex)`` span of the NP with the smallest
    depth (the earliest-read one on ties), or ``None`` when the parse contains
    no NP at all.
    """
    noun_phrases = [node for node in read_constituencies(parse) if node.tag == "NP"]
    if not noun_phrases:
        return None
    # min() keeps the first minimal element, matching a stable sort's [0].
    shallowest = min(noun_phrases, key=lambda node: node.depth)
    return shallowest.startIndex, shallowest.endIndex

In [12]:
def create_resolution_map(corefs):
    """Map every coreference mention span to its chain's representative text.

    Args:
        corefs: mapping whose values are lists of mention dicts carrying the
            keys ``isRepresentativeMention``, ``text``, ``sentNum``,
            ``startIndex`` and ``endIndex``.

    Returns:
        A dict keyed by ``(sentNum, startIndex, endIndex)``. Each value is the
        ``text`` of the chain's representative mention, or ``None`` if the
        chain has no mention flagged as representative.
    """
    resolution_map = {}

    for chain in corefs.values():
        representative = None
        spans = []
        for mention in chain:
            if mention["isRepresentativeMention"]:
                representative = mention["text"]
            spans.append((mention["sentNum"], mention["startIndex"], mention["endIndex"]))
        # The representative is only known after the whole chain has been
        # scanned, so the spans are registered afterwards in one go.
        resolution_map.update(dict.fromkeys(spans, representative))

    return resolution_map


def resolve_subject(sentence_num, subject, resolution_map, tokens):
    """Turn a subject span into text, preferring its coreference representative.

    Args:
        sentence_num: sentence number used as the first component of the
            lookup key.
        subject: ``(start, end)`` token span; indices are 1-based with an
            exclusive end.
        resolution_map: dict keyed by ``(sentence_num, start, end)``.
        tokens: the sentence's token dicts, each with an ``originalText`` key.

    Returns:
        The mapped value when the span is a key of ``resolution_map``;
        otherwise the span's original token texts joined by single spaces.
    """
    mention_key = (sentence_num, *subject)
    if mention_key not in resolution_map:
        # Span indices are 1-based, the token list is 0-based.
        words = (tokens[position - 1]["originalText"] for position in range(*subject))
        return " ".join(words)
    return resolution_map[mention_key]

In [13]:
def get_resolved_subjects(text, nlp=nlp):
    """Return one resolved subject per sentence of ``text``.

    The text is annotated through ``nlp.annotate`` with the ``parse`` and
    ``coref`` annotators and JSON output. For each sentence the shallowest NP
    is taken as the subject and, if that span is a coreference mention, it is
    replaced by the chain's representative text.

    Args:
        text: the raw text to annotate.
        nlp: object exposing ``annotate(text, properties=...)``; defaults to
            the module-level client.

    Returns:
        A list with one entry per sentence: the resolved subject string, or
        ``None`` for a sentence without any NP.
    """
    request_properties = {
       'annotators': 'parse,coref',
       'outputFormat': 'json',
       'timeout': 60000,
    }
    annotation = nlp.annotate(text, properties=request_properties)

    resolution_map = create_resolution_map(annotation["corefs"])

    resolved_subjects = []
    # Sentence numbers are counted from 1 to line up with the mention keys.
    for sentence_num, sentence in enumerate(annotation["sentences"], start=1):
        span = get_subject(sentence["parse"])
        resolved_subjects.append(
            None
            if span is None
            else resolve_subject(sentence_num, span, resolution_map, sentence["tokens"])
        )

    return resolved_subjects

In [20]:
def test__get_resolved_subjects():
    """Check subject resolution on a four-sentence review using the module-level client."""
    review = "This movie was actually neither that funny, nor super witty. The movie was meh. I liked watching that movie. If I had a choice, I would not watch that movie again."
    expected_subjects = ["This movie", "This movie", "I", "I"]
    assert get_resolved_subjects(review, nlp) == expected_subjects
    

# Run the check right away; an AssertionError here means the resolved subjects
# differ from the expected list.
test__get_resolved_subjects()

In [None]:
def move_st(text, st, n_prev_sentences):
    """Walk ``st`` backwards over up to ``n_prev_sentences`` sentence terminators.

    Scans ``text`` from index ``st`` down to index 1 (index 0 is never
    inspected), counting ".", "!" and "?" characters.

    Returns:
        ``(new_st, count)``. When more than ``n_prev_sentences`` terminators
        are met, ``new_st`` is the index just after the terminator that
        exceeded the limit and ``count`` is the number of terminators seen
        minus one. Otherwise the scan reaches the start and the result is
        ``(0, terminators_seen)``.
    """
    terminators = {".", "!", "?"}
    seen = 0
    for position in range(st, 0, -1):
        if text[position] not in terminators:
            continue
        seen += 1
        if seen > n_prev_sentences:
            return position + 1, seen - 1
    return 0, seen