### New revision of rule-based Numerical Relation Extractor.

Changes w.r.t. previous (Thesis) extractor:

Uses spaCy rather than quantulum to find quantities, and identifies values/units independently.

Identifies certain linked quantities (e.g. from x to y) and can infer units for one of the quantities even if not explicit.

Builds verb phrases.

Sanity checks for entity phrase - improves precision.

Entirely new code.

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import csv
from quantulum3 import parser
import unicodedata
import spacy
nlp = spacy.load('en_core_web_sm')
from itertools import groupby
from operator import itemgetter

#### Defining functions

In [2]:
# Finds a token's parent noun positioned to the right of the token.
# Used to compensate for spaCy not identifying a unit as part of the quantity phrase, e.g. "500 points".
# Input: token
# Output: parent noun
def get_parent_noun(token):
    """Climb to the right-hand noun/number that governs ``token``.

    Repeatedly moves to ``token.head`` for as long as that head lies to the
    right of the current token (``head.i > token.i``) and is either a noun
    (tag starting with ``NN``) or has ``pos_ == 'NUM'``.

    Input: token
    Output: the last token reached; ``token`` itself when its head does not
            qualify.  An object lacking the expected attributes is returned
            unchanged (best-effort behaviour).
    """
    try:
        while True:
            head = token.head
            if head.i > token.i and (head.tag_[:2] == 'NN' or head.pos_ == 'NUM'):
                # Each step moves strictly to the right, so the walk terminates.
                token = head
            else:
                return token
    except Exception:
        # Only ordinary errors are absorbed; KeyboardInterrupt/SystemExit
        # propagate so a long corpus run can still be interrupted.
        return token

In [3]:
# Finds a token's largest noun phrase (either from noun chunking or entity recognition).
# Input: token
# Output: token's noun chunk
def get_chunk(token):
    """Return the largest phrase (noun chunk or named entity) containing ``token``.

    The noun chunk and the entity that contain the token are compared by
    length; the noun chunk wins only when it is strictly longer.  A leading
    token tagged ``IN`` is stripped from the entity before the comparison.
    When neither exists, the single-token span is used.  If the token right
    after the selected phrase is "per", the phrase is extended to the end of
    the chunk of the token following "per" (e.g. "60 miles per hour").

    Input: token
    Output: span of ``token.doc``; falls back to the single-token span on error.
    """
    doc = token.doc
    try:
        tok_chunk = next((chunk for chunk in doc.noun_chunks if token in chunk), [])
        tok_ent = next((ent for ent in doc.ents if token in ent), [])
        if tok_ent and tok_ent[0].tag_ == "IN":
            tok_ent = doc[tok_ent.start + 1:tok_ent.end]
        if tok_chunk and len(tok_chunk) > len(tok_ent):
            out_chunk = tok_chunk
        elif tok_ent and len(tok_chunk) <= len(tok_ent):
            out_chunk = tok_ent
        else:
            out_chunk = doc[token.i: token.i + 1]

        # Only look for "<phrase> per <phrase>" when both "per" and the token
        # after it exist.  Indexing past the end of the doc used to raise and
        # discard the phrase found above in favour of the single token.
        after_per = out_chunk.end + 1
        if after_per < len(doc) and doc[out_chunk.end].text == 'per':
            out_chunk = doc[out_chunk.start:get_chunk(doc[after_per]).end]

        return out_chunk
    except Exception:
        # Best-effort fallback (e.g. noun chunks unavailable for this doc).
        return doc[token.i: token.i + 1]

In [4]:
# Finds the closest parent verb to a token in terms of dependency distance. 
# Input: token
# Output: parent verb
def get_parent_verb(token):
    """Walk up the dependency tree to the nearest verb-like ancestor.

    Starting from ``token.head``, returns the first ancestor whose tag starts
    with ``VB`` or ``RB`` (so adverbs stop the walk as well as verbs).  If the
    root (a token that is its own head) is reached first, the root is returned
    whatever its tag.

    Input: token
    Output: the ancestor found, the root, or the last token reached when an
            ordinary error occurs on the way up (best-effort behaviour).
    """
    try:
        while True:
            head = token.head
            if token == head:
                # Root of the tree: nothing further to climb.
                return token
            token = head
            if token.tag_[:2] in ('VB', 'RB'):
                return token
    except Exception:
        # KeyboardInterrupt/SystemExit are deliberately not absorbed.
        return token

In [5]:
# Constructs a verb phrase by looking for consecutive verbs and adverbs. 
# Input: verb
# Output: verb phrase

def prev_verb(verb):
    """Return the leftmost token of the run of verbs/adverbs ending at ``verb``.

    Steps left while the preceding token's tag starts with ``VB`` or ``RB``.
    The walk stops at the first token of the doc: index ``-1`` would otherwise
    wrap around to the doc's last token.

    Input: verb (token)
    Output: first token of the consecutive verb/adverb run
    """
    doc = verb.doc
    while verb.i > 0 and doc[verb.i - 1].tag_[:2] in ("VB", "RB"):
        verb = doc[verb.i - 1]
    return verb

def next_verb(verb):
    """Return the rightmost token of the run of verbs/adverbs starting at ``verb``.

    Steps right while the following token's tag starts with ``VB`` or ``RB``.
    The walk stops at the last token of the doc instead of indexing past it.

    Input: verb (token)
    Output: last token of the consecutive verb/adverb run
    """
    doc = verb.doc
    last_index = len(doc) - 1
    while verb.i < last_index and doc[verb.i + 1].tag_[:2] in ("VB", "RB"):
        verb = doc[verb.i + 1]
    return verb

def get_verb_chunk(verb):
    """Build the verb phrase around ``verb``.

    Input: verb (token)
    Output: span running from the token returned by ``prev_verb(verb)``
            through the token returned by ``next_verb(verb)``, inclusive
    """
    leftmost = prev_verb(verb)
    rightmost = next_verb(verb)
    return verb.doc[leftmost.i : rightmost.i + 1]

In [6]:
# Finds units before or after a quantity that spaCy may miss. 
# Identifies a quantity phrase starting with "from" and finds its second quantity by searching for "to".
# Input: spaCy doc, quantity start/end indices, quantity phrase
# Output: quantity phrase, second quantity (if exists)

def get_missing_units(doc, q_ranges, q):
    """Widen a quantity phrase with units spaCy left out and link "from x to y".

    The phrase starts at the leftmost token of ``q.root``'s subtree that lies
    before ``q`` (or at ``q.start`` if there is none) and ends at the parent
    noun of ``q``'s last token.  When the token just before the phrase is
    "from", a "to" and a following ``CD`` token are searched for in the
    subtree of "from"; if the matching range exists in ``q_ranges``, the phrase
    is extended to cover that second quantity and its unit.

    Input: spaCy doc, list of (start, end) quantity index ranges, quantity phrase
    Output: (quantity phrase, second quantity token or ``False``).  The second
            quantity is returned as soon as it is found, even if extending the
            phrase subsequently fails.
    """
    q2 = False
    phrase_end = get_parent_noun(q[-1]).i + 1
    left_of_q = [child.i for child in q.root.subtree if child.i < q.start]
    phrase_start = left_of_q[0] if left_of_q else q.start
    out_phrase = doc[phrase_start:phrase_end]

    try:
        # start > 0 keeps index -1 from wrapping around to the doc's last token.
        if out_phrase.start > 0 and doc[out_phrase.start - 1].text == 'from':
            from_tok = doc[out_phrase.start - 1]
            to_tok = next(tok for tok in from_tok.subtree if tok.text == 'to')
            q2 = next(tok for tok in from_tok.subtree
                      if (tok.i > to_tok.i) and (tok.tag_ == 'CD'))
            second = next(doc[lo:hi + 1] for lo, hi in q_ranges if lo == q2.i)
            second = doc[second.start:get_chunk(second[-1]).end]
            second = doc[second.start:get_parent_noun(second[-1]).i + 1]
            out_phrase = doc[out_phrase.start:second.end]
    except Exception:
        # No linked quantity (or it could not be resolved): keep the phrase as is.
        pass

    return out_phrase, q2

In [7]:
# Finds a parent relation noun phrase to the left of the quantity. 
# Input: quantity phrase, verb
# Output: list of noun phrases
def get_ancestor_rel(quantity, verb):
    """Collect relation noun phrases among the quantity's ancestors.

    Climbs from ``quantity.root`` towards ``verb`` and records every head that
    is a noun (tag starting with ``NN``) positioned before the quantity.

    Input: quantity phrase, verb
    Output: list of ``(noun phrase, (start, end))`` tuples, nearest ancestor first
    """
    relations = []
    current = quantity.root
    # The number of steps is capped below the doc length.
    for _ in range(1, len(current.doc)):
        if current == verb:
            break
        parent = current.head
        if parent.i < quantity.start and parent.tag_[:2] == 'NN':
            phrase = get_chunk(parent)
            relations.append((phrase, (phrase.start, phrase.end)))
        current = parent
    return relations

# Finds a relation noun phrase dependent on, and to the right of, the quantity.
# Input: quantity phrase
# Output: list of noun phrases

def get_descendant_rel(quantity):
    """Collect relation noun phrases among the quantity's descendants.

    Scans the subtree of ``quantity.root`` for nouns (tag starting with ``NN``)
    whose index is strictly greater than ``quantity.end``.

    Input: quantity phrase
    Output: list of ``(noun phrase, (start, end))`` tuples in subtree order
    """
    relations = []
    for tok in quantity.root.subtree:
        if tok.i > quantity.end and tok.tag_[:2] == 'NN':
            phrase = get_chunk(tok)
            relations.append((phrase, (phrase.start, phrase.end)))
    return relations

In [8]:
# Splits a phrase consisting of two quantities into two phrases, and finds and compares their units.
# Input: quantity phrase, values of both quantities
# Output: units of both quantities
def split_multi_q(q_phrase, value1, value2):
    """Split a two-quantity phrase and work out the unit of each quantity.

    The phrase is cut at its first "and", "or" or "to" (case-insensitive).
    Each half's unit is the half's non-stop-word tokens (plus "per") joined by
    spaces, with the value text removed.  If one half has no unit, it inherits
    the unit of the other half.

    Input: quantity phrase, values of both quantities (objects with ``.text``)
    Output: ``(unit_1, unit_2)``, or ``None`` when the phrase contains no
            splitting word
    """
    doc = q_phrase.doc
    split_ind = next(
        (token.i for token in q_phrase if token.lower_ in ('and', 'or', 'to')),
        None,
    )
    if split_ind is None:
        return None

    def _unit_text(phrase, value):
        # "per" is usually a stop word but is part of compound units ("km per hour").
        kept = [token.text for token in phrase
                if not token.is_stop or token.lower_ == 'per']
        return " ".join(kept).replace(value.text, '').strip()

    q1_unit = _unit_text(doc[q_phrase.start:split_ind], value1)
    q2_unit = _unit_text(doc[split_ind + 1:q_phrase.end], value2)

    if not q1_unit:
        return q2_unit, q2_unit
    if not q2_unit:
        return q1_unit, q1_unit
    return q1_unit, q2_unit

In [9]:
# Builds the numerical relation extraction tuple given a quantity phrase
# Input: doc, quantity phrase, dict mapping each quantity phrase to the index ranges of its values, indices of all quantity tokens, index ranges of the quantities
# Output: extraction tuple for the quantity phrase

def get_extraction(doc, q_phrase, rev_q_phrases_dict, q_phrases_list, q_ranges):
    """Build the numerical relation extraction for one quantity phrase.

    Input:
        doc: parsed spaCy doc of the sentence
        q_phrase: quantity phrase (span of ``doc``)
        rev_q_phrases_dict: maps a quantity phrase to the list of
            ``(start, end)`` index ranges (end inclusive) of the values in it
        q_phrases_list: indices of all quantity tokens in ``doc``
        q_ranges: index ranges of all quantities (not used by this function)
    Output:
        list of 17 strings: doc, entity, verb, relation, quantity phrase,
        value 1, unit 1, value 2, unit 2, quantulum entity/value/unit for the
        first and second parsed quantity, entity without stop words, relation
        without stop words.
    Raises:
        ValueError if the quantity has no parent verb, the verb has no subject,
        the phrase does not hold one or two quantities, a two-quantity phrase
        cannot be split, or the entity overlaps another part of the extraction.
        Errors from the helpers it calls may propagate as well.
    """
    subject_deps = ('nsubj', 'nsubjpass')

    # IDENTIFYING VERB

    verb = get_parent_verb(q_phrase.root)

    if verb.tag_[:2] == 'RB':
        # An adverb was hit first: report its verb phrase, then continue the
        # analysis from the verb that governs the adverb.
        out_verb = get_verb_chunk(verb)
        verb = get_parent_verb(verb)
    elif verb.tag_[:2] == 'VB':
        out_verb = doc[verb.i : verb.i + 1]
    else:
        raise ValueError("Quantity has no parent verb.")

    # FINDING RELATION WORDS

    rel_A_list = get_ancestor_rel(q_phrase, verb)
    rel_D_list = get_descendant_rel(q_phrase)
    quantity_tokens = set(q_phrases_list)

    def _first_quantity_free(rel_list):
        # Position of the first candidate phrase containing no quantity token,
        # or None if every candidate overlaps one.
        for position, (_, bounds) in enumerate(rel_list):
            if not quantity_tokens.intersection(range(*bounds)):
                return position
        return None

    a = _first_quantity_free(rel_A_list)
    b = _first_quantity_free(rel_D_list)

    # Prefer the candidate with the smaller position; ancestors win ties.
    if a is not None and (b is None or a <= b):
        out_relation = rel_A_list[a][0]
    elif b is not None:
        out_relation = rel_D_list[b][0]
    else:
        out_relation = ''

    # FINDING ENTITY

    def _subject_left_of(candidate_verb):
        return next((child for child in candidate_verb.subtree
                     if child.dep_ in subject_deps and child.i < candidate_verb.i),
                    None)

    nsubj = _subject_left_of(verb)
    if nsubj is None:
        # Retry one level up the tree before giving up.
        verb = get_parent_verb(verb)
        nsubj = _subject_left_of(verb)
        if nsubj is None:
            raise ValueError("Quantity's verb has no subject.")

    verb_chunk = get_verb_chunk(verb)
    if (verb_chunk.start > 0) and doc[verb_chunk.start - 1].dep_ == 'punct':
        out_entity = get_chunk(nsubj)
    else:
        try:
            # Last noun or subject in the verb's subtree that precedes the verb.
            candidates = [child for child in verb.subtree
                          if (child.tag_[:2] == 'NN' or child.dep_ in subject_deps)
                          and (child.i < verb.i)]
            out_entity = get_chunk(candidates[-1])
            if out_relation == '' and out_entity.root.dep_ not in subject_deps:
                out_relation = get_chunk(nsubj)
        except Exception:
            out_entity = get_chunk(nsubj)

    # HANDLING QUANTITIES

    value_ranges = rev_q_phrases_dict[q_phrase]

    if len(value_ranges) == 1:
        out_value_1 = doc[value_ranges[0][0]: value_ranges[0][1] + 1]
        out_unit_1 = " ".join([token.text for token in q_phrase
                               if (not token.is_stop) or token.lower_ == 'per']
                              ).replace(out_value_1.text, '').strip()
        out_value_2, out_unit_2 = '', ''
        try:
            # use quantulum to standardize value
            out_value_1 = parser.parse(out_value_1.text)[0].value
        except Exception:
            pass

    elif len(value_ranges) == 2:
        out_value_1 = doc[value_ranges[0][0]: value_ranges[0][1] + 1]
        out_value_2 = doc[value_ranges[1][0]: value_ranges[1][1] + 1]
        units = split_multi_q(q_phrase, out_value_1, out_value_2)
        if units is None:
            raise ValueError("Quantity phrase could not be split.")
        out_unit_1, out_unit_2 = units

        try:
            # use quantulum to standardize values; if the second parse fails,
            # the first value stays standardized and the second stays as text
            out_value_1 = parser.parse(out_value_1.text)[0].value
            out_value_2 = parser.parse(out_value_2.text)[0].value
        except Exception:
            pass

    else:
        raise ValueError("Quantity phrase must hold one or two quantities.")

    # QUANTULUM OUTPUTS

    quants = parser.parse(q_phrase.text)
    try:
        quantulum_v1, quantulum_u1, quantulum_e1 = \
            quants[0].value, quants[0].unit.name, quants[0].unit.entity.name
    except Exception:
        quantulum_v1, quantulum_u1, quantulum_e1 = '', '', ''
    try:
        quantulum_v2, quantulum_u2, quantulum_e2 = \
            quants[1].value, quants[1].unit.name, quants[1].unit.entity.name
    except Exception:
        quantulum_v2, quantulum_u2, quantulum_e2 = '', '', ''

    # check if entity occurs in any other output, raise error if it does.
    if out_entity and out_verb and (
            out_entity.start in range(out_verb.start, out_verb.end)
            or out_entity.start in range(q_phrase.start, q_phrase.end)):
        raise ValueError("Entity is not distinct.")
    elif str(out_entity) in [str(out_verb), str(out_relation), str(q_phrase)]:
        raise ValueError("Entity is not distinct.")

    # form output
    try:
        arg1_f = " ".join([token.text for token in nlp(str(out_entity)) if not token.is_stop])
    except Exception:
        arg1_f = out_entity
    try:
        rel_f = " ".join([token.text for token in nlp(str(out_relation)) if not token.is_stop])
    except Exception:
        rel_f = out_relation

    out_extr = [doc, out_entity, out_verb, out_relation, q_phrase,
                out_value_1, out_unit_1, out_value_2, out_unit_2,
                quantulum_e1, quantulum_v1, quantulum_u1,
                quantulum_e2, quantulum_v2, quantulum_u2,
                arg1_f, rel_f]

    return [str(x) for x in out_extr]

In [10]:
# Loops over a text to identify quantities and calls the extraction function for each quantity phrase.
# Input: text
# Output: list of extractions for each quantity phrase in the text.

def get_NRE(txt):
    """Extract numerical relations from a piece of text.

    The text is NFKD-normalized, whitespace-collapsed and parsed with spaCy.
    Quantity tokens are ``CD``-tagged tokens inside TIME, PERCENT, MONEY,
    QUANTITY, CARDINAL or DATE entities, excluding DATE tokens of shape
    ``dddd``.  Consecutive quantity tokens are merged into ranges, each range
    is widened to a quantity phrase, and ``get_extraction`` is called once per
    distinct phrase.

    Input: text
    Output: list of extractions (lists of strings), one per quantity phrase for
            which an extraction could be built; phrases that fail are skipped.
    """
    txt = unicodedata.normalize('NFKD', txt)
    doc = nlp(' '.join(txt.split()))

    quantity_ent_types = ['TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'CARDINAL', 'DATE']
    q_phrases_list = [token.i for token in doc
                      if token.ent_type_ in quantity_ent_types
                      and token.tag_ == "CD"
                      and not (token.ent_type_ == 'DATE' and token.shape_ == 'dddd')]

    # Merge runs of consecutive token indices into (first, last) ranges:
    # position minus index is constant within a run.
    q_ranges = []
    for _, run in groupby(enumerate(q_phrases_list), lambda pair: pair[0] - pair[1]):
        indices = [index for _, index in run]
        q_ranges.append((indices[0], indices[-1]))

    # Widen each range to the chunks of its first and last token.
    q_phrases_dict = {}
    for first, last in q_ranges:
        q_phrases_dict[(first, last)] = doc[get_chunk(doc[first]).start:
                                            get_chunk(doc[last]).end]

    # Add missing units; a range linked to an earlier one ("from x to y")
    # shares that range's phrase.
    temp_dict = {}
    for key, value in q_phrases_dict.items():
        phrase, q2 = get_missing_units(doc, q_ranges, value)
        if q2:
            temp_dict[key] = phrase
            for k in q_phrases_dict:
                if k[0] == q2.i:
                    temp_dict[k] = phrase
        elif key in temp_dict:
            # Already assigned as the second quantity of a linked phrase.
            pass
        else:
            temp_dict[key] = phrase

    # Invert: phrase -> list of the value ranges it contains.
    rev_q_phrases_dict = {}
    for key, value in temp_dict.items():
        rev_q_phrases_dict.setdefault(value, list()).append(key)

    out_extract_list = []
    for q_phrase in rev_q_phrases_dict:
        try:
            extraction = get_extraction(doc, q_phrase, rev_q_phrases_dict,
                                        q_phrases_list, q_ranges)
        except Exception:
            # Phrase could not be turned into a valid extraction: skip it.
            # KeyboardInterrupt is not absorbed, so a long run can be stopped.
            continue
        out_extract_list.append(extraction)

    return out_extract_list

#### Output to CSV

In [11]:
# Append extractions for rows 2001-4000 of articles.csv to the finance CSV.
# The `with` block closes the output file even if processing fails part-way.
with open("finance_NewExtractions.csv", mode="a", newline='', encoding="utf-8") as f_out:
    extract_writer = csv.writer(f_out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # process corpus 100 documents at a time into pandas dataframe.
    for row in range(2001, 4001, 100):
        df = pd.read_csv("articles.csv", usecols=[1, 3, 4], nrows=100, encoding="utf-8",
                         skiprows=range(1, row))
        df = df.dropna().reset_index(drop=True)  # drop empty rows
        # drop any documents not containing any numbers (raw string: \d is a regex escape)
        df = df[df.body.str.contains(r'\d+')].reset_index(drop=True)
        # remove extra whitespaces from each document
        df.body = df.body.map(lambda txt: ' '.join(txt.split()))
        print('processing row ', row, ' to row ', row + 100)

        # loop extraction over each document in corpus
        for txt in df.body:
            txt = unicodedata.normalize('NFKD', txt)
            doc = nlp(txt)

            # document-level coreference resolution
            # txt = doc._.coref_resolved
            # doc = nlp(txt)
            # split document into sentences
            sentences = [sent.text for sent in doc.sents]
            for sent in sentences:
                try:
                    extractions = get_NRE(str(sent))
                    for e in extractions:
                        extract_writer.writerow(e)
                except Exception:
                    # Best-effort: a sentence that cannot be processed is skipped.
                    pass

processing row  2001  to row  2101
processing row  2101  to row  2201
processing row  2201  to row  2301
processing row  2301  to row  2401
processing row  2401  to row  2501
processing row  2501  to row  2601
processing row  2601  to row  2701
processing row  2701  to row  2801
processing row  2801  to row  2901
processing row  2901  to row  3001
processing row  3001  to row  3101
processing row  3101  to row  3201
processing row  3201  to row  3301
processing row  3301  to row  3401
processing row  3401  to row  3501
processing row  3501  to row  3601
processing row  3601  to row  3701
processing row  3701  to row  3801
processing row  3801  to row  3901
processing row  3901  to row  4001


In [13]:
# Write extractions for the first 200001 lines of the astrophysics corpus.
# Both files are closed by the `with` block even if processing fails part-way.
with open("astrophysics_corpus.txt", "r", newline='', encoding="utf-8") as f_in, \
        open("astro_NewExtractions.csv", mode="w", newline='', encoding="utf-8") as f_out:
    extract_writer = csv.writer(f_out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    for rownum, row in enumerate(f_in, start=1):
        if rownum >= 200002:
            # Row limit reached: stop instead of reading the rest of the corpus.
            break

        # progress indicator: 1, 5001, 10001, ...
        if rownum in range(1, 200002, 5000):
            print(rownum)

        try:
            extractions = get_NRE(str(row))
            for e in extractions:
                extract_writer.writerow(e)
        except Exception:
            # Best-effort: a line that cannot be processed is skipped.
            pass

1
5001
10001
15001
20001
25001
30001
35001
40001
45001
50001
55001
