Rule-based Open Numerical Relation Extractor v1 (as described in the thesis)

In [1]:
# import libraries
import numpy as np
import pandas as pd
from quantulum3 import parser
#import re
import spacy
nlp = spacy.load('en_core_web_sm')
import unicodedata
#import neuralcoref
#neuralcoref.add_to_pipe(nlp)
import csv

Functions for forming extractions

In [3]:
# returns noun chunk of a passed token
def get_chunk(token):
    """Return the noun chunk that contains ``token``.

    The chunks are looked up on ``token.doc`` (the document the token itself
    belongs to) rather than on a notebook-level ``doc`` variable, so the
    result does not depend on which cell happened to assign ``doc`` last.

    Parameters
    ----------
    token : spaCy Token (anything exposing ``.doc`` and ``.i``)

    Returns
    -------
    The first noun chunk of the document that contains ``token``; if there is
    none, the one-token span ``token.doc[token.i : token.i + 1]``.
    """
    parent_doc = token.doc
    try:
        for chunk in parent_doc.noun_chunks:
            if token in chunk:
                return chunk
    except (ValueError, NotImplementedError):
        # Best effort: presumably raised by spaCy when the document has no
        # dependency parse or the language defines no noun chunks -- TODO
        # confirm. Fall through to the single-token span in that case.
        pass
    return parent_doc[token.i: token.i + 1]

In [4]:
# returns noun chunk of a passed phrase
def get_chunk2(text):
    """Extend a phrase leftwards to the start of its noun chunk.

    The chunks are looked up on ``text.doc`` (the document the phrase belongs
    to) rather than on a notebook-level ``doc`` variable.

    Parameters
    ----------
    text : spaCy Span (anything exposing ``[0]``, ``.end`` and ``.doc``)

    Returns
    -------
    ``text.doc[chunk.start : text.end]`` where ``chunk`` is the first noun
    chunk containing the first token of ``text``. ``text`` itself is returned
    unchanged when it is empty or when no noun chunk contains its first token.
    """
    try:
        first_token = text[0]
    except IndexError:
        # Empty phrase: nothing to extend.
        return text

    parent_doc = text.doc
    try:
        for chunk in parent_doc.noun_chunks:
            if first_token in chunk:
                return parent_doc[chunk.start: text.end]
    except (ValueError, NotImplementedError):
        # Best effort: presumably raised by spaCy when noun chunks are not
        # available for this document -- TODO confirm.
        pass
    return text

In [5]:
# returns parent verb of a passed token
def get_parent_verb(token):
    """Return the closest ancestor of ``token`` whose ``pos_`` is ``'VERB'``.

    The search starts at ``token.head`` (never at ``token`` itself) and climbs
    head by head. If the top of the tree is reached without meeting a verb,
    the top token (the one that is its own head) is returned.

    The climb is iterative: the previous recursive version only terminated on
    a verb-less tree by exhausting the interpreter's recursion limit.

    Parameters
    ----------
    token : spaCy Token (anything exposing ``.head`` and ``.pos_``)

    Returns
    -------
    The verb ancestor, or the root token when there is none. If an object
    without ``.head`` / ``.pos_`` is met, the last object reached is returned.
    """
    current = token
    try:
        while True:
            previous, current = current, current.head
            # Stop on a verb, or on the root (which is its own head).
            if current.pos_ == 'VERB' or current == previous:
                return current
    except AttributeError:
        return current

In [6]:
# returns start and end token indices of quantities in a doc
def get_indices(doc, quants):
    """Map each quantity's character span to a token span.

    Exactly one ``(start, end)`` pair is produced per quantity, in the same
    order as ``quants``, so ``result[k]`` always describes ``quants[k]``.
    (Collecting starts and ends in two independent lists and zipping them
    lets a single quantity with no, or two, matching end tokens shift every
    later pair.)

    Parameters
    ----------
    doc : iterable of tokens exposing ``.idx`` (character offset) and ``.i``
        (token index), in document order.
    quants : sequence of objects exposing ``.span`` as
        ``(char_start, char_end)``.

    Returns
    -------
    list of ``(start, end)`` tuples of token indices, where
    * ``start`` is the token beginning exactly at ``char_start``, or failing
      that the last token beginning before it;
    * ``end`` is the first token beginning at or after ``char_end`` (i.e. the
      token that follows the quantity, usable as an exclusive slice bound),
      or one past the last token when the quantity closes the text.
    An empty list is returned when ``doc`` has no tokens.
    """
    tokens = list(doc)
    if not tokens:
        return []

    past_last = tokens[-1].i + 1
    qSpans = []

    for quant in quants:
        char_start, char_end = quant.span[0], quant.span[1]
        start_i = tokens[0].i
        end_i = past_last

        for token in tokens:
            if token.idx <= char_start:
                # Keep the latest token starting at or before the quantity.
                start_i = token.i
            if token.idx >= char_end:
                # First token after the quantity: exclusive end bound.
                end_i = token.i
                break

        qSpans.append((start_i, end_i))

    return qSpans

In [7]:
# returns joined consecutive noun phrases
def combine_phrase(ind, prev_or_next,noun_list):
    """Walk over consecutive noun entries of ``noun_list`` starting at ``ind``.

    Parameters
    ----------
    ind : int
        Position in ``noun_list`` to start from.
    prev_or_next : str
        ``'prev'`` to walk towards the beginning of the list, ``'next'`` to
        walk towards its end.
    noun_list : list of ``(token, dep, token_index)`` triples.

    Returns
    -------
    ``(token_index, list_index)`` of the last entry reached. The walk moves
    one step for as long as the neighbouring entry's ``pos_`` is ``'NOUN'``
    or ``'PROPN'`` and stops at either end of the list; in particular
    ``'prev'`` never wraps around from position 0 to the end of the list.

    Raises
    ------
    ValueError
        If ``prev_or_next`` is neither ``'prev'`` nor ``'next'``.
    IndexError
        If ``ind`` is not a valid position in ``noun_list``.
    """
    noun_tags = ('NOUN', 'PROPN')

    if prev_or_next == 'prev':
        # `ind > 0` keeps `ind - 1` from turning into the negative index -1.
        while ind > 0 and noun_list[ind - 1][0].pos_ in noun_tags:
            ind -= 1
    elif prev_or_next == 'next':
        last = len(noun_list) - 1
        while ind < last and noun_list[ind + 1][0].pos_ in noun_tags:
            ind += 1
    else:
        raise ValueError("prev_or_next must be 'prev' or 'next'")

    return (noun_list[ind][2], ind)

In [8]:
# Function and keywords used to extract in BONIE test data
# returns extraction given a document and keywords.

# Relation-name lookup used by extract() when building the relation string.
#   outer key : looked up with quants[q].unit.entity.name ('length', 'mass')
#   inner key : the relation name that gets prefixed to the relation
#   value     : trigger strings; a trigger fires when it is a substring of
#               the lower-cased text of any token in the relation phrase
keywords = {
    'length':{
        'height':['height','high','highest', 'elevation', 'altitude', 'tall'],
        'length':['length','long','longest'],
        'width':['width','wide','breadth'],
        'distance':['distance','away','furthest'],
        'depth':['depth','deep']
    },
    'mass':{
        'weight':['weight','weighs'],
        'mass':['mass']
    }}

def extract(q,quants,qSpans,doc,keywords):
    """Build one (entity, relation, verb, quantity) extraction for quantity ``q``.

    Parameters
    ----------
    q : int
        Position of the quantity; used to index both ``quants`` and ``qSpans``.
    quants : list
        Parsed quantities. This function reads ``.value``, ``.surface``,
        ``.unit.name`` and ``.unit.entity.name`` of ``quants[q]``.
    qSpans : list of (start, end)
        Token-index pairs; ``doc[qSpans[q][0]:qSpans[q][1]]`` is taken as the
        quantity's token span.
    doc : parsed sentence (sliced and indexed by token position).
    keywords : dict
        ``{unit entity name: {relation name: [trigger strings]}}``.

    Returns
    -------
    list
        ``[doc.text, entity_out, relation, verb_out, quant_out, quant_value,
        quant_unit, quants[q].unit.entity.name]``.

    Raises
    ------
    Several lookups below (``list.index``, list indexing) are not guarded and
    raise ``ValueError`` / ``IndexError`` when the sentence does not match the
    shape the rules expect; nothing in this function catches those.
    """
    # qroot: root of the noun chunk around the syntactic root of the quantity span.
    qroot = get_chunk(doc[qSpans[q][0]:qSpans[q][1]].root).root
    # verb: nearest VERB above the first token of the quantity.
    verb = get_parent_verb(doc[qSpans[q][0]])

    # If that verb has no noun/proper-noun/pronoun among its left children,
    # climb one verb further up.
    if not [t for t in verb.lefts if t.pos_ in ['NOUN','PROPN','PRON']]:
        verb = get_parent_verb(get_parent_verb(doc[qSpans[q][0]]))

    # create a list
    # Working list of (token, dependency label, token index) triples taken from
    # the verb's subtree: selected dependency labels plus qroot and the verb
    # themselves, never tokens labelled 'compound'.
    noun_list = [(token,token.dep_,token.i) for token in verb.subtree 
                 if (token.dep_ in ['poss','pobj','dobj','attr','nsubj','nsubjpass','punct','det','cc']
                     or token in [qroot,verb]) and token.dep_ != 'compound']


    #--------------------------------------------------------------
    # remove subphrases
    # sub_phrase keeps only commas, qroot, the verb and the verb's own subjects.
    # Two neighbouring 'punct' entries in it therefore delimit a comma-separated
    # stretch containing none of those anchors; that stretch is cut from noun_list.
    sub_phrase = [(token,token.dep_,token.i) for token in verb.subtree 
         if token.text == ',' or token in [qroot,verb] or (token.head == verb and token.dep_ in ['nsubj','nsubjpass'])]

    for i in range(len(sub_phrase) -1):
        if sub_phrase[i][1] == 'punct' and sub_phrase[i+1][1] == 'punct':
            # NOTE(review): not guarded -- .index() raises ValueError if an
            # earlier iteration already deleted this comma from noun_list.
            del noun_list[noun_list.index(sub_phrase[i]):noun_list.index(sub_phrase[i+1])+1]
    # A leading comma drops everything before it ...
    try:
        if sub_phrase[0][1] == 'punct':
            del noun_list[:noun_list.index(sub_phrase[0])]
    except:
        pass
    # ... and a trailing comma drops itself and everything after it.
    try:
        if sub_phrase[-1][1] == 'punct':
            del noun_list[noun_list.index(sub_phrase[-1]):]
    except:
        pass
    #--------------------------------------------------------------
    # Positions of the quantity root and the verb inside noun_list
    # (ValueError if either was removed above).
    q_ind = noun_list.index((qroot, qroot.dep_,qroot.i))
    v_ind = noun_list.index((verb, verb.dep_,verb.i))
    kword = []        # relation phrases; each is widened with get_chunk2() later
    kword_fixed = []  # relation phrases placed first in the relation

    
    # Case 1: the quantity is the (passive) subject of the verb.
    if qroot.dep_ in ['nsubj','nsubjpass']:

        # p_ind: first punctuation entry at or after the verb.
        try:
            p_ind = noun_list.index([i for i in noun_list[v_ind:] if i[1] == 'punct'][0])
        except:
            # NOTE(review): this fallback is a doc token index, whereas p_ind is
            # used below as a noun_list position -- confirm this is intended.
            p_ind = len(doc)-1
        #-------------------------------------------------------------------------------------#
        # split noun chunks containing a possessive to get entity's keyword
        #-------------------------------------------------------------------------------------#
        # Entry right after the quantity is a determiner/noun: it starts the
        # entity, and the text between the verb and p_ind becomes a keyword.
        if noun_list[q_ind+1][0].pos_ in ['DET','PROPN','NOUN']:
            ent_ind = q_ind+1
            entity = doc[noun_list[ent_ind][2]:combine_phrase(ent_ind,'next',noun_list)[0] + 1]
            kword.append(doc[verb.i + 1 :noun_list[p_ind][2]])

        else:
            try:
                # First possessive after the verb is the entity; the phrase that
                # follows it is kept as a fixed keyword.
                ent_ind = noun_list.index([i for i in noun_list[v_ind:p_ind] if i[1] == 'poss'][0])
                entity = doc[combine_phrase(ent_ind,'prev',noun_list)[0] :noun_list[ent_ind][2] + 1]

                kword_fixed.append(doc[noun_list[ent_ind+1][2] : combine_phrase(ent_ind+1,'next',noun_list)[0]+1])
            except:
            #-------------------------------------------------------------------------------------#
            # If > 1 determiner before verb, first is relation, second is entity
            #-------------------------------------------------------------------------------------#
                # Fallback chain for the entity position, in order: second
                # determiner, first proper-noun pobj, first pobj, first noun.
                try:
                    ent_ind = noun_list.index([i for i in noun_list[v_ind:p_ind] if i[1] == 'det'][1])
                except:
                    try:
                        ent_ind = noun_list.index([i for i in noun_list[v_ind:p_ind] if i[0].pos_ == 'PROPN' and i[1] == 'pobj'][0])
                    except:
                        try: 
                            ent_ind = noun_list.index([i for i in noun_list[v_ind:p_ind] if i[1] == 'pobj'][0])
                        except:
                            ent_ind = noun_list.index([i for i in noun_list[v_ind:p_ind] if i[0].pos_ == 'NOUN'][0])  ### added this

                entity = doc[noun_list[ent_ind][2]:combine_phrase(ent_ind,'next',noun_list)[0] + 1]

            # Re-locate the entity by its first token (ValueError if that token
            # is not an entry of noun_list).
            ent_ind = noun_list.index((entity[0],entity[0].dep_,entity.start))

            # Nouns between the verb and the entity form a keyword phrase.
            temp = [i for i in noun_list[v_ind + 1 :ent_ind] if i[0].pos_ == 'NOUN']
            try:
                kword.append(doc[get_chunk(doc[temp[0][2]]).start:temp[-1][2]+1])
            except:
                pass

    # Case 2: the quantity is not the subject; the entity is looked for before the verb.
    else:
        # p_ind: last punctuation entry before the verb (0 when there is none).
        try:
            p_ind = noun_list.index([i for i in noun_list[:v_ind] if i[1] == 'punct'][-1])
        except:
            p_ind = 0
        #-------------------------------------------------------------------------------------#
        # split noun chunks containing a possessive to get entity's keyword
        #-------------------------------------------------------------------------------------#
        # Last possessive before the verb whose following token is tagged 'POS';
        # otherwise the entry immediately before the verb.
        try:
            ent_ind = noun_list.index([i for i in noun_list[p_ind:v_ind] if i[1] == 'poss' and doc[i[2]+1].tag_ == 'POS'][-1])
            kword_fixed.append(doc[noun_list[ent_ind+1][2] : combine_phrase(ent_ind+1,'next',noun_list)[0]+1])
        except:
            ent_ind = v_ind-1

        #-------------------------------------------------------------------------------------#
        # If > 1 determiner before verb, first is relation, second is entity
        #-------------------------------------------------------------------------------------#
        if len([i for i in noun_list[p_ind:ent_ind] if i[1] == 'det'])>1:
            entity = doc[combine_phrase(ent_ind,'prev',noun_list)[0] :noun_list[ent_ind][2] + 1]
        else:         
            # Otherwise start the entity at the first proper noun in range,
            # or fall back to the single token at ent_ind.
            try:
                entity = doc[[i for i in noun_list[p_ind:ent_ind] if i[0].pos_ == 'PROPN'][0][2]:noun_list[ent_ind][2] + 1]
            except:
                entity = doc[noun_list[ent_ind][2]:noun_list[ent_ind][2]+1]
        # Re-locate the entity by its first token (ValueError if that token is
        # not an entry of noun_list).
        ent_ind = noun_list.index((entity[0],entity[0].dep_,entity.start)) 

        #------------------------#
        # Nouns between p_ind and the entity form a keyword phrase.
        temp = [i for i in noun_list[p_ind:ent_ind] if i[0].pos_ == 'NOUN']
        try:
            kword.append(doc[get_chunk(doc[temp[0][2]]).start:temp[-1][2]+1])
        except:
            pass

    #-------------------------------------------------------------------------------------#
    # any nouns before or after the quantity are keywords
    #-------------------------------------------------------------------------------------#
    # NOTE(review): the two try-blocks below are indented inside the `else:`
    # branch above, so they only run when the quantity is not the subject,
    # although the banner comment sits at function level -- confirm intent.

        try: 
            if noun_list[q_ind - 1][1] in ['pobj','dobj']:
                kword.append(doc[combine_phrase(q_ind-1,'prev',noun_list)[0] :noun_list[q_ind-1][2] + 1])
        except:
            pass

        
        try: 
            if noun_list[q_ind +1][1] in ['pobj','dobj','det']:
                kword.append(doc[noun_list[q_ind][2] +1 :combine_phrase(q_ind+1,'next',noun_list)[0] + 1])
        except:
            pass

    # want to output quantity's parent verb if different from entity's
    verb = get_parent_verb(doc[qSpans[q][0]])

    #-------------------------------------------------------------------------------------#
    # handling change words, building quantity output using units from quantulum if present
    #-------------------------------------------------------------------------------------#
    # NOTE(review): 'decline' is listed twice; harmless for the membership test.
    change_w = ['incline','grow','increase','surge','rise','climb','decline','decrease','fall','tumble','decline','crash']

    # A quantity with a real unit and a non-change verb is reported with the
    # verb 'is'; otherwise the verb's own text is kept.
    if quants[q].unit.entity.name != 'dimensionless' and verb.lemma_ not in change_w:
        verb_out = 'is'
    else:
        verb_out = verb.text

    # Append the adposition directly before the quantity's chunk to the verb.
    # NOTE(review): when the chunk starts at token 0 the index is -1, which
    # addresses the last token of the doc.
    if doc[get_chunk(qroot).start - 1].pos_ == 'ADP' and verb_out != 'is':
        verb_out = verb_out +' '+ doc[get_chunk(qroot).start - 1].text

    # An adjective right after the quantity root is a keyword.
    try:
        if doc[qroot.i + 1].pos_ == 'ADJ':
            kword.append(doc[qroot.i+1:qroot.i+2])
    except:
        pass

    if quants[q].unit.name != 'dimensionless':


        # quantity: from the first quantity token up to the last token (no
        # later than qroot) whose text is a substring of the parsed surface.
        quantity = doc[qSpans[q][0]:[word.i for word in doc[qSpans[q][0]:qroot.i + 1] if word.text in quants[q].surface][-1] + 1]
        # Replace the surface form by "<value> <unit name>".
        quant_out = quantity.text.replace(quants[q].surface ,str(quants[q].value) + ' ' + quants[q].unit.name)
        ###################
        quant_value, quant_unit = quants[q].value, quants[q].unit.name
        ###################
        # Tokens between the quantity and qroot (inclusive) become a fixed keyword.
        try:
            kword_fixed.append(doc[quantity.end:qroot.i + 1])
        except:
            pass

    else:
        # Dimensionless: keep the original text up to qroot; the "unit" is the
        # root token of that span (a token, not a string).
        quant_out = doc[qSpans[q][0]:qroot.i + 1].text
        quant_value, quant_unit = quants[q].value, doc[qSpans[q][0]:qroot.i + 1].root

    # Entity output: from the start of the chunk holding the entity's first
    # token to the end of the entity.
    entity_out = doc[get_chunk(entity[0]).start : entity.end]

    #------------------------------------------------
    #Building relation using keywords + unit
    #------------------------------------------------

    # Keep only the part of the first fixed keyword that follows its last
    # 'case'-labelled token; left unchanged when there is no such token.
    try:
        kword_fixed = [doc[[word.i for word in get_chunk2(kword_fixed[0]) if word.dep_ == 'case'][-1]+1:get_chunk2(kword_fixed[0]).end]]
    except:
        pass

    rel_list = kword_fixed + [get_chunk2(text) for text in kword]

    # Put the adposition that directly precedes a phrase in front of it.
    # NOTE(review): chunk.start == 0 makes the index -1 (last token of the doc).
    templist = []
    for chunk in rel_list:
        if doc[chunk.start-1].pos_ == 'ADP':
            templist.append(doc[chunk.start - 1:chunk.start])
        templist.append(chunk)


    # Flatten the phrases to tokens, dropping determiners.
    rel_list = [word for chunk in templist for word in chunk if word.pos_ != 'DET']

    # rel_unit: first relation name in `keywords` (for this unit entity) with a
    # trigger contained in some relation token, followed by a space; '' when
    # nothing matches or the unit entity is not in `keywords`.
    try:
        rel_unit = [key for key, value in keywords[quants[q].unit.entity.name].items() for w in rel_list 
                    for v in value if v in w.lower_][0] + ' '

    except:
        rel_unit = ''
        #if quants[q].unit.entity.name not in ['dimensionless','currency','unknown']:
        #    rel_unit = quants[q].unit.entity.name + ' '
        #else:
        #    rel_unit = ''

    # A trailing comparative adjective (tag 'JJR') is attached to the verb.
    if rel_list and rel_list[-1].tag_ == 'JJR':
        verb_out = str(verb_out) + ' ' + rel_list[-1].text

    verb_out = str(verb_out)

    # Drop a trailing adjective that is itself a trigger of the matched relation.
    try:
        if rel_list[-1].text in keywords[quants[q].unit.entity.name][rel_unit.strip()] and rel_list[-1].pos_ == 'ADJ':
            rel_list = rel_list[:-1]
    except:
        pass

    # Assemble the relation string.
    if not rel_list:
        relation = rel_unit.strip()

    # Relation name not already spelled out in the tokens: prefix it, linking
    # with 'of ' unless the tokens already start with an adposition.
    elif rel_unit.strip() not in " ".join(map(str,rel_list)):
        if rel_list[0].pos_ == 'ADP':
            relation = rel_unit + " ".join(map(str,rel_list))
        else:
            relation = rel_unit + 'of '+ " ".join(map(str,rel_list))
    # Otherwise use the tokens alone, minus a leading adposition other than 'per'.
    else:
        if rel_list[0].pos_ == 'ADP' and rel_list[0].lower_ != 'per':
            relation = " ".join(map(str,rel_list[1:]))
        else:
            relation = " ".join(map(str,rel_list))
    #-----------------------------------    
    return [doc.text, entity_out, relation, verb_out, quant_out, quant_value, quant_unit, quants[q].unit.entity.name]


Read in data and form extractions

In [9]:
# Cell writes one csv row per extraction, in the order returned by extract():
# (sentence, entity, relation, verb, quantity, q_value, q_unit, unit_entity).
# for txt files

n_written = 0  # extractions written to the csv
n_skipped = 0  # quantities or sentences skipped because a rule raised

# `with` closes both files even if an unexpected error escapes the loop.
with open("astrophysics_corpus.txt", "r", newline='', encoding="utf-8") as f_in, \
     open("astro_MyExtr.csv", mode="w", newline='', encoding="utf-8") as f_out:

    extract_writer = csv.writer(f_out, delimiter=',',quotechar='"', quoting=csv.QUOTE_MINIMAL)

    for row in f_in:

        txt = unicodedata.normalize('NFKD',str(row))
        doc = nlp(txt)

        # document-level coreference resolution
        #txt = doc._.coref_resolved
        #doc = nlp(txt)

        # split document into sentences
        sentences = [sent.text for sent in doc.sents]
        for sent in sentences:
            try:
                txt = unicodedata.normalize('NFKD',sent)
                # NOTE: `doc` must stay a notebook-level name; helper functions
                # defined in earlier cells may read it.
                doc = nlp(txt)

                # use quantulum parser to identify quantities we wish to extract
                quants = parser.parse(doc.text)

                # remove long-form numbers and dates from list of quantities to extract
                quants = [quant for quant in quants if any(char.isdigit() for char in quant.surface)]
                q_dates = [quant for token in doc for quant in quants if (token.idx == quant.span[0]) and (token.ent_type_ == 'DATE')]
                quants = [x for x in quants if x not in q_dates]

                # get token index of quantities from their string character index
                qSpans = get_indices(doc, quants)
            except Exception:
                # Best effort: skip a sentence that cannot be parsed, but keep
                # count (and let KeyboardInterrupt through, unlike bare except).
                n_skipped += 1
                continue

            # output extraction for each detected quantity
            for q in range(len(qSpans)):
                try:
                    a = extract(q, quants, qSpans,doc,keywords)
                    extract_writer.writerow(a)
                    n_written += 1
                except Exception:
                    # The rules raise on sentence shapes they do not cover;
                    # such quantities are skipped by design.
                    n_skipped += 1

print('extractions written:', n_written, ' skipped:', n_skipped)

In [8]:
# Cell writes one csv row per extraction, in the order returned by extract():
# (sentence, entity, relation, verb, quantity, q_value, q_unit, unit_entity).
# for dataframes.

# Coreference resolution needs the `coref_resolved` Doc extension. It is only
# present when a coreference component has registered it (the neuralcoref
# lines in the import cell are commented out), so resolve it once here and
# skip the step instead of failing on every document.
use_coref = spacy.tokens.Doc.has_extension('coref_resolved')
if not use_coref:
    print('coref_resolved extension not registered: skipping coreference resolution')

# `with` closes the output file even if an unexpected error escapes the loop.
with open("extractions.csv", mode="w", newline='', encoding="utf-8") as f_out:
    extract_writer = csv.writer(f_out, delimiter=',',quotechar='"', quoting=csv.QUOTE_MINIMAL)

    # process corpus 100 documents at a time into pandas dataframe.
    # NOTE(review): the range starts at 101, so file rows 1-100 are never read
    # -- confirm that leaving out the first 100 documents is intended.
    for row in range(101,1500,100):
        df = pd.read_csv("articles.csv", usecols=[1,3,4], nrows=100, encoding="utf-8", skiprows=range(1, row))
        df = df.dropna().reset_index(drop=True) # drop empty rows
        # raw string: '\d' in a plain literal is an invalid escape sequence
        df = df[df.body.str.contains(r'\d+')].reset_index(drop=True) # drop any documents not containing any numbers
        df.body = df.body.map(lambda txt: ' '.join(txt.split())) # remove extra whitespaces from each document
        print('processing row ',row,' to row ', row+100)

        # loop extraction over each document in corpus
        for txt in df.body:
            txt = unicodedata.normalize('NFKD',txt)
            doc = nlp(txt)

            # document-level coreference resolution
            if use_coref:
                txt = doc._.coref_resolved
                doc = nlp(txt)
            # split document into sentences
            sentences = [sent.text for sent in doc.sents]
            for sent in sentences:
                txt = unicodedata.normalize('NFKD',sent)
                # NOTE: `doc` must stay a notebook-level name; helper functions
                # defined in earlier cells may read it.
                doc = nlp(txt)

                # use quantulum parser to identify quantities we wish to extract
                quants = parser.parse(doc.text)

                # remove long-form numbers and dates from list of quantities to extract
                quants = [quant for quant in quants if any(char.isdigit() for char in quant.surface)]
                q_dates = [quant for token in doc for quant in quants if (token.idx == quant.span[0]) and (token.ent_type_ == 'DATE')]
                quants = [x for x in quants if x not in q_dates]

                # get token index of quantities from their string character index
                qSpans = get_indices(doc, quants)


                # output extraction for each detected quantity
                for q in range(len(qSpans)):
                    try:
                        a = extract(q, quants, qSpans,doc,keywords)
                        extract_writer.writerow(a)
                    except Exception:
                        # The rules raise on sentence shapes they do not cover;
                        # such quantities are skipped by design. Unlike a bare
                        # except, this lets KeyboardInterrupt stop a long run.
                        pass

processing row  101  to row  201
processing row  201  to row  301
processing row  301  to row  401
processing row  401  to row  501
processing row  501  to row  601
processing row  601  to row  701
processing row  701  to row  801
processing row  801  to row  901
processing row  901  to row  1001
processing row  1001  to row  1101
processing row  1101  to row  1201
processing row  1201  to row  1301
processing row  1301  to row  1401
processing row  1401  to row  1501
