Rule-based Open Numerical Relation Extractor (as described in the thesis)

In [None]:
# import libraries
import numpy as np
import pandas as pd
from quantulum3 import parser
import spacy
nlp = spacy.load('en_core_web_sm')
import unicodedata
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import neuralcoref
neuralcoref.add_to_pipe(nlp)

Numerical Relation Extraction

In [2]:
# returns noun chunk of a passed token
def get_chunk(token):
    """Return the noun chunk that contains ``token``.

    The chunk is searched in the token's own document (``token.doc``)
    rather than in a module-level ``doc`` variable, so the result is
    correct whichever document happens to be bound globally.

    Parameters
    ----------
    token
        A spaCy token (any object exposing ``.doc`` and ``.i`` works).

    Returns
    -------
    The first noun chunk of the document in which ``token`` occurs.  When
    no chunk contains the token, or the noun chunks cannot be read, the
    one-token slice ``token.doc[token.i:token.i + 1]`` is returned.
    """
    owner = token.doc
    try:
        for chunk in owner.noun_chunks:
            if token in chunk:
                return chunk
    except Exception:
        # Best effort: any failure while reading the noun chunks falls back
        # to the single-token slice below (KeyboardInterrupt and SystemExit
        # are deliberately not swallowed).
        pass
    return owner[token.i:token.i + 1]

In [3]:
# returns noun chunk of a passed phrase
def get_chunk2(text):
    """Extend a span leftwards to the start of its first token's noun chunk.

    The noun chunks are taken from the span's own document (``text.doc``)
    rather than from a module-level ``doc`` variable.

    Parameters
    ----------
    text
        A spaCy span (any sequence exposing ``.doc`` and ``.end`` works).

    Returns
    -------
    ``text.doc[chunk.start:text.end]`` where ``chunk`` is the first noun
    chunk containing ``text[0]``.  ``text`` itself is returned unchanged
    when the span is empty, when no chunk contains its first token, or
    when the noun chunks cannot be read.
    """
    try:
        first = text[0]
        owner = text.doc
        for chunk in owner.noun_chunks:
            if first in chunk:
                return owner[chunk.start:text.end]
    except Exception:
        # Best effort: fall through and hand the span back untouched
        # (KeyboardInterrupt and SystemExit are deliberately not swallowed).
        pass
    return text

In [4]:
# returns parent verb of a passed token
def get_parent_verb(token):
    """Return the closest ancestor of ``token`` whose ``pos_`` is ``'VERB'``.

    The search starts at ``token.head`` (the token itself is never tested
    unless it is its own head) and climbs the dependency tree.  It stops
    explicitly at the root, i.e. at a token whose head is itself, instead
    of recursing until the interpreter's recursion limit is hit.

    Parameters
    ----------
    token
        A spaCy token (any object exposing ``.head`` and ``.pos_`` works).

    Returns
    -------
    The first verb found on the way up, or the root token when no
    ancestor is a verb.  If an object without the expected attributes is
    met, the last object reached is returned.
    """
    current = token
    try:
        current = current.head
        while current.pos_ != 'VERB':
            parent = current.head
            if parent == current:
                # Reached the root of the parse without finding a verb.
                break
            current = parent
    except AttributeError:
        # Not a token-like object: return whatever was reached so far.
        pass
    return current

In [5]:
# returns start and end token indices of quantities in a doc
def get_indices(doc, quants):
    """Map each quantity's character span to a ``(start, end)`` token span.

    Exactly one pair is produced per quantity, in the order of ``quants``,
    so ``result[k]`` always belongs to ``quants[k]``.  (Collecting starts
    and ends independently and zipping them loses that pairing as soon as
    one quantity has no end token, or has two candidate end tokens.)

    Parameters
    ----------
    doc
        Iterable of tokens exposing ``.idx`` (character offset) and ``.i``
        (token index), in document order.
    quants
        Iterable of quantities exposing ``.span`` as
        ``(start_char, end_char)``.

    Returns
    -------
    list of ``(start, end)`` tuples.  ``start`` is the index of the token
    that begins at, or contains, ``start_char``.  ``end`` is the index of
    the first token beginning at or after ``end_char``; when the quantity
    reaches the end of the text it is one past the last token index, so
    ``doc[start:end]`` still covers the quantity.
    """
    from bisect import bisect_left, bisect_right

    tokens = list(doc)
    offsets = [token.idx for token in tokens]
    past_last = tokens[-1].i + 1 if tokens else 0

    qSpans = []
    for quant in quants:
        start_char, end_char = quant.span[0], quant.span[1]

        # Last token starting at or before the quantity's first character.
        start_pos = max(bisect_right(offsets, start_char) - 1, 0)
        start = tokens[start_pos].i if tokens else 0

        # First token starting at or after the quantity's last character + 1.
        end_pos = bisect_left(offsets, end_char)
        end = tokens[end_pos].i if end_pos < len(tokens) else past_last

        qSpans.append((start, end))

    return qSpans

In [6]:
# returns joined consecutive noun phrases
def combine_phrase(ind, prev_or_next,noun_list):
    """Walk over consecutive noun entries of ``noun_list`` starting at ``ind``.

    Parameters
    ----------
    ind
        Position in ``noun_list`` to start from.
    prev_or_next
        ``'prev'`` to walk towards the start of the list, ``'next'`` to
        walk towards its end.
    noun_list
        Sequence of ``(token, dep, token_index)`` tuples; only
        ``token.pos_`` and ``token_index`` are read.

    Returns
    -------
    ``(token_index, position)`` of the furthest entry reached while the
    neighbouring entry in the chosen direction is a ``NOUN`` or ``PROPN``.
    ``None`` when ``prev_or_next`` is neither ``'prev'`` nor ``'next'``.

    Raises
    ------
    IndexError
        If ``ind`` is not a valid position in ``noun_list``.
    """
    nominal = ('NOUN', 'PROPN')

    if prev_or_next == 'prev':
        # Stop at position 0: looking at ``ind - 1`` there would silently
        # wrap around to the last entry of the list.
        while ind > 0 and noun_list[ind - 1][0].pos_ in nominal:
            ind -= 1
        return (noun_list[ind][2], ind)

    if prev_or_next == 'next':
        while ind + 1 < len(noun_list) and noun_list[ind + 1][0].pos_ in nominal:
            ind += 1
        return (noun_list[ind][2], ind)

    return None

In [88]:
# Trigger words used for the BONIE test-data extractions.
# Layout: unit entity name -> relation label -> words that signal that relation.
keywords = {
    "length": {
        "height": [
            "height",
            "high",
            "highest",
            "elevation",
            "above sea level",
            "altitude",
            "tall",
        ],
        "length": ["length", "long", "longest"],
        "width": ["width", "wide", "breadth"],
        "distance": ["distance", "away", "furthest"],
    },
    "currency": {
        "GDP": ["gdp", "gross domestic product"],
        "stock price": ["stock price"],
    },
    "percentage": {
        "interest rate": ["interest rate"],
        "unemployment rate": ["unemployment rate"],
    },
}

def extract(q,quants,qSpans,doc,keywords):
    """Build one (entity, relation, verb, quantity) extraction for quantity ``q``.

    Parameters
    ----------
    q
        Index of the quantity; used to index both ``quants`` and ``qSpans``.
    quants
        Sequence of quantity objects; the attributes read here are
        ``.unit.entity.name``, ``.unit.name``, ``.surface`` and ``.value``.
    qSpans
        Sequence of ``(start, end)`` token-index pairs, one per quantity.
    doc
        The parsed document; it is sliced by token index and its tokens'
        ``pos_``, ``dep_``, ``tag_``, ``head``, ``lefts`` and ``subtree``
        are used.
    keywords
        Nested mapping: unit entity name -> relation label -> list of
        trigger words.

    Returns
    -------
    A 4-tuple.  ``(entity_out, relation, verb_out, quant_out)`` in general;
    ``(quant_out, verb_out, relation, entity_out)`` when the output verb is
    not ``'is'`` and the quantity's root is a (passive) subject.
    ``entity_out`` is a slice of ``doc``; the other three are strings.

    Raises
    ------
    Several lookups (``list.index`` calls, list/doc indexing) are not
    guarded, so ``ValueError`` / ``IndexError`` can propagate when the
    sentence does not have the expected shape.
    """
    # Root of the noun chunk that holds the quantity, and the verb above the quantity.
    qroot = get_chunk(doc[qSpans[q][0]:qSpans[q][1]].root).root
    verb = get_parent_verb(doc[qSpans[q][0]])

    # No noun/pronoun among the verb's left children: use the next verb further up instead.
    if not [t for t in verb.lefts if t.pos_ in ['NOUN','PROPN','PRON']]:
        verb = get_parent_verb(get_parent_verb(doc[qSpans[q][0]]))

    # create a list
    # (token, dependency label, token index) for the tokens of the verb's subtree
    # that carry one of the listed dependency labels, plus qroot and the verb.
    noun_list = [(token,token.dep_,token.i) for token in verb.subtree 
                 if (token.dep_ in ['poss','pobj','dobj','attr','nsubj','nsubjpass','punct','det','cc']
                     or token in [qroot,verb]) and token.dep_ != 'compound']


    #--------------------------------------------------------------
    # remove subphrases
    # sub_phrase keeps only commas, qroot, the verb and the verb's own subject(s).
    sub_phrase = [(token,token.dep_,token.i) for token in verb.subtree 
         if token.text == ',' or token in [qroot,verb] or (token.head == verb and token.dep_ in ['nsubj','nsubjpass'])]

    # Two commas that are adjacent in sub_phrase enclose none of qroot/verb/subject:
    # drop everything between them (inclusive) from noun_list.
    for i in range(len(sub_phrase) -1):
        if sub_phrase[i][1] == 'punct' and sub_phrase[i+1][1] == 'punct':
            del noun_list[noun_list.index(sub_phrase[i]):noun_list.index(sub_phrase[i+1])+1]
    # Leading comma: drop what precedes it in noun_list.
    try:
        if sub_phrase[0][1] == 'punct':
            del noun_list[:noun_list.index(sub_phrase[0])]
    except:
        pass
    # Trailing comma: drop it and what follows it in noun_list.
    try:
        if sub_phrase[-1][1] == 'punct':
            del noun_list[noun_list.index(sub_phrase[-1]):]
    except:
        pass
    #--------------------------------------------------------------
    # Positions of the quantity root and of the verb inside noun_list.
    q_ind = noun_list.index((qroot, qroot.dep_,qroot.i))
    v_ind = noun_list.index((verb, verb.dep_,verb.i))
    kword = []        # candidate relation spans, later widened with get_chunk2
    kword_fixed = []  # relation spans that are used as they are

    # Case 1: the quantity is the subject, so the entity is looked for after it.
    if qroot.dep_ in ['nsubj','nsubjpass']:

        # p_ind: first punctuation entry at or after the verb.
        # NOTE(review): the fallback is a token count (len(doc)-1) while p_ind is
        # otherwise a position in noun_list; noun_list[p_ind] below may then be
        # out of range -- confirm.
        try:
            p_ind = noun_list.index([i for i in noun_list[v_ind:] if i[1] == 'punct'][0])
        except:
            p_ind = len(doc)-1
        #-------------------------------------------------------------------------------------#
        # split noun chunks containing a possessive to get entity's keyword
        #-------------------------------------------------------------------------------------#
        if noun_list[q_ind+1][0].pos_ in ['DET','PROPN','NOUN']:
            ent_ind = q_ind+1
            entity = doc[noun_list[ent_ind][2]:combine_phrase(ent_ind,'next',noun_list)[0] + 1]
            kword.append(doc[verb.i + 1 :noun_list[p_ind][2]])

        else:
            try:
                # First possessive between the verb and p_ind: the possessor is the
                # entity, the entry after it is taken as a fixed relation span.
                ent_ind = noun_list.index([i for i in noun_list[v_ind:p_ind] if i[1] == 'poss'][0])
                entity = doc[combine_phrase(ent_ind,'prev',noun_list)[0] :noun_list[ent_ind][2] + 1]

                kword_fixed.append(doc[noun_list[ent_ind+1][2] : combine_phrase(ent_ind+1,'next',noun_list)[0]+1])
            except:
            #-------------------------------------------------------------------------------------#
            # If > 1 determiner before verb, first is relation, second is entity
            #-------------------------------------------------------------------------------------#
                # Fallback chain for the entity position: second determiner, then a
                # proper-noun pobj, then any pobj, then any noun.
                try:
                    ent_ind = noun_list.index([i for i in noun_list[v_ind:p_ind] if i[1] == 'det'][1])
                except:
                    try:
                        ent_ind = noun_list.index([i for i in noun_list[v_ind:p_ind] if i[0].pos_ == 'PROPN' and i[1] == 'pobj'][0])
                    except:
                        try: 
                            ent_ind = noun_list.index([i for i in noun_list[v_ind:p_ind] if i[1] == 'pobj'][0])
                        except:
                            ent_ind = noun_list.index([i for i in noun_list[v_ind:p_ind] if i[0].pos_ == 'NOUN'][0])  ### added this

                entity = doc[noun_list[ent_ind][2]:combine_phrase(ent_ind,'next',noun_list)[0] + 1]

            # Re-locate the entity by its first token, then collect the nouns that
            # sit between the verb and the entity as a keyword span.
            ent_ind = noun_list.index((entity[0],entity[0].dep_,entity.start))

            temp = [i for i in noun_list[v_ind + 1 :ent_ind] if i[0].pos_ == 'NOUN']
            try:
                kword.append(doc[get_chunk(doc[temp[0][2]]).start:temp[-1][2]+1])
            except:
                pass

    # Case 2: the quantity is not the subject, so the entity is looked for before the verb.
    else:
        # p_ind: last punctuation entry before the verb, or the start of noun_list.
        try:
            p_ind = noun_list.index([i for i in noun_list[:v_ind] if i[1] == 'punct'][-1])
        except:
            p_ind = 0
        #-------------------------------------------------------------------------------------#
        # split noun chunks containing a possessive to get entity's keyword
        #-------------------------------------------------------------------------------------#
        # Only possessors directly followed by a token tagged 'POS' are considered;
        # otherwise the entry just before the verb is taken.
        try:
            ent_ind = noun_list.index([i for i in noun_list[p_ind:v_ind] if i[1] == 'poss' and doc[i[2]+1].tag_ == 'POS'][-1])
            kword_fixed.append(doc[noun_list[ent_ind+1][2] : combine_phrase(ent_ind+1,'next',noun_list)[0]+1])
        except:
            ent_ind = v_ind-1

        #-------------------------------------------------------------------------------------#
        # If > 1 determiner before verb, first is relation, second is entity
        #-------------------------------------------------------------------------------------#
        if len([i for i in noun_list[p_ind:ent_ind] if i[1] == 'det'])>1:
            entity = doc[combine_phrase(ent_ind,'prev',noun_list)[0] :noun_list[ent_ind][2] + 1]
        else:         
            # Start the entity at the first proper noun in range if there is one,
            # otherwise use the single token at ent_ind.
            try:
                entity = doc[[i for i in noun_list[p_ind:ent_ind] if i[0].pos_ == 'PROPN'][0][2]:noun_list[ent_ind][2] + 1]
            except:
                entity = doc[noun_list[ent_ind][2]:noun_list[ent_ind][2]+1]
        ent_ind = noun_list.index((entity[0],entity[0].dep_,entity.start)) 

        #------------------------#
        # Nouns between p_ind and the entity become a keyword span.
        temp = [i for i in noun_list[p_ind:ent_ind] if i[0].pos_ == 'NOUN']
        try:
            kword.append(doc[get_chunk(doc[temp[0][2]]).start:temp[-1][2]+1])
        except:
            pass

    #-------------------------------------------------------------------------------------#
    # any nouns before or after the quantity are keywords
    #-------------------------------------------------------------------------------------#
    # (the two blocks below are indented into the ``else`` branch above, so they
    # only run when the quantity is not the subject)

        try: 
            if noun_list[q_ind - 1][1] in ['pobj','dobj']:
                kword.append(doc[combine_phrase(q_ind-1,'prev',noun_list)[0] :noun_list[q_ind-1][2] + 1])
        except:
            pass
        
        try: 
            if noun_list[q_ind +1][1] in ['pobj','dobj','det']:
                kword.append(doc[noun_list[q_ind][2] +1 :combine_phrase(q_ind+1,'next',noun_list)[0] + 1])
        except:
            pass

    #-------------------------------------------------------------------------------------#
    # want to output quantity's parent verb if different from entity's
    #-------------------------------------------------------------------------------------#
    verb = get_parent_verb(doc[qSpans[q][0]])

    # Verb lemmas that are kept in the output instead of being replaced by 'is'.
    change_w = ['incline','grow','increase','surge','rise','climb','decline','decrease','fall','tumble','decline','crash']

    if quants[q].unit.entity.name != 'dimensionless' and verb.lemma_ not in change_w:
        verb_out = 'is'
    else:
        verb_out = verb.text

    # Attach the adposition that directly precedes the quantity's noun chunk to the verb.
    if doc[get_chunk(qroot).start - 1].pos_ == 'ADP' and verb_out != 'is':
        verb_out = verb_out +' '+ doc[get_chunk(qroot).start - 1].text

    # An adjective right after the quantity root is a keyword candidate.
    try:
        if doc[qroot.i + 1].pos_ == 'ADJ':
            kword.append(doc[qroot.i+1:qroot.i+2])
    except:
        pass

    if quants[q].unit.name != 'dimensionless':


        # Quantity span: from its first token to the last token (up to qroot) whose
        # text occurs in the quantity's surface string; the surface is then replaced
        # by "<value> <unit name>".  Tokens left between that span and qroot are
        # kept as a fixed relation span.
        quantity = doc[qSpans[q][0]:[word.i for word in doc[qSpans[q][0]:qroot.i + 1] if word.text in quants[q].surface][-1] + 1]
        quant_out = quantity.text.replace(quants[q].surface ,str(quants[q].value) + ' ' + quants[q].unit.name)
        try:
            kword_fixed.append(doc[quantity.end:qroot.i + 1])
        except:
            pass

    else:
        quant_out = doc[qSpans[q][0]:qroot.i + 1].text
    # Entity widened to the start of the noun chunk of its first token.
    entity_out = doc[get_chunk(entity[0]).start : entity.end]

    #------------------------------------------------
    #Building relation using keywords + unit
    #------------------------------------------------

    # If the first fixed span's chunk contains a token with dependency 'case',
    # keep only what follows the last such token.
    try:
        kword_fixed = [doc[[word.i for word in get_chunk2(kword_fixed[0]) if word.dep_ == 'case'][-1]+1:get_chunk2(kword_fixed[0]).end]]
    except:
        pass

    rel_list = kword_fixed + [get_chunk2(text) for text in kword]

    # Put the adposition that directly precedes a span in front of it.
    templist = []
    for chunk in rel_list:
        if doc[chunk.start-1].pos_ == 'ADP':
            templist.append(doc[chunk.start - 1:chunk.start])
        templist.append(chunk)


    # Flatten the spans into tokens and drop determiners.
    rel_list = [word for chunk in templist for word in chunk if word.pos_ != 'DET']

    # Relation label: first label of this unit entity with a trigger word that is
    # contained in the lowercase text of one of the relation tokens.  Without a
    # match the unit entity name is used, except for 'dimensionless' and 'currency'.
    try:
        rel_unit = [key for key, value in keywords[quants[q].unit.entity.name].items() for w in rel_list 
                    for v in value if v in w.lower_][0] + ' '

    except:
        if quants[q].unit.entity.name not in ['dimensionless','currency']:
            rel_unit = quants[q].unit.entity.name + ' '
        else:
            rel_unit = ''

    # A trailing token tagged 'JJR' is appended to the verb.
    if rel_list and rel_list[-1].tag_ == 'JJR':
        verb_out = str(verb_out) + ' ' + rel_list[-1].text

    verb_out = str(verb_out)

    # Drop a trailing adjective that is itself a trigger word of the chosen label.
    try:
        if rel_list[-1].text in keywords[quants[q].unit.entity.name][rel_unit.strip()] and rel_list[-1].pos_ == 'ADJ':
            rel_list = rel_list[:-1]
    except:
        pass

    # Assemble the relation string: label alone, label (+ 'of') + tokens when the
    # label text is not already among the tokens, or the tokens alone (minus a
    # leading adposition other than 'per') when it is.
    if not rel_list:
        relation = rel_unit.strip()

    elif rel_unit.strip() not in " ".join(map(str,rel_list)):
        if rel_list[0].pos_ == 'ADP':
            relation = rel_unit + " ".join(map(str,rel_list))
        else:
            relation = rel_unit + 'of '+ " ".join(map(str,rel_list))
    else:
        if rel_list[0].pos_ == 'ADP' and rel_list[0].lower_ != 'per':
            relation = " ".join(map(str,rel_list[1:]))
        else:
            relation = " ".join(map(str,rel_list))
    #-----------------------------------    
    if verb_out == 'is':
        return (entity_out,relation,verb_out,quant_out)
    else:
        # Quantity-first ordering when the quantity is the subject of a kept verb.
        if qroot.dep_ in ['nsubj','nsubjpass']:
            return (quant_out,verb_out,relation,entity_out)
        else:
            return (entity_out,relation,verb_out,quant_out)  


In [89]:
# output test extractions to file
# For every input sentence the sentence text is written, followed by one line
# per successful extraction and a blank separator line.  Both files are opened
# with ``with`` so they are closed even if processing stops with an error.
with open("BONIE_test_sentences.txt", "r") as f_in, open("my_extractions.txt", "a") as f_out:
    for line in f_in:
        txt = unicodedata.normalize('NFKD', line)
        doc = nlp(txt)
        quants = parser.parse(doc.text)
        qSpans = get_indices(doc, quants)

        f_out.write(doc.text)
        for q in range(len(qSpans)):
            try:
                a = extract(q, quants, qSpans, doc, keywords)
            except Exception:
                # Best effort: a quantity the rules cannot handle is skipped.
                continue
            f_out.write(str(a))
            f_out.write('\n')

        f_out.write('\n')

TOPIC MODELING

In [7]:
# Trigger words used for the topic-modelling extractions.
# Layout: unit entity name -> relation label -> words that signal that relation.
keywords = {
    "length": {
        "height": [
            "height",
            "high",
            "highest",
            "elevation",
            "above sea level",
            "altitude",
            "tall",
        ],
        "length": ["length", "long", "longest"],
        "width": ["width", "wide", "breadth"],
        "distance": ["distance", "away", "furthest"],
        "depth": ["depth", "deep"],
    },
}

def extract(q,quants,qSpans,doc,keywords):
    """Build one extraction tuple for quantity ``q`` (topic-modelling variant).

    Parameters
    ----------
    q
        Index of the quantity; used to index both ``quants`` and ``qSpans``.
    quants
        Sequence of quantity objects; the attributes read here are
        ``.unit.entity.name``, ``.unit.name``, ``.surface`` and ``.value``.
    qSpans
        Sequence of ``(start, end)`` token-index pairs, one per quantity.
    doc
        The parsed document; it is sliced by token index and its tokens'
        ``pos_``, ``dep_``, ``tag_``, ``head``, ``lefts`` and ``subtree``
        are used.
    keywords
        Nested mapping: unit entity name -> relation label -> list of
        trigger words.

    Returns
    -------
    ``(entity_out, <joined keyword string>, verb_out, quant_out)``, where
    ``entity_out`` is a slice of ``doc`` and the rest are strings.

    Raises
    ------
    Several lookups (``list.index`` calls, list/doc indexing) are not
    guarded, so ``ValueError`` / ``IndexError`` can propagate when the
    sentence does not have the expected shape.  See also the review note
    on the ``return`` statement about the undefined name ``keyword``.
    """
    # Root of the noun chunk that holds the quantity, and the verb above the quantity.
    qroot = get_chunk(doc[qSpans[q][0]:qSpans[q][1]].root).root
    verb = get_parent_verb(doc[qSpans[q][0]])

    # No noun/pronoun among the verb's left children: use the next verb further up instead.
    if not [t for t in verb.lefts if t.pos_ in ['NOUN','PROPN','PRON']]:
        verb = get_parent_verb(get_parent_verb(doc[qSpans[q][0]]))

    # create a list
    # (token, dependency label, token index) for the tokens of the verb's subtree
    # that carry one of the listed dependency labels, plus qroot and the verb.
    noun_list = [(token,token.dep_,token.i) for token in verb.subtree 
                 if (token.dep_ in ['poss','pobj','dobj','attr','nsubj','nsubjpass','punct','det','cc']
                     or token in [qroot,verb]) and token.dep_ != 'compound']


    #--------------------------------------------------------------
    # remove subphrases
    # sub_phrase keeps only commas, qroot, the verb and the verb's own subject(s).
    sub_phrase = [(token,token.dep_,token.i) for token in verb.subtree 
         if token.text == ',' or token in [qroot,verb] or (token.head == verb and token.dep_ in ['nsubj','nsubjpass'])]

    # Two commas that are adjacent in sub_phrase enclose none of qroot/verb/subject:
    # drop everything between them (inclusive) from noun_list.
    for i in range(len(sub_phrase) -1):
        if sub_phrase[i][1] == 'punct' and sub_phrase[i+1][1] == 'punct':
            del noun_list[noun_list.index(sub_phrase[i]):noun_list.index(sub_phrase[i+1])+1]
    # Leading comma: drop what precedes it in noun_list.
    try:
        if sub_phrase[0][1] == 'punct':
            del noun_list[:noun_list.index(sub_phrase[0])]
    except:
        pass
    # Trailing comma: drop it and what follows it in noun_list.
    try:
        if sub_phrase[-1][1] == 'punct':
            del noun_list[noun_list.index(sub_phrase[-1]):]
    except:
        pass
    #--------------------------------------------------------------
    # Positions of the quantity root and of the verb inside noun_list.
    q_ind = noun_list.index((qroot, qroot.dep_,qroot.i))
    v_ind = noun_list.index((verb, verb.dep_,verb.i))
    kword = []        # candidate relation spans, later widened with get_chunk2
    kword_fixed = []  # relation spans that are used as they are
    #print(noun_list)
    #############################################################################
    # Case 1: the quantity is the subject, so the entity is looked for after it.
    if qroot.dep_ in ['nsubj','nsubjpass']:

        # p_ind: first punctuation entry at or after the verb.
        # NOTE(review): the fallback is a token count (len(doc)-1) while p_ind is
        # otherwise a position in noun_list; noun_list[p_ind] below may then be
        # out of range -- confirm.
        try:
            p_ind = noun_list.index([i for i in noun_list[v_ind:] if i[1] == 'punct'][0])
        except:
            p_ind = len(doc)-1
        #-------------------------------------------------------------------------------------#
        # split noun chunks containing a possessive to get entity's keyword
        #-------------------------------------------------------------------------------------#
        if noun_list[q_ind+1][0].pos_ in ['DET','PROPN','NOUN']:
            ent_ind = q_ind+1
            entity = doc[noun_list[ent_ind][2]:combine_phrase(ent_ind,'next',noun_list)[0] + 1]
            kword.append(doc[verb.i + 1 :noun_list[p_ind][2]])

        else:
            try:
                # First possessive between the verb and p_ind: the possessor is the
                # entity, the entry after it is taken as a fixed relation span.
                ent_ind = noun_list.index([i for i in noun_list[v_ind:p_ind] if i[1] == 'poss'][0])
                entity = doc[combine_phrase(ent_ind,'prev',noun_list)[0] :noun_list[ent_ind][2] + 1]

                kword_fixed.append(doc[noun_list[ent_ind+1][2] : combine_phrase(ent_ind+1,'next',noun_list)[0]+1])
            except:
            #-------------------------------------------------------------------------------------#
            # If > 1 determiner before verb, first is relation, second is entity
            #-------------------------------------------------------------------------------------#
                # Fallback chain for the entity position: second determiner, then a
                # proper-noun pobj, then any pobj, then any noun.
                try:
                    ent_ind = noun_list.index([i for i in noun_list[v_ind:p_ind] if i[1] == 'det'][1])
                except:
                    try:
                        ent_ind = noun_list.index([i for i in noun_list[v_ind:p_ind] if i[0].pos_ == 'PROPN' and i[1] == 'pobj'][0])
                    except:
                        try: 
                            ent_ind = noun_list.index([i for i in noun_list[v_ind:p_ind] if i[1] == 'pobj'][0])
                        except:
                            ent_ind = noun_list.index([i for i in noun_list[v_ind:p_ind] if i[0].pos_ == 'NOUN'][0])  ### added this

                entity = doc[noun_list[ent_ind][2]:combine_phrase(ent_ind,'next',noun_list)[0] + 1]

            # Re-locate the entity by its first token, then collect the nouns that
            # sit between the verb and the entity as a keyword span.
            ent_ind = noun_list.index((entity[0],entity[0].dep_,entity.start))

            temp = [i for i in noun_list[v_ind + 1 :ent_ind] if i[0].pos_ == 'NOUN']
            try:
                kword.append(doc[get_chunk(doc[temp[0][2]]).start:temp[-1][2]+1])
            except:
                pass

    # Case 2: the quantity is not the subject, so the entity is looked for before the verb.
    else:
        # p_ind: last punctuation entry before the verb, or the start of noun_list.
        try:
            p_ind = noun_list.index([i for i in noun_list[:v_ind] if i[1] == 'punct'][-1])
        except:
            p_ind = 0
        #-------------------------------------------------------------------------------------#
        # split noun chunks containing a possessive to get entity's keyword
        #-------------------------------------------------------------------------------------#
        # Only possessors directly followed by a token tagged 'POS' are considered;
        # otherwise the entry just before the verb is taken.
        try:
            ent_ind = noun_list.index([i for i in noun_list[p_ind:v_ind] if i[1] == 'poss' and doc[i[2]+1].tag_ == 'POS'][-1])
            kword_fixed.append(doc[noun_list[ent_ind+1][2] : combine_phrase(ent_ind+1,'next',noun_list)[0]+1])
        except:
            ent_ind = v_ind-1

        #-------------------------------------------------------------------------------------#
        # If > 1 determiner before verb, first is relation, second is entity
        #-------------------------------------------------------------------------------------#
        if len([i for i in noun_list[p_ind:ent_ind] if i[1] == 'det'])>1:
            entity = doc[combine_phrase(ent_ind,'prev',noun_list)[0] :noun_list[ent_ind][2] + 1]
        else:         
            # Start the entity at the first proper noun in range if there is one,
            # otherwise use the single token at ent_ind.
            try:
                entity = doc[[i for i in noun_list[p_ind:ent_ind] if i[0].pos_ == 'PROPN'][0][2]:noun_list[ent_ind][2] + 1]
            except:
                entity = doc[noun_list[ent_ind][2]:noun_list[ent_ind][2]+1]
        ent_ind = noun_list.index((entity[0],entity[0].dep_,entity.start)) 

        #------------------------#
        # Nouns between p_ind and the entity become a keyword span.
        temp = [i for i in noun_list[p_ind:ent_ind] if i[0].pos_ == 'NOUN']
        try:
            kword.append(doc[get_chunk(doc[temp[0][2]]).start:temp[-1][2]+1])
        except:
            pass

    #-------------------------------------------------------------------------------------#
    # any nouns before or after the quantity are keywords
    #-------------------------------------------------------------------------------------#
    # (the two blocks below are indented into the ``else`` branch above, so they
    # only run when the quantity is not the subject)

        try: 
            if noun_list[q_ind - 1][1] in ['pobj','dobj']:
                kword.append(doc[combine_phrase(q_ind-1,'prev',noun_list)[0] :noun_list[q_ind-1][2] + 1])
        except:
            pass
        
#        try:
#            kword.append(doc[verb.i:verb.i+1])
#        except:
#            pass
        
        try: 
            if noun_list[q_ind +1][1] in ['pobj','dobj','det']:
                kword.append(doc[noun_list[q_ind][2] +1 :combine_phrase(q_ind+1,'next',noun_list)[0] + 1])
        except:
            pass

    #-------------------------------------------------------------------------------------#
    # want to output quantity's parent verb if different from entity's
    #-------------------------------------------------------------------------------------#
    verb = get_parent_verb(doc[qSpans[q][0]])

    # Verb lemmas that are kept in the output instead of being replaced by 'is'.
    change_w = ['incline','grow','increase','surge','rise','climb','decline','decrease','fall','tumble','decline','crash']

    if quants[q].unit.entity.name != 'dimensionless' and verb.lemma_ not in change_w:
        verb_out = 'is'
    else:
        verb_out = verb.text

    # Attach the adposition that directly precedes the quantity's noun chunk to the verb.
    if doc[get_chunk(qroot).start - 1].pos_ == 'ADP' and verb_out != 'is':# and verb.lemma_ not in ['be','have'] :
        verb_out = verb_out +' '+ doc[get_chunk(qroot).start - 1].text

    # An adjective right after the quantity root is a keyword candidate.
    try:
        if doc[qroot.i + 1].pos_ == 'ADJ':
            kword.append(doc[qroot.i+1:qroot.i+2])
    except:
        pass

    if quants[q].unit.name != 'dimensionless':


        # Quantity span: from its first token to the last token (up to qroot) whose
        # text occurs in the quantity's surface string; the surface is then replaced
        # by "<value> <unit name>".  Tokens left between that span and qroot are
        # kept as a fixed relation span.
        quantity = doc[qSpans[q][0]:[word.i for word in doc[qSpans[q][0]:qroot.i + 1] if word.text in quants[q].surface][-1] + 1]
        quant_out = quantity.text.replace(quants[q].surface ,str(quants[q].value) + ' ' + quants[q].unit.name)
        try:
            kword_fixed.append(doc[quantity.end:qroot.i + 1])
        except:
            pass

    else:
        quant_out = doc[qSpans[q][0]:qroot.i + 1].text
    # Entity widened to the start of the noun chunk of its first token.
    entity_out = doc[get_chunk(entity[0]).start : entity.end]
#    if len([token for token in entity_out if token.pos_ in ['NOUN','PROPN','PRON']]) <1:
#        return doc

    #------------------------------------------------
    #Building relation using keywords + unit
    #------------------------------------------------

    # If the first fixed span's chunk contains a token with dependency 'case',
    # keep only what follows the last such token.
    try:
        kword_fixed = [doc[[word.i for word in get_chunk2(kword_fixed[0]) if word.dep_ == 'case'][-1]+1:get_chunk2(kword_fixed[0]).end]]
    except:
        pass

    rel_list = kword_fixed + [get_chunk2(text) for text in kword]

    # Put the adposition that directly precedes a span in front of it.
    templist = []
    for chunk in rel_list:
        if doc[chunk.start-1].pos_ == 'ADP':
            templist.append(doc[chunk.start - 1:chunk.start])
        templist.append(chunk)


    # Flatten the spans into tokens and drop determiners.
    rel_list = [word for chunk in templist for word in chunk if word.pos_ != 'DET']

    # Relation label: first label of this unit entity for which the lowercase text
    # of a relation token is contained in one of its trigger words (note the
    # direction of the substring test).  Without a match the unit entity name is
    # used, except for 'dimensionless' and 'currency'.
    try:
        rel_unit = [key for key, value in keywords[quants[q].unit.entity.name].items() for w in rel_list 
                    for v in value if w.lower_ in v][0] + ' '

    except:
        if quants[q].unit.entity.name not in ['dimensionless','currency']:  #### added currency
            rel_unit = quants[q].unit.entity.name + ' '
        else:
            rel_unit = ''

    # A trailing token tagged 'JJR' is appended to the verb.
    if rel_list and rel_list[-1].tag_ == 'JJR':
        verb_out = str(verb_out) + ' ' + rel_list[-1].text

    verb_out = str(verb_out)

    # Drop a trailing adjective that is itself a trigger word of the chosen label.
    try:
        if rel_list[-1].text in keywords[quants[q].unit.entity.name][rel_unit.strip()] and rel_list[-1].pos_ == 'ADJ':
            rel_list = rel_list[:-1]
    except:
        pass

    # Assemble the relation string: label alone, label (+ 'of') + tokens when the
    # label text is not already among the tokens, or the tokens alone (minus a
    # leading adposition other than 'per') when it is.
    if not rel_list:
        relation = rel_unit.strip()

    elif rel_unit.strip() not in " ".join(map(str,rel_list)):
        if rel_list[0].pos_ == 'ADP':
            relation = rel_unit + " ".join(map(str,rel_list))
        else:
            relation = rel_unit + 'of '+ " ".join(map(str,rel_list))
    else:
        if rel_list[0].pos_ == 'ADP' and rel_list[0].lower_ != 'per':
            relation = " ".join(map(str,rel_list[1:]))
        else:
            relation = " ".join(map(str,rel_list))
    #-----------------------------------    
# for full extraction
    # NOTE(review): ``keyword`` is not defined inside this function (the parameter
    # is ``keywords``; the locals are ``kword``, ``kword_fixed`` and ``relation``,
    # and ``relation`` is computed above but not returned).  Unless a global named
    # ``keyword`` exists elsewhere, this line raises NameError -- confirm the
    # intended name.
    return (entity_out," ".join(map(str,keyword)),verb_out,quant_out)

# for only relation and unit (quantity is filtered out along with stop words later)
#    return relation + " " + quant_out


In [1]:
# 33 documents from Wikipedia used as test data for topic modelling
full_data = """The Sun is the star at the center of the Solar System. It is a nearly perfect sphere of hot plasma, with internal convective motion that generates a magnetic field via a dynamo process. It is by far the most important source of energy for life on Earth. Its diameter is about 1.39 million kilometers, or 109 times that of Earth, and its mass is about 330,000 times that of Earth. It accounts for about 99.86% of the total mass of the Solar System. Roughly 73% of the Sun's mass consists of hydrogen; 25% is mostly helium, with much smaller quantities of heavier elements, including oxygen, carbon, neon, and iron. The Sun is a G-type main-sequence star based on its spectral class. As such, it is informally and not completely accurately referred to as a yellow dwarf (its light is closer to white than yellow). It formed approximately 4.6 billion years ago from the gravitational collapse of matter within a region of a large molecular cloud. Most of this matter gathered in the center, whereas the rest flattened into an orbiting disk that became the Solar System. The central mass became so hot and dense that it eventually initiated nuclear fusion in its core. It is thought that almost all stars form by this process. The Sun currently fuses about 600 million tons of hydrogen into helium every second, converting 4 million tons of matter into energy every second as a result. This energy, which can take between 10,000 and 170,000 years to escape from its core, is the source of the Sun's light and heat. When hydrogen fusion in its core has diminished to the point at which the Sun is no longer in hydrostatic equilibrium, its core will undergo a marked increase in density and temperature while its outer layers expand, eventually transforming the Sun into a red giant. It is calculated that the Sun will become sufficiently large to engulf the current orbits of Mercury and Venus, and render Earth uninhabitable – but not for about five billion years. 
After this, it will shed its outer layers and become a dense type of cooling star known as a white dwarf, and no longer produce energy by fusion, but still glow and give off heat from its previous fusion. The enormous effect of the Sun on Earth has been recognized since prehistoric times, and the Sun has been regarded by some cultures as a deity. The synodic rotation of Earth and its orbit around the Sun are the basis of solar calendars, one of which is the predominant calendar in use today.
Jupiter is the fifth planet from the Sun and the largest in the Solar System. It is a gas giant with a mass one-thousandth that of the Sun, and is one of the brightest objects visible to the naked eye in the night sky. Jupiter is composed primarily of gaseous and liquid matter. It is the largest planet in the Solar System. It has a diameter of 142,984 km at its equator. Jupiter is much larger than Earth and considerably less dense: its volume is that of about 1,321 Earths, but it is only 318 times as massive.
Earth is the third planet from the Sun and the only astronomical object known to harbor life. Earth's gravity interacts with other objects in space, especially the Sun and the Moon, which is Earth's only natural satellite. Earth orbits around the Sun in 365.256 days, a period known as an Earth sidereal year. About 71% of Earth's surface is covered with water, mostly by oceans. The remaining 29% is land consisting of continents and islands that together contain many lakes, rivers and other fresh water, which, together with the oceans, constitute the hydrosphere.
Mercury is the smallest and innermost planet in the Solar System. Its orbit around the Sun takes 87.97 days, the shortest of all the planets in the Solar System. It is the smallest planet in the Solar System, with a mean equatorial diameter of 4,880 kilometers. Mercury is one of four terrestrial planets in the Solar System, and is a rocky body like Earth. Mercury appears to have a solid silicate crust and mantle overlying a solid, iron sulfide outer core layer, a deeper liquid core layer, and a solid inner core.
Venus is the second planet from the Sun. It orbits the Sun every 224.7 Earth days. Venus is one of the four terrestrial planets in the Solar System, meaning that it is a rocky body like Earth. It is similar to Earth in size and mass, and is often described as Earth's "sister" or "twin". The diameter of Venus is 12,103.6 km and its mass is 81.5% of Earth's.
Mars is the fourth planet from the Sun and the second-smallest planet in the Solar System after Mercury. Mars is a terrestrial planet with a thin atmosphere, having surface features reminiscent both of the impact craters of the Moon and the valleys, deserts, and polar ice caps of Earth. Mars is approximately half the diameter of Earth, with a surface area only slightly less than the total area of Earth's dry land. Mars is less dense than Earth, having about 15% of Earth's volume and 11% of Earth's mass, resulting in about 38% of Earth's surface gravity.
Saturn is the sixth planet from the Sun and the second-largest in the Solar System, after Jupiter. It is a gas giant with an average diameter about nine times that of Earth. It has only one-eighth the average density of Earth; however, with its larger volume, Saturn is over 95 times more massive. Saturn is a gas giant because it is predominantly composed of hydrogen and helium. It lacks a definite surface, though it may have a solid core.
Neptune is the eighth and farthest known planet from the Sun in the Solar System. In the Solar System, it is the fourth-largest planet by diameter, the third-most-massive planet, and the densest giant planet. Neptune is 17 times the mass of Earth, slightly more massive than its near-twin Uranus. Neptune is denser and physically smaller than Uranus because its greater mass causes more gravitational compression of its atmosphere. Neptune orbits the Sun once every 164.8 years at an average distance of 30.1 au.
Uranus is the seventh planet from the Sun. Uranus' mass is roughly 14.5 times that of Earth, making it the least massive of the giant planets. Its diameter is slightly larger than Neptune's at roughly four times that of Earth. Uranus' atmosphere is similar to Jupiter's and Saturn's in its primary composition of hydrogen and helium, but it contains more "ices" such as water, ammonia, and methane, along with traces of other hydrocarbons. It has the coldest planetary atmosphere in the Solar System, with a minimum temperature of 49 Kelvin, and has a complex, layered cloud structure with water thought to make up the lowest clouds and methane the uppermost layer of clouds. The interior of Uranus is mainly composed of ices and rock.
Pluto is the ninth-largest and tenth-most-massive known object directly orbiting the Sun. It is the largest known trans-Neptunian object by volume but is less massive than Eris. Like other Kuiper belt objects, Pluto is primarily made of ice and rock and is relatively small. It has a moderately eccentric and inclined orbit during which it ranges from 30 to 49 astronomical units or AU from the Sun. This means that Pluto periodically comes closer to the Sun than Neptune, but a stable orbital resonance with Neptune prevents them from colliding. Light from the Sun takes about 5.5 hours to reach Pluto at its average distance.
Carbon is a chemical element with the symbol C. It has an atomic number 6. It belongs to group 14 of the periodic table. Three isotopes occur naturally, 12C and 13C being stable, while 14C is a radionuclide, decaying with a half-life of about 5,730 years. Carbon is one of the few elements known since antiquity.
Chlorine is a chemical element with the symbol Cl. It has an atomic number of 17. The second-lightest of the halogens, it appears between fluorine and bromine in the periodic table and its properties are mostly intermediate between them. Chlorine is a yellow-green gas at room temperature. It is an extremely reactive element and a strong oxidising agent: among the elements, it has the highest electron affinity and the third-highest electronegativity on the Pauling scale, behind only oxygen and fluorine.
Helium is a chemical element with the symbol He. Helium has the atomic number 2. It is a colorless, odorless, tasteless, non-toxic, inert, monatomic gas, the first in the noble gas group in the periodic table. Its boiling point is the lowest among all the elements. Helium is the second lightest and second most abundant element in the observable universe (hydrogen is the lightest and most abundant).
Hydrogen is the chemical element with the symbol H. It has an atomic number of 1. With a standard atomic weight of 1.008, hydrogen is the lightest element in the periodic table. Hydrogen is the most abundant chemical substance in the Universe, constituting roughly 75% of all baryonic mass.
Iron is a chemical element with symbol Fe. It has atomic number 26. It is a metal that belongs to the first transition series and group 8 of the periodic table. It is by mass the most common element on Earth, forming much of Earth's outer and inner core. It is the fourth most common element in the Earth's crust.
Oxygen is the chemical element with the symbol O. Oxygen has an atomic number of 8. It is a member of the chalcogen group in the periodic table, a highly reactive nonmetal, and an oxidizing agent that readily forms oxides with most elements as well as with other compounds. By mass, oxygen is the third-most abundant element in the universe, after hydrogen and helium.
Sodium is a chemical element with the symbol Na. It has atomic number 11. It is a soft, silvery-white, highly reactive metal. Sodium is an alkali metal, being in group 1 of the periodic table, because it has a single electron in its outer shell, which it readily donates, creating a positively charged ion—the Na+ cation. The free metal does not occur in nature, and must be prepared from compounds. Sodium is the sixth most abundant element in the Earth's crust and exists in numerous minerals such as feldspars, sodalite, and rock salt. Many salts of sodium are highly water-soluble: sodium ions have been leached by the action of water from the Earth's minerals over eons, and thus sodium and chlorine are the most common dissolved elements by weight in the oceans.
Gold is a chemical element with the symbol Au. It has an atomic number of 79, making it one of the higher atomic number elements that occur naturally. In its purest form, it is a bright, slightly reddish yellow, dense, soft, malleable, and ductile metal. Chemically, gold is a transition metal and a group 11 element. It is one of the least reactive chemical elements and is solid under standard conditions. Gold often occurs in free elemental form, as nuggets or grains, in rocks, in veins, and in alluvial deposits. It occurs in a solid solution series with the native element silver and also naturally alloyed with copper and palladium. Less commonly, it occurs in minerals as gold compounds, often with tellurium.
Copper is a chemical element with the symbol Cu. Copper has an atomic number 29. It is a soft, malleable, and ductile metal with very high thermal and electrical conductivity. A freshly exposed surface of pure copper has a pinkish-orange color. Copper is used as a conductor of heat and electricity, as a building material, and as a constituent of various metal alloys, such as sterling silver used in jewelry, cupronickel used to make marine hardware and coins, and constantan used in strain gauges and thermocouples for temperature measurement.
Mercury is a chemical element with the symbol Hg. It has atomic number 80. It is commonly known as quicksilver and was formerly named hydrargyrum. A heavy, silvery d-block element, mercury is the only metallic element that is liquid at standard conditions for temperature and pressure; the only other element that is liquid under these conditions is the halogen bromine, though metals such as caesium, gallium, and rubidium melt just above room temperature.
The United States of America, commonly known as the United States or America, is a country consisting of 50 states, a federal district, five major self-governing territories, and various possessions. At 9.8 million km2, it is the world's third or fourth-largest country by total area and is slightly smaller than the entire continent of Europe. Most of the country is located in central North America between Canada and Mexico. With an estimated population of over 328 million, the U.S. is the third most populous country in the world. The capital is Washington, D.C., and the most populous city is New York City.
North America is a continent entirely within the Northern Hemisphere and almost all within the Western Hemisphere. It is also considered by some to be a northern subcontinent of the Americas. It is bordered to the north by the Arctic Ocean, to the east by the Atlantic Ocean, to the west and south by the Pacific Ocean, and to the southeast by South America and the Caribbean Sea. North America covers an area of about 24,709,000 square kilometers, about 16.5% of the earth's land area and about 4.8% of its total surface. North America is the third largest continent by area, following Asia and Africa, and the fourth by population after Asia, Africa, and Europe. Its population was estimated at nearly 579 million people or about 7.5% of the world's population.
Europe is a continent located entirely in the Northern Hemisphere and mostly in the Eastern Hemisphere. It comprises the westernmost part of Eurasia and is bordered by the Arctic Ocean to the north, the Atlantic Ocean to the west, the Mediterranean Sea to the south, and Asia to the east. It is the 6th largest continent in the world. Europe covers about 10,180,000 square kilometers, or 2% of the Earth's surface (6.8% of land area). Politically, Europe is divided into about fifty sovereign states of which Russia is the largest and most populous. Europe had a total population of about 741 million (about 11% of the world population).
Asia is Earth's largest and most populous continent, located primarily in the Eastern and Northern Hemispheres. It shares the continental landmass of Eurasia with the continent of Europe and the continental landmass of Afro-Eurasia with both Europe and Africa. Asia covers an area of 44,579,000 square kilometers, about 30% of Earth's total land area and 8.7% of the Earth's total surface area. The continent, which has long been home to the majority of the human population, was the site of many of the first civilizations. Asia is notable for not only its overall large size and population, but also dense and large settlements, as well as vast barely populated regions. Its 4.5 billion people constitute roughly 60% of the world's population, more than all other continents combined.
Africa is the world's second-largest and second-most populous continent, after Asia. At about 30.3 million km2 including adjacent islands, it covers 6% of Earth's total surface area and 20% of its land area. With 1.3 billion people, it accounts for about 16% of the world's human population. The continent is surrounded by the Mediterranean Sea to the north, the Isthmus of Suez and the Red Sea to the northeast, the Indian Ocean to the southeast and the Atlantic Ocean to the west. The continent includes Madagascar and various archipelagos. It contains 54 fully recognised sovereign states, four territories and two de facto independent states with limited or no recognition. The majority of the continent and its countries are in the Northern Hemisphere, with a substantial portion and number of countries in the Southern Hemisphere.
Russia, or the Russian Federation, is a European country located in Eastern Europe with a vast expanse of territory that stretches across Northern Asia. At 17,125,200 square kilometers, it is by far the largest country in the world by area, covering more than one-eighth of the Earth's inhabited land area, spanning eleven time zones, and bordering 16 sovereign nations. The territory of Russia extends from the Baltic Sea in the west to the Pacific Ocean in the east, and from the Arctic Ocean in the north to the Black Sea and the Caucasus in the south. With 146.7 million inhabitants, Russia is the most populous nation in Europe and the ninth-most populous nation in the world.
South America is a continent in the Western Hemisphere, mostly in the Southern Hemisphere, with a relatively small portion in the Northern Hemisphere. South America has an area of 17,840,000 square kilometers. Its population has been estimated at more than 423 million. South America ranks fourth in area and fifth in population. Brazil is by far the most populous South American country, with more than half of the continent's population, followed by Colombia, Argentina, Venezuela and Peru. In recent decades Brazil has also concentrated half of the region's GDP and has become a first regional power.
Germany is in Western and Central Europe, bordering Denmark to the north, Poland and the Czech Republic to the east, Austria to the southeast, and Switzerland to the south-southwest. France, Luxembourg and Belgium are situated to the west, with the Netherlands to the northwest. Germany includes 16 constituent states, covers 357,021 km2, consisting of 349,223 km2 of land and 7,798 km2 of water, and has a largely temperate seasonal climate. With 83 million inhabitants, it is the second most populous state of Europe after Russia, the most populous state lying entirely in Europe, as well as the most populous member state of the European Union. Germany is a very decentralised country. Its capital and largest metropolis is Berlin, while Frankfurt serves as its financial capital and has the country's busiest airport. It lies mostly between latitudes 47° and 55° N and longitudes 5° and 16° E. Germany is also bordered by the North Sea and, at the north-northeast, by the Baltic Sea. With Switzerland and Austria, Germany also shares a border on the fresh-water Lake Constance, the third largest lake in Central Europe. It is the seventh largest country by area in Europe and the 64th largest in the world. Elevation ranges from the mountains of the Alps in the south to the shores of the North Sea in the northwest and the Baltic Sea in the northeast. The forested uplands of central Germany and the lowlands of northern Germany are traversed by such major rivers as the Rhine, Danube and Elbe. Germany's alpine glaciers are experiencing deglaciation. Significant natural resources include iron ore, coal, potash, timber, lignite, uranium, copper, natural gas, salt, nickel, arable land and water.
The Atlantic Ocean is the second largest of the world's oceans, with an area of about 106,460,000 square kilometers. It covers approximately 20 percent of Earth's surface and about 29 percent of its water surface area. The Atlantic Ocean occupies an elongated, S-shaped basin extending longitudinally between Europe and Africa to the east, and the Americas to the west. Including its marginal seas, the Atlantic has a volume of 310,410,900 km3 or 23.3% of the total volume of the earth's oceans. The average depth is 3,646 meters and the maximum depth, the Milwaukee Deep in the Puerto Rico Trench, is 8,376 m.
The Pacific Ocean is the largest and deepest of Earth's oceanic divisions. It extends from the Arctic Ocean in the north to the Southern Ocean in the south and is bounded by the continents of Asia and Australia in the west and the Americas in the east. At 165,250,000 square kilometers in area, the Pacific Ocean covers about 46% of Earth's water surface, making it larger than all of Earth's land area combined. Its mean depth is 4,000 meters. Challenger Deep in the Mariana Trench, located in the western north Pacific, is the deepest point in the world, reaching a depth of 10,928 meters.
The Indian Ocean is the third-largest of the world's oceanic divisions, covering 70,560,000 km2 or 19.8% of the water on the Earth's surface; its volume is 264,000,000 km3 or 19.8% of the world's oceans' volume; it has an average depth of 3,741 meters and a maximum depth of 7,906 meters. It is bounded by Asia to the north, Africa to the west, and Australia to the east.
The Mediterranean Sea is a sea connected to the Atlantic Ocean, surrounded by the Mediterranean Basin and almost completely enclosed by land: on the north by Southern Europe, on the south by North Africa and on the east by the Levant.It covers an area of about 2,500,000 km2, representing 0.7% of the global ocean surface. The Mediterranean Sea has an average depth of 1,500 m and the deepest recorded point is 5,267 m in the Calypso Deep in the Ionian Sea.
The Caspian Sea is the world's largest inland body of water, variously classed as the world's largest lake or a full-fledged sea. It is an endorheic basin located between Europe and Asia, to the east of the Caucasus Mountains and to the west of the broad steppe of Central Asia. The sea has a surface area of 371,000 km2 and a volume of 78,200 km3. It has a salinity of approximately 1.2%, about a third of the salinity of most seawater. It is bounded by Kazakhstan to the northeast, Russia to the northwest, Azerbaijan to the west, Iran to the south, and Turkmenistan to the southeast.
"""

In [157]:
# Prep for topic modeling with our model's extractions.
# Each line of `full_data` is treated as one document:
#   - `fulltxt` collects the stripped, NFKD-normalised line;
#   - `newdata` collects the space-joined lemmas that remain after dropping
#     number-like tokens, stop words and punctuation.
newdata, fulltxt = [], []
for j in full_data.splitlines():
    txt = unicodedata.normalize('NFKD', j.strip())
    fulltxt.append(txt)
    # `doc` stays a module-level name: helpers such as get_chunk read a
    # global `doc`.
    doc = nlp(txt)
    newdata.append(" ".join(
        token.lemma_
        for token in doc
        if not (token.like_num or token.is_stop or token.is_punct)
    ))

In [10]:
# Prep for topic modeling with the other input text (`data`).
# One document per line of `data`; the normalised line goes to `fulltxt`
# and its filtered lemma string goes to `newdata`.
newdata = []
fulltxt = []
for j in data.splitlines():
    # Strip surrounding whitespace, then apply Unicode NFKD normalisation.
    txt = unicodedata.normalize('NFKD', j.strip())
    fulltxt.append(txt)
    doc = nlp(txt)
    # Keep only lemmas of tokens that are neither number-like, stop words,
    # nor punctuation.
    newdata.append(" ".join([
        token.lemma_
        for token in doc
        if not (token.like_num or token.is_stop or token.is_punct)
    ]))

In [176]:
# Fit LDA and NMF topic models on the preprocessed corpus in `newdata`
# (one lemma string per document).
data = newdata

# Build the document-word count matrix.
# The token pattern is a regex, so it is written as a raw string: '\w' in a
# normal string literal is an invalid escape sequence. The pattern value is
# unchanged: every run of word characters is a token (unigrams only).
vectorizer = CountVectorizer(token_pattern=r'(?u)\w+')
data_vectorized = vectorizer.fit_transform(data)

# Number of topics to detect (shared by both models).
NUM_TOPICS = 4

# Build a Latent Dirichlet Allocation model.
# NOTE(review): no random_state is set, so LDA results may differ between
# runs — confirm whether reproducibility is required.
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=50, doc_topic_prior=0.5)
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Non-Negative Matrix Factorization model on the same count matrix.
nmf_model = NMF(n_components=NUM_TOPICS, init='nndsvda', solver='mu', beta_loss='frobenius')
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Show what the first document in the corpus looks like in each topic space.
print(lda_Z[0])
print(nmf_Z[0])

In [178]:
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` a ``Topic <idx>:`` header is
    printed, followed by one ``<term> <weight>`` line per term, ordered from
    the largest weight downwards.

    Args:
        model: Fitted topic model exposing ``components_``, iterated as one
            row of per-term weights per topic.
        vectorizer: Fitted vectorizer supplying the vocabulary through
            ``get_feature_names_out()`` or, on older scikit-learn releases,
            ``get_feature_names()``.
        top_n: Number of terms to print per topic.
    """
    # Resolve the vocabulary once instead of once per printed term, and
    # prefer the current accessor: get_feature_names() no longer exists in
    # recent scikit-learn releases.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice walks the top_n largest
        # weights from highest to lowest.
        for i in topic.argsort()[:-top_n - 1:-1]:
            print(feature_names[i], topic[i])

In [None]:
# Print the ten strongest terms per topic for each fitted model, closing
# each model's listing with a separator line.
for heading, fitted in (("LDA Model:", lda_model), ("NMF Model:", nmf_model)):
    print(heading)
    print_topics(fitted, vectorizer, 10)
    print("=" * 20)

In [None]:
# Dump the full LDA result of fit_transform: one row per document, one
# column per topic.
print(lda_Z)

In [None]:
# For every document, list the indices of the LDA topics whose weight is
# at least 0.3.
for row in lda_Z:
    print(np.flatnonzero(row >= 0.3))

In [None]:
# Dump the full NMF result of fit_transform: one row per document, one
# column per topic.
print(nmf_Z)

In [None]:
# For every document, list the indices of the NMF topics whose weight is
# at least 0.3.
# NOTE(review): the same 0.3 cut-off is used as for LDA — confirm it is
# meaningful on NMF's scale.
for row in nmf_Z:
    print(np.flatnonzero(row >= 0.3))