In [1]:
import re

import networkx as nx
import pandas as pd
import spacy
import textacy
from allennlp.predictors.predictor import Predictor
from tqdm import tqdm


In [2]:
# spaCy pipeline shared by every parsing helper defined in the cells below.
nlp = spacy.load('en_core_web_lg')
# Archive of the AllenNLP SpanBERT coreference model; the log output below
# shows it being read from the local cache when it is already downloaded.
model_url = 'https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz'
# Predictor whose coref_resolved() is used later to rewrite the text before
# sentence splitting.
predictor = Predictor.from_path(model_url)


2021-11-13 20:30:42,945 - INFO - allennlp.common.plugins - Plugin allennlp_models available
2021-11-13 20:30:43,428 - INFO - cached_path - cache of https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz is up-to-date
2021-11-13 20:30:43,431 - INFO - allennlp.models.archival - loading archive file https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz from cache at /home/hadeer/.allennlp/cache/0f6b052811b20b13280e609a96efe71ebc636b9c823a5c906ba24459e6e68af9.c1dab61d84cc7c3f7d6751c260040607cb7023a002778ba8f9b9d196b6539174
2021-11-13 20:30:43,433 - INFO - allennlp.models.archival - extracting archive file /home/hadeer/.allennlp/cache/0f6b052811b20b13280e609a96efe71ebc636b9c823a5c906ba24459e6e68af9.c1dab61d84cc7c3f7d6751c260040607cb7023a002778ba8f9b9d196b6539174 to temp dir /tmp/tmpqaak9y04
2021-11-13 20:30:51,304 - INFO - allennlp.common.params - dataset_reader.type = coref
2021-11-13 20:30:51,305 - INFO - allennl

In [3]:
def find_root_of_sentence(doc):
    """Return the token whose dependency label is ``"ROOT"``.

    Args:
        doc: Iterable of tokens exposing a ``dep_`` attribute.

    Returns:
        The last token labelled ``"ROOT"`` when several are present, or
        ``None`` when no token carries that label.
    """
    roots = [candidate for candidate in doc if candidate.dep_ == "ROOT"]
    return roots[-1] if roots else None

In [4]:
def find_other_verbs(doc, root_token):
    """Collect the verbs that hang directly off the root token.

    A token qualifies when its part of speech is ``"VERB"`` and the root is
    its one and only ancestor.

    Args:
        doc: Iterable of tokens exposing ``pos_`` and ``ancestors``.
        root_token: The token the single ancestor is compared against.

    Returns:
        list: Qualifying tokens, in document order.
    """
    def _is_direct_verb(candidate):
        lineage = list(candidate.ancestors)
        return (
            candidate.pos_ == "VERB"
            and len(lineage) == 1
            and lineage[0] == root_token
        )

    return [candidate for candidate in doc if _is_direct_verb(candidate)]

In [5]:
def get_clause_token_span_for_verb(verb, doc, all_verbs):
    """Compute the index range covered by a verb's non-verb children.

    Args:
        verb: Token whose ``children`` are inspected.
        doc: Sized container of tokens; its length seeds the lower bound.
        all_verbs: Collection of tokens to leave out of the span.

    Returns:
        tuple: ``(first_index, last_index)`` over the ``i`` attribute of the
        retained children. With no retained children the result is
        ``(len(doc), 0)``.
    """
    child_positions = [
        child.i for child in verb.children if child not in all_verbs
    ]
    # The sentinels reproduce the starting bounds used when nothing qualifies.
    start_index = min(child_positions + [len(doc)])
    end_index = max(child_positions + [0])
    return (start_index, end_index)

In [6]:
# Token-level part-of-speech patterns passed to the matcher in
# get_verb_phrases.
verb_patterns = [
    # auxiliary + verb + adposition
    [{"POS": "AUX"}, {"POS": "VERB"}, {"POS": "ADP"}],
    # a lone auxiliary
    [{"POS": "AUX"}],
    # optional verb, any number of adverbs, then one or more verbs
    [
        {"POS": "VERB", "OP": "?"},
        {"POS": "ADV", "OP": "*"},
        {"POS": "VERB", "OP": "+"},
    ],
]

# pattern = [{'POS': 'VERB', 'OP': '?'},
#            {'POS': 'ADV', 'OP': '*'},
#            {'POS': 'VERB', 'OP': '+'}]

In [7]:
def contains_root(verb_phrase, root):
    """Tell whether the root token lies inside the verb phrase.

    spaCy spans are half-open: ``end`` is the index of the first token
    *after* the span, so the upper bound must be compared strictly.

    Args:
        verb_phrase: Span-like object exposing ``start`` and ``end``.
        root: Token-like object exposing ``i``, or ``None`` when the
            sentence has no root.

    Returns:
        bool: ``True`` when ``start <= root.i < end``; ``False`` otherwise,
        including when ``root`` is ``None``.
    """
    if root is None:
        return False
    return verb_phrase.start <= root.i < verb_phrase.end

In [8]:
def get_verb_phrases(doc):
    """Return the matched verb phrases that include the sentence root.

    Args:
        doc: Parsed document handed both to ``find_root_of_sentence`` and to
            textacy's ``token_matches`` together with ``verb_patterns``.

    Returns:
        list: The matches for which ``contains_root`` is true, in the order
        the matcher produced them.
    """
    sentence_root = find_root_of_sentence(doc)
    candidates = textacy.extract.matches.token_matches(doc, verb_patterns)
    return [
        candidate
        for candidate in candidates
        if contains_root(candidate, sentence_root)
    ]

In [9]:
def longer_verb_phrase(verb_phrases):
    """Pick the longest verb phrase from a collection.

    Args:
        verb_phrases: Iterable of sized objects (e.g. spaCy spans).

    Returns:
        The phrase with the greatest ``len``; the earliest one wins a tie.
        ``None`` when the iterable is empty or every phrase has length 0.
    """
    longest_length = 0
    longest_verb_phrase = None
    for verb_phrase in verb_phrases:
        phrase_length = len(verb_phrase)
        if phrase_length > longest_length:
            # Record the new best length; without this the comparison is
            # always against 0 and the last phrase would win.
            longest_length = phrase_length
            longest_verb_phrase = verb_phrase
    return longest_verb_phrase

In [10]:
def find_noun_phrase(verb_phrase, noun_phrases, side):
    """Return the first noun phrase lying on the requested side of a verb phrase.

    Args:
        verb_phrase: Span-like object exposing ``start``.
        noun_phrases: Iterable of span-like objects exposing ``start``.
        side: ``"left"`` selects phrases starting before the verb phrase,
            ``"right"`` selects phrases starting after it.

    Returns:
        The first matching noun phrase in iteration order, or ``None`` when
        nothing matches (including when ``side`` is any other value).
    """
    wants_left = side == "left"
    wants_right = side == "right"
    for chunk in noun_phrases:
        if wants_left:
            if chunk.start < verb_phrase.start:
                return chunk
        elif wants_right:
            if chunk.start > verb_phrase.start:
                return chunk
    return None

In [11]:
def find_triplet(sentence):
    """Extract a (left noun phrase, verb phrase, right noun phrase) triplet.

    Args:
        sentence: Raw sentence text; it is parsed with the module-level
            ``nlp`` pipeline.

    Returns:
        tuple: ``(left_noun_phrase, verb_phrase, right_noun_phrase)``. Either
        noun phrase is ``None`` when no chunk is found on that side.

    Raises:
        IndexError: When no verb phrase containing the root is found.
    """
    doc = nlp(sentence)
    verb_phrases = get_verb_phrases(doc)
    # noun_chunks yields spans lazily; materialise it so the left and right
    # searches each scan the full set instead of sharing one iterator.
    noun_phrases = list(doc.noun_chunks)
    if len(verb_phrases) > 1:
        verb_phrase = longer_verb_phrase(verb_phrases)
    else:
        # Deliberately raises IndexError on an empty list; get_pairs treats
        # that as "no triplet for this sentence".
        verb_phrase = verb_phrases[0]
    left_noun_phrase = find_noun_phrase(verb_phrase, noun_phrases, "left")
    right_noun_phrase = find_noun_phrase(verb_phrase, noun_phrases, "right")
    return (left_noun_phrase, verb_phrase, right_noun_phrase)

In [12]:
def get_pairs(text):
    """Build a subject/relation/object table from free text.

    The text is cleaned, run through coreference resolution, split into
    sentences, and each lower-cased sentence is passed to ``find_triplet``.
    Sentences that yield no complete triplet are skipped.

    Args:
        text (str): Input text.

    Returns:
        pandas.DataFrame: String columns ``subject``, ``relation`` and
        ``object``, one row per sentence that produced a full triplet.
    """
    cleaned = re.sub(r'\n+', '.', text)  # replace multiple newlines with period
    cleaned = re.sub(r'\[\d+\]', ' ', cleaned)  # remove reference numbers
    resolved = predictor.coref_resolved(cleaned)
    sentences = [str(sent).lstrip() for sent in nlp(resolved).sents]

    rows = []
    for sentence in sentences:
        try:
            triplet = find_triplet(sentence.lower())
        except (IndexError, AttributeError):
            # Best effort by design: a sentence with no usable verb phrase
            # simply contributes no row.
            continue
        if any(part is None for part in triplet):
            continue
        # Convert to plain strings before handing the rows to pandas so the
        # frame never has to interpret span objects.
        rows.append([str(part) for part in triplet])

    return pd.DataFrame(rows, columns=['subject', 'relation', 'object'])
        

In [13]:
def draw_kg(pairs):
    """Draw the subject -> object graph of ``pairs`` with matplotlib.

    Args:
        pairs: DataFrame with ``subject``, ``object`` and ``relation``
            columns; each row becomes one directed edge labelled with its
            relation.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    # NOTE(review): `plt` is not imported in any visible cell -- confirm that
    # matplotlib.pyplot is imported as plt before this function is called.
    graph = nx.from_pandas_edgelist(
        pairs, 'subject', 'object', create_using=nx.MultiDiGraph()
    )
    degrees = nx.degree(graph)
    positions = nx.spring_layout(graph, k=0.15, iterations=20)
    plt.figure(num=None, figsize=(120, 90), dpi=80)

    # Node area grows with the node's degree.
    node_sizes = [int(degree) * 500 for _, degree in degrees]
    nx.draw_networkx(
        graph,
        pos=positions,
        node_size=node_sizes,
        arrowsize=20,
        linewidths=1.5,
        edge_color='red',
        edgecolors='black',
        node_color='white',
    )

    # One label per (subject, object) pair; a later row overwrites an
    # earlier one that has the same endpoints.
    edge_labels = {}
    relations = pairs['relation'].tolist()
    for source, target, relation in zip(pairs.subject, pairs.object, relations):
        edge_labels[(source, target)] = relation
    nx.draw_networkx_edge_labels(
        graph, pos=positions, edge_labels=edge_labels, font_color='red'
    )

    plt.axis('off')
    plt.show()

In [14]:
def get_graph(pairs):
    """Turn a subject/relation/object table into an attributed multigraph.

    Args:
        pairs: DataFrame with ``subject``, ``relation`` and ``object``
            columns.

    Returns:
        networkx.MultiDiGraph: Nodes carry a ``type`` attribute
        (``"subject"`` or ``"object"``); each edge carries ``type="verb"``
        and the relation text under ``label``.
    """
    graph = nx.MultiDiGraph()
    for _, record in pairs.iterrows():
        source = record['subject']
        target = record['object']
        # Subject first, then object: a node seen in both roles keeps the
        # most recently assigned type.
        graph.add_node(source, type="subject")
        graph.add_node(target, type="object")
        graph.add_edge(source, target, type="verb", label=record['relation'])
    return graph
    

In [15]:
from pyvis import network as pvnet

def plot_g_pyviz(G, name='out.html', height='300px', width='500px'):
    """Render a networkx graph as an interactive pyvis network.

    Args:
        G: Graph to display; a copy is drawn so the input is left untouched.
        name: File name passed to ``Network.show``.
        height: Height passed to the pyvis ``Network``.
        width: Width passed to the pyvis ``Network``.

    Returns:
        Whatever ``Network.show(name)`` returns.
    """
    graph_copy = G.copy() # some attributes added to nodes
    network = pvnet.Network(
        notebook=True, directed=True, height=height, width=width
    )
    physics_options = '''
        var options = {
          "physics": {
            "forceAtlas2Based": {
              "gravitationalConstant": -100,
              "centralGravity": 0.11,
              "springLength": 100,
              "springConstant": 0.09,
              "avoidOverlap": 1
            },
            "minVelocity": 0.75,
            "solver": "forceAtlas2Based",
            "timestep": 0.22
          }
        }
    '''
    network.set_options(physics_options)
    # To experiment with the layout interactively, enable:
    # network.show_buttons(filter_=['physics'])
    network.from_nx(graph_copy)
    return network.show(name)

In [31]:
# NOTE(review): `p` is not assigned in any visible cell, so this call
# presumably relies on leftover kernel state -- confirm which pairs frame
# was intended before re-running from a fresh kernel.
plot_g_pyviz(get_graph(p))

The following functions extract only a single word as the subject or object, rather than a full phrase.

In [32]:
def get_sen_structure(doc):
    """Map each token index of a sentence to its text and dependency label.

    Args:
        doc: Sentence text; despite the name it is parsed here with the
            module-level ``nlp`` pipeline.

    Returns:
        dict: ``{token.i: {"text": token.text, "dep": token.dep_}}`` in
        token order.
    """
    return {
        token.i: {"text": token.text, "dep": token.dep_}
        for token in nlp(doc)
    }

In [33]:
def get_single_pairs(text):
    """Build a single-word subject/relation/object table from free text.

    After coreference resolution the text is split into sentences. For each
    lower-cased sentence the ROOT token becomes the relation, the last
    ``nsubj``/``csubj`` token the subject, and the last
    ``dobj``/``pobj``/``attr`` token the object. Sentences missing any of
    the three are skipped.

    Args:
        text (str): Input text.

    Returns:
        pandas.DataFrame: Columns ``subject``, ``relation`` and ``object``.
    """
    subject_deps = ('nsubj', 'csubj')
    object_deps = ('dobj', 'pobj', 'attr')

    resolved = predictor.coref_resolved(text)
    sentences = [str(sent).lstrip() for sent in nlp(resolved).sents]

    pairs = []
    for sentence in sentences:
        # Start every sentence from a clean slate so that nothing is read
        # before assignment and no value leaks in from an earlier sentence.
        subject, verb, obj = "", "", ""
        for entry in get_sen_structure(sentence.lower()).values():
            dep = entry['dep']
            if dep == 'ROOT':
                verb = entry['text']
            elif dep in subject_deps:
                subject = entry['text']
            elif dep in object_deps:
                obj = entry['text']
        if "" not in (subject, verb, obj):
            pairs.append([subject, verb, obj])

    return pd.DataFrame(pairs, columns=['subject', 'relation', 'object'])

In [34]:
# NOTE(review): `text` is not assigned in any visible cell, so this call
# presumably relies on leftover kernel state -- confirm the intended input
# (the `sample` string is only defined further down).
plot_g_pyviz(get_graph(get_single_pairs(text)))

In [35]:
# s = get_graph(get_single_pairs(text))

In [36]:
def get_graph_dict(graph):
    """Return the graph as a nested dictionary.

    Args:
        graph: A networkx graph.

    Returns:
        dict: The result of ``nx.to_dict_of_dicts(graph)``.
    """
    adjacency = nx.to_dict_of_dicts(graph)
    return adjacency

In [50]:
import os

In [59]:
def save_graph_pickle(graph, path="graph.gpickle"):
    """Pickle a graph to disk.

    Uses the standard ``pickle`` module directly because
    ``networkx.write_gpickle`` no longer exists in NetworkX 3.x. The file is
    a plain pickle; no compression is applied based on the file extension.

    Args:
        graph: The object to serialise (normally a networkx graph).
        path: Destination file. Defaults to ``"graph.gpickle"``, the
            location this function always wrote to previously.

    Returns:
        None
    """
    import pickle

    with open(path, "wb") as handle:
        pickle.dump(graph, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [60]:
def load_graph_pickle(path):
    """Load a graph previously written as a pickle file.

    Uses the standard ``pickle`` module directly because
    ``networkx.read_gpickle`` no longer exists in NetworkX 3.x.

    Args:
        path: Path of the pickle file to read.

    Returns:
        The unpickled object (normally a networkx graph).

    Raises:
        OSError: When the file cannot be opened.
    """
    import pickle

    # SECURITY: unpickling can execute arbitrary code -- only load files
    # that this notebook (or another trusted source) produced.
    with open(path, "rb") as handle:
        return pickle.load(handle)

In [29]:
# # object and subject constants
# OBJECT_DEPS = {"dobj", "dative", "attr", "oprd"}
# SUBJECT_DEPS = {"nsubj", "nsubjpass", "csubj", "agent", "expl"}
# # tags that define wether the word is wh-
# WH_WORDS = {"WP", "WP$", "WRB"}

# # extract the subject, object and verb from the input
# def extract_svo(doc):
#     doc=nlp(doc)
#     sub = []
#     at = []
#     ve = []
#     for token in doc:
#         # is this a verb?
#         if token.dep_ == "ROOT" :
#             ve.append(token.text)
#             # print(token.text)
#         # is this the object?
#         if token.dep_ in OBJECT_DEPS or token.head.dep_ in OBJECT_DEPS:
#             at.append(token.text)
#         # is this the subject?
#         if token.dep_ in SUBJECT_DEPS or token.head.dep_ in SUBJECT_DEPS:
#             sub.append(token.text)
#     return " ".join(sub).strip().lower(), " ".join(ve).strip().lower(), " ".join(at).strip().lower()

# def get_pairs_3rd(text):
#     Sentences = [str(i).lstrip()for i in list(nlp(predictor.coref_resolved(text)).sents)]
#     pairs=[]
#     for i in Sentences:
#         pairs.append(list(extract_svo(i.lower()))) 
#     allpairs = pd.DataFrame(pairs, columns=['subject', 'relation', 'object'])
#     return allpairs

# get_pairs_3rd("Traveling to have a business meeting takes the fun out of the trip")
#     # enron.text[1])

In [39]:
# Example e-mail text used as input for the extraction demos in the cells below.
sample="Elizabeth, I need to update several of our faculty and teaching staff on the changes in the English department’s process for assigning teaching schedules to Graduate Teaching Assistants. As you know, the old process gave preference to more senior TAs, which led to numerous complaints of unfairness and an overall lack of clarity. Starting next semester, the English department will transition to a new rolling system that divides TAs into four groups alphabetically and ensures each group will be given first preference of teaching times once every two academic years. This change will go into effect starting January 5, 2015. In your message I would like you to outline the new process and explain the reasons why we need to make this change. Please provide contact information for the assistant department head in case anyone needs further information. Oh, and this information will also need to be copied to the appropriate associate dean. I appreciate your help with this."

In [42]:
# Phrase-level triplets: subjects and objects are whole noun phrases.
pairs_1= get_pairs(sample)
pairs_1

Unnamed: 0,subject,relation,object
0,i,need,our faculty and teaching staff
1,elizabeth,gave,preference
2,the english department,transition,a new rolling system
3,transition,go,effect
4,elizabeth's message,given,first preference
5,contact information,need,the appropriate associate dean
6,i,appreciate,elizabeth's help


In [44]:
# Word-level triplets: subjects and objects are single tokens.
pairs_2 = get_single_pairs(sample)
pairs_2

Unnamed: 0,subject,relation,object
0,i,need,assistants
1,which,gave,clarity
2,that,transition,years
3,transition,go,effect
4,we,given,transition
5,anyone,provide,information
6,information,need,dean
7,i,appreciate,this


In [46]:
# Build the attributed multigraph from the phrase-level triplets.
Graph = get_graph(pairs_1)

In [47]:
# Draw the graph as an interactive pyvis network.
plot_g_pyviz(Graph)

In [63]:
# Show the graph as a nested dict: source -> target -> edge key -> attributes.
get_graph_dict(Graph)

{'i': {'our faculty and teaching staff': {0: {'type': 'verb',
    'label': 'need'}},
  "elizabeth's help": {0: {'type': 'verb', 'label': 'appreciate'}}},
 'our faculty and teaching staff': {},
 'elizabeth': {'preference': {0: {'type': 'verb', 'label': 'gave'}}},
 'preference': {},
 'the english department': {'a new rolling system': {0: {'type': 'verb',
    'label': 'transition'}}},
 'a new rolling system': {},
 'transition': {'effect': {0: {'type': 'verb', 'label': 'go'}}},
 'effect': {},
 "elizabeth's message": {'first preference': {0: {'type': 'verb',
    'label': 'given'}}},
 'first preference': {},
 'contact information': {'the appropriate associate dean': {0: {'type': 'verb',
    'label': 'need'}}},
 'the appropriate associate dean': {},
 "elizabeth's help": {}}

In [49]:
# Save the graph object itself (not the dict shown above) to graph.gpickle.
save_graph_pickle(Graph)

In [61]:
# Load the graph back from the file that save_graph_pickle writes.
G = load_graph_pickle(path='graph.gpickle')