In [1]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [2]:
# sample sentences
candidate_sentences = "the drawdown process is governed by astm standard d823"
doc = nlp(candidate_sentences)

In [4]:
for tok in doc:
    print(tok.text, "...", tok.dep_)

the ... det
drawdown ... amod
process ... nsubjpass
is ... auxpass
governed ... ROOT
by ... agent
astm ... compound
standard ... amod
d823 ... pobj


### Entity Pairs Extraction

In [5]:
def get_entities(sent):
  ## chunk 1
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency tag of previous token in the sentence
  prv_tok_text = ""   # previous token in the sentence

  prefix = ""
  modifier = ""

  #############################################################
  
  for tok in nlp(sent):
    ## chunk 2
    # if token is a punctuation mark then move on to the next token
    if tok.dep_ != "punct":
      # check: token is a compound word or not
      if tok.dep_ == "compound":
        prefix = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          prefix = prv_tok_text + " "+ tok.text
      
      # check: token is a modifier or not
      if tok.dep_.endswith("mod") == True:
        modifier = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          modifier = prv_tok_text + " "+ tok.text
      
      ## chunk 3
      if tok.dep_.find("subj") == True:
        ent1 = modifier +" "+ prefix + " "+ tok.text
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""      

      ## chunk 4
      if tok.dep_.find("obj") == True:
        ent2 = modifier +" "+ prefix +" "+ tok.text
        
      ## chunk 5  
      # update variables
      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text
  #############################################################

  return [ent1.strip(), ent2.strip()]

In [6]:
get_entities("the film had 200 patents")

['film', '200  patents']

### Entity Relation Extraction

In [7]:
def get_relation(sent):

  doc = nlp(sent)

  # Matcher class object 
  matcher = Matcher(nlp.vocab)

  #define the pattern 
  pattern = [{'DEP':'ROOT'}, 
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},  
            {'POS':'ADJ','OP':"?"}] 

  matcher.add("matching_1", None, pattern) 

  matches = matcher(doc)
  k = len(matches) - 1

  span = doc[matches[k][1]:matches[k][2]] 

  return(span.text)

In [8]:
get_relation("John completed the task")

'completed'

### Combining above both for triplets

#### Example:1

In [29]:
text="John completed the task"

In [30]:
ent=get_entities(text)
rel=get_relation(text)

In [31]:
ent

['John', 'task']

In [32]:
new_list=[]
if len(ent)==2:
    for i,n in enumerate(ent):
        #print(i,n)
        if i==1:
            new_list.append(rel) 
        else:
            new_list.append(n)
    new_list.append(ent[1])
print(new_list)
    

['John', 'completed', 'task']


#### Example:2

In [33]:
def triplets(text):
    text="the drawdown process is governed by astm standard d823"
    ent=get_entities(text)
    rel=get_relation(text)
    new_list=[]
    if len(ent)==2:
        for i,n in enumerate(ent):
            #print(i,n)
            if i==1:
                new_list.append(rel) 
            else:
                new_list.append(n)
        new_list.append(ent[1])
    return new_list
    print(new_list)

['drawdown  process', 'governed by', 'astm standard astm d823']
