In [1]:
# https://www.analyticsvidhya.com/blog/2019/09/introduction-information-extraction-python-spacy/
import re 
import string 
import nltk 
import spacy 
import pandas as pd 
import numpy as np 
import math 
from tqdm import tqdm 

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 

In [2]:
# Load the `en_core_web_sm` English pipeline (the model package must already be
# installed); the resulting `nlp` callable is used by later cells to parse text.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Sample sentence containing the phrase "developing countries such as Vietnam"
text = "GDP in developing countries such as Vietnam will continue growing at a high rate."

# Run the spaCy pipeline over the sentence; `doc` is reused by later cells
doc = nlp(text)

In [4]:
# Show every token alongside its dependency label and part-of-speech tag
for tok in doc:
    print(f"{tok.text} --> {tok.dep_} --> {tok.pos_}")

GDP --> nsubj --> PROPN
in --> prep --> ADP
developing --> amod --> VERB
countries --> pobj --> NOUN
such --> amod --> ADJ
as --> prep --> SCONJ
Vietnam --> pobj --> PROPN
will --> aux --> VERB
continue --> ROOT --> VERB
growing --> xcomp --> VERB
at --> prep --> ADP
a --> det --> DET
high --> amod --> ADJ
rate --> pobj --> NOUN
. --> punct --> PUNCT


In [10]:
# Token pattern for phrases shaped like "<modifier>? <noun> such as <proper noun>".
# Note: 'OP': '?' on the first token means the modifier ('amod') may occur
# once or not at all.
pattern = [
    {"DEP": "amod", "OP": "?"},  # optional modifier
    {"POS": "NOUN"},             # noun
    {"LOWER": "such"},
    {"LOWER": "as"},
    {"POS": "PROPN"},            # proper noun
]

In [11]:
# Build a Matcher over the pipeline's vocabulary and register the pattern.
# Patterns are passed as a list -- add(key, [pattern]) -- which is accepted
# from spaCy v2.2.2 onward; the older add(key, None, pattern) form is
# deprecated and fails on spaCy v3.
matcher = Matcher(nlp.vocab)
matcher.add("matching_1", [pattern])

# Each match is a (match_id, start, end) tuple; start/end are token indices
# into `doc`. Only the first match is shown here.
matches = matcher(doc)
match_id, start, end = matches[0]
span = doc[start:end]

print(span.text)

developing countries such as Vietnam


In [12]:
def subtree_matcher(doc):
  """Extract a pair of token texts from a dependency-parsed sentence.

  The sentence is treated as passive when any token's dependency label
  contains "subjpass" (e.g. "nsubjpass").

  Args:
    doc: an iterable of tokens exposing `dep_` (dependency label string)
      and `text` attributes, such as a spaCy `Doc`.

  Returns:
    A tuple `(x, y)` of strings:
      * passive sentence: `x` is the text of the last token whose label
        ends with "obj", `y` the text of the last token whose label
        contains "subjpass";
      * otherwise: `x` is the text of the last token whose label ends with
        "subj", `y` the text of the last token whose label ends with "obj".
    A slot is the empty string when no token qualifies.
  """
  # Use a substring membership test: comparing str.find() with True only
  # succeeds when the substring happens to start at index 1.
  is_passive = any("subjpass" in tok.dep_ for tok in doc)

  x = ''
  y = ''

  if is_passive:
    for tok in doc:
      if "subjpass" in tok.dep_:
        y = tok.text

      if tok.dep_.endswith("obj"):
        x = tok.text
  else:
    for tok in doc:
      if tok.dep_.endswith("subj"):
        x = tok.text

      if tok.dep_.endswith("obj"):
        y = tok.text

  return x, y

In [14]:
# Passive-voice example: the output below shows the pair ('Salesforce', 'Tableau').
subtree_matcher(nlp("Tableau was recently acquired by Salesforce."))

('Salesforce', 'Tableau')

In [15]:
# Tokenize a Japanese sentence with a blank 'ja' pipeline and print, per token,
# its text, lemma, tag_ and pos_ values.
import spacy
ja = spacy.blank('ja')
for word in ja('日本語ですよ'):
    print(f"{word} {word.lemma_} {word.tag_} {word.pos_}")

日本 日本 名詞-固有名詞-地名-国 PROPN
語 語 名詞-普通名詞-一般 NOUN
です です 助動詞 AUX
よ よ 助詞-終助詞 PART
