In [1]:
#Importing libraries
import spacy
from spacy.matcher import Matcher
from spacy import displacy

In [2]:
# Load the spaCy pipeline 'en_core_web_sm'
# NOTE: the model package must be installed separately, e.g.
#   python -m spacy download en_core_web_sm
nlp=spacy.load('en_core_web_sm')

In [20]:
# pattern 1: X such as Y
# define the input string
text="Developed countries such as USA have better healthcare infrastructure compared to developing countries."
# create the spaCy Doc object by passing the text through the loaded pipeline
doc=nlp(text)

In [4]:
# Show every token together with its part-of-speech tag and dependency label
for token in doc:
    print(f"{token.text} -> {token.pos_} -> {token.dep_}")

Developed -> ADJ -> amod
countries -> NOUN -> nsubj
such -> ADJ -> amod
as -> ADP -> prep
USA -> PROPN -> pobj
have -> VERB -> ROOT
better -> ADJ -> amod
healthcare -> NOUN -> compound
infrastructure -> NOUN -> dobj
compared -> VERB -> prep
to -> ADP -> prep
developing -> VERB -> amod
countries -> NOUN -> pobj
. -> PUNCT -> punct


In [5]:
# Pattern: NOUN + such + as + PROPN

In [7]:
# Token-level rule, one dict per token: X "such as" Y
pattern = [
    {'POS': 'NOUN'},     # X: the general term
    {'LOWER': 'such'},   # literal "such" (case-insensitive)
    {'LOWER': 'as'},     # literal "as" (case-insensitive)
    {'POS': 'PROPN'},    # Y: a proper noun naming one instance of X
]

In [8]:
# create a Matcher object that shares the vocabulary of the loaded pipeline
matcher=Matcher(nlp.vocab)

In [13]:
# Add the rule to the matcher: the first argument is the rule name,
# the second is a list of token patterns registered under that name
matcher.add("Pattern X such as Y",[pattern])

In [14]:
# Run the matcher over the doc; each match is a (match_id, start, end) tuple
matches=matcher(doc)

In [15]:
# Display the raw match tuples
matches

[(17711623125337077988, 1, 5)]

<class 'list'>


In [21]:
# Extract the matched text: a match is (match_id, start, end), where
# start and end are token indices into the doc
match_id, start, end = matches[0]
span = doc[start:end]
print(span)
# "countries" is the superset (hypernym); "USA" is one of its members

countries such as USA


In [25]:
# define another string, this time containing two "X such as Y" phrases
text="I love eating healthy fruits such as Apple. I like supercars such as Lamborghini."

In [26]:
# Parse the new text, run the matcher and print the text covered by each match
doc = nlp(text)
matches = matcher(doc)
for match_id, start, end in matches:
  print(doc[start:end])

fruits such as Apple
supercars such as Lamborghini


In [35]:
# pattern 2: X and/or other Y
doc=nlp("In this way you can disinfect your car and other vehicles")

In [28]:
# Show text, POS tag and dependency label for each token of the sentence
for token in doc:
  print(' -> '.join((token.text, token.pos_, token.dep_)))

In -> ADP -> prep
this -> DET -> det
way -> NOUN -> pobj
you -> PRON -> nsubj
can -> AUX -> aux
disinfect -> VERB -> ROOT
your -> PRON -> poss
car -> NOUN -> dobj
and -> CCONJ -> cc
other -> ADJ -> amod
vehicle -> NOUN -> conj


In [30]:
#Pattern: NOUN+and+other+NOUN

In [31]:
# Token-level rule, one dict per token: X "and other" Y
pattern = [
    {'POS': 'NOUN'},      # X: the specific term
    {'LOWER': 'and'},     # literal "and" (case-insensitive)
    {'LOWER': 'other'},   # literal "other" (case-insensitive)
    {'POS': 'NOUN'},      # Y: the general term
]

In [32]:
# create a fresh matcher object so that only the new rule is registered on it
matcher=Matcher(nlp.vocab)
# add the rule to the matcher
matcher.add("Pattern X and other Y",[pattern])

In [36]:
# Apply the rule to the doc and print the text covered by each match
matches = matcher(doc)
for match_id, start, end in matches:
  print(doc[start:end])

car and other vehicles


**Quantifiers**


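*   `!` - Negation: the token pattern must match exactly 0 times
*   `?` - Optional: the token pattern may match 0 or 1 times
*   `+` - The token pattern must match 1 or more times
*   `*` - The token pattern may match 0 or more times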



In [50]:
# Define the pattern: NOUN + (and | or) + other + NOUN
# The conjunction is a single required token that must be "and" or "or".
# Writing "and" and "or" as two separate optional tokens ('OP': '?') would
# also accept "X other Y" (both skipped) and "X and or other Y" (both
# present), neither of which is an "X and/or other Y" phrase.
pattern = [{'POS': 'NOUN'},
           {'LOWER': {'IN': ['and', 'or']}},
           {'LOWER': 'other'},
           {'POS': 'NOUN'}]

In [54]:
# Create a new Matcher object; rules added to the earlier matcher are not carried over
matcher=Matcher(nlp.vocab)

# Add the rule to the matcher under its own rule name
matcher.add("Pattern X and/or Y",[pattern])

In [55]:
# Apply the rule to the doc and print the text covered by each match
matches = matcher(doc)
for match_id, start, end in matches:
  print(doc[start:end])

car and other vehicles


In [None]:
# Drawbacks of token-based Matcher patterns:
# - they hide the implementation details
# - they do not work when the words of interest do not immediately follow each other,
#   e.g. subject + auxiliary + verb + ... + object

In [None]:
#Using custom functions


In [56]:
# pattern 3: X going Y
doc=nlp("John is going to Berlin.")

In [57]:
# Draw the dependency parse of the sentence inline in the notebook
displacy.render(doc,style="dep",jupyter=True)

In [65]:
# Function for extracting the relation
def x_going_y(doc):
  """Extract (subject, 'going', object) triples from a parsed doc.

  For every token whose dependency label is 'ROOT' and whose text is exactly
  'going', the token's subtree is scanned: the last token whose label starts
  with 'nsubj' becomes the subject and the last token whose label ends with
  'obj' becomes the object. A side that is not found stays an empty string.

  Args:
    doc: a sequence of tokens exposing `text`, `dep_` and `subtree`
      (e.g. a spaCy Doc).

  Returns:
    A list of (subject_text, 'going', object_text) tuples, one per matching
    root; an empty list when the doc has fewer than 3 tokens.
  """
  triples=[]

  # Docs with fewer than 3 tokens are never searched
  if len(doc)<3:
    return triples

  for head in doc:
    # Only a root verb spelled exactly 'going' anchors a relation
    if head.dep_!='ROOT' or head.text!='going':
      continue

    subject=''
    target=''
    # Later tokens in the subtree overwrite earlier ones, so the last
    # subject/object encountered wins
    for node in head.subtree:
      label=node.dep_
      if label.startswith('nsubj'):
        subject=node.text
      if label.endswith('obj'):
        target=node.text

    triples.append((subject,head.text,target))

  return triples

In [66]:
# Apply the custom extractor to the parsed sentence
x_going_y(doc)

[('John', 'going', 'Berlin')]

In [67]:
# Two sentences in one doc: the extractor emits one triple per root token 'going'.
# Within a subtree the last token whose label ends in 'obj' is kept as the object.
x_going_y(nlp('Sam was going on vacation to Mumbai. But, the train was going to Goa.'))

[('Sam', 'going', 'Mumbai'), ('train', 'going', 'Goa')]