# Token based matching

In [1]:
import spacy
from spacy.matcher import Matcher
from spacy.strings import StringStore
from spacy import attrs
#? Load the medium English pipeline; its vocab is what the Matcher instances are built on.
nlp = spacy.load("en_core_web_md")

In [2]:
#? Sample sentence for the first matcher examples.
doc = nlp("Good Morning, I want to reserve a ticket")

In [3]:
matcher = Matcher(vocab=nlp.vocab)
#? The Matcher needs to be initialized with the pipeline's vocab.

In [4]:
#? Print the names exposed by spacy.attrs, skipping FLAG* slots and dunder names.
#? Most entries are token attributes usable as pattern keys; a few of the
#? printed names (e.g. Errors, intify_attr) are module helpers, not attributes.
for name in dir(attrs):
    if name.startswith(("FLAG", "__")):
        continue
    print(name)


DEP
ENT_ID
ENT_IOB
ENT_KB_ID
ENT_TYPE
Errors
HEAD
ID
IDS
IDX
IOB_STRINGS
IS_ALPHA
IS_ASCII
IS_BRACKET
IS_CURRENCY
IS_DIGIT
IS_LEFT_PUNCT
IS_LOWER
IS_OOV_DEPRECATED
IS_PUNCT
IS_QUOTE
IS_RIGHT_PUNCT
IS_SPACE
IS_STOP
IS_TITLE
IS_UPPER
LANG
LEMMA
LENGTH
LIKE_EMAIL
LIKE_NUM
LIKE_URL
LOWER
MORPH
NAMES
NORM
ORTH
POS
PREFIX
SENT_START
SHAPE
SPACY
SUFFIX
TAG
intify_attr
intify_attrs
key
value


## Lower and is_punct

In [5]:
#? "good" then "morning" (compared on the lowercased text), then any punctuation token.
pattern1 = [
    {attrs.LOWER: "good"},
    {attrs.LOWER: "morning"},
    {attrs.IS_PUNCT: True},
]
matcher.add(key="morningGreeting", patterns=[pattern1])

In [6]:
#? Each match is (match_id, start, end); start/end are token offsets into doc.
matches = matcher(doc)
for match_id, start, end in matches:
    print(start, end, doc[start:end].text)

0 3 Good Morning,


In [7]:
#? Same shape as the morning rule, but the second token must be "evening".
pattern2 = [
    {attrs.LOWER: "good"},
    {attrs.LOWER: "evening"},
    {attrs.IS_PUNCT: True},
]
matcher.add(key="eveningGreeting", patterns=[pattern2])

In [8]:
#? New text containing both greetings; `matches` is consumed by the next cell.
doc = nlp("Good morning, I want to reserve a ticket. I will then say good evening!")
matches = matcher(doc)

In [9]:
#? Print the text of every matched span.
for match_id, start, end in matches:
    print(doc[start:end].text)

Good morning,
good evening!


## ORTH

ORTH matches the exact token text, so the comparison is case-sensitive.

In [10]:
#? ORTH compares the verbatim token text, so a lowercase "i" rule
#? does not hit the uppercase "I" tokens in doc.
pattern3 = [{attrs.ORTH: "i"}]
matcher.add(key="i", patterns=[pattern3])

In [11]:
#? Re-run the matcher with the "i" rule added and print each matched span.
matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end].text)

Good morning,
good evening!


In [12]:
#? Inspect the registered rules via the private `_patterns` attribute:
#? outer keys are integer rule keys, inner dict keys are integer attribute IDs.
matcher._patterns

{11099143382904271360: [[{66: 'good'}, {66: 'morning'}, {5: True}]],
 13992137771284315935: [[{66: 'good'}, {66: 'evening'}, {5: True}]],
 5097672513440128799: [[{65: 'i'}]]}

In [13]:
#? Unregister the lowercase "i" rule.
matcher.remove("i")

In [14]:
#? Confirm that only the two greeting rules remain registered.
matcher._patterns

{11099143382904271360: [[{66: 'good'}, {66: 'morning'}, {5: True}]],
 13992137771284315935: [[{66: 'good'}, {66: 'evening'}, {5: True}]]}

In [15]:
#? Uppercase variant: matches tokens whose verbatim text is exactly "I".
pattern3 = [{attrs.ORTH: "I"}]
matcher.add(key="I", patterns=[pattern3])

In [16]:
#? Print each matched span now that the uppercase "I" rule is registered.
matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end].text)

Good morning,
I
I
good evening!


## Length


In [17]:
#? Single-token rule: any token whose LENGTH attribute equals 4.
pattern4 = [{attrs.LENGTH: 4}]
matcher.add(key="length of 4", patterns=[pattern4])

In [18]:
#? Print every matched span; `match_id` is left bound to the last match
#? because the following cell inspects it.
matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end].text)

Good
Good morning,
I
want
I
will
then
good
good evening!


In [19]:
#? `match_id` is the loop variable left over from the previous cell, i.e. the
#? key of the last match reported (compare with the keys in matcher._patterns).
match_id

13992137771284315935

In [20]:
#? Show all rules currently registered (private attribute, for inspection only).
matcher._patterns

{11099143382904271360: [[{66: 'good'}, {66: 'morning'}, {5: True}]],
 13992137771284315935: [[{66: 'good'}, {66: 'evening'}, {5: True}]],
 4690420944186131903: [[{65: 'I'}]],
 5913418915722860132: [[{71: 4}]]}

## is_stop

In [24]:
doc = nlp("There is an elephant.")
#? Single-token rule: any token that is NOT a stop word.
pattern5 = [{attrs.IS_STOP: False}]
matcher.add("remove stop words", patterns=[pattern5])
#? Look the rule's integer key up in the vocab's string store instead of
#? hard-coding the hash, so the filter stays correct if the rule is renamed.
stop_rule_key = nlp.vocab.strings["remove stop words"]
matches = matcher(doc)
#? Other rules are still registered on this matcher, so keep only this rule's hits.
for match_id, start, end in matches:
    if match_id == stop_rule_key:
        m_span = doc[start:end]
        print(m_span.text)

elephant
.


In [32]:
#? The vocab's string store maps a rule name to the integer key that the
#? matcher reports as match_id.
nlp.vocab.strings["remove stop words"]

729437288482027326

## like_num

In [27]:
#? Unregister the stop-word rule before the next example.
matcher.remove("remove stop words")

In [33]:
doc = nlp("it cost me million.")
#? LIKE_NUM flags number-like tokens; here it picks up the number word "million".
pattern6 = [{attrs.LIKE_NUM: True}]
matcher.add(key="numbers", patterns=[pattern6])
matches = matcher(doc)
for match_id, start, end in matches:
    #? Skip hits produced by the other rules still registered on this matcher.
    if match_id != nlp.vocab.strings["numbers"]:
        continue
    print(doc[start:end].text)

million


## Extended syntax 

"IN" - member operator
"NOT_IN"
"<=" ,">=" ...

In [34]:
doc = nlp("Good morning, I want to reserve a ticket. I will then say good evening!")

#? Earlier we used two separate patterns for the two greetings, even though they share most of their structure.

In [35]:
#? Start over with a new, empty matcher.
matcher = Matcher(vocab=nlp.vocab)

In [36]:
#? One rule for both greetings: the second token may be any member of the "IN" list.
pattern1 = [
    {attrs.LOWER: "good"},
    {attrs.LOWER: {"IN": ["morning", "evening"]}},
]
matcher.add(key="greetings", patterns=[pattern1])
matcher._patterns

{3248590692569088154: [[{66: 'good'}, {66: {'IN': ['morning', 'evening']}}]]}

In [37]:
#? Print each span matched by the combined greeting rule.
matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end].text)

Good morning
good evening


### tokens whose length is greater than or equal to 10

In [39]:
doc = nlp("I suffered from Trichotillomania when I was in college. The doctor prescribed me Psychosomatic medicine.")
#? ">=" is inclusive: tokens with a LENGTH of 10 or more match.
pattern2 = [{attrs.LENGTH: {">=": 10}}]
matcher.add(key="greaterThan10", patterns=[pattern2])

In [40]:
#? Print every token matched by the length rule.
matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end].text)

Trichotillomania
prescribed
Psychosomatic


## regex like operator

1. "?" -- optional (zero or one time).
2. "+" -- one or more times.
3. "*" -- zero or more times.
4. {} -- wildcard (matches any single token).

### ? optional

In [45]:
#? Two variants of the same sentence, with and without the middle name,
#? plus a new matcher for this example.
doc1 = nlp("Barack Obama visited France.")
doc2 = nlp("Barack Hussein Obama visited France.")
matcher = Matcher(vocab=nlp.vocab)

In [46]:
#? "OP": "?" makes the middle token optional, so both name variants match.
pattern = [
    {attrs.LOWER: "barack"},
    {attrs.LOWER: "hussein", "OP": "?"},
    {attrs.LOWER: "obama"},
]
matcher.add(key="optional", patterns=[pattern])
for doc in (doc1, doc2):
    matches = matcher(doc)
    for match_id, start, end in matches:
        print(doc[start:end].text)

Barack Obama
Barack Hussein Obama


### + one or more

In [59]:
#? Three inputs with three, one and zero "hello" tokens, plus a new matcher.
doc1 = nlp("Hello hello hello, how are you?")
doc2 = nlp("Hello, how are you?")
doc3 = nlp("How are you?")
matcher = Matcher(vocab=nlp.vocab)

In [60]:
#? "OP": "+" requires at least one "hello" before the punctuation token.
#? Overlapping spans are all reported, as the printed output shows.
pattern = [
    {attrs.LOWER: "hello", "OP": "+"},
    {attrs.IS_PUNCT: True},
]
matcher.add(key="once", patterns=[pattern])
for doc in (doc1, doc2, doc3):
    matches = matcher(doc)
    for match_id, start, end in matches:
        print(doc[start:end].text)

hello,
hello hello,
Hello hello hello,
Hello,


### * zero or more

In [61]:
#? Same three inputs as in the "+" example, with a new matcher.
doc1 = nlp("Hello hello hello, how are you?")
doc2 = nlp("Hello, how are you?")
doc3 = nlp("How are you?")
matcher = Matcher(vocab=nlp.vocab)

In [65]:
#? "OP": "*" allows zero "hello" tokens, so a punctuation token on its own
#? also matches (see the lone "," and "?" in the output).
pattern = [
    {attrs.LOWER: "hello", "OP": "*"},
    {attrs.IS_PUNCT: True},
]
matcher.add(key="more", patterns=[pattern])
for doc in (doc1, doc2, doc3):
    matches = matcher(doc)
    for match_id, start, end in matches:
        print(doc[start:end].text)

Hello hello hello,
hello hello,
hello,
,
?
Hello,
,
?
?


### {} wildcard

In [67]:
#? Sentence with two "name <form of be> <someone>" phrases, plus a new matcher.
doc = nlp("My name is Alice and his name was Elliot.")
matcher = Matcher(vocab=nlp.vocab)

In [68]:
#? "name", then any form of "be" (matched on the lemma), then any one token.
pattern = [
    {attrs.LOWER: "name"},
    {attrs.LEMMA: "be"},
    {},  # empty dict = wildcard token
]
matcher.add(key="findName", patterns=[pattern])
matches = matcher(doc)
for _, start, end in matches:
    print(doc[start:end].text)

name is Alice
name was Elliot


## regex

In [71]:
doc = nlp("I went to Italy; he has been there too. His mother  also has told me she wants to visit Rome.")
#? REGEX is applied to the token's TAG string; "^V" keeps tags that start with "V".
pattern = [{attrs.TAG: {"REGEX": "^V"}}]
matcher = Matcher(vocab=nlp.vocab)
matcher.add(key="verb", patterns=[pattern])
matches = matcher(doc)
for _, start, end in matches:
    print(doc[start:end].text)

went
has
been
has
told
wants
visit
