## Token Matcher

In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("A complex_example,!")
print([token.text for token in doc])

['A', 'complex_example', ',', '!']


In [32]:
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern =[ [{"LOWER":"hello"},{"IS_PUNCT":True},{"LOWER":"world"}],
          [{"LOWER":"hello",},{"LOWER":"world"}]
         ]
matcher.add("HelloWorld",pattern)

doc = nlp("The programmer greeted the world with a cheerful 'Hello, world!' as they embarked on their coding journey. Little did they know that this simple phrase, 'hello world,' would become a cornerstone of their programming career. From their first 'hello world' program to complex software projects, the programmer never forgot the humble origins of those two words. 'Hello' and 'world' may seem ordinary, but together they carry the power to initiate a magical connection between humans and machines. So, let us all embrace the spirit of 'hello world' and explore the endless possibilities of coding!")
matches = matcher(doc)
for match_id,start,end in matches:
    string_id = nlp.vocab.strings[match_id] # get string representation
    span = doc[start:end] # the matches span
    print("match_id :",match_id,'\nstring_id:',string_id,'\nstart and end:',start,end,"\nspan.text:",span.text,'\n')
    # match_id is the hash value of the string ID "HelloWorld"

match_id : 15578876784678163569 
string_id: HelloWorld 
start and end: 9 12 
span.text: Hello, world 

match_id : 15578876784678163569 
string_id: HelloWorld 
start and end: 32 34 
span.text: hello world 

match_id : 15578876784678163569 
string_id: HelloWorld 
start and end: 49 51 
span.text: hello world 

match_id : 15578876784678163569 
string_id: HelloWorld 
start and end: 107 109 
span.text: hello world 



In [29]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [
    [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
]
matcher.add("HelloWorld", pattern)

doc = nlp("The programmer greeted the world with a cheerful 'Hello, world!' as they embarked on their coding journey. Little did they know that this simple phrase, 'hello world,' would become a cornerstone of their programming career. From their first 'hello world' program to complex software projects, the programmer never forgot the humble origins of those two words. 'Hello' and 'world' may seem ordinary, but together they carry the power to initiate a magical connection between humans and machines. So, let us all embrace the spirit of 'hello world' and explore the endless possibilities of coding!")
matches = matcher(doc)

for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]  # the matches span
    print("match_id:", match_id, "\nstring_id:", string_id, "\nstart and end:", start, end, "\nspan.text:", span.text)


match_id: 15578876784678163569 
string_id: HelloWorld 
start and end: 9 12 
span.text: Hello, world


## Regex matching

In [40]:
import spacy
import re

nlp = spacy.load("en_core_web_sm")
doc = nlp("The United States of America (USA) are commonly known as the United States (U.S. or US) or America.")

expression = r"[Uu](nited|\\.?) ?[Ss](tates|\\.?)"
for match in re.finditer(expression, doc.text):
    start, end = match.span()
    span = doc.char_span(start, end)
    # This is a Span object or None if match doesn't map to valid token sequence
    if span is not None:
        print("Found match:", span.text)

Found match: United States
Found match: United States


In [39]:
import spacy
import re

nlp = spacy.load("en_core_web_sm")
doc = nlp("The United States of America (USA) are commonly known as the United States (U.S. or US) or America.")

expression = r"(United States|USA|U\.S\.|unitedstates|UnitedStates)"
for match in re.finditer(expression, doc.text):
    start, end = match.span()
    span = doc.char_span(start, end)
    # This is a Span object or None if match doesn't map to valid token sequence
    if span is not None:
        print("Found match:", span.text)


Found match: United States
Found match: USA
Found match: United States
Found match: U.S.
