### Creating Our Own Pattern Matcher

In [1]:
import spacy
# Load the 'en_core_web_sm' pipeline once; later cells call `nlp(...)` to build
# Doc objects and hand `nlp.vocab` to the matchers.
nlp = spacy.load('en_core_web_sm')

In [2]:
from spacy.matcher import Matcher

# Token-pattern matcher bound to the pipeline's vocabulary
matcher = Matcher(nlp.vocab)

In [4]:
# Three token patterns that together catch the common spellings of "solar power".
# 'LOWER' is compared against the lowercased token text, so letter case is ignored.

# One token: "SolarPower", "solarpower"
pattern1 = [
    {'LOWER': 'solarpower'},
]

# Three tokens with a single punctuation token in the middle: "Solar-power"
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True},
    {'LOWER': 'power'},
]

# Two adjacent tokens: "Solar Power"
pattern3 = [
    {'LOWER': 'solar'},
    {'LOWER': 'power'},
]

In [7]:
# Register all three patterns under the single rule name 'SolarPower'.
# The patterns are passed as one list (the spaCy >= 2.2.2 signature, and the only
# form spaCy 3 accepts). The old positional `None` callback placeholder is dropped
# because `on_match` is now an optional keyword argument.
matcher.add('SolarPower', [pattern1, pattern2, pattern3])

In [8]:
# Sample sentence containing all three spellings targeted by the patterns
doc = nlp("The Solar Power industry continues to grow as solarpower increases. Solar-power is awesome")

# Every match comes back as a tuple: (match id, start token, stop token)
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [9]:
# Print the matches in readable form: resolve each hash back to its rule name
# and show the text of the matched tokens.
for match_id, start, end in found_matches:
    rule_name = nlp.vocab.strings[match_id]
    matched_text = doc[start:end].text
    print(f"{match_id} {rule_name} {start} {end} {matched_text}")

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


In [10]:
# Remove the old 'SolarPower' rule so the name can be registered again with new patterns
matcher.remove('SolarPower')

In [11]:
# Second version of the rule set: two patterns instead of three.

# Single token, any casing: "solarpower", "SolarPower"
pattern1 = [
    {'LOWER': 'solarpower'},
]

# "solar" and "power" separated by zero or more punctuation tokens ('OP': '*'),
# e.g. "Solar--power"
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'power'},
]

In [12]:
# Register the new patterns under the rule name 'SolarPower'.
# The patterns are passed as one list (the spaCy >= 2.2.2 signature, and the only
# form spaCy 3 accepts) instead of the old `None, pattern, ...` positional form.
matcher.add('SolarPower', [pattern1, pattern2])

In [13]:
# Sample text with repeated punctuation between "Solar" and "power"
doc2 = nlp("Solar--power is solarpower yay! XD")

# Each entry is (match id, start token, stop token)
found_matches = matcher(doc2)
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


### Phrase Matching in a Text Document

In [3]:
from spacy.matcher import PhraseMatcher

# NOTE: this rebinds `matcher` — from here on it is a PhraseMatcher, not the
# token-pattern Matcher created earlier in the notebook.
matcher = PhraseMatcher(nlp.vocab)

In [18]:
#myfile = open('../Section 3 - NLP_Basics/reaganomics.txt')
#myfile

<_io.TextIOWrapper name='../Section 3 - NLP_Basics/reaganomics.txt' mode='r' encoding='UTF-8'>

In [4]:
# Read the whole file and run it through the pipeline.
# The encoding is pinned to UTF-8 so the decoded text (and therefore the token
# indices reported by the matcher) does not depend on the platform's default
# locale encoding. Bytes that are not valid UTF-8 are dropped by errors='ignore'.
with open('reaganomics.txt', encoding='utf-8', errors='ignore') as f:
    doc3 = nlp(f.read())

In [5]:
# Candidate phrases to look for — the exact wording used in the text is not known in advance
phrase_list = [
    'voodoo economics',
    'supply-side economics',
    'trickle-down economics',
    'free-market economics',
]

# The phrase patterns are Doc objects, so run every phrase through the pipeline
phrase_pattern = [nlp(phrase) for phrase in phrase_list]

phrase_pattern

[voodoo economics,
 supply-side economics,
 trickle-down economics,
 free-market economics]

In [6]:
# Each pattern is a spaCy Doc object rather than a plain string
type(phrase_pattern[0])

spacy.tokens.doc.Doc

In [7]:
# Register every phrase Doc under the rule name 'EconMatcher'.
# The Docs are passed as one list (the spaCy >= 2.2.2 signature, required by
# spaCy 3) instead of the old `None, *docs` positional form.
matcher.add('EconMatcher', phrase_pattern)

# Each match is (match id, start token, end token), indexing into doc3
found_matches = matcher(doc3)

found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2984, 2988)]

In [8]:
# Print the phrase matches in readable form: resolve each hash back to its
# rule name and show the text of the matched tokens.
for match_id, start, end in found_matches:
    rule_name = nlp.vocab.strings[match_id]
    matched_text = doc3[start:end].text
    print(f"{match_id} {rule_name} {start} {end} {matched_text}")

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2984 2988 trickle-down economics


In [9]:
# Show every match with surrounding context: 5 tokens before and 10 tokens after
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    # Clamp the window start at 0: a negative slice index would be counted from
    # the end of the document, giving the wrong span for a match that begins
    # within the first 5 tokens.
    context_start = max(start - 5, 0)
    span = doc3[context_start:end + 10]      # matched tokens plus context window
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 policies are commonly associated with supply-side economics, referred to as trickle-down economics or voodoo
3680293220734633682 EconMatcher 49 53 economics, referred to as trickle-down economics or voodoo economics by political opponents, and free-
3680293220734633682 EconMatcher 54 56 trickle-down economics or voodoo economics by political opponents, and free-market economics by
3680293220734633682 EconMatcher 61 65 by political opponents, and free-market economics by political advocates.

The four pillars of Reagan
3680293220734633682 EconMatcher 673 677 attracted a following from the supply-side economics movement, which formed in opposition to Keynesian demand-
3680293220734633682 EconMatcher 2984 2988 became widely known as "trickle-down economics", due to the significant cuts in the upper
