In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
from spacy.matcher import Matcher

In [4]:
matcher = Matcher(nlp.vocab)

In [6]:
#we want to detect stuff with the matcher object. detect different patterns. 
#here we want to detect/match any patterns of solarpower written in the 3 formats shown
#SolarPower
#Solar-power
#Solar power

pattern1 = [{'LOWER':'solarpower'}] #solarpower

pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}] #solar-power

pattern3 = [{'LOWER':'solar'},{'LOWER':'power'}] #Solar power



In [7]:
#to add the pattens to the matcher function
#matcher('name',callback,pattern1,2,3)
matcher.add('SolarPower',None,pattern1,pattern2,pattern3)

In [8]:
doc = nlp(u"The Solar Power industry continues to grow as solarpower increases. Solar-power is amazing!")

In [9]:
#to find the matches.. take the doc and pass it to the matcher object
found_matches = matcher(doc)

In [11]:
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [12]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id] #get string representation
    span = doc[start:end]  #get the matched span
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


In [13]:
#remove a particular pattern from the matcher
matcher.remove('SolarPower')

In [14]:
#adding new pattern to the matcher
#this 'OP':'*' matches any punctuation that occurs zero, one or more times i.e. -- or ,,

#solarpower
pattern1 = [{'LOWER':'solarpower'}]

#solar.power, solar-power, solar--power, solar---power etc
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT': True, 'OP':'*'},{'LOWER':'power'}]

In [28]:
matcher.add('SolarPower',None, pattern1,pattern2)

In [29]:
doc2 = nlp(u"Solar--power is solarpower yay! But solar:power is also solar---power!")

In [30]:
found_matches = matcher(doc2)

In [31]:
print(found_matches)

[(13798162653491708028, 0, 3), (8656102463236116519, 0, 3), (13798162653491708028, 4, 5), (8656102463236116519, 4, 5), (13798162653491708028, 8, 11), (8656102463236116519, 8, 11), (13798162653491708028, 13, 16), (8656102463236116519, 13, 16)]


In [32]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id] #get string representation
    span = doc2[start:end]  #get the matched span
    print(match_id, string_id, start, end, span.text)

13798162653491708028 SolarPowe 0 3 Solar--power
8656102463236116519 SolarPower 0 3 Solar--power
13798162653491708028 SolarPowe 4 5 solarpower
8656102463236116519 SolarPower 4 5 solarpower
13798162653491708028 SolarPowe 8 11 solar:power
8656102463236116519 SolarPower 8 11 solar:power
13798162653491708028 SolarPowe 13 16 solar---power
8656102463236116519 SolarPower 13 16 solar---power


In [2]:
#matching on a terminalogy list is more efficient than pattern matching
#therefore will will use a phrase matcher

In [4]:
from spacy.matcher import PhraseMatcher

In [5]:
matcher = PhraseMatcher(nlp.vocab)

In [10]:
with open("F:\\NLP intro\\course stuff\\UPDATED_NLP_COURSE\\TextFiles\\reaganomics.txt") as f:
    doc3 = nlp(f.read())

In [11]:
#lets create the phrases we want to look for
phrase_list = ['voodoo economics','supply-side economics','trickle-down economics','free-market economics']

In [12]:
#now we will convert each phrase into a document object
phrase_patterns = [nlp(text) for text in phrase_list]

In [13]:
phrase_patterns

[voodoo economics,
 supply-side economics,
 trickle-down economics,
 free-market economics]

In [14]:
#then pass each doc object into a matcher
#*phrase_patterns is a shorthand for phrase_patterns[0], ph_pat[1], ph_p[2] etc
matcher.add('EconMatcher',None,*phrase_patterns)

In [15]:
found_matches = matcher(doc3)

In [16]:
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2985, 2989)]

In [18]:
#to view the matches
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id] #get string representation
    span = doc3[start:end]  #get the matched span
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2985 2989 trickle-down economics


In [19]:
#now to get some context on which these terms were used, we will have to
#get a bit more of a token span.. maybe +5 more tokens after end and -5 before start

In [20]:
#to get some context
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id] #get string representation
    span = doc3[start-5:end+5]  #get the matched span
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 policies are commonly associated with supply-side economics, referred to as trickle
3680293220734633682 EconMatcher 49 53 economics, referred to as trickle-down economics or voodoo economics by political
3680293220734633682 EconMatcher 54 56 trickle-down economics or voodoo economics by political opponents, and
3680293220734633682 EconMatcher 61 65 by political opponents, and free-market economics by political advocates.


3680293220734633682 EconMatcher 673 677 attracted a following from the supply-side economics movement, which formed in
3680293220734633682 EconMatcher 2985 2989 became widely known as "trickle-down economics", due to the


In [30]:
#Step by step.. How to find a matching pattern or name or look for something in a doc using spacy
#1 import Matcher from spacy.matcher
#2 create a my_matcher object.. Pass in the nlp.vocab library in the imported Matcher
#3 define your patterns... The ones you want to find
#4 add the patterns into your my_matcher object
#5 open the txt that you want to seach for patterns in and create an nlp doc using the whole txt file
#6 get the matches found by passing the nlp doc into your my_matcher
#7 print the matches.. maybe print a bit more tokens for some context rather than only the match itself.

In [21]:
#1
from spacy.matcher import Matcher

In [22]:
#2
my_matcher = Matcher(nlp.vocab)

In [23]:
#3
pattern1 = [{'LOWER':'lionel'},{'LOWER':'messi'}]
pattern2 = [{'LOWER':'iniesta'}]
pattern3 = [{'LOWER':'forlan'}]

In [24]:
#4
my_matcher.add('wclegends',None,pattern1,pattern2,pattern3)

In [25]:
#5
with open("F:\\NLP intro\\course stuff\\UPDATED_NLP_COURSE\\TextFiles\\fifawc2010.txt") as f:
    wcDoc = nlp(f.read())

In [26]:
#6
found_matches = my_matcher(wcDoc)

In [27]:
found_matches

[(8170561629599831063, 120, 121),
 (8170561629599831063, 192, 194),
 (8170561629599831063, 225, 226)]

In [29]:
#7
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]
    span = wcDoc[start-5:end+5]
    print(match_id, string_id, start, end, span.text)

8170561629599831063 wclegends 120 121 midfield maestro AndrÃ©s Iniesta etched his name in history
8170561629599831063 wclegends 192 194 the quarterfinals against Uruguay. Lionel Messi, although not as prolific
8170561629599831063 wclegends 225 226 for players like ForlÃ¡n, Iniesta, and MÃ¼ller to shine
