# <font color = 'pickle'>**Advanced Spacy**</font>
    
In this notebook, we will learn some advanced features of Spacy:

1. Custom Tokenizer
2. Rule-Based Matching
3. Custom Extensions



# <font color = 'pickle'>**Install/Import Libraries**

In [73]:
!pip install -U spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [74]:
from pathlib import Path
from collections import Counter
import re
import spacy
import textwrap
from pathlib import Path

In [75]:
spacy.__version__

'3.4.1'

# <font color = 'pickle'>**Set Path for Data**

In [76]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [77]:
base_folder =Path('/content/drive/MyDrive/data')
data_folder = base_folder/'datasets'

# <font color = 'pickle'>**Load Spacy Model**

In [78]:
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 23.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [79]:
# We can use the 'disable=' arguments to disable the component we do not need
nlp = spacy.load('en_core_web_sm')

In [80]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [81]:
disabled = nlp.select_pipes(disable= ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner'])

In [82]:
disabled

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [83]:
sample_text = " #Reg #Ex @abc@xyz.com! prefixes  stop-words wow!"

In [84]:
doc = nlp(sample_text)
print([token.text for token in doc])

[' ', '#', 'Reg', '#', 'Ex', '@abc@xyz.com', '!', 'prefixes', ' ', 'stop', '-', 'words', 'wow', '!']


# <font color = 'pickle'>**Custom Tokenizer in spaCy**

## <font color = 'pickle'>**Modify Prefixes**

In [85]:
# By default spaCy treats '#' as a prefix and splits it from the following text.
# To keep hashtags together we will edit the prefix rules.
# Work on a copy so the class-level defaults are not mutated by later in-place edits
# (the infix section below copies its list the same way).
prefixes = list(nlp.Defaults.prefixes)
prefixes[20:30]

['\\{', '\\}', '<', '>', '_', '#', '\\*', '&', '。', '？']

In [86]:
# Drop '#' from the prefix rules and compile the tokenizer's prefix regex again.
# Filtering into a new list (instead of list.remove) does not modify the list object
# this name was bound to, and the cell can be re-run without raising ValueError.
prefixes = [prefix for prefix in prefixes if prefix != r'#']

prefix_regex = spacy.util.compile_prefix_regex(prefixes)
nlp.tokenizer.prefix_search = prefix_regex.search

In [87]:
# create doc
doc = nlp(sample_text)
# print tokens
print([token.text for token in doc])

[' ', '#Reg', '#Ex', '@abc@xyz.com', '!', 'prefixes', ' ', 'stop', '-', 'words', 'wow', '!']


In [88]:
# Add '@' as a prefix character so it is split from the text that follows it.
# A new list is built rather than appending in place, and the guard keeps the rule
# from being added a second time when the cell is re-run.
if r'@' not in prefixes:
    prefixes = [*prefixes, r'@']
prefix_regex = spacy.util.compile_prefix_regex(prefixes)
nlp.tokenizer.prefix_search = prefix_regex.search

doc = nlp(sample_text)
print([token.text for token in doc])

[' ', '#Reg', '#Ex', '@', 'abc@xyz.com', '!', 'prefixes', ' ', 'stop', '-', 'words', 'wow', '!']


## <font color = 'pickle'>**Modify Suffixes**

In [None]:
# Check the default suffixes in spaCy.
# Copy the list so that later edits do not mutate the class-level defaults.
suffixes = list(nlp.Defaults.suffixes)
suffixes[20:30]

['\\*', '&', '。', '？', '！', '，', '、', '；', '：', '～']

In [None]:
# Remove '!' from the suffix rules so it is no longer split from the preceding text.
# Filtering into a new list (instead of list.remove) does not modify the list object
# this name was bound to, and the cell can be re-run without raising ValueError.
suffixes = [suffix for suffix in suffixes if suffix != r'\!']
suffix_regex = spacy.util.compile_suffix_regex(suffixes)
nlp.tokenizer.suffix_search = suffix_regex.search

doc = nlp(sample_text)
print([token.text for token in doc])

[' ', '#Reg', '#Ex', '@', 'abc@xyz.com!', 'prefixes', ' ', 'stop', '-', 'words', 'wow!']


## <font color = 'pickle'>**Modify infixes**

In [None]:
# check default infixes in spacy
infixes = list(nlp.Defaults.infixes)
infixes[0:3]

['\\.\\.+',
 '…',
 '[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\

In [None]:
# Stop splitting words on hyphens (e.g. keep 'stop-words' as a single token).
# Only the rule built from spaCy's HYPHENS character class is dropped. Filtering on a
# bare '-' would also discard unrelated rules that merely contain a hyphen character,
# such as the symbol rule whose character ranges are written like \u0BF3-\u0BF8.
from spacy.lang.char_classes import HYPHENS

infixes = [rule for rule in infixes if HYPHENS not in rule]
infix_regex = spacy.util.compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_regex.finditer

doc = nlp(sample_text)
print([token.text for token in doc])

[' ', '#Reg', '#Ex', '@', 'abc@xyz.com!', 'prefixes', ' ', 'stop-words', 'wow!']


## <font color = 'pickle'>**Adding special case tokenization rules**

In [None]:
from spacy.symbols import ORTH
doc = nlp("gimme that")  # phrase to tokenize
print([w.text for w in doc])  # ['gimme', 'that']


['gimme', 'that']


In [None]:
# Add special case rule
special_case = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)

# Check new tokenization
print([w.text for w in nlp("gimme that")])  # ['gim', 'me', 'that']

['gim', 'me', 'that']


# <font color = 'pickle'>**Rule-based matching using spaCy**


In [89]:
# Import Matcher module from spacy.matcher
from spacy.matcher import Matcher


Matcher objects let us match sequences of tokens based on pattern rules. This is used as an alternative to regex pattern matching.


- Compared to regular expressions, the matcher works with Doc and Token objects instead of only strings.
- We can search for not only strings but also other lexical attributes.
- We can write very specific rules 

For example, find the word "cloud" only if it's a verb, not a noun.

We will now give an example of how to use Matcher with spacy's Tokens. When we use Matcher object on Tokens, we can use word level features of spaCy such as LOWER, LENGTH, LEMMA, SHAPE and flags such as IS_PUNCT, IS_DIGIT, LIKE_URL, etc. 


In [None]:
text = """New version of operation system is iOS 11. It is better than iOS 9 and iOS 9. 
The new version of iPhone X seems cool. The video of iphone x released. I liked iOS 9 but I like iOS 11 more.
You may not like my like. Contact us : xyz@gmail.com., abc@utdallas.edu"""

## <font color = 'pickle'>**1. Matching Exact Tokens**

In [None]:
# Example 1: Matching Exact Text

# When initiating a Matcher we need to specify the vocab,
# so we instantiate the Matcher object using nlp.vocab

matcher = Matcher(nlp.vocab)
doc = nlp(text)

# Match an exact token: a token whose TEXT is "iOS"
pattern1 = [{"TEXT":"iOS"}]

# Match a sequence of tokens: "iPhone" followed by "X"
pattern2 = [{"TEXT": "iPhone"},{"TEXT": "X"}]

# matcher.add() registers both patterns under the single key "TextOnly"
matcher.add("TextOnly",[pattern1, pattern2])

# When we call the matcher on a doc, it returns a list of tuples.
# Each tuple consists of three values: the match ID, the start index and the end index of the matched span.
matches = matcher(doc)
matches

[(9385982399280393077, 6, 7),
 (9385982399280393077, 13, 14),
 (9385982399280393077, 16, 17),
 (9385982399280393077, 24, 26),
 (9385982399280393077, 38, 39),
 (9385982399280393077, 43, 44)]

In [None]:
# We can access a span from the doc using slicing (similar to arrays in numpy)
print([doc[start:end].text for match_id, start, end in matches])

['iOS', 'iOS', 'iOS', 'iPhone X', 'iOS', 'iOS']


## <font color = 'pickle'>**2. Matching Attribute (LOWER, IS_DIGIT)**

In [None]:
# Match exact tokens and token attributes
# List of available attributes we can use with the matcher: https://spacy.io/usage/rule-based-matching

matcher = Matcher(nlp.vocab)
doc = nlp(text)
# pattern 1: the text "iOS" followed by a token made up of digits
pattern1 = [{"TEXT":"iOS"}, {"IS_DIGIT":True}]

# pattern 2: "iphone" followed by "x", irrespective of case (lower/upper) for both tokens
pattern2 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
# matcher.add() registers both patterns under the single key "TextAndLower"
matcher.add("TextAndLower",[pattern1, pattern2])

matches = matcher(doc)
print([doc[start:end].text for match_id, start, end in matches])

['iOS 11', 'iOS 9', 'iOS 9', 'iPhone X', 'iphone x', 'iOS 9', 'iOS 11']


For pattern 2 in the above example, `{"LOWER": "iphone"}, {"LOWER": "x"}` means that we want to match two consecutive tokens whose lowercase forms are 'iphone' and 'x'. With this pattern we can therefore also match 'Iphone X' or even 'IPHONE X'.

## <font color = 'pickle'>**3. Matching Attribute (IS_LOWER)**

In [None]:
# Match exact tokens and token attributes

matcher = Matcher(nlp.vocab)
doc = nlp(text)
# pattern 1: the text "iOS" followed by a token made up of digits
pattern1 = [{"TEXT":"iOS"}, {"IS_DIGIT":True}]

# pattern 2: the exact lowercase text "iphone", followed by "x" in any case
pattern2 = [{"TEXT": "iphone" ,"IS_LOWER":True},  {"LOWER": "x"}]

# matcher.add() registers both patterns under the single key "TextAndIsLower"
matcher.add("TextAndIsLower",[pattern1, pattern2])

matches = matcher(doc)
print([doc[start:end].text for match_id, start, end in matches])

['iOS 11', 'iOS 9', 'iOS 9', 'iphone x', 'iOS 9', 'iOS 11']


For pattern 2 in the above example, we only want to extract 'iphone' when it is written in lowercase.

## <font color = 'pickle'>**4. Matching Attribute (LEMMA)**

In [None]:
# Matching other attributes: LEMMA
# Re-enable the pipeline components that were disabled at the start of the notebook
disabled.restore()
print(nlp.pipe_names)
matcher = Matcher(nlp.vocab)
doc = nlp(text)
# pattern: any token whose lemma is "like" (no part-of-speech constraint here)
pattern = [{"LEMMA": "like"}]

# matcher.add() registers the pattern under the key "Lemma"
matcher.add("Lemma",[pattern])

matches = matcher(doc)
print(matches)
print([doc[start:end].text for match_id, start, end in matches])

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
[(12849222793144466734, 37, 38), (12849222793144466734, 42, 43), (12849222793144466734, 51, 52), (12849222793144466734, 53, 54)]
['liked', 'like', 'like', 'like']


In [None]:
[token.pos_ for token in doc if token.lemma_ =='like']

['VERB', 'VERB', 'VERB', 'NOUN']

## <font color = 'pickle'>**5. Matching Attribute (LENGTH)**

In [None]:
# Match every token that is exactly three characters long.
doc = nlp("I see you are doing a good job.")
pattern = [{"LENGTH": 3}]

matcher = Matcher(nlp.vocab)
matcher.add("Length", [pattern])

matches = matcher(doc)
for _, start, end in matches:
    span = doc[start:end]
    print(span.text)

see
you
are
job


## <font color = 'pickle'>**6. Using POS tags in matcher**

### <font color = 'pickle'>**Example 1**

In [None]:
# Matching on two attributes of the same token: LEMMA and POS
disabled.restore()
matcher = Matcher(nlp.vocab)
doc = nlp(text)
# pattern: a token whose lemma is "like" and whose POS tag is VERB
pattern = [{"LEMMA": "like", "POS": "VERB"}]
# matcher.add() registers the pattern under the key "Pos"
matcher.add("Pos",[pattern])

matches = matcher(doc)
print(matches)
print([doc[start:end].text for match_id, start, end in matches])

[(12506337956553590349, 37, 38), (12506337956553590349, 42, 43), (12506337956553590349, 51, 52)]
['liked', 'like', 'like']


### <font color = 'pickle'>**Example 2**

In [None]:
# Matching a sequence of tokens using LEMMA and POS
matcher = Matcher(nlp.vocab)
doc = nlp(text)
# pattern: a token whose lemma is "like" and whose POS tag is VERB,
# followed by a token whose POS tag is NOUN
pattern = [{"LEMMA": "like", "POS": "VERB"}, {"POS": "NOUN"}]
# matcher.add() registers the pattern under the key "LemmaPos"
matcher.add("LemmaPos",[pattern])

matches = matcher(doc)
print([doc[start:end].text for match_id, start, end in matches])

['liked iOS']


## <font color = 'pickle'>**7. Use Quantifiers**

In [None]:
doc = nlp("I am reading a new book on NLP. I read excellent Deep Learning book last week")

matcher = Matcher(nlp.vocab)
# pattern: any form of "read", an optional determiner, then an adjective
pattern = [
    {"LEMMA": "read"},
    {"POS": "DET", "OP": "?"},  # optional: match 0 or 1 times
    {"POS": "ADJ"}
]
matcher.add("Quantifier", [pattern])

for match_id, start, end in matcher(doc):
    print(doc[start:end].text)

reading a new
read excellent


## <font color = 'pickle'>**8. Using Regular Expressions in Matcher**

### <font color = 'pickle'>**Example 1**

In [None]:
# Using a regular expression inside the matcher
text = 'YO can contact me at @twitter, xyz@ytdallas.edu, abx@gmail.com'
matcher = Matcher(nlp.vocab)
doc = nlp(text)
# pattern: a single token that looks like an e-mail address, i.e. one or more word
# characters, an '@', then one or more word characters.
# The quantifier belongs outside the character class: "[\w+]" matches exactly one
# character (a word character or a literal '+'), not a run of word characters.
# A raw string keeps the backslashes intact for the regex engine.
pattern = [{"TEXT": {"REGEX": r"\w+@\w+"}}]
# matcher.add() registers the pattern under the key "Regex"
matcher.add("Regex",[pattern])

matches = matcher(doc)
print([doc[start:end].text for match_id, start, end in matches])

['xyz@ytdallas.edu', 'abx@gmail.com']


### <font color = 'pickle'>**Example 2**

In [None]:
# A pronoun followed by either spelling of "travelled"/"traveled".
pattern = [{"POS": "PRON"}, {"TEXT": {"REGEX": "[Tt]ravell?ed"}}]

matcher = Matcher(nlp.vocab)
matcher.add("PosRegex", [pattern])

doc1 = nlp("I travelled by bus.")
doc2 = nlp("She traveled by bike.")

# Run the same matcher over both documents and print each matched span.
for sample_doc in (doc1, doc2):
    for _, start, end in matcher(sample_doc):
        print(sample_doc[start:end])

I travelled
She traveled


### <font color = 'pickle'>**Example 3**

In [None]:
# Plain-regex baseline: a band (FM or AM), one whitespace character, then a frequency
# with two or three integer digits and one decimal digit.
text = "Let us try different frequency of radio stations - FM 12.9, AM 104.9, FM 104.1,  AM 123.8. 1234"
station_regex = re.compile(r'[FA]M\s\d{2,3}\.\d')
radio_stations = [found.group(0) for found in station_regex.finditer(text)]
radio_stations

['FM 12.9', 'AM 104.9', 'FM 104.1', 'AM 123.8']

In [None]:
matcher = Matcher(nlp.vocab)
text = "Let us try different frequency of radio stations - FM 12.9, AM 104.9, FM 104.1,  AM 123.8. 1234"
doc = nlp(text)
# A REGEX inside a token pattern is tested against one token at a time. This expression
# needs the band, a whitespace character and the frequency in the same string, so it
# cannot match when they are separate tokens: this cell prints nothing.
# The raw string keeps \s and \d from being treated as (invalid) Python string escapes.
pattern = [{"TEXT": {'REGEX': r'[FA]M\s\d{2,3}\.\d'}}]
matcher.add("RegexMulti1", [pattern])
matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end].text)

In [None]:
matcher = Matcher(nlp.vocab)
text = "Let us try different frequency of radio stations - FM 12.9, AM 104.9, FM 104.1,  AM 123.8. 1234"
doc = nlp(text)
# Split the expression into one regex per token: the band, then the frequency.
# The matcher applies REGEX with search semantics, so ^ and $ are needed to require
# that the whole token matches (otherwise a longer token containing "AM" or
# "234.5" would also qualify). Raw strings keep \d and \. intact for the regex engine.
pattern = [{"TEXT": {'REGEX': r'^[FA]M$'}}, {"TEXT": {'REGEX': r'^\d{2,3}\.\d$'}}]
matcher.add("RegexMulti2", [pattern])
matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end].text)

FM 12.9
AM 104.9
FM 104.1
AM 123.8


## <font color = 'pickle'>**9. Matching Attribute (SHAPE)**

### <font color = 'pickle'>**Understanding Shape Attribute**

In [None]:
text = "Let us try different radio stations - FM 12.9, AM 104.9, FM 104.1,  AM 123.8 and A234Hj.,-9"
doc = nlp(text)
[(token.text,token.shape_) for token in doc]

[('Let', 'Xxx'),
 ('us', 'xx'),
 ('try', 'xxx'),
 ('different', 'xxxx'),
 ('radio', 'xxxx'),
 ('stations', 'xxxx'),
 ('-', '-'),
 ('FM', 'XX'),
 ('12.9', 'dd.d'),
 (',', ','),
 ('AM', 'XX'),
 ('104.9', 'ddd.d'),
 (',', ','),
 ('FM', 'XX'),
 ('104.1', 'ddd.d'),
 (',', ','),
 (' ', ' '),
 ('AM', 'XX'),
 ('123.8', 'ddd.d'),
 ('and', 'xxx'),
 ('A234Hj.,-9', 'XdddXx.,-d')]

### <font color = 'pickle'>**Use SHAPE in Matcher**

In [None]:
doc = nlp("Let us try different frequency of radio stations - FM 12.9, AM 104.9, FM 104.1,  AM 123.8.") 
matcher = Matcher(nlp.vocab) 
pattern = [{"SHAPE": 'ddd.d'}]
matcher.add("Shape", [pattern]) 
matches = matcher(doc) 
for match_id, start, end in matches: 
    print(doc[start:end].text)   

104.9
104.1
123.8


### <font color = 'pickle'>**Use SHAPE with Regex in Matcher**

In [None]:
doc = nlp("Let us try different frequency of radio stations - FM 12.9, AM 104.9, FM 104.1,  AM 123.8.")
matcher = Matcher(nlp.vocab)
# Match shapes with two or three digits, a literal dot, and one digit (dd.d or ddd.d).
# The dot is escaped and the expression is anchored: an unescaped '.' matches any
# character and the matcher uses search semantics, so 'd?dd.d' would also accept
# shapes such as 'dddd'.
pattern = [{"SHAPE": {'REGEX': r'^d{2,3}\.d$'}}]
matcher.add("ShapeRegex", [pattern])
matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end].text)

12.9
104.9
104.1
123.8


## <font color = 'pickle'>**10 Extract X relationship Y using Dependency Labels**

Here we will extract a pair of entities (X, Y) if there is a relationship such as "X acquired (bought) Y" or "Y was acquired (bought) by X".


In [None]:
text1 =  "In their largest acquisition to date, Google has acquired YouTube for $1.65 billion"
text2 = " YouTube was acquired by Google for $1.65 billion"
text3 = " Google bought YouTube for $1.65 billion"
text4 = " Work was done"
doc1 = nlp(text1)
doc2 = nlp(text2)
doc3 = nlp(text3)
doc4 = nlp(text4)

### <font color = 'pickle'>**Understanding Dependency Labels**

In [None]:
print(f'{"Text":<12}: {"Lemma":<10}: {"POS":<10}: DEP\n')
for token in doc1:
  print(f'{token.text:<12}: {token.lemma_:<10}: {token.pos_:<10}: {token.dep_}')  

Text        : Lemma     : POS       : DEP

In          : in        : ADP       : prep
their       : their     : PRON      : poss
largest     : large     : ADJ       : amod
acquisition : acquisition: NOUN      : pobj
to          : to        : ADP       : prep
date        : date      : NOUN      : pobj
,           : ,         : PUNCT     : punct
Google      : Google    : PROPN     : nsubj
has         : have      : AUX       : aux
acquired    : acquire   : VERB      : ROOT
YouTube     : YouTube   : PROPN     : dobj
for         : for       : ADP       : prep
$           : $         : SYM       : quantmod
1.65        : 1.65      : NUM       : compound
billion     : billion   : NUM       : pobj


In [None]:
print(f'{"Text":<12}: {"Lemma":<10}: {"POS":<10}: DEP\n')
for token in doc2:
  print(f'{token.text:<12}: {token.lemma_:<10}: {token.pos_:<10}: {token.dep_}')    

Text        : Lemma     : POS       : DEP

            :           : SPACE     : dep
YouTube     : YouTube   : PROPN     : nsubjpass
was         : be        : AUX       : auxpass
acquired    : acquire   : VERB      : ROOT
by          : by        : ADP       : agent
Google      : Google    : PROPN     : pobj
for         : for       : ADP       : prep
$           : $         : SYM       : quantmod
1.65        : 1.65      : NUM       : compound
billion     : billion   : NUM       : pobj


### <font color = 'pickle'>**Label description with spacy.explain()**

In [None]:
print(spacy.explain('nsubjpass'))
print(spacy.explain('nsubj'))
print(spacy.explain('pobj'))
print(spacy.explain('dobj'))

nominal subject (passive)
nominal subject
object of preposition
direct object


In [None]:
print(f'{"Text":<12}: {"Lemma":<10}: {"POS":<10}: DEP\n')
for token in doc3:
  print(f'{token.text:<12}: {token.lemma_:<10}: {token.pos_:<10}: {token.dep_}')  

Text        : Lemma     : POS       : DEP

            :           : SPACE     : dep
Google      : Google    : PROPN     : nsubj
bought      : buy       : VERB      : ROOT
YouTube     : YouTube   : PROPN     : dobj
for         : for       : ADP       : prep
$           : $         : SYM       : quantmod
1.65        : 1.65      : NUM       : compound
billion     : billion   : NUM       : pobj


In [None]:
print(f'{"Text":<12}: {"Lemma":<10}: {"POS":<10}: DEP\n')
for token in doc4:
  print(f'{token.text:<12}: {token.lemma_:<10}: {token.pos_:<10}: {token.dep_}')  

Text        : Lemma     : POS       : DEP

            :           : SPACE     : dep
Work        : Work      : PROPN     : nsubjpass
was         : be        : AUX       : auxpass
done        : do        : VERB      : ROOT


### <font color = 'pickle'>**Step1: Check lemma of ROOT word**

In [None]:
def root_acquire(doc):
  """Return True if any token in `doc` is the ROOT and has the lemma 'acquire' or 'buy'."""
  acquisition_lemmas = ['acquire', 'buy']
  return any(
      token.dep_ == 'ROOT' and token.lemma_ in acquisition_lemmas
      for token in doc
  )

In [None]:
print(root_acquire(doc1))
print(root_acquire(doc2))
print(root_acquire(doc3))
print(root_acquire(doc4))

True
True
True
False


### <font color = 'pickle'> **Step2: Check active/passive voice**

In [None]:
def is_passive(doc):
  """Return True if any token in `doc` carries the dependency label 'nsubjpass'."""
  return any(token.dep_ == 'nsubjpass' for token in doc)

In [None]:
print(is_passive(doc1))
print(is_passive(doc2))
print(is_passive(doc3))
print(is_passive(doc4))

False
True
False
True


### <font color = 'pickle'> **Step3: Extract Relationship**

In [None]:
def get_x_acquire_y_pairs(doc):
  """Extract the (X, Y) pair from a sentence stating that X acquired or bought Y.

  Args:
    doc: a parsed document whose tokens expose `text`, `dep_` and `lemma_`.

  Returns:
    A tuple (x, y) of token texts, or None (after printing a message) when the ROOT
    verb is not acquire/buy or when either side of the pair cannot be found.
  """
  not_found_message = 'X acquire Y pair is not present in document'

  if not root_acquire(doc):
    print(not_found_message)
    return None

  if is_passive(doc):
    # Passive voice ("Y was acquired by X"): X is taken from the first token whose
    # label ends in 'obj', Y is the passive subject.
    x = [token.text for token in doc if token.dep_.endswith('obj')]
    # Exact comparison: `in ('nsubjpass')` is a substring test against a plain string
    # (the parentheses do not make a tuple), so labels such as 'nsubj' also passed.
    y = [token.text for token in doc if token.dep_ == 'nsubjpass']
  else:
    # Active voice ("X acquired Y"): X is the subject, Y is the direct object.
    x = [token.text for token in doc if token.dep_.endswith('subj')]
    y = [token.text for token in doc if token.dep_.endswith('dobj')]

  if not x or not y:
    # One side of the relationship is missing; report it instead of raising IndexError.
    print(not_found_message)
    return None

  return (x[0], y[0])

In [None]:
print(get_x_acquire_y_pairs(doc1))
print(get_x_acquire_y_pairs(doc2))
print(get_x_acquire_y_pairs(doc3))
get_x_acquire_y_pairs(doc4)

('Google', 'YouTube')
('Google', 'YouTube')
('Google', 'YouTube')
X acquire Y pair is not present in document


## <font color = 'pickle'>**11. Phrase Matcher**

If you need to match extensive terminology lists, using the PhraseMatcher with Doc objects as patterns is far more effective than writing token patterns. For example, it is difficult to define token patterns that will match all country names. However, we can easily enumerate all the country names and create a list. We can create Doc objects from this list and use them as the basis of our information extraction script.

In [18]:
from spacy.matcher import PhraseMatcher
import json

### <font color = 'pickle'>**Example1 - Countries**

#### <font color = 'pickle'>**Download list of countries**

In [None]:
file = data_folder/'countries.json'
URL = 'https://raw.githubusercontent.com/explosion/spacy-course/master/exercises/en/countries.json'
!wget {URL} -P {data_folder} -O {file}

In [22]:
# Load the list of country names from the downloaded JSON file.
# The list contains non-ASCII names (e.g. 'Åland Islands'), so the file is decoded
# explicitly as UTF-8 instead of relying on the platform's default encoding.
with open(file, 'r', encoding='utf-8') as f:
  COUNTRIES = json.load(f)

In [23]:
COUNTRIES[0:10]

['Afghanistan',
 'Åland Islands',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua and Barbuda']

#### <font color = 'pickle'>**Create patterns**
We will now create a list of doc object as patterns

In [49]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [51]:
disable = nlp.select_pipes(disable=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner'])

In [52]:
# patterns = [nlp.make_doc(country) for country in COUNTRIES] # slower version
patterns = list(nlp.pipe(COUNTRIES))

In [62]:
patterns[0:10]

[Afghanistan,
 Åland Islands,
 Albania,
 Algeria,
 American Samoa,
 Andorra,
 Angola,
 Anguilla,
 Antarctica,
 Antigua and Barbuda]

In [54]:
# Patterns are doc objects not text
type(patterns[0])

spacy.tokens.doc.Doc

#### <font color = 'pickle'>**Add patterns to Phrase Matcher**

In [56]:
text = 'New Zealand defated Germany in rugby'
doc = nlp(text)
matcher = PhraseMatcher(nlp.vocab)
matcher.add('phrase-country', patterns)
matches = matcher(doc) 
for match_id, start, end in matches: 
    print(doc[start:end].text) 

New Zealand
Germany


Let us try a variation with lowercase and uppercase.

In [58]:
text = 'new zealand defated GERMANY in rugby.'
doc = nlp(text)
matcher = PhraseMatcher(nlp.vocab)
matcher.add('phrase-country', patterns)
matches = matcher(doc) 
for match_id, start, end in matches: 
    print(doc[start:end].text) 

We do not get any result because the patterns are case-sensitive: the country names in the patterns are capitalized (e.g. 'New Zealand', 'Germany'), while the text uses 'new zealand' and 'GERMANY'.

#### <font color = 'pickle'>**Use attributes in Phrase Matcher**

We can easily overcome the above issue by passing the attribute `LOWER` to our matcher (`attr='LOWER'`), so that matching is done on the lowercase form of each token.

In [63]:
text = 'new zealand defated GERMANY in rugby. Some other Variations iNDIA, united STATES OF America'
doc = nlp(text)
matcher = PhraseMatcher(nlp.vocab, attr = 'LOWER')
matcher.add('phrase-country', patterns)
matches = matcher(doc) 
for match_id, start, end in matches: 
    print(doc[start:end].text) 

new zealand
GERMANY
iNDIA
united STATES OF America


### <font color = 'pickle'>**Example2 - IP Addresses**

In [65]:
# With attr="SHAPE" the PhraseMatcher compares token shapes instead of the exact text,
# so the example addresses below act as shape templates.
matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
ip_adresses = ["197.1.1.1", "197.197.1.1"]
# NOTE: this rebinds `patterns`, which held the country patterns in the previous example
patterns = list(nlp.pipe(ip_adresses))
matcher.add("IpAddressess", patterns)

# The addresses in this text differ from the examples above but share their shapes
doc = nlp("The static IP adress for this facility are 127.3.4.1, 127.123.2.2")
for match_id, start, end in matcher(doc):
    print( doc[start:end].text)

127.3.4.1
127.123.2.2


## <font color = 'pickle'>**12. Use entities in Matcher**

In [90]:
nlp.pipe_names

[]

In [91]:
disabled.restore()

In [92]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [109]:
text = "I work at Apple. Apple and banana are my favorite fruits. My favorite fruit is apple."
doc = nlp(text)
[(entity.label_, entity.text) for entity in doc.ents ]

[('ORG', 'Apple'), ('ORG', 'Apple')]

In [103]:
matcher = Matcher(nlp.vocab) 

pattern = [{"ENT_TYPE": "ORG", "LOWER": "apple"} ]
matcher.add("entity", [pattern]) 
matches = matcher(doc) 
for match_id, start, end in matches: 
    print(doc[start:end].text)

apple
Apple


# <font color = 'pickle'>**Custom extensions for tokens**

In [66]:
# we need to import Token class to set custom extension
from spacy.tokens import Token
doc = nlp("My email is harpreet@utdallas.edu and my url is https://j.u.edu/faculty/hs.")

In [67]:
# Define an extension attribute on the token level with the name "clean" and a default value of False.
# force=True overwrites an existing extension of the same name, so this cell can be re-run.
Token.set_extension('clean', default=False, force=True)

In [68]:
# Print each token of the doc object together with the value stored in the extension attribute.
# All the values default to False
print(f'{"token.text":<27} : {"token._.clean"}')
for token in doc:
  print(f'{token.text:<27} : {token._.clean}')

token.text                  : token._.clean
My                          : False
email                       : False
is                          : False
harpreet@utdallas.edu       : False
and                         : False
my                          : False
url                         : False
is                          : False
https://j.u.edu/faculty/hs  : False
.                           : False


In [69]:
# Set the custom extension `clean` to True for every token that is not
# punctuation, a URL or an e-mail address
for token in doc:
  if not (token.is_punct or token.like_url or token.like_email):
    token._.set('clean', True)

In [70]:
# Printing the tokens again to see the modified values.
print(f'{"token.text":<27} : {"token._.clean"}')
for token in doc:
  print(f'{token.text:<27} : {token._.clean}')

token.text                  : token._.clean
My                          : True
email                       : True
is                          : True
harpreet@utdallas.edu       : False
and                         : True
my                          : True
url                         : True
is                          : True
https://j.u.edu/faculty/hs  : False
.                           : False


In [71]:
[token.text for token in doc if token._.clean]

['My', 'email', 'is', 'and', 'my', 'url', 'is']