# RegEx in Spacy

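- A token-level `REGEX` pattern is tested against one token at a time, so it cannot match text that the tokenizer splits into several tokens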

In [5]:
import spacy
# Tokenise a sentence containing a phone-style number and show one token per line,
# to see how the tokenizer splits "555-5555".
text = "This is a sample number 555-5555."
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
print("\n".join(tok.text for tok in doc))

This
is
a
sample
number
555
-
5555
.


In [2]:
import spacy
# Attempt to tag the whole phone number with a single token-level REGEX pattern.
text = "This is a sample number 555-5555."
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
patterns = [
    {
        "label": "PHONE_NUMBER",
        # Raw string: `\d` must reach the regex engine untouched; in a normal
        # string literal it is an invalid escape sequence and triggers a warning.
        # The pattern holds a single token dict, so the regex is applied to one
        # token's text only.
        "pattern": [{"TEXT": {"REGEX": r"\d{3}-\d{4}"}}],
    }
]
ruler.add_patterns(patterns)
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

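This prints nothing: a `REGEX` pattern is tested against one token at a time, and the tokenizer splits `555-5555` into three tokens (`555`, `-`, `5555`), so no single token contains the whole number. Trying a pattern that fits within a single token instead: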

In [4]:
import spacy
# Same setup, but with a regex that fits inside one token: "5555" is a token on
# its own, so this pattern can match it.
text = "This is a sample number 555-5555."
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
patterns = [
    {
        "label": "PHONE_NUMBER",
        # Raw string: `\d` must reach the regex engine untouched; in a normal
        # string literal it is an invalid escape sequence and triggers a warning.
        "pattern": [{"TEXT": {"REGEX": r"\d{4}"}}],
    }
]
ruler.add_patterns(patterns)
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

5555 PHONE_NUMBER


Matching the original multi-token string (`555-5555`) by describing each of its tokens in turn:

In [1]:
import spacy
text = "This is a sample number 555-5555."
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")

# One dict per token, in order: three digits, an optional hyphen, four digits.
phone_number_tokens = [
    {"SHAPE": "ddd"},
    {"ORTH": "-", "OP": "?"},
    {"SHAPE": "dddd"},
]
patterns = [{"label": "PHONE_NUMBER", "pattern": phone_number_tokens}]
ruler.add_patterns(patterns)

doc = nlp(text)
for entity in doc.ents:
    print(entity.text, entity.label_)

555-5555 PHONE_NUMBER


## Extracting Multi Word Tokens

In [6]:
import re

In [7]:
# Sample text: two "Paul <Surname>" mentions plus one bare "Paul".
text = (
    "Paul Newman was an American actor, "
    "but Paul Hollywood is a British TV Host. "
    "The name Paul is quite common."
)

In [8]:
pattern = r"Paul [A-Z]\w+" # "Paul", a space, one capital letter, then one or more word characters

In [9]:
# Materialise the hits as a list: `re.finditer` returns a one-shot iterator, so a
# cell that loops over it would print nothing if it were run a second time.
matches = list(re.finditer(pattern, text))

In [11]:
# Each hit is an `re.Match`; its repr shows the character span and the matched text.
for hit in matches:
    print(hit)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


### Implementing in a custom Spacy Pipe

In [12]:
import spacy
from spacy.tokens import Span

In [13]:
# Blank English pipeline: nothing here sets doc.ents before our own code does.
nlp = spacy.blank("en")

In [15]:
doc = nlp(text)
# Map each regex hit (character offsets into doc.text) onto a spaCy Span and show it.
for match in re.finditer(pattern, doc.text):
    start, end = match.span()
    span = doc.char_span(start, end)
    print(span)

Paul Newman
Paul Hollywood


In [16]:
doc = nlp(text)
original_ents = list(doc.ents)
# Lazily turn every regex hit into a Span; char_span may hand back None, which is dropped below.
candidate_spans = (doc.char_span(*hit.span()) for hit in re.finditer(pattern, doc.text))
# Keep the token start/end indices and the matched text of each usable span.
mwt_ents = [(span.start, span.end, span.text) for span in candidate_spans if span is not None]
print(mwt_ents)

[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]


Adding the spans we found to `doc.ents`

In [19]:
doc = nlp(text)
original_ents = list(doc.ents)
# Build each labelled span straight from the match offsets and append it to the
# entities already present on the doc.
for hit in re.finditer(pattern, doc.text):
    person = doc.char_span(*hit.span(), label="PERSON")
    if person is not None:
        original_ents.append(person)
doc.ents = original_ents
print(doc.ents)

(Paul Newman, Paul Hollywood)


In [23]:
from spacy.language import Language

@Language.component("paul_ner")
def paul_ner(doc):
    """Label every ``Paul <Capitalised word>`` regex match in ``doc`` as PERSON.

    The regex runs over ``doc.text``. Each match's character offsets are mapped
    to a span with ``doc.char_span``; matches for which that returns ``None``
    are skipped. The new spans are appended to the entities already on the doc.

    Args:
        doc: The spaCy ``Doc`` handed along the pipeline.

    Returns:
        The same ``doc``, with ``doc.ents`` reassigned.
    """
    name_regex = r"Paul [A-Z]\w+"
    entities = list(doc.ents)
    for hit in re.finditer(name_regex, doc.text):
        person = doc.char_span(*hit.span(), label="PERSON")
        if person is not None:
            entities.append(person)
    doc.ents = entities
    return doc

In [24]:
nlp2 = spacy.blank('en')
# add_pipe returns the component it added, hence the function repr echoed as this cell's output.
nlp2.add_pipe("paul_ner")

<function __main__.paul_ner(doc)>

In [25]:
# Check which components the pipeline now runs.
nlp2.pipe_names

['paul_ner']

In [26]:
# Run the sample text through the pipeline; paul_ner assigns doc.ents.
doc2 = nlp2(text)

In [27]:
# Entities set by the custom component.
print(doc2.ents)

(Paul Newman, Paul Hollywood)


In [28]:
from spacy.language import Language

@Language.component("cinema_ner")
def cinema_ner(doc):
    """Label every occurrence of ``Hollywood`` in ``doc`` as a CINEMA entity.

    The new spans are appended to whatever entities ``doc`` already carries and
    the combined list is assigned to ``doc.ents`` as-is. No overlap handling is
    done here, so the assignment raises ``ValueError`` (E1010) when a matched
    token is already part of another entity.

    Args:
        doc: The spaCy ``Doc`` handed along the pipeline.

    Returns:
        The same ``doc``, with ``doc.ents`` reassigned.
    """
    cinema_regex = r"Hollywood"
    entities = list(doc.ents)
    for hit in re.finditer(cinema_regex, doc.text):
        cinema = doc.char_span(*hit.span(), label="CINEMA")
        if cinema is not None:
            entities.append(cinema)
    doc.ents = entities
    return doc

In [29]:
# Pretrained pipeline this time, with the custom component added on top of it.
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipe("cinema_ner")

<function __main__.cinema_ner(doc)>

In [30]:
# Expected to fail: cinema_ner assigns entities that overlap existing ones.
# The error is caught and printed so the demonstration stays visible without
# stopping a "Restart & Run All".
try:
    doc3 = nlp3(text)
except ValueError as err:
    print("ValueError:", err)

ValueError: [E1010] Unable to set entity information for token 9 which is included in more than one span in entities, blocked, missing or outside.

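This is because we have overlapping entities with different labels: the new `CINEMA` span for "Hollywood" (token 9) overlaps the "Paul Hollywood" entity already set by the pretrained `ner` component, and a token can belong to only one entity in `doc.ents`.  
Fix: resolve the overlaps with `filter_spans` before assigning `doc.ents`: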

In [41]:
from spacy.language import Language
from spacy.util import filter_spans

@Language.component("cinema_ner_fixed")
def cinema_ner_fixed(doc):
    """Label ``Hollywood`` as CINEMA, resolving overlaps with existing entities.

    Works like ``cinema_ner`` but passes the combined entity list through
    ``filter_spans`` before assigning it, so overlapping spans no longer make
    the ``doc.ents`` assignment fail.

    Args:
        doc: The spaCy ``Doc`` handed along the pipeline.

    Returns:
        The same ``doc``, with ``doc.ents`` set to the filtered spans.
    """
    cinema_regex = r"Hollywood"
    entities = list(doc.ents)
    for hit in re.finditer(cinema_regex, doc.text):
        cinema = doc.char_span(*hit.span(), label="CINEMA")
        if cinema is not None:
            entities.append(cinema)
    # Where spans overlap, filter_spans keeps the longest span (not the longest label).
    doc.ents = filter_spans(entities)
    return doc

In [42]:
# Reload the pretrained pipeline so it carries only the fixed component, not the failing one.
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipe("cinema_ner_fixed")

<function __main__.cinema_ner_fixed(doc)>

In [43]:
# The custom component should be listed after the pretrained `ner` step.
nlp3.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'cinema_ner_fixed']

In [44]:
# Same text as before; with the fixed component this no longer raises.
doc3 = nlp3(text)

In [45]:
# No standalone "Hollywood" entity is expected: its CINEMA span overlaps the longer
# "Paul Hollywood" span, and filter_spans keeps the longer one.
print(doc3.ents)

(Paul Newman, American, Paul Hollywood, British)
