In [None]:
import spacy

#Part-of-speech tagging

In [None]:
nlp=spacy.load("en_core_web_sm")
doc=nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text,token.lemma_,token.pos_,token.tag_,token.dep_,token.shape,token.is_alpha,token.is_stop)

Apple Apple PROPN NNP nsubj 16072095006890171862 True False
is be AUX VBZ aux 4370460163704169311 True True
looking look VERB VBG ROOT 13110060611322374290 True False
at at ADP IN prep 4370460163704169311 True True
buying buy VERB VBG pcomp 13110060611322374290 True False
U.K. U.K. PROPN NNP dobj 9346084826459880894 False False
startup startup NOUN NN dep 13110060611322374290 True False
for for ADP IN prep 4088098365541558500 True True
$ $ SYM $ quantmod 11283501755624150392 False False
1 1 NUM CD compound 8148669997605808657 False False
billion billion NUM CD pobj 13110060611322374290 True False


#Morphology


In [None]:
npl=spacy.load("en_core_web_sm")
print("pipeline:",nlp.pipe_names)
doc=npl("I was reading the paper")
token=doc[0]
print(token.morph)
print(token.morph.get("ProneType"))

pipeline: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
Case=Nom|Number=Sing|Person=1|PronType=Prs
[]


#Statistical morphology

In [None]:
!python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
nlp = spacy.load("de_core_news_sm")
doc = nlp("Wo bist du?")
print(doc[2].morph)
print(doc[2].pos_)

Case=Nom|Number=Sing|Person=2|PronType=Prs
PRON


#Rule-based morphology

In [None]:
nlp=spacy.load("en_core_web_sm")
doc=nlp("where are you?")
print(doc[2].morph)
print(doc[2].pos_)


Case=Nom|Person=2|PronType=Prs
PRON


#Lemmatization

In [None]:
nlp=spacy.load("en_core_web_sm")
lemmatizer=nlp.get_pipe("lemmatizer")
print(lemmatizer.mode)
doc=nlp("I was reading the paper")
print([token.lemma_ for token in doc])

rule
['I', 'be', 'read', 'the', 'paper']


In [None]:
# lookup lemmatizer
nlp=spacy.blank("sv") #Create a blank Swedish pipeline
nlp.add_pipe("lemmatizer",config={"mode":"lookup"}) # Add a lemmatizer in "lookup" mode

<spacy.pipeline.lemmatizer.Lemmatizer at 0x7ced4e0ad100>

In [None]:
# Rule-based lemmatizer Needs model
nlp=spacy.blank("de") # Create a blank German model
nlp.add_pipe("morphologizer")# Add morphologizer pipe
nlp.add_pipe("lemmatizer",config={"mode":"rule"}) # Add the lemmatizer in "rule" mode


<spacy.pipeline.lemmatizer.Lemmatizer at 0x7ced4d067d40>

In [None]:
# Trainable lemmatizer
nlp=spacy.blank("en")  # Create a blank English model
nlp.add_pipe("trainable_lemmatizer",name="lemmatizer") # Add trainable lemmatizer to the pipeline

<spacy.pipeline.edit_tree_lemmatizer.EditTreeLemmatizer at 0x7ced46b33400>

#Dependency Parsing Needs model

In [None]:
#Noun chunk
nlp=spacy.load("en_core_web_sm")
doc=nlp("Autonomes cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
  print(chunk.text,chunk.root.text,chunk.root.dep_,chunk.root.head.text)


Autonomes cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


In [None]:
# Navigating the parse tree
nlp=spacy.load("en_core_web_sm")
doc=nlp("Autonomes cars shift insurance liability toward manufacturers")
for token in doc:
  print(token.text,token.dep_,token.head.text,token.head.pos_,[child for child in token.children])

Autonomes compound cars NOUN []
cars nsubj shift VERB [Autonomes]
shift ROOT shift VERB [cars, liability, toward]
insurance compound liability NOUN []
liability dobj shift VERB [insurance]
toward prep shift VERB [manufacturers]
manufacturers pobj toward ADP []


In [None]:
# Finding Verbs with Subjects from above
from spacy.symbols import nsubj,VERB
nlp=spacy.load("en_core_web_sm")
doc=nlp("Autonomes cars shift insurance liability toward manufacturers")
verbs=set()
for possible_subject in doc:
  if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
    verbs.add(possible_subject.head)
print(verbs)

{shift}


In [None]:
# Finding a verb with a subject from below
verbs=[]
for possible_verb in doc:
  if possible_verb.pos == VERB:
    for possible_subject in possible_verb.children:
      if possible_subject.dep == nsubj:
        verbs.append(possible_verb)
print(verbs)

[shift]


In [None]:
# itrating around the local tree
nlp=spacy.load("en_core_web_sm")
doc=nlp("bright red apples on the tree")
print([token.text for token in doc[2].lefts])
print([token.text for token in doc[2].rights])
print(doc[2].n_lefts)
print(doc[2].n_rights)

['bright', 'red']
['on']
2
1


In [None]:
# Extracting Left and Right Dependencies in German
nlp=spacy.load("de_core_news_sm")
doc=nlp("schöne rote Äpfel auf dem Baum")
print([token.text for token in doc[2].lefts])
print([token.text for token in doc[2].rights])

['schöne', 'rote']
['auf']


In [None]:
# Merging a Span in English Text
nlp=spacy.load("en_core_web_sm")
doc=nlp("Credit and mortgage account holders must submit their requests")
span=doc[doc[4].left_edge.i:doc[4].right_edge.i+1]
with doc.retokenize() as retokenizer:
  retokenizer.merge(span)
for token in doc:
  print(token.text,token.pos_,token.dep_,token.head.text)

Credit and mortgage account holders NOUN nsubj submit
must AUX aux submit
submit VERB ROOT submit
their PRON poss requests
requests NOUN dobj submit


In [None]:
# Extracting MONEY Entities from Multiple Texts
nlp=spacy.load("en_core_web_sm")
nlp.add_pipe("merge_entities")
nlp.add_pipe("merge_noun_chunks")
doc=nlp("Credit and mortgage account holders must submit their requests")
TEXTS=[
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]
for doc in nlp.pipe(TEXTS):


  for token in doc:
    if token.ent_type_=="MONEY":
      if token.dep_ in ("attr","dobj"):
        subject=[w for w in token.head.lefts if w.dep_=="nsubj"]
        if subject:
          print(subject[0],"-->",token)
        elif token.dep_=="pobj" and token.head.dep_ == "pep":
          print(token.head.head,"-->",token)

Net income --> $9.4 million
Revenue --> twelve billion dollars


#Visualizing dependencies

In [None]:
from spacy import displacy
nlp=spacy.load("en_core_web_sm")
doc=nlp("Autonomous cars shift insurance liability towards manufactures")
displacy.render(doc,style="dep")

In [None]:
from spacy import displacy
nlp=spacy.load("en_core_web_sm")
doc=nlp("The lifeless body of Mukesh Chandrakar, a journalist from Maoist hotbed of Bijapur, was discovered from a septic tank on January 3, 2025. The 33-year-old had been missing since January 1 night. Known for his fearless reporting on local issues, including exposing corruption in government contracts, Mukesh's death has sent shockwaves through the region.")
displacy.render(doc,style="ent")

# Name entity recognition

In [None]:
# Name entity recognition
nlp=spacy.load("en_core_web_sm")
doc=nlp("Apple is looking at buying U.K. startup for $1 billion")
for ent in doc.ents:
  print(ent.text,ent.start_char,ent.end_char,ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [None]:
# Acessing entity annotations and labels
nlp=spacy.load("en_core_web_sm")
doc=nlp("San fransisco considers banning sidewalk delivery robots")
ents=[(e.text,e.start_char,e.end_char,e.label_) for e in doc.ents]
print("Before",ents)
ent_san=doc[0].text,doc[0].ent_iob,doc[0].ent_type_
ent_fran=doc[1].text,doc[1].ent_iob,doc[1].ent_type_
print(ent_san)
print(ent_fran)

Before [('San fransisco', 0, 13, 'GPE')]
('San', 3, 'GPE')
('fransisco', 1, 'GPE')


In [None]:
# Setting entity annotations
from spacy.tokens import Span
nlp=spacy.load("en_core_web_sm")
doc=nlp("fb is hiring a new VP of global policy")
ents=[(e.text,e.start_char,e.end_char,e.label_) for e in doc.ents]
print("Before",ents)
# create a span for the new entity
fb_ent=Span(doc,0,1,label="ORG")
orig_ents=list(doc.ents)
# Modify provider entity span
doc.set_ents([fb_ent], default="unmodified")
#  Assign a complete list of ents to doc.ents
doc.ents=orig_ents+[fb_ent]
ents=[(e.text,e.start_char,e.end_char,e.label_) for e in doc.ents]
print("After",ents)

Before []
After [('fb', 0, 2, 'ORG')]


In [None]:
# setting entity annotations from array
import numpy
from spacy.attrs import ENT_IOB,ENT_TYPE
nlp=spacy.load("en_core_web_sm")
doc=nlp.make_doc("London is a big city in the United Kingdom")
print("Before",doc.ents)

header=(ENT_IOB,ENT_TYPE)
attr_array=numpy.zeros((len(doc),len(header)),dtype="uint64")
attr_array[0,0]=3
attr_array[0,1]=doc.vocab.strings["GPE"]
doc.from_array(header,attr_array)
print("After",doc.ents)


Before ()
After (London,)


#Built-in entity types

In [None]:
# Visualizing named entities
from spacy import displacy
text="When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
nlp=spacy.load("en_core_web_sm")
doc=nlp(text)
displacy.serve(doc,style="ent")




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


#Tokenization

In [None]:
nlp=spacy.load("en_core_web_sm")
doc=nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
  print(token.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [None]:
# Adding special case tokenization rules
from spacy.symbols import ORTH
nlp=spacy.load("en_core_web_sm")
doc=nlp("gimme that")
print([w.text for w in doc])
# Add special case rule
special_case=[{ORTH:"gim"},{ORTH:"me"}]
nlp.tokenizer.add_special_case("gimme",special_case)
doc=nlp("gimme that")
print([w.text for w in nlp("gimme that")])

['gimme', 'that']
['gim', 'me', 'that']


In [None]:
# debugging the tokenizer
from spacy.lang.en import English
nlp=English()
text='''"Let's go!"'''
doc=nlp(text)
token_exp=nlp.tokenizer.explain(text)
for t in token_exp:
  print(t[1],"\t",t[0])

" 	 PREFIX
Let 	 SPECIAL-1
's 	 SPECIAL-2
go 	 TOKEN
! 	 SUFFIX
" 	 SUFFIX


In [None]:
# Customizing spaCy’s Tokenizer class

import re
from spacy.tokenizer import Tokenizer

special_cases={":)":[{ORTH:":)"}]}
prefix_re=re.compile(r'''^[\\[\\("']''')
suffix_re = re.compile(r'''[\]\)"']$''')
infix_re=re.compile(r'''[-~]''')
simple_url_re=re.compile(r'''^https?://''')
def custom_tokenizer(nlp):
  return Tokenizer(nlp.vocab,rules=special_cases,prefix_search=prefix_re.search,suffix_search=suffix_re.search,infix_finditer=infix_re.finditer,url_match=simple_url_re.match)
nlp=spacy.load("en_core_web_sm")
nlp.tokenizer=custom_tokenizer(nlp)
doc=nlp("hello-word.:)")
print([t.text for t in doc])

['hello', '-', 'word.:', ')']


In [None]:
import re
from spacy.lang.char_classes import ALPHA,ALPHA_LOWER,ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES,LIST_ELLIPSES,LIST_ICONS
from spacy.util import compile_infix_regex

nlp=spacy.load("en_core_web_sm")
doc=nlp("Mother-in-law")
print([t.text for t in doc])

# Modify tokenizer
infixes=(LIST_ELLIPSES+LIST_ICONS+
         [
             r"(?<=[0-9])[+\\-\\*^](?=[0-9-])",
             r"(?<=[{al}{q}])\\.(?=[{au}{q}])".format(
                 al=ALPHA_LOWER,au=ALPHA_UPPER,q=CONCAT_QUOTES
             ),
             r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
             r"(?<=[{a}0-9])[:<>/](?=[{a}])".format(a=ALPHA)
         ])
infixes_re=compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer=infixes_re.finditer
doc=nlp("Mother-in-law")
print([t.text for t in doc])


['Mother', '-', 'in', '-', 'law']
['Mother-in-law']


#Basic whitespace tokenizer


In [None]:
from string import whitespace
from spacy.tokens import Doc

class whitespacetokenizer:
  def __init__(self,vocab):
    self.vocab=vocab
  def __call__(self,text):
    words=text.split(" ")
    spaces=[True]*len(words)
    for i, word in enumerate(words):
      if word == "":
        words[i]=" "
        spaces[i]=False
    if words[-1]=="":
      words=words[:-1]
      spaces=spaces[:-1]
    else:
      spaces[-1]=False
    return Doc(self.vocab,words=words, spaces=spaces)
nlp=spacy.blank("en")
nlp.tokenizer=whitespacetokenizer(nlp.vocab)
doc=nlp("What's happend to me ? he thought.It wasn't a dream.")
print([token.text for token in doc])

["What's", 'happend', 'to', 'me', '?', 'he', 'thought.It', "wasn't", 'a', 'dream.']


In [None]:
!pip install transformers



#using Third-party tokenizers BERT word pieces

In [None]:
from tokenizers import BertWordPieceTokenizer
from spacy.tokens import Doc
import spacy

class BertTokenizer:
  def __init__(self,vocab,vocab_file,lowercase=True):
    self.vocab=vocab
    self.tokenizer=BertWordPieceTokenizer(vocab_file,lowercase=lowercase)
  def __call__(self,text):
    tokens=self.tokenizer.encode(text)
    word=[]
    space=[]
    for i,(text,(start,end)) in enumerate(zip(tokens.tokens,tokens.offsets)):
      word.append(text)
      if i < len(tokens.tokens)-1:
        next_start,next_end=tokens.offsets[i+1]
        space.append(next_start==start)
      else:
        space.append(False)
    return Doc(self.vocab,words=word,spaces=space)
nlp=spacy.blank("en")
nlp.tokenizer=BertTokenizer(nlp.vocab,"bert-base-uncased-vocab.txt")
doc=nlp("Justin Dew Biber is a Canadian singer,songwriter, and actor.")
print([token.text for token in doc])

['[CLS]', 'justin', 'dew', 'bi', '##ber', 'is', 'a', 'canadian', 'singer', ',', 'songwriter', ',', 'and', 'actor', '.', '[SEP]']


In [None]:
# using pre_tokenized text

from spacy.tokens import Doc
nlp=spacy.blank("en")
words=["Hello",",","World","!"]
spaces=[False,True,False,False]
doc=Doc(nlp.vocab,words=words,spaces=spaces)
print(doc.text)
print([(t.text,t.text_with_ws,t.whitespace_ )for t in doc])

Hello, World!
[('Hello', 'Hello', ''), (',', ', ', ' '), ('World', 'World', ''), ('!', '!', '')]


In [None]:
# Alligning tokenization

from spacy.training import Alignment
other_tokens=["i","listeended","to", "obama", "'", "s", "podcasts", "."]
spacy_tokens=["i","listeended","to", "obama", "'", "s", "podcasts", "."]
align=Alignment.from_strings(other_tokens,spacy_tokens)
print(f"a->b,lengths:{align.x2y.lengths}")
print(f"a->b,mapping:{align.x2y.data}")
print(f"a->b,lengths:{align.y2x.lengths}")
print(f"a->b,mapping:{align.y2x.data}")

a->b,lengths:[1 1 1 1 1 1 1 1]
a->b,mapping:[0 1 2 3 4 5 6 7]
a->b,lengths:[1 1 1 1 1 1 1 1]
a->b,mapping:[0 1 2 3 4 5 6 7]


#Merging and splitting

In [None]:
nlp=spacy.load("en_core_web_sm")
doc=nlp("I live in New York")
print("Before",[token.text for token in doc])
with doc.retokenize() as retokenizer:
  retokenizer.merge(doc[3:5],attrs={"LEMMA":"newyork"})
print("After",[token.text for token in doc])

Before ['I', 'live', 'in', 'New', 'York']
After ['I', 'live', 'in', 'New York']


#Splitting tokens

In [None]:
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I live in NewYork")
print("Before:", [token.text for token in doc])
displacy.render(doc)
with doc.retokenize() as retokenizer:
    heads = [(doc[3], 1), doc[2]]
    attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
    retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
print("After:", [token.text for token in doc])
displacy.render(doc)

Before: ['I', 'live', 'in', 'NewYork']


After: ['I', 'live', 'in', 'New', 'York']


#Sentence Segmentation

In [None]:
nlp=spacy.load("en_core_web_sm")
doc=nlp("This is a sentence.This is another sentence.")
assert doc.has_annotation("SENT_START")
for sent in doc.sents:
  print(sent.text)

This is a sentence.
This is another sentence.


#Sentence Segmentation using dependancy parse

In [None]:
nlp=spacy.load("en_core_web_sm")
doc=nlp("This is a sentence.This is another sentence.")
for sent in doc.sents:
  print(sent.text)

This is a sentence.
This is another sentence.


#Sentence Segmentation using Statistical sentence segmenter


In [None]:
nlp=spacy.load("en_core_web_sm")
nlp.enable_pipe("senter")
doc=nlp("This is a sentence.This is another sentence.")
for sent in doc.sents:
  print(sent.text)

This is a sentence.
This is another sentence.


#Sentence Segmentation using Rule based pipeline component

In [None]:
from spacy.lang.en import English
nlp=English()
nlp.add_pipe("sentencizer")
doc=nlp("This is a sentence.This is another sentence.")
for sent in doc.sents:
  print(sent.text)

This is a sentence.
This is another sentence.


#Sentence Segmentation using custom rule-based strategy

In [None]:
from spacy.language import Language
text= "this is a sentence...hello...and another sentence."
nlp=spacy.load("en_core_web_sm")
doc=nlp(text)
print("Before:",[sent.text for sent in doc.sents])

@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
  for token in doc[:-1]:
    if token.text == "...":
      doc[token.i+1].is_sent_start=True
  return doc

nlp.add_pipe("set_custom_boundaries",before="parser")
doc=nlp(text)
print("After:",[sent.text for sent in doc.sents])


Before: ['this is a sentence...hello...and another sentence.']
After: ['this is a sentence...', 'hello...', 'and another sentence.']


#mappings & exceptions

In [None]:
nlp=spacy.load("en_core_web_sm")
text="I saw The Who perform. Who did you see?"
doc1=nlp(text)
print(doc1[2].tag_,doc1[2].pos_)
print(doc1[3].tag_,doc1[3].pos_)

# add attribute ruler with exception
ruler=nlp.get_pipe("attribute_ruler")
# pattern to match (The Who)
patterns=[[{"LOWER":"the"},{"TEXT":"Who"}]]
# attributes to assign to the matched token
attrs={"TAG":"NNP","POS":"PROPN"}
# add rules to the attribute ruler
ruler.add(patterns=patterns,attrs=attrs,index=0)
ruler.add(patterns=patterns,attrs=attrs,index=1)

doc2=nlp(text)
print(doc2[2].tag_,doc2[2].pos_)
print(doc2[3].tag_,doc2[3].pos_)
# the second who remains unmodified
print(doc2[5].tag_,doc2[5].pos_)


DT DET
WP PRON
NNP PROPN
NNP PROPN
. PUNCT


#Word vectors and semantic similarity

In [None]:
nlp = spacy.load("en_core_web_sm")
tokens = nlp("dog cat banana afskfsd")
for token in tokens:
    print(f"text: {token.text}, has vector{token.has_vector}, Vector norm: {token.vector_norm}, Out of vocabulary (OOV): {token.is_oov}")


text: dog, has vectorTrue, Vector norm: 6.814785957336426, Out of vocabulary (OOV): True
text: cat, has vectorTrue, Vector norm: 7.370901584625244, Out of vocabulary (OOV): True
text: banana, has vectorTrue, Vector norm: 7.646069526672363, Out of vocabulary (OOV): True
text: afskfsd, has vectorTrue, Vector norm: 7.192255973815918, Out of vocabulary (OOV): True


In [None]:
nlp=spacy.load("en_core_web_sm")
doc1=nlp("I like salty fries and hamburgers")
doc2=nlp("Fast food tastes very good")
# similarity of both doc1 and doc2
print(doc1,"<->",doc2,doc1.similarity(doc2))

# similarity of tokens and span
french_fries=doc1[2:4]
burgers=doc1[5]
print(french_fries,"<->",burgers,french_fries.similarity(burgers))

I like salty fries and hamburgers <-> Fast food tastes very good 0.2457051288099938
salty fries <-> hamburgers 0.3522574305534363


  print(doc1,"<->",doc2,doc1.similarity(doc2))
  print(french_fries,"<->",burgers,french_fries.similarity(burgers))


#Creating a custom language subclass

In [None]:
from spacy.lang.en import English
class CustomEnglish(English.Defaults):
  stop_words=set(["custom","stop"])
class CustomEnglish(English):
  lang="custom_en"
  Defaults=CustomEnglish
nlp1=English()
nlp2=CustomEnglish()
print(nlp1.lang,[token.is_stop for token in nlp1("custom stop")])
print(nlp2.lang,[token.is_stop for token in nlp2("custom stop")])

en [False, False]
custom_en [True, True]


#Rule-based matching

#Token-based matching

In [None]:
from spacy.matcher import Matcher
nlp=spacy.load("en_core_web_sm")
matcher=Matcher(nlp.vocab)
# Define the pattern
pattern=[{"LOWER":"hello"},{"IS_PUNCT":True},{"LOWER":"world"}]
# Add pattern to the matcher
matcher.add("HelloWorld",[pattern])
doc=nlp("Hello, world! Hello world!")
matches=matcher(doc)
for match_id,start,end, in matches:
  string_id=nlp.vocab.strings[match_id]
  span=doc[start:end]
  print(match_id,string_id,start,end,span.text)

15578876784678163569 HelloWorld 0 3 Hello, world


#Matching regular expressions on the full text

In [None]:
import re
nlp=spacy.load("en_core_web_sm")
doc=nlp("The United States of America (USA) are commonly known as the United States (U.S. or US) or America.")
expression=r"[Uu](nited|\.?) ?[Ss](tates|\.?)"
for match in re.finditer(expression,doc.text):
  start,end=match.span()
  span=doc.char_span(start,end)
  if span is not None:
    print("Found match:",span.text)

Found match: United States
Found match: United States
Found match: U.S.
Found match: US


# Adding on_match rules

In [None]:
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Span
nlp=English()
matcher=Matcher(nlp.vocab)
def add_event_ent(matcher,doc,i,matches):
  match_id,start,end=matches[i]
  entity=Span(doc,start,end,label="EVENT")
  doc.ents+=(entity,)
  print( entity.text)
pattern=[{"ORTH":"Google"},{"ORTH":"I"},{"ORTH":"/"},{"ORTH":"O"}]
matcher.add("GoogleIO",[pattern],on_match=add_event_ent)
doc=nlp("This is a text about Google I/O")
matches=matcher(doc)

Google I/O


#Creating spans from matches

In [None]:
from spacy.matcher import Matcher
from spacy.tokens import Span
nlp=spacy.blank("en")
matcher=Matcher(nlp.vocab)
matcher.add("person",[[{"lower":"barak"},{"lower":"obama"}]])
doc=nlp("Barak Obama was the president of United States")
matches=matcher(doc)
for match_id,start,end in matches:
  span=Span(doc,start,end,label=match_id)
  print(span.text,span.label_)
matches=matcher(doc,as_spans=True)
for span in matches:
  print(span.text,span.label_)

Barak Obama person
Barak Obama person


#using custom pipeline components

In [None]:
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Token
@Language.factory("html_merger")
def create_bad_html_merger(nlp, name):
    return BadHTMLMerger(nlp.vocab)
class BadHTMLMerger:
    def __init__(self, vocab):
        patterns = [
            [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
            [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
        ]
        Token.set_extension("bad_html", default=False)
        self.matcher = Matcher(vocab)
        self.matcher.add("BAD_HTML", patterns)
    def __call__(self, doc):
        matches = self.matcher(doc)
        spans = []
        for match_id, start, end in matches:
            spans.append(doc[start:end])
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
                for token in span:
                    token._.bad_html = True
        return doc
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("html_merger", last=True)
doc = nlp("Hello<br>world! <br/> This is a test.")
for token in doc:
    print(token.text, token._.bad_html)



Hello False
<br> True
world False
! False
<br/> True
This False
is False
a False
test False
. False


#using linguistic annotation

In [None]:
from spacy import displacy
from spacy.matcher import Matcher
nlp=spacy.load("en_core_web_sm")
matcher=Matcher(nlp.vocab)
matcher_sents=[]
def collect_sents(matcher,doc,i,matches):
  match_id,start,end=matches[i]
  span=doc[start:end]
  sent=span.sent
  match_ents=[{"start":span.start_char-sent.start_char,
               "end":span.end_char-sent.start_char,
               "label":"MATCH",}]
  matcher_sents.append(({"text":sent.text,"ents":match_ents}))
pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
           {"POS": "ADJ"}]
matcher.add("FacebookIs",[pattern],on_match=collect_sents)
doc=nlp("I'd say that Facebook is evil. - Facebook is pretty cool, right?")
matches=matcher(doc)
displacy.render(matcher_sents,style="ent",manual=True)

# Detect phone numbers in a specific format

In [None]:
from spacy.matcher import Matcher

nlp=spacy.load("en_core_web_sm")
matcher=Matcher(nlp.vocab)
pattern=[{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"},
           {"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}]
matcher.add("PHONE_NUMBER",[pattern])
doc=nlp("Call me at (123) 456 789 or (123) 456-789!")
print([t.text for t in doc])
matches=matcher(doc)
for match_id,start,end in matches:
  span=doc[start:end]
  print(span.text)

['Call', 'me', 'at', '(', '123', ')', '456', '789', 'or', '(', '123', ')', '456', '-', '789', '!']
(123) 456 789
(123) 456-789


# Hashtags and emoji on social media

In [None]:
from spacy.lang.en import English
from spacy.matcher import Matcher
nlp=English()
matcher=Matcher(nlp.vocab)
pos_emoji=["😀","😃","😂","🤣","😊","😍"]
neg_emoji=["😞","😠","😩","😢","😭","😒"]
pos_patterns=[[{"ORTH":emoji}] for emoji in pos_emoji]
neg_patterns=[[{"ORTH":emoji}] for emoji in neg_emoji]

def lable_sentiment(matcher,doc,i,matches):
  match_id,start,end=matches[i]
  if doc.vocab.strings[match_id]=="HAPPY":
    doc.sentiment+=0.1
  elif doc.vocab.strings[match_id]=="SAD":
    doc.sentiment-=0.1
matcher.add("HAPPY",pos_patterns,on_match=lable_sentiment)
matcher.add("SAD",neg_patterns,on_match=lable_sentiment)
matcher.add("HASHTAG",[[{"ORTH":"#"},{"IS_ASCII":True}]])
doc=nlp("Hello world 😀 #MondayMotivation")
matches=matcher(doc)
for match_id,start,end in matches:
  string_id=doc.vocab.strings[match_id]
  span=doc[start:end]
  print(string_id,span.text)

HAPPY 😀
HASHTAG #MondayMotivation


#Phrase Matcher

#Efficient phrase matching

In [None]:
from spacy.matcher import PhraseMatcher
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", patterns)
doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
          "converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

Angela Merkel
Barack Obama
Washington, D.C.


# Dependency Matcher


In [None]:
from spacy.matcher import DependencyMatcher
nlp=spacy.load("en_core_web_sm")
matcher=DependencyMatcher(nlp.vocab)
pattern=[
  {
    "RIGHT_ID": "anchor_founded",
    "RIGHT_ATTRS": {"ORTH": "founded"}
  }
]
matcher.add("FOUNDED",[pattern])
doc=nlp("Smith founded two companies.")
matches=matcher(doc)
print(matches)

[(4851363122962674176, [1])]


In [None]:
from spacy.matcher import DependencyMatcher
nlp=spacy.load("en_core_web_sm")
matcher=DependencyMatcher(nlp.vocab)
pattern = [
    {
        "RIGHT_ID": "anchor_founded",
        "RIGHT_ATTRS": {"ORTH": "founded"}
    },
    {
        "LEFT_ID": "anchor_founded",
        "REL_OP": ">",
        "RIGHT_ID": "founded_subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    },
    {
        "LEFT_ID": "anchor_founded",
        "REL_OP": ">",
        "RIGHT_ID": "founded_object",
        "RIGHT_ATTRS": {"DEP": "dobj"},
    },
    {
        "LEFT_ID": "founded_object",
        "REL_OP": ">",
        "RIGHT_ID": "founded_object_modifier",
        "RIGHT_ATTRS": {"DEP": {"IN": ["amod", "compound"]}},
    }
]
matcher.add("FOUNDED",[pattern])
doc=nlp("Lee, an experienced CEO, has founded two AI startups.")
matches=matcher(doc)
print(matches)
match_id,token_ids=matches[0]
for i in range(len(token_ids)):
  print(pattern[i]["RIGHT_ID"]+":",doc[token_ids[i]].text)

[(4851363122962674176, [7, 0, 10, 9])]
anchor_founded: founded
founded_subject: Lee
founded_object: startups
founded_object_modifier: AI


#Entity Ruler

In [None]:
from spacy.lang.en import English
nlp=English()
ruler=nlp.add_pipe("entity_ruler")
patterns=[{"label": "ORG", "pattern": "Apple"},{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]}]
ruler.add_patterns(patterns)
doc=nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text,ent.label_) for ent in doc.ents])

[('Apple', 'ORG'), ('San Francisco', 'GPE')]


In [None]:
# Adding Ids to patterns
from spacy.lang.en import English
nlp=English()
ruler=nlp.add_pipe("entity_ruler")
patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}]
ruler.add_patterns(patterns)
doc1=nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text,ent.label_) for ent in doc1.ents])
doc2=nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text,ent.label_) for ent in doc2.ents])

[('Apple', 'ORG'), ('San Francisco', 'GPE')]
[('Apple', 'ORG'), ('San Francisco', 'GPE')]


#Span Ruler

In [None]:
nlp=spacy.blank("en")
ruler=nlp.add_pipe("span_ruler")
patterns = [{"label": "ORG", "pattern": "Apple"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]}]
ruler.add_patterns(patterns)
doc=nlp("Apple is opening its first big office in San Francisco.")
print([(span.text,span.label_) for span in doc.spans["ruler"]])

[('Apple', 'ORG'), ('San Francisco', 'GPE')]


In [None]:
nlp=spacy.load("en_core_web_sm")
config={"spans_key":None,"annotate_ents":True,"overwrite":False}
ruler=nlp.add_pipe("span_ruler",config=config)
patterns = [{"label": "ORG", "pattern": "MyCorp Inc."}]
ruler.add_patterns(patterns)
doc=nlp("MyCorp Inc. is a company in the U.S.")
print([(ent.text,ent.label_) for ent in doc.ents])

[('MyCorp Inc.', 'ORG'), ('U.S.', 'GPE')]


#Combining models and rules

In [None]:
# Expanding named entities
from spacy.language import Language
from spacy.tokens import Span
nlp=spacy.load("en_core_web_sm")
@Language.component("expand_person_entities")
def expand_person_entities(doc):
  new_ents=[]
  for ent in doc.ents:
    if ent.label_=="PERSON" and ent.start!=0:
      prev_token=doc[ent.start-1]
      if prev_token.text in ("Dr","Dr.","Mr","Mr.","Ms","Ms."):
        new_ent=Span(doc,ent.start-1,ent.end,label=ent.label)
        new_ents.append(new_ent)
      else:
        new_ents.append(ent)
    else:
      new_ents.append(ent)
  doc.ents=new_ents
  return doc
nlp.add_pipe("expand_person_entities",after="ner")
doc=nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.")
print([(ent.text,ent.label_) for ent in doc.ents])



[('Dr. Alex Smith', 'PERSON'), ('first', 'ORDINAL'), ('Acme Corp Inc.', 'ORG')]


In [None]:
from spacy.tokens import span
nlp=spacy.load("en_core_web_sm")
def get_person_title(span):
  if span.label_=="PERSON"and span.start!=0:
    prev_token=span.doc[span.start-1]
    if prev_token.text in ("Dr","Dr.","Mr","Mr.","Ms","Ms."):
      return prev_token.text
Span.set_extension("person_title",getter=get_person_title)
doc=nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.")
print([(ent.text,ent.label_,ent._.person_title) for ent in doc.ents])

[('Alex Smith', 'PERSON', 'Dr.'), ('first', 'ORDINAL', None), ('Acme Corp Inc.', 'ORG', None)]


# Using entities, part-of-speech tags and the dependency parse

In [None]:
from spacy.language import Language
from spacy import displacy
nlp=spacy.load("en_core_web_sm")
@Language.component("extract_person_orgs")
def extract_person_orgs(doc):
  person_entities=[ent for ent in doc.ents if ent.label_=="PERSON"]
  for ent in person_entities:
    head=ent.root.head
    if head.lemma_=="work":
      preps=[token for token in head.children if token.dep_=="prep"]
      for prep in preps:
        orgs=[token for token in prep.children if token.ent_type_=="ORG"]
        print({'person': ent, 'orgs': orgs, 'past': head.tag_ == "VBD"})
  return doc
nlp.add_pipe("merge_entities")
nlp.add_pipe("extract_person_orgs")
doc=nlp("Alex Smith worked at Acme Corp Inc.")
displacy.render(doc,options={"fine_grained":True})

{'person': Alex Smith, 'orgs': [Acme Corp Inc.], 'past': True}
