In [1]:
import spacy
import numpy as np
import pandas as pd
from spacy import displacy


nlp = spacy.load("en_core_web_sm")
print("Pipeline:", nlp.pipe_names)

  from .autonotebook import tqdm as notebook_tqdm


Pipeline: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


# Part-of-speech tagging

    Text: The original word text.
    
    Lemma: The base form of the word.
    
    POS: The simple UPOS part-of-speech tag.
    
    Tag: The detailed part-of-speech tag.
    
    Dep: Syntactic dependency, i.e. the relation between tokens.
    
    Shape: The word shape – capitalization, punctuation, digits.
    
    is alpha: Is the token an alpha character?
    
    is stop: Is the token part of a stop list (most common words)
    
    

In [24]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

data = []
column_names = ["Text","Lemma", "POS", "Tag", "Dep","Shape","Is_Alpha","Is_Stop"]

for token in doc:
    data.append([token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop])
    
df = pd.DataFrame(data, columns=column_names)
df


Unnamed: 0,Text,Lemma,POS,Tag,Dep,Shape,Is_Alpha,Is_Stop
0,Apple,Apple,PROPN,NNP,nsubj,Xxxxx,True,False
1,is,be,AUX,VBZ,aux,xx,True,True
2,looking,look,VERB,VBG,ROOT,xxxx,True,False
3,at,at,ADP,IN,prep,xx,True,True
4,buying,buy,VERB,VBG,pcomp,xxxx,True,False
5,U.K.,U.K.,PROPN,NNP,dobj,X.X.,False,False
6,startup,startup,NOUN,NN,dep,xxxx,True,False
7,for,for,ADP,IN,prep,xxx,True,True
8,$,$,SYM,$,quantmod,$,False,False
9,1,1,NUM,CD,compound,d,False,False


In [27]:
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

In [4]:
displacy.render(doc, style='ent', jupyter=True, options={'distance': 30})

In [5]:
displacy.render(doc, style='span', jupyter=True, options={'distance': 30})


Available keys: []


# Morphology

We say that a lemma (root form) is inflected (modified/combined) with one or more morphological features to create a surface form



| CONTEXT                |	SURFACE	 | LEMMA  | POS   | MORPHOLOGICAL FEATURES |
|------------------------|-----------|--------|-------|------------------------|
|I was reading the paper | reading   |	read  |	VERB  |	VerbForm=Ger           |
|------------------------------------------------------------------------------|
|I don’t watch the news, |   read    |  read  | VERB  | VerbForm=Fin,          |
|I read the paper 	     |	    	 |        |	      | Mood=Ind, Tense=Pres   |
|------------------------------------------------------------------------------|
|I read the paper        |   read    |   read | VERB  | VerbForm=Fin,          |
|yesterday	             |      	 |   	  |       |	Mood=Ind,Tense=Past    |

    


In [6]:
doc = nlp("I was reading the paper.")
token = doc[0]  # 'I'
print(token.morph.to_dict())  # 'Case=Nom|Number=Sing|Person=1|PronType=Prs'
print(token.morph.get("PronType"))  # ['Prs']


{'Case': 'Nom', 'Number': 'Sing', 'Person': '1', 'PronType': 'Prs'}
['Prs']


## Statistical morphology 

spaCy’s statistical Morphologizer component assigns the morphological features and coarse-grained part-of-speech tags as Token.morph and Token.pos.

In [7]:
doc = nlp("Wo bist du?") # English: 'Where are you?'
print(doc[2].morph)  # 'Case=Nom|Number=Sing|Person=2|PronType=Prs'
print(doc[2].pos_) # 'PRON'

Number=Sing
PROPN


## Rule-based morphology

For languages with relatively simple morphological systems like English, spaCy can assign morphological features through a rule-based approach, which uses the token text and fine-grained part-of-speech tags to produce coarse-grained part-of-speech tags and morphological features.

The part-of-speech tagger assigns each token a fine-grained part-of-speech tag. In the API, these tags are known as Token.tag. They express the part-of-speech (e.g. verb) and some amount of morphological information, e.g. that the verb is past tense (e.g. VBD for a past tense verb in the Penn Treebank) .
For words whose coarse-grained POS is not set by a prior process, a mapping table maps the fine-grained tags to a coarse-grained POS tags and morphological features.

In [8]:
doc = nlp("Where are you?")
print(doc[2].morph)  # 'Case=Nom|Person=2|PronType=Prs'
print(doc[2].pos_)  # 'PRON'

Case=Nom|Person=2|PronType=Prs
PRON


# Lemmatization

The EditTreeLemmatizer component provides a trainable lemmatizer.


### Rule Based Lemmatization

In [11]:
lemmatizer = nlp.get_pipe("lemmatizer")
print("lemmatization mode : ",lemmatizer.mode)  # 'rule'

doc = nlp("I was reading the paper.")
print([token.lemma_ for token in doc])
# ['I', 'be', 'read', 'the', 'paper', '.']


lemmatization mode :  rule
['I', 'be', 'read', 'the', 'paper', '.']


## Lookup lemmatizer

For pipelines without a tagger or morphologizer, a lookup lemmatizer can be added to the pipeline as long as a lookup table is provided, typically through spacy-lookups-data. The lookup lemmatizer looks up the token surface form in the lookup table without reference to the token’s part-of-speech or context.



            # pip install -U spacy[lookups]
            import spacy
            
            nlp = spacy.blank("sv")
            nlp.add_pipe("lemmatizer", config={"mode": "lookup"})

 ## Rule-based lemmatizer

When training pipelines that include a component that assigns part-of-speech tags (a morphologizer or a tagger with a POS mapping), a rule-based lemmatizer can be added using rule tables from spacy-lookups-data

            # pip install -U spacy[lookups]
            import spacy
            
            nlp = spacy.blank("de")
            # Morphologizer (note: model is not yet trained!)
            nlp.add_pipe("morphologizer")
            # Rule-based lemmatizer
            nlp.add_pipe("lemmatizer", config={"mode": "rule"})
            

# Dependency Parsing

he parser also powers the sentence boundary detection, and lets you iterate over base noun phrases, or “chunks”.

### Noun chunks

Noun chunks are “base noun phrases” – flat phrases that have a noun as their head. You can think of noun chunks as a noun plus the words describing the noun – for example, “the lavish green grass” or “the world’s largest tech fund”. 

        Text: The original noun chunk text.
        
        Root text: The original text of the word connecting the noun chunk to the rest of the parse.
        
        Root dep: Dependency relation connecting the root to its head.
            
        Root head text: The text of the root token’s head.


In [18]:
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
data = []
for chunk in doc.noun_chunks:
    data.append([chunk.text, chunk.root.text, chunk.root.dep_,chunk.root.head.text])

df = pd.DataFrame(data,columns=["Text","Root_Text","Root_Dep","Root_head_text"])
df

Unnamed: 0,Text,Root_Text,Root_Dep,Root_head_text
0,Autonomous cars,cars,nsubj,shift
1,insurance liability,liability,dobj,shift
2,manufacturers,manufacturers,pobj,toward


## Navigating the parse tree

spaCy uses the terms head and child to describe the words connected by a single arc in the dependency tree. The term dep is used for the arc label, which describes the type of syntactic relation that connects the child to the head. As with other attributes, the value of .dep is a hash value. You can get the string value with .dep_.

In [19]:
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
data = []
for token in doc:
    data.append([token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children]])

df = pd.DataFrame(data,columns=["Text","Dep","Head_Text","Head_Pos","Children"])
df

Unnamed: 0,Text,Dep,Head_Text,Head_Pos,Children
0,Autonomous,amod,cars,NOUN,[]
1,cars,nsubj,shift,VERB,[Autonomous]
2,shift,ROOT,shift,VERB,"[cars, liability, toward]"
3,insurance,compound,liability,NOUN,[]
4,liability,dobj,shift,VERB,[insurance]
5,toward,prep,shift,VERB,[manufacturers]
6,manufacturers,pobj,toward,ADP,[]


In [22]:
displacy.render(doc, style='dep', jupyter=True, options={'distance': 100})

# Named Entity Recognition

    Text: The original entity text.
    
    Start: Index of start of entity in the Doc.
    
    End: Index of end of entity in the Doc.
    
    Label: Entity label, i.e. type.



In [35]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion. When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously.")
data = []
for ent in doc.ents:
    data.append([ent.text, ent.start_char, ent.end_char, ent.label_])

df = pd.DataFrame(data,columns=["Text","Start_char","End_char","Label"])
df


Unnamed: 0,Text,Start_char,End_char,Label
0,Apple,0,5,ORG
1,U.K.,27,31,GPE
2,$1 billion,44,54,MONEY
3,Sebastian Thrun,61,76,PERSON
4,Google,117,123,ORG
5,2007,127,131,DATE


In [36]:
displacy.render(doc, style='ent', jupyter=True, options={'distance': 100})

## Entity Linking

To ground the named entities into the “real world”, spaCy provides functionality to perform entity linking, which resolves a textual entity to a unique identifier from a knowledge base (KB). You can create your own KnowledgeBase and train a new EntityLinker using that custom knowledge base.

Accessing entity identifiers NEEDS MODEL
The annotated KB identifier is accessible as either a hash value or as a string, using the attributes ent.kb_id and ent.kb_id_ of a Span object, or the ent_kb_id and ent_kb_id_ attributes of a Token object.

In [37]:
# nlp = spacy.load("my_custom_el_pipeline")

doc = nlp("Ada Lovelace was born in London")

# Document level
ents = [(e.text, e.label_, e.kb_id_) for e in doc.ents]
print(ents)  # [('Ada Lovelace', 'PERSON', 'Q7259'), ('London', 'GPE', 'Q84')]

# Token level
ent_ada_0 = [doc[0].text, doc[0].ent_type_, doc[0].ent_kb_id_]
ent_ada_1 = [doc[1].text, doc[1].ent_type_, doc[1].ent_kb_id_]
ent_london_5 = [doc[5].text, doc[5].ent_type_, doc[5].ent_kb_id_]
print(ent_ada_0)  # ['Ada', 'PERSON', 'Q7259']
print(ent_ada_1)  # ['Lovelace', 'PERSON', 'Q7259']
print(ent_london_5)  # ['London', 'GPE', 'Q84']


[('Ada Lovelace', 'PERSON', ''), ('London', 'GPE', '')]
['Ada', 'PERSON', '']
['Lovelace', 'PERSON', '']
['London', 'GPE', '']


# Tokenization

Tokenizer exception: Special-case rule to split a string into several tokens or prevent a token from being split when punctuation rules are applied.

    Prefix: Character(s) at the beginning, e.g. $, (, “, ¿.
    
    Suffix: Character(s) at the end, e.g. km, ), ”, !.
    
    Infix: Character(s) in between, e.g. -, --, /, ….

In [38]:
from spacy.symbols import ORTH


doc = nlp("gimme that")  # phrase to tokenize
print([w.text for w in doc])  # ['gimme', 'that']

# Add special case rule
special_case = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)

# Check new tokenization
print([w.text for w in nlp("gimme that")])  # ['gim', 'me', 'that']


['gimme', 'that']
['gim', 'me', 'that']


In [54]:
import itertools
text = "Apple is looking at buying U.K. startup for $1 billion"
orginal = text.split()
doc = nlp(text)

data = []
for token in doc:
    data.append(token.text)
    
nested = [orginal,data]
df = pd.DataFrame((_ for _ in itertools.zip_longest(*nested)), columns=["Text","Tokens"])
df

Unnamed: 0,Text,Tokens
0,Apple,Apple
1,is,is
2,looking,looking
3,at,at
4,buying,buying
5,U.K.,U.K.
6,startup,startup
7,for,for
8,$1,$
9,billion,1


In [48]:
data

['Apple',
 'is',
 'looking',
 'at',
 'buying',
 'U.K.',
 'startup',
 'for',
 '$',
 '1',
 'billion']

## Debugging the tokenizer
A working implementation of the pseudo-code above is available for debugging as nlp.tokenizer.explain(text). It returns a list of tuples showing which tokenizer rule or pattern was matched for each token. The tokens produced are identical to nlp.tokenizer() except for whitespace tokens:

    "      PREFIX
    Let    SPECIAL-1
    's     SPECIAL-2
    go     TOKEN
    !      SUFFIX
    "      SUFFIX

In [61]:
from spacy.lang.en import English

nlp = English()
text = '''"Let's go!"'''
doc = nlp(text)
tok_exp = nlp.tokenizer.explain(text)
assert [t.text for t in doc if not t.is_space] == [t[1] for t in tok_exp]
df = pd.DataFrame(columns=["Tokens","Labels"])
for idx,t in enumerate(tok_exp):
    df.loc[idx]=[t[1],t[0]]

df

Unnamed: 0,Tokens,Labels
0,"""",PREFIX
1,Let,SPECIAL-1
2,'s,SPECIAL-2
3,go,TOKEN
4,!,SUFFIX
5,"""",SUFFIX


## Modifying existing rule sets

If you need to subclass the tokenizer instead, the relevant methods to specialize are find_prefix, find_suffix and find_infix.


In many situations, you don’t necessarily need entirely custom rules. Sometimes you just want to add another character to the prefixes, suffixes or infixes. The default prefix, suffix and infix rules are available via the nlp object’s Defaults and the Tokenizer attributes such as Tokenizer.suffix_search are writable, so you can overwrite them with compiled regular expression objects using modified default rules. spaCy ships with utility functions to help you compile the regular expressions – for example,

The prefix, infix and suffix rule sets include not only individual characters but also detailed regular expressions that take the surrounding context into account. For example, there is a regular expression that treats a hyphen between letters as an infix. If you do not want the tokenizer to split on hyphens between letters, you can modify the existing infix definition from lang/punctuation.py:

In [62]:
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

# Default tokenizer
doc = nlp("mother-in-law")
print([t.text for t in doc]) # ['mother', '-', 'in', '-', 'law']

# Modify tokenizer infix patterns
infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9])[+\\-\\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # ✅ Commented out regex that splits on hyphens between letters:
        # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
)

infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer
doc = nlp("mother-in-law")
print([t.text for t in doc]) # ['mother-in-law']


['mother', '-', 'in', '-', 'law']
['mother-in-law']


# Merging and splitting

The Doc.retokenize context manager lets you merge and split tokens. 


In [68]:
doc = nlp("I live in New York")
print("Before:", [token.text for token in doc])

with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[3:5], attrs={"LEMMA": "new york"})
print("After:", [token.text for token in doc])


Before: ['I', 'live', 'in', 'New', 'York']
After: ['I', 'live', 'in', 'New York']


## Context Dependent Merge & Splits

If an attribute in the attrs is a context-dependent token attribute, it will be applied to the underlying Token. For example LEMMA, POS or DEP only apply to a word in context, so they’re token attributes. If an attribute is a context-independent lexical attribute, it will be applied to the underlying Lexeme, the entry in the vocabulary. For example, LOWER or IS_STOP apply to all words of the same spelling, regardless of the context.

In [75]:
doc = nlp("I live in NewYork")
print("Before:", [token.text for token in doc])

with doc.retokenize() as retokenizer:
    heads = [(doc[3], 1), doc[2]]
    attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
    retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
print("After:", [token.text for token in doc])


Before: ['I', 'live', 'in', 'NewYork']
After: ['I', 'live', 'in', 'New', 'York']


# Statistical sentence segmenter

The SentenceRecognizer is a simple statistical component that only provides sentence boundaries. Along with being faster and smaller than the parser, its primary advantage is that it’s easier to train because it only requires annotated sentence boundaries rather than full dependency parses. 


In [79]:

nlp.enable_pipe("senter")
doc = nlp("This is a sentence. This is another sentence.")
for sent in doc.sents:
    print(sent.text)

This is a sentence.
This is another sentence.


## Rule-based pipeline component

The Sentencizer component is a pipeline component that splits sentences on punctuation like ., ! or ?. 

In [80]:
from spacy.lang.en import English

nlp_e = English()  # just the language with no pipeline
nlp_e.add_pipe("sentencizer")
doc = nlp_e("This is a sentence. This is another sentence.")
for sent in doc.sents:
    print(sent.text)

This is a sentence.
This is another sentence.


## Mappings & Exceptions 

The AttributeRuler manages rule-based mappings and exceptions for all token-level attributes.

provide exceptions for any token attributes
map fine-grained tags to coarse-grained tags for languages without statistical morphologizers (replacing the v2.x tag_map in the language data)
map token surface form + fine-grained tags to morphological features (replacing the v2.x morph_rules in the language data)
specify the tags for space tokens (replacing hard-coded behavior in the tagger)

In [81]:
text = "I saw The Who perform. Who did you see?"
doc1 = nlp(text)
print(doc1[2].tag_, doc1[2].pos_)  # DT DET
print(doc1[3].tag_, doc1[3].pos_)  # WP PRON

# Add attribute ruler with exception for "The Who" as NNP/PROPN NNP/PROPN
ruler = nlp.get_pipe("attribute_ruler")
# Pattern to match "The Who"
patterns = [[{"LOWER": "the"}, {"TEXT": "Who"}]]
# The attributes to assign to the matched token
attrs = {"TAG": "NNP", "POS": "PROPN"}
# Add rules to the attribute ruler
ruler.add(patterns=patterns, attrs=attrs, index=0)  # "The" in "The Who"
ruler.add(patterns=patterns, attrs=attrs, index=1)  # "Who" in "The Who"

doc2 = nlp(text)
print(doc2[2].tag_, doc2[2].pos_)  # NNP PROPN
print(doc2[3].tag_, doc2[3].pos_)  # NNP PROPN
# The second "Who" remains unmodified
print(doc2[5].tag_, doc2[5].pos_)  # WP PRON


DT DET
WP PRON
NNP PROPN
NNP PROPN
. PUNCT


# Word vectors and semantic similarity

    Text: The original token text.
    
    has vector: Does the token have a vector representation?
    
    Vector norm: The L2 norm of the token’s vector (the square root of the sum of the values squared)
    
    OOV: Out-of-vocabulary

In [86]:
tokens = nlp("dog cat banana afskfsd")
data = []
for token in tokens:
    data.append([token.text, token.has_vector, token.vector_norm, token.is_oov])

df = pd.DataFrame(data,columns=["Text","Has_Vector","Vector_Norm","Is_OOV"])
df

Unnamed: 0,Text,Has_Vector,Vector_Norm,Is_OOV
0,dog,True,6.814786,True
1,cat,True,7.370902,True
2,banana,True,7.64607,True
3,afskfsd,True,7.192256,True


In [87]:
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")
data = []
# Similarity of two documents
data.append([doc1, doc2, doc1.similarity(doc2)])
# Similarity of tokens and spans
french_fries = doc1[2:4]
burgers = doc1[5]
data.append([french_fries, burgers, french_fries.similarity(burgers)])
df = pd.DataFrame(data,columns=["Doc1","Doc2","Similarity_Score"])
df

  data.append([doc1, doc2, doc1.similarity(doc2)])
  data.append([french_fries, burgers, french_fries.similarity(burgers)])


Unnamed: 0,Doc1,Doc2,Similarity_Score
0,"(I, like, salty, fries, and, hamburgers, .)","(Fast, food, tastes, very, good, .)",0.3676
1,"(salty, fries)",hamburgers,0.433389


# Creating a custom language subclass

If you want to customize multiple components of the language data or add support for a custom language or domain-specific “dialect”, you can also implement your own language subclass. The subclass should define two attributes: the lang (unique language code) and the Defaults defining the language data. For an overview of the available attributes that can be overwritten,

In [84]:
from spacy.lang.en import English

class CustomEnglishDefaults(English.Defaults):
    stop_words = set(["custom", "stop"])

class CustomEnglish(English):
    lang = "custom_en"
    Defaults = CustomEnglishDefaults

nlp1 = English()
nlp2 = CustomEnglish()

print(nlp1.lang, [token.is_stop for token in nlp1("custom stop")])
print(nlp2.lang, [token.is_stop for token in nlp2("custom stop")])

en [False, False]
custom_en [True, True]


#