# Chapter 3: Processing Pipelines

- https://course.spacy.io/chapter3

# 1)- Inspecting the pipeline

In [1]:
import spacy

# Load the en_core_web_sm model; the resulting `nlp` object is the one whose
# pipeline is inspected in the cells below.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Print the names of the pipeline components (a plain list of strings;
# for this model: tagger, parser and ner).
print(nlp.pipe_names)

['tagger', 'parser', 'ner']


In [3]:
# Print the full pipeline as a list of (name, component) tuples, i.e. each
# component name paired with the component object itself.
print(nlp.pipeline)

[('tagger', <spacy.pipeline.pipes.Tagger object at 0x11d718e50>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x11ddad440>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x11ddad360>)]


# 2)- Custom Pipeline Components


```python
def custom_component(doc):
    # Do something to the doc here
    return doc

nlp.add_pipe(custom_component)
```

### 2.1)- A simple component

In [4]:
# Create the nlp object by loading the model again, so this example starts
# from a freshly loaded pipeline.
nlp = spacy.load('en_core_web_sm')

# Define a custom component
def custom_component(doc):
    """Report the length of ``doc`` on stdout and pass the doc through unchanged."""
    token_count = len(doc)
    print('Doc length:', token_count)
    return doc

# Add the component first in the pipeline: first=True places it ahead of
# the tagger, parser and ner components.
nlp.add_pipe(custom_component, first=True)

# Print the pipeline component names; the new component is listed under the
# function's own name, 'custom_component'.
print('Pipeline:', nlp.pipe_names)

Pipeline: ['custom_component', 'tagger', 'parser', 'ner']


In [5]:
# Process a text. Calling nlp runs the pipeline, so custom_component prints
# the doc length (three tokens for this text) as a side effect.
doc = nlp("Hello world!")

Doc length: 3


### 2.2)- Length of document

In [6]:
import spacy

# Define the custom component
def length_component(doc):
    """Print how many tokens ``doc`` contains, then hand the same doc back."""
    message = "This document is {} tokens long.".format(len(doc))
    print(message)
    return doc


# Load the small English model (a fresh nlp object for this example)
nlp = spacy.load("en_core_web_sm")

# Add the component first in the pipeline and print the pipe names;
# first=True puts it ahead of tagger, parser and ner.
nlp.add_pipe(length_component, first=True)
print(nlp.pipe_names)

# Process a text; running the pipeline triggers length_component, which
# prints the token count of this four-token sentence.
doc = nlp("My name is Hassan")

['length_component', 'tagger', 'parser', 'ner']
This document is 4 tokens long.


# 3)- Complex Component

A custom component that uses the `PhraseMatcher` to find animal names in the document and sets the matched spans as `doc.ents`.

In [7]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

# Load the model and build a PhraseMatcher that recognises the animal names.
nlp = spacy.load("en_core_web_sm")
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
# Phrase patterns only need to be tokenized, so build them with nlp.make_doc
# (tokenizer only) instead of nlp.pipe, which would also run the tagger,
# parser and NER on every phrase for no benefit.
animal_patterns = [nlp.make_doc(text) for text in animals]
print("animal_patterns:", animal_patterns)
# The matcher must share the vocab of the docs it will be applied to.
matcher = PhraseMatcher(nlp.vocab)
# spaCy v2 signature: add(key, on_match_callback, *patterns); no callback here.
matcher.add("ANIMAL", None, *animal_patterns)

# Define the custom component
def animal_component(doc):
    """Mark every phrase found by the module-level ``matcher`` as an ANIMAL entity.

    Each match is wrapped in a ``Span`` labelled "ANIMAL" and the resulting
    list replaces ``doc.ents``. The same ``doc`` is returned.
    """
    animal_spans = []
    for _match_id, start, end in matcher(doc):
        animal_spans.append(Span(doc, start, end, label="ANIMAL"))
    # Replace whatever entities were set on the doc with the matched spans
    doc.ents = animal_spans
    return doc


# Add the component to the pipeline after the 'ner' component, so the
# assignment to doc.ents inside animal_component happens after ner has run.
nlp.add_pipe(animal_component, after="ner")
print(nlp.pipe_names)

# Process the text and print the text and label for the doc.ents
# (here: the spans labelled 'ANIMAL' by animal_component).
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])

animal_patterns: [Golden Retriever, cat, turtle, Rattus norvegicus]
['tagger', 'parser', 'ner', 'animal_component']
[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]


# 4)- Extension Attributes

- Attribute extensions
- Property extensions
- Method extensions

### 4.1)- Attribute extension


In [8]:
from spacy.tokens import Token

# Set extension on the Token with default value: every token reports
# `_.is_color == False` unless it is overwritten explicitly.
# NOTE(review): force=True presumably lets this cell be re-run without an
# "already registered" error - confirm against the spaCy set_extension docs.
Token.set_extension('is_color', default=False, force=True)

doc = nlp("The sky is blue.")

# Overwrite extension attribute value for the token at index 3 ("blue");
# custom attributes are accessed through the `._` namespace.
doc[3]._.is_color = True

### 4.2)- Property extension

- Span extensions should almost always use a getter

In [9]:
from spacy.tokens import Span

# Define getter function
def get_has_color(span):
    """Return True if any token in ``span`` is one of the known color words.

    The comparison ignores case, so a capitalised color such as "Blue" at the
    start of a sentence is detected as well.

    Args:
        span: An iterable of tokens, each exposing its text as ``.text``.

    Returns:
        bool: True when at least one token's text is a known color, else False
        (including for an empty span).
    """
    # A set gives constant-time membership checks for each token
    colors = {'red', 'yellow', 'blue'}
    return any(token.text.lower() in colors for token in span)

# Set extension on the Span with getter: `_.has_color` is backed by
# get_has_color rather than by a stored default value.
Span.set_extension('has_color', getter=get_has_color, force=True)

doc = nlp("The sky is blue.")
# doc[1:4] is "sky is blue" (contains a color); doc[0:2] is "The sky" (does not).
print(doc[1:4]._.has_color, '-', doc[1:4].text)
print(doc[0:2]._.has_color, '-', doc[0:2].text)

True - sky is blue
False - The sky
