# Inspecting pipeline


In [1]:
import os
import spacy

DATA_PATH = "../data/raw/"


In [20]:
# Load the en_core_web_sm model
nlp = spacy.load("en_core_web_sm")

# Print the names of the pipeline components
print(nlp.pipe_names)

# Print the full pipeline of (name, component) tuples
print(nlp.pipeline)


['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7fc7461a9580>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7fc7461a9a60>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7fc7469522e0>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7fc747b23f80>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7fc745867700>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7fc746952190>)]


## Custom component in pipeline


In [3]:
from spacy import Language

In [4]:
# Define the custom component
@Language.component("length_component")
def length_component(doc):
    # Get the doc's length
    doc_length = len(doc)
    print("This document is {} tokens long.".format(doc_length))
    # Return the doc
    return doc


# Load the small English model and Add the component first in the pipeline
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("length_component", first=True)

# Process a text
doc = nlp("This is a sentence.")


This document is 5 tokens long.


In [7]:
from spacy.tokens import Span, Token, Doc
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)


In [6]:
# Define the custom component
@Language.component("animal_component")
def animal_component(doc):
    # Create a Span for each match and assign the label 'ANIMAL'
    # and overwrite the doc.ents with the matched spans
    doc.ents = [
        Span(doc, start, end, label="ANIMAL") for match_id, start, end in matcher(doc)
    ]
    return doc


# Add the component to the pipeline after the 'ner' component
nlp.add_pipe("animal_component", after="ner")

# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])


This document is 8 tokens long.
[]


## Extension attributes


In [8]:
# Register the Token extension attribute 'is_country' with the default value False
Token.set_extension("is_country", default=False)

# Process the text and set the is_country attribute to True for the token "Spain"
doc = nlp("I live in Spain.")
doc[3]._.is_country = True

# Print the token text and the is_country attribute for all tokens
print([(token.text, token._.is_country) for token in doc])


This document is 5 tokens long.
[('I', False), ('live', False), ('in', False), ('Spain', True), ('.', False)]


In [9]:
# Define the getter function that takes a token and returns its reversed text
def get_reversed(token):
    return token.text[::-1]


# Register the Token property extension 'reversed' with the getter get_reversed
Token.set_extension("reversed", getter=get_reversed)

# Process the text and print the reversed attribute for each token
doc = nlp("All generalizations are false, including this one.")
for token in doc:
    print("reversed:", token._.reversed)


This document is 9 tokens long.
reversed: llA
reversed: snoitazilareneg
reversed: era
reversed: eslaf
reversed: ,
reversed: gnidulcni
reversed: siht
reversed: eno
reversed: .


In [10]:
# Define the getter function
def get_has_number(doc):
    # Return if any of the tokens in the doc return True for token.like_num
    return any(token.like_num for token in doc)


# Register the Doc property extension 'has_number' with the getter get_has_number
Doc.set_extension("has_number", getter=get_has_number)

# Process the text and check the custom has_number attribute
doc = nlp("The museum closed for five years in 2012.")
print("has_number:", doc._.has_number)


This document is 9 tokens long.
has_number: True


In [11]:
# Define the method
def to_html(span, tag):
    # Wrap the span text in a HTML tag and return it
    return "<{tag}>{text}</{tag}>".format(tag=tag, text=span.text)


# Register the Span property extension 'to_html' with the method to_html
Span.set_extension("to_html", method=to_html)

# Process the text and call the to_html method on the span with the tag name 'strong'
doc = nlp("Hello world, this is a sentence.")
span = doc[0:2]
print(span._.to_html("strong"))


This document is 8 tokens long.
<strong>Hello world</strong>


## Entities & extensions


In [19]:
def get_wikipedia_url(span):
    # Get a Wikipedia URL if the span has one of the labels
    if span.label_ in ("PERSON", "ORG", "GPE", "LOCATION"):
        entity_text = span.text.replace(" ", "_")
        return "https://en.wikipedia.org/w/index.php?search=" + entity_text


# Set the Span extension wikipedia_url using get getter get_wikipedia_url
Span.set_extension("wikipedia_url", getter=get_wikipedia_url)

doc = nlp(
    "In over fifty years from his very first recordings right through to his last album, David Bowie was at the vanguard of contemporary culture."
)
for ent in doc.ents:
    # Print the text and Wikipedia URL of the entity
    print(ent.text, ent._.wikipedia_url)


In [25]:
capitals = {
    "Afghanistan": "Kabul",
    "Albania": "Tirana",
    "Algeria": "Algiers",
    "American Samoa": "Pago Pago",
    "Andorra": "Andorra la Vella",
    "Angola": "Luanda",
    "Anguilla": "The Valley",
    "Antarctica": "",
    "Antigua and Barbuda": "Saint John's",
    "Argentina": "Buenos Aires",
    "Armenia": "Yerevan",
    "Aruba": "Oranjestad",
    "Australia": "Canberra",
    "Austria": "Vienna",
    "Azerbaijan": "Baku",
    "Bahamas": "Nassau",
    "Bahrain": "Manama",
    "Bangladesh": "Dhaka",
    "Barbados": "Bridgetown",
    "Belarus": "Minsk",
    "Belgium": "Brussels",
    "Belize": "Belmopan",
    "Benin": "Porto-Novo",
    "Bermuda": "Hamilton",
    "Bhutan": "Thimphu",
    "Bolivia (Plurinational State of)": "Sucre",
    "Bonaire, Sint Eustatius and Saba": "Kralendijk",
    "Bosnia and Herzegovina": "Sarajevo",
    "Botswana": "Gaborone",
    "Bouvet Island": "",
    "Brazil": "Brasília",
    "British Indian Ocean Territory": "Diego Garcia",
    "Brunei Darussalam": "Bandar Seri Begawan",
    "Bulgaria": "Sofia",
    "Burkina Faso": "Ouagadougou",
    "Burundi": "Bujumbura",
    "Cabo Verde": "Praia",
    "Cambodia": "Phnom Penh",
    "Cameroon": "Yaoundé",
    "Canada": "Ottawa",
    "Cayman Islands": "George Town",
    "Central African Republic": "Bangui",
    "Chad": "N'Djamena",
    "Chile": "Santiago",
    "China": "Beijing",
    "Christmas Island": "Flying Fish Cove",
    "Cocos (Keeling) Islands": "West Island",
    "Colombia": "Bogotá",
    "Comoros": "Moroni",
    "Congo": "Brazzaville",
    "Congo (Democratic Republic of the)": "Kinshasa",
    "Cook Islands": "Avarua",
    "Costa Rica": "San José",
    "Croatia": "Zagreb",
    "Cuba": "Havana",
    "Curaçao": "Willemstad",
    "Cyprus": "Nicosia",
    "Czech Republic": "Prague",
    "Côte d'Ivoire": "Yamoussoukro",
    "Denmark": "Copenhagen",
    "Djibouti": "Djibouti",
    "Dominica": "Roseau",
    "Dominican Republic": "Santo Domingo",
    "Ecuador": "Quito",
    "Egypt": "Cairo",
    "El Salvador": "San Salvador",
    "Equatorial Guinea": "Malabo",
    "Eritrea": "Asmara",
    "Estonia": "Tallinn",
    "Ethiopia": "Addis Ababa",
    "Falkland Islands (Malvinas)": "Stanley",
    "Faroe Islands": "Tórshavn",
    "Fiji": "Suva",
    "Finland": "Helsinki",
    "France": "Paris",
    "French Guiana": "Cayenne",
    "French Polynesia": "Papeetē",
    "French Southern Territories": "Port-aux-Français",
    "Gabon": "Libreville",
    "Gambia": "Banjul",
    "Georgia": "Tbilisi",
    "Germany": "Berlin",
    "Ghana": "Accra",
    "Gibraltar": "Gibraltar",
    "Greece": "Athens",
    "Greenland": "Nuuk",
    "Grenada": "St. George's",
    "Guadeloupe": "Basse-Terre",
    "Guam": "Hagåtña",
    "Guatemala": "Guatemala City",
    "Guernsey": "St. Peter Port",
    "Guinea": "Conakry",
    "Guinea-Bissau": "Bissau",
    "Guyana": "Georgetown",
    "Haiti": "Port-au-Prince",
    "Heard Island and McDonald Islands": "",
    "Holy See": "Rome",
    "Honduras": "Tegucigalpa",
    "Hong Kong": "City of Victoria",
    "Hungary": "Budapest",
    "Iceland": "Reykjavík",
    "India": "New Delhi",
    "Indonesia": "Jakarta",
    "Iran (Islamic Republic of)": "Tehran",
    "Iraq": "Baghdad",
    "Ireland": "Dublin",
    "Isle of Man": "Douglas",
    "Israel": "Jerusalem",
    "Italy": "Rome",
    "Jamaica": "Kingston",
    "Japan": "Tokyo",
    "Jersey": "Saint Helier",
    "Jordan": "Amman",
    "Kazakhstan": "Astana",
    "Kenya": "Nairobi",
    "Kiribati": "South Tarawa",
    "Korea (Democratic People's Republic of)": "Pyongyang",
    "Korea (Republic of)": "Seoul",
    "Kuwait": "Kuwait City",
    "Kyrgyzstan": "Bishkek",
    "Lao People's Democratic Republic": "Vientiane",
    "Latvia": "Riga",
    "Lebanon": "Beirut",
    "Lesotho": "Maseru",
    "Liberia": "Monrovia",
    "Libya": "Tripoli",
    "Liechtenstein": "Vaduz",
    "Lithuania": "Vilnius",
    "Luxembourg": "Luxembourg",
    "Macao": "",
    "Macedonia (the former Yugoslav Republic of)": "Skopje",
    "Madagascar": "Antananarivo",
    "Malawi": "Lilongwe",
    "Malaysia": "Kuala Lumpur",
    "Maldives": "Malé",
    "Mali": "Bamako",
    "Malta": "Valletta",
    "Marshall Islands": "Majuro",
    "Martinique": "Fort-de-France",
    "Mauritania": "Nouakchott",
    "Mauritius": "Port Louis",
    "Mayotte": "Mamoudzou",
    "Mexico": "Mexico City",
    "Micronesia (Federated States of)": "Palikir",
    "Moldova (Republic of)": "Chișinău",
    "Monaco": "Monaco",
    "Mongolia": "Ulan Bator",
    "Montenegro": "Podgorica",
    "Montserrat": "Plymouth",
    "Morocco": "Rabat",
    "Mozambique": "Maputo",
    "Myanmar": "Naypyidaw",
    "Namibia": "Windhoek",
    "Nauru": "Yaren",
    "Nepal": "Kathmandu",
    "Netherlands": "Amsterdam",
    "New Caledonia": "Nouméa",
    "New Zealand": "Wellington",
    "Nicaragua": "Managua",
    "Niger": "Niamey",
    "Nigeria": "Abuja",
    "Niue": "Alofi",
    "Norfolk Island": "Kingston",
    "Northern Mariana Islands": "Saipan",
    "Norway": "Oslo",
    "Oman": "Muscat",
    "Pakistan": "Islamabad",
    "Palau": "Ngerulmud",
    "Palestine, State of": "Ramallah",
    "Panama": "Panama City",
    "Papua New Guinea": "Port Moresby",
    "Paraguay": "Asunción",
    "Peru": "Lima",
    "Philippines": "Manila",
    "Pitcairn": "Adamstown",
    "Poland": "Warsaw",
    "Portugal": "Lisbon",
    "Puerto Rico": "San Juan",
    "Qatar": "Doha",
    "Republic of Kosovo": "Pristina",
    "Romania": "Bucharest",
    "Russian Federation": "Moscow",
    "Rwanda": "Kigali",
    "Réunion": "Saint-Denis",
    "Saint Barthélemy": "Gustavia",
    "Saint Helena, Ascension and Tristan da Cunha": "Jamestown",
    "Saint Kitts and Nevis": "Basseterre",
    "Saint Lucia": "Castries",
    "Saint Martin (French part)": "Marigot",
    "Saint Pierre and Miquelon": "Saint-Pierre",
    "Saint Vincent and the Grenadines": "Kingstown",
    "Samoa": "Apia",
    "San Marino": "City of San Marino",
    "Sao Tome and Principe": "São Tomé",
    "Saudi Arabia": "Riyadh",
    "Senegal": "Dakar",
    "Serbia": "Belgrade",
    "Seychelles": "Victoria",
    "Sierra Leone": "Freetown",
    "Singapore": "Singapore",
    "Sint Maarten (Dutch part)": "Philipsburg",
    "Slovakia": "Bratislava",
    "Slovenia": "Ljubljana",
    "Solomon Islands": "Honiara",
    "Somalia": "Mogadishu",
    "South Africa": "Pretoria",
    "South Georgia and the South Sandwich Islands": "King Edward Point",
    "South Sudan": "Juba",
    "Spain": "Madrid",
    "Sri Lanka": "Colombo",
    "Sudan": "Khartoum",
    "Suriname": "Paramaribo",
    "Svalbard and Jan Mayen": "Longyearbyen",
    "Swaziland": "Lobamba",
    "Sweden": "Stockholm",
    "Switzerland": "Bern",
    "Syrian Arab Republic": "Damascus",
    "Taiwan": "Taipei",
    "Tajikistan": "Dushanbe",
    "Tanzania, United Republic of": "Dodoma",
    "Thailand": "Bangkok",
    "Timor-Leste": "Dili",
    "Togo": "Lomé",
    "Tokelau": "Fakaofo",
    "Tonga": "Nuku'alofa",
    "Trinidad and Tobago": "Port of Spain",
    "Tunisia": "Tunis",
    "Turkey": "Ankara",
    "Turkmenistan": "Ashgabat",
    "Turks and Caicos Islands": "Cockburn Town",
    "Tuvalu": "Funafuti",
    "Uganda": "Kampala",
    "Ukraine": "Kiev",
    "United Arab Emirates": "Abu Dhabi",
    "United Kingdom of Great Britain and Northern Ireland": "London",
    "United States Minor Outlying Islands": "",
    "United States of America": "Washington, D.C.",
    "Uruguay": "Montevideo",
    "Uzbekistan": "Tashkent",
    "Vanuatu": "Port Vila",
    "Venezuela (Bolivarian Republic of)": "Caracas",
    "Viet Nam": "Hanoi",
    "Virgin Islands (British)": "Road Town",
    "Virgin Islands (U.S.)": "Charlotte Amalie",
    "Wallis and Futuna": "Mata-Utu",
    "Western Sahara": "El Aaiún",
    "Yemen": "Sana'a",
    "Zambia": "Lusaka",
    "Zimbabwe": "Harare",
    "Åland Islands": "Mariehamn",
}


In [27]:
@Language.component("countries_component")
def countries_component(doc):
    # Create an entity Span with the label 'GPE' for all matches
    doc.ents = [
        Span(doc, start, end, label="GPE") for match_id, start, end in matcher(doc)
    ]
    return doc


# Add the component to the pipeline
nlp.add_pipe("countries_component")

# Getter that looks up the span text in the dictionary of country capitals
get_capital = lambda span: capitals.get(span.text)

# Register the Span extension attribute 'capital' with the getter get_capital
Span.set_extension("capital", getter=get_capital)

# Process the text and print the entity text, label and capital attributes
doc = nlp("Czech Republic may help Slovakia protect its airspace")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])


[]


## Processing Stream


In [28]:
TEXTS = [
    "McDonalds is my favorite restaurant.",
    "Here I thought @McDonalds only had precooked burgers but it seems they only have not cooked ones?? I have no time to get sick..",
    "People really still eat McDonalds :(",
    "The McDonalds in Spain has chicken wings. My heart is so happy ",
    "@McDonalds Please bring back the most delicious fast food sandwich of all times!!....The Arch Deluxe :P",
    "please hurry and open. I WANT A #McRib SANDWICH SO BAD! :D",
    "This morning i made a terrible decision by gettin mcdonalds and now my stomach is payin for it",
]


In [29]:
# Process the texts and print the adjectives
# for text in TEXTS:
#     doc = nlp(text)
for doc in nlp.pipe(TEXTS):
    print([token.text for token in doc if token.pos_ == "ADJ"])


['favorite']
['sick']
[]
['happy']
['delicious', 'fast']
[]
['terrible']


In [30]:
# Process the texts and print the entities
# docs = [nlp(text) for text in TEXTS]
docs = list(nlp.pipe(TEXTS))
entities = [doc.ents for doc in docs]
print(*entities)


() () () () () () ()


In [31]:
people = ["David Bowie", "Angela Merkel", "Lady Gaga"]

# Create a list of patterns for the PhraseMatcher
# patterns = [nlp(person) for person in people]
patterns = list(nlp.pipe(people))


In [32]:
DATA = [
    (
        "One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin.",
        {"author": "Franz Kafka", "book": "Metamorphosis"},
    ),
    (
        "I know not all that may be coming, but be it what it will, I'll go to it laughing.",
        {"author": "Herman Melville", "book": "Moby-Dick or, The Whale"},
    ),
    (
        "It was the best of times, it was the worst of times.",
        {"author": "Charles Dickens", "book": "A Tale of Two Cities"},
    ),
    (
        "The only people for me are the mad ones, the ones who are mad to live, mad to talk, mad to be saved, desirous of everything at the same time, the ones who never yawn or say a commonplace thing, but burn, burn, burn like fabulous yellow roman candles exploding like spiders across the stars.",
        {"author": "Jack Kerouac", "book": "On the Road"},
    ),
    (
        "It was a bright cold day in April, and the clocks were striking thirteen.",
        {"author": "George Orwell", "book": "1984"},
    ),
    (
        "Nowadays people know the price of everything and the value of nothing.",
        {"author": "Oscar Wilde", "book": "The Picture Of Dorian Gray"},
    ),
]


In [37]:
# Import the Doc class and register the extensions 'author' and 'book'
from spacy.tokens import Doc

Doc.set_extension("book", default=None)
Doc.set_extension("author", default=None)

for doc, context in nlp.pipe(DATA, as_tuples=True):
    # Set the doc._.book and doc._.author attributes from the context
    doc._.book = context["book"]
    doc._.author = context["author"]

    # Print the text and custom attribute data
    print(doc.text, "\n", "— '{}' by {}".format(doc._.book, doc._.author), "\n")


One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin. 
 — 'Metamorphosis' by Franz Kafka 

I know not all that may be coming, but be it what it will, I'll go to it laughing. 
 — 'Moby-Dick or, The Whale' by Herman Melville 

It was the best of times, it was the worst of times. 
 — 'A Tale of Two Cities' by Charles Dickens 

The only people for me are the mad ones, the ones who are mad to live, mad to talk, mad to be saved, desirous of everything at the same time, the ones who never yawn or say a commonplace thing, but burn, burn, burn like fabulous yellow roman candles exploding like spiders across the stars. 
 — 'On the Road' by Jack Kerouac 

It was a bright cold day in April, and the clocks were striking thirteen. 
 — '1984' by George Orwell 

Nowadays people know the price of everything and the value of nothing. 
 — 'The Picture Of Dorian Gray' by Oscar Wilde 



In [38]:
text = "Chick-fil-A is an American fast food restaurant chain headquartered in the city of College Park, Georgia, specializing in chicken sandwiches."

# Only tokenize the text
doc = nlp.make_doc(text)

print([token.text for token in doc])

# Disable the tagger and parser
with nlp.disable_pipes("tagger", "parser"):
    # Process the text
    doc = nlp(text)
    # Print the entities in the doc
    print(doc.ents)


['Chick', '-', 'fil', '-', 'A', 'is', 'an', 'American', 'fast', 'food', 'restaurant', 'chain', 'headquartered', 'in', 'the', 'city', 'of', 'College', 'Park', ',', 'Georgia', ',', 'specializing', 'in', 'chicken', 'sandwiches', '.']
()


