# Solutions III: SpaCy Basics

In [5]:
import spacy

In [6]:
# Sample sentence to analyse.
example = (
    "The European Central Bank raised its interest rates "
    "by 0.25% on the 20th of September 2023."
)

In [7]:
# Run this cell if you have not yet installed the language model.
# You may have to restart the Python kernel after installing!
# %pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl

In [8]:
# Load the en_core_web_md model (the same model the install cell above refers to).
nlp = spacy.load("en_core_web_md")

In [None]:
# Apply the model to the example.
# The result is stored in `doc`; the last line displays its type.
doc = nlp(example)
type(doc)

In [None]:
# The .ents property contains the entities found in the document.
doc.ents

In [None]:
# Show every entity with its label and its start-end range, one per line.
for ent in doc.ents:
    print(ent, ent.label_, f"{ent.start}-{ent.end}", sep=" | ")

In [None]:
# What elements are in the doc? Inspect the type of the first one.
type(doc[0])

In [None]:
# For each token in doc, print its text, lemma, POS tag and entity type.
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.ent_type_, sep=" | ")

In [None]:
# Mask organization names from the text.
# Every token tagged as ORG is replaced by the "<MASKED>" placeholder.
# Other tokens contribute their text, so the result is a uniform list of
# strings rather than a mix of strings and spaCy Token objects.
["<MASKED>" if t.ent_type_ == "ORG" else t.text for t in doc]


In [15]:
# Collapse each detected entity into a single token with doc.retokenize().
with doc.retokenize() as retokenizer:
    for ent in doc.ents:
        # Span covering exactly the tokens of this entity.
        entity_span = doc[ent.start:ent.end]
        # Merge the span; the entity's text becomes the merged token's lemma.
        retokenizer.merge(entity_span, attrs={"LEMMA": ent.text})

In [None]:
# Show the modified tokenization: each token's text next to its entity type.
for token in doc:
    print(token.text, token.ent_type_, sep=" | ")

In [None]:
# Mask organization names from the text.
# Every token tagged as ORG is replaced by the "<MASKED>" placeholder.
# Other tokens contribute their text, so the result is a uniform list of
# strings rather than a mix of strings and spaCy Token objects.
["<MASKED>" if t.ent_type_ == "ORG" else t.text for t in doc]