# Solutions IV: spaCy Named Entity Recognition

In [1]:
import spacy
import pandas as pd

In [None]:
# Load a small sample of the IMDB reviews.
reviews = pd.read_csv("../../0_data/imdb/imdb_reviews_small.csv", compression="zip")

# Draw 1000 reviews with a fixed seed so that "Restart & Run All" always
# selects the same rows and every result further down is reproducible.
reviews = reviews.sample(1000, random_state=42)
reviews.head()

In [3]:
# Name of the spaCy pipeline package to load; `nlp` is the pipeline object
# used to process the review texts below.
MODEL_NAME = "en_core_web_md"
nlp = spacy.load(MODEL_NAME)

In [None]:
# Check components in the spaCy pipeline.
# The names displayed here are the strings that the `disable=[...]` argument
# of `nlp.pipe` refers to when components are switched off further down.
nlp.component_names

In [5]:
# Components we do not need: only the entity annotations (`doc.ents`) are
# read later in this notebook, so these pipeline steps are skipped.
unused_components = [
    'tok2vec',
    'tagger',
    'parser',
    'senter',
    'attribute_ruler',
    'lemmatizer',
]

# Text of the reviews.
review_texts = reviews["content"]

# Run the spaCy pipeline over all texts (parallel processing with 3 worker
# processes) and materialise the resulting Doc objects in a list.
doc_stream = nlp.pipe(
    review_texts,
    n_process=3,
    disable=unused_components,
)
docs = list(doc_stream)

In [6]:
def find_persons(doc):
    """Find PERSON entities in a spaCy document.

    Args:
        doc: Object exposing an ``ents`` iterable whose items have ``label_``
            and ``text`` attributes (e.g. a spaCy ``Doc``).

    Returns:
        list[str]: The text of every PERSON entity in iteration order, with a
        trailing possessive ``'s`` (straight or typographic apostrophe) and any
        stray surrounding apostrophes removed. Repeated mentions are kept.
    """
    possessive_suffixes = ("'s", "\u2019s")
    apostrophes = "'\u2019"

    entities = []
    for entity in doc.ents:
        if entity.label_ != "PERSON":
            continue

        name = entity.text
        # Remove the possessive as a *suffix*. Note that str.strip("'s") would
        # instead strip every leading/trailing "'" and "s" character and turn
        # "Bruce Willis" into "Bruce Willi".
        for suffix in possessive_suffixes:
            if name.endswith(suffix):
                name = name[: -len(suffix)]
                break

        # Drop leftover quote characters around the name (e.g. "James'").
        entities.append(name.strip(apostrophes))

    return entities

In [None]:
# Apply find_persons to every Doc: one list of PERSON mentions per review.
actors = list(map(find_persons, docs))
actors[:5]

In [None]:
# Pair each review's rating with the list of persons found in that review.
# The frame takes its index from the rating Series; the `actors` list is
# attached row by row in the same order.
rating_and_actors = {"rating": reviews["rating"], "actor": actors}
df_actors = pd.DataFrame(rating_and_actors)
df_actors

In [None]:
# Create one row per (review, actor) pair.
# Duplicate: actor was mentioned multiple times in the same review.
# A plain drop_duplicates() compares only the column values (rating, actor)
# and ignores the index, so it would also merge mentions of an actor in
# *different* reviews that happen to share a rating. The review's index label
# is therefore made part of the duplicate check.
# (Assumes the index labels of df_actors identify individual reviews.)
df_actors = (
    df_actors
    .explode(column="actor")
    # Temporarily expose the index label as a column for deduplication.
    .assign(review_id=lambda df: df.index)
    .drop_duplicates(subset=["review_id", "actor"])
    .drop(columns="review_id")
    # Reviews without any PERSON entity (empty list) explode to a NaN row.
    .loc[lambda df: df["actor"].notna()]
)
df_actors.head()

In [None]:
# Best ranking actors (based on at least 3 reviews).
# Step 1: mean rating and number of rows per actor.
actor_stats = df_actors.groupby("actor", as_index=False).agg(
    rating=("rating", "mean"),
    count=("rating", "size"),
)

# Step 2: order by mean rating, highest first.
ranked_actors = actor_stats.sort_values("rating", ascending=False)

# Step 3: keep actors with more than 2 rows and show the top 30.
ranked_actors.loc[ranked_actors["count"] > 2].head(30)