# In [None]:
## Import Libraries
import streamlit as st
import spacy
import pandas as pd
import numpy as np
from collections import defaultdict
from spacy.pipeline import EntityRuler


## ---------- Load Data From GoofyScraper.py ---------- ##
# Transcript rows. The rest of this script reads the "ep" (episode),
# "char" (character) and "text" (dialogue) columns by name.
SpongeData = pd.read_csv('spongebob_transcripts.csv')

# Drop rows with a missing value in any column. This has to happen before the
# string cast below, otherwise NaN would become the literal string "nan" and
# survive the cleaning.
SpongeData = SpongeData.dropna()

# Cast every column to str *before* touching the .str accessor: pandas only
# provides .str on string-like columns, so a column parsed as numeric (for
# example numeric episode identifiers) would raise AttributeError in the
# filter below.
SpongeData = SpongeData.astype(str)

# Remove rows where any critical field is empty or whitespace-only. The
# columns are addressed by name so the filter checks exactly the fields that
# the pattern-building code reads later, regardless of column position.
has_content = (
    (SpongeData["ep"].str.strip() != "")      # Episode
    & (SpongeData["char"].str.strip() != "")  # Character
    & (SpongeData["text"].str.strip() != "")  # Dialogue
)
SpongeData = SpongeData[has_content]

## ---------- Load the spaCy model for named entity recognition ---------- ##
nlp = spacy.load("en_core_web_sm")


## ---------- Create Patterns for NER ---------- ##
# The dataframe columns used here are:
#   "ep"   -> Episode Name
#   "char" -> Character Name
#   "text" -> Character Dialogue
# Every distinct value of each column becomes one string pattern tagged with
# the matching entity label, so each episode, character and dialogue line
# found in the data is covered.
patterns = []

# (entity label, dataframe column), processed in this order so the list holds
# all EPISODE patterns first, then CHARACTER, then DIALOGUE.
for entity_label, column_name in (
    ("EPISODE", "ep"),
    ("CHARACTER", "char"),
    ("DIALOGUE", "text"),
):
    patterns.extend(
        {"label": entity_label, "pattern": value}
        for value in SpongeData[column_name].unique()
    )


## ---------- Add Entity Ruler ----------- ##
# The ruler is inserted ahead of the model's built-in "ner" component and then
# loaded with the patterns built above.
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(patterns)