# In [None]:
## Import Libraries
import streamlit as st
import spacy
import pandas as pd
import numpy as np
from collections import defaultdict
from spacy.pipeline import EntityRuler


## ---------- Load Data From GoofyScraper.py ---------- ##
SpongeData = pd.read_csv('spongebob_transcripts.csv')

# Drop rows with a missing value in any column. This must happen BEFORE the
# cast to str below, otherwise NaN would turn into the literal string "nan"
# and slip past the empty-string filter.
SpongeData.dropna(inplace=True)

# Cast every column to str up front so the `.str` accessor used by the filter
# works even when pandas inferred a non-string dtype for a column (the `.str`
# accessor raises on numeric columns).
SpongeData = SpongeData.astype(str)

# Remove rows where any critical field is an empty / whitespace-only string.
# Columns are addressed by position: 0 = episode, 1 = character, 2 = dialogue.
has_content = (
    (SpongeData.iloc[:, 0].str.strip() != "") &  # Episode
    (SpongeData.iloc[:, 1].str.strip() != "") &  # Character
    (SpongeData.iloc[:, 2].str.strip() != "")    # Dialogue
)

# .copy() gives an independent frame rather than a filtered view-like slice.
SpongeData = SpongeData[has_content].copy()

## ---------- Load the spaCy model for named entity recognition ---------- ##
## ---------- Create Patterns for NER ---------- ##

# Load spaCy's `en_core_web_sm` pipeline. The custom entity ruler created at
# the end of this section is added to this same `nlp` object.
nlp = spacy.load("en_core_web_sm")


def location_token_patterns(location_list):
    """Build case-insensitive entity-ruler token patterns for location names.

    Each name is split on whitespace and every word becomes a
    ``{"LOWER": ...}`` token spec. A trailing possessive (``'s`` or ``’s``)
    is emitted as its own token, because spaCy's English tokenizer splits it
    off ("Sandy's" -> "Sandy", "'s"); a single "sandy's" token spec would
    never match any document.

    For every name that does not already start with "the", an additional
    variant prefixed with "the" is produced (and listed first).

    Args:
        location_list: Iterable of location name strings.

    Returns:
        A list of ``{"label": "LOCATION", "pattern": [...]}`` dicts. Names
        that are empty or whitespace-only contribute nothing.
    """
    possessive_suffixes = ("'s", "\u2019s")
    patterns = []
    for loc in location_list:
        tokens = []
        for word in loc.split():
            lowered = word.lower()
            # Mirror the tokenizer: "puff's" -> "puff" + "'s". The length
            # check keeps a bare "'s" from producing an empty token.
            if len(lowered) > 2 and lowered.endswith(possessive_suffixes):
                tokens.append({"LOWER": lowered[:-2]})
                tokens.append({"LOWER": lowered[-2:]})
            else:
                tokens.append({"LOWER": lowered})

        # Blank names have no tokens; skip them instead of indexing tokens[0].
        if not tokens:
            continue

        # Add flexible version with a leading "the"
        if tokens[0]["LOWER"] != "the":
            tokens_with_the = [{"LOWER": "the"}] + tokens
            patterns.append({"label": "LOCATION", "pattern": tokens_with_the})
        patterns.append({"label": "LOCATION", "pattern": tokens})
    return patterns

# The dataframe is structured as follows:
#   Column 1: Episode Name
#   Column 2: Character Name
#   Column 3: Character Dialogue
# Because of this, the patterns below are based on this structure and are
# built by iterating over the columns to capture every episode and character.
# Episode - one exact-phrase pattern per distinct episode title.
patterns = [
    {"label": "EPISODE", "pattern": episode_title}
    for episode_title in SpongeData["ep"].unique()
]

# Character - the "char" column also contains some actions, which usually run
# longer than two words. Those entries are labelled EVENT so they are kept
# apart from real character names.
for speaker in SpongeData["char"].unique():
    label = "EVENT" if len(speaker.split()) > 2 else "CHARACTER"
    patterns.append({"label": label, "pattern": speaker})

# Location - hand-maintained list of place names; these are converted into
# token-level patterns rather than exact phrases.
location_names = [
    "Krusty Krab",
    "The Krusty Krab",
    "Krusty Krab Pizza",
    "Chum Bucket",
    "Rock Bottom",
    "Goo Lagoon",
    "Jellyfish Fields",
    "Sandy's Treedome",
    "Mrs. Puff's Boating School",
    "Shell City",
    "Glove World",
    "The Salty Spitoon",
    "Bikini Bottom",
    "Karate Island",
    "The Flying Dutchman's Ship",
    "Bikini Bottom Hospital",
    "Bikini Bottom Library",
]

patterns += location_token_patterns(location_names)
    



## ---------- Add Entity Ruler ----------- ##
# Insert the rule-based entity ruler into the pipeline ahead of the model's
# "ner" component, then register every pattern collected in `patterns`.
# NOTE(review): per spaCy's docs, a ruler placed before "ner" sets its entities
# first and "ner" works around them - confirm this precedence is intended.
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(patterns)