In [None]:
import spacy
import pandas as pd
from spacy.training import Example
from spacy.util import minibatch

df = pd.read_csv('Friends_Journey_Dataset.csv')
df.fillna("", inplace=True)

ModuleNotFoundError: No module named 'spacy'

Preparing Training Data for NER

In [4]:
TRAIN_DATA = []

for _, row in df.iterrows():
    text = f"Chapter {row['Chapter Number']}: {row['Narrative']} {row['Dialogue']}"
    character_names = row['Character Names'].split(", ")  # Assumes names are comma-separated
    entities = []

    for name in character_names:
        start = text.find(name)
        if start != -1:
            end = start + len(name)
            entities.append((start, end, "PERSON"))  # Assign "PERSON" entity to character names

    # Append only if at least one entity was found
    if entities:
        TRAIN_DATA.append((text, {"entities": entities}))

Train the NER Model

In [5]:
def remove_overlapping_entities(entities):
    """
    Removes overlapping entity labels, keeping only the longest one.
    """
    sorted_entities = sorted(entities, key=lambda x: (x[0], x[1] - x[0]))  # Sort by start index & length
    non_overlapping = []
    prev_end = -1

    for start, end, label in sorted_entities:
        if start >= prev_end:  # If it doesn't overlap with the previous entity
            non_overlapping.append((start, end, label))
            prev_end = end  # Update last used end position

    return non_overlapping

# Apply the function to clean training data
for i, (text, annotations) in enumerate(TRAIN_DATA):
    TRAIN_DATA[i][1]["entities"] = remove_overlapping_entities(annotations["entities"])


TRAIN spaCy NER MODEL

In [6]:
nlp = spacy.blank("en")  # Create a blank English model
ner = nlp.add_pipe("ner")  # Add Named Entity Recognizer component

# Add PERSON label to the model
for _, annotations in TRAIN_DATA:
    for ent in annotations["entities"]:
        ner.add_label("PERSON")

# Begin training
nlp.begin_training()

# Train for 10 epochs
for epoch in range(10):
    losses = {}
    for text, annotations in TRAIN_DATA:
        example = Example.from_dict(nlp.make_doc(text), annotations)
        nlp.update([example], losses=losses)
    print(f"Epoch {epoch+1}, Loss: {losses}")



Epoch 1, Loss: {'ner': 5977.545335289811}
Epoch 2, Loss: {'ner': 1769.3899074982414}
Epoch 3, Loss: {'ner': 1745.5955682579163}
Epoch 4, Loss: {'ner': 1714.1498022898377}
Epoch 5, Loss: {'ner': 1520.959529358244}
Epoch 6, Loss: {'ner': 1402.028978892928}
Epoch 7, Loss: {'ner': 1247.7324987488578}
Epoch 8, Loss: {'ner': 1145.1067579424878}
Epoch 9, Loss: {'ner': 1082.5808384375887}
Epoch 10, Loss: {'ner': 907.0083841790397}


SAVE TRAINED MODEL

In [8]:
output_dir = "/content/Character_NER_Model"
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")

Model saved to /content/Character_NER_Model


TEST THE TRAINED MODEL

In [9]:
def predict_character_chapters(text):
    """
    Given a text input, extracts character names using the trained model.
    """
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

# Example test
test_text = "Ethan and Victor went on an adventure."
predicted_characters = predict_character_chapters(test_text)
print("Predicted Characters:", predicted_characters)

Predicted Characters: [('Victor', 'PERSON')]
