In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import matplotlib.pyplot as plt
import os

In [2]:
# Download the English NLP model (you only need to do this once per environment)
# !python3 -m spacy download en_core_web_sm

In [3]:
# Instantiate the small English spaCy pipeline; it is used below for named-entity recognition
NER = spacy.load(name="en_core_web_sm")

In [5]:
# Collect every regular file in the data directory whose name ends with ".txt".
# The suffix test avoids matching names that merely contain ".txt" (e.g. "x.txt.bak"),
# and the context manager closes the directory iterator once the list is built.
# NOTE: os.scandir yields entries in arbitrary order, so the position of a given
# book in this list is not guaranteed across machines.
with os.scandir('../data') as entries:
    all_books = [b for b in entries if b.is_file() and b.name.endswith('.txt')]
all_books

[<DirEntry 'Master-and-Commander.txt'>, <DirEntry 'H.M.S.-Surprise.txt'>]

In [6]:
# Read the first book found in the data directory and run it through the spaCy pipeline.
# The file is opened in a `with` block so the handle is closed promptly, and decoded
# explicitly as UTF-8 to match how the same files are read in the extraction cell below
# (the default encoding of open() depends on the platform locale).
book = all_books[0]
with open(book, 'r', encoding='utf-8') as book_file:
    book_text = book_file.read()
book_doc = NER(book_text)

In [7]:
# Render the entity annotations for the first 200 tokens of the book inline in the notebook
displacy.render(
    book_doc[:200],
    style="ent",
    jupyter=True,
)

In [None]:
# Run NER over each book and collect every PERSON entity as one (book, character) row
book_files = [
    ('Master and Commander', '../data/Master-and-Commander.txt'),
    ('HMS Surprise', '../data/H.M.S.-Surprise.txt'),
]

character_data = []
for book_title, file_path in book_files:
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    doc = NER(text)
    # Keep only the entities that spaCy labelled as people
    character_data.extend(
        {'book': book_title, 'character': person.text}
        for person in doc.ents
        if person.label_ == 'PERSON'
    )

# One row per PERSON mention (duplicates are removed in a later cell)
character_df = pd.DataFrame(character_data)
character_df

                      book      character
0     Master and Commander      Locatelli
1     Master and Commander  Jack Aubrey's
2     Master and Commander           Jack
3     Master and Commander           Jack
4     Master and Commander           Jack
...                    ...            ...
3912          HMS Surprise           Jack
3913          HMS Surprise           Jack
3914          HMS Surprise        Stephen
3915          HMS Surprise        Stephen
3916          HMS Surprise           Jack

[3917 rows x 2 columns]


In [11]:
# Lift the row limit so DataFrames are displayed in full (this setting is global
# and therefore also affects every later cell that displays a DataFrame)
pd.options.display.max_rows = None

# Keep only the first occurrence of each (book, character) pair
character_df = character_df[~character_df.duplicated(subset=['book', 'character'])]
character_df

Unnamed: 0,book,character
0,Master and Commander,Locatelli
1,Master and Commander,Jack Aubrey's
2,Master and Commander,Jack
7,Master and Commander,Mrs Harte
8,Master and Commander,Jack Aubrey
9,Master and Commander,Saturn
10,Master and Commander,Molly Harte
19,Master and Commander,Allen
25,Master and Commander,Hunks
26,Master and Commander,Samuel Allen


In [12]:
!pip install coreferee

Collecting coreferee
  Downloading coreferee-1.4.1-py3-none-any.whl.metadata (2.5 kB)
Downloading coreferee-1.4.1-py3-none-any.whl (182 kB)
Installing collected packages: coreferee
Successfully installed coreferee-1.4.1


In [13]:
import coreferee
!python -m coreferee install en

Collecting https://github.com/richardpaulhudson/coreferee/raw/master/models/coreferee_model_en.zip
  Downloading https://github.com/richardpaulhudson/coreferee/raw/master/models/coreferee_model_en.zip (65.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.4/65.4 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: coreferee-model-en
  Building wheel for coreferee-model-en (pyproject.toml) ... [?25ldone
[?25h  Created wheel for coreferee-model-en: filename=coreferee_model_en-1.0.0-py3-none-any.whl size=65422506 sha256=2f854d718c0d003ca3a191a2f63fb5e0760354f52511c79c0bd862c42449be60
  Stored in directory: /private/var/folders/k4/vg8gmwj941z_9rv5xw3mmt0m0000gn/T/pip-ephem-wheel-cache-mzwdbg_n/wheels/87/ca/93/b7c91eabb7b65a4200a75ea842a70098

In [14]:
# NOTE(review): coreferee is imported here before its pipe is added by name below;
# presumably the import makes the 'coreferee' factory available — confirm.
import coreferee

# Build a second, separate English pipeline (independent of NER above) ...
nlp = spacy.load('en_core_web_sm')

# ... and append the coreference-resolution component to it.
# add_pipe returns the created component, which the notebook displays as the cell output.
nlp.add_pipe('coreferee')

<coreferee.manager.CorefereeBroker at 0x14385e8c0>

In [15]:
# Process the full text of the first book with the coreference-enabled pipeline
doc = nlp(book_text)

# Emit one line per coreference chain found in the document
for coref_chain in doc._.coref_chains:
    print(coref_chain)

0: [41], [66]
1: [92], [126]
2: [145], [156], [173], [177], [212], [221], [232], [295], [298], [302], [315], [337], [344], [347]
3: [250], [252]
4: [304], [307]
5: [393], [412], [419], [428], [445], [450], [452], [478], [523], [531], [558], [565], [579], [601]
6: [436], [439]
7: [485], [490]
8: [505], [521]
9: [509], [515]
10: [632], [639]
11: [635], [651], [660], [662], [685], [691], [703], [708]
12: [663], [669], [671], [679]
13: [715], [725], [735], [737], [741], [751], [758], [854], [923], [928], [937], [954], [963], [966], [969], [980], [986], [993], [1001], [1021], [1026], [1041], [1045]
14: [717], [720], [731], [753], [761]
15: [820], [868]
16: [839], [843]
17: [846], [872]
18: [906], [925]
19: [914], [918], [939], [946]
20: [970], [982]
21: [1006], [1033]
22: [1042], [1062]
23: [1078], [1095]
24: [1099], [1158], [1216], [1225], [1234], [1236], [1241], [1288], [1366], [1374], [1376], [1382], [1399], [1405], [1414], [1421], [1477], [1479], [1542]
25: [1113], [1154], [1191], [1201

In [34]:
import re

def normalize_name(name):
    """Return a canonical, comparison-friendly form of a character name.

    The name is lower-cased, a trailing possessive ('s or the typographic ’s)
    is removed from each word, remaining punctuation is dropped, and every run
    of whitespace (including line breaks inside a name) is collapsed to a
    single space.

    Parameters
    ----------
    name : str
        Raw entity text, e.g. "Jack Aubrey's".

    Returns
    -------
    str
        Normalized name, e.g. "jack aubrey".
    """
    name = name.lower()
    # Strip possessives written with either a straight or a curly apostrophe;
    # without the curly variant, "aubrey’s" would become "aubreys" in the next step.
    name = re.sub(r"['\u2019]s\b", "", name)
    name = re.sub(r"[^\w\s]", "", name)  # Remove remaining punctuation
    # Collapse internal whitespace so a name split across lines matches the same
    # name written on one line, then trim the ends.
    name = re.sub(r"\s+", " ", name).strip()
    return name

# Add a column holding the normalized form of every extracted character name
character_df['character_normalized'] = character_df['character'].map(normalize_name)

In [35]:
character_df

Unnamed: 0,book,character,character_normalized
0,Master and Commander,Locatelli,locatelli
1,Master and Commander,Jack Aubrey's,jack aubrey
2,Master and Commander,Jack,jack
7,Master and Commander,Mrs Harte,mrs harte
8,Master and Commander,Jack Aubrey,jack aubrey
9,Master and Commander,Saturn,saturn
10,Master and Commander,Molly Harte,molly harte
19,Master and Commander,Allen,allen
25,Master and Commander,Hunks,hunks
26,Master and Commander,Samuel Allen,samuel allen


In [64]:
# Normalized strings that the NER model tags as PERSON but that are not characters
# (places, composers, titles, ship terms, ...). Entries must be in the same
# lower-case, punctuation-free form as the 'character_normalized' column.
# One entry per line, each followed by a comma: a missing comma makes Python
# silently concatenate two adjacent string literals into a single entry.
blacklist = {
    "his majesty",
    "sloop",
    "locatelli",
    "finisterre",
    "saturn",
    "esqr",
    "alien",
    "boccherini",
    "port mahon",
    "mahon",
    "tarpaulin hats",
    "pigtail steps",
    "gib",
    "majesty",
    "thankee",
    "canaletto",
    "alicante",
    "mozart",
}  # add more as needed
# Keep only the rows whose normalized name is not in the blacklist
character_df = character_df.loc[
    lambda frame: ~frame['character_normalized'].isin(blacklist)
]

In [65]:
# Report how many rows remain in character_df after filtering
print(character_df.shape[0])

750
