In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import matplotlib.pyplot as plt
import os

In [2]:
# Download the English NLP model (you only need to do this once per environment)
# !python3 -m spacy download en_core_web_sm

In [3]:
# Load spaCy's small English language model; the resulting pipeline object is called on raw text below
NER = spacy.load("en_core_web_sm")

In [5]:
# Collect every plain-text book file in the data directory.
# - `endswith('.txt')` checks the extension only; a substring test would also
#   match names such as 'notes.txt.bak'.
# - `is_file()` skips directories, which could not be opened as books anyway.
# - The `with` block closes the scandir iterator as soon as the list is built.
# NOTE: os.scandir yields entries in arbitrary order, so the position of a
# given book in this list is not guaranteed across machines.
with os.scandir('../data') as data_dir_entries:
    all_books = [b for b in data_dir_entries if b.is_file() and b.name.endswith('.txt')]
all_books

[<DirEntry 'Master-and-Commander.txt'>, <DirEntry 'H.M.S.-Surprise.txt'>]

In [6]:
# Load the first book found in the data directory and run it through the spaCy pipeline.
book = all_books[0]
# The context manager closes the file handle deterministically, and the explicit
# UTF-8 encoding matches how the book files are read elsewhere in this notebook
# instead of depending on the platform's default encoding.
with open(book, 'r', encoding='utf-8') as book_file:
    book_text = book_file.read()
book_doc = NER(book_text)

In [7]:
# Show spaCy's entity highlighting inline for the first 200 tokens of the processed book
preview_tokens = book_doc[0:200]
displacy.render(preview_tokens, style="ent", jupyter=True)

In [None]:
# Run NER over each book and collect every PERSON entity mention into a dataframe.
# Each row is one mention (not one unique character), and the entity text is stored
# exactly as spaCy returns it.
book_files = [
    ('Master and Commander', '../data/Master-and-Commander.txt'),
    ('HMS Surprise', '../data/H.M.S.-Surprise.txt')
]

character_data = []
for book_title, file_path in book_files:
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    doc = NER(text)
    # Keep only entities labelled PERSON, tagging each with the book it came from
    character_data.extend(
        {'book': book_title, 'character': ent.text}
        for ent in doc.ents
        if ent.label_ == 'PERSON'
    )

# Name the columns explicitly so the frame still has 'book' and 'character' columns
# when no PERSON entities are found; an empty frame built from an empty list would
# otherwise have no columns and later column-based operations would raise KeyError.
character_df = pd.DataFrame(character_data, columns=['book', 'character'])
character_df

                      book      character
0     Master and Commander      Locatelli
1     Master and Commander  Jack Aubrey's
2     Master and Commander           Jack
3     Master and Commander           Jack
4     Master and Commander           Jack
...                    ...            ...
3912          HMS Surprise           Jack
3913          HMS Surprise           Jack
3914          HMS Surprise        Stephen
3915          HMS Surprise        Stephen
3916          HMS Surprise           Jack

[3917 rows x 2 columns]


In [11]:
# Remove duplicate character names within each book and display all rows.
# The first occurrence of each (book, character) pair is kept, so the original
# row index labels of the surviving rows are preserved.
# NOTE: this display option is global -- every dataframe shown after this cell
# is rendered without row truncation as well.
pd.set_option('display.max_rows', None)
character_df = character_df.drop_duplicates(subset=['book', 'character'])
character_df

Unnamed: 0,book,character
0,Master and Commander,Locatelli
1,Master and Commander,Jack Aubrey's
2,Master and Commander,Jack
7,Master and Commander,Mrs Harte
8,Master and Commander,Jack Aubrey
9,Master and Commander,Saturn
10,Master and Commander,Molly Harte
19,Master and Commander,Allen
25,Master and Commander,Hunks
26,Master and Commander,Samuel Allen
