In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy.tokens import DocBin
import networkx as nx
import matplotlib.pyplot as plt
import os

Load character names

In [55]:
char_df = pd.read_csv('data/char_name.csv')
# Removing the names inside brackets, making a new column with first names and removing duplicated names
import re

char_df['character'] = char_df['character'].apply(lambda x: re.sub(r'\(.*?\)', '', x))
char_df['char_firstname'] = char_df['character'].apply(lambda x: x.split(' ', 1)[0])
char_df.drop_duplicates(subset=['character'], inplace=True)

Load spacy English language model

In [3]:
NER = spacy.load("en_core_web_sm")
NER.max_length = 2000000

Load the books

In [4]:
all_books = [b for b in os.scandir('data') if '.txt' in b.name]

In [5]:
book_path = all_books[0]
# Read the text of the book with UTF-8 encoding
with open(book_path, "r", encoding="utf-8") as file:
    book_text = file.read()
book_doc = NER(book_text)

Get entity names per sentence  

In [30]:
sent_entity_df = []
for sent in book_doc.sents:
    entity_list = [ent.text for ent in sent.ents]
    sent_entity_df.append({'sentence': sent, 'entities':entity_list})

sent_entity_df = pd.DataFrame(sent_entity_df)

Applying filter to extract the names and positions 

In [53]:
# Function to filter out the non-character entities

def filter_entity(ent_list, char_df):
    filtered_entities = []
    for entity in ent_list:
        # Check if entity exactly matches any DataFrame value after removing parentheses
        if (char_df['character'] == entity).any() or \
           (char_df['char_firstname'] == entity).any():
            filtered_entities.append(entity)
    return filtered_entities if filtered_entities else []
    

In [59]:
# Applying the filter in the entire DF
sent_entity_df['char_ent'] = sent_entity_df['entities'].apply(lambda x: filter_entity(x, char_df))

In [60]:
# Removing the empyt entities and getting only the first names
sent_entity_df_filtered =  sent_entity_df[sent_entity_df['char_ent'].map(len) > 0]
sent_entity_df_filtered['char_ent'] = sent_entity_df_filtered['char_ent'].apply(lambda x: [item.split()[0] for item in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sent_entity_df_filtered['char_ent'] = sent_entity_df_filtered['char_ent'].apply(lambda x: [item.split()[0] for item in x])


Creating relatioships

In [68]:
window_size = 5
relationship = []

for i in range(sent_entity_df_filtered.index[-1]):
    end_i = min(i+5, sent_entity_df_filtered.index[-1])
    char_list = sum((sent_entity_df_filtered.loc[i:end_i].char_ent), [])
    # Remove duplciated char in the same sentence
    char_unique = [char_list[i] for i in range(len(char_list))
    if (i == 0 ) or char_list[i] != char_list[i-1]]

    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx+1]
            relationship.append({'source': a,'target': b})



In [90]:
relationship_df = pd.DataFrame(relationship)
relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis=1), columns= relationship_df.columns)

In [91]:
relationship_df['value'] = 1
relationship_df = relationship_df.groupby(['source','target'], sort= False, as_index=False).sum()
relationship_df.head(50)


Unnamed: 0,source,target,value
0,Gared,Waymar,42
1,Gared,Royce,11
2,Royce,Waymar,4
3,Royce,Will,3
4,Robert,Waymar,5
5,Mance,Old,4
6,Eddard,Theon,6
7,Jory,Theon,20
8,Jory,Robert,28
9,Jon,Robert,106
