In [None]:
import pandas as pd
import numpy as np
import os
import spacy
from spacy import displacy
from tqdm.notebook import tqdm
import networkx as nx


In [None]:
# Accumulates one relationship graph per book, in the order of `all_books`.
books_graph = []

# Collect the book files. A suffix test is used instead of a substring test so
# that names such as "1.txt.bak" are not picked up, directories are skipped,
# and the context manager closes the scandir iterator deterministically.
with os.scandir('/directory') as dir_entries:
    all_books = [b for b in dir_entries
                 if b.is_file() and b.name.endswith('.txt')]

# Sort dir entries numerically by the part of the name before the first "."
# (so "10.txt" comes after "2.txt"). A non-numeric prefix raises ValueError.
all_books.sort(key=lambda x: int(x.name.split(".")[0]))

In [None]:
# Loaded spaCy pipelines keyed by model name, so that calling ner() once per
# book does not reload the same model from disk every time.
_SPACY_MODEL_CACHE = {}


def ner(file_name, model_name="en_core_web_sm", max_length=2185000, encoding="utf-8"):
    """
    Function to process text from a text file (.txt) using Spacy.

    Params:
    file_name -- path of a txt file (anything accepted by the built-in open())
    model_name -- name of the Spacy model to load (default "en_core_web_sm")
    max_length -- value assigned to the pipeline's max_length attribute before
                  processing (default 2185000)
    encoding -- text encoding used to decode the file (default "utf-8");
                pass None to fall back to the platform default

    Returns:
    the result of calling the loaded Spacy pipeline on the file's full text

    """
    # Load the Spacy language model only on first use of this model name
    nlp = _SPACY_MODEL_CACHE.get(model_name)
    if nlp is None:
        nlp = spacy.load(model_name)
        _SPACY_MODEL_CACHE[model_name] = nlp

    # Set on every call because the cached pipeline object is shared
    nlp.max_length = max_length

    # The context manager closes the file handle even if reading fails
    with open(file_name, encoding=encoding) as book_file:
        book_text = book_file.read()

    return nlp(book_text)


def get_ne_list_per_sentence(spacy_doc):
    """
    Collect the named entities found in each sentence of a Spacy document.

    Params:
    spacy_doc -- a Spacy processed document (its `sents` are iterated, and the
                 `ents` of each sentence are read)

    Returns:
    a dataframe with one row per sentence and two columns: "sentence" (the
    sentence object itself) and "entities" (the list of entity texts found in
    that sentence)
    """
    # One record per sentence; the dataframe is built once from all records
    rows = [
        {"sentence": sentence,
         "entities": [entity.text for entity in sentence.ents]}
        for sentence in spacy_doc.sents
    ]
    return pd.DataFrame(rows)


def filter_entity(ent_list, character_df, locations):
    """
    Function to filter out non-character entities.

    An entity is kept when it matches a character's name or first name AND is
    not a known location. Order and duplicates of ent_list are preserved.

    Params:
    ent_list -- list of entities to be filtered
    character_df -- a dataframe with the columns `character` (characters' names)
                    and `character_firstname` (characters' first names)
    locations -- list of locations

    Returns:
    a list of entities that are characters (matching by names or first names)
    and that are not in locations.

    """
    # Build the lookups once per call rather than once per entity
    character_names = set(character_df.character) | set(character_df.character_firstname)
    location_names = set(locations)

    # The location test must apply to both kinds of name match. Written as
    # `a or b and c`, Python groups it as `a or (b and c)`, which would let a
    # full-name match through even when it is a location.
    return [ent for ent in ent_list
            if ent in character_names and ent not in location_names]


def create_relationships(df, window_size):

    """
    Create a dataframe of relationships based on the df dataframe (containing lists of characters per sentence) and the window size of n sentences.

    For every start label i in range(df.index[-1]), the rows whose index label
    lies between i and min(i + window_size, df.index[-1]) (both inclusive) form
    one window. The character lists of those rows are concatenated, consecutive
    repeats of the same character are collapsed, and every pair of neighbouring
    characters in the result is recorded as one relationship. Pairs are counted
    regardless of direction (a->b and b->a are the same relationship).

    Params:
    df -- a dataframe containing a column called character_entities with the list of characters for each sentence of a document. Its index labels are used as sentence positions (assumes a sorted integer index).
    window_size -- size of the windows (number of sentences) for creating relationships between two adjacent characters in the text.

    Returns:
    a relationship dataframe containing 3 columns: source, target, value.
    It has those columns and no rows when df is empty or no pair is found.

    """

    result_columns = ["source", "target", "value"]

    # An empty frame has no last label; `df.index[-1]` would raise IndexError
    if len(df.index) == 0:
        return pd.DataFrame(columns=result_columns)

    last_label = df.index[-1]
    relationships = []

    for start in range(last_label):
        end = min(start + window_size, last_label)
        # Label-based slice: both ends are inclusive
        window = df.loc[start:end, "character_entities"]
        # Flatten the per-sentence lists into one list of characters
        char_list = [name for names in window for name in names]

        # Remove duplicated characters that are next to each other
        char_unique = [name for pos, name in enumerate(char_list)
                       if pos == 0 or name != char_list[pos - 1]]

        # Every pair of neighbours is one relationship (none if fewer than 2)
        for a, b in zip(char_unique, char_unique[1:]):
            relationships.append({"source": a, "target": b})

    # Without any pair there are no source/target columns to group by
    if not relationships:
        return pd.DataFrame(columns=result_columns)

    relationship_df = pd.DataFrame(relationships)
    # Sort the cases with a->b and b->a so both land on the same (source, target)
    relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis=1),
                                   columns=relationship_df.columns)
    relationship_df["value"] = 1
    # sort=False keeps the pairs in order of first appearance
    relationship_df = relationship_df.groupby(["source", "target"],
                                              sort=False,
                                              as_index=False).sum()

    return relationship_df

In [None]:
def _first_name(full_name):
    """Return the part of full_name that precedes its first space."""
    return full_name.split(' ', 1)[0]


# Read the character table from characters_wot.csv and derive a first-name
# column from the `character` column
character_df = pd.read_csv("/directory/characters_wot.csv")
character_df['character_firstname'] = character_df['character'].map(_first_name)

In [None]:
# List of locations extracted from the scraping of locations and nations.
# The order and the repeated entries ('Almoth Plain', 'Mad Lands') are kept
# exactly as scraped; the list is only used for membership tests.
locations = [
    # First alphabetical run
    'Aiel Waste', 'Almoth Plain', 'Arindrim', 'Asnelle',
    'Black Hills', 'Blasted Lands',
    'Caralain Grass', 'Comaidin Riots',
    'Drowned Lands',
    'Ganai', 'Great Blight',
    'Haddon Mirk', 'Hills of Absher', 'Hills of Kintara',
    'Kaensada Hills', "Kinslayer's Dagger", 'Kiranaille', 'Kunwar',
    'Larcheen',
    'Mad Lands', 'Maram Kashor',
    'Plain of Lances', 'Plains of Maredo', 'Pujili',
    "Sa'las Plains", 'Shadow Coast', 'Shahayni', 'Sharaman', "Strangers' Markets",
    'Termool', 'Toman Head', 'Two Rivers',
    "World's End",
    # Second alphabetical run (presumably the scraped nations -- TODO confirm)
    'Almoth Plain', 'Altara', 'Amadicia', 'Amayar', 'Andor', 'Arad Doman', 'Arafel', "Atha'an Miere",
    'Borderlands',
    'Cairhien',
    'Ghealdan',
    'Illian',
    'Kandor',
    'Mad Lands', 'Malkier', 'Murandy',
    'Saldaea', 'Seanchan', 'Shara', 'Shienar',
    'Tarabon', 'Tear',
    'Westlands',
]

In [None]:
# Loop through book list and create one relationship graph per book
for book in tqdm(all_books, total=len(all_books)):
    # Despite its name, book_text holds the processed document returned by ner()
    book_text = ner(book)

    # Get list of entities per sentences
    sent_entity_df = get_ne_list_per_sentence(book_text)

    # Select only character entities
    sent_entity_df['character_entities'] = sent_entity_df['entities'].apply(
        lambda x: filter_entity(x, character_df, locations))

    # Filter out sentences that don't have any character entities.
    # The explicit .copy() makes the filtered frame independent of
    # sent_entity_df, so the column assignment below does not trigger
    # SettingWithCopyWarning.
    has_character = sent_entity_df['character_entities'].map(len) > 0
    sent_entity_df_filtered = sent_entity_df[has_character].copy()

    # Take only first name of characters, so a full name and a first name
    # referring to the same character end up as the same graph node
    sent_entity_df_filtered['character_entities'] = sent_entity_df_filtered['character_entities'].apply(
        lambda x: [item.split()[0] for item in x])

    # Create relationship df
    relationship_df = create_relationships(df = sent_entity_df_filtered, window_size = 5)

    # Create an undirected graph from the relationship dataframe; the
    # "value" column becomes an edge attribute
    G = nx.from_pandas_edgelist(relationship_df,
                                source = "source",
                                target = "target",
                                edge_attr = "value",
                                create_using = nx.Graph())

    books_graph.append(G)

In [None]:
# Convert every book graph back into an edge-list dataframe, in book order
dataframes = [nx.to_pandas_edgelist(graph) for graph in books_graph]

# Write each edge list to dataframe_<position>.csv in the working directory,
# without the index column; positions start at 0
for i, dataframe in enumerate(dataframes):
    csv_name = f"dataframe_{i}.csv"
    dataframe.to_csv(csv_name, index=False)
