# Data Preprocessing

## Importing Libraries

In [132]:
# install libraries
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install networkx
!pip install nltk
!pip install scipy
!pip install pyvis

# things to do:
# add all character names and aliases
# change names to darrow-red, etc.
# add other books from first series.  In second series, break books up by chapter ("I" changes based on narrator)
# add books from second series
# When making graph, remove all nodes with no edges



In [133]:
#Library catalogue
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
import networkx as nx
import re
import math

from pyvis.network import Network

import matplotlib.pyplot as plt
import scipy.stats as stats
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import ne_chunk, pos_tag

nltk.download('maxent_ne_chunker')

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')

#--NotebookApp.iopub_data_rate_limit=1.0e10


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Logan\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Logan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Logan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Logan\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

## Import All Books

In [134]:
with open('rr.txt', 'r', encoding = 'utf-8') as rr:
  rr_text = rr.read()

with open('gs.txt', 'r', encoding = 'utf-8') as gs:
  gs_text = gs.read()

with open('ms.txt', 'r', encoding = 'utf-8') as ms:
  ms_text = ms.read()

with open('ig.txt', 'r', encoding = 'utf-8') as ig:
  ig_text = ig.read()

with open('da.txt', 'r', encoding = 'utf-8') as da:
  da_text = da.read()

with open('lb.txt', 'r', encoding = 'utf-8') as lb:
  lb_text = lb.read()

text_array = [rr_text, gs_text, ms_text, ig_text, da_text, lb_text]

# Define Functions

## Name Mapping

In [135]:
# Remove joined words
def remove_joined_words(text):
    # Remove unusual characters: hyphons, apostrophes, commas, periods, quotation marks, etc.
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    return cleaned_text

def preprocess_names(text, name_mapping):
  for variation, actual_name in name_mapping.items():
    text = text.replace((" " + variation + " "), (" " + actual_name + " "))
  return text

name_mapping = {
    # alphabetical by first name

    # A
    "Abomination": "adrius-gold", "Adrius": "adrius-gold", "Jackal": "adrius-gold", 
    "Aeneas": "aeneas-gold",
    "Aevius": "aevius-gold",
    "Agala": "agala-gold",
    "Agrippina": "agrippina-gold",
    "Aja": "aja-gold", 
    "Ajax": "ajax-gold",
    "Akari": "akari-gold",
    "Alexandar": "alexandar-gold",
    "Alia": "alia-obsidian", 
    "Alred": "alred-red", 
    "Amel": "amel-pink", 
    "Anastasia": "anastasia-gold", 
    "Anicetus": "anicetus-gold", 
    "Anthousa": "anthousa-copper", 
    "Antonia": "antonia-gold", 
    "Apollonius": "apollonius-gold", "Minotaur": "apollonius-gold",
    "Arlus": "titus-red", "Titus": "titus-red", 
    "Arminius": "arminius-gold",
    "Arria": "arria-gold",
    "Arturius": "arturius-gold",
    "Asmodeus": "asmodeus-gold",
    "Atalantia": "atalantia-gold",
    "Athena": "athena-red",
    "Atlas": "atlas-gold", "Fear Knight": "atlas-gold",
    "Aurae": "aurae-pink",

    # B
    "Barlow": "barlow-red",
    "Bellagra": "bellagra-gold",
    "Bellerophon": "bellerophon-gold",
    "Bondilus": "bondilus-copper",
    "Braga": "braga-obsidian",
    "Brea": "brea-red",
    "Bridge": "bridge-gray",
    "Britannia": "britannia-silver",
    "Brutus": "brutus-gold",
    "Bryn": "bryn-red", 
    
    # C
    "Cadus": "cadus-orange", "Harnassus": "cadus-orange",
    "Cagney": "cagney-gold",
    "Calliope": "calliope-pink",
    "Calypso": "calypso-gold",
    "Cassandra": "cassandra-gold",
    "Cassius": "cassius-gold",
    "Cedric": "cedric-copper",
    "Cicero": "cicero-gold", 
    "Cipio": "cipio-gold",
    "Claudius": "claudius-gold",
    "Clintus": "clintus-gold",
    "Clown": "clown-gold",
    "Colloway": "colloway-blue",
    "Cormac": "cormac-red",
    "Corthus": "corthus-red",
    "Crastus": "crastus-gold",
    "Croissy": "croissy-yellow",
    "Cylax": "cylax-green",
    "Cylus": "cylus-gold",
    "Cyra": "cyra-green",
    "Cyther": "cyther-orange",

    # D
    "Dago": "dago-red",
    "Dale": "dale-red",
    "Dancer": "dancer-red",
    "Dano": "dano-red",
    "Danto": "danto-gray",
    "Daria": "daria-gold", "Harpy": "daria-gold",
    "Darrow": "darrow-red", "Reaper": "darrow-red", "Andromedus": "darrow-red", "DARROW": "darrow-red",
    "Dax": "dax-gold",
    "Daxo": "daxo-gold",
    "Deanna": "deanna-red", 
    "Demetrius": "demetrius-gray",
    "Diana-Selene": "diana_selene-gold",
    "Dido": "dido-gold",
    "Dio": "dio-red",
    "Diomedes": "diomedes-gold",
    "Doran": "doran-red",
    "Drusilla": "drusilla-gold",
    "Drusus": "drusus-gold",
    "Duke of Hands": "duke_of_hands-pink", "Hands": "duke_of_hands-pink",
    "Duncan": "duncan-red", 

    # E
    "Electra": "electra-gold",
    "Eo": "eo-red",
    "Ephraim": "ephraim-gray", "Eph": "ephraim-gray", "EPHRAIM": "ephraim-gray",
    "Evey": "evey-pink",
    "Exeter": "exeter-brown",
    
    # F
    "Fabera": "fabera-gold",
    "Fachnan": "fachnan-red",
    "Felix": "felix-gold",
    "Fenix": "fenix-gray",
    "Fel": "fel-red",
    "Fitchner": "fitchner-gold", "Proctor Mars": "fitchner-gold", "Ares": "fitchner-gold",
    "Freihild": "freihild-obsidian",
    
    # G
    "Gaianna": "gaianna-gold",
    "Gaia": "gaia-gold",
    "Glaucus": "glaucus-red",
    "Glirastes": "glirastes-orange",
    "Gorgo": "gorgo-obsidian",
    "Goroth": "goroth-obsidian",
    "Gregarius": "gregarius-gold",
    "Gudkind": "gudkind-obsidian",
    
    # H
    "Hadrian": "hadrian-gold",
    "Harmony": "harmony-red",
    "Hecuba": "hecuba-gold",
    "Helios": "helios-gold",
    "Hjornir": "hjornir-obsidian",
    "Holiday": "holiday-gray", "Nakamura": "holiday-gray",
    "Horatia": "horatia-gold",
    "Horatius": "horatius-gold", "screwface": "horatius-gold",
    
    # I
    "Ignacius": "ignacius-gold",
    "Iona": "iona-gold",
    "Irenia": "irenia-gold",
    "Iro": "iro-red",
    
    # J
    "Janus": "janus-gold",
    "John Merrywater": "john_merrywater-unknown",
    "Julian": "julian-gold",
    "Julia": "julia-gold",
    "Julius": "julius-gold",
    "June": "june-gold",
    
    # K
    "Kal": "kal-silver",
    "Kalindora": "kalindora-gold",
    "Karnus": "karnus-gold",
    "Kavax": "kavax-gold",
    "Kieran": "kieran-red",
    "Kobachi": "kobachi-green",
    
    # L
    "Lana": "lana-red",
    "Lea": "lea-gold",
    "Leanna": "leanna-red",
    "Leto": "leto-gold",
    "Liago": "liago-yellow",
    "Liam": "liam-red",
    "Licenus": "licenus-copper",
    "Lilath": "lilath-gold",
    "Loran": "loran-red",
    "Lorn": "lorn-gold", 
    "Lottie": "lottie-brown",
    "Lucius": "lucius-gold",
    "Lyria": "lyria-red", "LYRIA": "lyria-red",
    "Lysander": "lysander-gold", "LYSANDER": "lysander-gold",
    
    # M
    "Maeve": "maeve-red",
    "Magnus": "magnus-gold",
    "Marbles": "marbles-green",
    "Marcel": "marcel-gray",
    "Marcus": "marcus-gold",
    "Marius": "marius-gold",
    "Markus": "markus-gray",
    "Matteo": "matteo-pink",
    "Mickey": "mickey-violet",
    "Milia": "milia-gold",
    "Min-Min": "min_min-gold",
    "Modjob": "modjob-brown",
    "Moira": "moira-gold",
    "Mora": "mora-red",
    "Murani": "murani-unknown",
    
    # N
    "Narol": "narol-red",
    "Nero": "nero-gold",
    "Nexus": "nexus-gold",
    "Nicator": "nicator-copper",
    "Niobe": "niobe-gold",
    "Novas": "novas-gold",
    "Nyla": "nyla-gold",
    
    # O
    "Octavia": "octavia-gold", "Soverign": "octavia-gold",
    "Ophelia": "ophelia-gold",
    "Orion": "orion-blue",
    "Oro": "oro-blue",
    "Oslo": "oslo-white",
    "Oxis": "oxis-orange",
    "Ozgard": "ozgard-obsidian",
    
    # P
    "Paleron": "paleron-gold",
    "Pandora": "pandora-gold",
    "Pax": "pax_telemanus-gold",
    "Pax": "pax_augustus-gold",
    "Paxton": "paxton-gray",
    "Pebble": "pebble-gold",
    "Picker": "picker-red",
    "Pliny": "pliny-gold",
    "Pollux": "pollux-gold",
    "Priam": "priam-gold",
    "Proctor Apollo": "apollo-gold",
    "Proctor Bacchus": "bacchus-gold",
    "Proctor Ceres": "ceres-gold",
    "Proctor Diana": "diana-gold",
    "Proctor Juno": "juno-gold",
    "Proctor Jupiter": "jupiter-gold",
    "Proctor Mercury": "mercury-gold",
    "Prospera": "prospera-gold",
    "Publius": "publius-copper",
    "Pytha": "pytha-blue",
    
    # Q
    "Quinn": "quinn-gold",
    
    # R
    "Ragnar": "ragnar-obsidian",
    "Ralp": "ralph-red",
    "Reagan": "reagan-red",
    "Regulus": "regulus-silver", "Quicksilver": "regulus-silver",
    "Revus": "revus-gold",
    "Rhone": "rhone-gray",
    "Rhonna": "rhonna-red",
    "Rollo": "rollo-red",
    "Romulus": "romulus-gold",
    "Roque": "roque-gold",
    "Roran": "roran-red",
    "Rotback": "rotback-gold",
    "Ryanna": "ryanna-red",
    
    # S
    "Scipio": "scipio-gold",
    "Scorpio": "scorpio-gold",
    "Sefi": "sefi-obsidian",
    "Seneca": "seneca-gold", # there are 2 senecas but one isn't very relevant
    "Seraphina": "seraphina-gold",
    "Servilla": "servilla-gold",
    "Sevro": "sevro-gold",
    "Silenius": "silenius-gold",
    "Skarde": "skarde-obsidian",
    "Sleepy": "sleepy-red",
    "Stefano": "stefano-gray",
    
    # T
    "Tactus": "tactus-gold",
    "Tails": "tails-red",
    "Tamara": "tamara-gold",
    "Tania": "tania-gold",
    "Thalia": "thalia-gold",
    "Tharsus": "tharsus-gold",
    "The Figment": "fig-copper", "fig": "fig-copper",
    "Theodora": "theodora-pink",
    "Theron": "theron-gold",
    "Thesalia": "thesalia-gold",
    "Thistle": "thistle-gold",
    "Thraxa": "thraxa-gold",
    "Tiberius": "tiberius-gold",
    "Timony": "timony-copper",
    "Tiran": "tiran-red",
    "Tongueless": "tongueless-obsidian",
    "Torrow": "torrow-red",
    "Trigg": "trigg-gray",
    
    # U
    "Ugly Dan": "dan-gray", "Dan": "dan-gray",
    "Ulysses": "ulysses-gold",
    
    # V
    "Valdir": "valdir-obsidian",
    "Valeria": "valeria-gold",
    "Vanna": "vanna-red",
    "Varlo": "varlo-red",
    "Varus": "varus-gold",
    "Vela": "vela-gold",
    "Venetia": "venetia-gold",
    "Victra": "victra-gold",
    "Videlia": "videlia-copper",
    "Virginia": "virginia-gold", "Mustang": "virginia-gold", "VIRGINIA": "virginia-gold",
    "Vixus": "vixus-gold",
    "Volga": "volga-obsidian",
    "Volsung": "volsung_fa-obsidian", "Fá": "volsung_fa-obsidian",
    
    # W
    "Weed": "weed-gold",
    "Winkle": "winkle-green",
    "Wulfgar": "wulfgar-obsidian",
    
    # X
    "Xana": "xana-gold",
    "Xenophon": "xenophon-white",
    
    # Z
    "Zan": "zan-blue",
    "Zanzibar": "zanzibar-violet",
    "Zarubal": "zarubal-violet",
}


# Remove joined words with hex codes and forward slashes
for i in range(len(text_array)):
  text_array[i] = remove_joined_words(text_array[i])




## Deal with POV Chapters

In [136]:
# in rr, gs, and ms, we convert each " I " to "darrow-red" 
# in ig, da, and lb, each chapter has a different POV.  We convert each " I " to the POV character's name.
# we split each book into chapters, and convert the " I " values individually before using the rest of the code
# each change in pov has "POV-DARROW", "POV-LYSANDER", "POV-LYRIA", or "POV-EPHRAIM".  We can use this to split the chapters

for i in range(0, 3):
  text_array[i] = text_array[i].replace(" I ", " darrow-red ")
  text_array[i] = text_array[i].replace(" my ", " darrow-red ")
  text_array[i] = text_array[i].replace(" me ", " darrow-red ")
  text_array[i] = text_array[i].replace(" we ", " darrow-red ")
  text_array[i] = text_array[i].replace(" Pax ", " pax_telemanus-gold ")

# split by "POV-name", and then replace " I " with "name" for each
ig_chapters = ig_text.split("POV-")
# now first line of each section is pov character name
for i in range(1, len(ig_chapters)):
  ig_chapters[i] = ig_chapters[i].replace(" I ", " " + ig_chapters[i].split("\n")[0] + " ")
  ig_chapters[i] = ig_chapters[i].replace(" my ", " " + ig_chapters[i].split("\n")[0] + " ")
  ig_chapters[i] = ig_chapters[i].replace(" me ", " " + ig_chapters[i].split("\n")[0] + " ")
  ig_chapters[i] = ig_chapters[i].replace(" we ", " " + ig_chapters[i].split("\n")[0] + " ")

da_chapters = da_text.split("POV-")
for i in range(1, len(da_chapters)):
  da_chapters[i] = da_chapters[i].replace(" I ", " " + da_chapters[i].split("\n")[0] + " ")
  da_chapters[i] = da_chapters[i].replace(" my ", " " + da_chapters[i].split("\n")[0] + " ")
  da_chapters[i] = da_chapters[i].replace(" me ", " " + da_chapters[i].split("\n")[0] + " ")
  da_chapters[i] = da_chapters[i].replace(" we ", " " + da_chapters[i].split("\n")[0] + " ")
  #print(da_chapters[i].split("\n")[0])

lb_chapters = lb_text.split("POV-")
for i in range(1, len(lb_chapters)):
  lb_chapters[i] = lb_chapters[i].replace(" I ", " " + lb_chapters[i].split("\n")[0] + " ")
  lb_chapters[i] = lb_chapters[i].replace(" my ", " " + lb_chapters[i].split("\n")[0] + " ")
  lb_chapters[i] = lb_chapters[i].replace(" me ", " " + lb_chapters[i].split("\n")[0] + " ")
  lb_chapters[i] = lb_chapters[i].replace(" we ", " " + lb_chapters[i].split("\n")[0] + " ")
  #print(lb_chapters[i].split("\n")[0])

# recombine into text_array, use 30 x's to separate chapters
text_array[3] = " \n x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x \n ".join(ig_chapters)
text_array[4] = " \n x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x \n ".join(da_chapters)
text_array[5] = " \n x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x \n ".join(lb_chapters)


In [137]:
print(text_array[3])


Dramatis Personae

  The Fall of Mercury
  Part I: Wind
    Chapter 1: Darrow
    Chapter 2: Darrow
    Chapter 3: Darrow
    Chapter 4: Lyria
    Chapter 5: Lyria
    Chapter 6: Ephraim
    Chapter 7: Ephraim
    Chapter 8: Lysander
    Chapter 9: Lysander
    Chapter 10: Darrow
    Chapter 11: Darrow
    Chapter 12: Lyria
    Chapter 13: Lyria
    Chapter 14: Ephraim
    Chapter 15: Lysander
    Chapter 16: Darrow
    Chapter 17: Lyria
    Chapter 18: Ephraim
 Chapter 19: Ephraim
 Chapter 20: Lysander
 Chapter 21: Darrow
Part II: Shadow
 Chapter 22: Lysander
 Chapter 23: Lyria
 Chapter 24: Ephraim
 Chapter 25: Lysander
 Chapter 26: Lysander
 Chapter 27: Darrow
 Chapter 28: Darrow
 Chapter 29: Lyria
 Chapter 30: Darrow
 Chapter 31: Ephraim
 Chapter 32: Lysander
 Chapter 33: Lysander
 Chapter 34: Darrow
 Chapter 35: Lyria
 Chapter 36: Lysander
 Chapter 37: Lysander
 Chapter 38: Lysander
 Chapter 39: Ephraim
Part III: Dust
 Chapter 40: Lysander
 Chapter 41: Lysander
 Chapter 42: Ephra

## Preprocess text

In [138]:
processed_text = []
# Preprocess the text
for i in range(len(text_array)):
  processed_text.append(preprocess_names(text_array[i], name_mapping))

In [139]:
print(processed_text[4])

  Dark Age is a work of fiction. Names, places, and incidents either are products of the author’s
imagination or are used fictitiously. Any resemblance to actual events, locales, or persons, living or
                                    dead, is entirely coincidental.

                               Copyright © 2019 by Pierce Brown
                           Map copyright © 2019 by Joel Daniel Phillips

                                         All rights reserved.

  Published in the United States by Del Rey, an imprint of Random House, a division of Penguin
                                 Random House LLC, New York.

 DEL REY and the HOUSE colophon are registered trademarks of Penguin Random House LLC.

                                Hardback ISBN 9780425285947
                           International edition ISBN 9781984817501
                                  Ebook ISBN 9780425285954

                                      randomhousebooks.com

                     Book design by

## Tokenize books

In [140]:
# tokenize books; will do each manually here

rr_sentences = sent_tokenize(processed_text[0])
rr_words = [word_tokenize(rr_sentence) for rr_sentence in rr_sentences]
rr_tagged_words = [pos_tag(rr_word_list) for rr_word_list in rr_words]

In [141]:
gs_sentences = sent_tokenize(processed_text[1])
gs_words = [word_tokenize(gs_sentence) for gs_sentence in gs_sentences]
gs_tagged_words = [pos_tag(gs_word_list) for gs_word_list in gs_words]

In [142]:
ms_sentences = sent_tokenize(processed_text[2])
ms_words = [word_tokenize(ms_sentence) for ms_sentence in ms_sentences]
ms_tagged_words = [pos_tag(ms_word_list) for ms_word_list in ms_words]

In [143]:
ig_sentences = sent_tokenize(processed_text[3])
ig_words = [word_tokenize(ig_sentence) for ig_sentence in ig_sentences]
ig_tagged_words = [pos_tag(ig_word_list) for ig_word_list in ig_words]

In [144]:
da_sentences = sent_tokenize(processed_text[4])
da_words = [word_tokenize(da_sentence) for da_sentence in da_sentences]
da_tagged_words = [pos_tag(da_word_list) for da_word_list in da_words]

In [145]:
lb_sentences = sent_tokenize(processed_text[5])
lb_words = [word_tokenize(lb_sentence) for lb_sentence in lb_sentences]
lb_tagged_words = [pos_tag(lb_word_list) for lb_word_list in lb_words]

## Identify and list characters

In [146]:
# get values from name_mapping, remove duplicates
characters = name_mapping.values()
characters = list(dict.fromkeys(characters))
characters.append("pax_telemanus-gold")

In [147]:
for character in characters:
  print(character)

adrius-gold
aeneas-gold
aevius-gold
agala-gold
agrippina-gold
aja-gold
ajax-gold
akari-gold
alexandar-gold
alia-obsidian
alred-red
amel-pink
anastasia-gold
anicetus-gold
anthousa-copper
antonia-gold
apollonius-gold
titus-red
arminius-gold
arria-gold
arturius-gold
asmodeus-gold
atalantia-gold
athena-red
atlas-gold
aurae-pink
barlow-red
bellagra-gold
bellerophon-gold
bondilus-copper
braga-obsidian
brea-red
bridge-gray
britannia-silver
brutus-gold
bryn-red
cadus-orange
cagney-gold
calliope-pink
calypso-gold
cassandra-gold
cassius-gold
cedric-copper
cicero-gold
cipio-gold
claudius-gold
clintus-gold
clown-gold
colloway-blue
cormac-red
corthus-red
crastus-gold
croissy-yellow
cylax-green
cylus-gold
cyra-green
cyther-orange
dago-red
dale-red
dancer-red
dano-red
danto-gray
daria-gold
darrow-red
dax-gold
daxo-gold
deanna-red
demetrius-gray
diana_selene-gold
dido-gold
dio-red
diomedes-gold
doran-red
drusilla-gold
drusus-gold
duke_of_hands-pink
duncan-red
electra-gold
eo-red
ephraim-gray
evey-pink

## Create edges between characters

### rr

In [148]:
# I already have it written for one and I don't really need to make this reusable.
# I'm just going to run it for each book individually

rr_character_mentions = {char: [] for char in characters}
gs_character_mentions = {char: [] for char in characters}
ms_character_mentions = {char: [] for char in characters}
ig_character_mentions = {char: [] for char in characters}
da_character_mentions = {char: [] for char in characters}
lb_character_mentions = {char: [] for char in characters}

# Step 2: Iterate through tagged_words and count individual character mentions
def find_mentions(tagged_words, character_mentions):
    for i, sentence in enumerate(tagged_words):
        token_count = 0
        for word, tag in sentence:
            for char in characters:
                if char.lower() in word.lower():  # Case-insensitive check
                    character_mentions[char].append((i, token_count))  # Store sentence index and token index
            token_count += 1

# ik this is gross sorry
# do yourself a favour and don't even think about the runtime
find_mentions(rr_tagged_words, rr_character_mentions)
find_mentions(gs_tagged_words, gs_character_mentions)
find_mentions(ms_tagged_words, ms_character_mentions)
find_mentions(ig_tagged_words, ig_character_mentions)
find_mentions(da_tagged_words, da_character_mentions)
find_mentions(lb_tagged_words, lb_character_mentions)

# Step 3: Count mentions within 15 tokens for each pair of characters, prevent double counting
edge_mention_counts = {f"{char1} and {char2}": 0 for char1 in characters for char2 in characters if char1 != char2}
rr_mention_counts = edge_mention_counts.copy()
gs_mention_counts = edge_mention_counts.copy()
ms_mention_counts = edge_mention_counts.copy()
ig_mention_counts = edge_mention_counts.copy()
da_mention_counts = edge_mention_counts.copy()
lb_mention_counts = edge_mention_counts.copy()

# function: counts how many times each character is mentioned within a certain distance of each other character
def count_mentions_within_distance(mentions1, mentions2, distance):
    count = 0
    for i1, j1 in mentions1:
        for i2, j2 in mentions2:
            if i1 == i2 and abs(j1 - j2) <= distance:
                count += 1
    return count


# Count mentions within distance=30 tokens for each pair of characters
DISTANCE_THRESHOLD = 30

book_mention_counts = [rr_mention_counts, gs_mention_counts, ms_mention_counts, ig_mention_counts, da_mention_counts, lb_mention_counts]
book_character_mentions = [rr_character_mentions, gs_character_mentions, ms_character_mentions, ig_character_mentions, da_character_mentions, lb_character_mentions]

# for each book, count mentions within distance for each pair of characters
for i in range(6):
  for char1 in characters:
    for char2 in characters:
        if char1 != char2:
            mentions1 = book_character_mentions[i][char1]
            mentions2 = book_character_mentions[i][char2]
            mention_count = count_mentions_within_distance(mentions1, mentions2, DISTANCE_THRESHOLD)
            book_mention_counts[i][f"{char1} and {char2}"] = mention_count


# Sort mention_counts in descending order by count
rr_sorted_mention_counts = dict(sorted(rr_mention_counts.items(), key=lambda item: item[1], reverse=True))
gs_sorted_mention_counts = dict(sorted(gs_mention_counts.items(), key=lambda item: item[1], reverse=True))
ms_sorted_mention_counts = dict(sorted(ms_mention_counts.items(), key=lambda item: item[1], reverse=True))
ig_sorted_mention_counts = dict(sorted(ig_mention_counts.items(), key=lambda item: item[1], reverse=True))
da_sorted_mention_counts = dict(sorted(da_mention_counts.items(), key=lambda item: item[1], reverse=True))
lb_sorted_mention_counts = dict(sorted(lb_mention_counts.items(), key=lambda item: item[1], reverse=True))


# Count the number of times each character is mentioned
rr_character_mentions_count = {char: len(mentions) for char, mentions in rr_character_mentions.items()}
gs_character_mentions_count = {char: len(mentions) for char, mentions in gs_character_mentions.items()}
ms_character_mentions_count = {char: len(mentions) for char, mentions in ms_character_mentions.items()}
ig_character_mentions_count = {char: len(mentions) for char, mentions in ig_character_mentions.items()}
da_character_mentions_count = {char: len(mentions) for char, mentions in da_character_mentions.items()}
lb_character_mentions_count = {char: len(mentions) for char, mentions in lb_character_mentions.items()}


# should take about 1min to run this cell

In [149]:
# Create a dict that contains the size for each node
# use log to prevent skewing in sizes
rr_node_sizes = {char: np.log(count+1)*2 for char, count in rr_character_mentions_count.items()}
gs_node_sizes = {char: np.log(count+1)*2 for char, count in gs_character_mentions_count.items()}
ms_node_sizes = {char: np.log(count+1)*2 for char, count in ms_character_mentions_count.items()}
ig_node_sizes = {char: np.log(count+1)*2 for char, count in ig_character_mentions_count.items()}
da_node_sizes = {char: np.log(count+1)*2 for char, count in da_character_mentions_count.items()}
lb_node_sizes = {char: np.log(count+1)*2 for char, count in lb_character_mentions_count.items()}
book_node_sizes = [rr_node_sizes, gs_node_sizes, ms_node_sizes, ig_node_sizes, da_node_sizes, lb_node_sizes]

9
34


## Set graph properties

In [200]:
# Draw graph

# Create a graph for each book
rr_G = nx.Graph()
gs_G = nx.Graph()
ms_G = nx.Graph()
ig_G = nx.Graph()
da_G = nx.Graph()
lb_G = nx.Graph()

# Add nodes for characters with sizes
graphs = [rr_G, gs_G, ms_G, ig_G, da_G, lb_G]
for i in range(len(graphs)):
  for char, size in book_node_sizes[i].items():
    graphs[i].add_node(char, size=size)

all_character_mentions_counts = [rr_character_mentions_count, gs_character_mentions_count, ms_character_mentions_count, ig_character_mentions_count, da_character_mentions_count, lb_character_mentions_count]
all_sorted_mention_counts = [rr_sorted_mention_counts, gs_sorted_mention_counts, ms_sorted_mention_counts, ig_sorted_mention_counts, da_sorted_mention_counts, lb_sorted_mention_counts]

for i in range(len(graphs)):
  for pair, count in all_sorted_mention_counts[i].items():
    char1, char2 = pair.split(" and ")
    # first book has sparsely connected minor characters that must be added
    if i == 0:
      # ensure > 2 connections exist for each pair UNLESS one of the characters has few mentions in the entire story
      # this way, very minor characters can still be connected to the main characters
      if (count > 9) or (count > 3 and (1 < all_character_mentions_counts[i][char1] < 15 or 1 < all_character_mentions_counts[i][char2] < 15)):
        #print("adding edge " + char1 + " " + char2)
        if not graphs[i].has_edge(char1, char2):
          graphs[i].add_edge(char1, char2, weight=count)
        else:
          graphs[i].edges[char1, char2]["weight"] += count
    
    # other books in first trilogy are more densely connected, so we increase the threshold
    elif i < 3:
      # ensure > 2 connections exist for each pair UNLESS one of the characters has few mentions in the entire story
      # this way, very minor characters can still be connected to the main characters or each other
      if (count > 10) or (count > 2 and (1 < all_character_mentions_counts[i][char1] < 5 or 1 < all_character_mentions_counts[i][char2] < 5) or (count > 1 and (1 < all_character_mentions_counts[i][char1] < 10 and 1 < all_character_mentions_counts[i][char2] < 10))):
        #print("adding edge " + char1 + " " + char2)
        if not graphs[i].has_edge(char1, char2):
          graphs[i].add_edge(char1, char2, weight=count)
        else:
          graphs[i].edges[char1, char2]["weight"] += count

    # Second trilogy has much more prose, so we lower the threshold
    else:
      if (count > 2) or (count > 0 and (1 < all_character_mentions_counts[i][char1] < 10 or 1 < all_character_mentions_counts[i][char2] < 10)):
        #print("adding edge " + char1 + " " + char2)
        if not graphs[i].has_edge(char1, char2):
          graphs[i].add_edge(char1, char2, weight=count)
        else:
          graphs[i].edges[char1, char2]["weight"] += count


# create dict of colours for each character, this is universal
node_colours = {}
for char in characters:
  cur_colour = char.split("-")[1]
  if cur_colour == "obsidian":
    node_colours[char] = "black"
  elif cur_colour == "copper":
    node_colours[char] = "dark orange"
  else:
    node_colours[char] = cur_colour


# get and normalize edge weights with log scale

rr_edge_weights = None
gs_edge_weights = None
ms_edge_weights = None
ig_edge_weights = None
da_edge_weights = None
lb_edge_weights = None
all_edge_weights = [rr_edge_weights, gs_edge_weights, ms_edge_weights, ig_edge_weights, da_edge_weights, lb_edge_weights]

print(rr_edge_weights)
for i in range(len(all_edge_weights)):
  all_edge_weights[i] = nx.get_edge_attributes(graphs[i], "weight").values()
  all_edge_weights[i] = np.array(list(all_edge_weights[i]))
  all_edge_weights[i] = np.log(all_edge_weights[i]+1)






None


## Draw graph with nx

In [152]:

# Draw the graph
# pos = nx.shell_layout(rr_G)  # Layout algorithm
# plt.figure(figsize=(20, 20))  # Size of figure
# nx.draw(rr_G, pos, node_size=[size for size in rr_node_sizes.values()], node_color=[colour for colour in node_colours.values()], with_labels=True)

# nx.draw_networkx_edges(rr_G, pos, width=list(rr_edge_weights), alpha=0.75, edge_color="gray")


## Create interactive graph with pyvis

In [201]:
rr_G2 = rr_G.copy()
gs_G2 = gs_G.copy()
ms_G2 = ms_G.copy()
ig_G2 = ig_G.copy()
da_G2 = da_G.copy()
lb_G2 = lb_G.copy()
new_graphs = [rr_G2, gs_G2, ms_G2, ig_G2, da_G2, lb_G2]
output_files = ["rr_graph.html", "gs_graph.html", "ms_graph.html", "ig_graph.html", "da_graph.html", "lb_graph.html"]
for i in range(len(new_graphs)):
    for node in new_graphs[i].nodes():
        new_graphs[i].nodes[node]["color"] = node_colours[node]
    
    current_edge_weights = nx.get_edge_attributes(new_graphs[i], "weight").values()
    current_edge_weights = np.array(list(current_edge_weights))
    current_edge_weights = np.log(current_edge_weights+1)
    nx.set_edge_attributes(new_graphs[i], dict(zip(new_graphs[i].edges(), current_edge_weights)), "weight")
    net = Network(
        directed=False,
        select_menu=True,
        filter_menu=True,
        notebook=True,
    )
    #net.repulsion()
    net.show_buttons()
    net.from_nx(new_graphs[i])
    net.show(output_files[i])
    

rr_graph.html
gs_graph.html
ms_graph.html
ig_graph.html
da_graph.html
lb_graph.html


# Compute Network Statistics

In [183]:
# Centrality
# Betweenness centrality
all_betweenness_centralities = [nx.betweenness_centrality(graph) for graph in graphs]
# currently have 6 dicts, each of which stores the same keys (characters) but different values (betweenness centrality)
# we want to combine these into one dict, where each key has 1 value, which is the sum of the 6 values
# we can then sort this dict and find the most important characters
betweenness_centrality = {}
for char in characters:
  betweenness_centrality[char] = 0
  for i in range(len(all_betweenness_centralities)):
    betweenness_centrality[char] += all_betweenness_centralities[i][char]

# sort dict
sorted_betweenness_centrality = dict(sorted(betweenness_centrality.items(), key=lambda item: item[1], reverse=True))
print("all books")
print(sorted_betweenness_centrality)

# sort and print each individual one
for i in range(len(all_betweenness_centralities)):
  print("book " + str(i+1))
  print(dict(sorted(all_betweenness_centralities[i].items(), key=lambda item: item[1], reverse=True)))


all books
{'darrow-red': 0.31160215619261483, 'lysander-gold': 0.12862320202001504, 'virginia-gold': 0.07771969388516223, 'lyria-red': 0.05937746353846238, 'ephraim-gray': 0.039157526161995644, 'pax_augustus-gold': 0.012786039190042302, 'victra-gold': 0.012196356762524656, 'atlas-gold': 0.008433904355493942, 'sevro-gold': 0.008406803679831272, 'aja-gold': 0.0076752805571305375, 'cassius-gold': 0.0073419767138189216, 'sefi-obsidian': 0.005693228623660027, 'silenius-gold': 0.005638683767327377, 'theodora-pink': 0.004955305091333074, 'volga-obsidian': 0.004034391534391535, 'thraxa-gold': 0.00382145698653666, 'atalantia-gold': 0.0034093876787230845, 'asmodeus-gold': 0.0031038131018698454, 'ozgard-obsidian': 0.0030957161981258374, 'drusilla-gold': 0.0030768234227231506, 'zan-blue': 0.0030768234227231506, 'adrius-gold': 0.0029287657542612848, 'dido-gold': 0.002643060722151278, 'clown-gold': 0.002493846353154554, 'ragnar-obsidian': 0.0023210095887903886, 'hjornir-obsidian': 0.0022347454333462

In [184]:
# Eigenvector centrality
all_eigenvector_centralities = [nx.eigenvector_centrality(graph) for graph in graphs]
eigenvector_centrality = {}
for char in characters:
  eigenvector_centrality[char] = 0
  for i in range(len(all_eigenvector_centralities)):
    eigenvector_centrality[char] += all_eigenvector_centralities[i][char]

# sort dict
sorted_eigenvector_centrality = dict(sorted(eigenvector_centrality.items(), key=lambda item: item[1], reverse=True))
print(sorted_eigenvector_centrality)

# sort and print each individual one
for i in range(len(all_eigenvector_centralities)):
  print("book " + str(i+1))
  print(dict(sorted(all_eigenvector_centralities[i].items(), key=lambda item: item[1], reverse=True)))

{'darrow-red': 3.0180213321167617, 'virginia-gold': 1.5757932756600184, 'sevro-gold': 1.392555244271117, 'lysander-gold': 1.0520253665355648, 'adrius-gold': 1.0459159287282287, 'cassius-gold': 0.9623628355562309, 'victra-gold': 0.9458720666238265, 'fitchner-gold': 0.7298060865380194, 'roque-gold': 0.6742421913473242, 'ragnar-obsidian': 0.6570892281002917, 'lyria-red': 0.6518619279830291, 'aja-gold': 0.6485542584437394, 'dancer-red': 0.5701057954629982, 'lilath-gold': 0.4652478997036977, 'lorn-gold': 0.460674508731721, 'clown-gold': 0.4538250305817561, 'pax_augustus-gold': 0.44878920463823013, 'sefi-obsidian': 0.43333479931640967, 'atlas-gold': 0.43207117143529866, 'tactus-gold': 0.4306634568822631, 'pebble-gold': 0.41185749542748773, 'quinn-gold': 0.4111139228255163, 'kavax-gold': 0.3927015439682346, 'ephraim-gray': 0.3855208673599349, 'atalantia-gold': 0.3795596388097049, 'antonia-gold': 0.3789450786647243, 'orion-blue': 0.36546037723231956, 'daxo-gold': 0.36452458645760444, 'alexanda

In [176]:
# Degree centrality
all_degree_centralities = [nx.degree_centrality(graph) for graph in graphs]
degree_centrality = {}
for char in characters:
  degree_centrality[char] = 0
  for i in range(len(all_degree_centralities)):
    degree_centrality[char] += all_degree_centralities[i][char]

# sort dict
sorted_degree_centrality = dict(sorted(degree_centrality.items(), key=lambda item: item[1], reverse=True))
print(sorted_degree_centrality)

{'darrow-red': 1.14859437751004, 'virginia-gold': 0.3935742971887549, 'lysander-gold': 0.357429718875502, 'sevro-gold': 0.2931726907630522, 'lyria-red': 0.1967871485943775, 'cassius-gold': 0.17670682730923692, 'victra-gold': 0.17670682730923692, 'adrius-gold': 0.1767068273092369, 'aja-gold': 0.1325301204819277, 'ephraim-gray': 0.1285140562248996, 'roque-gold': 0.12048192771084336, 'ragnar-obsidian': 0.11646586345381524, 'fitchner-gold': 0.11244979919678713, 'dancer-red': 0.08835341365461846, 'kavax-gold': 0.07630522088353414, 'atlas-gold': 0.07228915662650602, 'lorn-gold': 0.07228915662650602, 'holiday-gray': 0.07228915662650601, 'sefi-obsidian': 0.07228915662650601, 'pax_augustus-gold': 0.0642570281124498, 'pebble-gold': 0.0642570281124498, 'lilath-gold': 0.060240963855421686, 'quinn-gold': 0.060240963855421686, 'tactus-gold': 0.060240963855421686, 'trigg-gray': 0.060240963855421686, 'clown-gold': 0.05622489959839358, 'antonia-gold': 0.05622489959839357, 'octavia-gold': 0.056224899598

In [174]:
# Closeness centrality
all_closeness_centralities = [nx.closeness_centrality(graph) for graph in graphs]
closeness_centrality = {}
for char in characters:
  closeness_centrality[char] = 0
  for i in range(len(all_closeness_centralities)):
    closeness_centrality[char] += all_closeness_centralities[i][char]

# sort dict
sorted_closeness_centrality = dict(sorted(closeness_centrality.items(), key=lambda item: item[1], reverse=True))
print(sorted_closeness_centrality)

{'darrow-red': 1.315191658223902, 'virginia-gold': 0.9173759510845765, 'sevro-gold': 0.8528151588122256, 'cassius-gold': 0.8163675722778597, 'adrius-gold': 0.8121813037290752, 'lysander-gold': 0.8032925263747144, 'dancer-red': 0.772027261467946, 'lilath-gold': 0.7702383222542601, 'clown-gold': 0.7478881732241527, 'lorn-gold': 0.73724851086808, 'victra-gold': 0.7324471562326349, 'fitchner-gold': 0.7301167115046905, 'octavia-gold': 0.7086484482698755, 'kieran-red': 0.7081752476192715, 'aja-gold': 0.6650979308747808, 'ragnar-obsidian': 0.6627391046701238, 'orion-blue': 0.6412700837185874, 'eo-red': 0.6337157025910509, 'kavax-gold': 0.6318601352410717, 'nero-gold': 0.629371588845542, 'theodora-pink': 0.6282750047810288, 'mickey-violet': 0.6173948218875278, 'roque-gold': 0.612251964798805, 'harmony-red': 0.6112589394703868, 'pebble-gold': 0.5789101422897391, 'daxo-gold': 0.5398064109343194, 'regulus-silver': 0.5354746713558554, 'ephraim-gray': 0.5336806292408017, 'sefi-obsidian': 0.52885758

In [177]:
# get top 10 characters from all centrality measures combined and normalized
top_10 = []
# set normalizing factors
betweenness_normalizing_factor = sorted_betweenness_centrality["darrow-red"]
eigenvector_normalizing_factor = sorted_eigenvector_centrality["darrow-red"]
degree_normalizing_factor = sorted_degree_centrality["darrow-red"]
closeness_normalizing_factor = sorted_closeness_centrality["darrow-red"]

aggregated_cenrality = {}
for char in characters:
    aggregated_cenrality[char] = 0
    aggregated_cenrality[char] += sorted_betweenness_centrality[char]/betweenness_normalizing_factor
    aggregated_cenrality[char] += sorted_eigenvector_centrality[char]/eigenvector_normalizing_factor
    aggregated_cenrality[char] += sorted_degree_centrality[char]/degree_normalizing_factor
    aggregated_cenrality[char] += sorted_closeness_centrality[char]/closeness_normalizing_factor

sorted_aggregated_centrality = dict(sorted(aggregated_cenrality.items(), key=lambda item: item[1], reverse=True))
print(sorted_aggregated_centrality)


{'darrow-red': 4.0, 'virginia-gold': 1.811727550605504, 'lysander-gold': 1.6833299947138325, 'sevro-gold': 1.3920714918356971, 'adrius-gold': 1.1273404093655066, 'cassius-gold': 1.1170016604791289, 'victra-gold': 1.0633078295556944, 'lyria-red': 0.9256776561514506, 'fitchner-gold': 0.8963408164041053, 'aja-gold': 0.8606143329920539, 'dancer-red': 0.853948648184681, 'ragnar-obsidian': 0.830479762305981, 'lilath-gold': 0.7990042201694565, 'roque-gold': 0.7951558789604916, 'lorn-gold': 0.7785442549085013, 'clown-gold': 0.775979435734048, 'ephraim-gray': 0.771074494882273, 'octavia-gold': 0.7043025257533257, 'kavax-gold': 0.6789054475799416, 'kieran-red': 0.6753426678729275, 'orion-blue': 0.643644537091903, 'pebble-gold': 0.6382291686966188, 'sefi-obsidian': 0.6269047653100259, 'mickey-violet': 0.6261522718227391, 'eo-red': 0.6176608506634377, 'harmony-red': 0.6108252338629161, 'theodora-pink': 0.5995545394767935, 'nero-gold': 0.5882605681720406, 'pax_augustus-gold': 0.5824286790714603, 't