In [15]:
# Text processing libraries
import re
from nltk import tokenize

# Data manipulation libraries
import pandas as pd

# File management libraries
import pickle as pkl

# Web scraping libraries
import requests
from bs4 import BeautifulSoup

# Looping utilities
from tqdm import tqdm

# Basic Stats

## Data cleaning and preprocessing choices

### Data cleaning of the books and character lists

We need to reflect on the data-cleaning choices we made in the context of what we wanted to achieve: namely, to build a character network for each chapter in each book, where the nodes are characters, each node's size is proportional to the number of times the character appears in the chapter, and the edges are weighted by the number of times two characters appear on the same page in that chapter. Thus, we needed the books to be split into individual sentences, so that we could check the interactions between characters on the same page. We also needed the character lists to be cleaned, so that we could check whether a character was mentioned in a sentence, and to ensure that all relevant character aliases were included in the character lists.

**1. Cleaning the book texts:**
* Separate the text into chapters by splitting on the chapter headings. We used a regex to find the chapter breaks; matching the chapter titles from the table of contents would have been an alternative, but spelling mistakes in the chapter headings made that unreliable. The regex for the chapter headings was `"\n{9}|\n{8}|\n{7}|\n{6}|\n{5}[0-9]+\s?\n{5}"`. For more details, see the `split_chapter()` function below.
* Separate the text into pages by splitting using the page headings. The regex statement for the page headings was `f"Page \| [0-9]+ {book_title} -\s?J.K. Rowling"`. For more details, see the `split_pages()` function below.
* Separate the page text into sentences by using the `nltk` sentence tokenizer to get a list of sentences for a specific page. For more details, see the `format_page()` function below.	
* Create a dictionary where the keys are the chapter numbers and each value is a nested list: a list of the chapter's pages, where each page is a list of its sentences. If the first sentence of a page did not start with an uppercase letter, it was appended to the last sentence of the previous page, as it was most likely the continuation of that sentence. The whole preprocessing of a book is run by calling `book_dict = split_book()`, as seen below. (For example, the 10th sentence of the 5th page of the 3rd chapter is `book_dict[3][4][9]`.)
* For each book, the `book_dict = split_book()` function was called, and the resulting dictionary was saved as a pickle file. Before saving, we tested that each `book_dict` had the right number of chapters and pages.


In [None]:
def split_chapter(Book):
    """Split the raw text of one book into chapters.

    The first 7 characters of ``Book`` are skipped before splitting
    (presumably a leading header or blank run -- TODO confirm against the
    raw book files). The remaining text is split at every chapter break,
    where a break is either a run of 6 to 9 newlines, or 5 newlines
    followed by a number, an optional whitespace character and 5 more
    newlines.

    Args:
        Book: Full text of one book as a single string.

    Returns:
        dict mapping the 1-based position of each piece to its text.
    """
    # Raw string: `\s` must reach the regex engine untouched. In a normal
    # string literal it is an invalid escape sequence that newer Python
    # versions warn about.
    split_string = r"\n{9}|\n{8}|\n{7}|\n{6}|\n{5}[0-9]+\s?\n{5}"
    chapters = re.split(split_string, Book[7:])

    return {idx: chapter for idx, chapter in enumerate(chapters, start=1)}

def split_pages(Chapter,split_string):
    """Cut the text of one chapter into pages.

    Args:
        Chapter: Text of a single chapter.
        split_string: Regular expression matching the page heading that
            separates two pages.

    Returns:
        List of the text pieces found between the matches.
    """
    return re.split(split_string, Chapter)

def format_page(Page):
    """Turn the raw text of one page into a list of sentences.

    Every run of newlines and every tab is deleted (replaced by the empty
    string) before the text is handed to the nltk sentence tokenizer.

    Args:
        Page: Raw text of a single page.

    Returns:
        List of sentence strings as produced by ``tokenize.sent_tokenize``.
    """
    flattened = re.sub("\n+|\t", "", Page)
    return tokenize.sent_tokenize(flattened)

def split_book(Book,split_string):
    """Split a book into chapters, pages and sentences.

    Args:
        Book: Full text of one book as a single string.
        split_string: Regular expression matching the page heading used to
            separate pages inside a chapter.

    Returns:
        dict mapping chapter number to a list of pages, where every page is
        a list of sentence strings.
    """
    Chapters_text = split_chapter(Book)
    for chapter, chapter_text in Chapters_text.items():
        pieces = split_pages(chapter_text, split_string)
        # The piece after the last page heading is discarded (presumably
        # the heading closes each page, leaving only a trailing remainder
        # -- TODO confirm).
        Chapters_text[chapter] = [format_page(Page) for Page in pieces[:-1]]

    # A page whose first sentence does not start with an uppercase letter is
    # treated as continuing the last sentence of the previous page: that
    # fragment is glued onto the previous page and removed from this one.
    for pages in Chapters_text.values():
        for idx, page in enumerate(pages):
            if idx == 0:
                continue
            previous = pages[idx - 1]
            # Nothing to merge when this page has no (non-empty) first
            # sentence, or when the previous page has no sentence to extend.
            if not page or not page[0] or not previous:
                continue
            if not page[0][0].isupper():
                previous[-1] = previous[-1] + " " + page[0]
                pages[idx] = page[1:]

    return Chapters_text

**2. Cleaning the character list:**
* The raw character list consisted of a data-frame that included each character's name and link. To start with, we used the `get_aliases()` function on each character to get a list of the character's web-scraped aliases.
* This list of aliases was further enhanced by handling some edge cases: 1) including edge-case aliases (most commonly the formal names of the most important characters, e.g. "Mr Potter" or "Professor Dumbledore"). This was done using the `get_edge_alias()` function, which looks up a manually curated list of additional aliases that we knew were present in the books but had been left out of the fandom alias section. 2) including their first name as an alias. This was done using the `add_first_name()` function, and again included some edge cases that were handled. These edge cases typically centered around the first word of the character's name being a descriptive word such as "Mr" or "Mrs" instead of an actual name.
* Next we focused on cleaning the data further by removing duplicate characters. We manually curated the list `potential_dublicates.txt`, which contained the names of all the characters where one of their aliases was the same as another character's name. We then used the `remove_dublicates()` function to remove these characters from the list. There are several reasons for duplicate character names, two major ones being 1) that smaller sub-characters in the fandom wiki did not have their own wiki pages but were included in the same overarching page, and 2) that Harry's kids in particular, in the last book, are named after his parents and other important characters. This design choice was made because we didn't want the character graphs and dynamic text later to ascribe the presence of a character to two different characters. This further meant that we excluded some characters, mainly the small characters and Harry's kids, from the analysis.

In [None]:
def get_edge_alias(character: str = 'Harry Potter', alias_list: list = None):
    """Append the manually curated edge-case alias of a character, if any.

    The curated aliases are read from the pickle file
    ``Temp/character_aliases_edgecases.pkl``, which is expected to hold a
    mapping from character name to alias.

    Args:
        character: Name of the character to look up.
        alias_list: Aliases collected so far. It is extended in place and
            also returned. When omitted, a fresh empty list is used for
            every call.

    Returns:
        ``alias_list`` with the edge-case entry appended when ``character``
        has one.
    """
    # A literal `[]` default would be created once and shared by every call
    # that omits the argument, so aliases would leak between characters.
    if alias_list is None:
        alias_list = []

    # NOTE(review): pickle can execute arbitrary code while loading; only
    # use this with a file produced by the project itself.
    with open("Temp/character_aliases_edgecases.pkl", "rb") as f:
        edge_aliases = pkl.load(f)

    # NOTE(review): the stored value is appended as ONE element. If the
    # values are lists of aliases this nests a list -- confirm the format.
    if character in edge_aliases:
        alias_list.append(edge_aliases[character])

    return alias_list

def add_first_name(character: str = 'Harry Potter', alias_list: list = None):
    """Append the first word of a character's name as an alias.

    The first word (text before the first space) is only added when it
    looks like a real first name, i.e. when all of the following hold:

    * it is not one of the titles / descriptive words in ``none_aliases``;
    * the full name contains none of the fragments in ``none_name``
      (``'s``, ``s'`` or ``family``);
    * it consists of alphabetic characters only.

    Args:
        character: Full name of the character.
        alias_list: Aliases collected so far. It is extended in place and
            also returned. When omitted, a fresh empty list is used for
            every call.

    Returns:
        ``alias_list``, with the first name appended when it qualifies.
    """
    # A literal `[]` default would be created once and shared by every call
    # that omits the argument, so first names would pile up across calls.
    if alias_list is None:
        alias_list = []

    none_aliases = ['A', 'Mr', 'Mrs', 'Dr', 'Manager', 'The', 'Father','Sorting', 'wizard', 'Chancellor', 'Workmen', 'Waitress', 'Sir',
                    'Nearly-Headless', 'Fat','Aged', 'Blood-Sucking', 'Forbidden','Unidentified', 'Zoo', 'Kepper', 'Muggle', 'Muggle-Born',
                    'Senior', 'Junior', 'Board', 'Committee','Academy', 'Ministry', 'Department',
                    'Little', 'Great', 'Old', 'Young', 'Head', 'Headmaster', 'Headmistress','Weird','Care', 'Montgomery', 'Hogwarts', 'Frank']

    none_name = ["'s", "s'", "family"]

    first_word = character.split(' ')[0]
    is_plain_name = not any(fragment in character for fragment in none_name)

    if first_word not in none_aliases and is_plain_name and first_word.isalpha():
        alias_list.append(first_word)

    return alias_list

def get_aliases(character: str = 'Harry Potter', link: str = '/wiki/Harry_Potter', df: pd.DataFrame = NnL):
    """Collect the aliases of one character from the Harry Potter fandom wiki.

    The wiki page is fetched through the fandom parse API, the entries that
    follow the "Also known as" label are extracted, and the result is
    combined with the curated edge-case alias and the character's first
    name.

    Args:
        character: Name of the character.
        link: Wiki link of the character. Only the last path segment, with
            any ``#fragment`` removed, is used as the page title.
        df: Data-frame with a ``Name`` column holding all character names.
            The default is the module-level ``NnL`` object, which is not
            defined in this section.

    Returns:
        List of unique aliases (in arbitrary order, because duplicates are
        removed through a ``set``) that excludes ``character`` itself.
    """
    query = requests.get(f"https://harrypotter.fandom.com/api.php?action=parse&page={link.split('/')[-1].split('#')[0]}&format=json").json()
    HTML = query['parse']['text']['*']
    soup = BeautifulSoup(HTML, 'html.parser')

    # Everything from the first "(" (with one optional preceding whitespace
    # character) or "[" onwards is cut off, keeping only the text before it.
    # Raw string so the backslashes reach the regex engine untouched.
    annotation = re.compile(r'\s?(\()|(\[)')

    # Get aliases
    try:
        alias_box = soup.find(string='Also known as').findNext("div")
        items = alias_box.find_all('li')
    except AttributeError:
        # Character has no aliases: either the label or the following <div>
        # was not found, so one of the lookups above ran on None.
        alias_ = []
    else:
        if items:
            # Character has multiple aliases
            raw_aliases = [item.get_text() for item in items]
        else:
            # Character has only one alias
            raw_aliases = [alias_box.get_text()]
        alias_ = [annotation.split(text)[0] for text in raw_aliases]

    # Drop every alias that occurs as a substring of any character name
    alias_ = [i for i in alias_ if not df['Name'].str.contains(i, regex=False).any()]

    # Add edge cases and first name
    alias_ = get_edge_alias(character = character, alias_list = alias_)
    alias_ = add_first_name(character = character, alias_list = alias_)

    # Ensure that aliases are unique and not the same as the character's name
    alias_ = list(set(alias_))
    alias_ = [i for i in alias_ if i != character]

    return alias_

In [None]:
def remove_dublicates():
    """Placeholder for the duplicate-character removal step.

    The original cell only contained an unfinished function header, which
    is a syntax error. Until the real implementation is filled in, calling
    this function fails loudly instead of breaking the whole file.

    Raises:
        NotImplementedError: always.
    """
    # TODO: implement the removal of the characters listed in
    # Data/potential_dublicates.txt.
    raise NotImplementedError("remove_dublicates() has not been implemented yet")

In [None]:
# Read the manually curated list of potential duplicate characters,
# one entry per line, with surrounding whitespace removed.
with open("Data/potential_dublicates.txt","r") as f:
    pot_dub = [char.strip() for char in f.readlines()]

# Collect the entries that end in "(R)" (presumably "remove" -- TODO
# confirm). The slice drops the tag together with the single character in
# front of it.
remove_char = [char[:-4] for char in pot_dub if char[-3:] == "(R)"]

# Load the character dataset
character_df = pd.read_csv("1.Dataset_files/CharacterWikis.csv")
# The Aliases and Tokens columns are stored as text and turned back into
# Python objects here.
# NOTE(review): eval() executes whatever expression the CSV contains; only
# use it on files produced by this project (ast.literal_eval would be a
# safer choice if the cells are plain literals).
character_df.Aliases = [eval(char) for char in character_df.Aliases]
character_df.Tokens = [eval(t) for t in character_df.Tokens]
# Rename the character in row 697. `.loc` writes into the data-frame
# itself; the chained form `character_df.Name[697] = ...` may write to a
# temporary copy and does nothing under pandas Copy-on-Write.
character_df.loc[697, "Name"] = "Lily L. Potter"
print(len(character_df))

In [14]:
# Load the raw character table and the cleaned one
initial_character_list = pd.read_csv("1.Dataset_files/CharacterWikis.csv")
final_character_list = pd.read_csv("Temp/CharacterWikis_clean.csv")

# Alias total of the raw table: stored aliases plus one per character row.
# NOTE(review): eval() runs whatever the CSV cell contains; only use it on
# files produced by this project.
initial_alias_total = sum(len(eval(l)) for l in initial_character_list['Aliases']) + len(initial_character_list)
print(f"The initial character list has {len(initial_character_list)} characters, and {initial_alias_total} aliases in total.")

# Alias total of the cleaned table: stored aliases only.
# NOTE(review): unlike the raw total, the row count is not added here --
# confirm that this difference is intended.
final_alias_total = sum(len(eval(l)) for l in final_character_list['Aliases'])
print(f"The final character list after data cleaning has {len(final_character_list)} characters, and {final_alias_total} aliases in total.")

The initial character list has 707 characters, and 1392 aliases in total.
The final character list after data cleaning has 677 characters, and 1323 aliases in total.


In [8]:
# Display the raw character table. The cell that defines
# `initial_character_list` must be run first, otherwise this raises NameError.
initial_character_list

NameError: name 'initial_character_list' is not defined

In [13]:
# Total number of aliases stored in the raw character table. The original
# line had unbalanced parentheses and an argument-less len() call.
# NOTE(review): eval() runs whatever the CSV cell contains; only use it on
# files produced by this project.
sum([len(eval(l)) for l in pd.read_csv("1.Dataset_files/CharacterWikis.csv")['Aliases']])

685

Write about your choices in data cleaning and preprocessing
Write a short section that discusses the dataset stats (here you can recycle the work you did for Project Assignment A)