In [72]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup


# Flatten list function
def flatten(list_list : list) -> list:
    """Concatenate the items of every sub-list into a single flat list.

    Only one level of nesting is removed and the original order is kept.
    """
    flat = []
    for sublist in list_list:
        flat.extend(sublist)
    return flat

In [77]:
# Page titles of the per-book character index pages (URL-encoded; used as the API's `page` parameter)
# All index pages share one title pattern; only the book title differs.
Books = [
    f'Harry_Potter_and_the_{title}_(character_index)'
    for title in (
        'Philosopher%27s_Stone',   # 1 (%27 is the URL-encoded apostrophe)
        'Chamber_of_Secrets',      # 2
        'Prisoner_of_Azkaban',     # 3
        'Goblet_of_Fire',          # 4
        'Order_of_the_Phoenix',    # 5
        'Half-Blood_Prince',       # 6
        'Deathly_Hallows',         # 7
    )
]

In [116]:
# Define the function to get the data

def get_data(book, nr, timeout=30):
    """Download one character-index page and collect its candidate elements.

    Parameters
    ----------
    book : str
        Page title. It is inserted verbatim into the ``page`` parameter of the
        API URL, so it has to be URL-encoded already.
    nr : int
        Book number. For ``nr < 3`` every ``<td>`` of the page is collected;
        otherwise the ``<li>`` items of all ``<ul>`` lists except the first
        and the last one.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    tuple
        ``(Chars, soup)`` - the list of collected elements and the parsed page.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, fails, or returns an HTTP error status.
    KeyError
        If the JSON answer has no ``parse -> text -> *`` entry.
    """
    # Without a timeout a stalled connection would block the notebook forever
    response = requests.get(
        f'https://harrypotter.fandom.com/api.php?action=parse&page={book}&format=json',
        timeout=timeout,
    )

    # Fail early on 4xx/5xx instead of trying to parse an error page as JSON
    response.raise_for_status()
    query = response.json()

    # The rendered page HTML is nested inside the JSON answer
    HTML = query['parse']['text']['*']

    # Parse the HTML data
    soup = BeautifulSoup(HTML, 'html.parser')

    # The pages do not share one layout: the first two are read from table
    # cells, the later ones from bullet lists.
    if nr < 3:
        Chars = soup.find_all('td')
    else:
        # First and last <ul> are skipped - presumably they are not character
        # lists (TODO confirm against the page markup).
        sub_Chars = soup.find_all('ul')[1:-1]
        Chars = []
        for sub_Char in sub_Chars:
            Chars.extend(sub_Char.find_all('li'))

    return Chars, soup


# Define the function to get name and link of each character

def get_character(Char_td):
    """Return ``(name, link)`` taken from the first ``<a>`` inside ``Char_td``.

    ``name`` is the anchor's text and ``link`` its ``href`` attribute.
    """
    anchor = Char_td.find('a')
    return anchor.text, anchor['href']

# Define the function to get name, link for each character

def get_Char_td(Chars):
    """Collect name and link of every element in ``Chars`` that holds a character.

    An element is skipped when it contains no ``<a>`` at all, or when its
    first ``<a>`` has a ``class`` attribute (presumably those anchors are not
    plain character links - TODO confirm against the page markup).

    Parameters
    ----------
    Chars : iterable
        Elements offering ``find('a')``, e.g. the ones returned by ``get_data``.

    Returns
    -------
    tuple
        ``(Names, Links)`` - two lists of equal length, in input order.
    """
    # Initialize the list of names and links
    Names = []
    Links = []

    for char in Chars:
        # Look the anchor up once and reuse it for both filter checks
        anchor = char.find('a')

        # Identity checks are the idiomatic way to test for None
        if anchor is None or anchor.get('class') is not None:
            continue

        # Get the name and link of the character
        name, link = get_character(char)

        # Append the name and link to the list
        Names.append(name)
        Links.append(link)

    return Names, Links

# Define the function to get the data for each book

def get_all_data(Books):
    """Fetch the character names and links of every book.

    Books are numbered from 1 in list order, and that number is handed to
    ``get_data``. A progress line and the character count are printed for
    each book.

    Returns
    -------
    tuple
        ``(Names, Links)`` - each a list with one inner list per book. The
        result is kept nested on purpose; it is not flattened.
    """
    names_per_book = []
    links_per_book = []

    for book_nr, book in enumerate(Books, start=1):
        print(f'Getting data for {book}...')

        # The parsed page returned alongside the elements is not needed here
        elements, _soup = get_data(book, book_nr)
        book_names, book_links = get_Char_td(elements)

        names_per_book.append(book_names)
        links_per_book.append(book_links)

        # Report how many characters were found for this book
        print(f'Number of characters: {len(book_names)}')

    return names_per_book, links_per_book



In [117]:
# Get Names and Links for each book
Names, Links = get_all_data(Books)

# Save as DataFrame - one row per book, each cell holds that book's whole list
CharacterData = pd.DataFrame({'Name': Names, 'Link': Links})

# Create the output folder first: to_excel does not create missing
# directories, so the export would fail on a fresh checkout without it.
os.makedirs('Temp', exist_ok=True)

# Save as Excel
CharacterData.to_excel('Temp/CharacterData.xlsx', index=True)

Getting data for Harry_Potter_and_the_Philosopher%27s_Stone_(character_index)...
Number of characters: 155
Getting data for Harry_Potter_and_the_Chamber_of_Secrets_(character_index)...
Number of characters: 65
Getting data for Harry_Potter_and_the_Prisoner_of_Azkaban_(character_index)...
Number of characters: 46
Getting data for Harry_Potter_and_the_Goblet_of_Fire_(character_index)...
Number of characters: 151
Getting data for Harry_Potter_and_the_Order_of_the_Phoenix_(character_index)...
Number of characters: 129
Getting data for Harry_Potter_and_the_Half-Blood_Prince_(character_index)...
Number of characters: 83
Getting data for Harry_Potter_and_the_Deathly_Hallows_(character_index)...
Number of characters: 82
