In [None]:
import requests
from bs4 import BeautifulSoup
import time
import re
import pandas as pd

In [None]:
def extract_character_names_from_page(soup):
    """
    Collect the character names listed on one category page.

    A name is the ``title`` attribute of an ``<a>`` element nested inside a
    ``<div class="category-page__member-left">``. Anchors whose title is
    missing or empty are ignored.

    Args:
        soup: BeautifulSoup object of the page

    Returns:
        A list of character names, in the order they appear on the page
    """
    # Lazily walk every anchor in every member container and read its title.
    anchor_titles = (
        anchor.get("title")
        for member_div in soup.find_all('div', class_='category-page__member-left')
        for anchor in member_div.find_all('a')
    )

    # Keep only anchors that actually carry a non-empty title.
    return [title for title in anchor_titles if title]

def find_next_page_link(soup, base_url, previous_url):
    """
    Find the 'next page' link if it exists.

    Scans every ``<a>`` inside each ``<div class="category-page__pagination">``
    and returns the first ``href`` that is neither the first page
    (``base_url``) nor the page we just came from (``previous_url``).

    Args:
        soup: BeautifulSoup object of the page
        base_url: URL of the first category page; never returned
        previous_url: URL of the page scraped before this one (or None on the
            first page); never returned

    Returns:
        URL of the next page or None if no next page exists
    """
    for pagination_div in soup.find_all('div', class_='category-page__pagination'):
        for anchor in pagination_div.find_all('a'):
            href = anchor.get("href")

            # An anchor without a usable href cannot be a page link. Skip it
            # instead of returning it, otherwise a stray anchor would end
            # pagination early even when a real next link follows.
            if not href:
                continue

            # Avoid going back to the first page or to the previous page
            if href != base_url and href != previous_url:
                return href

    return None

def extract_all_character_names(base_url, limit, timeout=30):
    """
    Extract all character names from Disney Wiki category page,
    following pagination to get all pages.

    Args:
        base_url: The URL of the Disney Wiki category page
        limit: Pagination is only followed while the number of names
            collected so far is less than or equal to this value; once the
            running total exceeds it, scraping stops
        timeout: Seconds to wait for each HTTP request before giving up
            (passed to ``requests.get``)

    Returns:
        A list of all character names across all pages. If a request or
        parsing step fails, the error is printed and the names collected up
        to that point are returned.
    """
    all_names = []
    previous_url = None
    current_url = base_url
    page_count = 0
    # URLs already fetched, used to stop if pagination ever loops back.
    visited_urls = set()

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    while current_url:
        visited_urls.add(current_url)
        page_count += 1
        print(f"Scraping page {page_count}...")
        print(f"URL: {current_url}")

        try:
            # Send GET request; without a timeout a stalled connection would
            # block the whole run indefinitely.
            response = requests.get(current_url, headers=headers, timeout=timeout)
            response.raise_for_status()

            # Parse HTML
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract names from this page
            names = extract_character_names_from_page(soup)
            all_names.extend(names)
            print(f"Found {len(names)} characters on this page")
            print(f"Total so far: {len(all_names)}\n")

            # There are only limited names in the website; the last page redirects to the first page!
            if len(all_names) <= limit:
                # Find next page
                next_url = find_next_page_link(soup, base_url, previous_url)

                if not next_url:
                    print("No more pages found. Scraping complete!")
                    current_url = None
                elif next_url in visited_urls:
                    # Following this link would re-scrape a page we already
                    # have and only add duplicates.
                    print("Next page link points to an already scraped page. Scraping complete!")
                    current_url = None
                else:
                    previous_url = current_url
                    current_url = next_url
                    # Be polite to the server - add a small delay
                    time.sleep(1)
            else:
                print("Reached the end of unique pages. Scraping complete!")
                current_url = None

        except Exception as e:
            # Best effort: report the failure and keep what was gathered.
            print(f"Error on page {page_count}: {e}")
            break

    return all_names

def extract_all_gendered_names(url, gender, limit):
    """
    Scrape one category, report progress, and save the names to a text file.

    The names are written one per line to ``disney_{gender}_characters.txt``
    in the current working directory.

    Args:
        url: Category page URL handed to ``extract_all_character_names``
        gender: Label used in the printed messages and in the output filename
        limit: Forwarded unchanged to ``extract_all_character_names``

    Returns:
        The list of scraped names, or an empty list if any step raised
    """
    banner = "=" * 60

    print(banner)
    print(f"Disney  {gender} characters scraper")
    print(banner)
    print(f"Starting URL: {url}\n")

    try:
        names = extract_all_character_names(url, limit)
        total = len(names)

        print("\n" + banner)
        print("✓ Scraping Complete!")
        print(f"Total {gender} characters found: {total}")
        print(banner + "\n")

        # Persist the names, one per line
        output_file = f'disney_{gender}_characters.txt'
        with open(output_file, 'w', encoding='utf-8') as out:
            out.writelines(f"{name}\n" for name in names)

        print(f"\n✓ All {total} character names saved to '{output_file}'")
        return names

    except Exception as e:
        print(f"Error: {e}")
        return []

def main():
    """
    Scrape the female and male Disney Wiki categories.

    Each category is saved to its own text file by
    ``extract_all_gendered_names``.

    Returns:
        A single list with the female names followed by the male names, so
        the caller receives the scraped data instead of None.
    """
    # NOTE(review): the limits look like the expected member count of each
    # category at scrape time - confirm before re-running.
    female_characters = extract_all_gendered_names(
        "https://disney.fandom.com/wiki/Category:Females", "female", 6020
    )
    male_characters = extract_all_gendered_names(
        "https://disney.fandom.com/wiki/Category:Males", "male", 10802
    )

    return female_characters + male_characters


if __name__ == "__main__":
    character_list = main()

Disney  female characters scraper
Starting URL: https://disney.fandom.com/wiki/Category:Females

Scraping page 1...
URL: https://disney.fandom.com/wiki/Category:Females
Found 199 characters on this page
Total so far: 199

Scraping page 2...
URL: https://disney.fandom.com/wiki/Category:Females?from=Antonia+Bello
Found 197 characters on this page
Total so far: 396

Scraping page 3...
URL: https://disney.fandom.com/wiki/Category:Females?from=Beagle%2C+Boom-Boom%0ABoom-Boom+Beagle
Found 198 characters on this page
Total so far: 594

Scraping page 4...
URL: https://disney.fandom.com/wiki/Category:Females?from=Booby%2C+Mrs.%0AMrs.+Booby
Found 199 characters on this page
Total so far: 793

Scraping page 5...
URL: https://disney.fandom.com/wiki/Category:Females?from=Carlotta
Found 200 characters on this page
Total so far: 993

Scraping page 6...
URL: https://disney.fandom.com/wiki/Category:Females?from=Cinderella+%28Into+the+Woods%29
Found 197 characters on this page
Total so far: 1190

Scrapi

In [27]:
# Reload the scraped names from disk: one name per line, line breaks removed.
with open("disney_female_characters.txt", mode="r", encoding="utf-8") as female_file:
    female_text = female_file.read()
female_names = female_text.splitlines()

with open("disney_male_characters.txt", mode="r", encoding="utf-8") as male_file:
    male_text = male_file.read()
male_names = male_text.splitlines()

In [31]:
# Keep only the text before the first '(' (e.g. a disambiguation suffix),
# trimmed of surrounding whitespace, and tag every cleaned name with a gender.
cleaned_female_names_dict = dict.fromkeys(
    (raw.partition('(')[0].strip() for raw in female_names), 'F'
)
cleaned_male_names_dict = dict.fromkeys(
    (raw.partition('(')[0].strip() for raw in male_names), 'M'
)

# Merge both mappings; a name present in both ends up tagged 'M' because the
# male entries are applied last.
all_names = {**cleaned_female_names_dict, **cleaned_male_names_dict}

In [None]:
# Build a two-column table (Name, Gender) from the merged mapping and write it
# to CSV; the default integer index is written as the first column.
names_df = pd.DataFrame(
    {
        'Name': list(all_names.keys()),
        'Gender': list(all_names.values()),
    }
)
names_df.to_csv('disney_names_and_genders.csv')